import os
import gradio as gr
import openai
import requests
import csv
import argparse
from models.vlog import Vlogger
parser = argparse.ArgumentParser()
parser.add_argument('--video_path', default='examples/huaqiang.mp4')
parser.add_argument('--alpha', default=10, type=int, help='Maximum segment number for the KTS algorithm; the larger the value, the fewer segments.')
parser.add_argument('--beta', default=1, type=int, help='Smallest time gap between successive clips, in seconds.')
parser.add_argument('--data_dir', default='./examples', type=str, help='Directory for saving videos and logs.')
parser.add_argument('--tmp_dir', default='./tmp', type=str, help='Directory for saving intermediate files.')

# * Model settings *
parser.add_argument('--openai_api_key', default='xxx', type=str, help='OpenAI API key')
# Note: store_true combined with default=True means these two flags are always on.
parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to use BLIP image captioning')
parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to use dense captioning')
parser.add_argument('--feature_extractor', default='openai/clip-vit-base-patch32', help='Feature extractor model for video segmentation')
parser.add_argument('--feature_extractor_device', choices=['cuda', 'cpu'], default='cuda', help='Device for the feature extractor: cuda or cpu')
parser.add_argument('--image_captioner', choices=['blip', 'blip2'], dest='captioner_base_model', default='blip2', help='blip2 requires 15 GB of GPU memory, blip requires 6 GB')
parser.add_argument('--image_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Device for the image captioner; more than 14 GB of GPU memory is recommended')
parser.add_argument('--dense_captioner_device', choices=['cuda', 'cpu'], default='cuda', help='Device for the dense captioner; less than 6 GB of GPU memory is not recommended')
parser.add_argument('--audio_translator', default='large')
parser.add_argument('--audio_translator_device', choices=['cuda', 'cpu'], default='cuda')
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo'], default='gpt-3.5-turbo')
args = parser.parse_args()
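# Example invocation (a sketch; the flag values below are illustrative, not
# requirements of the app):
#   python app.py --openai_api_key sk-... \
#       --video_path examples/huaqiang.mp4 \
#       --image_captioner blip --image_captioner_device cuda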
def get_empty_state():
    return {"total_tokens": 0, "messages": []}
def submit_api_key_fn(api_key, vlogger):
    try:
        vlogger.init_llm_with_api_key(api_key)
        return gr.update(value="OpenAI API key submitted successfully"), True, vlogger
    except Exception as e:
        return gr.update(value=f"Error: {e}"), False, vlogger
def submit_message(prompt, state, vlogger, api_key_submitted, vlog_loaded):
    if not api_key_submitted:
        return gr.update(value=''), [("", "Please enter your OpenAI API key first.")], state, vlogger
    if not vlog_loaded:
        return gr.update(value=''), [("", "Please follow the instructions to select a video and generate its document before chatting.")], state, vlogger

    history = state['messages']
    if not prompt:
        return gr.update(value=''), [(history[i]['content'], history[i+1]['content']) for i in range(0, len(history)-1, 2)], state, vlogger

    prompt_msg = {"role": "user", "content": prompt}
    history.append(prompt_msg)  # append once, whether the call succeeds or fails
    try:
        answer = vlogger.chat2video(prompt)
        history.append({"role": "system", "content": answer})
    except Exception as e:
        history.append({"role": "system", "content": f"Error: {e}"})

    chat_messages = [(history[i]['content'], history[i+1]['content']) for i in range(0, len(history)-1, 2)]
    return '', chat_messages, state, vlogger
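# The chatbot widget expects (user, assistant) tuples, so the flat history
# list of alternating user/system messages is re-paired in steps of two.
# For example (illustrative values):
#   history = [{"content": "Who appears?"}, {"content": "A basketball player."}]
#   -> chat_messages = [("Who appears?", "A basketball player.")]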
def clear_conversation(vlogger):
    vlogger.clean_history()
    # outputs: input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded
    return gr.update(value=None, visible=True), gr.update(value=None, interactive=False), None, gr.update(value=None, visible=True), get_empty_state(), vlogger, False
def vlog_fn(vid_path, vlogger, api_key_submitted):
    if not api_key_submitted:
        log_text = "====== Please enter your OpenAI API key first ======"
        return gr.update(value=log_text, visible=True), False, vlogger
    print(vid_path)
    if vid_path is None:
        log_text = "====== Please select a video from the examples first ======"
        vlog_loaded_flag = False
    else:
        log_list = vlogger.video2log(vid_path)
        log_text = "\n".join(log_list)
        vlog_loaded_flag = True
    return gr.update(value=log_text, visible=True), vlog_loaded_flag, vlogger
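# As above, the return values map onto [vlog_outp, vlog_loaded, vlogger] in
# the click handler below: the generated video document fills the textbox,
# and the loaded flag gates subsequent chat requests.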
css = """
#col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
#video_inp {min-height: 300px}
#chatbox {min-height: 100px;}
#header {text-align: center;}
#hint {font-size: 0.9em; padding: 0.5em; margin: 0;}
.message {font-size: 1.2em;}
"""
with gr.Blocks(css=css) as demo:
    state = gr.State(get_empty_state())
    vlogger = gr.State(Vlogger(args))
    vlog_loaded = gr.State(False)
    api_key_submitted = gr.State(False)

    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """## 🎞️ VLog Demo
            Powered by BLIP2, GRIT, Whisper, ChatGPT and LangChain
            Github: [https://github.com/showlab/VLog](https://github.com/showlab/VLog)""",
            elem_id="header")
        gr.Markdown("*Instruction*: For the current demo, please enter your OpenAI API key, select an example video, click the button to generate a document, and then try chatting over the video.", elem_id="hint")

        with gr.Row():
            with gr.Column(scale=6):
                video_inp = gr.Video(label="video_input", interactive=False, elem_id="video_inp")
                chatbot = gr.Chatbot(elem_id="chatbox")
                input_message = gr.Textbox(show_label=False, placeholder="Enter text and press enter", visible=True).style(container=False)
                btn_submit = gr.Button("Submit")
                btn_clear_conversation = gr.Button("Start New Conversation")
            with gr.Column(scale=6):
                vlog_btn = gr.Button("Generate Video Document")
                vlog_outp = gr.Textbox(label="Document output", lines=30)
            with gr.Column(scale=1):
                openai_api_key = gr.Textbox(
                    placeholder="Input OpenAI API key and press Enter",
                    show_label=False,
                    label="OpenAI API Key",
                    lines=1,
                    type="password",
                )
                examples = gr.Examples(
                    examples=[
                        ["examples/basketball_vlog.mp4"],
                        ["examples/travel_in_roman.mp4"],
                        ["examples/C8lMW0MODFs.mp4"],
                        ["examples/outcGtbnMuQ.mp4"],
                        ["examples/huaqiang.mp4"],
                    ],
                    inputs=[video_inp],
                )

        gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue: <a href="https://huggingface.co/spaces/TencentARC/VLog?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br></center>''')

    # Event wiring: each handler's return values fill the listed output components.
    btn_submit.click(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    input_message.submit(submit_message, [input_message, state, vlogger, api_key_submitted, vlog_loaded], [input_message, chatbot, state, vlogger])
    btn_clear_conversation.click(clear_conversation, [vlogger], [input_message, video_inp, chatbot, vlog_outp, state, vlogger, vlog_loaded])
    vlog_btn.click(vlog_fn, [video_inp, vlogger, api_key_submitted], [vlog_outp, vlog_loaded, vlogger])
    openai_api_key.submit(submit_api_key_fn, [openai_api_key, vlogger], [vlog_outp, api_key_submitted, vlogger])
    demo.load(queue=False)

# Up to five requests are processed concurrently; the queue is needed for
# long-running jobs like document generation.
demo.queue(concurrency_count=5)
demo.launch(height='800px')