Spaces:

TIGER-Lab
/

VideoScore2

Running on Zero

App Files Files Community

VideoScore2 / app.py

hexuan21

Update app.py

1887b7b verified 18 days ago

raw

history blame contribute delete

5.73 kB

	import gradio as gr
	import spaces
	import torch
	from string import Template
	from eval_methods.vs2_float import eval_VideoScore2_float
	import json

	# ----------------------------
	# Constants and Model Init
	# ----------------------------
	MODEL_NAME = "TIGER-Lab/VideoScore2"

	vs2_evaluator = eval_VideoScore2_float(MODEL_NAME)
	VS2_QUERY_TEMPLATE = Template("""
	You are an expert for evaluating and thinking about the quality of AI videos from diverse dimensions.

	We would like to evaluate its quality from three dimensions: 'visual quality', 'text-to-video alignment' and 'physical/common-sense consistency'. Below is the definition of each dimension:
	(1) visual quality:
	The dimension 'visual quality' cares about the video's visual and optical propertities, including resolution, overall clarity, local blurriness, smoothness, stability of brightness/contrast, distortion/misalignment, abrupt changes, and any other factors that affect the watching experience.
	(2) text-to-video alignment:
	The dimension 't2v_alignment' mainly assesses whether the generated video fully and accurately depicts the elements mentioned in the text prompt, such as characters, actions, animals, etc., as well as background, quantity, color, weather, and so on.
	(3) physical/common-sense consistency:
	The dimension 'physical/common-sense consistency' mainly examines whether there are any violations of common sense, physical laws, or any other aspects in the video that appear strange or unnatural.

	Here we provide an AI video generated by text-to-video models and its text prompt:
	$t2v_prompt.

	Based on the video content and the dimension definitions, please evaluate the video and give the quality score.
	The quality score must be integers in the range of 1 - 5.

	Your output must be in the following format:
	visual quality: <v_score>;
	text-to-video alignment: <t_score>;
	physical/common-sense consistency: <p_score>

	DO NOT include any other things behind or after your output.
	""")

	space_description = """
	[📃Paper](https://www.arxiv.org/abs/2509.22799) \| [🌐Website](https://tiger-ai-lab.github.io/VideoScore2/) \| [💻GitHub](https://github.com/TIGER-AI-Lab/VideoScore2) \| [🛢️Dataset](https://huggingface.co/datasets/TIGER-Lab/VideoFeedback2) \| [🤗Model](https://huggingface.co/TIGER-Lab/VideoScore2)

	VideoScore2 is a next-generation, interpretable and multi-dimensional video evaluation model designed to align with human judgment on text-to-video generation tasks.
	It explicitly evaluates visual quality, text-to-video alignment, and physical/common-sense consistency, producing structured scores and reasoning.
	"""

	with open("./examples/examples.json", 'r') as f:
	examples = json.load(f)

	# ----------------------------
	# Evaluation Core
	# ----------------------------
	@spaces.GPU(duration=60)
	def eval_vs2(video_path, t2v_prompt):
	if not video_path:
	raise gr.Error("Please upload a video.")
	if not t2v_prompt:
	raise gr.Error("Please provide a text prompt.")

	user_prompt = VS2_QUERY_TEMPLATE.substitute(t2v_prompt=t2v_prompt)
	method_kwargs = {
	"max_tokens": 1024,
	"infer_fps": 2.0
	}

	with torch.no_grad():
	v_score, t_score, p_score, full_text = vs2_evaluator.evaluate_video(
	user_prompt=user_prompt,
	video_path=video_path,
	kwargs=method_kwargs
	)

	return {
	"visual quality": v_score,
	"text-to-video alignment": t_score,
	"physical/common-sense consistency": p_score,
	"full analysis": full_text
	}

	# ----------------------------
	# Build Gradio Demo
	# ----------------------------
	def build_demo():
	with gr.Blocks() as demo:
	gr.Markdown("## VideoScore2: Think before You Score in Generative Video Evaluation")
	with gr.Row():
	gr.Markdown(space_description)
	gr.Image("https://tiger-ai-lab.github.io/VideoScore2/static/images/teaser.png", label="Teaser")

	with gr.Row():
	video_input = gr.Video(label="Upload your video", width=500)
	with gr.Column():
	t2v_prompt = gr.Textbox(label="Text-to-Video Prompt", placeholder="Describe the video prompt...")
	eval_btn = gr.Button("Evaluate Video", variant="primary")
	result_box = gr.Json(label="Evaluation Result")

	eval_btn.click(fn=eval_vs2, inputs=[video_input, t2v_prompt], outputs=[result_box])

	gr.Examples(
	examples=
	[
	[
	item['video'],
	item['prompt'],
	] for item in examples if item['prompt']
	],
	inputs=[video_input,t2v_prompt],
	)

	gr.Markdown("""
	### 📚 Citation
	```
	@misc{he2025videoscore2thinkscoregenerative,
	title={VideoScore2: Think before You Score in Generative Video Evaluation},
	author={Xuan He and Dongfu Jiang and Ping Nie and Minghao Liu and Zhengxuan Jiang and Mingyi Su and Wentao Ma and Junru Lin and Chun Ye and Yi Lu and Keming Wu and Benjamin Schneider and Quy Duc Do and Zhuofeng Li and Yiming Jia and Yuxuan Zhang and Guo Cheng and Haozhe Wang and Wangchunshu Zhou and Qunshu Lin and Yuanxing Zhang and Ge Zhang and Wenhao Huang and Wenhu Chen},
	year={2025},
	eprint={2509.22799},
	archivePrefix={arXiv},
	primaryClass={cs.CV},
	url={https://arxiv.org/abs/2509.22799},
	}
	```
	""")
	return demo

	# ----------------------------
	# Main
	# ----------------------------
	if __name__ == "__main__":
	demo = build_demo()
	demo.launch(share=True)