import json
from string import Template

import gradio as gr
import spaces
import torch

from eval_methods.vs2_float import eval_VideoScore2_float

# ----------------------------
# Constants and Model Init
# ----------------------------
MODEL_NAME = "TIGER-Lab/VideoScore2"
vs2_evaluator = eval_VideoScore2_float(MODEL_NAME)

VS2_QUERY_TEMPLATE = Template("""
You are an expert at evaluating and reasoning about the quality of AI-generated videos across diverse dimensions.
We would like to evaluate a video's quality from three dimensions: 'visual quality', 'text-to-video alignment' and 'physical/common-sense consistency'. Below is the definition of each dimension:
(1) visual quality:
The dimension 'visual quality' concerns the video's visual and optical properties, including resolution, overall clarity, local blurriness, smoothness, stability of brightness/contrast, distortion/misalignment, abrupt changes, and any other factors that affect the watching experience.
(2) text-to-video alignment:
The dimension 'text-to-video alignment' mainly assesses whether the generated video fully and accurately depicts the elements mentioned in the text prompt, such as characters, actions, animals, etc., as well as background, quantity, color, weather, and so on.
(3) physical/common-sense consistency:
The dimension 'physical/common-sense consistency' mainly examines whether there are any violations of common sense, physical laws, or any other aspects of the video that appear strange or unnatural.
Here we provide an AI video generated by text-to-video models and its text prompt:
$t2v_prompt.
Based on the video content and the dimension definitions, please evaluate the video and give the quality scores.
Each quality score must be an integer in the range of 1-5.
Your output must be in the following format:
visual quality: <v_score>;
text-to-video alignment: <t_score>;
physical/common-sense consistency: <p_score>
DO NOT include anything else before or after your output.
""")

space_description = """
[📃Paper](https://www.arxiv.org/abs/2509.22799) | [🌐Website](https://tiger-ai-lab.github.io/VideoScore2/) | [💻GitHub](https://github.com/TIGER-AI-Lab/VideoScore2) | [🛢️Dataset](https://huggingface.co/datasets/TIGER-Lab/VideoFeedback2) | [🤗Model](https://huggingface.co/TIGER-Lab/VideoScore2)

**VideoScore2** is a next-generation, interpretable and multi-dimensional video evaluation model designed to align with human judgment on text-to-video generation tasks.
It explicitly evaluates **visual quality**, **text-to-video alignment**, and **physical/common-sense consistency**, producing structured scores and reasoning.
"""

with open("./examples/examples.json", 'r') as f:
    examples = json.load(f)
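
# Assumed schema of ./examples/examples.json, inferred from how the entries
# are consumed in build_demo() below (each record supplies one example row):
# [
#   {"video": "<path-to-video-file>", "prompt": "<text-to-video prompt>"},
#   ...
# ]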

# ----------------------------
# Evaluation Core
# ----------------------------
@spaces.GPU  # request a ZeroGPU device for the duration of each call (this Space runs on ZeroGPU)
def eval_vs2(video_path, t2v_prompt):
    if not video_path:
        raise gr.Error("Please upload a video.")
    if not t2v_prompt:
        raise gr.Error("Please provide a text prompt.")
    user_prompt = VS2_QUERY_TEMPLATE.substitute(t2v_prompt=t2v_prompt)
    method_kwargs = {
        "max_tokens": 1024,
        "infer_fps": 2.0,
    }
    with torch.no_grad():
        v_score, t_score, p_score, full_text = vs2_evaluator.evaluate_video(
            user_prompt=user_prompt,
            video_path=video_path,
            kwargs=method_kwargs,
        )
    return {
        "visual quality": v_score,
        "text-to-video alignment": t_score,
        "physical/common-sense consistency": p_score,
        "full analysis": full_text,
    }
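
# Illustrative programmatic call outside the Gradio UI (hypothetical file
# path and prompt, shown only to document the return shape):
#   result = eval_vs2("examples/sample.mp4", "A corgi surfing a wave at sunset")
#   print(result["visual quality"], result["full analysis"])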

# ----------------------------
# Build Gradio Demo
# ----------------------------
def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown("## VideoScore2: Think before You Score in Generative Video Evaluation")
        with gr.Row():
            gr.Markdown(space_description)
            gr.Image("https://tiger-ai-lab.github.io/VideoScore2/static/images/teaser.png", label="Teaser")
        with gr.Row():
            video_input = gr.Video(label="Upload your video", width=500)
            with gr.Column():
                t2v_prompt = gr.Textbox(label="Text-to-Video Prompt", placeholder="Describe the video prompt...")
                eval_btn = gr.Button("Evaluate Video", variant="primary")
        result_box = gr.JSON(label="Evaluation Result")
        eval_btn.click(fn=eval_vs2, inputs=[video_input, t2v_prompt], outputs=[result_box])
        gr.Examples(
            examples=[
                [item['video'], item['prompt']]
                for item in examples
                if item['prompt']
            ],
            inputs=[video_input, t2v_prompt],
        )
| gr.Markdown(""" | |
| ### 📚 Citation | |
| ``` | |
| @misc{he2025videoscore2thinkscoregenerative, | |
| title={VideoScore2: Think before You Score in Generative Video Evaluation}, | |
| author={Xuan He and Dongfu Jiang and Ping Nie and Minghao Liu and Zhengxuan Jiang and Mingyi Su and Wentao Ma and Junru Lin and Chun Ye and Yi Lu and Keming Wu and Benjamin Schneider and Quy Duc Do and Zhuofeng Li and Yiming Jia and Yuxuan Zhang and Guo Cheng and Haozhe Wang and Wangchunshu Zhou and Qunshu Lin and Yuanxing Zhang and Ge Zhang and Wenhao Huang and Wenhu Chen}, | |
| year={2025}, | |
| eprint={2509.22799}, | |
| archivePrefix={arXiv}, | |
| primaryClass={cs.CV}, | |
| url={https://arxiv.org/abs/2509.22799}, | |
| } | |
| ``` | |
| """) | |
| return demo | |

# ----------------------------
# Main
# ----------------------------
if __name__ == "__main__":
    demo = build_demo()
    demo.launch(share=True)  # note: share=True is ignored when hosted on HF Spaces