import copy
import datetime
import json
import os
import re
from email.utils import parseaddr

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import Dataset, DatasetDict, VerificationMode, get_dataset_config_names, load_dataset
from huggingface_hub import HfApi

from content import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    SUBMISSION_TEXT,
    TITLE,
    format_error,
    format_log,
    format_warning,
    model_hyperlink,
)

TOKEN = os.environ.get("HF_TOKEN", None)

OWNER = "facebook"
## private datasets
SUBMISSION_DATASET = f"{OWNER}/pwm_leaderboard_submissions_internal"
CONTACT_DATASET = f"{OWNER}/pwm_leaderboard_contact_info_internal"
## public datasets
RESULTS_DATASET = f"{OWNER}/pwm_leaderboard_results_public"
LEADERBOARD_PATH = f"{OWNER}/pwm_leaderboard"
DATA_VERSION = "1.0.0"

# Dataset paths
MVP_DATASET = "facebook/minimal_video_pairs"
INTP_DATASET = "facebook/IntPhys2_test"
WMQA_DATASET = "facebook/CausalVQA"

# Dataset names
MVP_NAME = "MVPBench"
INTP_NAME = "IntPhys 2"
WMQA_NAME = "CausalVQA"

# Dataset keys
MVP_KEY = "mvp"
MVP_MINI_KEY = "mvp_mini"
INTP_KEY = "intphys2"
WMQA_KEY = "causalvqa"

TASKS = [
    (INTP_KEY, INTP_NAME),
    (MVP_KEY, MVP_NAME),
    (WMQA_KEY, WMQA_NAME),
]
VISIBLE_TASKS = copy.deepcopy(TASKS)

PRE_COL_NAMES = ["Model Name"]
POST_COL_NAMES = ["Model Type", "Vision Backbone", "LLM Backbone", "Submission Date"]

api = HfApi()
os.makedirs("scored", exist_ok=True)

LOCAL_DEBUG = False

# Display the results
LDB_TEXT_KEYS = ["model", "model_type", "vision_backbone", "llm_backbone"]
LDB_TEXT_TYPES = ["markdown", "text", "text", "text"]

MISSING_VALUE = -1.0

HUMAN_BASELINES = {
    "url": "",
    "model": "Human",
    "model_type": "Human",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": -1,
    f"score_{INTP_KEY}": 92.44,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 92.9,
    f"score_{WMQA_KEY}": 84.78,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

GEMINI2_5 = {
    "url": "https://deepmind.google/models/gemini/flash/",
    "model": "Gemini 2.5 Flash",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": 10,
    f"score_{INTP_KEY}": 56.1,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": MISSING_VALUE,
    f"score_{WMQA_KEY}": 61.66,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

GPT4O = {
    "url": "https://openai.com/index/gpt-4o-system-card/",
    "model": "GPT-4o",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": 10,
    f"score_{INTP_KEY}": 53.19,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 32.5,
    f"score_{WMQA_KEY}": 50.95,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

INTERN_VL = {
    "url": "https://internvl.github.io/blog/2024-12-05-InternVL-2.5/",
    "model": "InternVL2.5",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "InternViT-300M",
    "llm_backbone": "InternLM2.5-7B-Chat",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 39.9,
    f"score_{WMQA_KEY}": 47.54,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

LLAVA = {
    "url": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov",
    "model": "LLaVA-OneVision",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "SigLIP",
    "llm_backbone": "Qwen2-7B",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 20.7,
    f"score_{WMQA_KEY}": 45.27,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

PLM = {
    "url": "https://github.com/facebookresearch/perception_models",
    "model": "Perception Language Model (PLM)",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "PE",
    "llm_backbone": "Llama3.1 8B",
    "num_frames": 16,
    f"score_{INTP_KEY}": MISSING_VALUE,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 39.7,
    f"score_{WMQA_KEY}": 50.06,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

QWENVL = {
    "url": "https://github.com/QwenLM/Qwen2.5-VL",
    "model": "Qwen2.5-VL",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "ViT",
    "llm_backbone": "Qwen2.5-7B-Instruct",
    "num_frames": 16,
    f"score_{INTP_KEY}": 49.12,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 36.7,
    f"score_{WMQA_KEY}": 49.05,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

GEMINI1_5 = {
    "url": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/1-5-pro",
    "model": "Gemini 1.5 Pro",
    "model_type": "Closed",
    "system_prompt": "test",
    "vision_backbone": " - ",
    "llm_backbone": " - ",
    "num_frames": -1,
    f"score_{INTP_KEY}": 52.1,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 29.6,
    f"score_{WMQA_KEY}": MISSING_VALUE,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}

VJEPA2 = {
    "url": "https://ai.meta.com/vjepa/",
    "model": "V-JEPA 2",
    "model_type": "Open",
    "system_prompt": "test",
    "vision_backbone": "VJEPA 2",
    "llm_backbone": "Llama3.1 8B",
    "num_frames": -1,
    f"score_{INTP_KEY}": 56.4,
    f"score_{MVP_KEY}": MISSING_VALUE,
    f"score_{MVP_MINI_KEY}": 44.5,
    f"score_{WMQA_KEY}": 44.89,
    "date": "2025-06-11",
    "organization": "Meta",
    "submitted_by": "user",
}


def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt"])  # , "url"])
    df = pd.DataFrame(local_df)

    # Reformat the data to keep a single row for a given model and organization pair;
    # in case of multiple entries, keep the latest values.
    df["model_org"] = df["model"].str.cat(df["organization"], sep="-")
    ldb_m2r = {}
    for i, row in df.iterrows():
        if row["model_org"] not in ldb_m2r:
            ldb_m2r[row["model_org"]] = {}
        prev_d = ldb_m2r[row["model_org"]]
        new_d = {}
        for key in LDB_TEXT_KEYS:
            new_d[key] = row[key] if len(row[key]) > 0 else prev_d.get(key, "NA")
        for tname, _ in TASKS:
            new_d[f"score_{tname}"] = (
                row[f"score_{tname}"] if row[f"score_{tname}"] >= 0 else prev_d.get(f"score_{tname}", MISSING_VALUE)
            )
            if tname == MVP_KEY:
                new_d[f"score_{MVP_MINI_KEY}"] = (
                    row[f"score_{MVP_MINI_KEY}"]
                    if row[f"score_{MVP_MINI_KEY}"] >= 0
                    else prev_d.get(f"score_{MVP_MINI_KEY}", MISSING_VALUE)
                )
        new_d["date"] = row["date"]
        ldb_m2r[row["model_org"]] = new_d

    # Add the human baseline and reference models
    ldb_m2r["human"] = HUMAN_BASELINES
    ldb_m2r["gemini2.5"] = GEMINI2_5
    ldb_m2r["gemini1.5"] = GEMINI1_5
    ldb_m2r["gpt4o"] = GPT4O
    ldb_m2r["internvl"] = INTERN_VL
    ldb_m2r["llavaov"] = LLAVA
    ldb_m2r["plm"] = PLM
    ldb_m2r["qwen2.5"] = QWENVL
    ldb_m2r["vjepa2"] = VJEPA2

    # Compute the average score and convert back to rows
    ldb_rows = []
    for key, val in ldb_m2r.items():
        print(ldb_m2r[key])
        if "url" in ldb_m2r[key].keys() and ldb_m2r[key]["url"] != "":
            ldb_m2r[key]["model"] = model_hyperlink(ldb_m2r[key]["url"], ldb_m2r[key]["model"])
        row = copy.deepcopy(val)
        score_keys = {k for k in val if k.startswith("score_")}
        row["score"] = np.round(
            np.mean([row[sk] for sk in score_keys if (row[sk] != MISSING_VALUE and row[sk] != "-")]), 2
        )
        tasks_completed = 0
        for sk in score_keys:
            if row[sk] == MISSING_VALUE:
                row[sk] = "-"
            else:
                tasks_completed += 1
        row["tasks_completed"] = tasks_completed
        ldb_rows.append(row)

    df = pd.DataFrame(ldb_rows)
    df = df.query('date >= "2025-06-11"')
    # df = df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})

    # Sort by number of completed tasks, then by average score
    df = df.sort_values(by=["tasks_completed", "score"], ascending=False)

    # Format numeric columns
    numeric_cols = [c for c in df.columns if c.startswith("score_")]
    for nc in numeric_cols:
        df[nc] = df[nc].apply(lambda x: np.round(x, 2) if isinstance(x, float) else x)

    # Remove helper columns and rename the rest for display
    df.drop(["tasks_completed"], axis=1, inplace=True)
    col_mapper = {f"score_{tname}": f"{tdisplay} (%)" for tname, tdisplay in TASKS if tname != MVP_KEY}
    col_mapper.update(
        {
            "model": "Model Name",
            "model_type": "Model Type",
            "vision_backbone": "Vision Backbone",
            "llm_backbone": "LLM Backbone",
            # "score": "Average Score (%)",
            "date": "Submission Date",
        }
    )
    df.rename(col_mapper, axis=1, inplace=True)
    df[f"{MVP_NAME} (%)"] = df.score_mvp_mini.astype(str)
    df.drop([f"score_{MVP_KEY}", f"score_{MVP_MINI_KEY}"], axis=1, inplace=True)

    # Order columns
    df = df[PRE_COL_NAMES + [f"{t[1]} (%)" for t in VISIBLE_TASKS] + POST_COL_NAMES]
    return df
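
# Illustrative note (hypothetical example): if "ModelX-OrgY" appears twice in the
# results dataset, the de-duplication above keeps a single leaderboard row whose
# score_* columns take the most recent non-missing value for each task.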


def create_dummy_data():
    # Dummy evals data
    rows = [
        {
            "url": "https://deepmind.google/models/gemini/flash/",
            "model": "Gemini Test",
            "model_type": "Closed",
            "system_prompt": "test",
            "vision_backbone": " - ",
            "llm_backbone": " - ",
            "num_frames": 10,
            f"score_{INTP_KEY}": 56.1,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": 61.66,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
        {
            "url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf",
            "model": "Llava 1.6",
            "model_type": "Open",
            "system_prompt": "test",
            "vision_backbone": "CLIP",
            "llm_backbone": "Mistral",
            "num_frames": 16,
            f"score_{INTP_KEY}": MISSING_VALUE,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": MISSING_VALUE,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
        {
            "url": "https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf",
            "model": "Llava 1.6",
            "model_type": "Open",
            "system_prompt": "test",
            "vision_backbone": "CLIP",
            "llm_backbone": "Mistral",
            "num_frames": 16,
            f"score_{INTP_KEY}": 0.0,
            f"score_{MVP_KEY}": MISSING_VALUE,
            f"score_{MVP_MINI_KEY}": MISSING_VALUE,
            f"score_{WMQA_KEY}": 0.0,
            "date": datetime.datetime.today().strftime("%Y-%m-%d"),
            "organization": "test",
            "submitted_by": "octocat",
        },
    ]
    dt = DatasetDict({"valid": Dataset.from_list(rows), "test": Dataset.from_list(rows)})

    # Dummy contact info
    contact_info = {
        "model": "llama",
        "url": "test",
        "organization": "test",
        "username": "test",
        "mail": "test",
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    cdt = DatasetDict({"valid": Dataset.from_list([contact_info]), "test": Dataset.from_list([contact_info])})
    return dt, cdt


DUMMY_DATA = False


def get_eval_data():
    if DUMMY_DATA:
        eval_results, _ = create_dummy_data()
    else:
        eval_results = load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
            trust_remote_code=True,
        )
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="valid")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_results, eval_dataframe_val, eval_dataframe_test


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


# --- MVP Functions ---
def validate_mvp(submission_df, split="valid"):
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [MVP_KEY, MVP_MINI_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting either mvp or mvp_mini"
        )
    gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN)
    for subset in subsets:
        tasks = submission_df[submission_df.data_name == subset].task.unique()
        assert len(tasks) == len(gold_tasks), format_error(
            f"{MVP_NAME} submission must have all tasks, found = {tasks}, expecting = {gold_tasks}"
        )
        for task in tasks:
            sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy()
            assert task in gold_tasks, format_error(f"Found unknown task {task} for {MVP_NAME}, check submission")
            gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN)
            assert len(sub_df) == len(gold_dataset), format_error(
                f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} for task {task} in split {subset}"
            )
            id2answer = {row["video_id"]: row["answer"] for row in gold_dataset}
            for i, r in sub_df.iterrows():
                assert r["row_id"] in id2answer, format_error(
                    f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id"
                )


def compute_scores_mvp(submission_df, split="valid"):
    gold_tasks = get_dataset_config_names(MVP_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        tasks = submission_df[submission_df.data_name == subset].task.unique()
        assert len(tasks) == len(gold_tasks), format_error(f"{MVP_NAME} submission must have all tasks")
        for task in tasks:
            sub_df = submission_df[(submission_df.data_name == subset) & (submission_df.task == task)].copy()
            gold_dataset = load_dataset(MVP_DATASET, task, split="full" if subset == MVP_KEY else "mini", token=TOKEN)
            id2answer = {row["video_id"]: row["answer"] for row in gold_dataset}
            correct = []
            for i, r in sub_df.iterrows():
                gold_answer = id2answer[r["row_id"]]
                model_answer = r["model_answer"]
                if gold_answer == model_answer:
                    correct.append(1)
                else:
                    correct.append(0)
            sub_df["rating"] = correct
            scored_subs.append(sub_df)
    return pd.concat(scored_subs)


def aggregate_scores_mvp(scored_submission_df, split="valid"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    for subset in subsets:
        tasks = scored_submission_df[scored_submission_df.data_name == subset].task.unique()
        task_pair_accuracies = []
        for task in tasks:
            sub_df = scored_submission_df[
                (scored_submission_df.data_name == subset) & (scored_submission_df.task == task)
            ].copy()
            result_by_vid = {}
            pair_correct_count = 0
            for i, row in sub_df.iterrows():
                video_id = "_".join(row["row_id"].split("_")[:-1])
                if video_id not in result_by_vid:
                    result_by_vid[video_id] = [row.to_dict()]
                else:
                    result_by_vid[video_id].append(row.to_dict())
            for video_id, answer_dict_pair in result_by_vid.items():
                answer_dict_1, answer_dict_2 = answer_dict_pair
                if answer_dict_1["rating"] == 1 and answer_dict_2["rating"] == 1:
                    pair_correct_count += 1
            task_pair_accuracies.append((pair_correct_count / len(result_by_vid)) * 100)
        # Compute the macro (per-task) average as the subset score
        subset_scores[f"score_{subset}"] = np.mean(task_pair_accuracies)
    return subset_scores
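
# Illustrative sketch of the pair-accuracy metric above (made-up ids and ratings):
# the two rows "vid_003_0" and "vid_003_1" share the prefix "vid_003" and only
# count as a correct pair when both ratings are 1, so
#   {"vid_003": [1, 1], "vid_007": [1, 0]}  ->  1 correct pair out of 2  ->  50.0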


# --- CausalVQA functions ---
def validate_causalvqa(submission_df, split="test"):
    # assert split == "test", format_error(f"Split {split} not available for dataset {WMQA_NAME}")
    split = "train"
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [WMQA_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting causalvqa"
        )
    gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN)
    for subset in subsets:
        task = "default"  # submission_df[submission_df.data_name == subset].task.unique()
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        # Note: CausalVQA's held-out test data is loaded from the HF "train" split here
        gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN)
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} for task {task} in split {subset}"
        )
        id2answer = {row["id"] + "_" + str(row["n"]): row["answer"] for row in gold_dataset}
        for i, r in sub_df.iterrows():
            assert r["row_id"] in id2answer, format_error(
                f"Submission contains row_id {r['row_id']} which doesn't match the dataset's qid"
            )
    print("validated")


def compute_scores_causalvqa(submission_df, split="test"):
    # assert split == "test", format_error(f"Split {split} not available for dataset {WMQA_NAME}")
    split = "train"
    gold_tasks = get_dataset_config_names(WMQA_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        # Keep only the answer letter (A-E) and upper-case it
        sub_df["model_answer"] = (
            sub_df["model_answer"].str.replace(r"[^a-eA-E]", "", regex=True, flags=re.IGNORECASE).str.upper()
        )
        gold_dataset = load_dataset(WMQA_DATASET, "", split="train", token=TOKEN)
        gold_dataset = gold_dataset.to_pandas()
        gold_dataset["row_id"] = gold_dataset.apply(lambda x: x["id"] + "_" + str(x["n"]), axis=1)
        joined = pd.merge(gold_dataset, sub_df, on="row_id", how="left")
        correct = []
        for i, r in joined.iterrows():
            gold_answer = r["answer"]
            model_answer = r["model_answer"]
            if gold_answer == model_answer:
                correct.append(1)
            else:
                correct.append(0)
        joined["rating"] = correct
        scored_subs.append(joined)
    print(joined.columns)
    print("scored")
    return pd.concat(scored_subs)


def aggregate_scores_causalvqa(scored_submission_df, split="test"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    for subset in subsets:
        sub_df = scored_submission_df[scored_submission_df.data_name == subset].copy()
        # A question only earns a point when both of its paired versions are answered correctly
        agg_df = sub_df.groupby(["id", "strata"])["rating"].sum().reset_index()
        agg_df["points"] = 0
        agg_df.loc[agg_df["rating"] == 2, "points"] = 1
        # Compute the macro score as a percentage
        subset_scores[f"score_{subset}"] = agg_df.points.mean() * 100.00
    print("aggregated")
    return subset_scores
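
# Illustrative sketch of the aggregation above (made-up ids): the rows "q17_0"
# and "q17_1" share gold id "q17"; grouping by (id, strata) sums their ratings,
# and the group earns a point only when the sum is 2, so the reported score is
# the percentage of question pairs answered correctly in both versions.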


# --- IntPhys functions ---
def validate_intphys(submission_df, split="test"):
    assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}")
    subsets = submission_df.data_name.unique()
    for subset in subsets:
        assert subset in [INTP_KEY], format_error(
            f"Wrong tasks, got {subset} but expecting {INTP_KEY}"
        )
    gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN)
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        gold_dataset = load_dataset(INTP_DATASET, "", split="test", token=TOKEN)
        assert len(sub_df) == len(gold_dataset), format_error(
            f"Number of examples does not match in user submission, found {len(sub_df)} but expecting {len(gold_dataset)} in split {subset}"
        )
        id2answer = {row["name"]: row["answer"] for row in gold_dataset}
        for i, r in sub_df.iterrows():
            assert r["row_id"] in id2answer, format_error(
                f"Submission contains row_id {r['row_id']} which doesn't match the dataset's video_id"
            )


def compute_scores_intphys(submission_df, split="test"):
    assert split == "test", format_error(f"Split {split} not available for dataset {INTP_NAME}")
    gold_tasks = get_dataset_config_names(INTP_DATASET, token=TOKEN)
    subsets = submission_df.data_name.unique()
    scored_subs = []
    for subset in subsets:
        sub_df = submission_df[(submission_df.data_name == subset)].copy()
        gold_dataset = load_dataset(INTP_DATASET, "", split="test", token=TOKEN)
        id2answer = {row["name"]: row["answer"] for row in gold_dataset}
        correct = []
        for i, r in sub_df.iterrows():
            gold_answer = id2answer[r["row_id"]]
            model_answer = r["model_answer"]
            if gold_answer == model_answer:
                correct.append(1)
            else:
                correct.append(0)
        sub_df["rating"] = correct
        scored_subs.append(sub_df)
    return pd.concat(scored_subs)


def aggregate_scores_intphys(scored_submission_df, split="test"):
    subsets = scored_submission_df.data_name.unique()
    subset_scores = {f"score_{s}": 0 for s in subsets}
    accuracies = []
    for subset in subsets:
        sub_df = scored_submission_df[
            (scored_submission_df.data_name == subset)
        ].copy()
        result_by_vid = {}
        pair_correct_count = 0
        for i, row in sub_df.iterrows():
            if row["rating"] == 1:
                pair_correct_count += 1
        accuracies.append((pair_correct_count / len(sub_df)) * 100)
        # Compute the macro score
        subset_scores[f"score_{subset}"] = np.mean(accuracies)
    return subset_scores


VALIDATION_FN = {
    MVP_KEY: validate_mvp,
    MVP_MINI_KEY: validate_mvp,
    INTP_KEY: validate_intphys,
    WMQA_KEY: validate_causalvqa,
}
SCORER_FN = {
    MVP_KEY: compute_scores_mvp,
    MVP_MINI_KEY: compute_scores_mvp,
    INTP_KEY: compute_scores_intphys,
    WMQA_KEY: compute_scores_causalvqa,
}
AGGREGATE_FN = {
    MVP_KEY: aggregate_scores_mvp,
    MVP_MINI_KEY: aggregate_scores_mvp,
    INTP_KEY: aggregate_scores_intphys,
    WMQA_KEY: aggregate_scores_causalvqa,
}


def compute_scores(submission_df, split="valid"):
    """
    Scores a submission against the held-out valid/test sets and adds per-dataset metrics.
    - First, runs validation on the input to ensure the right keys are present
    - Then, runs the evaluations
    """
    tasks = submission_df.data_name.unique()
    scored_subs = []
    for t in tasks:
        task_sub = submission_df[submission_df.data_name == t].copy()
        scored_subs.append(SCORER_FN[t](task_sub, split))
    scored_subs = pd.concat(scored_subs)
    return scored_subs


def aggregate_scores(scored_df, split="valid"):
    tasks = scored_df.data_name.unique()
    agg_scores = {}
    for task in tasks:
        task_sub = scored_df[scored_df.data_name == task].copy()
        agg_metrics = AGGREGATE_FN[task](task_sub, split=split)
        agg_scores.update(agg_metrics)
    return agg_scores
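
# Example of the aggregated output shape, assuming a submission that covers
# IntPhys 2 and CausalVQA (the numbers are placeholders, not real scores):
#   {"score_intphys2": 57.3, "score_causalvqa": 51.0}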


def validate_submission(submission_df, split="valid"):
    """
    Validate user submissions
    """
    # Run generic checks
    assert "data_name" in submission_df.columns, format_error("Submission missing column data_name")
    assert "row_id" in submission_df.columns, format_error("Submission missing column row_id")
    assert "task" in submission_df.columns, format_error("Submission missing column task")
    assert "model_answer" in submission_df.columns, format_error("Submission missing column model_answer")
    tasks = submission_df.data_name.unique()
    valid_tasks = [t[0] for t in TASKS] + [MVP_MINI_KEY]
    for t in tasks:
        assert t in valid_tasks, format_error(
            f"Submission contains one or more rows with data_name={t}, which is not a valid task for this leaderboard (expecting to match a dataset in {valid_tasks})"
        )
    # Dataset-specific checks
    for task in tasks:
        task_sub = submission_df[submission_df.data_name == task].copy()
        VALIDATION_FN[task](task_sub)
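
# A minimal sketch of one line of the expected .jsonl submission file; the id and
# answer values below are hypothetical placeholders, only the column names come
# from the checks above:
#   {"data_name": "mvp_mini", "task": "<config name>", "row_id": "<video_id>", "model_answer": "A"}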


def add_new_eval(
    model: str,
    vision_backbone: str,
    llm_backbone: str,
    url: str,
    model_type: str,
    path_to_file: str,
    organization: str,
    mail: str,
    profile: gr.OAuthProfile,
    progress=gr.Progress(),
):
    progress(0, desc="Validating user ...")
    contact_infos = load_dataset(
        CONTACT_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS,
        trust_remote_code=True,
    )
    user_submission_dates = sorted(
        row["date"] for row in contact_infos["test"] if row["username"] == profile.username
    )
    # Limit each user to one submission per day
    if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime("%Y-%m-%d"):
        return format_error("You already submitted once today, please try again tomorrow.")

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")
    progress(0.1, desc="Fetching recent evals ...")
    eval_results, _, _ = get_eval_data()

    # # Check if the model/organization combination already exists and print a warning message if it does
    # if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organization.lower() in set(
    #     [o.lower() for o in eval_results[val_or_test]["organization"]]
    # ):
    #     return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Validate the submission - do not save it until it is fully validated
    progress(0.3, desc="Validating user submission ...")
    file_path = path_to_file.name
    assert file_path.endswith(".jsonl"), format_error("Please submit a jsonl file")
    submissions_df = pd.read_json(file_path, lines=True, orient="records")
    validate_submission(submissions_df)

    # Save the submitted file
    if LOCAL_DEBUG:
        gr.Info("In local debug mode, mock uploading submission dataset.")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organization}/{model}/submissions/test_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN,
        )

    # Compute scores
    progress(0.5, desc="Computing scores ...")
    scored_df = compute_scores(submissions_df, split="test")

    # Save the scored files
    if LOCAL_DEBUG:
        gr.Info("In local debug mode, mock uploading scored files")
    else:
        tasks = scored_df.data_name.unique()
        for task in tasks:
            scored_df.to_json(f"scored/{organization}_{model}_{task}.jsonl", lines=True, orient="records")
            api.upload_file(
                repo_id=SUBMISSION_DATASET,
                path_or_fileobj=f"scored/{organization}_{model}_{task}.jsonl",
                path_in_repo=f"{organization}/{model}/scored/{task}/test_scored_{datetime.datetime.today()}.jsonl",
                repo_type="dataset",
                token=TOKEN,
            )

    # Actual submission
    progress(0.7, desc="Submitting leaderboard entry ...")
    eval_entry = {
        "model": model,
        "model_type": model_type,
        "vision_backbone": vision_backbone,
        "llm_backbone": llm_backbone,
        "url": url,
        "organization": organization,
        "submitted_by": profile.username,
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    agg_metrics = aggregate_scores(scored_df, split="test")
    eval_entry.update(agg_metrics)
    # Set any missing task scores to MISSING_VALUE
    task_keys = [t[0] for t in TASKS] + [MVP_MINI_KEY]
    missing_metrics = {f"score_{task}": MISSING_VALUE for task in task_keys if f"score_{task}" not in eval_entry}
    eval_entry.update(missing_metrics)
    eval_results["test"] = eval_results["test"].add_item(eval_entry)
    if LOCAL_DEBUG:
        print(eval_results["test"][-1])
        gr.Info("In local debug mode, mock uploading aggregated scores")
    else:
        eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    progress(0.9, desc="Updating contacts ...")
    contact_info = {
        "model": model,
        "url": url,
        "organization": organization,
        "username": profile.username,
        "mail": mail,
        "date": datetime.datetime.today().strftime("%Y-%m-%d"),
    }
    contact_infos["test"] = contact_infos["test"].add_item(contact_info)
    if LOCAL_DEBUG:
        print("mock uploaded contact info")
    else:
        contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    progress(1.0, desc="Completed evaluation successfully. Please refresh leaderboard")
    success_str = (
        f"Model {model} submitted by {organization} has been successfully evaluated and stored in our database.\n"
        "Please wait a few hours and refresh the leaderboard to see your score displayed."
    )
    format_log(success_str)
    return success_str


def on_filter_model_size_method_change():
    _, eval_dataframe_val, eval_dataframe_test = get_eval_data()
    # eval_dataframe_val = eval_dataframe_val[PRE_COL_NAMES + [f"{t} (%)" for t in selected_columns] + POST_COL_NAMES]
    eval_dataframe_test = eval_dataframe_test[PRE_COL_NAMES + [f"{t} (%)" for _, t in VISIBLE_TASKS] + POST_COL_NAMES]
    datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] + ["text"] + ["text"] + ["date"]
    # val_ldb = gr.components.Dataframe(
    #     value=eval_dataframe_val, datatype=datatypes, interactive=False, column_widths=["20%"]
    # )
    test_ldb = gr.components.Dataframe(
        value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"]
    )
    return test_ldb


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


if __name__ == "__main__":
    _, eval_dataframe_val, eval_dataframe_test = get_eval_data()
    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        with gr.Row():
            with gr.Accordion("π Citation", open=False):
                gr.Markdown(CITATION_BUTTON_LABEL)
                gr.Markdown(CITATION_BUTTON_TEXT)

        datatypes = ["markdown"] + ["number" for _ in VISIBLE_TASKS] + ["text"] + ["text"] + ["text"] + ["date"]
        with gr.Tab("Results: Test"):
            leaderboard_table_test = gr.components.Dataframe(
                value=eval_dataframe_test, datatype=datatypes, interactive=False, column_widths=["20%"]
            )

        refresh_button = gr.Button("Refresh")
        refresh_button.click(
            # print(task_filter)
            on_filter_model_size_method_change,
            # inputs=[VISIBLE_TASKS],
            # inputs=[],
            outputs=[
                # leaderboard_table_val,
                leaderboard_table_test,
            ],
        )

        with gr.Accordion("Submit a new model for evaluation"):
            with gr.Row():
                gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
            with gr.Row():
                with gr.Column():
                    # level_of_test = "test"
                    model_name_textbox = gr.Textbox(label="Model name")
                    model_url = gr.Textbox(label="Model URL")
                    model_type = gr.Dropdown(choices=["Open", "Closed"], label="Model Type")
                    # num_frames = gr.Textbox(label="Number of frames used")
                    llm_backbone_textbox = gr.Textbox(label="LLM Backbone")
                    vision_backbone_textbox = gr.Textbox(label="Vision Backbone")
                    # system_prompt_textbox = gr.Textbox(label="System prompt example")
                    # url_textbox = gr.Textbox(label="Url to model information")
                with gr.Column():
                    organization = gr.Textbox(label="Organization")
                    mail = gr.Textbox(label="Contact email")
                    file_output = gr.File()
            submission_result = gr.Textbox(label="Status")
            with gr.Row():
                with gr.Column():
                    gr.LoginButton()
                with gr.Column():
                    submit_button = gr.Button("Submit Eval")
            submit_button.click(
                add_new_eval,
                [
                    # level_of_test,
                    model_name_textbox,
                    vision_backbone_textbox,
                    llm_backbone_textbox,
                    model_url,
                    model_type,
                    # num_frames,
                    file_output,
                    organization,
                    mail,
                ],
                submission_result,
            )

    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=3600)
    scheduler.start()
    demo.launch(debug=True)