Spaces:

TIGER-Lab
/

MMEB-Leaderboard

Running

App Files Files Community

Fixed some errors

by MINGYISU - opened Dec 30, 2024

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+86

-50

Files changed (3) hide show

app.py +19 -20
results.csv +1 -1
utils.py +66 -29

app.py CHANGED Viewed

@@ -2,12 +2,11 @@ from utils import *
 global data_component
-def update_table(query, min_size, max_size, selected_subjects=None):
     df = get_df()
     filtered_df = search_and_filter_models(df, query, min_size, max_size)
-    if selected_subjects and len(selected_subjects) > 0:
-        base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
-        selected_columns = base_columns + selected_subjects
         filtered_df = filtered_df[selected_columns]
     return filtered_df
@@ -53,13 +52,13 @@ with gr.Blocks() as block:
                     label="Maximum number of parameters (B)",
                 )
-            subject_choices = [col for col in COLUMN_NAMES if col not in ['Models', 'Model Size(B)', 'Data Source', 'Overall', 'IND', 'OOD']]
             with gr.Row():
-                subjects_select = gr.CheckboxGroup(
-                    choices=subject_choices,
-                    value=subject_choices,
-                    label="Select Subjects to Display",
-                    elem_id="subjects-select"
                 )
             data_component = gr.components.Dataframe(
@@ -73,27 +72,27 @@ with gr.Blocks() as block:
             refresh_button = gr.Button("Refresh")
-            def update_with_subjects(*args):
                 return update_table(*args)
             search_bar.change(
-                fn=update_with_subjects,
-                inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
                 outputs=data_component
             )
             min_size_slider.change(
-                fn=update_with_subjects,
-                inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
                 outputs=data_component
             )
             max_size_slider.change(
-                fn=update_with_subjects,
-                inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
                 outputs=data_component
             )
-            subjects_select.change(
-                fn=update_with_subjects,
-                inputs=[search_bar, min_size_slider, max_size_slider, subjects_select],
                 outputs=data_component
             )
             refresh_button.click(fn=refresh_data, outputs=data_component)

 global data_component
def update_table(query, min_size, max_size, selected_tasks=None):
    """Return the leaderboard rows matching the current filter settings.

    The table is reloaded with ``get_df()`` and narrowed with
    ``search_and_filter_models`` using ``query``, ``min_size`` and
    ``max_size``. When ``selected_tasks`` is a non-empty list, only the
    ``BASE_COLS`` columns followed by the selected task columns are kept;
    otherwise every column is returned.
    """
    matches = search_and_filter_models(get_df(), query, min_size, max_size)
    if not selected_tasks:
        # Nothing selected (None or empty list): keep all columns.
        return matches
    return matches[BASE_COLS + selected_tasks]
                     label="Maximum number of parameters (B)",
                 )
+            task_choices = [col for col in COLUMN_NAMES if col not in BASE_COLS]
             with gr.Row():
+                tasks_select = gr.CheckboxGroup(
+                    choices=task_choices,
+                    value=task_choices,
+                    label="Select tasks to Display",
+                    elem_id="tasks-select"
                 )
             data_component = gr.components.Dataframe(
             refresh_button = gr.Button("Refresh")
+            def update_with_tasks(*args):
                 return update_table(*args)
             search_bar.change(
+                fn=update_with_tasks,
+                inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
                 outputs=data_component
             )
             min_size_slider.change(
+                fn=update_with_tasks,
+                inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
                 outputs=data_component
             )
             max_size_slider.change(
+                fn=update_with_tasks,
+                inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
                 outputs=data_component
             )
+            tasks_select.change(
+                fn=update_with_tasks,
+                inputs=[search_bar, min_size_slider, max_size_slider, tasks_select],
                 outputs=data_component
             )
             refresh_button.click(fn=refresh_data, outputs=data_component)

results.csv CHANGED Viewed

@@ -12,4 +12,4 @@ OpenCLIP-FFT,unk,unk,47.2,50.5,43.1,56.0,21.9,55.4,64.1
 VLM2Vec (Phi-3.5-V-FFT),unk,TIGER-Lab,55.9,62.8,47.4,52.8,50.3,57.8,72.3
 VLM2Vec (Phi-3.5-V-LoRA),unk,TIGER-Lab,60.1,66.5,52.0,54.8,54.9,62.3,79.5
 VLM2Vec (LLaVA-1.6-LoRA-LowRes),unk,TIGER-Lab,55.0,61.0,47.5,54.7,50.3,56.2,64.0
-VLM2Vec (LLaVA-1.6-LoRA-HighRes),unk,TIGER-Lab,62.9,67.5,57.1,61.2,49.9,67.4,86.1

 VLM2Vec (Phi-3.5-V-FFT),unk,TIGER-Lab,55.9,62.8,47.4,52.8,50.3,57.8,72.3
 VLM2Vec (Phi-3.5-V-LoRA),unk,TIGER-Lab,60.1,66.5,52.0,54.8,54.9,62.3,79.5
 VLM2Vec (LLaVA-1.6-LoRA-LowRes),unk,TIGER-Lab,55.0,61.0,47.5,54.7,50.3,56.2,64.0
+VLM2Vec (LLaVA-1.6-LoRA-HighRes),unk,TIGER-Lab,62.9,67.5,57.1,61.2,49.9,67.4,86.1

utils.py CHANGED Viewed

@@ -3,12 +3,14 @@ import gradio as gr
 import csv
 import json
 import os
 import shutil
 from huggingface_hub import Repository
 HF_TOKEN = os.environ.get("HF_TOKEN")
-SUBJECTS = ["Classification", "VQA", "Retrieval", "Grounding"]
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
@@ -16,27 +18,54 @@ MODEL_INFO = [
     "Classification", "VQA", "Retrieval", "Grounding"
 ]
 DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
-# TODO: submission process not implemented yet
-SUBMISSION_NAME = ""
-SUBMISSION_URL = ""
-CSV_DIR = "results.csv" # TODO: Temporary file, to be updated with the actual file
 COLUMN_NAMES = MODEL_INFO
-LEADERBOARD_INTRODUCTION = """# MMEB Leaderboard
 ## Introduction
-We introduce MMEB, a benchmark for multimodal evaluation of models. The benchmark consists of four tasks: Classification, VQA, Retrieval, and Grounding. Models are evaluated based on 36 datasets.
 """
 TABLE_INTRODUCTION = """"""
 LEADERBOARD_INFO = """
 ## Dataset Summary
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -63,46 +92,52 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 """
 def get_df():
-    # TODO: Update this after the hf dataset has been created!
-    # repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
-    # repo.git_pull()
-    df = pd.read_csv(CSV_DIR)
     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
     df = df.sort_values(by=['Overall'], ascending=False)
     return df
-def add_new_eval(
-    input_file,
-):
     if input_file is None:
         return "Error! Empty file!"
     upload_data = json.loads(input_file)
     print("upload_data:\n", upload_data)
-    data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
     print("data_row:\n", data_row)
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
-                                 use_auth_token=HF_TOKEN, repo_type="dataset")
     submission_repo.git_pull()
     already_submitted = []
     with open(CSV_DIR, mode='r') as file:
         reader = csv.reader(file, delimiter=',')
         for row in reader:
             already_submitted.append(row[0])
     if data_row[0] not in already_submitted:
         with open(CSV_DIR, mode='a', newline='') as file:
             writer = csv.writer(file)
             writer.writerow(data_row)
         submission_repo.push_to_hub()
         print('Submission Successful')
     else:
-        print('The entry already exists')
 def refresh_data():
     df = get_df()
@@ -154,7 +189,9 @@ def search_models(df, query):
 def get_size_range(df):
-    sizes = df['Model Size(B)'].apply(lambda x: 1000.0 if x == 'unknown' else x)
     return float(sizes.min()), float(sizes.max())
@@ -168,16 +205,16 @@ def process_model_size(size):
         return 'unknown'
-def filter_columns_by_subjects(df, selected_subjects=None):
-    if selected_subjects is None or len(selected_subjects) == 0:
         return df[COLUMN_NAMES]
     base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
-    selected_columns = base_columns + selected_subjects
     available_columns = [col for col in selected_columns if col in df.columns]
     return df[available_columns]
-def get_subject_choices():
-    return SUBJECTS

 import csv
 import json
 import os
+import requests
+import io
 import shutil
 from huggingface_hub import Repository
 HF_TOKEN = os.environ.get("HF_TOKEN")
+TASKS = ["Classification", "VQA", "Retrieval", "Grounding"]
 MODEL_INFO = [
     "Models", "Model Size(B)", "Data Source",
     "Classification", "VQA", "Retrieval", "Grounding"
 ]
+BASE_COLS = [col for col in MODEL_INFO if col not in TASKS]
 DATA_TITLE_TYPE = ['markdown', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
# Name of the Space; also used as the last path segment of SUBMISSION_URL.
SUBMISSION_NAME = "MMEB"
# Built with string formatting: os.path.join is a filesystem-path API whose
# separator handling is platform specific, so it is the wrong tool for URLs.
SUBMISSION_URL = f"https://huggingface.co/spaces/TIGER-Lab/{SUBMISSION_NAME}"
# Base name of the results file.
FILE_NAME = "results.csv"
# Path of the results CSV relative to the current working directory.
CSV_DIR = "./results.csv"
 COLUMN_NAMES = MODEL_INFO
+LEADERBOARD_INTRODUCTION = """
+# MMEB Leaderboard
 ## Introduction
+We introduce a novel benchmark, MMEB (Massive Multimodal Embedding Benchmark),
+which includes 36 datasets spanning four meta-task categories: classification, visual question answering, retrieval, and visual grounding. MMEB provides a comprehensive framework for training
+and evaluating embedding models across various combinations of text and image modalities.
+All tasks are reformulated as ranking tasks, where the model follows instructions, processes a query, and selects the correct target from a set of candidates. The query and target can be an image, text,
+or a combination of both. MMEB is divided into 20 in-distribution datasets, which can be used for
+training, and 16 out-of-distribution datasets, reserved for evaluation.
+The detailed explanation of the benchmark and datasets can be found in our paper: https://doi.org/10.48550/arXiv.2410.05160.
 """
 TABLE_INTRODUCTION = """"""
 LEADERBOARD_INFO = """
 ## Dataset Summary
+MMEB is organized into four primary meta-task categories:
+- **Classification**: This category comprises 5 in-distribution and 5 out-of-distribution datasets. Queries
+consist of instructions and images, optionally accompanied by related text. Targets are class labels,
+and the number of class labels corresponds to the number of classes in the dataset. \n
+        - IND: ImageNet-1k, N24News, HatefulMemes, VOC2007, SUN397 \n
+        - OOD: Place365, ImageNet-A, ImageNet-R, ObjectNet, Country-211 \n
+- **Visual Question Answering**: This category includes 6 in-distribution and 4 out-of-distribution
+datasets. The query consists of an instruction, an image, and a piece of text as the question, while
+the target is the answer. Each query has 1,000 target candidates: 1 ground truth and 999 distractors. \n
+        - IND: OK-VQA, A-OKVQA, DocVQA, InfographicVQA, ChartQA, Visual7W \n
+        - OOD: ScienceQA, VizWiz, GQA, TextVQA \n
+- **Information Retrieval**: This category contains 8 in-distribution and 4 out-of-distribution datasets.
+Both the query and target sides can involve a combination of text, images, and instructions. Similar
+to the VQA task, each query has 1,000 candidates, with 1 ground truth and 999 distractors. \n
+        - IND: VisDial, CIRR, VisualNews_t2i, VisualNews_i2t, MSCOCO_t2i, MSCOCO_i2t, NIGHTS, WebQA \n
+        - OOD: OVEN, FashionIQ, EDIS, Wiki-SS-NQ \n
+- **Visual Grounding**: This category includes 1 in-distribution and 3 out-of-distribution datasets, which are adapted from object detection tasks. Queries consist of an instruction, an image, and text referring to a specific region or object within the image. The target may include a cropped image of the object or text describing the same region. Each query includes 1,000 candidates: 1 ground truth and 999 distractors. These distractors may include hard negatives from the same object class, other objects in the image, or random objects from different images. \n
+        - IND: MSCOCO \n
+        - OOD: Visual7W-Pointing, RefCOCO, RefCOCO-Matching \n
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 """
 def get_df():
     """Load the leaderboard table, sorted by 'Overall' in descending order.

     The results CSV is downloaded from the TIGER-Lab/MMEB Space. On a
     successful download (HTTP 200) the file at ``CSV_DIR`` is overwritten
     with the fresh data. If the request fails or returns any other status,
     the problem is printed and the existing file at ``CSV_DIR`` is read
     instead, so a transient outage no longer terminates the whole process.

     Returns:
         A DataFrame whose 'Model Size(B)' values have been passed through
         ``process_model_size``.

     Raises:
         Whatever ``pd.read_csv`` raises if the download failed and no
         readable file exists at ``CSV_DIR``.
     """
     url = "https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/results.csv"
     # Only authenticate when a token is configured; otherwise the header
     # would literally read "Bearer None".
     headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

     df = None
     try:
         # A timeout keeps a stalled connection from hanging the caller.
         response = requests.get(url, headers=headers, timeout=30)
     except requests.RequestException as err:
         print(f"Error: could not fetch leaderboard data ({err})")
     else:
         if response.status_code == 200:
             df = pd.read_csv(io.StringIO(response.text))
             df.to_csv(CSV_DIR, index=False)  # update local file
         else:
             print(f"Error: {response.status_code}")

     if df is None:
         # Fall back to the last locally stored copy.
         df = pd.read_csv(CSV_DIR)

     df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
     df = df.sort_values(by=['Overall'], ascending=False)
     return df
def add_new_eval(input_file):
    """Validate an uploaded result and append it to the leaderboard CSV.

    Args:
        input_file: JSON text describing one model. It must be an object
            with the keys "Model", "Overall", "Model Size(B)", "IND", "OOD"
            and one key per entry of ``TASKS``.

    Returns:
        An "Error! ..." string when the input is missing, is not a valid
        JSON object, or lacks a required key. Otherwise ``None``; the
        outcome (submitted or already present) is printed.
    """
    if input_file is None:
        return "Error! Empty file!"

    # Load the input json file
    try:
        upload_data = json.loads(input_file)
    except (TypeError, ValueError) as err:
        # json.JSONDecodeError is a ValueError subclass.
        return f"Error! Invalid JSON file: {err}"
    if not isinstance(upload_data, dict):
        return "Error! Invalid JSON file: expected a JSON object!"
    print("upload_data:\n", upload_data)

    if "Model" not in upload_data:
        return "Error! Missing Model column!"
    for col in ['Overall', 'Model Size(B)', 'IND', 'OOD'] + TASKS:
        if col not in upload_data:
            return f"Error! Missing {col} column!"

    # The row follows the CSV column order: model, size, data source,
    # overall, IND, OOD, then the per-task scores. Writing 'Overall' directly
    # after the model name would shift every value one column to the left.
    # NOTE(review): 'Data Source' was never collected from submissions; it is
    # left blank when absent -- confirm whether uploads should provide it.
    data_row = [
        f'{upload_data["Model"]}',
        upload_data['Model Size(B)'],
        upload_data.get('Data Source', ''),
        upload_data['Overall'],
        upload_data['IND'],
        upload_data['OOD'],
    ]
    data_row += [upload_data[task] for task in TASKS]
    print("data_row:\n", data_row)

    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="space")
    submission_repo.git_pull()

    # Track submitted models
    # NOTE(review): the CSV is opened via CSV_DIR while the repository is
    # cloned into SUBMISSION_NAME; confirm CSV_DIR points inside that clone,
    # otherwise push_to_hub() has no change to commit.
    with open(CSV_DIR, mode='r', newline='') as file:
        # Blank lines yield empty rows; skip them instead of indexing row[0].
        already_submitted = {row[0] for row in csv.reader(file, delimiter=',') if row}

    # if not in the existing models list, add it to the csv file
    if data_row[0] in already_submitted:
        print('The model already exists in the leaderboard!')
        return None

    with open(CSV_DIR, mode='a', newline='') as file:
        csv.writer(file).writerow(data_row)
    submission_repo.push_to_hub()
    print('Submission Successful')
    return None
 def refresh_data():
     df = get_df()
 def get_size_range(df):
     """Return ``(smallest, largest)`` model size found in ``df``.

     Entries of the 'Model Size(B)' column equal to 'unknown' are counted
     as 0.0. When every entry ends up as 0.0 the fixed range
     ``(0.0, 1000.0)`` is returned instead.
     """
     def _as_number(size):
         # 'unknown' has no numeric value; treat it as zero.
         if size == 'unknown':
             return 0.0
         return size

     numeric_sizes = df['Model Size(B)'].apply(_as_number)
     has_known_size = not (numeric_sizes == 0.0).all()
     if has_known_size:
         return float(numeric_sizes.min()), float(numeric_sizes.max())
     return 0.0, 1000.0
         return 'unknown'
def filter_columns_by_tasks(df, selected_tasks=None):
    """Return ``df`` restricted to the base columns plus the selected tasks.

    Args:
        df: DataFrame holding the leaderboard columns.
        selected_tasks: Iterable of task column names (list, tuple, ...).
            ``None`` or an empty selection returns ``df[COLUMN_NAMES]``.

    Returns:
        A DataFrame with the base columns followed by the selected task
        columns. Names missing from ``df`` are skipped and a name is never
        included twice.
    """
    if selected_tasks is None:
        return df[COLUMN_NAMES]
    # Materialise once so tuples and other iterables can be concatenated.
    selected_tasks = list(selected_tasks)
    if not selected_tasks:
        return df[COLUMN_NAMES]

    # NOTE(review): update_table builds its column list from BASE_COLS;
    # confirm whether this hard-coded list is meant to match it.
    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'Overall']
    # dict.fromkeys removes duplicates while preserving order, so selecting a
    # name that is already a base column does not produce a repeated column.
    selected_columns = dict.fromkeys(base_columns + selected_tasks)
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
def get_task_choices():
    """Return the module-level ``TASKS`` list (the same object, not a copy)."""
    return TASKS