integrate v1 scores into v2
- app.py +22 -6
- utils.py +3 -19
- utils_v2.py +6 -4
app.py
CHANGED
@@ -11,12 +11,9 @@ def update_table(query, min_size, max_size, selected_tasks=None):
         filtered_df = filtered_df[selected_columns]
     return filtered_df
 
-def update_table_v2(query, min_size, max_size, selected_tasks=None):
+def update_table_v2(query, min_size, max_size):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
-    if selected_tasks and len(selected_tasks) > 0:
-        selected_columns = v2.BASE_COLS + selected_tasks
-        filtered_df = filtered_df[selected_columns]
     return filtered_df
 
 with gr.Blocks() as block:
@@ -42,6 +39,7 @@ with gr.Blocks() as block:
         elem_id="search-bar"
     )
 
+    df = get_df()
    df2 = v2.get_df()
    min_size2, max_size2 = get_size_range(df2)
 
@@ -92,11 +90,30 @@ with gr.Blocks() as block:
    )
    refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
 
+
+    def get_special_processed_df2():
+        """Temporary special processing to merge v1 scores with v2 image scores.
+        Will be removed later after v2 is fully adopted."""
+        df2_i = df2[v2.COLUMN_NAMES_I]
+        df1 = df.rename(columns={'V1-Overall': 'Image-Overall'})
+        df1 = df1[v2.BASE_COLS + v2.SUB_TASKS_I + ['Image-Overall']]
+        df1['Models'] = df1['Models'].apply(lambda x: x + ' (V1)' if not x.endswith(' (V1)') else x)
+        combined_df = pd.concat([df1, df2_i], ignore_index=True)
+        for task in v2.TASKS_I:
+            combined_df[task] = combined_df[task].apply(lambda score: '-' if pd.isna(score) else score)
+        combined_df = v2.rank_models(combined_df, 'Image-Overall')
+        return combined_df[v2.COLUMN_NAMES_I]
+
    # table 2, image scores only
    with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
        gr.Markdown(v2.TABLE_INTRODUCTION_I)
+        df2_i = get_special_processed_df2()
+        print('load completed')
+        print(df2_i.shape)
+        print(df2_i.head())
+        print(df2_i.columns)
        data_component3 = gr.components.Dataframe(
-            value=
+            value=df2_i,
            headers=v2.COLUMN_NAMES_I,
            type="pandas",
            datatype=v2.DATA_TITLE_TYPE_I,
@@ -160,7 +177,6 @@ with gr.Blocks() as block:
        elem_id="search-bar"
    )
 
-    df = get_df()
    min_size, max_size = get_size_range(df)
 
    with gr.Row():
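The heart of this change is `get_special_processed_df2`, which grafts the deprecated v1 leaderboard onto the v2 Image tab. Below is a minimal, self-contained sketch of the same pandas pattern; the toy frames and stand-in constants are invented for illustration (the real `BASE_COLS` and `COLUMN_NAMES_I` live in utils_v2.py):

```python
import pandas as pd

# Stand-ins for v2.BASE_COLS / v2.COLUMN_NAMES_I; values invented for illustration.
BASE_COLS = ['Rank', 'Models']
COLUMN_NAMES_I = BASE_COLS + ['Image-Overall', 'I-CLS', 'ImageNet']

# A v1 row carries its overall score as 'V1-Overall' and has no value
# for the v2-only 'ImageNet' column.
df1 = pd.DataFrame({'Models': ['clip'], 'V1-Overall': [61.2], 'I-CLS': [55.0]})
df2_i = pd.DataFrame({'Models': ['vlm2vec'], 'Image-Overall': [67.5],
                      'I-CLS': [60.1], 'ImageNet': [70.2]})

# 1. Align v1 columns with the v2 image tab.
df1 = df1.rename(columns={'V1-Overall': 'Image-Overall'})
# 2. Tag v1 rows so they stay distinguishable after the merge
#    (idempotent, matching the endswith guard in the diff).
df1['Models'] = df1['Models'].apply(lambda x: x if x.endswith(' (V1)') else x + ' (V1)')
# 3. Stack the frames; columns missing on either side become NaN.
combined = pd.concat([df1, df2_i], ignore_index=True)
# 4. Render missing scores as '-' for display.
for col in ['Image-Overall', 'I-CLS', 'ImageNet']:
    combined[col] = combined[col].apply(lambda s: '-' if pd.isna(s) else s)
# 5. Sort and number, as v2.rank_models does.
combined = combined.sort_values('Image-Overall', ascending=False).reset_index(drop=True)
combined['Rank'] = range(1, len(combined) + 1)

print(combined[COLUMN_NAMES_I])  # 'clip (V1)' shows '-' under ImageNet
```

One thing to watch: because the '-' substitution runs before ranking, the score columns become mixed float/str, and the sort inside rank_models only stays safe because 'Image-Overall' is populated for every row.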
utils.py
CHANGED
@@ -38,8 +38,8 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
 
-TABLE_INTRODUCTION = """***Important Notes
-
+TABLE_INTRODUCTION = """***Important Notes:***
+This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into the MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaboration and understanding! \n"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
@@ -103,23 +103,7 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
     }
 }
 ```
-
-### **TO SUBMIT V1 ONLY (Depreciated, but we still accept this format until 2025-06-30)**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
-    },
-]
-```
+Note: We still accept the old format until 2025-06-30; after that we will no longer support it, so please follow the new format for your submission. \n
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at m7su@uwaterloo.ca, including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
utils_v2.py
CHANGED
@@ -44,12 +44,14 @@ COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
     ['number'] * 3
 
-
+SUB_TASKS_I = TASKS[1:5]
+TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
     ['number'] * (len(TASKS_I) + 4)
 
-
+SUB_TASKS_V = TASKS[6:10]
+TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
     ['number'] * (len(TASKS_V) + 4)
@@ -147,10 +149,10 @@ def generate_model_row(data):
     row.update(scores)
     return row
 
-def rank_models(df, column='Overall'):
+def rank_models(df, column='Overall', rank_name='Rank'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
-    df['Rank'] = range(1, len(df) + 1)
+    df[rank_name] = range(1, len(df) + 1)
     return df
 
 def get_df():
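Two details worth flagging here: the new SUB_TASKS_I and SUB_TASKS_V slices hard-code where the image and video sub-tasks sit inside TASKS (indices 1-4 and 6-9 respectively), so reordering TASKS would silently change both tabs; and rank_models now takes a rank_name parameter, with existing call sites unchanged because it defaults to 'Rank'. A quick usage sketch of the generalized rank_models (the toy frame is invented for illustration):

```python
import pandas as pd

def rank_models(df, column='Overall', rank_name='Rank'):
    """Ranks the models based on the specified score column."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df

# Toy leaderboard frame.
df = pd.DataFrame({'Models': ['a', 'b', 'c'],
                   'Overall': [58.1, 63.4, 49.9],
                   'Image-Overall': [60.0, 61.5, 52.3]})

print(rank_models(df))  # default: sort by 'Overall', write ranks to 'Rank'
print(rank_models(df, column='Image-Overall', rank_name='Image-Rank'))
```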