integrate v1 scores into v2
- app.py +22 -6
- utils.py +3 -19
- utils_v2.py +6 -4
app.py
CHANGED
@@ -11,12 +11,9 @@ def update_table(query, min_size, max_size, selected_tasks=None):
         filtered_df = filtered_df[selected_columns]
     return filtered_df
 
-def update_table_v2(query, min_size, max_size, selected_tasks=None):
+def update_table_v2(query, min_size, max_size):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
-    if selected_tasks and len(selected_tasks) > 0:
-        selected_columns = v2.BASE_COLS + selected_tasks
-        filtered_df = filtered_df[selected_columns]
     return filtered_df
 
 with gr.Blocks() as block:
@@ -42,6 +39,7 @@ with gr.Blocks() as block:
         elem_id="search-bar"
     )
 
+    df = get_df()
    df2 = v2.get_df()
    min_size2, max_size2 = get_size_range(df2)
 
@@ -92,11 +90,30 @@ with gr.Blocks() as block:
    )
    refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
 
+
+    def get_special_processed_df2():
+        """Temporary special processing to merge v1 scores with v2 image scores.
+        Will be removed later after v2 is fully adopted."""
+        df2_i = df2[v2.COLUMN_NAMES_I]
+        df1 = df.rename(columns={'V1-Overall': 'Image-Overall'})
+        df1 = df1[v2.BASE_COLS + v2.SUB_TASKS_I + ['Image-Overall']]
+        df1['Models'] = df1['Models'].apply(lambda x: x + ' (V1)' if not x.endswith(' (V1)') else x)
+        combined_df = pd.concat([df1, df2_i], ignore_index=True)
+        for task in v2.TASKS_I:
+            combined_df[task] = combined_df[task].apply(lambda score: '-' if pd.isna(score) else score)
+        combined_df = v2.rank_models(combined_df, 'Image-Overall')
+        return combined_df[v2.COLUMN_NAMES_I]
+
    # table 2, image scores only
    with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
        gr.Markdown(v2.TABLE_INTRODUCTION_I)
+        df2_i = get_special_processed_df2()
+        print('load completed')
+        print(df2_i.shape)
+        print(df2_i.head())
+        print(df2_i.columns)
        data_component3 = gr.components.Dataframe(
-            value=
+            value=df2_i,
            headers=v2.COLUMN_NAMES_I,
            type="pandas",
            datatype=v2.DATA_TITLE_TYPE_I,
@@ -160,7 +177,6 @@ with gr.Blocks() as block:
        elem_id="search-bar"
    )
 
-    df = get_df()
    min_size, max_size = get_size_range(df)
 
    with gr.Row():
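The heart of this change is `get_special_processed_df2`, which grafts the deprecated v1 leaderboard onto the v2 Image tab. Below is a minimal, self-contained sketch of the same pandas pattern; the toy frames and stand-in constants are invented for illustration (the real `BASE_COLS` and `COLUMN_NAMES_I` live in utils_v2.py):

```python
import pandas as pd

# Stand-ins for v2.BASE_COLS / v2.COLUMN_NAMES_I; values invented for illustration.
BASE_COLS = ['Rank', 'Models']
COLUMN_NAMES_I = BASE_COLS + ['Image-Overall', 'I-CLS', 'ImageNet']

# A v1 row carries its overall score as 'V1-Overall' and has no value
# for the v2-only 'ImageNet' column.
df1 = pd.DataFrame({'Models': ['clip'], 'V1-Overall': [61.2], 'I-CLS': [55.0]})
df2_i = pd.DataFrame({'Models': ['vlm2vec'], 'Image-Overall': [67.5],
                      'I-CLS': [60.1], 'ImageNet': [70.2]})

# 1. Align v1 columns with the v2 image tab.
df1 = df1.rename(columns={'V1-Overall': 'Image-Overall'})
# 2. Tag v1 rows so they stay distinguishable after the merge
#    (idempotent, matching the endswith guard in the diff).
df1['Models'] = df1['Models'].apply(lambda x: x if x.endswith(' (V1)') else x + ' (V1)')
# 3. Stack the frames; columns missing on either side become NaN.
combined = pd.concat([df1, df2_i], ignore_index=True)
# 4. Render missing scores as '-' for display.
for col in ['Image-Overall', 'I-CLS', 'ImageNet']:
    combined[col] = combined[col].apply(lambda s: '-' if pd.isna(s) else s)
# 5. Sort and number, as v2.rank_models does.
combined = combined.sort_values('Image-Overall', ascending=False).reset_index(drop=True)
combined['Rank'] = range(1, len(combined) + 1)

print(combined[COLUMN_NAMES_I])  # 'clip (V1)' shows '-' under ImageNet
```

One thing to watch: because the '-' substitution runs before ranking, the score columns become mixed float/str, and the sort inside rank_models only stays safe because 'Image-Overall' is populated for every row.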
utils.py
CHANGED
@@ -38,8 +38,8 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
 
-TABLE_INTRODUCTION = """***Important Notes
-
+TABLE_INTRODUCTION = """***Important Notes:***
+This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into the MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your collaboration and understanding! \n"""
 
 LEADERBOARD_INFO = """
 ## Dataset Summary
@@ -103,23 +103,7 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
     }
 }
 ```
-
-### **TO SUBMIT V1 ONLY (Depreciated, but we still accept this format until 2025-06-30)**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
-    },
-]
-```
+Note: We still accept the old format until 2025-06-30; after that we will no longer support it, so please follow the new format for your submission. \n
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at m7su@uwaterloo.ca, including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
utils_v2.py
CHANGED
@@ -44,12 +44,14 @@ COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
     ['number'] * 3
 
-
+SUB_TASKS_I = TASKS[1:5]
+TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
     ['number'] * (len(TASKS_I) + 4)
 
-
+SUB_TASKS_V = TASKS[6:10]
+TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
     ['number'] * (len(TASKS_V) + 4)
@@ -147,10 +149,10 @@ def generate_model_row(data):
     row.update(scores)
     return row
 
-def rank_models(df, column='Overall'):
+def rank_models(df, column='Overall', rank_name='Rank'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
-    df['Rank'] = range(1, len(df) + 1)
+    df[rank_name] = range(1, len(df) + 1)
     return df
 
 def get_df():
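Two details worth flagging here: the new SUB_TASKS_I and SUB_TASKS_V slices hard-code where the image and video sub-tasks sit inside TASKS (indices 1-4 and 6-9 respectively), so reordering TASKS would silently change both tabs; and rank_models now takes a rank_name parameter, with existing call sites unchanged because it defaults to 'Rank'. A quick usage sketch of the generalized rank_models (the toy frame is invented for illustration):

```python
import pandas as pd

def rank_models(df, column='Overall', rank_name='Rank'):
    """Ranks the models based on the specified score column."""
    df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
    df[rank_name] = range(1, len(df) + 1)
    return df

# Toy leaderboard frame.
df = pd.DataFrame({'Models': ['a', 'b', 'c'],
                   'Overall': [58.1, 63.4, 49.9],
                   'Image-Overall': [60.0, 61.5, 52.3]})

print(rank_models(df))  # default: sort by 'Overall', write ranks to 'Rank'
print(rank_models(df, column='Image-Overall', rank_name='Image-Rank'))
```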