kennymckormick committed
Commit 044f86f · 1 Parent(s): 746f6aa

update lb_info.py

Files changed: lb_info.py  (+18 -1)
lb_info.py CHANGED

@@ -32,7 +32,7 @@ This leaderboard was last updated: {}.
 """
 # CONSTANTS-FIELDS
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
-MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']
+MAIN_FIELDS = ['MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench', 'MME', 'SEEDBench_IMG', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench', 'AI2D_TEST']
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
 MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
@@ -122,6 +122,23 @@ LEADERBOARD_MD['LLaVABench'] = """
 - We also include the official results (obtained by gpt-4-0314) for applicable models.
 """
 
+LEADERBOARD_MD['COCO_VAL'] = """
+## COCO Caption Results
+
+- By default, we evaluate the COCO Caption validation set (5,000 samples) and report the following metrics: `BLEU-1, BLEU-4, CIDEr, ROUGE-L`.
+- We use the following prompt to evaluate all VLMs: `Please describe this image in general. Directly provide the description, do not include prefix like "This image depicts". `
+- **The same prompt is used for every VLM; no model-specific prompt is adopted.**
+"""
+
+LEADERBOARD_MD['ScienceQA_VAL'] = """
+## ScienceQA Evaluation Results
+
+- We benchmark the **image** subset of the ScienceQA validation and test sets, and report the Top-1 accuracy.
+- During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs whenever the choice cannot be extracted via heuristic matching. **Zero-shot** inference is adopted.
+"""
+
+LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
+
 from urllib.request import urlopen
 
 def load_results():
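The COCO_VAL entry added above reports `BLEU-1`, `BLEU-4`, `CIDEr`, and `ROUGE-L` over generated captions. As a rough illustration of how such caption metrics are typically computed, here is a minimal sketch using the `pycocoevalcap` package; the library choice and the toy captions are assumptions for illustration, not necessarily the pipeline behind this leaderboard.

```python
# Minimal sketch of caption scoring with pycocoevalcap (an assumed choice of
# library, not necessarily what this leaderboard uses under the hood).
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.rouge.rouge import Rouge

# Toy data: image id -> list of reference captions / list with one model caption.
# A real run would cover the 5,000-sample COCO Caption validation split.
gts = {"0": ["a cat sitting on a wooden table", "a small cat rests on a table"]}
res = {"0": ["a cat sits on a table"]}

bleu, _ = Bleu(4).compute_score(gts, res)   # [BLEU-1, BLEU-2, BLEU-3, BLEU-4]
cider, _ = Cider().compute_score(gts, res)
rouge, _ = Rouge().compute_score(gts, res)

print(f"BLEU-1 {bleu[0]:.3f}  BLEU-4 {bleu[3]:.3f}  CIDEr {cider:.3f}  ROUGE-L {rouge:.3f}")
```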
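The ScienceQA entries describe a two-step answer-parsing scheme: recover the chosen option heuristically, and fall back to an LLM extractor (`GPT-3.5-Turbo-0613`) only when heuristic matching fails. Below is a minimal sketch of that control flow with the LLM call left as a stub; the helper names and regular expression are illustrative assumptions, not the leaderboard's actual code.

```python
import re
from typing import Optional

def heuristic_choice(prediction: str, choices: dict) -> Optional[str]:
    """Try to recover an option letter (e.g. 'A'-'D') from raw model output."""
    text = prediction.strip()
    # Case 1: the reply is, or starts with, a bare option letter such as "B" or "(B) ...".
    m = re.match(r"\(?([A-D])\)?(?:$|[\s.,:)])", text, flags=re.IGNORECASE)
    if m:
        return m.group(1).upper()
    # Case 2: the reply contains exactly one option's text verbatim.
    hits = [k for k, v in choices.items() if v.lower() in text.lower()]
    return hits[0] if len(hits) == 1 else None

def llm_extract_choice(prediction: str, choices: dict) -> str:
    """Placeholder for the GPT-3.5-Turbo-0613 fallback mentioned in the leaderboard notes."""
    raise NotImplementedError("call the LLM judge here")

def extract_choice(prediction: str, choices: dict) -> str:
    letter = heuristic_choice(prediction, choices)
    return letter if letter is not None else llm_extract_choice(prediction, choices)

choices = {"A": "photosynthesis", "B": "respiration", "C": "digestion", "D": "evaporation"}
print(extract_choice("The answer is (B) respiration.", choices))  # -> B
```

Top-1 accuracy is then simply the fraction of questions for which the extracted letter equals the ground-truth option.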
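The unchanged context at the end of the hunk (`from urllib.request import urlopen` followed by `def load_results():`) suggests the leaderboard data is fetched over HTTP; the function body itself is not part of this diff. Purely as an illustration of that pattern, here is a hypothetical sketch in which the URL and JSON layout are invented placeholders, not the Space's real endpoint.

```python
import json
from urllib.request import urlopen

# Hypothetical sketch only: load_results()'s real body is outside this diff, and
# RESULTS_URL is a made-up placeholder, not the Space's actual data source.
RESULTS_URL = "https://example.com/leaderboard/results.json"

def load_results(url: str = RESULTS_URL) -> dict:
    """Fetch leaderboard results as a dict from a JSON endpoint."""
    with urlopen(url) as resp:
        return json.loads(resp.read().decode("utf-8"))
```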