Spaces:

MERaLiON
/

SeaEval_Leaderboard

Running

App Files Community

zhuohan-7 committed on Dec 13, 2024

Commit

917eff6

verified ·

1 Parent(s): ed4682d

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

app/__pycache__/draw_diagram.cpython-310.pyc +0 -0
app/__pycache__/pages.cpython-310.pyc +0 -0
app/draw_diagram.py +5 -1
app/pages.py +62 -50

app/__pycache__/draw_diagram.cpython-310.pyc CHANGED Viewed

Binary files a/app/__pycache__/draw_diagram.cpython-310.pyc and b/app/__pycache__/draw_diagram.cpython-310.pyc differ

app/__pycache__/pages.cpython-310.pyc CHANGED Viewed

Binary files a/app/__pycache__/pages.cpython-310.pyc and b/app/__pycache__/pages.cpython-310.pyc differ

app/draw_diagram.py CHANGED Viewed

@@ -104,7 +104,11 @@ def draw(folder_name, category_one, category_two, sort, num_sort, model_size_ran
                 ascending=False
             ).reset_index(drop=True)
-        styled_df = chart_data_table.style.highlight_max(
             subset=[chart_data_table.columns[2]], color='yellow'
         )

                 ascending=False
             ).reset_index(drop=True)
+        styled_df = chart_data_table.style.format(
+            {
+                chart_data_table.columns[i]: "{:.3f}" for i in range(2, len(chart_data_table.columns))
+             }
+        ).highlight_max(
             subset=[chart_data_table.columns[2]], color='yellow'
         )

app/pages.py CHANGED Viewed

@@ -11,15 +11,21 @@ def dashboard():
             [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
             """)
-    st.markdown("#### News")
-    st.markdown("Dec, 2024: Update Cross-MMLU, Cross-LogiQA, Cross-XQuad, MMLU, IndoMMLU, SG-Eval-v2 results with new prompts (simple prompts to encourage reasoning).")
-    st.markdown("Dec, 2024: New models added: SEA-LION v3, Gemma-2, Sailor 2")
-    st.markdown("Nov, 2024: Update layout and support comparison between models with similar model sizes.")
     st.divider()
-    seaeval_url = "https://seaeval.github.io/"
-    st.markdown("#### What is [SeaEval](%s)?" % seaeval_url)
     with st.container():
         left_co, cent_co,last_co = st.columns(3)
@@ -64,13 +70,14 @@ def dashboard():
         st.markdown("##### Citations")
         st.markdown('''
-                    :round_pushpin: SeaEval Paper \n
                         @article{SeaEval,
                         title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
                         author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
                         journal={NAACL},
                         year={2024}
                         }
                     ''')
@@ -80,11 +87,8 @@ def cross_lingual_consistency():
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = [
                         'Cross-MMLU',
-                        #'Cross-MMLU-No-Prompt',
                         'Cross-XQUAD',
-                        #'Cross-XQUAD-No-Prompt',
                         'Cross-LogiQA',
-                        #'Cross-LogiQA-No-Prompt',
                         ]
     category_one_dict = {
@@ -94,11 +98,8 @@ def cross_lingual_consistency():
     category_two_dict = {
                         'Cross-MMLU'            : 'cross_mmlu_no_prompt',
-                        #'Cross-MMLU-No-Prompt'  : 'cross_mmlu_no_prompt',
                         'Cross-XQUAD'           : 'cross_xquad_no_prompt',
-                        #'Cross-XQUAD-No-Prompt' : 'cross_xquad_no_prompt',
                         'Cross-LogiQA'          : 'cross_logiqa_no_prompt',
-                        #'Cross-LogiQA-No-Prompt': 'cross_logiqa_no_prompt',
                          }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -128,7 +129,6 @@ def cultural_reasoning():
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = [
                         'SG-EVAL-v2-MCQ',
-                        #'SG EVAL V2 MCQ No Prompt',
                         'SG-EVAL-v2-Open-Ended',
                         'SG-EVAL-v1-Cleaned',
                         'SG-EVAL-v1',
@@ -145,7 +145,6 @@ def cultural_reasoning():
                          'SG-EVAL-v2-MCQ'          : 'sg_eval_v2_mcq_no_prompt',
                          'SG-EVAL-v1'              : 'sg_eval',
                          'SG-EVAL-v1-Cleaned'      : 'sg_eval_v1_cleaned',
-                         # 'SG EVAL V2 MCQ No Prompt': 'sg_eval_v2_mcq_no_prompt',
                          'SG-EVAL-v2-Open-Ended'   : 'sg_eval_v2_open',
                          'US-EVAL'                 : 'us_eval',
                          'CN-EVAL'                 : 'cn_eval',
@@ -175,24 +174,22 @@ def general_reasoning():
     filters_leveltwo = [
                         'IndoMMLU',
                         'MMLU',
-                        #'MMLU-No-Prompt',
                         'CMMLU',
-                        #'IndoMMLU-No-Prompt',
                         'C-Eval',
                         'ZBench',
                         ]
-    category_one_dict = {'Zero Shot': 'zero_shot',
-                         'Few Shot': 'few_shot'}
     category_two_dict = {
                          'IndoMMLU': 'indommlu_no_prompt',
-                         'MMLU': 'mmlu_no_prompt',
-                         #'MMLU-No-Prompt': 'mmlu_no_prompt',
-                         'C-Eval': 'c_eval',
-                         'CMMLU': 'cmmlu',
-                         'ZBench': 'zbench',
-                         #'IndoMMLU-No-Prompt': 'indommlu_no_prompt',
                          }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -215,18 +212,23 @@ def flores():
     st.title("Task: FLORES-Translation")
     filters_levelone = ['Zero Shot', 'Few Shot']
-    filters_leveltwo = ['Indonesian to English',
                         'Vitenamese to English',
                         'Chinese to English',
                         'Malay to English'
                         ]
-    category_one_dict = {'Zero Shot': 'zero_shot',
-                         'Few Shot': 'few_shot'}
-    category_two_dict = {'Indonesian to English': 'ind2eng',
-                         'Vitenamese to English': 'vie2eng',
-                         'Chinese to English': 'zho2eng',
-                         'Malay to English': 'zsm2eng'}
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -256,8 +258,10 @@ def emotion():
     category_one_dict = {'Zero Shot': 'zero_shot',
                          'Few Shot': 'few_shot'}
-    category_two_dict = {'Indonesian Emotion  Classification': 'ind_emotion',
-                             'SST2': 'sst2'}
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
@@ -285,11 +289,15 @@ def dialogue():
                         'DialogSum',
                         ]
-    category_one_dict = {'Zero Shot': 'zero_shot',
-                         'Few Shot': 'few_shot'}
-    category_two_dict = {'DREAM': 'dream',
-                         'SAMSum': 'samsum',
-                         'DialogSum': 'dialogsum'}
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
@@ -319,17 +327,21 @@ def fundamental_nlp_tasks():
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']
-    category_one_dict = {'Zero Shot': 'zero_shot',
-                         'Few Shot': 'few_shot'}
-    category_two_dict = {'OCNLI': 'ocnli',
-                        'C3': 'c3',
-                        'COLA': 'cola',
-                        'QQP': 'qqp',
-                        'MNLI': 'mnli',
-                        'QNLI': 'qnli',
-                        'WNLI': 'wnli',
-                        'RTE': 'rte',
-                        'MRPC': 'mrpc'}
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:

             [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
             """)
+    st.markdown("""
+            ### Changelog
+            - **Dec 2024**:
+                - Updated results for **Cross-MMLU**, **Cross-LogiQA**, **Cross-XQuad**, **MMLU**, **IndoMMLU**, and **SG-Eval-v2** with new prompts (simple prompts to encourage reasoning).
+                - Added new models: **SEA-LION v3**, **Gemma-2**, and **Sailor 2**.
+            - **Nov 2024**:
+                - Updated layout and added support for comparison between models with similar sizes.
+            """)
     st.divider()
+    st.markdown("#### What is [SeaEval](https://seaeval.github.io/)?")
     with st.container():
         left_co, cent_co,last_co = st.columns(3)
         st.markdown("##### Citations")
         st.markdown('''
+                    ```
                         @article{SeaEval,
                         title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
                         author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
                         journal={NAACL},
                         year={2024}
                         }
+                    ```
                     ''')
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = [
                         'Cross-MMLU',
                         'Cross-XQUAD',
                         'Cross-LogiQA',
                         ]
     category_one_dict = {
     category_two_dict = {
                         'Cross-MMLU'            : 'cross_mmlu_no_prompt',
                         'Cross-XQUAD'           : 'cross_xquad_no_prompt',
                         'Cross-LogiQA'          : 'cross_logiqa_no_prompt',
                          }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = [
                         'SG-EVAL-v2-MCQ',
                         'SG-EVAL-v2-Open-Ended',
                         'SG-EVAL-v1-Cleaned',
                         'SG-EVAL-v1',
                          'SG-EVAL-v2-MCQ'          : 'sg_eval_v2_mcq_no_prompt',
                          'SG-EVAL-v1'              : 'sg_eval',
                          'SG-EVAL-v1-Cleaned'      : 'sg_eval_v1_cleaned',
                          'SG-EVAL-v2-Open-Ended'   : 'sg_eval_v2_open',
                          'US-EVAL'                 : 'us_eval',
                          'CN-EVAL'                 : 'cn_eval',
     filters_leveltwo = [
                         'IndoMMLU',
                         'MMLU',
                         'CMMLU',
                         'C-Eval',
                         'ZBench',
                         ]
+    category_one_dict = {
+                        'Zero Shot': 'zero_shot',
+                        'Few Shot' : 'few_shot'
+                         }
     category_two_dict = {
                          'IndoMMLU': 'indommlu_no_prompt',
+                         'MMLU'    : 'mmlu_no_prompt',
+                         'C-Eval'  : 'c_eval',
+                         'CMMLU'   : 'cmmlu',
+                         'ZBench'  : 'zbench',
                          }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     st.title("Task: FLORES-Translation")
     filters_levelone = ['Zero Shot', 'Few Shot']
+    filters_leveltwo = [
+                        'Indonesian to English',
                         'Vitenamese to English',
                         'Chinese to English',
                         'Malay to English'
                         ]
+    category_one_dict = {
+                        'Zero Shot': 'zero_shot',
+                        'Few Shot' : 'few_shot'
+                         }
+    category_two_dict = {
+                        'Indonesian to English': 'ind2eng',
+                        'Vitenamese to English': 'vie2eng',
+                        'Chinese to English'   : 'zho2eng',
+                        'Malay to English'     : 'zsm2eng'
+                         }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     category_one_dict = {'Zero Shot': 'zero_shot',
                          'Few Shot': 'few_shot'}
+    category_two_dict = {
+                            'Indonesian Emotion  Classification': 'ind_emotion',
+                            'SST2'                              : 'sst2'
+                             }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
                         'DialogSum',
                         ]
+    category_one_dict = {
+                        'Zero Shot': 'zero_shot',
+                        'Few Shot' : 'few_shot'
+                         }
+    category_two_dict = {
+                        'DREAM'    : 'dream',
+                        'SAMSum'   : 'samsum',
+                        'DialogSum': 'dialogsum'
+                         }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left:
     filters_levelone = ['Zero Shot', 'Few Shot']
     filters_leveltwo = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']
+    category_one_dict = {
+                         'Zero Shot': 'zero_shot',
+                         'Few Shot' : 'few_shot'
+                         }
+    category_two_dict = {
+                        'OCNLI': 'ocnli',
+                        'C3'   : 'c3',
+                        'COLA' : 'cola',
+                        'QQP'  : 'qqp',
+                        'MNLI' : 'mnli',
+                        'QNLI' : 'qnli',
+                        'WNLI' : 'wnli',
+                        'RTE'  : 'rte',
+                        'MRPC' : 'mrpc'
+                        }
     left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
     with left: