Spaces:

LLM360
/

de-arena

Running

App Files Files Community

yzabc007 committed on Oct 9, 2024

Commit

2f5cc84

1 Parent(s): 841a40d

Update space

Browse files

Files changed (2) hide show

app.py +18 -1
src/populate.py +9 -4

app.py CHANGED Viewed

@@ -156,6 +156,7 @@ with demo:
                         AutoEvalColumn.rank_math_probability.name,
                         AutoEvalColumn.rank_reason_logical.name,
                         AutoEvalColumn.rank_reason_social.name,
                         ],
                     rank_col=[],
                 )
@@ -313,7 +314,8 @@ with demo:
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
             CURRENT_TEXT = """
-            We are working on adding more tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
             We have diversely and aggressively collected recent science datasets, including but not limited to
             [GPQA](https://arxiv.org/abs/2311.12022),
             [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
@@ -323,6 +325,7 @@ with demo:
             [SciEval](https://arxiv.org/abs/2308.13149).
             """
             gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
             with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
                     get_model_leaderboard_df(
@@ -340,6 +343,20 @@ with demo:
                     )
                 )
         with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
             CURRENT_TEXT = """
             # Coming soon!

                         AutoEvalColumn.rank_math_probability.name,
                         AutoEvalColumn.rank_reason_logical.name,
                         AutoEvalColumn.rank_reason_social.name,
+                        # AutoEvalColumn.rank_chemistry.name,
                         ],
                     rank_col=[],
                 )
         with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
             CURRENT_TEXT = """
+            The science domain is a critical area for evaluating LLMs.
+            We are working on adding several tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
             We have diversely and aggressively collected recent science datasets, including but not limited to
             [GPQA](https://arxiv.org/abs/2311.12022),
             [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
             [SciEval](https://arxiv.org/abs/2308.13149).
             """
             gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
             with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
                 leaderboard = overall_leaderboard(
                     get_model_leaderboard_df(
                     )
                 )
+            with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
+                CURRENT_TEXT = """
+                # Coming soon!
+                """
+                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+            with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
+                CURRENT_TEXT = """
+                # Coming soon!
+                """
+                gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
         with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
             CURRENT_TEXT = """
             # Coming soon!

src/populate.py CHANGED Viewed

@@ -25,11 +25,16 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
         # print(rank_col)
-    else: # when rank_col is empty, sort by averaging all the benchmarks, except the first one
-        avg_rank = df.iloc[:, 1:].mean(axis=1) # we'll skip NaN, instrad of deleting the whole row
-        df["Average Rank"] = avg_rank
-        df = df.sort_values(by=["Average Rank"], ascending=True)
         df = df.fillna('--')
         rank = np.arange(1, len(df)+1)
         df.insert(0, 'Rank', rank)

         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
         # print(rank_col)
+    else:
+        # when rank_col is empty, sort by the average over all columns except the first one
+        avg_rank = df.iloc[:, 1:].mean(axis=1)
+        df["Average Rank"] = avg_rank.round(decimals=4)
+        df = df.sort_values(by=["Average Rank"], ascending=True)
+        df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
+        # we'll skip NaN instead of deleting the whole row
         df = df.fillna('--')
+        # insert a rank column
         rank = np.arange(1, len(df)+1)
         df.insert(0, 'Rank', rank)