Update space
Browse files- app.py +18 -1
- src/populate.py +9 -4
app.py
CHANGED
|
@@ -156,6 +156,7 @@ with demo:
|
|
| 156 |
AutoEvalColumn.rank_math_probability.name,
|
| 157 |
AutoEvalColumn.rank_reason_logical.name,
|
| 158 |
AutoEvalColumn.rank_reason_social.name,
|
|
|
|
| 159 |
],
|
| 160 |
rank_col=[],
|
| 161 |
)
|
|
@@ -313,7 +314,8 @@ with demo:
|
|
| 313 |
|
| 314 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
| 315 |
CURRENT_TEXT = """
|
| 316 |
-
|
|
|
|
| 317 |
We have diversely and aggressively collected recent science datasets, including but not limited to
|
| 318 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
| 319 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
|
@@ -323,6 +325,7 @@ with demo:
|
|
| 323 |
[SciEval](https://arxiv.org/abs/2308.13149).
|
| 324 |
"""
|
| 325 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
|
|
|
| 326 |
with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
|
| 327 |
leaderboard = overall_leaderboard(
|
| 328 |
get_model_leaderboard_df(
|
|
@@ -340,6 +343,20 @@ with demo:
|
|
| 340 |
)
|
| 341 |
)
|
| 342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
| 344 |
CURRENT_TEXT = """
|
| 345 |
# Coming soon!
|
|
|
|
| 156 |
AutoEvalColumn.rank_math_probability.name,
|
| 157 |
AutoEvalColumn.rank_reason_logical.name,
|
| 158 |
AutoEvalColumn.rank_reason_social.name,
|
| 159 |
+
# AutoEvalColumn.rank_chemistry.name,
|
| 160 |
],
|
| 161 |
rank_col=[],
|
| 162 |
)
|
|
|
|
| 314 |
|
| 315 |
with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
|
| 316 |
CURRENT_TEXT = """
|
| 317 |
+
Science domain is a critical area for evaluating LLMs.
|
| 318 |
+
We are working on adding several tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
|
| 319 |
We have diversely and aggressively collected recent science datasets, including but not limited to
|
| 320 |
[GPQA](https://arxiv.org/abs/2311.12022),
|
| 321 |
[JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
|
|
|
|
| 325 |
[SciEval](https://arxiv.org/abs/2308.13149).
|
| 326 |
"""
|
| 327 |
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
| 328 |
+
|
| 329 |
with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
|
| 330 |
leaderboard = overall_leaderboard(
|
| 331 |
get_model_leaderboard_df(
|
|
|
|
| 343 |
)
|
| 344 |
)
|
| 345 |
|
| 346 |
+
with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
|
| 347 |
+
CURRENT_TEXT = """
|
| 348 |
+
# Coming soon!
|
| 349 |
+
"""
|
| 350 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
|
| 354 |
+
CURRENT_TEXT = """
|
| 355 |
+
# Coming soon!
|
| 356 |
+
"""
|
| 357 |
+
gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
|
| 358 |
+
|
| 359 |
+
|
| 360 |
with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
|
| 361 |
CURRENT_TEXT = """
|
| 362 |
# Coming soon!
|
src/populate.py
CHANGED
|
@@ -25,11 +25,16 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
| 25 |
df = df.dropna(subset=benchmark_cols)
|
| 26 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
| 27 |
# print(rank_col)
|
| 28 |
-
else:
|
| 29 |
-
|
| 30 |
-
df[
|
| 31 |
-
df
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
df = df.fillna('--')
|
|
|
|
| 33 |
rank = np.arange(1, len(df)+1)
|
| 34 |
df.insert(0, 'Rank', rank)
|
| 35 |
|
|
|
|
| 25 |
df = df.dropna(subset=benchmark_cols)
|
| 26 |
df = df.sort_values(by=[rank_col[0]], ascending=True)
|
| 27 |
# print(rank_col)
|
| 28 |
+
else:
|
| 29 |
+
# when rank_col is empty, sort by the average over all the benchmark columns, skipping the first column
|
| 30 |
+
avg_rank = df.iloc[:, 1:].mean(axis=1)
|
| 31 |
+
df["Average Rank"] = avg_rank.round(decimals=4)
|
| 32 |
+
df = df.sort_values(by=["Average Rank"], ascending=True)
|
| 33 |
+
df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
|
| 34 |
+
|
| 35 |
+
# we'll skip NaN, instead of deleting the whole row
|
| 36 |
df = df.fillna('--')
|
| 37 |
+
# insert a rank column
|
| 38 |
rank = np.arange(1, len(df)+1)
|
| 39 |
df.insert(0, 'Rank', rank)
|
| 40 |
|