Update space
Browse files- app.py +14 -5
- src/populate.py +23 -9
app.py
CHANGED
|
@@ -151,6 +151,15 @@ with demo:
|
|
| 151 |
'</p>'
|
| 152 |
)
|
| 153 |
gr.HTML(INTRODUCTION_TEXT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 156 |
|
|
@@ -256,7 +265,7 @@ with demo:
|
|
| 256 |
AutoEvalColumn.rank_math_geometry.name,
|
| 257 |
AutoEvalColumn.rank_math_probability.name,
|
| 258 |
],
|
| 259 |
-
rank_col=['sort_by_rank', 1, 4],
|
| 260 |
)
|
| 261 |
)
|
| 262 |
|
|
@@ -277,7 +286,7 @@ with demo:
|
|
| 277 |
# AutoEvalColumn.rank_math_geometry.name,
|
| 278 |
# AutoEvalColumn.rank_math_probability.name,
|
| 279 |
],
|
| 280 |
-
rank_col=['sort_by_score', 1, 4],
|
| 281 |
)
|
| 282 |
)
|
| 283 |
|
|
@@ -389,7 +398,7 @@ with demo:
|
|
| 389 |
AutoEvalColumn.rank_reason_logical.name,
|
| 390 |
AutoEvalColumn.rank_reason_social.name,
|
| 391 |
],
|
| 392 |
-
rank_col=['sort_by_rank', 1, 3],
|
| 393 |
)
|
| 394 |
)
|
| 395 |
|
|
@@ -406,7 +415,7 @@ with demo:
|
|
| 406 |
AutoEvalColumn.score_reason_logical.name,
|
| 407 |
AutoEvalColumn.score_reason_social.name,
|
| 408 |
],
|
| 409 |
-
rank_col=['sort_by_score', 1, 3],
|
| 410 |
)
|
| 411 |
)
|
| 412 |
|
|
@@ -488,7 +497,7 @@ with demo:
|
|
| 488 |
|
| 489 |
AutoEvalColumn.rank_chemistry.name,
|
| 490 |
],
|
| 491 |
-
rank_col=['sort_by_rank', 4, 5],
|
| 492 |
)
|
| 493 |
)
|
| 494 |
|
|
|
|
| 151 |
'</p>'
|
| 152 |
)
|
| 153 |
gr.HTML(INTRODUCTION_TEXT)
|
| 154 |
+
|
| 155 |
+
'''
|
| 156 |
+
TEXT = (
|
| 157 |
+
'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
|
| 158 |
+
''
|
| 159 |
+
'</p>'
|
| 160 |
+
)
|
| 161 |
+
gr.HTML(TEXT)
|
| 162 |
+
'''
|
| 163 |
|
| 164 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 165 |
|
|
|
|
| 265 |
AutoEvalColumn.rank_math_geometry.name,
|
| 266 |
AutoEvalColumn.rank_math_probability.name,
|
| 267 |
],
|
| 268 |
+
rank_col=['sort_by_rank', 1, 4, 'Math'],
|
| 269 |
)
|
| 270 |
)
|
| 271 |
|
|
|
|
| 286 |
# AutoEvalColumn.rank_math_geometry.name,
|
| 287 |
# AutoEvalColumn.rank_math_probability.name,
|
| 288 |
],
|
| 289 |
+
rank_col=['sort_by_score', 1, 4, 'Math'],
|
| 290 |
)
|
| 291 |
)
|
| 292 |
|
|
|
|
| 398 |
AutoEvalColumn.rank_reason_logical.name,
|
| 399 |
AutoEvalColumn.rank_reason_social.name,
|
| 400 |
],
|
| 401 |
+
rank_col=['sort_by_rank', 1, 3, 'Reasoning'],
|
| 402 |
)
|
| 403 |
)
|
| 404 |
|
|
|
|
| 415 |
AutoEvalColumn.score_reason_logical.name,
|
| 416 |
AutoEvalColumn.score_reason_social.name,
|
| 417 |
],
|
| 418 |
+
rank_col=['sort_by_score', 1, 3, 'Reasoning'],
|
| 419 |
)
|
| 420 |
)
|
| 421 |
|
|
|
|
| 497 |
|
| 498 |
AutoEvalColumn.rank_chemistry.name,
|
| 499 |
],
|
| 500 |
+
rank_col=['sort_by_rank', 4, 5, 'Science'],
|
| 501 |
)
|
| 502 |
)
|
| 503 |
|
src/populate.py
CHANGED
|
@@ -42,11 +42,17 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
| 42 |
start_idx = rank_col[1]
|
| 43 |
end_idx = rank_col[2]
|
| 44 |
avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
df[
|
| 48 |
-
df = df.sort_values(by=[
|
| 49 |
-
df[
|
| 50 |
|
| 51 |
# df = df.drop(columns=benchmark_cols[offset_idx:])
|
| 52 |
# print(benchmark_cols)
|
|
@@ -68,11 +74,18 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
| 68 |
start_idx = rank_col[1]
|
| 69 |
end_idx = rank_col[2]
|
| 70 |
avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
| 71 |
-
df.insert(1, "Average Rank", avg_rank)
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
# we'll skip NaN, instead of deleting the whole row
|
| 78 |
df = df.fillna('--')
|
|
@@ -80,7 +93,8 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
|
|
| 80 |
rank = np.arange(1, len(df)+1)
|
| 81 |
df.insert(0, 'Rank', rank)
|
| 82 |
|
| 83 |
-
|
|
|
|
| 84 |
|
| 85 |
|
| 86 |
|
|
|
|
| 42 |
start_idx = rank_col[1]
|
| 43 |
end_idx = rank_col[2]
|
| 44 |
avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
| 45 |
+
if len(rank_col) == 4:
|
| 46 |
+
avg_col_name = f"Overall ({rank_col[3]})"
|
| 47 |
+
else:
|
| 48 |
+
# avg_col_name = "Average Score"
|
| 49 |
+
avg_col_name = 'Overall'
|
| 50 |
+
|
| 51 |
+
df.insert(1, avg_col_name, avg_scores)
|
| 52 |
|
| 53 |
+
df[avg_col_name] = avg_scores.round(decimals=4)
|
| 54 |
+
df = df.sort_values(by=[avg_col_name], ascending=False)
|
| 55 |
+
df[avg_col_name] = df[avg_col_name].map('{:.2f}'.format)
|
| 56 |
|
| 57 |
# df = df.drop(columns=benchmark_cols[offset_idx:])
|
| 58 |
# print(benchmark_cols)
|
|
|
|
| 74 |
start_idx = rank_col[1]
|
| 75 |
end_idx = rank_col[2]
|
| 76 |
avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
|
|
|
|
| 77 |
|
| 78 |
+
if len(rank_col) == 4:
|
| 79 |
+
avg_col_name = f"Overall ({rank_col[3]})"
|
| 80 |
+
else:
|
| 81 |
+
# avg_col_name = "Average Rank"
|
| 82 |
+
avg_col_name = 'Overall'
|
| 83 |
+
|
| 84 |
+
df.insert(1, avg_col_name, avg_rank)
|
| 85 |
+
|
| 86 |
+
df[avg_col_name] = avg_rank.round(decimals=4)
|
| 87 |
+
df = df.sort_values(by=[avg_col_name], ascending=True)
|
| 88 |
+
df[avg_col_name] = df[avg_col_name].map('{:.2f}'.format)
|
| 89 |
|
| 90 |
# we'll skip NaN, instead of deleting the whole row
|
| 91 |
df = df.fillna('--')
|
|
|
|
| 93 |
rank = np.arange(1, len(df)+1)
|
| 94 |
df.insert(0, 'Rank', rank)
|
| 95 |
|
| 96 |
+
# print(benchmark_cols)
|
| 97 |
+
# df.style.background_gradient(cmap='coolwarm', subset=benchmark_cols)
|
| 98 |
|
| 99 |
|
| 100 |
|