Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -38,12 +38,13 @@ def load_data(files, model_type):
|
|
| 38 |
|
| 39 |
# Load and label all data
|
| 40 |
data = load_data(noncot_results, "Text Only")
|
|
|
|
| 41 |
vision_data = load_data(vision_results, "Vision")
|
| 42 |
cot_text_data = load_data(cot_text_results, "CoT Text Only")
|
| 43 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
| 44 |
|
| 45 |
# Combine all data into a single DataFrame
|
| 46 |
-
all_data = pd.concat([
|
| 47 |
|
| 48 |
all_model_names = all_data["Model Name"].unique()
|
| 49 |
all_text_only_model_names = list(
|
|
@@ -414,7 +415,7 @@ def generate_heatmap_for_intersection_model(model_name):
|
|
| 414 |
|
| 415 |
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
|
| 416 |
|
| 417 |
-
plt.close(fig)
|
| 418 |
return fig
|
| 419 |
|
| 420 |
|
|
@@ -443,7 +444,7 @@ with gr.Blocks() as demo:
|
|
| 443 |
heatmap_image_qwen = gr.Image(label="", show_label=False)
|
| 444 |
leader_board.select(fn=load_heatmap_qwen, outputs=[heatmap_image_qwen])
|
| 445 |
|
| 446 |
-
with gr.Tab("Vision Benchmark"):
|
| 447 |
gr.Markdown("# Vision Benchmark Leaderboard")
|
| 448 |
leader_board_vision = gr.Dataframe(
|
| 449 |
vision_accuracy_df, headers=headers_with_icons
|
|
@@ -454,7 +455,7 @@ with gr.Blocks() as demo:
|
|
| 454 |
fn=load_vision_heatmap, outputs=[heatmap_image_vision]
|
| 455 |
)
|
| 456 |
|
| 457 |
-
with gr.Tab("Text-only Benchmark (CoT)"):
|
| 458 |
gr.Markdown("# Text-only Leaderboard (CoT)")
|
| 459 |
cot_leader_board_text = gr.Dataframe(
|
| 460 |
cot_text_accuracy_df, headers=headers_with_icons
|
|
@@ -499,7 +500,7 @@ with gr.Blocks() as demo:
|
|
| 499 |
queue=True,
|
| 500 |
)
|
| 501 |
|
| 502 |
-
with gr.Tab("Constraint Text-only Results (CoT)"):
|
| 503 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
| 504 |
included_models_cot = gr.CheckboxGroup(
|
| 505 |
label="Models to include",
|
|
@@ -514,14 +515,14 @@ with gr.Blocks() as demo:
|
|
| 514 |
constrained_leader_board_text_cot = gr.Dataframe()
|
| 515 |
constrained_leader_board_plot_cot = gr.Plot()
|
| 516 |
|
| 517 |
-
with gr.Tab("Majority Vote (Subset 1)"):
|
| 518 |
gr.Markdown("## Majority Vote (Subset 1)")
|
| 519 |
intersection_leader_board = gr.Dataframe(
|
| 520 |
intersection_df_acc, headers=headers_with_icons
|
| 521 |
)
|
| 522 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
| 523 |
|
| 524 |
-
with gr.Tab("Text-only Benchmark (deprecated)"):
|
| 525 |
gr.Markdown("# Text-only Leaderboard")
|
| 526 |
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
| 527 |
gr.Markdown("## Heatmap")
|
|
|
|
| 38 |
|
| 39 |
# Load and label all data
|
| 40 |
data = load_data(noncot_results, "Text Only")
|
| 41 |
+
data_qwen = load_data(noncot_results_qwen, "Text Only")
|
| 42 |
vision_data = load_data(vision_results, "Vision")
|
| 43 |
cot_text_data = load_data(cot_text_results, "CoT Text Only")
|
| 44 |
# cot_vision_data = load_data(cot_vision_results, "CoT Vision")
|
| 45 |
|
| 46 |
# Combine all data into a single DataFrame
|
| 47 |
+
all_data = pd.concat([data_qwen, vision_data, cot_text_data], ignore_index=True)
|
| 48 |
|
| 49 |
all_model_names = all_data["Model Name"].unique()
|
| 50 |
all_text_only_model_names = list(
|
|
|
|
| 415 |
|
| 416 |
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
|
| 417 |
|
| 418 |
+
plt.close(fig)
|
| 419 |
return fig
|
| 420 |
|
| 421 |
|
|
|
|
| 444 |
heatmap_image_qwen = gr.Image(label="", show_label=False)
|
| 445 |
leader_board.select(fn=load_heatmap_qwen, outputs=[heatmap_image_qwen])
|
| 446 |
|
| 447 |
+
with gr.Tab("Vision Benchmark", visible=False):
|
| 448 |
gr.Markdown("# Vision Benchmark Leaderboard")
|
| 449 |
leader_board_vision = gr.Dataframe(
|
| 450 |
vision_accuracy_df, headers=headers_with_icons
|
|
|
|
| 455 |
fn=load_vision_heatmap, outputs=[heatmap_image_vision]
|
| 456 |
)
|
| 457 |
|
| 458 |
+
with gr.Tab("Text-only Benchmark (CoT)", visible=False):
|
| 459 |
gr.Markdown("# Text-only Leaderboard (CoT)")
|
| 460 |
cot_leader_board_text = gr.Dataframe(
|
| 461 |
cot_text_accuracy_df, headers=headers_with_icons
|
|
|
|
| 500 |
queue=True,
|
| 501 |
)
|
| 502 |
|
| 503 |
+
with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
|
| 504 |
gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
| 505 |
included_models_cot = gr.CheckboxGroup(
|
| 506 |
label="Models to include",
|
|
|
|
| 515 |
constrained_leader_board_text_cot = gr.Dataframe()
|
| 516 |
constrained_leader_board_plot_cot = gr.Plot()
|
| 517 |
|
| 518 |
+
with gr.Tab("Majority Vote (Subset 1)", visible=False):
|
| 519 |
gr.Markdown("## Majority Vote (Subset 1)")
|
| 520 |
intersection_leader_board = gr.Dataframe(
|
| 521 |
intersection_df_acc, headers=headers_with_icons
|
| 522 |
)
|
| 523 |
heatmap_image = gr.Plot(label="Model Heatmap")
|
| 524 |
|
| 525 |
+
with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
|
| 526 |
gr.Markdown("# Text-only Leaderboard")
|
| 527 |
leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
| 528 |
gr.Markdown("## Heatmap")
|