Spaces:
Sleeping
Sleeping
update
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +50 -0
- app.py +106 -124
- results/CodeLlama-70b-Instruct-hf.jpg → heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg +2 -2
- results/GPT-4-0125-preview.png → heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg +2 -2
- results/CodeLlama-70b-Instruct-hf.png → heatmaps/Llama-2-70b-chat-hf_CoT.jpg +2 -2
- results/GPT-4-0125-preview.jpg → heatmaps/Llama-2-70b-chat-hf_Textonly.jpg +2 -2
- heatmaps/Llama-3-70b-chat-hf_CoT.jpg +3 -0
- heatmaps/Llama-3-70b-chat-hf_Textonly.jpg +3 -0
- heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg +3 -0
- heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg +3 -0
- heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg +3 -0
- heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg +3 -0
- heatmaps/Qwen1.5-72B-Chat_CoT.jpg +3 -0
- heatmaps/Qwen1.5-72B-Chat_Textonly.jpg +3 -0
- heatmaps/Yi-34B-Chat_CoT.jpg +3 -0
- heatmaps/Yi-34B-Chat_Textonly.jpg +3 -0
- heatmaps/claude-3-haiku-20240307_1shot.jpg +3 -0
- heatmaps/claude-3-haiku-20240307_CoT.jpg +3 -0
- heatmaps/claude-3-haiku-20240307_Textonly.jpg +3 -0
- heatmaps/claude-3-haiku-20240307_vision-CoT.jpg +3 -0
- heatmaps/claude-3-haiku-20240307_vision.jpg +3 -0
- heatmaps/claude-3-opus-20240229_CoT.jpg +3 -0
- heatmaps/claude-3-opus-20240229_Textonly.jpg +3 -0
- heatmaps/claude-3-opus-20240229_vision-CoT.jpg +3 -0
- heatmaps/claude-3-opus-20240229_vision.jpg +3 -0
- heatmaps/claude-3-sonnet-20240229_CoT.jpg +3 -0
- heatmaps/claude-3-sonnet-20240229_Textonly.jpg +3 -0
- heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg +3 -0
- heatmaps/claude-3-sonnet-20240229_vision.jpg +3 -0
- heatmaps/dbrx-instruct_CoT.jpg +3 -0
- heatmaps/dbrx-instruct_Textonly.jpg +3 -0
- heatmaps/deepseek-llm-67b-chat_CoT.jpg +3 -0
- heatmaps/deepseek-llm-67b-chat_Textonly.jpg +3 -0
- heatmaps/gemini-pro_CoT.jpg +3 -0
- heatmaps/gemini-pro_vision-CoT.jpg +3 -0
- heatmaps/gemini-pro_vision.jpg +3 -0
- heatmaps/gemma-7b-it_CoT.jpg +3 -0
- heatmaps/gemma-7b-it_Textonly.jpg +3 -0
- heatmaps/gpt-3.5-0613_CoT.jpg +3 -0
- heatmaps/gpt-3.5-0613_Textonly.jpg +3 -0
- heatmaps/gpt-3.5-turbo-0125_1shot.jpg +3 -0
- heatmaps/gpt-3.5-turbo-0125_CoT.jpg +3 -0
- heatmaps/gpt-3.5-turbo-0125_Textonly.jpg +3 -0
- heatmaps/gpt-4-0125-preview_CoT.jpg +3 -0
- heatmaps/gpt-4-0125-preview_Textonly.jpg +3 -0
- heatmaps/gpt-4-1106_CoT.jpg +3 -0
- heatmaps/gpt-4-1106_Textonly.jpg +3 -0
- heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg +3 -0
- results/gpt-4-turbo-2024-04-09.jpg → heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg +0 -0
- heatmaps/gpt-4-vision-preview_vision-CoT.jpg +3 -0
.gitattributes
CHANGED
|
@@ -284,3 +284,53 @@ results/gpt-4-1106.pkl filter=lfs diff=lfs merge=lfs -text
|
|
| 284 |
results/Llama-3-70b-chat-hf.pkl filter=lfs diff=lfs merge=lfs -text
|
| 285 |
results/dbrx-instruct.pkl filter=lfs diff=lfs merge=lfs -text
|
| 286 |
results/gpt-3.5-0613.pkl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
results/Llama-3-70b-chat-hf.pkl filter=lfs diff=lfs merge=lfs -text
|
| 285 |
results/dbrx-instruct.pkl filter=lfs diff=lfs merge=lfs -text
|
| 286 |
results/gpt-3.5-0613.pkl filter=lfs diff=lfs merge=lfs -text
|
| 287 |
+
final_df.pkl filter=lfs diff=lfs merge=lfs -text
|
| 288 |
+
heatmaps/claude-3-haiku-20240307_vision.jpg filter=lfs diff=lfs merge=lfs -text
|
| 289 |
+
heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 290 |
+
heatmaps/claude-3-haiku-20240307_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 291 |
+
heatmaps/gpt-4-1106_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 292 |
+
heatmaps/gpt-4-vision-preview_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 293 |
+
heatmaps/dbrx-instruct_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 294 |
+
heatmaps/Llama-2-70b-chat-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 295 |
+
heatmaps/Llama-3-70b-chat-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 296 |
+
heatmaps/Llama-3-70b-chat-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 297 |
+
heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 298 |
+
heatmaps/claude-3-sonnet-20240229_vision.jpg filter=lfs diff=lfs merge=lfs -text
|
| 299 |
+
heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 300 |
+
heatmaps/claude-3-haiku-20240307_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 301 |
+
heatmaps/gemma-7b-it_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 302 |
+
heatmaps/gpt-4-0125-preview_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 303 |
+
heatmaps/claude-3-haiku-20240307_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 304 |
+
heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 305 |
+
heatmaps/Qwen1.5-72B-Chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 306 |
+
heatmaps/gpt-3.5-turbo-0125_1shot.jpg filter=lfs diff=lfs merge=lfs -text
|
| 307 |
+
heatmaps/claude-3-sonnet-20240229_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 308 |
+
heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 309 |
+
heatmaps/gemma-7b-it_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 310 |
+
heatmaps/gpt-4-0125-preview_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 311 |
+
heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 312 |
+
heatmaps/claude-3-haiku-20240307_1shot.jpg filter=lfs diff=lfs merge=lfs -text
|
| 313 |
+
heatmaps/claude-3-opus-20240229_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 314 |
+
heatmaps/deepseek-llm-67b-chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 315 |
+
heatmaps/claude-3-opus-20240229_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 316 |
+
heatmaps/gemini-pro_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 317 |
+
heatmaps/gpt-3.5-0613_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 318 |
+
heatmaps/gpt-4-vision-preview_vision.jpg filter=lfs diff=lfs merge=lfs -text
|
| 319 |
+
heatmaps/gemini-pro_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 320 |
+
heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 321 |
+
heatmaps/Qwen1.5-72B-Chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 322 |
+
heatmaps/Yi-34B-Chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 323 |
+
heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 324 |
+
heatmaps/dbrx-instruct_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 325 |
+
heatmaps/gemini-pro_vision.jpg filter=lfs diff=lfs merge=lfs -text
|
| 326 |
+
heatmaps/gpt-3.5-0613_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 327 |
+
heatmaps/gpt-4-1106_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 328 |
+
heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 329 |
+
heatmaps/Llama-2-70b-chat-hf_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 330 |
+
heatmaps/claude-3-opus-20240229_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 331 |
+
heatmaps/gpt-3.5-turbo-0125_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
| 332 |
+
heatmaps/gpt-3.5-turbo-0125_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 333 |
+
heatmaps/Yi-34B-Chat_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 334 |
+
heatmaps/claude-3-opus-20240229_vision.jpg filter=lfs diff=lfs merge=lfs -text
|
| 335 |
+
heatmaps/claude-3-sonnet-20240229_CoT.jpg filter=lfs diff=lfs merge=lfs -text
|
| 336 |
+
heatmaps/deepseek-llm-67b-chat_Textonly.jpg filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -7,27 +7,50 @@ import pandas as pd
|
|
| 7 |
import seaborn as sns
|
| 8 |
from matplotlib.colors import BoundaryNorm, ListedColormap
|
| 9 |
|
| 10 |
-
all_results = pd.read_pickle("
|
| 11 |
|
| 12 |
|
| 13 |
-
def get_accuracy_dataframe(
|
| 14 |
# Calculate overall model accuracy
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
# Calculate model accuracy per difficulty level
|
| 19 |
-
df[
|
| 20 |
-
model_accuracy_per_level =
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
# Merge overall accuracy and level-based accuracy into a single DataFrame
|
| 24 |
-
model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on=
|
| 25 |
-
model_accuracy_df.rename(
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Multiply by 100 and format to one decimal point
|
| 29 |
-
model_accuracy_df = model_accuracy_df.applymap(
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
# Add headers with icons
|
| 32 |
model_accuracy_df.columns = [
|
| 33 |
"🤖 Model Name",
|
|
@@ -40,13 +63,15 @@ def get_accuracy_dataframe(df):
|
|
| 40 |
|
| 41 |
model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
|
| 42 |
|
| 43 |
-
# Add a new column at the beginning for the rank
|
| 44 |
-
model_accuracy_df.insert(0, '#', range(1, len(model_accuracy_df) + 1))
|
| 45 |
-
|
| 46 |
return model_accuracy_df
|
| 47 |
|
| 48 |
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
# Define the column names with icons
|
|
@@ -68,126 +93,83 @@ column_names = [
|
|
| 68 |
"Level 4 Accuracy",
|
| 69 |
]
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
| 73 |
return heatmap_image
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
# # Function to process data
|
| 78 |
-
# def process_data(data):
|
| 79 |
-
# data_for_df = []
|
| 80 |
-
# for file, df in data.items():
|
| 81 |
-
# overall_accuracy = round(calculate_accuracy(df), 2)
|
| 82 |
-
# breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
|
| 83 |
-
# model_name = file.split("/")[-1].replace(".pkl", "")
|
| 84 |
-
# data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
|
| 85 |
-
# return data_for_df
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
# # Function to finalize DataFrame
|
| 89 |
-
# def finalize_df(df):
|
| 90 |
-
# df = df.round(1) # Round to one decimal place
|
| 91 |
-
# df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
|
| 92 |
-
# df.columns = headers_with_icons
|
| 93 |
-
# df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
|
| 94 |
-
# # add a new column with the order (index)
|
| 95 |
-
# df["#"] = range(1, len(df) + 1)
|
| 96 |
-
# # bring rank to the first column
|
| 97 |
-
# cols = df.columns.tolist()
|
| 98 |
-
# cols = cols[-1:] + cols[:-1]
|
| 99 |
-
# df = df[cols]
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
-
def
|
| 105 |
-
heatmap_image = gr.Image(f"
|
| 106 |
return heatmap_image
|
| 107 |
|
| 108 |
|
|
|
|
|
|
|
| 109 |
with gr.Blocks() as demo:
|
| 110 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
|
|
|
|
|
|
| 111 |
with gr.Tab("Text-only Benchmark"):
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
gr.Markdown("## Heatmap")
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
# with gr.Tab("Vision Benchmark", visible=False):
|
| 118 |
-
# gr.Markdown("# Vision Benchmark Leaderboard")
|
| 119 |
-
# leader_board_vision = gr.Dataframe(
|
| 120 |
-
# vision_accuracy_df, headers=headers_with_icons
|
| 121 |
-
# )
|
| 122 |
-
# gr.Markdown("## Heatmap")
|
| 123 |
-
# heatmap_image_vision = gr.Image(label="", show_label=False)
|
| 124 |
-
# leader_board_vision.select(
|
| 125 |
-
# fn=load_vision_heatmap, outputs=[heatmap_image_vision]
|
| 126 |
-
# )
|
| 127 |
-
|
| 128 |
-
# with gr.Tab("Text-only Benchmark (CoT)", visible=False):
|
| 129 |
-
# gr.Markdown("# Text-only Leaderboard (CoT)")
|
| 130 |
-
# cot_leader_board_text = gr.Dataframe(
|
| 131 |
-
# cot_text_accuracy_df, headers=headers_with_icons
|
| 132 |
-
# )
|
| 133 |
-
# gr.Markdown("## Heatmap")
|
| 134 |
-
# cot_heatmap_image_text = gr.Image(label="", show_label=False)
|
| 135 |
-
# cot_leader_board_text.select(
|
| 136 |
-
# fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
|
| 137 |
-
# )
|
| 138 |
-
|
| 139 |
-
# with gr.Tab("Constraint Text-only Results (CoT)", visible=False):
|
| 140 |
-
# gr.Markdown("## Constraint Text-only Leaderboard by first substrin (CoT)")
|
| 141 |
-
# included_models_cot = gr.CheckboxGroup(
|
| 142 |
-
# label="Models to include",
|
| 143 |
-
# choices=all_cot_text_only_models,
|
| 144 |
-
# value=all_cot_text_only_models,
|
| 145 |
-
# interactive=True,
|
| 146 |
-
# )
|
| 147 |
-
# with gr.Row():
|
| 148 |
-
# number_of_queries_cot = gr.Textbox(label="Number of included queries")
|
| 149 |
-
# number_of_fsms_cot = gr.Textbox(label="Number of included FSMs")
|
| 150 |
-
|
| 151 |
-
# constrained_leader_board_text_cot = gr.Dataframe()
|
| 152 |
-
# constrained_leader_board_plot_cot = gr.Plot()
|
| 153 |
-
|
| 154 |
-
# with gr.Tab("Majority Vote (Subset 1)", visible=False):
|
| 155 |
-
# gr.Markdown("## Majority Vote (Subset 1)")
|
| 156 |
-
# intersection_leader_board = gr.Dataframe(
|
| 157 |
-
# intersection_df_acc, headers=headers_with_icons
|
| 158 |
-
# )
|
| 159 |
-
# heatmap_image = gr.Plot(label="Model Heatmap")
|
| 160 |
-
|
| 161 |
-
# with gr.Tab("Text-only Benchmark (deprecated)", visible=False):
|
| 162 |
-
# gr.Markdown("# Text-only Leaderboard")
|
| 163 |
-
# leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
|
| 164 |
-
# gr.Markdown("## Heatmap")
|
| 165 |
-
# heatmap_image = gr.Image(label="", show_label=False)
|
| 166 |
-
# leader_board.select(fn=load_heatmap, outputs=[heatmap_image])
|
| 167 |
-
|
| 168 |
-
# # ============ Callbacks ============
|
| 169 |
-
|
| 170 |
-
# included_models_cot.select(
|
| 171 |
-
# fn=calculate_order_by_first_substring_cot,
|
| 172 |
-
# inputs=[included_models_cot],
|
| 173 |
-
# outputs=[
|
| 174 |
-
# constrained_leader_board_text_cot,
|
| 175 |
-
# number_of_queries_cot,
|
| 176 |
-
# number_of_fsms_cot,
|
| 177 |
-
# ],
|
| 178 |
-
# queue=True,
|
| 179 |
-
# )
|
| 180 |
-
|
| 181 |
-
# constrained_leader_board_text.select(
|
| 182 |
-
# fn=show_constraint_heatmap, outputs=[constrained_leader_board_plot]
|
| 183 |
-
# )
|
| 184 |
-
|
| 185 |
-
# constrained_leader_board_text_cot.select(
|
| 186 |
-
# fn=show_constraint_heatmap_cot, outputs=[constrained_leader_board_plot_cot]
|
| 187 |
-
# )
|
| 188 |
-
|
| 189 |
-
# intersection_leader_board.select(
|
| 190 |
-
# fn=show_intersection_heatmap, outputs=[heatmap_image]
|
| 191 |
-
# )
|
| 192 |
|
| 193 |
demo.launch()
|
|
|
|
| 7 |
import seaborn as sns
|
| 8 |
from matplotlib.colors import BoundaryNorm, ListedColormap
|
| 9 |
|
| 10 |
+
all_results = pd.read_pickle("final_df.pkl")
|
| 11 |
|
| 12 |
|
| 13 |
+
def get_accuracy_dataframe(df_mother, category):
|
| 14 |
# Calculate overall model accuracy
|
| 15 |
+
# filter for category only
|
| 16 |
+
df = df_mother[df_mother["category"] == category].copy()
|
| 17 |
+
df["is_answer_correct"] = df["is_answer_correct"].astype(float)
|
| 18 |
+
model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()
|
| 19 |
+
|
| 20 |
# Calculate model accuracy per difficulty level
|
| 21 |
+
df["difficulty_level"] = df["difficulty_level"].astype(int)
|
| 22 |
+
model_accuracy_per_level = (
|
| 23 |
+
df.groupby(["model", "difficulty_level"])["is_answer_correct"]
|
| 24 |
+
.mean()
|
| 25 |
+
.reset_index()
|
| 26 |
+
)
|
| 27 |
+
model_accuracy_per_level_df = model_accuracy_per_level.pivot(
|
| 28 |
+
index="model", columns="difficulty_level", values="is_answer_correct"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
# Merge overall accuracy and level-based accuracy into a single DataFrame
|
| 32 |
+
model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
|
| 33 |
+
model_accuracy_df.rename(
|
| 34 |
+
columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# Ensure all expected difficulty levels are present
|
| 38 |
+
expected_levels = [1, 2, 3, 4] # Adjust based on your data
|
| 39 |
+
for level in expected_levels:
|
| 40 |
+
if level not in model_accuracy_df.columns:
|
| 41 |
+
model_accuracy_df[
|
| 42 |
+
level
|
| 43 |
+
] = None # Fill missing levels with None or an appropriate value
|
| 44 |
+
|
| 45 |
+
# Rename columns to include levels
|
| 46 |
+
level_columns = {level: f"Level {level} Accuracy" for level in expected_levels}
|
| 47 |
+
model_accuracy_df.rename(columns=level_columns, inplace=True)
|
| 48 |
+
|
| 49 |
# Multiply by 100 and format to one decimal point
|
| 50 |
+
model_accuracy_df = model_accuracy_df.applymap(
|
| 51 |
+
lambda x: round(x * 100, 1) if isinstance(x, float) else x
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
# Add headers with icons
|
| 55 |
model_accuracy_df.columns = [
|
| 56 |
"🤖 Model Name",
|
|
|
|
| 63 |
|
| 64 |
model_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
|
| 65 |
|
|
|
|
|
|
|
|
|
|
| 66 |
return model_accuracy_df
|
| 67 |
|
| 68 |
|
| 69 |
+
# categories = array(['1shot', 'CoT', 'Textonly', 'vision', 'vision-CoT'], dtype=object)
|
| 70 |
+
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
|
| 71 |
+
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
|
| 72 |
+
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
|
| 73 |
+
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
|
| 74 |
+
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
|
| 75 |
|
| 76 |
|
| 77 |
# Define the column names with icons
|
|
|
|
| 93 |
"Level 4 Accuracy",
|
| 94 |
]
|
| 95 |
|
| 96 |
+
|
| 97 |
+
def load_heatmap_textonly(evt: gr.SelectData):
|
| 98 |
+
print(f"./heatmaps/{evt.value}_Textonly.jpg")
|
| 99 |
+
heatmap_image = gr.Image(f"./heatmaps/{evt.value}_Textonly.jpg")
|
| 100 |
return heatmap_image
|
| 101 |
|
| 102 |
|
| 103 |
+
def load_heatmap_cot(evt: gr.SelectData):
|
| 104 |
+
heatmap_image = gr.Image(f"./heatmaps/{evt.value}_CoT.jpg")
|
| 105 |
+
return heatmap_image
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
+
def load_heatmap_vision(evt: gr.SelectData):
|
| 109 |
+
heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision.jpg")
|
| 110 |
+
return heatmap_image
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
def load_heatmap_vision_cot(evt: gr.SelectData):
|
| 114 |
+
heatmap_image = gr.Image(f"./heatmaps/{evt.value}_vision-CoT.jpg")
|
| 115 |
+
return heatmap_image
|
| 116 |
|
| 117 |
|
| 118 |
+
def load_heatmap_1shot(evt: gr.SelectData):
|
| 119 |
+
heatmap_image = gr.Image(f"./heatmaps/{evt.value}_1shot.jpg")
|
| 120 |
return heatmap_image
|
| 121 |
|
| 122 |
|
| 123 |
+
# Then, use these functions in the corresponding select method calls:
|
| 124 |
+
|
| 125 |
with gr.Blocks() as demo:
|
| 126 |
gr.Markdown("# FSM Benchmark Leaderboard")
|
| 127 |
+
|
| 128 |
+
# Text-only Benchmark
|
| 129 |
with gr.Tab("Text-only Benchmark"):
|
| 130 |
+
leader_board_textonly = gr.Dataframe(
|
| 131 |
+
accuracy_df_textonly, headers=headers_with_icons
|
| 132 |
+
)
|
| 133 |
+
gr.Markdown("## Heatmap")
|
| 134 |
+
heatmap_image_textonly = gr.Image(label="", show_label=False)
|
| 135 |
+
leader_board_textonly.select(
|
| 136 |
+
fn=load_heatmap_textonly, outputs=[heatmap_image_textonly]
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
# CoT Benchmark
|
| 140 |
+
with gr.Tab("CoT Benchmark"):
|
| 141 |
+
leader_board_cot = gr.Dataframe(accuracy_df_cot, headers=headers_with_icons)
|
| 142 |
+
gr.Markdown("## Heatmap")
|
| 143 |
+
heatmap_image_cot = gr.Image(label="", show_label=False)
|
| 144 |
+
leader_board_cot.select(fn=load_heatmap_cot, outputs=[heatmap_image_cot])
|
| 145 |
+
|
| 146 |
+
# Vision Benchmark
|
| 147 |
+
with gr.Tab("Vision Benchmark"):
|
| 148 |
+
leader_board_vision = gr.Dataframe(
|
| 149 |
+
accuracy_df_vision, headers=headers_with_icons
|
| 150 |
+
)
|
| 151 |
+
gr.Markdown("## Heatmap")
|
| 152 |
+
heatmap_image_vision = gr.Image(label="", show_label=False)
|
| 153 |
+
leader_board_vision.select(
|
| 154 |
+
fn=load_heatmap_vision, outputs=[heatmap_image_vision]
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Vision-CoT Benchmark
|
| 158 |
+
with gr.Tab("Vision-CoT Benchmark"):
|
| 159 |
+
leader_board_vision_cot = gr.Dataframe(
|
| 160 |
+
accuracy_df_vision_cot, headers=headers_with_icons
|
| 161 |
+
)
|
| 162 |
+
gr.Markdown("## Heatmap")
|
| 163 |
+
heatmap_image_vision_cot = gr.Image(label="", show_label=False)
|
| 164 |
+
leader_board_vision_cot.select(
|
| 165 |
+
fn=load_heatmap_vision_cot, outputs=[heatmap_image_vision_cot]
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
# 1shot Benchmark
|
| 169 |
+
with gr.Tab("1shot Benchmark"):
|
| 170 |
+
leader_board_1shot = gr.Dataframe(accuracy_df_1shot, headers=headers_with_icons)
|
| 171 |
gr.Markdown("## Heatmap")
|
| 172 |
+
heatmap_image_1shot = gr.Image(label="", show_label=False)
|
| 173 |
+
leader_board_1shot.select(fn=load_heatmap_1shot, outputs=[heatmap_image_1shot])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
demo.launch()
|
results/CodeLlama-70b-Instruct-hf.jpg → heatmaps/CodeLlama-70b-Instruct-hf_CoT.jpg
RENAMED
|
File without changes
|
results/GPT-4-0125-preview.png → heatmaps/CodeLlama-70b-Instruct-hf_Textonly.jpg
RENAMED
|
File without changes
|
results/CodeLlama-70b-Instruct-hf.png → heatmaps/Llama-2-70b-chat-hf_CoT.jpg
RENAMED
|
File without changes
|
results/GPT-4-0125-preview.jpg → heatmaps/Llama-2-70b-chat-hf_Textonly.jpg
RENAMED
|
File without changes
|
heatmaps/Llama-3-70b-chat-hf_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/Llama-3-70b-chat-hf_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/Mistral-7B-Instruct-v0.2_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/Mistral-7B-Instruct-v0.2_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/Mixtral-8x7B-Instruct-v0.1_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/Mixtral-8x7B-Instruct-v0.1_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/Qwen1.5-72B-Chat_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/Qwen1.5-72B-Chat_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/Yi-34B-Chat_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/Yi-34B-Chat_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-haiku-20240307_1shot.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-haiku-20240307_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-haiku-20240307_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-haiku-20240307_vision-CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-haiku-20240307_vision.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-opus-20240229_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-opus-20240229_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-opus-20240229_vision-CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-opus-20240229_vision.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-sonnet-20240229_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-sonnet-20240229_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-sonnet-20240229_vision-CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/claude-3-sonnet-20240229_vision.jpg
ADDED
|
Git LFS Details
|
heatmaps/dbrx-instruct_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/dbrx-instruct_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/deepseek-llm-67b-chat_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/deepseek-llm-67b-chat_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gemini-pro_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gemini-pro_vision-CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gemini-pro_vision.jpg
ADDED
|
Git LFS Details
|
heatmaps/gemma-7b-it_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gemma-7b-it_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-3.5-0613_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-3.5-0613_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-3.5-turbo-0125_1shot.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-3.5-turbo-0125_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-3.5-turbo-0125_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-4-0125-preview_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-4-0125-preview_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-4-1106_CoT.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-4-1106_Textonly.jpg
ADDED
|
Git LFS Details
|
heatmaps/gpt-4-turbo-2024-04-09_CoT.jpg
ADDED
|
Git LFS Details
|
results/gpt-4-turbo-2024-04-09.jpg → heatmaps/gpt-4-turbo-2024-04-09_Textonly.jpg
RENAMED
|
File without changes
|
heatmaps/gpt-4-vision-preview_vision-CoT.jpg
ADDED
|
Git LFS Details
|