update
Files changed:
- .gitattributes +12 -0
- app.py +86 -40
- results-cot/gpt-3.5-CoT.csv +3 -0
- results-cot/gpt-3.5-CoT.jpg +3 -0
- results-cot/gpt-3.5-CoT.pkl +3 -0
- results-cot/gpt-3.5-CoT.png +3 -0
- results-cot/gpt-4v-CoT-Azure.csv +3 -0
- results-cot/gpt-4v-CoT-Azure.jpg +3 -0
- results-cot/gpt-4v-CoT-Azure.pkl +3 -0
- results-cot/gpt-4v-CoT-Azure.png +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.csv +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.jpg +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.pkl +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.png +3 -0
.gitattributes
CHANGED
@@ -103,3 +103,15 @@ results-vision/claude-3-opus-vision.png filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision.pkl filter=lfs diff=lfs merge=lfs -text
 results-vision/gpt-4v-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.png filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -6,19 +6,30 @@ from glob import glob
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
 vision_results = glob("results-vision/*.pkl")
+# Load CoT text benchmark results
+cot_text_results = glob("results-cot/*.pkl")
+# Load CoT vision benchmark results
+cot_vision_results = glob("results-vision-CoT/*.pkl")
 
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
 vision_data = {file: pd.read_pickle(file) for file in vision_results}
+# Load the CoT text files into a dict
+cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
+# Load the CoT vision files into a dict
+cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
 
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
+
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
+
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -29,16 +40,6 @@ headers_with_icons = [
     "🔬 Level 4",
 ]
 
-# Process text benchmark data
-accuracy = {file: calculate_accuracy(data[file]) for file in data}
-data_for_df = []
-
-for file, df in data.items():
-    overall_accuracy = round(calculate_accuracy(df), 2)
-    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    model_name = file.split("/")[-1].replace(".pkl", "")
-    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
-
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -48,46 +49,65 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-#
-# Process vision benchmark data
-vision_data_for_df = []
-
-# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-# vision_accuracy_df.columns = headers_with_icons
-# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
-
-# Do the same for vision_accuracy_df
-vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
-vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
-vision_accuracy_df.columns = headers_with_icons
-vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+# Function to process data
+def process_data(data):
+    data_for_df = []
+    for file, df in data.items():
+        overall_accuracy = round(calculate_accuracy(df), 2)
+        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+        model_name = file.split("/")[-1].replace(".pkl", "")
+        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+    return data_for_df
+
+
+# Process all data
+text_data_for_df = process_data(data)
+vision_data_for_df = process_data(vision_data)
+cot_text_data_for_df = process_data(cot_text_data)
+cot_vision_data_for_df = process_data(cot_vision_data)
+
+# Create DataFrames
+accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
+cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
+
+# Function to finalize DataFrame
+def finalize_df(df):
+    df = df.round(1)  # Round to one decimal place
+    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+    df.columns = headers_with_icons
+    df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+    return df
+
+
+# Finalize all DataFrames
+accuracy_df = finalize_df(accuracy_df)
+vision_accuracy_df = finalize_df(vision_accuracy_df)
+cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
+cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+
 def load_vision_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
     return heatmap_image
 
+
+def load_cot_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
+    return heatmap_image
+
+
+def load_cot_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
+    return heatmap_image
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -99,9 +119,35 @@ with gr.Blocks() as demo:
 
     with gr.Tab("Vision Benchmark"):
         gr.Markdown("# Vision Benchmark Leaderboard")
-        leader_board_vision = gr.Dataframe(
+        leader_board_vision = gr.Dataframe(
+            vision_accuracy_df, headers=headers_with_icons
+        )
         gr.Markdown("## Heatmap")
         heatmap_image_vision = gr.Image(label="", show_label=False)
-        leader_board_vision.select(
+        leader_board_vision.select(
+            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
+        )
+
+    with gr.Tab("CoT Text-only Benchmark"):
+        gr.Markdown("# CoT Text-only Leaderboard")
+        cot_leader_board_text = gr.Dataframe(
+            cot_text_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_text = gr.Image(label="", show_label=False)
+        cot_leader_board_text.select(
+            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
+        )
+
+    with gr.Tab("CoT Vision Benchmark"):
+        gr.Markdown("# CoT Vision Benchmark Leaderboard")
+        cot_leader_board_vision = gr.Dataframe(
+            cot_vision_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
+        cot_leader_board_vision.select(
+            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
+        )
 
-demo.launch()
+demo.launch()
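Taken together, the app.py change folds the previously duplicated per-benchmark processing into two helpers (process_data and finalize_df) and adds two CoT tabs, each wiring a Dataframe's select event to a heatmap loader. The sketch below isolates that selection pattern; it is a minimal stand-in, not the Space's code, with toy data in place of the pickled results and assuming a results/<cell value>.jpg file exists for whichever cell is clicked.

    import gradio as gr
    import pandas as pd

    # Toy leaderboard standing in for the pickled benchmark results.
    board_df = pd.DataFrame(
        {"🤖 Model Name": ["model-a", "model-b"], "⭐ Overall": ["91.2", "87.5"]}
    )

    def load_heatmap(evt: gr.SelectData):
        # evt.value is the content of the clicked cell; the app uses it as a
        # file stem, so only clicks on the model-name column resolve to a
        # real image path.
        return gr.Image(f"results/{evt.value}.jpg")

    with gr.Blocks() as demo:
        board = gr.Dataframe(board_df)
        heatmap = gr.Image(label="", show_label=False)
        board.select(fn=load_heatmap, outputs=[heatmap])

    demo.launch()

Two caveats in the committed code itself: finalize_df stringifies every score before sort_values, so "⭐ Overall" sorts lexicographically rather than numerically (a 100.0 or a single-digit score would land in the wrong row; sorting before formatting avoids this), and DataFrame.applymap is deprecated in pandas 2.1+ in favor of DataFrame.map.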
results-cot/gpt-3.5-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93
+size 14472393

results-cot/gpt-3.5-CoT.jpg
ADDED
Git LFS Details

results-cot/gpt-3.5-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5429ee7014934ba056f77e642157cb5ed3305246b6bfb6a335dc6cd874b4fd
+size 14487910

results-cot/gpt-3.5-CoT.png
ADDED
Git LFS Details

results-cot/gpt-4v-CoT-Azure.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6
+size 6374181

results-cot/gpt-4v-CoT-Azure.jpg
ADDED
Git LFS Details

results-cot/gpt-4v-CoT-Azure.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847
+size 6320889

results-cot/gpt-4v-CoT-Azure.png
ADDED
Git LFS Details

results-vision-CoT/gemini-pro-vision-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
+size 6184119

results-vision-CoT/gemini-pro-vision-CoT.jpg
ADDED
Git LFS Details

results-vision-CoT/gemini-pro-vision-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
+size 6144275

results-vision-CoT/gemini-pro-vision-CoT.png
ADDED
Git LFS Details
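Each csv/pkl entry above is stored as a Git LFS pointer: the repository keeps only the three version/oid/size lines, while the payload lives in LFS storage (the jpg/png files render the same way, as "Git LFS Details"). As an illustrative sketch only (the helper names and paths are hypothetical, not part of the Space), one could check that a locally materialized artifact matches its pointer like this; in an LFS checkout the working-tree file is already the smudged artifact, so the pointer text would come from e.g. `git cat-file -p HEAD:<path>`.

    import hashlib
    import os

    def parse_pointer(pointer_path):
        # An LFS pointer is three "key value" lines: version, oid, size.
        with open(pointer_path) as f:
            fields = dict(line.strip().split(" ", 1) for line in f if line.strip())
        return fields["oid"].removeprefix("sha256:"), int(fields["size"])

    def verify(artifact_path, pointer_path):
        # Compare both the byte size and the sha256 digest to the pointer.
        oid, size = parse_pointer(pointer_path)
        with open(artifact_path, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        return os.path.getsize(artifact_path) == size and digest == oid

    # e.g. verify("results-cot/gpt-3.5-CoT.pkl", "pointer.txt")  # illustrative paths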