update
Files changed:
- .gitattributes +12 -0
- app.py +86 -40
- results-cot/gpt-3.5-CoT.csv +3 -0
- results-cot/gpt-3.5-CoT.jpg +3 -0
- results-cot/gpt-3.5-CoT.pkl +3 -0
- results-cot/gpt-3.5-CoT.png +3 -0
- results-cot/gpt-4v-CoT-Azure.csv +3 -0
- results-cot/gpt-4v-CoT-Azure.jpg +3 -0
- results-cot/gpt-4v-CoT-Azure.pkl +3 -0
- results-cot/gpt-4v-CoT-Azure.png +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.csv +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.jpg +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.pkl +3 -0
- results-vision-CoT/gemini-pro-vision-CoT.png +3 -0
.gitattributes
CHANGED
@@ -103,3 +103,15 @@ results-vision/claude-3-opus-vision.png filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
 results-vision/gemini-pro-vision.pkl filter=lfs diff=lfs merge=lfs -text
 results-vision/gpt-4v-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.jpg filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.jpg filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.png filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.png filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.pkl filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-4v-CoT-Azure.csv filter=lfs diff=lfs merge=lfs -text
+results-vision-CoT/gemini-pro-vision-CoT.csv filter=lfs diff=lfs merge=lfs -text
+results-cot/gpt-3.5-CoT.csv filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -6,19 +6,30 @@ from glob import glob
 csv_results = glob("results/*.pkl")
 # Load vision benchmark results
 vision_results = glob("results-vision/*.pkl")
+# Load CoT text benchmark results
+cot_text_results = glob("results-cot/*.pkl")
+# Load CoT vision benchmark results
+cot_vision_results = glob("results-vision-CoT/*.pkl")
 
 # Load the csv files into a dict with keys being name of the file and values being the data
 data = {file: pd.read_pickle(file) for file in csv_results}
 # Load the vision files into a dict
 vision_data = {file: pd.read_pickle(file) for file in vision_results}
+# Load the CoT text files into a dict
+cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
+# Load the CoT vision files into a dict
+cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
 
 
 def calculate_accuracy(df):
     return df["parsed_judge_response"].mean() * 100
 
+
 def accuracy_breakdown(df):
     # 4 level accuracy
     return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
 
+
 # Define the column names with icons
 headers_with_icons = [
     "🤖 Model Name",
@@ -29,16 +40,6 @@ headers_with_icons = [
     "🔬 Level 4",
 ]
 
-# Process text benchmark data
-accuracy = {file: calculate_accuracy(data[file]) for file in data}
-data_for_df = []
-
-for file, df in data.items():
-    overall_accuracy = round(calculate_accuracy(df), 2)
-    breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
-    model_name = file.split("/")[-1].replace(".pkl", "")
-    data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
-
 column_names = [
     "Model Name",
     "Overall Accuracy",
@@ -48,46 +49,65 @@ column_names = [
     "Level 4 Accuracy",
 ]
 
-#
-# Process vision benchmark data
-vision_data_for_df = []
-
-# vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-# vision_accuracy_df.columns = headers_with_icons
-# vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
-
-# Do the same for vision_accuracy_df
-vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
-vision_accuracy_df = vision_accuracy_df.round(1)  # Round to one decimal place
-vision_accuracy_df = vision_accuracy_df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
-vision_accuracy_df.columns = headers_with_icons
-vision_accuracy_df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+# Function to process data
+def process_data(data):
+    data_for_df = []
+    for file, df in data.items():
+        overall_accuracy = round(calculate_accuracy(df), 2)
+        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
+        model_name = file.split("/")[-1].replace(".pkl", "")
+        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
+    return data_for_df
+
+
+# Process all data
+text_data_for_df = process_data(data)
+vision_data_for_df = process_data(vision_data)
+cot_text_data_for_df = process_data(cot_text_data)
+cot_vision_data_for_df = process_data(cot_vision_data)
+
+# Create DataFrames
+accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
+vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
+cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
+cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
+
+# Function to finalize DataFrame
+def finalize_df(df):
+    df = df.round(1)  # Round to one decimal place
+    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
+    df.columns = headers_with_icons
+    df.sort_values(by="⭐ Overall", ascending=False, inplace=True)
+    return df
+
+
+# Finalize all DataFrames
+accuracy_df = finalize_df(accuracy_df)
+vision_accuracy_df = finalize_df(vision_accuracy_df)
+cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
+cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)
 
 def load_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results/{evt.value}.jpg")
     return heatmap_image
 
+
 def load_vision_heatmap(evt: gr.SelectData):
     heatmap_image = gr.Image(f"results-vision/{evt.value}.jpg")
     return heatmap_image
 
+
+def load_cot_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-cot/{evt.value}.jpg")
+    return heatmap_image
+
+
+def load_cot_vision_heatmap(evt: gr.SelectData):
+    heatmap_image = gr.Image(f"results-vision-CoT/{evt.value}.jpg")
+    return heatmap_image
+
+
 with gr.Blocks() as demo:
     gr.Markdown("# FSM Benchmark Leaderboard")
     with gr.Tab("Text-only Benchmark"):
@@ -99,9 +119,35 @@ with gr.Blocks() as demo:
 
     with gr.Tab("Vision Benchmark"):
         gr.Markdown("# Vision Benchmark Leaderboard")
-        leader_board_vision = gr.Dataframe(
+        leader_board_vision = gr.Dataframe(
+            vision_accuracy_df, headers=headers_with_icons
+        )
         gr.Markdown("## Heatmap")
         heatmap_image_vision = gr.Image(label="", show_label=False)
-        leader_board_vision.select(
+        leader_board_vision.select(
+            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
+        )
+
+    with gr.Tab("CoT Text-only Benchmark"):
+        gr.Markdown("# CoT Text-only Leaderboard")
+        cot_leader_board_text = gr.Dataframe(
+            cot_text_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_text = gr.Image(label="", show_label=False)
+        cot_leader_board_text.select(
+            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
+        )
+
+    with gr.Tab("CoT Vision Benchmark"):
+        gr.Markdown("# CoT Vision Benchmark Leaderboard")
+        cot_leader_board_vision = gr.Dataframe(
+            cot_vision_accuracy_df, headers=headers_with_icons
+        )
+        gr.Markdown("## Heatmap")
+        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
+        cot_leader_board_vision.select(
+            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
+        )
 
-demo.launch()
+demo.launch()
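Taken together, the app.py change folds the previously duplicated per-benchmark processing into two helpers (process_data and finalize_df) and adds two CoT tabs, each wiring a Dataframe's select event to a heatmap loader. The sketch below isolates that selection pattern; it is a minimal stand-in, not the Space's code, with toy data in place of the pickled results and assuming a results/<cell value>.jpg file exists for whichever cell is clicked.

    import gradio as gr
    import pandas as pd

    # Toy leaderboard standing in for the pickled benchmark results.
    board_df = pd.DataFrame(
        {"🤖 Model Name": ["model-a", "model-b"], "⭐ Overall": ["91.2", "87.5"]}
    )

    def load_heatmap(evt: gr.SelectData):
        # evt.value is the content of the clicked cell; the app uses it as a
        # file stem, so only clicks on the model-name column resolve to a
        # real image path.
        return gr.Image(f"results/{evt.value}.jpg")

    with gr.Blocks() as demo:
        board = gr.Dataframe(board_df)
        heatmap = gr.Image(label="", show_label=False)
        board.select(fn=load_heatmap, outputs=[heatmap])

    demo.launch()

Two caveats in the committed code itself: finalize_df stringifies every score before sort_values, so "⭐ Overall" sorts lexicographically rather than numerically (a 100.0 or a single-digit score would land in the wrong row; sorting before formatting avoids this), and DataFrame.applymap is deprecated in pandas 2.1+ in favor of DataFrame.map.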
results-cot/gpt-3.5-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25400229561733404647fa6aa2ab0372a8507f8c32a17339e9566a57c2618c93
+size 14472393

results-cot/gpt-3.5-CoT.jpg
ADDED
Git LFS Details

results-cot/gpt-3.5-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5429ee7014934ba056f77e642157cb5ed3305246b6bfb6a335dc6cd874b4fd
+size 14487910

results-cot/gpt-3.5-CoT.png
ADDED
Git LFS Details

results-cot/gpt-4v-CoT-Azure.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04b4de1a7a4280354c89609d15282109ee60f8f58129960dc0edbb046b12a5c6
+size 6374181

results-cot/gpt-4v-CoT-Azure.jpg
ADDED
Git LFS Details

results-cot/gpt-4v-CoT-Azure.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52ae5e417e011db84976acd51a024eae7ccea1e686b7f3f0e8158cd77be4f847
+size 6320889

results-cot/gpt-4v-CoT-Azure.png
ADDED
Git LFS Details

results-vision-CoT/gemini-pro-vision-CoT.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ebebe1d6caee19a4f714bf13eaba72e7a0b5d15281c407cd4dc53a2820ad312
+size 6184119

results-vision-CoT/gemini-pro-vision-CoT.jpg
ADDED
Git LFS Details

results-vision-CoT/gemini-pro-vision-CoT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:049d575dbad9da04496fea752e19f915bcec445b13f3010f9c67544012c936ff
+size 6144275

results-vision-CoT/gemini-pro-vision-CoT.png
ADDED
Git LFS Details
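Each csv/pkl entry above is stored as a Git LFS pointer: the repository keeps only the three version/oid/size lines, while the payload lives in LFS storage (the jpg/png files render the same way, as "Git LFS Details"). As an illustrative sketch only (the helper names and paths are hypothetical, not part of the Space), one could check that a locally materialized artifact matches its pointer like this; in an LFS checkout the working-tree file is already the smudged artifact, so the pointer text would come from e.g. `git cat-file -p HEAD:<path>`.

    import hashlib
    import os

    def parse_pointer(pointer_path):
        # An LFS pointer is three "key value" lines: version, oid, size.
        with open(pointer_path) as f:
            fields = dict(line.strip().split(" ", 1) for line in f if line.strip())
        return fields["oid"].removeprefix("sha256:"), int(fields["size"])

    def verify(artifact_path, pointer_path):
        # Compare both the byte size and the sha256 digest to the pointer.
        oid, size = parse_pointer(pointer_path)
        with open(artifact_path, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        return os.path.getsize(artifact_path) == size and digest == oid

    # e.g. verify("results-cot/gpt-3.5-CoT.pkl", "pointer.txt")  # illustrative paths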