# Hugging Face Space page header (status at capture time: Sleeping)
| import os | |
| from glob import glob | |
| import gradio as gr | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import seaborn as sns | |
| from matplotlib.colors import BoundaryNorm, ListedColormap | |
# Load the pre-computed benchmark results from disk. Downstream code expects a
# pandas DataFrame with at least the columns 'model', 'category',
# 'is_answer_correct' and 'difficulty_level' (see get_accuracy_dataframe).
all_results = pd.read_pickle("final_df.pkl")
def get_accuracy_dataframe(df_mother, category):
    """Build a leaderboard table of per-model accuracy for one benchmark category.

    Parameters
    ----------
    df_mother : pandas.DataFrame
        Raw results with columns 'model', 'category', 'is_answer_correct'
        and 'difficulty_level'.
    category : str
        Category to filter on (e.g. 'Textonly', 'CoT', 'vision').

    Returns
    -------
    pandas.DataFrame
        One row per model, icon-labelled columns: overall accuracy followed by
        accuracy for difficulty levels 1-4, as percentages rounded to one
        decimal, sorted best model first. Levels absent from the data are NA.
    """
    # Restrict to the requested category; copy so the caller's frame is untouched.
    df = df_mother[df_mother["category"] == category].copy()
    df["is_answer_correct"] = df["is_answer_correct"].astype(float)

    # Overall accuracy per model.
    model_accuracy = df.groupby("model")["is_answer_correct"].mean().reset_index()

    # Accuracy per (model, difficulty level), pivoted to one column per level.
    df["difficulty_level"] = df["difficulty_level"].astype(int)
    model_accuracy_per_level_df = (
        df.groupby(["model", "difficulty_level"])["is_answer_correct"]
        .mean()
        .reset_index()
        .pivot(index="model", columns="difficulty_level", values="is_answer_correct")
    )

    # Merge overall and per-level accuracy (joins on the pivot's 'model' index level).
    model_accuracy_df = model_accuracy.merge(model_accuracy_per_level_df, on="model")
    model_accuracy_df.rename(
        columns={"is_answer_correct": "Overall Accuracy"}, inplace=True
    )
    # Keep only the short model name ('org/model' -> 'model').
    model_accuracy_df["model"] = model_accuracy_df["model"].apply(
        lambda x: x.split("/")[-1]
    )

    # Ensure all expected difficulty levels exist, then enforce a fixed column
    # order. BUG FIX: previously a missing level was appended as the LAST
    # column, so the positional icon headers below mislabelled the level data.
    expected_levels = [1, 2, 3, 4]
    for level in expected_levels:
        if level not in model_accuracy_df.columns:
            model_accuracy_df[level] = None  # no data for this level
    model_accuracy_df = model_accuracy_df[["model", "Overall Accuracy"] + expected_levels]

    # Convert fractions to percentages with one decimal; non-floats untouched.
    model_accuracy_df = model_accuracy_df.applymap(
        lambda x: round(x * 100, 1) if isinstance(x, float) else x
    )

    # Positional display headers — order matches the column selection above.
    model_accuracy_df.columns = [
        "π€ Model Name",
        "β Overall",
        "π Level 1",
        "π Level 2",
        "π Level 3",
        "π¬ Level 4",
    ]
    model_accuracy_df.sort_values(by="β Overall", ascending=False, inplace=True)
    return model_accuracy_df
# Known categories in the data: '1shot', 'CoT', 'Textonly', 'vision', 'vision-CoT'.
# Pre-compute one leaderboard table per benchmark category at startup so tab
# rendering below is instant.
accuracy_df_textonly = get_accuracy_dataframe(all_results, "Textonly")
accuracy_df_cot = get_accuracy_dataframe(all_results, "CoT")
accuracy_df_vision = get_accuracy_dataframe(all_results, "vision")
accuracy_df_vision_cot = get_accuracy_dataframe(all_results, "vision-CoT")
accuracy_df_1shot = get_accuracy_dataframe(all_results, "1shot")
# Display headers (with icons) shared by every leaderboard table below.
headers_with_icons = [
    "π€ Model Name",
    "β Overall",
    "π Level 1",
    "π Level 2",
    "π Level 3",
    "π¬ Level 4",
]

# Plain-text equivalents of the headers above (not referenced in this file).
column_names = ["Model Name", "Overall Accuracy"] + [
    f"Level {n} Accuracy" for n in range(1, 5)
]
def load_heatmap_textonly(evt: gr.SelectData):
    """Return the Text-only heatmap image for the table cell the user selected.

    evt.value is the value of the clicked cell (expected to be the model name).
    """
    # FIX: dropped the leftover debug print(); the other four loaders do not log.
    return gr.Image(f"./heatmaps/{evt.value}_Textonly.jpg")
def load_heatmap_cot(evt: gr.SelectData):
    """Return the CoT heatmap image matching the selected cell's model name."""
    return gr.Image(f"./heatmaps/{evt.value}_CoT.jpg")
def load_heatmap_vision(evt: gr.SelectData):
    """Return the vision heatmap image matching the selected cell's model name."""
    return gr.Image(f"./heatmaps/{evt.value}_vision.jpg")
def load_heatmap_vision_cot(evt: gr.SelectData):
    """Return the vision-CoT heatmap image matching the selected cell's model name."""
    return gr.Image(f"./heatmaps/{evt.value}_vision-CoT.jpg")
def load_heatmap_1shot(evt: gr.SelectData):
    """Return the 1shot heatmap image matching the selected cell's model name."""
    return gr.Image(f"./heatmaps/{evt.value}_1shot.jpg")
| # Then, use these functions in the corresponding select method calls: | |
# Assemble the UI: one tab per benchmark, each holding a leaderboard table
# whose row selection loads the corresponding pre-rendered heatmap image.
with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    # (tab title, leaderboard dataframe, heatmap loader) for each benchmark.
    tab_specs = [
        ("Text-only Benchmark", accuracy_df_textonly, load_heatmap_textonly),
        ("CoT Benchmark", accuracy_df_cot, load_heatmap_cot),
        ("Vision Benchmark", accuracy_df_vision, load_heatmap_vision),
        ("Vision-CoT Benchmark", accuracy_df_vision_cot, load_heatmap_vision_cot),
        ("1shot Benchmark", accuracy_df_1shot, load_heatmap_1shot),
    ]

    for tab_title, accuracy_df, heatmap_loader in tab_specs:
        with gr.Tab(tab_title):
            leaderboard = gr.Dataframe(accuracy_df, headers=headers_with_icons)
            gr.Markdown("## Heatmap")
            heatmap_image = gr.Image(label="", show_label=False)
            leaderboard.select(fn=heatmap_loader, outputs=[heatmap_image])

demo.launch()