import gradio as gr
import pandas as pd
from glob import glob

# Collect the pickled result files for each benchmark variant
text_results = glob("results/*.pkl")                   # text benchmark
vision_results = glob("results-vision/*.pkl")          # vision benchmark
cot_text_results = glob("results-cot/*.pkl")           # CoT text benchmark
cot_vision_results = glob("results-vision-CoT/*.pkl")  # CoT vision benchmark

# Load each pickle into a dict keyed by its file path
data = {file: pd.read_pickle(file) for file in text_results}
vision_data = {file: pd.read_pickle(file) for file in vision_results}
cot_text_data = {file: pd.read_pickle(file) for file in cot_text_results}
cot_vision_data = {file: pd.read_pickle(file) for file in cot_vision_results}
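# Each pickle is expected to hold a DataFrame with at least a boolean
# "parsed_judge_response" column and a "difficulty_level" column (levels 1-4);
# this schema is inferred from how the columns are used below.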

def calculate_accuracy(df):
    # Overall accuracy: mean of the judge verdicts, as a percentage
    return df["parsed_judge_response"].mean() * 100


def accuracy_breakdown(df):
    # Accuracy per difficulty level (four levels), as percentages
    return (df.groupby("difficulty_level")["parsed_judge_response"].mean() * 100).values
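
# Worked example on hypothetical data: one correct answer at level 1 and one
# wrong answer at level 2 gives 50.0 overall and [100.0, 0.0] per level:
#   df = pd.DataFrame({"parsed_judge_response": [True, False],
#                      "difficulty_level": [1, 2]})
#   calculate_accuracy(df)   # 50.0
#   accuracy_breakdown(df)   # array([100., 0.])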

# Define the column labels shown in the UI, with icons
headers_with_icons = [
    "🤖 Model Name",
    "⭐ Overall",
    "📗 Level 1",
    "📘 Level 2",
    "📙 Level 3",
    "🔬 Level 4",
]

# Plain column names used while the data is still numeric
column_names = [
    "Model Name",
    "Overall Accuracy",
    "Level 1 Accuracy",
    "Level 2 Accuracy",
    "Level 3 Accuracy",
    "Level 4 Accuracy",
]

# Turn a {file path: DataFrame} dict into leaderboard rows
def process_data(data):
    data_for_df = []
    for file, df in data.items():
        overall_accuracy = round(calculate_accuracy(df), 2)
        breakdown_accuracy = [round(acc, 2) for acc in accuracy_breakdown(df)]
        model_name = file.split("/")[-1].replace(".pkl", "")
        data_for_df.append([model_name, overall_accuracy] + breakdown_accuracy)
    return data_for_df
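
# Each row is [model_name, overall, level 1, level 2, level 3, level 4];
# the model name is taken from the file name, so a result saved as
# "results/some-model.pkl" (an illustrative name) appears as "some-model".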

# Process all benchmark variants
text_data_for_df = process_data(data)
vision_data_for_df = process_data(vision_data)
cot_text_data_for_df = process_data(cot_text_data)
cot_vision_data_for_df = process_data(cot_vision_data)

# Create the leaderboard DataFrames
accuracy_df = pd.DataFrame(text_data_for_df, columns=column_names)
vision_accuracy_df = pd.DataFrame(vision_data_for_df, columns=column_names)
cot_text_accuracy_df = pd.DataFrame(cot_text_data_for_df, columns=column_names)
cot_vision_accuracy_df = pd.DataFrame(cot_vision_data_for_df, columns=column_names)
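
# Note: constructing these DataFrames assumes accuracy_breakdown returned
# exactly four values per model, one for each level column in column_names.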

# Sort numerically, then format the values for display
def finalize_df(df):
    # Sort while the accuracy columns are still numeric; sorting after the
    # string formatting below would compare lexicographically ("9.0" > "85.0")
    df = df.sort_values(by="Overall Accuracy", ascending=False)
    df = df.round(1)  # round to one decimal place
    df = df.applymap(lambda x: f"{x:.1f}" if isinstance(x, (int, float)) else x)
    df.columns = headers_with_icons
    return df
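
# DataFrame.applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map;
# applymap is kept here for compatibility with older pandas releases.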

# Finalize all DataFrames
accuracy_df = finalize_df(accuracy_df)
vision_accuracy_df = finalize_df(vision_accuracy_df)
cot_text_accuracy_df = finalize_df(cot_text_accuracy_df)
cot_vision_accuracy_df = finalize_df(cot_vision_accuracy_df)

# Each loader assumes the clicked cell contains a model name with a matching
# heatmap image stored alongside the results, e.g. results/<model name>.jpg
def load_heatmap(evt: gr.SelectData):
    return gr.Image(f"results/{evt.value}.jpg")


def load_vision_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-vision/{evt.value}.jpg")


def load_cot_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-cot/{evt.value}.jpg")


def load_cot_vision_heatmap(evt: gr.SelectData):
    return gr.Image(f"results-vision-CoT/{evt.value}.jpg")
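
# A more defensive variant (a sketch, not part of the original app) would
# only react to clicks on the model-name column:
#
#   def load_heatmap(evt: gr.SelectData):
#       if evt.index[1] != 0:  # ignore clicks outside the first column
#           return gr.Image()
#       return gr.Image(f"results/{evt.value}.jpg")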

with gr.Blocks() as demo:
    gr.Markdown("# FSM Benchmark Leaderboard")

    with gr.Tab("Text-only Benchmark"):
        gr.Markdown("# Text-only Leaderboard")
        leader_board = gr.Dataframe(accuracy_df, headers=headers_with_icons)
        gr.Markdown("## Heatmap")
        heatmap_image = gr.Image(label="", show_label=False)
        leader_board.select(fn=load_heatmap, outputs=[heatmap_image])

    with gr.Tab("Vision Benchmark"):
        gr.Markdown("# Vision Benchmark Leaderboard")
        leader_board_vision = gr.Dataframe(
            vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        heatmap_image_vision = gr.Image(label="", show_label=False)
        leader_board_vision.select(
            fn=load_vision_heatmap, outputs=[heatmap_image_vision]
        )

    with gr.Tab("CoT Text-only Benchmark"):
        gr.Markdown("# CoT Text-only Leaderboard")
        cot_leader_board_text = gr.Dataframe(
            cot_text_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_text = gr.Image(label="", show_label=False)
        cot_leader_board_text.select(
            fn=load_cot_heatmap, outputs=[cot_heatmap_image_text]
        )

    with gr.Tab("CoT Vision Benchmark"):
        gr.Markdown("# CoT Vision Benchmark Leaderboard")
        cot_leader_board_vision = gr.Dataframe(
            cot_vision_accuracy_df, headers=headers_with_icons
        )
        gr.Markdown("## Heatmap")
        cot_heatmap_image_vision = gr.Image(label="", show_label=False)
        cot_leader_board_vision.select(
            fn=load_cot_vision_heatmap, outputs=[cot_heatmap_image_vision]
        )

demo.launch()
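
# launch() starts the local Gradio server; the same entry point is used when
# the app runs as a Hugging Face Space, so no extra arguments are needed here.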