Spaces:

Metric-AI
/

ArmBench-LLM

Running

App Files Files Community

daniel7an committed on Feb 28

Commit

4781b83

1 Parent(s): d21b14f

commit

Browse files

Files changed (3) hide show

app.py +122 -0
mmlu_pro_hy_results.csv +5 -0
benchmark_results.csv → unified_exam_results.csv +2 -2

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import gradio as gr
+import pandas as pd
+import plotly.express as px
+def display_table(exam_type):
+    if exam_type == "Armenian Exams":
+        df = pd.read_csv('unified_exam_results.csv')
+        df = df.sort_values(by='Average score', ascending=False)
+        cols = df.columns.tolist()
+        cols.insert(1, cols.pop(cols.index('Average score')))
+        df = df[cols]
+    elif exam_type == "MMLU-Pro-Hy":
+        df = pd.read_csv('mmlu_pro_hy_results.csv')
+        df = df.sort_values(by='Accuracy', ascending=False)
+    return df
+def create_bar_chart(exam_type, plot_column):
+    if exam_type == "Armenian Exams":
+        df = pd.read_csv('unified_exam_results.csv')
+        df = df.sort_values(by='Average score', ascending=False)
+        df = df.sort_values(by=[plot_column, 'Model'], ascending=[False, True]).reset_index(drop=True)
+        x_col = plot_column
+        title = f'{plot_column} per Model'
+        if plot_column == 'Average score':
+            range_max = 20
+            x_range_max = 20
+        else:
+            range_max = 20
+            x_range_max = 20
+            def get_label(score):
+                if score < 8:
+                    return "Fail"
+                elif 8 <= score <= 18:
+                    return "Pass"
+                else:
+                    return "Distinction"
+            df['Test Result'] = df[plot_column].apply(get_label)
+        if plot_column in ['Average score', 'Accuracy']:
+            fig = px.bar(df,
+                x=x_col,
+                y='Model',
+                color=x_col,
+                color_continuous_scale='tealrose_r',
+                labels={x_col: plot_column, 'Model': 'Model'},
+                title=title,
+                orientation='h',
+                range_color=[0, range_max])
+        else:
+            color_discrete_map = {
+                "Fail": "#d15d80",
+                "Pass": "#edd8be",
+                "Distinction": "#059492"
+            }
+            fig = px.bar(df,
+                x=x_col,
+                y='Model',
+                color=df['Test Result'],
+                color_discrete_map=color_discrete_map,
+                labels={x_col: plot_column, 'Model': 'Model'},
+                title=title,
+                orientation='h')
+        fig.update_layout(
+            xaxis=dict(range=[0, x_range_max]),
+            title=dict(text=title, font=dict(size=16)),
+            xaxis_title=dict(font=dict(size=12)),
+            yaxis_title=dict(font=dict(size=12)),
+            yaxis=dict(autorange="reversed")
+        )
+        return fig
+    elif exam_type == "MMLU-Pro-Hy":
+        df = pd.read_csv('mmlu_pro_hy_results.csv')
+        df = df.sort_values(by='Accuracy', ascending=False)
+        x_col = 'Accuracy'
+        title = 'Accuracy per Model (MMLU-Pro-Hy)'
+        range_max = 1.0
+        x_range_max = 1.0
+        if plot_column != 'Accuracy':
+            def get_label(accuracy):
+                if accuracy < 0.5:
+                    return "Low"
+                elif 0.5 <= accuracy <= 0.8:
+                    return "Medium"
+                else:
+                    return "High"
+            df['Test Result'] = df['Accuracy'].apply(get_label)
+        fig = px.bar(df,
+                                x=x_col,
+                                y='Model',
+                                color=x_col,
+                                color_continuous_scale='tealrose_r',
+                                labels={x_col: plot_column, 'Model': 'Model'},
+                                title=title,
+                                orientation='h',
+                                range_color=[0, range_max])
+        fig.update_layout(
+            xaxis=dict(range=[0, x_range_max]),
+            title=dict(text=title, font=dict(size=16)),
+            xaxis_title=dict(font=dict(size=12)),
+            yaxis_title=dict(font=dict(size=12)),
+            yaxis=dict(autorange="reversed")
+        )
+        return fig
+with gr.Blocks() as app:
+    with gr.Tabs():
+        with gr.TabItem("Armenian Unified Exams"):
+            table_output_armenian = gr.DataFrame(value=lambda: display_table("Armenian Exams"))
+            plot_column_dropdown = gr.Dropdown(choices=['Average score', 'Armenian language exam score', 'Armenian history exam score', 'Mathematics exam score'], value='Average score', label='Select Column to Plot')
+            plot_output_armenian = gr.Plot(lambda column: create_bar_chart("Armenian Exams", column), inputs=plot_column_dropdown)
+        with gr.TabItem("MMLU-Pro-Hy"):
+            table_output_mmlu = gr.DataFrame(value=lambda: display_table("MMLU-Pro-Hy"))
+            plot_output_mmlu = gr.Plot(lambda: create_bar_chart("MMLU-Pro-Hy", 'Accuracy'))
+app.launch(share=True)

mmlu_pro_hy_results.csv ADDED Viewed

	@@ -0,0 +1,5 @@

+Model,Accuracy
+claude-3-5-haiku-20241022,0.526
+claude-3-5-sonnet-20241022,0.701
+gemini-2.0-flash,0.741
+gemini-1.5-flash,0.586

benchmark_results.csv → unified_exam_results.csv RENAMED Viewed

@@ -1,10 +1,10 @@
-model,armenian_language_score,armenian_history_score,mathematics_score,average_score
 claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
 claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
 gemini-2.0-flash,5.5,6.75,17.25,9.83
 gpt-4o,6.75,6.75,13.25,8.92
 qwen-max-2025-01-25,7.25,4.5,14.25,8.67
 gemini-1.5-flash,4.75,3.75,15.0,7.83
-deepseek-ai/DeepSeek-V3,5.25,5.0,12.25,7.5
 Meta-Llama-3.3-70B-Instruct,4.5,5.25,11.5,7.08
 claude-3-5-haiku-20241022,5.0,3.75,10.75,6.5

+Model,Armenian language exam score,Armenian history exam score,Mathematics exam score,Average score
 claude-3-7-sonnet-20250219,10.5,7.75,15.0,11.08
 claude-3-5-sonnet-20241022,10.0,9.25,12.75,10.67
 gemini-2.0-flash,5.5,6.75,17.25,9.83
 gpt-4o,6.75,6.75,13.25,8.92
 qwen-max-2025-01-25,7.25,4.5,14.25,8.67
 gemini-1.5-flash,4.75,3.75,15.0,7.83
+DeepSeek-V3,5.25,5.0,12.25,7.5
 Meta-Llama-3.3-70B-Instruct,4.5,5.25,11.5,7.08
 claude-3-5-haiku-20241022,5.0,3.75,10.75,6.5