evalita_llm_leaderboard

Running

App Files Files Community

rzanoli committed on Aug 25

Commit

97a28b9

1 Parent(s): b2119dc

Add the chart showing the model accuracy by task

Browse files

Files changed (1) hide show

app.py +98 -0

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ import matplotlib.pyplot as plt
 import re
 import plotly.express as px
 import plotly.graph_objects as go
 def mean_of_max_per_field(df):
@@ -29,6 +30,8 @@ def mean_of_max_per_field(df):
     """
     fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
     # Controlla che tutte le colonne esistano nel DataFrame
     missing = [f for f in fields if f not in df.columns]
     if missing:
@@ -43,6 +46,99 @@ def mean_of_max_per_field(df):
     return mean_max
 def line_chart(dataframe):
     # Separiamo i dati in base a IS_FS
     df_true = dataframe[dataframe['IS_FS'] == True]
@@ -99,6 +195,7 @@ def line_chart(dataframe):
     # Disabilita lo zoom e altri controlli
     fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
     fig.update_yaxes(fixedrange=True)
     return fig
@@ -405,6 +502,7 @@ with demo:
             #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
             #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
             gr.Plot(value=line_chart(LEADERBOARD_DF))
         # About tab
         with gr.TabItem("📝 About"):

 import re
 import plotly.express as px
 import plotly.graph_objects as go
+import numpy as np
 def mean_of_max_per_field(df):
     """
     fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+    #print(df.columns)
     # Controlla che tutte le colonne esistano nel DataFrame
     missing = [f for f in fields if f not in df.columns]
     if missing:
     return mean_max
+def boxplot_per_task(dataframe=None, baselines=None):
+    """Build a Plotly figure with one accuracy box plot per task.
+
+    Args:
+        dataframe: Table with one column per task code ("TE", "SA", ...).
+            Values are plotted on a y-axis fixed to [0, 100] and labelled
+            "Accuracy (%)". Tasks without a column are skipped and NaN
+            entries are dropped. When None, a synthetic table of 20 values
+            per task, drawn uniformly from [40, 90), is used instead.
+        baselines: Mapping from task code to a baseline value, drawn as a
+            dashed black line with a text label over the task's box. Tasks
+            that are absent from the mapping, or mapped to None, get no
+            line. When None, a random integer in [50, 70) is used per task.
+
+    Returns:
+        The assembled ``plotly.graph_objects.Figure``.
+    """
+    tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
+              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
+    box_width = 0.6
+    if dataframe is None:
+        # Demo mode: seeding NumPy's global RNG keeps the sample data stable.
+        np.random.seed(42)
+        dataframe = pd.DataFrame({
+            task: np.random.uniform(0.4, 0.9, 20) * 100
+            for task in tasks
+        })
+    # Per-task baseline (random placeholder between 50 and 70 when not given).
+    if baselines is None:
+        baselines = {task: np.random.randint(50, 70) for task in tasks}
+    fig = go.Figure()
+    # Slot of the next box on the categorical x-axis. It is tracked apart from
+    # the index into `tasks` because skipped tasks do not occupy a slot; using
+    # the task index would shift every later baseline onto the wrong box.
+    position = 0
+    for task, color in zip(tasks, colors):
+        if task not in dataframe.columns:
+            continue
+        fig.add_trace(go.Box(
+            y=dataframe[task].dropna().tolist(),
+            name=task,
+            boxmean="sd",
+            marker=dict(color=color, line=dict(width=1)),
+            line=dict(color=color),
+            fillcolor=color,
+            opacity=0.7,
+            hovertemplate=f"<b>{task}</b><br>Accuracy: %{{y:.2f}}%<extra></extra>",
+            width=box_width,
+        ))
+        # Baseline for this task, if one is available.
+        if task in baselines and baselines[task] is not None:
+            level = baselines[task]
+            # Horizontal dashed line spanning the width of the box.
+            fig.add_shape(
+                type="line",
+                x0=position - box_width / 2, x1=position + box_width / 2,
+                y0=level, y1=level,
+                line=dict(color="black", width=2, dash="dash"),
+                xref="x", yref="y",
+            )
+            # Text label showing the baseline value.
+            fig.add_annotation(
+                x=position, y=level,
+                text=f"{level}%",
+                showarrow=False,
+                yshift=10,
+                font=dict(size=10, color="black"),
+            )
+        position += 1
+    fig.update_layout(
+        title="Distribution of Model Accuracy by Task.",
+        xaxis_title="Task",
+        yaxis_title="Accuracy (%)",
+        template="plotly_white",
+        boxmode="group",
+        dragmode=False,
+        font=dict(family="Arial", size=13),
+        margin=dict(b=80),
+    )
+    # The caption is appended with add_annotation rather than passed as
+    # `annotations=[...]` to update_layout: update_layout merges that list
+    # element-wise into the annotations already on the figure, which would
+    # overwrite the first baseline label with the caption.
+    fig.add_annotation(
+        text=(
+            "Boxplots show LLM accuracy in zero/few-shot settings. <br>"
+            "Black dashed lines indicate the best-performing supervised models evaluated during EVALITA."
+        ),
+        xref="paper", yref="paper",
+        x=0.5, y=-0.33,
+        showarrow=False,
+        font=dict(size=12, color="gray"),
+    )
+    fig.update_yaxes(range=[0, 100], fixedrange=True)
+    return fig
+# 🔹 Example usage: per-task baseline values handed to boxplot_per_task as its
+# `baselines` argument. The chart caption describes these dashed lines as the
+# best-performing supervised models evaluated during EVALITA. "FAQ" has no
+# entry here, so no baseline line is drawn for that task.
+BASELINES = {
+    "TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
+    "LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99
+}
 def line_chart(dataframe):
     # Separiamo i dati in base a IS_FS
     df_true = dataframe[dataframe['IS_FS'] == True]
     # Disabilita lo zoom e altri controlli
     fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
     fig.update_yaxes(fixedrange=True)
+    #fig.update_yaxes(range=[0, 100], fixedrange=True)
     return fig
             #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio")
             #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
             gr.Plot(value=line_chart(LEADERBOARD_DF))
+            gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES), interactive_plot_config={'displayModeBar': False })
         # About tab
         with gr.TabItem("📝 About"):