LLM-leaderboard / main.py
geoalgo's picture
change tab order
0548301
import random
from pathlib import Path
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns
# Directory that contains this script. Data files are resolved against it so
# the app can be launched from any working directory.
abs_path = Path(__file__).parent

# English core benchmark table. The "#Tokens" and precomputed "AVG" columns
# are removed; every column after the first is then treated as a benchmark.
df_core = pd.read_csv(abs_path / "opensci-ref-table.csv")
df_core.drop(columns=["#Tokens", "AVG"], inplace=True)
benchmarks_core = df_core.columns[1:]
# Row-wise mean over the benchmark columns, then best average first.
df_core["Average โฌ†๏ธ"] = df_core.loc[:, benchmarks_core].mean(axis=1)
df_core.sort_values(by="Average โฌ†๏ธ", ascending=False, inplace=True)
# Instruction-tuning preference results, resolved against this script's
# directory (abs_path) rather than the current working directory.
df_instruction_tuning = pd.read_csv(abs_path / "results_instruction_tuning.csv.zip")
# Exclude rows whose model_B contains "12b". The explicit .copy() makes the
# filtered frame independent, so the model_B assignment below does not write
# into a slice of the frame returned by read_csv (SettingWithCopyWarning).
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning["model_B"].str.contains("12b")
].copy()
# Keep only the text after the last "/" of the model identifier.
df_instruction_tuning["model_B"] = df_instruction_tuning["model_B"].apply(
    lambda s: s.split("/")[-1]
)

# One row per model, one column per benchmark; pivot_table aggregates
# "preference" with its default aggregation function (mean).
df_instruction_tuning_pivot = df_instruction_tuning.pivot_table(
    index="model_B", columns="benchmark", values="preference"
)
df_instruction_tuning_pivot.index.rename("Model", inplace=True)
df_instruction_tuning_pivot.reset_index(drop=False, inplace=True)
df_instruction_tuning_pivot.columns = [
    x.capitalize() for x in df_instruction_tuning_pivot.columns
]
# The first column is the model name; average over every other column.
df_instruction_tuning_pivot["Average โฌ†๏ธ"] = df_instruction_tuning_pivot.loc[
    :, df_instruction_tuning_pivot.columns[1:]
].mean(axis=1)
df_instruction_tuning_pivot.sort_values(by="Average โฌ†๏ธ", ascending=False, inplace=True)
# Per-language breakdown of the "m-arena-hard-EU" benchmark.
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning["benchmark"] == "m-arena-hard-EU"
].copy()
# The language is the text after the last "-" of instruction_index. It is
# computed from the filtered rows themselves, so rows of other benchmarks are
# never parsed and no index alignment with the full frame is needed.
df_mah_pivot["lang"] = df_mah_pivot["instruction_index"].apply(
    lambda s: s.split("-")[-1]
)
# One row per model, one column per language (default aggregation: mean).
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)
df_mah_pivot["Average โฌ†๏ธ"] = df_mah_pivot.mean(axis=1)
df_mah_pivot.sort_values(by="Average โฌ†๏ธ", ascending=False, inplace=True)
df_mah_pivot.index.rename("Model", inplace=True)
df_mah_pivot.reset_index(drop=False, inplace=True)
# Raw multilingual evaluation results, resolved against this script's
# directory (abs_path) so loading does not depend on the working directory.
df_eval = pd.read_csv(abs_path / "multilingual_results.csv")
def map_task_to_group(task: str) -> str | None:
if task == "xcopa":
return "XCOPA"
if task == "xstorycloze":
return "XStoryCloze"
if task == "xwinograd":
return "XWinograd"
if task.startswith("include_base_44_"):
return "INCLUDE"
if task.startswith("belebele_"):
return "Belebele"
if task.startswith("global_mmlu_full_"):
return "Global MMLU"
return None
# Tag each evaluation row with its task group and keep only the rows that
# belong to one.
df_eval["group"] = df_eval["task"].apply(map_task_to_group)
df_eval_grouped = df_eval.loc[df_eval["group"].notna()].copy()
# Short model name: the text after the last "/" of model_name.
df_eval_grouped["Model"] = df_eval_grouped["model_name"].apply(
    lambda name: name.rsplit("/", 1)[-1]
)
# One row per model, one column per group, holding the mean performance.
df_multilingual_pivot = df_eval_grouped.pivot_table(
    values="performance", index="Model", columns="group", aggfunc="mean"
)
df_multilingual_pivot["Average โฌ†๏ธ"] = df_multilingual_pivot.mean(axis=1)
# Best average first, then expose the model name as a regular column.
df_multilingual_pivot = (
    df_multilingual_pivot.sort_values(by="Average โฌ†๏ธ", ascending=False)
    .rename_axis(index="Model")
    .reset_index(drop=False)
)
# n_shot label per group: the shared value when a group has exactly one
# distinct n_shot, otherwise the string "mixed".
_n_shot_by_group = df_eval_grouped.groupby("group")["n_shot"]
group_nshot = _n_shot_by_group.agg(
    lambda shots: "mixed" if shots.nunique() != 1 else shots.iloc[0]
).to_dict()
def display_name(group: str) -> str:
    """Return the display label for *group*: ``"<group> [<n_shot label>]"``.

    The label is looked up in the module-level ``group_nshot`` mapping;
    ``"unknown"`` is used when the group has no entry there.
    """
    label = group_nshot.get(group, "unknown")
    # "mixed", "unknown" and concrete n_shot values are all rendered the same
    # way, so no per-case branching is required.
    return f"{group} [{label}]"
# Display copy of the multilingual table: group columns get their display
# label, while the Model and Average columns keep their names.
_passthrough_columns = ("Model", "Average โฌ†๏ธ")
display_columns_map = dict(
    (name, display_name(name))
    for name in df_multilingual_pivot.columns
    if name not in _passthrough_columns
)
df_multilingual_display_all = df_multilingual_pivot.rename(columns=display_columns_map)
# List of model names. NOTE(review): nothing in the visible part of this file
# reads `cols` — confirm whether it is still needed.
cols = [
    #'Llama-3.1-8B',
    "Llama-3.1-Tulu-3-8B-SFT",
    "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO",
    "Apertus-8B-Instruct-2509",
]
# England flag emoji: U+1F3F4 followed by the tag characters "gbeng" and a
# cancel tag. Written with escapes because the tag characters are invisible
# and easy to corrupt or duplicate when the label is edited.
_ENGLAND_FLAG = (
    "\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F"
)

# Gradio app with one tab per leaderboard table. Every table is rounded to two
# decimals for display, keeps "Model" permanently selected and is searchable
# by model name.
with gr.Blocks() as demo:
    gr.Markdown("\n# 🥇 OpenEuroLLM Leaderboard 🇪🇺\n")
    with gr.Tabs():
        with gr.Tab(f"English Core {_ENGLAND_FLAG}🇺🇸"):
            Leaderboard(
                value=df_core.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_core.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        # NOTE(review): the globe glyph of this label was not recoverable from
        # the mis-encoded original; 🌍 is assumed.
        with gr.Tab("Multilingual evaluations 🌍"):
            gr.Markdown(
                "\nAggregated multilingual performance by task group "
                "(mean across languages when applicable).\n"
            )
            # Order columns: Model, groups..., Average. Groups that are absent
            # from the pivot are skipped.
            raw_group_columns = [
                col
                for col in [
                    "INCLUDE",
                    "Belebele",
                    "Global MMLU",
                    "XCOPA",
                    "XStoryCloze",
                    "XWinograd",
                ]
                if col in df_multilingual_pivot.columns
            ]
            display_group_columns = [
                display_columns_map[col] for col in raw_group_columns
            ]
            # The average column name is kept byte-for-byte: it has to match
            # the name given to that column when the pivot table was built.
            ordered_columns = ["Model", *display_group_columns, "Average โฌ†๏ธ"]
            df_multilingual_display = df_multilingual_display_all.loc[
                :, ordered_columns
            ]
            Leaderboard(
                value=df_multilingual_display.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_multilingual_display.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab(f"Instruction-tuning 🎯{_ENGLAND_FLAG}"):
            gr.Markdown(
                "\nWinrate against Llama-3.1-8B-Instruct using "
                "Llama-3.1-70B-Instruct as the LLM-judge.\n"
            )
            Leaderboard(
                value=df_instruction_tuning_pivot.round(2),
                # No default_selection is passed here, so the component's own
                # default applies.
                select_columns=SelectColumns(
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
        with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
            gr.Markdown(
                "\nWinrate on m-Arena-Hard instructions against "
                "Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the "
                "LLM-judge.\n"
            )
            # Flag shown next to each language-code column header.
            language_flags = {
                "cs": "🇨🇿",
                "de": "🇩🇪",
                "el": "🇬🇷",
                "en": "🇬🇧",
                "es": "🇪🇸",
                "fr": "🇫🇷",
                "it": "🇮🇹",
                "nl": "🇳🇱",
                "pl": "🇵🇱",
                "pt": "🇵🇹",
                "ro": "🇷🇴",
                "uk": "🇺🇦",
            }
            # Columns without a known flag (e.g. Model, Average) are kept
            # unchanged.
            df_mah_pivot.columns = [
                f"{x} {language_flags[x]}" if x in language_flags else x
                for x in df_mah_pivot.columns
            ]
            Leaderboard(
                value=df_mah_pivot.round(2),
                select_columns=SelectColumns(
                    default_selection=list(df_mah_pivot.columns),
                    cant_deselect=["Model"],
                    label="Select Columns to Display:",
                ),
                search_columns=SearchColumns(
                    primary_column="Model",
                    label="Filter a model",
                    secondary_columns=[],
                ),
            )
# Launch the Gradio app only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()