"""Gradio app rendering the OpenEuroLLM leaderboard.

Three result tables are loaded from files located next to this script and
turned into four leaderboard tabs:

* English core benchmarks (``opensci-ref-table.csv``)
* Multilingual evaluations grouped by task family
  (``multilingual_results.csv``)
* Instruction-tuning win rates per benchmark
  (``results_instruction_tuning.csv.zip``)
* Instruction-tuning win rates per language on ``m-arena-hard-EU``
  (same file)

Every table gets an ``Average ⬆️`` column (row-wise mean of its score
columns) and is sorted by it in descending order.
"""

import random
from pathlib import Path

import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns

# Data files are resolved relative to this file so that the app does not
# depend on the current working directory it is launched from.
abs_path = Path(__file__).parent

AVERAGE_COLUMN = "Average ⬆️"


def _render_leaderboard(
    df: pd.DataFrame, *, select_all: bool = True
) -> Leaderboard:
    """Create a ``Leaderboard`` for *df* with the settings shared by all tabs.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        df: Table to show; it must contain a ``"Model"`` column. Values are
            rounded to two decimals for display.
        select_all: When true, every column of *df* is passed explicitly as
            the default selection. When false, ``default_selection`` is not
            passed and the component's own default applies.

    Returns:
        The created ``Leaderboard`` component.
    """
    select_kwargs = {
        "cant_deselect": ["Model"],
        "label": "Select Columns to Display:",
    }
    if select_all:
        select_kwargs["default_selection"] = list(df.columns)
    return Leaderboard(
        value=df.round(2),
        select_columns=SelectColumns(**select_kwargs),
        search_columns=SearchColumns(
            primary_column="Model",
            label="Filter a model",
            secondary_columns=[],
        ),
    )


# --------------------------------------------------------------------------
# English core benchmarks
# --------------------------------------------------------------------------
df_core = pd.read_csv(abs_path / "opensci-ref-table.csv")
# The token count and the precomputed average are dropped; the average is
# recomputed below from the remaining benchmark columns.
df_core.drop(columns=["#Tokens", "AVG"], inplace=True)
# The first remaining column is treated as the model identifier.
benchmarks_core = df_core.columns[1:]
df_core[AVERAGE_COLUMN] = df_core.loc[:, benchmarks_core].mean(axis=1)
df_core.sort_values(by=AVERAGE_COLUMN, ascending=False, inplace=True)

# --------------------------------------------------------------------------
# Instruction tuning: one column per benchmark
# --------------------------------------------------------------------------
df_instruction_tuning = pd.read_csv(
    abs_path / "results_instruction_tuning.csv.zip"
)
# Rows whose model name contains "12b" are excluded. The explicit copy makes
# the filtered frame independent so the column assignment below is safe.
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning["model_B"].str.contains("12b", regex=False)
].copy()
# Keep only the last path component of the model name ("org/name" -> "name").
df_instruction_tuning["model_B"] = (
    df_instruction_tuning["model_B"].str.split("/").str[-1]
)

df_instruction_tuning_pivot = df_instruction_tuning.pivot_table(
    index="model_B", columns="benchmark", values="preference"
)
df_instruction_tuning_pivot.index.rename("Model", inplace=True)
df_instruction_tuning_pivot.reset_index(drop=False, inplace=True)
df_instruction_tuning_pivot.columns = [
    x.capitalize() for x in df_instruction_tuning_pivot.columns
]
# The first column is the model name; all others are benchmark scores.
df_instruction_tuning_pivot[AVERAGE_COLUMN] = df_instruction_tuning_pivot.loc[
    :, df_instruction_tuning_pivot.columns[1:]
].mean(axis=1)
df_instruction_tuning_pivot.sort_values(
    by=AVERAGE_COLUMN, ascending=False, inplace=True
)

# --------------------------------------------------------------------------
# Instruction tuning: m-arena-hard-EU, one column per language
# --------------------------------------------------------------------------
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning["benchmark"] == "m-arena-hard-EU"
].copy()
# The language code is the suffix after the last "-" of the instruction
# index. It is derived from the filtered rows only, so values belonging to
# other benchmarks are never parsed.
df_mah_pivot["lang"] = df_mah_pivot["instruction_index"].str.split("-").str[-1]
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)
df_mah_pivot[AVERAGE_COLUMN] = df_mah_pivot.mean(axis=1)
df_mah_pivot.sort_values(by=AVERAGE_COLUMN, ascending=False, inplace=True)
df_mah_pivot.index.rename("Model", inplace=True)
df_mah_pivot.reset_index(drop=False, inplace=True)

language_flags = {
    "cs": "🇨🇿",
    "de": "🇩🇪",
    "el": "🇬🇷",
    "en": "🇬🇧",
    "es": "🇪🇸",
    "fr": "🇫🇷",
    "it": "🇮🇹",
    "nl": "🇳🇱",
    "pl": "🇵🇱",
    "pt": "🇵🇹",
    "ro": "🇷🇴",
    "uk": "🇺🇦",
}
# Append the flag to known language codes; other columns stay unchanged.
df_mah_pivot.columns = [
    f"{x} {language_flags[x]}" if x in language_flags else x
    for x in df_mah_pivot.columns
]

# --------------------------------------------------------------------------
# Multilingual evaluations grouped by task family
# --------------------------------------------------------------------------
df_eval = pd.read_csv(abs_path / "multilingual_results.csv")

# Task names that map to a group by exact match.
_EXACT_TASK_GROUPS = {
    "xcopa": "XCOPA",
    "xstorycloze": "XStoryCloze",
    "xwinograd": "XWinograd",
}
# Task-name prefixes that map to a group, checked in order.
_PREFIX_TASK_GROUPS = (
    ("include_base_44_", "INCLUDE"),
    ("belebele_", "Belebele"),
    ("global_mmlu_full_", "Global MMLU"),
)


def map_task_to_group(task: str) -> str | None:
    """Return the display group a task belongs to, or ``None`` if ungrouped.

    Args:
        task: Task identifier as found in the ``task`` column.

    Returns:
        The group name for an exact or prefix match, otherwise ``None``.
    """
    exact = _EXACT_TASK_GROUPS.get(task)
    if exact is not None:
        return exact
    for prefix, group in _PREFIX_TASK_GROUPS:
        if task.startswith(prefix):
            return group
    return None


df_eval["group"] = df_eval["task"].apply(map_task_to_group)
# Tasks that do not belong to any known group are left out.
df_eval_grouped = df_eval[df_eval["group"].notna()].copy()
df_eval_grouped["Model"] = df_eval_grouped["model_name"].apply(
    lambda s: s.split("/")[-1]
)

df_multilingual_pivot = df_eval_grouped.pivot_table(
    index="Model", columns="group", values="performance", aggfunc="mean"
)
df_multilingual_pivot[AVERAGE_COLUMN] = df_multilingual_pivot.mean(axis=1)
df_multilingual_pivot.sort_values(
    by=AVERAGE_COLUMN, ascending=False, inplace=True
)
df_multilingual_pivot.index.rename("Model", inplace=True)
df_multilingual_pivot.reset_index(drop=False, inplace=True)

# Per group: the n_shot value when all rows agree on it, otherwise "mixed".
group_nshot = (
    df_eval_grouped.groupby("group")["n_shot"]
    .agg(lambda s: s.iloc[0] if s.nunique() == 1 else "mixed")
    .to_dict()
)


def display_name(group: str) -> str:
    """Return the column header for *group*, suffixed with its n-shot label.

    The label is the group's entry in ``group_nshot`` (a single n_shot value
    or ``"mixed"``), or ``"unknown"`` when the group has no entry.
    """
    label = group_nshot.get(group, "unknown")
    return f"{group} [{label}]"


# Renamed copy for display; the Model and Average columns keep their names.
display_columns_map = {
    col: display_name(col)
    for col in df_multilingual_pivot.columns
    if col not in ["Model", AVERAGE_COLUMN]
}
df_multilingual_display_all = df_multilingual_pivot.rename(
    columns=display_columns_map
)

# Column order for display: Model, the groups in this fixed order (only
# those actually present), then the average.
raw_group_columns = [
    col
    for col in [
        "INCLUDE",
        "Belebele",
        "Global MMLU",
        "XCOPA",
        "XStoryCloze",
        "XWinograd",
    ]
    if col in df_multilingual_pivot.columns
]
display_group_columns = [display_columns_map[col] for col in raw_group_columns]
ordered_columns = ["Model", *display_group_columns, AVERAGE_COLUMN]
df_multilingual_display = df_multilingual_display_all.loc[:, ordered_columns]

# NOTE(review): `cols` is not referenced anywhere in this file; it is kept
# only so the module-level name remains available.
cols = [
    "Llama-3.1-Tulu-3-8B-SFT",
    "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO",
    "Apertus-8B-Instruct-2509",
]

# --------------------------------------------------------------------------
# UI
# --------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
    # 🥇 OpenEuroLLM Leaderboard 🇪🇺
    """
    )
    with gr.Tabs():
        with gr.Tab("English Core 🏴󠁧󠁢󠁥󠁮󠁧󠁿🇺🇸"):
            _render_leaderboard(df_core)

        with gr.Tab("Multilingual evaluations 🌍"):
            gr.Markdown(
                """
            Aggregated multilingual performance by task group (mean across languages when applicable).
            """
            )
            _render_leaderboard(df_multilingual_display)

        with gr.Tab("Instruction-tuning 🎯󠁧󠁢󠁥🏴󠁧󠁢󠁥󠁮󠁧󠁿"):
            gr.Markdown(
                """
            Winrate against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
            """
            )
            # No explicit default selection is passed for this tab.
            _render_leaderboard(df_instruction_tuning_pivot, select_all=False)

        with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
            gr.Markdown(
                """
            Winrate on m-Arena-Hard instructions against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
            """
            )
            _render_leaderboard(df_mah_pivot)

if __name__ == "__main__":
    demo.launch()