Spaces:
Running
Running
| import random | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| from gradio_leaderboard import Leaderboard, SelectColumns, SearchColumns | |
# Directory containing this script.
abs_path = Path(__file__).parent

# --- English core benchmarks -------------------------------------------------
# Load the reference table and drop the "#Tokens" and "AVG" columns; a row-wise
# mean over the remaining benchmark columns is computed below.
df_core = pd.read_csv("opensci-ref-table.csv").drop(columns=["#Tokens", "AVG"])

# Every column after the first one is treated as a benchmark score.
benchmarks_core = df_core.columns[1:]
df_core["Average โฌ๏ธ"] = df_core[benchmarks_core].mean(axis=1)

# Highest row-wise mean first.
df_core = df_core.sort_values(by="Average โฌ๏ธ", ascending=False)
# --- Instruction-tuning win rates --------------------------------------------
df_instruction_tuning = pd.read_csv("results_instruction_tuning.csv.zip")

# Exclude rows whose model_B name contains "12b". The explicit .copy() makes the
# filtered frame independent, so the column assignment below is not a chained
# assignment on a slice (the pattern behind pandas' SettingWithCopyWarning).
df_instruction_tuning = df_instruction_tuning[
    ~df_instruction_tuning["model_B"].str.contains("12b")
].copy()

# Keep only the text after the last "/" of the model identifier.
df_instruction_tuning["model_B"] = df_instruction_tuning["model_B"].apply(
    lambda s: s.split("/")[-1]
)

# One row per model, one column per benchmark. pivot_table's default
# aggregation (mean) combines all preference rows that fall in the same cell.
df_instruction_tuning_pivot = df_instruction_tuning.pivot_table(
    index="model_B", columns="benchmark", values="preference"
)
df_instruction_tuning_pivot.index.rename("Model", inplace=True)
df_instruction_tuning_pivot.reset_index(drop=False, inplace=True)
df_instruction_tuning_pivot.columns = [
    x.capitalize() for x in df_instruction_tuning_pivot.columns
]

# The first column is the model name; average over every remaining column.
df_instruction_tuning_pivot["Average โฌ๏ธ"] = df_instruction_tuning_pivot.loc[
    :, df_instruction_tuning_pivot.columns[1:]
].mean(axis=1)
df_instruction_tuning_pivot.sort_values(by="Average โฌ๏ธ", ascending=False, inplace=True)
# --- m-arena-hard-EU win rates per language -----------------------------------
df_mah_pivot = df_instruction_tuning[
    df_instruction_tuning.benchmark == "m-arena-hard-EU"
].copy()

# "lang" is the text after the last "-" of instruction_index. It is derived
# from the filtered rows only, so rows of other benchmarks are never parsed and
# the result does not depend on index alignment with the unfiltered frame.
df_mah_pivot["lang"] = df_mah_pivot["instruction_index"].apply(
    lambda s: s.split("-")[-1]
)

# One row per model, one column per "lang" value (pivot_table averages the
# preference rows that fall in the same cell by default).
df_mah_pivot = df_mah_pivot.pivot_table(
    index="model_B", columns="lang", values="preference"
)
df_mah_pivot["Average โฌ๏ธ"] = df_mah_pivot.mean(axis=1)
df_mah_pivot.sort_values(by="Average โฌ๏ธ", ascending=False, inplace=True)
df_mah_pivot.index.rename("Model", inplace=True)
df_mah_pivot.reset_index(drop=False, inplace=True)

# --- Multilingual evaluation results ------------------------------------------
df_eval = pd.read_csv("multilingual_results.csv")
| def map_task_to_group(task: str) -> str | None: | |
| if task == "xcopa": | |
| return "XCOPA" | |
| if task == "xstorycloze": | |
| return "XStoryCloze" | |
| if task == "xwinograd": | |
| return "XWinograd" | |
| if task.startswith("include_base_44_"): | |
| return "INCLUDE" | |
| if task.startswith("belebele_"): | |
| return "Belebele" | |
| if task.startswith("global_mmlu_full_"): | |
| return "Global MMLU" | |
| return None | |
# Tag every evaluation row with its task group; rows whose task maps to no
# group (None) are left out of the grouped frame.
df_eval["group"] = df_eval["task"].apply(map_task_to_group)
df_eval_grouped = df_eval.loc[df_eval["group"].notna()].copy()

# Short model name: the text after the last "/" of model_name.
df_eval_grouped["Model"] = df_eval_grouped["model_name"].apply(
    lambda full_name: full_name.rsplit("/", 1)[-1]
)

# One row per model, one column per task group, mean performance per cell.
df_multilingual_pivot = pd.pivot_table(
    df_eval_grouped,
    index="Model",
    columns="group",
    values="performance",
    aggfunc="mean",
)
df_multilingual_pivot["Average โฌ๏ธ"] = df_multilingual_pivot.mean(axis=1)
df_multilingual_pivot = (
    df_multilingual_pivot.sort_values(by="Average โฌ๏ธ", ascending=False)
    .rename_axis(index="Model")
    .reset_index()
)


def _shared_n_shot(n_shots):
    """Return the group's n_shot when it has exactly one distinct value, else "mixed"."""
    return n_shots.iloc[0] if n_shots.nunique() == 1 else "mixed"


# Per-group n_shot label, used to build the display names of the group columns.
group_nshot = df_eval_grouped.groupby("group")["n_shot"].agg(_shared_n_shot).to_dict()
def display_name(group: str) -> str:
    """Return the column label for a task group, suffixed with its n_shot tag.

    The tag is looked up in the module-level ``group_nshot`` mapping; groups
    missing from that mapping are tagged "unknown". Whatever the tag is (a
    number, "mixed" or "unknown"), the label always has the same shape, so no
    per-tag branching is needed.

    Args:
        group: Task-group name.

    Returns:
        ``"<group> [<tag>]"``.
    """
    label = group_nshot.get(group, "unknown")
    return f"{group} [{label}]"
# Display labels for the group columns of the multilingual pivot. The "Model"
# and average columns are excluded from the map and therefore keep their names.
_group_columns = [
    column
    for column in df_multilingual_pivot.columns
    if column not in ("Model", "Average โฌ๏ธ")
]
display_columns_map = dict(zip(_group_columns, map(display_name, _group_columns)))
df_multilingual_display_all = df_multilingual_pivot.rename(columns=display_columns_map)

# List of model names (an entry for "Llama-3.1-8B" was commented out in the
# original). NOTE(review): not referenced in the visible part of this file —
# confirm whether it is still needed.
cols = [
    "Llama-3.1-Tulu-3-8B-SFT",
    "Llama-3.2-3B-Instruct",
    "Llama-3.1-Tulu-3-8B-DPO",
    "Apertus-8B-Instruct-2509",
]
def _model_leaderboard(frame, *, select_all_by_default=True):
    """Create a Leaderboard component for ``frame`` with the shared configuration.

    All tabs use the same setup: values rounded to 2 decimals, a column
    selector in which "Model" cannot be deselected, and a search box on the
    "Model" column.

    Args:
        frame: DataFrame to display; it must contain a "Model" column.
        select_all_by_default: When true, every column of ``frame`` is passed
            as ``default_selection``. When false, ``default_selection`` is not
            passed at all, so the component's own default applies.

    Returns:
        The created ``Leaderboard``. It must be called inside the Blocks/Tab
        context it should be rendered in.
    """
    select_kwargs = {
        "cant_deselect": ["Model"],
        "label": "Select Columns to Display:",
    }
    if select_all_by_default:
        select_kwargs["default_selection"] = list(frame.columns)
    return Leaderboard(
        value=frame.round(2),
        select_columns=SelectColumns(**select_kwargs),
        search_columns=SearchColumns(
            primary_column="Model",
            label="Filter a model",
            secondary_columns=[],
        ),
    )


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # ๐ฅ OpenEuroLLM Leaderboard ๐ช๐บ
        """
    )
    with gr.Tabs():
        with gr.Tab("English Core ๐ด๓ ง๓ ข๓ ฅ๓ ฎ๓ ง๓ ฟ๐บ๐ธ"):
            _model_leaderboard(df_core)

        with gr.Tab("Multilingual evaluations ๐"):
            gr.Markdown(
                """
                Aggregated multilingual performance by task group (mean across languages when applicable).
                """
            )
            # Order columns: Model, groups..., Average. Groups that are absent
            # from the pivot are skipped.
            raw_group_columns = [
                col
                for col in [
                    "INCLUDE",
                    "Belebele",
                    "Global MMLU",
                    "XCOPA",
                    "XStoryCloze",
                    "XWinograd",
                ]
                if col in df_multilingual_pivot.columns
            ]
            display_group_columns = [
                display_columns_map[col] for col in raw_group_columns
            ]
            ordered_columns = ["Model", *display_group_columns, "Average โฌ๏ธ"]
            df_multilingual_display = df_multilingual_display_all.loc[
                :, ordered_columns
            ]
            _model_leaderboard(df_multilingual_display)

        with gr.Tab("Instruction-tuning ๐ฏ๓ ง๓ ข๓ ฅ๐ด๓ ง๓ ข๓ ฅ๓ ฎ๓ ง๓ ฟ"):
            gr.Markdown(
                """
                Winrate against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
                """
            )
            # This tab does not pass default_selection, unlike the other tabs.
            _model_leaderboard(
                df_instruction_tuning_pivot, select_all_by_default=False
            )

        with gr.Tab("Instruction-tuning multi-lingual ๐ฏ๐ช๐บ"):
            gr.Markdown(
                """
                Winrate on m-Arena-Hard instructions against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
                """
            )
            language_flags = {
                "cs": "๐จ๐ฟ",
                "de": "๐ฉ๐ช",
                "el": "๐ฌ๐ท",
                "en": "๐ฌ๐ง",
                "es": "๐ช๐ธ",
                "fr": "๐ซ๐ท",
                "it": "๐ฎ๐น",
                "nl": "๐ณ๐ฑ",
                "pl": "๐ต๐ฑ",
                "pt": "๐ต๐น",
                "ro": "๐ท๐ด",
                "uk": "๐บ๐ฆ",
            }
            # Append the flag to each language-code column; columns without an
            # entry in language_flags keep their names.
            df_mah_pivot.columns = [
                f"{x} {language_flags[x]}" if x in language_flags else x
                for x in df_mah_pivot.columns
            ]
            _model_leaderboard(df_mah_pivot)

if __name__ == "__main__":
    demo.launch()