# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
import json
import hashlib
from dataclasses import dataclass

import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from transformers import AutoConfig

from src.config import afrobench_path, afrobench_lite_path, lite_languages_path


# These classes hold the user-facing column names, so a rename only has to
# happen here instead of everywhere the columns are used.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False


def fields(raw_class):
    # Collect the ColumnContent attributes defined on the class,
    # skipping dunder entries.
    return [
        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
    ]
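

# Example usage (illustrative; `EvalColumns` and its columns are hypothetical):
#
#   class EvalColumns:
#       model = ColumnContent("Model", "markdown", True)
#       score = ColumnContent("Score", "number", True)
#
#   headers = [c.name for c in fields(EvalColumns)]  # -> ["Model", "Score"]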


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)


def is_model_on_hub(model_name: str, revision: str) -> tuple[bool, str | None]:
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
        return True, None
    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
        )
    except Exception as e:
        print(f"Could not get the model config from the hub: {e}")
        return False, "was not found on the hub!"
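

# Example (illustrative):
#   on_hub, err = is_model_on_hub("gpt2", "main")
#   # on_hub is True for public configs loadable without trust_remote_code;
#   # otherwise it is False and `err` holds the reason.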


def get_color(name):
    # Hash the name deterministically (the built-in hash() is salted per
    # interpreter run) and map it to one of matplotlib's 20 tab20 colors,
    # so each model keeps the same color across reloads.
    digest = int(hashlib.md5(name.encode("utf-8")).hexdigest(), 16)
    color = plt.cm.tab20(digest % 20)
    return f"rgb({int(color[0]*255)}, {int(color[1]*255)}, {int(color[2]*255)})"


# def plot_model_scores(df):
#     # Assume df already has: Model, Score, and the columns you filtered on
#     color_map = {
#         "LLaMa": "cornflowerblue",
#         "Aya": "lightcoral",
#         "Gemma": "mediumpurple",
#         "GPT": "seagreen",
#         "Gemini": "goldenrod",
#         "AfroLLaMa": "indianred",
#     }
#
#     def assign_color(model_name):
#         for key, color in color_map.items():
#             if key.lower() in model_name.lower():
#                 return color
#         return "gray"
#
#     df_sorted = df.copy()
#     df_sorted["Color"] = df_sorted["Model"].apply(assign_color)
#     df_sorted = df_sorted.sort_values("Score", ascending=False)
#
#     fig = go.Figure()
#     fig.add_trace(
#         go.Bar(
#             x=df_sorted["Score"],
#             y=df_sorted["Model"],
#             orientation='h',
#             marker_color=df_sorted["Color"],
#             hoverinfo="x+y",
#         )
#     )
#
#     fig.update_layout(
#         title="📊 Model Score Comparison",
#         xaxis_title="Average Score",
#         yaxis_title="Model",
#         height=600,
#         margin=dict(l=100, r=20, t=40, b=40),
#     )
#     return fig


def plot_model_scores(df):
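    """Scatter plot of average score against parameter-count bucket.

    Expects a DataFrame with "Model" and "Score" columns; the size bucket
    (e.g. "8B") is parsed out of the model name, with "UNK" as a fallback.
    """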
    df = df.copy()
    df["Color"] = df["Model"].apply(get_color)

    # Extract the model size as a string ("8B", "13B", or "UNK")
    def extract_size_str(model):
        parts = model.split()
        for part in parts:
            if part.endswith("B") and part[:-1].isdigit():
                return part
        return "UNK"

    # For plotting: numeric value of the size (used only to order the x-axis)
    def size_to_num(size_str):
        return int(size_str[:-1]) if size_str != "UNK" else 100

    df["Size"] = df["Model"].apply(extract_size_str)
    df["Size Num"] = df["Size"].apply(size_to_num)
    size_order = df.drop_duplicates("Size").sort_values("Size Num")["Size"].tolist()

    fig = go.Figure()
    for _, row in df.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row["Size"]],
                y=[row["Score"]],
                mode="markers",
                name=row["Model"],
                marker=dict(
                    size=14,
                    color=row["Color"],
                    line=dict(width=1, color="black"),
                ),
                hovertemplate=f"<b>{row['Model']}</b><br>Score: {row['Score']}<br>Size: {row['Size']}",
                showlegend=True,
            )
        )

    fig.update_layout(
        title="📊 Model Score vs Size",
        xaxis=dict(
            title="Model Size",
            type="category",
            categoryorder="array",
            categoryarray=size_order,
        ),
        yaxis_title="Average Score",
        height=600,
        margin=dict(l=60, r=60, t=40, b=40),
        legend=dict(title="Model", orientation="v", x=1.05, y=1),
    )
    return fig


def plot_leaderboard_scores(view_type, selected_cols, source):
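    """Aggregate leaderboard scores and return a plotly figure.

    `view_type` selects the aggregation level (category/task/dataset for
    AfroBench; task/dataset/language for AfroBench-Lite), and a non-empty
    `selected_cols` restricts which metric columns feed the average.
    """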
    # Load leaderboard data
    if source == "afrobench_lite":
        df = create_result_dataframes_lite(afrobench_lite_path, level=view_type)
    else:
        df = create_result_dataframes(afrobench_path, level=view_type)

    df.reset_index(inplace=True)
    df.rename(columns={"index": "Model"}, inplace=True)

    metric_cols = [c for c in df.columns if c != "Model"]
    if selected_cols:
        metric_cols = [c for c in selected_cols if c in metric_cols]

    df["Score"] = df[metric_cols].mean(axis=1).round(1)
    df_sorted = df.sort_values("Score", ascending=False)
    return plot_model_scores(df_sorted)


def average_nested_scores(score_dict):
    return {
        model: {k: round(sum(v) / len(v), 1) for k, v in group.items()}
        for model, group in score_dict.items()
    }
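

# Example (illustrative):
#   average_nested_scores({"gpt-4": {"qa": [70.0, 80.0]}})
#   # -> {"gpt-4": {"qa": 75.0}}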


def create_result_dataframes(json_file, level="category"):
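    """Build per-model score tables from the AfroBench results JSON.

    Expected file shape (inferred from the parsing loops below):
        {category: {task: {"datasets": {dataset: {model: score}}}}}

    Returns a DataFrame indexed by model name, with one column per
    category, task, or dataset depending on `level`.
    """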
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    task_scores = {}
    dataset_scores = {}
    category_scores = {}

    for category, subtasks in data.items():
        for task, content in subtasks.items():
            for dataset, scores in content["datasets"].items():
                for model, score in scores.items():
                    # Task-level
                    task_scores.setdefault(model, {}).setdefault(task, []).append(score)
                    # Dataset-level
                    dataset_scores.setdefault(model, {})[dataset] = score
                    # Category-level
                    category_scores.setdefault(model, {}).setdefault(category, []).append(score)

    task_df = pd.DataFrame(average_nested_scores(task_scores)).T.sort_index()
    dataset_df = pd.DataFrame(dataset_scores).T.sort_index()
    category_df = pd.DataFrame(average_nested_scores(category_scores)).T.sort_index()

    level_map = {
        "task": task_df,
        "dataset": dataset_df,
        "category": category_df,
    }
    if level not in level_map:
        raise ValueError("Invalid level. Choose from: ['category', 'task', 'dataset']")
    return level_map[level]


def create_result_dataframes_lite(json_file, level="task"):
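    """Build per-model score tables from the AfroBench-Lite results JSON.

    Expected file shape (inferred from the parsing loops below):
        {task: {dataset: {model: score}}}

    `level="language"` instead loads the per-language table from
    `lite_languages_path`.
    """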
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Task-level: average across the datasets in each task group
    task_scores = {}
    dataset_scores = {}
    for task, datasets in data.items():
        for dataset, scores in datasets.items():
            for model, score in scores.items():
                dataset_scores.setdefault(model, {})[dataset] = score
                task_scores.setdefault(model, {}).setdefault(task, []).append(score)

    task_level_df = pd.DataFrame({
        model: {task: round(sum(scores) / len(scores), 1) for task, scores in task_dict.items()}
        for model, task_dict in task_scores.items()
    }).T.sort_index()
    dataset_level_df = pd.DataFrame(dataset_scores).T.sort_index()

    level_map = {
        "task": task_level_df,
        "dataset": dataset_level_df,
    }
    if level == "language":
        with open(lite_languages_path, "r", encoding="utf-8") as f:
            language_data = json.load(f)
        level_map["language"] = pd.DataFrame(language_data).T.sort_index()

    if level not in level_map:
        raise ValueError("Invalid level. Choose from: ['task', 'dataset', 'language']")
    return level_map[level]
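

# Illustrative smoke test (assumes the JSON files referenced in src.config
# exist and match the shapes documented in the docstrings above).
if __name__ == "__main__":
    category_df = create_result_dataframes(afrobench_path, level="category")
    print(category_df.head())
    plot_leaderboard_scores("category", None, "afrobench").show()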