Spaces:
Running
Running
| import json | |
| import os | |
| from dataclasses import dataclass, field | |
| import pandas as pd | |
| from huggingface_hub import model_info, ModelInfo | |
| from transformers import AutoConfig | |
# Size (in billions of parameters) substituted for models whose real size is
# unknown when the "model_size_including_unknown" column is filled.
UNKNOWN_MODEL_SHOW_SIZE = 150
# Number of decimal places kept when U-MATH rate columns are shown as percentages.
PERCENT_ROUND_DIGITS = 1
# Memo of Hub config lookups keyed by model name; failed lookups are stored as None.
MODEL_CONFIG_CACHE = {}
# Memo of Hub model-info lookups keyed by model name; failed lookups are stored as None.
MODEL_CARD_CACHE = {}
def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
    """Return the Hub ``ModelInfo`` for ``model_name``, or ``None`` if the lookup fails.

    The outcome of the first lookup (including a failure, stored as ``None``)
    is memoized in ``MODEL_CARD_CACHE`` and reused on later calls.

    NOTE(review): this module defines a function with the same name again
    further down; the later definition is the one bound at import time.
    """
    if model_name not in MODEL_CARD_CACHE:
        try:
            card = model_info(repo_id=model_name)
        except Exception:
            # Best-effort lookup: any failure is recorded as "no card".
            card = None
        MODEL_CARD_CACHE[model_name] = card
    return MODEL_CARD_CACHE[model_name]
def get_hf_hub_config_or_none(model_name: str) -> AutoConfig | None:
    """Return the config loaded for ``model_name`` from the Hub, or ``None`` on failure.

    The config is requested from the ``main`` revision. The outcome of the
    first lookup (including a failure, stored as ``None``) is memoized in
    ``MODEL_CONFIG_CACHE`` and reused on later calls.
    """
    if model_name not in MODEL_CONFIG_CACHE:
        try:
            # NOTE(review): trust_remote_code=True allows configuration code
            # shipped in the model repo to run locally - confirm that model
            # names only ever come from a trusted list.
            loaded = AutoConfig.from_pretrained(model_name, revision="main", trust_remote_code=True)
        except Exception:
            # Best-effort lookup: any failure is recorded as "no config".
            loaded = None
        MODEL_CONFIG_CACHE[model_name] = loaded
    return MODEL_CONFIG_CACHE[model_name]
def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
    """Return the Hub ``ModelInfo`` for ``model_name``, or ``None`` if the lookup fails.

    This definition replaces an earlier function of the same name in this
    module (the later ``def`` wins at import time), so it has to consult
    ``MODEL_CARD_CACHE`` itself; otherwise the cache is never used and every
    call performs a fresh Hub request.

    Args:
        model_name: Hub repository id to look up.

    Returns:
        The ``ModelInfo`` object, or ``None`` when the Hub call raised. Both
        outcomes are memoized per model name.
    """
    if model_name in MODEL_CARD_CACHE:
        return MODEL_CARD_CACHE[model_name]
    try:
        info = model_info(repo_id=model_name)
    except Exception:
        # Best-effort lookup: a failure is remembered so it is not retried.
        info = None
    MODEL_CARD_CACHE[model_name] = info
    return info
| def model_size_to_symbol(model_size_in_b_params: int | None) -> str: | |
| """Converts model size to a symbol""" | |
| if model_size_in_b_params is None or model_size_in_b_params == 0 or not model_size_in_b_params: | |
| return "β" | |
| if model_size_in_b_params < 5: | |
| return "π΄" | |
| elif model_size_in_b_params < 50: | |
| return "π" | |
| elif model_size_in_b_params < 100: | |
| return "π" | |
| elif model_size_in_b_params < 1000: | |
| return "π" | |
| else: | |
| return "β" | |
| def model_type_to_symbol(model_type: str) -> str: | |
| """Converts model type to a symbol""" | |
| if model_type == "Open-Weights": | |
| return "π" | |
| elif model_type == "Proprietary": | |
| return "π₯" | |
| else: | |
| return "β" | |
def get_hf_data_by_model_name(model_name: str) -> dict:
    """Collect display metadata for a model, using the Hugging Face Hub when possible.

    The model family is guessed from substrings of the name. Config and model
    info are looked up under ``model_name``; for names of the form
    ``org/name`` anything still missing is retried under ``unsloth/<name>``.

    Args:
        model_name: Full model name, e.g. ``"org/model"`` or an API model name.

    Returns:
        A dict with the keys ``model_architecture``, ``model_type``,
        ``model_size`` (billions of parameters or ``None``), ``model_url``
        (``None`` when nothing was found on the Hub), ``model_license`` and
        ``model_family``.
    """
    # First matching entry wins, so the order of this table matters.
    family_keywords = (
        ("Mistral", ("mistral", "numina")),
        ("LLaMA", ("meta-llama",)),
        ("Claude", ("claude",)),
        ("Qwen", ("qwen", "athene", "qwq", "qvq")),
        ("GPT", ("gpt", "o1")),
        ("Gemini", ("gemini",)),
        ("DeepSeek", ("deepseek",)),
    )
    lowered_name = model_name.lower()
    model_family = next(
        (
            family
            for family, keywords in family_keywords
            if any(keyword in lowered_name for keyword in keywords)
        ),
        "Unknown",
    )
    print(model_name, model_family)

    model_config = get_hf_hub_config_or_none(model_name)
    model_info_card = get_hf_model_info_card_or_none(model_name)
    print('model_config', type(model_config))
    print('model_info_card', type(model_info_card))

    # Repo the model URL should point to: the requested repo if it resolved at
    # all, otherwise the fallback repo (set below) if that one resolved.
    hub_repo = model_name if (model_config or model_info_card) else None

    # Only fall back when something is actually missing; previously every name
    # containing "/" was rewritten, which made the URL point at the mirror
    # even for models that were found under their own name.
    if '/' in model_name and not (model_config and model_info_card):
        print(f"Model {model_name} is not on the hub, try unsloth/...")
        fallback_repo = "unsloth/" + model_name.split("/")[-1]
        if not model_config:
            model_config = get_hf_hub_config_or_none(fallback_repo)
        if not model_info_card:
            model_info_card = get_hf_model_info_card_or_none(fallback_repo)
        if hub_repo is None and (model_config or model_info_card):
            hub_repo = fallback_repo

    architecture = "Unknown"
    if model_config:
        architectures = getattr(model_config, "architectures", None)
        if architectures:
            architecture = ";".join(architectures)

    num_params = None
    if model_info_card:
        try:
            num_params = round(model_info_card.safetensors["total"] / 1e9, 1)
        except Exception as e:
            print("SafeTensors not found in", model_name, e)
            # Hard-coded sizes for models without safetensors metadata.
            if 'Pixtral-12B' in model_name:
                num_params = 12
            elif 'Pixtral-Large-Instruct-2411' in model_name:
                num_params = 123.3
    print("num_params", model_name, num_params)

    model_url = f"https://huggingface.co/{hub_repo}" if hub_repo else None

    model_license = "Unknown"
    # card_data may be absent; .get() also avoids a KeyError when the license
    # fields are not present on the card.
    card_data = getattr(model_info_card, "card_data", None) if model_info_card else None
    if card_data is not None:
        license_name = card_data.get("license_name")
        license_link = card_data.get("license_link")
        # Build a markdown link only when there is a name to show, so a link
        # without a name no longer renders as "[None](...)".
        if license_name:
            model_license = f"[{license_name}]({license_link})" if license_link else license_name

    return {
        "model_architecture": architecture,
        "model_type": "Open-Weights" if model_info_card else "Proprietary",
        "model_size": num_params if num_params else None,
        "model_url": model_url,
        "model_license": model_license,
        "model_family": model_family,
    }
@dataclass
class Field:
    """Description of one leaderboard column.

    The class must be a dataclass: the column dictionaries in this module
    construct it as ``Field("Pretty Name", "type", flag=...)`` and ``tags``
    relies on ``field(default_factory=list)``.
    """

    # Header text shown for the column.
    pretty_name: str
    # One of: rate (auto-convert to percent number), number, str, markdown.
    column_type: str
    displayed_by_default: bool = True
    never_hidden: bool = False
    fully_hidden: bool = False
    # Free-form labels attached to the column; a fresh list per instance.
    tags: list[str] = field(default_factory=list)

    def gradio_column_type(self) -> str:
        """Return the column type with "rate" mapped to "number"."""
        if self.column_type == "rate":
            return "number"
        return self.column_type
# Columns describing the model itself; shared by every leaderboard table below.
# Each spec row is (key, pretty name, column type, extra Field options).
MODEL_COLUMNS_DICT = {
    key: Field(pretty_name, column_type, **options)
    for key, pretty_name, column_type, options in (
        ("model_type_symbol", "T", "str", {"never_hidden": True}),
        ("model_size_symbol", "S", "str", {"never_hidden": True}),
        ("full_model_name", "Full Model Name", "markdown", {"fully_hidden": True, "displayed_by_default": False}),
        ("model_name", "Model Name", "markdown", {"never_hidden": True}),
        ("model_type", "Type", "str", {"displayed_by_default": False}),
        ("model_size", "#Params (B)", "number", {"displayed_by_default": False}),
        (
            "model_size_including_unknown",
            "#Params inc. Proprietary (B)",
            "number",
            {"fully_hidden": True, "displayed_by_default": False},
        ),
        ("model_architecture", "Architecture", "str", {"displayed_by_default": False}),
        ("model_license", "License", "markdown", {"displayed_by_default": False}),
        ("model_family", "Family", "str", {"displayed_by_default": False}),
        ("model_url", "Model URL", "str", {"fully_hidden": True, "displayed_by_default": False}),
    )
}
# Columns of the U-MATH leaderboard, in display order. The headline columns are
# spelled out; the per-subject columns follow one pattern (overall / text /
# visual accuracy, all hidden by default) and are generated from a table.
U_MATH_COLUMNS_DICT = {
    "rank": Field("Rank", "number", never_hidden=True),
    **MODEL_COLUMNS_DICT,
    "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
    "u_math_acc": Field("U-MATH Acc", "rate", never_hidden=True, tags=["default"]),
    "u_math_text_acc": Field("U-MATH Text Acc", "rate", tags=["default", "text"]),
    "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", tags=["default", "visual"]),
    "u_math_text_hard_acc": Field("U-MATH TextHard Acc", "rate", tags=["default", "text"]),
    **{
        f"{subject}{key_suffix}": Field(
            f"{label}{name_suffix}", "rate", displayed_by_default=False, tags=[tag]
        )
        for subject, label in (
            ("differential_calc", "Diff Calc"),
            ("integral_calc", "Integral Calc"),
            ("algebra", "Algebra"),
            ("multivariable_calculus", "Multivar Calc"),
            ("precalculus_review", "Precalc"),
            ("sequences_series", "Seq & Series"),
        )
        for key_suffix, name_suffix, tag in (
            ("_acc", " Acc", "subjects"),
            ("_text_acc", " Text Acc", "text"),
            ("_visual_acc", " Visual Acc", "visual"),
        )
    },
}
# Columns of the mu-MATH leaderboard, in display order: rank, the shared model
# columns, the overall metrics, then the same five metrics for each subset.
# The pretty names use the Greek letter mu; it had been stored as the
# mis-decoded two-character sequence that results from reading its UTF-8 bytes
# in a Greek single-byte encoding.
MU_MATH_COLUMNS_DICT = {
    "rank": Field("Rank", "number", never_hidden=True),
    **MODEL_COLUMNS_DICT,
    "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
    "mu_math_f1": Field("μ-MATH F1", "rate", never_hidden=True, tags=["default", "splits"]),
    "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["default"]),
    "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["default"]),
    "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["default"]),
    "mu_math_npv": Field("μ-MATH NPV", "rate", displayed_by_default=False, tags=["default"]),
    "GPT-4o_f1": Field("GPT-4o Subset F1", "rate", tags=["splits"]),
    "GPT-4o_tpr": Field("GPT-4o Subset TPR", "rate", displayed_by_default=False),
    "GPT-4o_tnr": Field("GPT-4o Subset TNR", "rate", displayed_by_default=False),
    "GPT-4o_ppv": Field("GPT-4o Subset PPV", "rate", displayed_by_default=False),
    "GPT-4o_npv": Field("GPT-4o Subset NPV", "rate", displayed_by_default=False),
    "Gemini-1.5-Pro_f1": Field("Gemini-1.5-Pro Subset F1", "rate", tags=["splits"]),
    "Gemini-1.5-Pro_tpr": Field("Gemini-1.5-Pro Subset TPR", "rate", displayed_by_default=False),
    "Gemini-1.5-Pro_tnr": Field("Gemini-1.5-Pro Subset TNR", "rate", displayed_by_default=False),
    "Gemini-1.5-Pro_ppv": Field("Gemini-1.5-Pro Subset PPV", "rate", displayed_by_default=False),
    "Gemini-1.5-Pro_npv": Field("Gemini-1.5-Pro Subset NPV", "rate", displayed_by_default=False),
    "Llama-3.1-70B-Instruct_f1": Field("Llama-3.1-70B Subset F1", "rate", tags=["splits"]),
    "Llama-3.1-70B-Instruct_tpr": Field("Llama-3.1-70B Subset TPR", "rate", displayed_by_default=False),
    "Llama-3.1-70B-Instruct_tnr": Field("Llama-3.1-70B Subset TNR", "rate", displayed_by_default=False),
    "Llama-3.1-70B-Instruct_ppv": Field("Llama-3.1-70B Subset PPV", "rate", displayed_by_default=False),
    "Llama-3.1-70B-Instruct_npv": Field("Llama-3.1-70B Subset NPV", "rate", displayed_by_default=False),
    "Qwen2.5-72B-Instruct_f1": Field("Qwen2.5-72B Subset F1", "rate", tags=["splits"]),
    "Qwen2.5-72B-Instruct_tpr": Field("Qwen2.5-72B Subset TPR", "rate", displayed_by_default=False),
    "Qwen2.5-72B-Instruct_tnr": Field("Qwen2.5-72B Subset TNR", "rate", displayed_by_default=False),
    "Qwen2.5-72B-Instruct_ppv": Field("Qwen2.5-72B Subset PPV", "rate", displayed_by_default=False),
    "Qwen2.5-72B-Instruct_npv": Field("Qwen2.5-72B Subset NPV", "rate", displayed_by_default=False),
}
# Columns of the joined U-MATH + mu-MATH table, in display order. The pretty
# names use the Greek letter mu; it had been stored as the mis-decoded
# two-character sequence that results from reading its UTF-8 bytes in a Greek
# single-byte encoding.
U_MATH_AND_MU_MATH_COLUMNS_DICT = {
    "u_math_rank": Field("U-MATH Rank", "number", never_hidden=True),
    "mu_math_rank": Field("μ-MATH Rank", "number", never_hidden=True),
    **MODEL_COLUMNS_DICT,
    "u_math_acc": Field("U-MATH Acc", "rate", tags=["default", "u_math", "mu_math"]),
    "u_math_text_acc": Field("U-MATH Text Acc", "rate", displayed_by_default=False, tags=["u_math"]),
    "u_math_visual_acc": Field("U-MATH Visual Acc", "rate", displayed_by_default=False, tags=["u_math"]),
    "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
    "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
    "mu_math_f1": Field("μ-MATH F1", "rate", tags=["default", "u_math", "mu_math"]),
    "mu_math_tpr": Field("μ-MATH TPR", "rate", displayed_by_default=False, tags=["mu_math"]),
    "mu_math_tnr": Field("μ-MATH TNR", "rate", displayed_by_default=False, tags=["mu_math"]),
    "mu_math_ppv": Field("μ-MATH PPV", "rate", displayed_by_default=False, tags=["mu_math"]),
    "mu_math_npv": Field("μ-MATH NPV", "rate", displayed_by_default=False, tags=["mu_math"]),
}
| def load_json_data(json_path: str, main_col: str | None = None) -> pd.DataFrame: | |
| """Loads json data from a file""" | |
| with open(json_path, "r") as f: | |
| data = json.load(f) | |
| df = pd.DataFrame.from_records(data) | |
| if main_col: | |
| df = df.sort_values(by=[main_col], ascending=False) | |
| for col in df.columns: | |
| if df.dtypes[col] == "float64": | |
| df[col] = df[col].round(decimals=2) | |
| return df | |
def get_model_meta_info_df(model_full_names: list[str]) -> pd.DataFrame:
    """Build a DataFrame of model metadata, one row per given model name.

    Each name is resolved with ``get_hf_data_by_model_name``. On top of the
    returned fields the frame gets the type and size symbols, a size column
    in which unknown sizes are replaced by ``UNKNOWN_MODEL_SHOW_SIZE``, the
    ``full_model_name`` column and a ``model_name`` column that is a markdown
    link when a model URL is known.
    """
    meta_by_model = {}
    for full_name in model_full_names:
        meta_by_model[full_name] = get_hf_data_by_model_name(full_name)

    def _size_or_placeholder(size):
        # Missing sizes (None, 0 or NaN) are shown with a placeholder value.
        if size and pd.notna(size):
            return size
        return UNKNOWN_MODEL_SHOW_SIZE

    def _name_as_link(full_name):
        url = meta_by_model[full_name]["model_url"]
        if url:
            return f"[{full_name}]({url})"
        return full_name

    df = pd.DataFrame.from_dict(meta_by_model, orient="index")
    df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
    df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
    shown_sizes = df["model_size"].apply(_size_or_placeholder)
    df["model_size_including_unknown"] = shown_sizes.astype(float)
    # The index holds the full model names; turn it into a regular column.
    df["full_model_name"] = df.index
    df = df.reset_index(drop=True)
    df["model_name"] = df["full_model_name"].apply(_name_as_link)
    return df
def get_u_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
    """Create the U-MATH leaderboard DataFrame from ``data/u_math_eval_results.json``.

    Args:
        use_pretty_names: Keep only the columns listed in
            ``U_MATH_COLUMNS_DICT`` (in that order) and rename them to their
            pretty names.
        add_meta: Merge in model metadata from ``get_model_meta_info_df``.

    Returns:
        One row per evaluated model, ranked by ``u_math_acc`` (descending),
        with rate columns expressed as percentages.

    Raises:
        ValueError: If a rate column holds values outside [0, 1] and [0, 100].
    """
    json_path = os.path.join("data", "u_math_eval_results.json")
    df = load_json_data(json_path)
    df = df.rename(columns={"model_name": "full_model_name"})

    # Each of these columns holds a list [overall, text, visual]; split it
    # into three scalar columns and drop the list column.
    for col in [
        "u_math",
        "differential_calc",
        "integral_calc",
        "algebra",
        "multivariable_calculus",
        "precalculus_review",
        "sequences_series",
    ]:
        packed = df.pop(col)
        df[col + "_acc"] = packed.apply(lambda x: x[0])
        df[col + "_text_acc"] = packed.apply(lambda x: x[1])
        df[col + "_visual_acc"] = packed.apply(lambda x: x[2])

    # "Hard" text accuracy is the plain mean of four subject text accuracies.
    df["u_math_text_hard_acc"] = (
        df["differential_calc_text_acc"]
        + df["integral_calc_text_acc"]
        + df["multivariable_calculus_text_acc"]
        + df["sequences_series_text_acc"]
    ) / 4

    # Sort and add rank
    df = df.sort_values(by=["u_math_acc"], ascending=False)
    df["rank"] = range(1, len(df) + 1)

    # populate with model info
    if add_meta:
        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")

    # resolve rate columns to percent
    for col in df.columns:
        # Columns not described in the dict are left untouched instead of
        # raising KeyError.
        column_spec = U_MATH_COLUMNS_DICT.get(col)
        if column_spec is None or column_spec.column_type != "rate":
            continue
        # Missing values must not decide the scale (NaN compares False).
        present = df[col].dropna()
        if (present <= 1).all():
            df[col] = (df[col] * 100).round(PERCENT_ROUND_DIGITS)
        elif (present <= 100).all():
            df[col] = df[col].round(PERCENT_ROUND_DIGITS)
        else:
            raise ValueError(f"Column {col} has values {df[col]} that are not in [0, 1] or [0, 100]")

    # convert to pretty names and sort columns by order in dict
    if use_pretty_names:
        # Select only the columns that exist (the meta columns are absent
        # when add_meta is False), matching the tolerance of the rename.
        ordered_keys = [key for key in U_MATH_COLUMNS_DICT if key in df.columns]
        df = df[ordered_keys]
        df = df.rename(columns={key: U_MATH_COLUMNS_DICT[key].pretty_name for key in ordered_keys})
    return df
def get_mu_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
    """Create the mu-MATH leaderboard DataFrame from ``data/mu_math_eval_results.json``.

    Args:
        use_pretty_names: Keep only the columns listed in
            ``MU_MATH_COLUMNS_DICT`` (in that order) and rename them to their
            pretty names.
        add_meta: Merge in model metadata from ``get_model_meta_info_df``.

    Returns:
        One row per evaluated model, ranked by ``mu_math_f1`` (descending),
        with rate columns expressed as percentages.

    Raises:
        ValueError: If a rate column holds values outside [0, 1] and [0, 100].
    """
    json_path = os.path.join("data", "mu_math_eval_results.json")
    df = load_json_data(json_path)
    df = df.rename(columns={"model_name": "full_model_name"})

    # Each of these columns holds a list indexed as
    # [acc, f1, mcc, tpr, tnr, ppv, npv]; only the metrics below are kept.
    metric_positions = {"f1": 1, "tpr": 3, "tnr": 4, "ppv": 5, "npv": 6}
    for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
        packed = df.pop(col)
        for metric, position in metric_positions.items():
            # Bind the position as a default so each lambda keeps its own index.
            df[f"{col}_{metric}"] = packed.apply(lambda x, i=position: x[i])

    # Sort and add rank
    df = df.sort_values(by=["mu_math_f1"], ascending=False)
    df["rank"] = range(1, len(df) + 1)

    # populate with model info
    if add_meta:
        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")

    # resolve rate columns to percent
    # NOTE(review): rounding uses 2 digits here while the U-MATH table uses
    # PERCENT_ROUND_DIGITS - confirm whether the difference is intended.
    for col in df.columns:
        # Columns not described in the dict are left untouched instead of
        # raising KeyError.
        column_spec = MU_MATH_COLUMNS_DICT.get(col)
        if column_spec is None or column_spec.column_type != "rate":
            continue
        # Missing values must not decide the scale (NaN compares False).
        present = df[col].dropna()
        if (present <= 1).all():
            df[col] = (df[col] * 100).round(2)
        elif (present <= 100).all():
            df[col] = df[col].round(2)
        else:
            raise ValueError(f"Column {col} has values {df[col]} that are not in [0, 1] or [0, 100]")

    # convert to pretty names and sort columns by order in dict
    if use_pretty_names:
        # Select only the columns that exist (the meta columns are absent
        # when add_meta is False), matching the tolerance of the rename.
        ordered_keys = [key for key in MU_MATH_COLUMNS_DICT if key in df.columns]
        df = df[ordered_keys]
        df = df.rename(columns={key: MU_MATH_COLUMNS_DICT[key].pretty_name for key in ordered_keys})
    return df
def get_joined_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
    """Create a DataFrame joining the U-MATH and mu-MATH leaderboards.

    Only models present in both leaderboards are kept (inner join on
    ``full_model_name``); rows are ordered by the U-MATH rank.

    Args:
        use_pretty_names: Keep only the columns listed in
            ``U_MATH_AND_MU_MATH_COLUMNS_DICT`` (in that order) and rename
            them to their pretty names.
        add_meta: Merge in model metadata from ``get_model_meta_info_df``.

    Returns:
        The joined leaderboard with ``u_math_rank`` and ``mu_math_rank``
        columns taken from the individual leaderboards.
    """
    # Meta info is added once after the join, so it is skipped here.
    u_math_df = get_u_math_leaderboard_df(use_pretty_names=False, add_meta=False)
    u_math_df = u_math_df.rename(columns={"rank": "u_math_rank"})
    mu_math_df = get_mu_math_leaderboard_df(use_pretty_names=False, add_meta=False)
    mu_math_df = mu_math_df.rename(columns={"rank": "mu_math_rank"})

    # The join key must be the only column the two frames have in common.
    shared_columns = set(u_math_df.columns).intersection(set(mu_math_df.columns))
    assert shared_columns == {"full_model_name"}, f"Columns overlap in {u_math_df.columns} and {mu_math_df.columns}"

    # merge U-MATH and mu-MATH dataframes
    df = pd.merge(u_math_df, mu_math_df, on=["full_model_name"], how="inner", suffixes=("", ""))

    # sort by rank on u_math
    df = df.sort_values(by=["u_math_rank"], ascending=True)

    # add meta info
    if add_meta:
        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")

    # convert to pretty names and sort columns by order in dict
    if use_pretty_names:
        # Select only the columns that exist (the meta columns are absent
        # when add_meta is False), matching the tolerance of the rename.
        ordered_keys = [key for key in U_MATH_AND_MU_MATH_COLUMNS_DICT if key in df.columns]
        df = df[ordered_keys]
        df = df.rename(
            columns={key: U_MATH_AND_MU_MATH_COLUMNS_DICT[key].pretty_name for key in ordered_keys}
        )
    return df