Spaces:

autogluon
/

fev-leaderboard-mirror

Running

File size: 13,731 Bytes

import altair as alt
import fev
import pandas as pd
import pandas.io.formats.style

# Color constants - all colors defined in one place.
# Values are color strings (hex codes or named colors) that are inserted into
# CSS style strings or handed to Altair.

COLORS = {
    # Leaderboard model-name text, chosen by highlight_model_type_color:
    # "dl_text" when the model type is "DL", "st_text" for any other type.
    "dl_text": "#5A7FA5",
    "st_text": "#A5795A",
    # Marks drawn by construct_bar_chart: bars, error bars and point markers.
    "bar_fill": "#8d5eb7",
    "error_bar": "#222222",
    "point": "#111111",
    # Cell-label colors in the pairwise heatmap (construct_pairwise_chart).
    "text_white": "white",
    "text_black": "black",
    # Pivot-table text color for cells that receive no other styling.
    "text_default": "#111",
    # Pivot-table cell backgrounds for within-row ranks 1, 2 and 3.
    "gold": "#F7D36B",
    "silver": "#E5E7EB",
    "bronze": "#E6B089",
    # Pivot-table text colors used by construct_pivot_table: cells flagged as
    # overlapping with training data, and cells that were missing (NaN)
    # before imputation.
    "leakage_impute": "#3B82A0",
    "failure_impute": "#E07B39",
}
# Color scheme name passed as `scheme` to the heatmap's alt.Scale in
# construct_pairwise_chart.
HEATMAP_COLOR_SCHEME = "purplegreen"

# Model configuration: (url, org, zero_shot, model_type)
#   url        - either a full "https:" URL, or a path that get_model_link
#                appends to "https://huggingface.co/".
#   org        - organization label; "—" is the placeholder used for the
#                statsforecast entries below.
#   zero_shot  - bool; get_zero_shot_status renders True as "✓", False as "×".
#   model_type - "DL" or "ST"; selects the text color in
#                highlight_model_type_color (presumably deep-learning vs.
#                statistical — TODO confirm).
# Keys must be lowercase: every lookup helper calls `.lower()` on the model
# name before consulting this table.
MODEL_CONFIG = {
    # Chronos Models
    "chronos_tiny": ("amazon/chronos-t5-tiny", "AWS", True, "DL"),
    "chronos_mini": ("amazon/chronos-t5-mini", "AWS", True, "DL"),
    "chronos_small": ("amazon/chronos-t5-small", "AWS", True, "DL"),
    "chronos_base": ("amazon/chronos-t5-base", "AWS", True, "DL"),
    "chronos_large": ("amazon/chronos-t5-large", "AWS", True, "DL"),
    "chronos_bolt_tiny": ("amazon/chronos-bolt-tiny", "AWS", True, "DL"),
    "chronos_bolt_mini": ("amazon/chronos-bolt-mini", "AWS", True, "DL"),
    "chronos_bolt_small": ("amazon/chronos-bolt-small", "AWS", True, "DL"),
    "chronos_bolt_base": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
    # Same target as "chronos_bolt_base", registered under a second name.
    "chronos-bolt": ("amazon/chronos-bolt-base", "AWS", True, "DL"),
    # Moirai Models
    "moirai_large": ("Salesforce/moirai-1.1-R-large", "Salesforce", True, "DL"),
    "moirai_base": ("Salesforce/moirai-1.1-R-base", "Salesforce", True, "DL"),
    "moirai_small": ("Salesforce/moirai-1.1-R-small", "Salesforce", True, "DL"),
    "moirai-2.0": ("Salesforce/moirai-2.0-R-small", "Salesforce", True, "DL"),
    # TimesFM Models
    "timesfm": ("google/timesfm-1.0-200m-pytorch", "Google", True, "DL"),
    "timesfm-2.0": ("google/timesfm-2.0-500m-pytorch", "Google", True, "DL"),
    "timesfm-2.5": ("google/timesfm-2.5-200m-pytorch", "Google", True, "DL"),
    # Toto Models
    "toto-1.0": ("Datadog/Toto-Open-Base-1.0", "Datadog", True, "DL"),
    # Other Models
    "tirex": ("NX-AI/TiRex", "NX-AI", True, "DL"),
    "tabpfn-ts": ("Prior-Labs/TabPFN-v2-reg", "Prior Labs", True, "DL"),
    "sundial-base": ("thuml/sundial-base-128m", "Tsinghua University", True, "DL"),
    "ttm-r2": ("ibm-granite/granite-timeseries-ttm-r2", "IBM", True, "DL"),
    # Task-specific models (zero_shot=False); all link to the statsforecast docs.
    "stat. ensemble": (
        "https://nixtlaverse.nixtla.io/statsforecast/",
        "—",
        False,
        "ST",
    ),
    "autoarima": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "autotheta": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "autoets": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    # Both spellings of the seasonal naive model are registered.
    "seasonalnaive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "seasonal naive": (
        "https://nixtlaverse.nixtla.io/statsforecast/",
        "—",
        False,
        "ST",
    ),
    "drift": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
    "naive": ("https://nixtlaverse.nixtla.io/statsforecast/", "—", False, "ST"),
}


# Metric registry: metric key -> (display title, markdown description).
# format_metric_name reads element 0 and get_metric_description reads element 1.
# All four descriptions follow the same sentence pattern:
# "The [<name>](<docs link>) is a **<scale>** metric for evaluating **<kind>** forecasts."
ALL_METRICS = {
    "SQL": (
        "SQL: Scaled Quantile Loss",
        "The [Scaled Quantile Loss (SQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.SQL) is a **scale-invariant** metric for evaluating **probabilistic** forecasts.",
    ),
    "MASE": (
        "MASE: Mean Absolute Scaled Error",
        "The [Mean Absolute Scaled Error (MASE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.MASE) is a **scale-invariant** metric for evaluating **point** forecasts.",
    ),
    "WQL": (
        "WQL: Weighted Quantile Loss",
        # No comma after the link: it used to render as "...(WQL), is a ...".
        "The [Weighted Quantile Loss (WQL)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WQL) is a **scale-dependent** metric for evaluating **probabilistic** forecasts.",
    ),
    "WAPE": (
        "WAPE: Weighted Absolute Percentage Error",
        "The [Weighted Absolute Percentage Error (WAPE)](https://auto.gluon.ai/dev/tutorials/timeseries/forecasting-metrics.html#autogluon.timeseries.metrics.WAPE) is a **scale-dependent** metric for evaluating **point** forecasts.",
    ),
}


def format_metric_name(metric_name: str):
    """Return the display title stored for ``metric_name`` in ``ALL_METRICS``.

    Raises ``KeyError`` when the metric is not registered.
    """
    entry = ALL_METRICS[metric_name]
    # Element 0 of each entry is the title; element 1 is the description.
    return entry[0]


def get_metric_description(metric_name: str):
    """Return the markdown description stored for ``metric_name`` in ``ALL_METRICS``.

    Raises ``KeyError`` when the metric is not registered.
    """
    entry = ALL_METRICS[metric_name]
    # Element 1 of each entry is the description; element 0 is the title.
    return entry[1]


def get_model_link(model_name):
    """Return the URL for ``model_name``, or ``""`` when none is configured.

    The name is lowercased before the ``MODEL_CONFIG`` lookup. A stored value
    that already starts with ``"https:"`` is returned as is; anything else is
    treated as a path under ``https://huggingface.co/``.
    """
    entry = MODEL_CONFIG.get(model_name.lower())
    target = entry[0] if entry else ""
    if not target:
        # Unknown model, or a known model without a link.
        return ""
    if target.startswith("https:"):
        return target
    return f"https://huggingface.co/{target}"


def get_model_organization(model_name):
    """Return the organization label for ``model_name``, or ``"—"`` if unknown.

    The name is lowercased before the ``MODEL_CONFIG`` lookup.
    """
    entry = MODEL_CONFIG.get(model_name.lower())
    if not entry:
        return "—"
    return entry[1]


def get_zero_shot_status(model_name):
    """Return ``"✓"`` when ``model_name`` is configured as zero-shot, else ``"×"``.

    Models missing from ``MODEL_CONFIG`` are reported as ``"×"``. The name is
    lowercased before the lookup.
    """
    entry = MODEL_CONFIG.get(model_name.lower())
    if entry and entry[2]:
        return "✓"
    return "×"


def get_model_type(model_name):
    """Return the model-type tag for ``model_name``, or ``"—"`` if unknown.

    The name is lowercased before the ``MODEL_CONFIG`` lookup.
    """
    entry = MODEL_CONFIG.get(model_name.lower())
    if not entry:
        return "—"
    return entry[3]


def highlight_model_type_color(cell):
    """Return the CSS style string for a model-name cell.

    The text is always bold. Models found in ``MODEL_CONFIG`` additionally get
    a text color: ``COLORS["dl_text"]`` for type ``"DL"`` and
    ``COLORS["st_text"]`` for every other type.
    """
    entry = MODEL_CONFIG.get(cell.lower())
    if not entry:
        # Unknown model: bold only, no color.
        return "font-weight: bold"
    color_key = "dl_text" if entry[3] == "DL" else "st_text"
    text_color = COLORS[color_key]
    return f"font-weight: bold; color: {text_color}"


def format_leaderboard(df: pd.DataFrame):
    """Return a styled copy of the leaderboard table for display.

    The input frame is not modified. On a copy, this function:

    * rounds ``skill_score`` and ``win_rate`` to one decimal;
    * adds ``zero_shot`` ("✓" / "×"), ``link`` and ``org`` columns derived
      from ``model_name``;
    * rewrites ``training_corpus_overlap`` as an integer: the original value
      times 100, rounded, for zero-shot models, and 0 for all other models;
    * keeps only the display columns, in a fixed order.

    Args:
        df: Leaderboard frame containing at least ``model_name``,
            ``win_rate``, ``skill_score``, ``median_inference_time_s``,
            ``training_corpus_overlap`` and ``num_failures``.

    Returns:
        A pandas ``Styler`` with bold, type-colored model names, bold
        zero-shot markers and a light background on every second row.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a zero-shot model has a NaN ``training_corpus_overlap``.
    """
    df = df.copy()
    df["skill_score"] = df["skill_score"].round(1)
    df["win_rate"] = df["win_rate"].round(1)
    df["zero_shot"] = df["model_name"].apply(get_zero_shot_status)

    # Column-wise instead of a row-wise ``df.apply``: on an empty frame the
    # row-wise form returns a DataFrame, so the column assignment raised.
    # Non-zero-shot rows are forced to 0 *before* the arithmetic, so missing
    # values in those rows never reach the integer cast.
    is_zero_shot = df["zero_shot"] == "✓"
    df["training_corpus_overlap"] = (
        df["training_corpus_overlap"]
        .where(is_zero_shot, 0)
        .astype(float)
        .mul(100)
        .round()
        .astype(int)
    )

    df["link"] = df["model_name"].apply(get_model_link)
    df["org"] = df["model_name"].apply(get_model_organization)
    df = df[
        [
            "model_name",
            "win_rate",
            "skill_score",
            "median_inference_time_s",
            "training_corpus_overlap",
            "num_failures",
            "zero_shot",
            "org",
            "link",
        ]
    ]
    return (
        df.style.map(highlight_model_type_color, subset=["model_name"])
        .map(lambda x: "font-weight: bold", subset=["zero_shot"])
        .apply(
            # Zebra striping by row position: applied per column, shading
            # every second row (positions 1, 3, 5, ...).
            lambda x: [
                "background-color: #f8f9fa" if i % 2 == 1 else "" for i in range(len(x))
            ],
            axis=0,
        )
    )


def construct_bar_chart(df: pd.DataFrame, col: str, metric_name: str):
    """Build a layered chart of ``col`` per model: bars, error bars and points.

    Args:
        df: Frame with ``model_name``, ``col``, ``{col}_lower`` and
            ``{col}_upper`` columns.
        col: Column to plot. ``"skill_score"`` is labelled "Skill Score";
            any other value is labelled "Win Rate".
        metric_name: Metric name shown in the chart title.

    Returns:
        The layered Altair chart (height 500, title font size 16). Models are
        drawn on the y axis in the order they appear in ``df`` (``sort=None``).
    """
    if col == "skill_score":
        label = "Skill Score"
    else:
        label = "Win Rate"
    axis_title = f"{label} (%)"

    hover = [
        alt.Tooltip("model_name:N"),
        alt.Tooltip(f"{col}:Q", format=".2f"),
        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".2f"),
        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".2f"),
    ]
    # Shared by the bar and point layers; the error-bar layer uses its own
    # untitled y encoding.
    model_axis = alt.Y("model_name:N", title="Forecasting Model", sort=None)

    bar_layer = alt.Chart(df).mark_bar(color=COLORS["bar_fill"], cornerRadius=4)
    bar_layer = bar_layer.encode(
        x=alt.X(f"{col}:Q", title=axis_title, scale=alt.Scale(zero=False)),
        y=model_axis,
        tooltip=hover,
    )

    interval_layer = alt.Chart(df).mark_errorbar(
        ticks={"height": 5}, color=COLORS["error_bar"]
    )
    interval_layer = interval_layer.encode(
        y=alt.Y("model_name:N", title=None, sort=None),
        x=alt.X(f"{col}_lower:Q", title=axis_title),
        x2=alt.X2(f"{col}_upper:Q"),
        tooltip=hover,
    )

    point_layer = alt.Chart(df).mark_point(filled=True, color=COLORS["point"])
    point_layer = point_layer.encode(
        x=alt.X(f"{col}:Q", title=axis_title),
        y=model_axis,
        tooltip=hover,
    )

    layered = bar_layer + interval_layer + point_layer
    layered = layered.properties(
        height=500, title=f"{label} ({metric_name}) with 95% CIs"
    )
    return layered.configure_title(fontSize=16)


def construct_pairwise_chart(df: pd.DataFrame, col: str, metric_name: str):
    """Build a model-vs-model heatmap of ``col`` with the value printed per cell.

    Args:
        df: Frame with ``model_1``, ``model_2``, ``col``, ``{col}_lower`` and
            ``{col}_upper`` columns. It is copied, and the three value columns
            are multiplied by 100 before plotting.
        col: ``"win_rate"`` or ``"skill_score"``; selects the label, color
            domain, domain midpoint and the white-text condition.
        metric_name: Metric name shown in the chart title.

    Returns:
        The layered Altair chart (heatmap plus text labels, height 550).

    Raises:
        KeyError: If ``col`` is neither ``"win_rate"`` nor ``"skill_score"``.
    """
    # Per-column preset: (label, color domain, domain midpoint, Vega
    # expression selecting the cells that get white text).
    presets = {
        "win_rate": ("Win Rate", [0, 100], 50, f"abs(datum.{col} - 50) > 30"),
        "skill_score": ("Skill Score", [-15, 15], 0, f"abs(datum.{col}) > 10"),
    }
    cbar_label, color_domain, color_mid, white_text_test = presets[col]

    scaled = df.copy()
    for suffix in ("", "_lower", "_upper"):
        value_col = f"{col}{suffix}"
        scaled[value_col] = scaled[value_col] * 100

    # Both axes list models by descending mean of ``col`` over their model_1 rows.
    mean_by_model = scaled.groupby("model_1")[col].mean()
    axis_order = mean_by_model.sort_values(ascending=False).index.tolist()

    hover = [
        alt.Tooltip("model_1:N", title="Model 1"),
        alt.Tooltip("model_2:N", title="Model 2"),
        alt.Tooltip(f"{col}:Q", title=cbar_label.split(" ")[0], format=".1f"),
        alt.Tooltip(f"{col}_lower:Q", title="95% CI Lower", format=".1f"),
        alt.Tooltip(f"{col}_upper:Q", title="95% CI Upper", format=".1f"),
    ]

    x_axis = alt.X(
        "model_2:N",
        sort=axis_order,
        title="Model 2",
        axis=alt.Axis(orient="top", labelAngle=-90),
    )
    y_axis = alt.Y("model_1:N", sort=axis_order, title="Model 1")
    grid = alt.Chart(scaled).encode(x=x_axis, y=y_axis)

    fill_scale = alt.Scale(
        scheme=HEATMAP_COLOR_SCHEME,
        domain=color_domain,
        domainMid=color_mid,
        clamp=True,
    )
    cells = grid.mark_rect().encode(
        color=alt.Color(f"{col}:Q", legend=None, scale=fill_scale),
        tooltip=hover,
    )

    label_color = alt.condition(
        white_text_test,
        alt.value(COLORS["text_white"]),
        alt.value(COLORS["text_black"]),
    )
    labels = grid.mark_text(dy=-8, fontSize=8, baseline="top", yOffset=5).encode(
        text=alt.Text(f"{col}:Q", format=".1f"),
        color=label_color,
        tooltip=hover,
    )

    chart_title = {
        "text": f"Pairwise {cbar_label} ({metric_name}) with 95% CIs",
        "fontSize": 16,
    }
    layered = (cells + labels).properties(height=550, title=chart_title)
    layered = layered.configure_axis(
        labelFontSize=11, titleFontSize=13, titleFontWeight="bold"
    )
    return layered.resolve_scale(color="independent")


def construct_pivot_table_from_df(
    errors: pd.DataFrame, metric_name: str
) -> pd.io.formats.style.Styler:
    """Construct styled pivot table from precomputed DataFrame.

    Values are ranked within each row, smallest first, with ties sharing the
    lowest rank. Cells ranked 1, 2 or 3 get the gold, silver or bronze
    background from ``COLORS``; every other cell (including NaN) gets the
    default text color. Values are displayed with 3 digits of precision.

    Args:
        errors: Numeric frame to display; each row is ranked independently.
        metric_name: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A pandas ``Styler`` for ``errors``.
    """
    rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}
    default_style = f"color: {COLORS['text_default']}"

    def cell_style(rank):
        # A NaN rank compares False here and falls through to the default.
        if rank <= 3:
            return f"background-color: {rank_colors[int(rank)]}"
        return default_style

    # Build the full CSS grid once and hand it to a single Styler.apply call,
    # rather than registering one Styler.map call per cell.
    rank_rows = errors.rank(axis=1, method="min").to_numpy()
    styles = pd.DataFrame(
        [[cell_style(rank) for rank in row] for row in rank_rows],
        index=errors.index,
        columns=errors.columns,
    )
    return errors.style.apply(lambda _: styles, axis=None).format(precision=3)


def construct_pivot_table(
    summaries: pd.DataFrame,
    metric_name: str,
    baseline_model: str,
    leakage_imputation_model: str,
) -> pd.io.formats.style.Styler:
    """Build the styled pivot table of ``metric_name`` with imputed cells marked.

    Steps:

    1. Pivot ``summaries`` with ``fev.pivot_table`` twice: once on
       ``metric_name`` and once on ``"trained_on_this_dataset"`` (missing
       flags count as ``False``). Rows are indexed by task name and columns
       are model names.
    2. Wherever the training-overlap flag is set, replace the value with the
       ``leakage_imputation_model`` value from the same row.
    3. Fill the remaining NaNs in every non-baseline column with the
       ``baseline_model`` value from the same row.
    4. Order columns by ascending mean within-row rank and name the index
       "Task name".
    5. Style each cell: gold/silver/bronze background for within-row ranks
       1-3 (ties share the lowest rank); leakage-imputed cells get the
       ``leakage_impute`` text color, otherwise cells that were NaN before
       imputation get the ``failure_impute`` text color, otherwise cells
       without a rank background get the default text color.

    Args:
        summaries: Input passed to ``fev.pivot_table`` (expected to provide
            ``metric_name``, ``trained_on_this_dataset`` and ``task_name``).
        metric_name: Metric column to pivot.
        baseline_model: Column used to fill missing values.
        leakage_imputation_model: Column whose values replace cells flagged
            as training overlap.

    Returns:
        A pandas ``Styler`` showing values with 3 digits of precision.

    Raises:
        KeyError: If either model name is not a column of the pivoted table.
    """
    errors = fev.pivot_table(
        summaries=summaries, metric_column=metric_name, task_columns=["task_name"]
    )
    train_overlap = (
        fev.pivot_table(
            summaries=summaries,
            metric_column="trained_on_this_dataset",
            task_columns=["task_name"],
        )
        .fillna(False)
        .astype(bool)
    )

    # Captured before any imputation so the styling can tell which cells were
    # originally missing.
    is_imputed_baseline = errors.isna()
    is_leakage_imputed = train_overlap

    # Handle imputations
    errors = errors.mask(train_overlap, errors[leakage_imputation_model], axis=0)
    for col in errors.columns:
        if col != baseline_model:
            errors[col] = errors[col].fillna(errors[baseline_model])

    errors = errors[errors.rank(axis=1).mean().sort_values().index]
    errors.index.rename("Task name", inplace=True)

    rank_colors = {1: COLORS["gold"], 2: COLORS["silver"], 3: COLORS["bronze"]}
    leakage_style = f"color: {COLORS['leakage_impute']}"
    failure_style = f"color: {COLORS['failure_impute']}"
    default_style = f"color: {COLORS['text_default']}"

    def cell_style(rank, leaked, failed):
        parts = []
        if rank <= 3:
            parts.append(f"background-color: {rank_colors[int(rank)]}")
        # The leakage marker takes precedence over the failure marker; the
        # default text color is only used when nothing else applies.
        if leaked:
            parts.append(leakage_style)
        elif failed:
            parts.append(failure_style)
        elif not parts:
            parts.append(default_style)
        return "; ".join(parts)

    # Align the flag frames to the reordered columns, then build the full CSS
    # grid once and apply it in a single Styler.apply call rather than one
    # Styler.map call per cell.
    rank_rows = errors.rank(axis=1, method="min").to_numpy()
    leaked_rows = is_leakage_imputed.loc[errors.index, errors.columns].to_numpy()
    failed_rows = is_imputed_baseline.loc[errors.index, errors.columns].to_numpy()
    styles = pd.DataFrame(
        [
            [cell_style(*cell) for cell in zip(ranks, leaked, failed)]
            for ranks, leaked, failed in zip(rank_rows, leaked_rows, failed_rows)
        ],
        index=errors.index,
        columns=errors.columns,
    )
    return errors.style.apply(lambda _: styles, axis=None).format(precision=3)