Spaces:

meta-agents-research-environments
/

leaderboard

Running on CPU Upgrade

File size: 19,395 Bytes

import datetime
import json
import os
from pathlib import Path

import datasets

import gradio as gr
import pandas as pd
import requests
from apscheduler.schedulers.background import BackgroundScheduler

# InfoStrings
from content import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    CONTACT_DATASET,
    INTRODUCTION_TEXT,
    LEADERBOARD_PATH,
    OWNER,
    RESULTS_DATASET,
    SCENARIO_LIST,
    SUBMISSION_TEXT,
    TITLE,
)
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from huggingface_hub import create_repo, snapshot_download, upload_folder

from utils import api, Experiment, format_log, model_hyperlink, TOKEN

# Load the contact dataset once at import time, skipping checksum/size verification.
# NOTE(review): add_new_eval reloads its own local copy of this dataset, so this
# module-level value does not appear to be read elsewhere in the visible code — confirm.
contact_infos = datasets.load_dataset(
    CONTACT_DATASET, token=TOKEN, verification_mode=datasets.VerificationMode.NO_CHECKS
)  # download_mode="force_redownload"


def get_display_name(capability: str) -> str:
    """
    Map an internal capability identifier to the label shown on the leaderboard.

    Args:
        capability: Internal capability name from the benchmark

    Returns:
        "noise" if the name contains "noise"; "A2A" if it contains
        "agent2agent" or "a2a"; otherwise the name unchanged
    """
    # The noise check comes first, so a name matching both markers maps to "noise"
    if "noise" in capability:
        return "noise"

    if any(marker in capability for marker in ("agent2agent", "a2a")):
        return "A2A"

    return capability


def cleanup(row) -> dict:
    """
    Build one leaderboard row from a flattened evaluation result.

    Args:
        row: Flattened evaluation result, indexed by dotted column names
            such as "metadata.model" or "statistics.global.total_runs"

    Returns:
        Dictionary of display columns: model, provider, total score, one
        "<score> ± <sem>" string per enabled capability, number of runs,
        submitter and submission date
    """
    # Capability columns are emitted in this fixed order for a stable layout
    ordered_capabilities = (
        "execution",
        "search",
        "ambiguity",
        "adaptability",
        "time",
        "mini_noise",
        "mini_agent2agent",
    )

    # Basic model information
    cleaned = {
        "Model": row["metadata.model"],
        "Provider": row["metadata.model_provider"],
        "Total score (%)": round(row["statistics.global.macro_success_rate"], 1),
    }

    for name in ordered_capabilities:
        # Only capabilities enabled in SCENARIO_LIST get a column
        if name not in SCENARIO_LIST:
            continue

        label = get_display_name(name)
        prefix = f"statistics.per_capability.{name}"
        success_rate = row[f"{prefix}.success_rate"]
        std_err = row[f"{prefix}.success_rate_sem"]

        # Pad the score to width 4 with non-breaking spaces so decimals line up;
        # the SEM is left unpadded to avoid extra spaces
        padded_rate = format(success_rate, "4.1f").replace(" ", "\u00A0")
        cleaned[f"{label} (%)"] = f"{padded_rate} ± {std_err:.1f}"

    # Runs per scenario, guarding against a zero scenario count
    scenario_count = row["statistics.global.total_scenarios"]
    if scenario_count != 0:
        runs_per_scenario = row["statistics.global.total_runs"] / scenario_count
    else:
        runs_per_scenario = 0
    cleaned["Number of runs"] = runs_per_scenario

    cleaned["Submitter"] = row["metadata.organisation"]
    # Keep only the first 10 characters of the timestamp
    cleaned["Submission date"] = row["metadata.timestamp"][:10]

    return cleaned


def get_dataframe_from_results() -> pd.DataFrame:
    """
    Build the leaderboard table from the results dataset.

    Loads RESULTS_DATASET, flattens its "train" split, keeps the columns that
    cleanup() reads, maps every row through cleanup(), and sorts the outcome
    by total score.

    Returns:
        Pandas DataFrame of leaderboard rows sorted by "Total score (%)" in
        descending order, or an empty DataFrame when there is no data
    """
    split = "train"

    # An empty dataset repository is treated as "no results yet"
    try:
        eval_results = datasets.load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            verification_mode=datasets.VerificationMode.NO_CHECKS,
        )
    except datasets.data_files.EmptyDatasetError:
        eval_results = datasets.DatasetDict()

    has_rows = (
        bool(eval_results)
        and split in eval_results
        and len(eval_results[split]) > 0
    )
    if not has_rows:
        return pd.DataFrame([])

    # Raw columns needed downstream: metadata, global stats, then per-capability stats
    required_columns = [
        "metadata.model",
        "metadata.model_provider",
        "metadata.organisation",
        "metadata.timestamp",
        "metadata.url",
        "statistics.global.macro_success_rate",
        "statistics.global.total_runs",
        "statistics.global.total_scenarios",
    ]
    required_columns += [
        f"statistics.per_capability.{capability}.{stat}"
        for capability in SCENARIO_LIST
        for stat in ("success_rate", "success_rate_sem")
    ]

    # Select the raw columns, add the display columns, then drop the raw ones
    flattened = eval_results[split].flatten()
    selected = flattened.select_columns(required_columns)
    display_rows = selected.map(cleanup, batched=False).remove_columns(
        required_columns
    )

    # Highest total score first
    return pd.DataFrame(display_rows).sort_values(
        by=["Total score (%)"], ascending=False
    )


# At the moment there is only one result set; the leaderboard table is built
# once here, at import time, and used as the initial value of the Leaderboard component.
eval_dataframe_val = get_dataframe_from_results()


def restart_space():
    """Ask the Hub API to restart the Space identified by LEADERBOARD_PATH."""
    api.restart_space(
        repo_id=LEADERBOARD_PATH,
        token=TOKEN,
    )


def add_new_eval(
    organisation: str,
    path_to_repository: str,
    profile: gr.OAuthProfile,
    token: gr.OAuthToken,
):
    """
    Validate a submission, record its results, and archive a private copy.

    Steps:
      1. Reject accounts created less than 60 days ago.
      2. Reject a user whose most recent recorded submission is dated today.
      3. Download the submitted dataset repository and check that every
         scenario in SCENARIO_LIST loads with a "test" split.
      4. Read "computed_stats.json" from the snapshot and append it to
         RESULTS_DATASET.
      5. Upload a private copy of the snapshot under OWNER and append the
         submitter's contact information to CONTACT_DATASET.

    Args:
        organisation: Organisation name entered by the submitter
        path_to_repository: Hub dataset repository id holding the submission
        profile: OAuth profile of the logged-in user
        token: OAuth token of the logged-in user, used to download the submission

    Returns:
        The success message passed through format_log

    Raises:
        requests.HTTPError: If the user overview request returns an error status
        ValueError: If one of the scenarios cannot be loaded from the submission
        Exception: If the account is too recent, the user already submitted
            today, or the same results were already submitted
    """
    # ---- USER CHECKS ----
    # Was the profile created less than 2 months ago?
    user_data = requests.get(
        f"https://huggingface.co/api/users/{profile.username}/overview",
        timeout=30,  # never hang the submission handler on a stalled request
    )
    user_data.raise_for_status()
    # The trailing "Z" in the timestamp format denotes UTC, so compare against
    # an aware UTC "now" instead of the server's naive local time
    creation_date = datetime.datetime.strptime(
        user_data.json()["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ"
    ).replace(tzinfo=datetime.timezone.utc)
    account_age = datetime.datetime.now(datetime.timezone.utc) - creation_date
    if account_age < datetime.timedelta(days=60):
        raise Exception("This account is not authorized to submit on Gaia2.")

    # Can't submit several times per day
    contact_infos = datasets.load_dataset(
        CONTACT_DATASET,
        token=TOKEN,
        verification_mode=datasets.VerificationMode.NO_CHECKS,
    )
    # Scan every split: new submissions are appended to "test" below, so
    # checking only "train" would never see them
    latest_submission_date = max(
        (
            row["date"]
            for split_rows in contact_infos.values()
            for row in split_rows
            if row["username"] == profile.username
        ),
        default=None,
    )
    if latest_submission_date == datetime.datetime.today().strftime("%Y-%m-%d"):
        raise Exception("You already submitted once today, please try again tomorrow.")

    # ---- EXPERIMENT MANAGEMENT ----
    # Download locally with HF hub, using the submitter's own token
    snapshot_path = snapshot_download(
        repo_id=path_to_repository, token=token.token, repo_type="dataset"
    )

    # Test completeness with datasets
    try:
        for scenario in SCENARIO_LIST:
            # Loading what the user provided
            datasets.load_dataset(
                snapshot_path,
                scenario,
                split="test",
                verification_mode=datasets.VerificationMode.NO_CHECKS,
            )
    except Exception as e:
        print(e)
        raise ValueError(
            f"We cannot load the scenario {scenario} for your dataset ({path_to_repository}). Please make sure the dataset is accessible and all subsets are there."
        ) from e

    with open(Path(snapshot_path, "computed_stats.json"), encoding="utf-8") as f:
        results = json.load(f)
    model = results["metadata"]["model"]
    results["metadata"]["organisation"] = organisation
    results["metadata"]["url"] = path_to_repository

    # Load and push the results dataset with the same token and verification
    # mode as every other hub call in this file (the dataset is pushed private)
    try:
        ds = datasets.load_dataset(
            RESULTS_DATASET,
            split="train",
            token=TOKEN,
            verification_mode=datasets.VerificationMode.NO_CHECKS,
        )
    except datasets.data_files.EmptyDatasetError:
        ds = datasets.Dataset.from_dict({})

    if results in ds:
        raise Exception("This precise model and results file was already submitted")
    ds = ds.add_item(results)
    ds.push_to_hub(RESULTS_DATASET, split="train", private=True, token=TOKEN)

    experiment = Experiment(path_to_repository, organisation, model)
    private_copy_id = f"{OWNER}/{str(experiment)}"

    # Save copy to hub
    create_repo(
        repo_id=private_copy_id,
        repo_type="dataset",
        token=TOKEN,
        private=True,
    )
    upload_folder(
        folder_path=snapshot_path,
        repo_id=private_copy_id,
        repo_type="dataset",
        token=TOKEN,
    )

    print(f"Adding new eval: {str(experiment)}")

    # SAVE ALL INFO
    contact_info = {
        "model": experiment.model,
        "path_to_hub": experiment.path_to_hub,
        "path_to_hub_private_copy": private_copy_id,
        "organisation": experiment.organisation,
        "date": experiment.cur_date,
        "username": profile.username,
        "mail": getattr(profile, "email", None),
    }
    # NOTE(review): assumes CONTACT_DATASET already has a "test" split — confirm
    contact_infos["test"] = contact_infos["test"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    return format_log(
        f"Model {model} submitted by {organisation} successfully.\nPlease wait a couple minutes and refresh the leaderboard to see your score displayed."
    )


def refresh():
    """Rebuild the leaderboard DataFrame from the results dataset and return it."""
    latest_table = get_dataframe_from_results()
    return latest_table


# Custom CSS for sleek styling.
# NOTE: currently not applied — the `css=custom_css` argument of gr.Blocks
# further down is commented out. The elem_classes used in the layout
# ("markdown-text", "leaderboard-container", "submission-section") match
# selectors defined here.
custom_css = """
<style>
    /* Global styling */
    .gradio-container {
        max-width: 1400px !important;
        margin: auto;
        padding: 20px;
        background: linear-gradient(135deg, #f8fbff 0%, #e3f2fd 100%);
        min-height: auto !important; /* override HF default */
        padding-bottom: 0 !important; /* remove extra bottom padding */
    }

    html, body, #root {
    margin: 0;
    padding: 0;
    height: auto !important;          /* don't lock to viewport height */
    min-height: 100%;
    overflow-x: hidden !important;
    overflow-y: auto !important;     /* ensure vertical scroll is possible */
    box-sizing: border-box;
    }

    /* Markdown text styling */
    .markdown-text {
        background: white;
        padding: 25px;
        border-radius: 12px;
        box-shadow: 0 4px 20px rgba(0,0,0,0.08);
        margin: 20px 0;
        border-left: 4px solid #0081FB;
        font-size: 16px;
        line-height: 1.6;
    }

    /* Button styling */
    .gr-button {
        background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%) !important;
        border: none !important;
        border-radius: 8px !important;
        color: white !important;
        font-weight: 600 !important;
        padding: 12px 24px !important;
        transition: all 0.3s ease !important;
        box-shadow: 0 4px 15px rgba(0, 129, 251, 0.3) !important;
    }

    .gr-button:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 8px 25px rgba(0, 129, 251, 0.4) !important;
    }

    /* Input fields styling */
    .gr-textbox {
        border-radius: 8px !important;
        border: 2px solid #e1e5e9 !important;
        background: white !important;
        transition: all 0.3s ease !important;
    }

    .gr-textbox:focus {
        border-color: #667eea !important;
        box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
    }

    /* Accordion styling */
    .gr-accordion {
        background: white !important;
        border-radius: 12px !important;
        box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
        border: none !important;
        margin: 15px 0 !important;
    }

    /* Leaderboard styling */
    .leaderboard-container {
        background: white !important;
        border-radius: 15px !important;
        box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
        overflow: hidden !important;
        margin: 25px 0 !important;
        border: none !important;
    }

    /* Remove any default Gradio gray backgrounds */
    .gradio-container .gr-column,
    .gradio-container .gr-row {
        background: transparent !important;
    }

    /* Ensure leaderboard table has clean white background */
    .leaderboard-container table,
    .leaderboard-container .gr-table {
        background: white !important;
        border: none !important;
    }

    /* Submission form styling */
    .submission-section {
        background: white;
        padding: 30px;
        border-radius: 15px;
        box-shadow: 0 6px 25px rgba(0,0,0,0.08);
        margin: 25px 0;
    }
</style>
"""

# Top-level Blocks app using the Soft theme with a blue primary hue and
# Roboto (falling back to Arial / sans-serif) as the font.
# The custom stylesheet is deliberately left disabled via the commented-out `css=` line.
demo = gr.Blocks(
    # css=custom_css,
    theme=gr.themes.Soft(
        font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
    ),
)
# Page layout, top to bottom: title, "About" accordion, leaderboard table,
# submission form, external link buttons, citation box; event wiring comes last.
with demo:
    gr.HTML(TITLE)

    with gr.Accordion("About", open=True):
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Enhanced leaderboard with custom styling
    with gr.Column(elem_classes="leaderboard-container"):
        # gr.HTML(
        #    """
        # <div style="padding: 20px 20px 0 20px;">
        #    <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
        #        🏆 Gaia2 Leaderboard Rankings
        #    </h2>
        #    <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
        #        Click on column headers to sort • Use filters to narrow results
        #    </p>
        # </div>
        # """
        # )

        # Column names below must match the keys produced by cleanup();
        # "Number of runs" and "Submitter" exist in the data but are not
        # part of the default selection.
        leaderboard_table_val = Leaderboard(
            value=eval_dataframe_val,
            select_columns=SelectColumns(
                default_selection=[
                    "Model",
                    "Provider",
                    "Total score (%)",
                    "execution (%)",
                    "search (%)",
                    "ambiguity (%)",
                    "adaptability (%)",
                    "time (%)",
                    "noise (%)",
                    "A2A (%)",
                    "Submission date",
                ],
                cant_deselect=[
                    "Model",
                    "Provider",
                    "Total score (%)",
                    "Submission date",
                ],
            ),
            search_columns=["Model", "Provider", "Submitter"],
            filter_columns=[
                "Provider",
                ColumnFilter("Model", type="dropdown", label="🔍 Select Model"),
            ],
        )

    # Enhanced submission section
    with gr.Column(elem_classes="submission-section"):
        gr.HTML(
            """
        <h2 style="margin: 0 0 20px 0; font-weight: 700; font-size: 1.8em;">
            🚀 Submit Your Model
        </h2>
        """
        )

        with gr.Accordion("📋 How to submit", open=True):
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")

        # Single row: login button, two text inputs, submit and refresh buttons
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                gr.LoginButton(size="lg")
            with gr.Column(scale=2):
                organisation_tbox = gr.Textbox(
                    label="🏢 Organization",
                    placeholder="Enter your organization name",
                    container=True,
                )
            with gr.Column(scale=3):
                dataset_tbox = gr.Textbox(
                    label="📊 Hub Dataset Path",
                    placeholder="username/dataset-name",
                    container=True,
                )
            with gr.Column(scale=1):
                submit_button = gr.Button("Submit", variant="primary", size="lg")
            with gr.Column(scale=1):
                refresh_button = gr.Button(
                    "🔄 Refresh the display", variant="secondary", size="lg"
                )

        # Receives the message returned by add_new_eval
        submission_result = gr.Markdown()

    # Static link buttons: GitHub repository, paper, and demo Space
    with gr.Column():
        gr.HTML(
            """
            <div style="text-align: center; margin: 20px 0; display: flex; justify-content: center; gap: 50px; flex-wrap: wrap;">
                <!-- GitHub Button -->
                <a href="https://github.com/facebookresearch/meta-agents-research-environments" target="_blank"
                style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
                        background: linear-gradient(135deg, #24292e 0%, #000000 100%);
                        color: white; font-weight: 600; padding: 14px 28px;
                        border-radius: 10px; text-decoration: none; font-size: 16px;
                        box-shadow: 0 4px 12px rgba(0,0,0,0.3); transition: all 0.3s ease;
                        min-width: 220px; text-align: center;">
                    <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="white" viewBox="0 0 24 24">
                        <path d="M12 .5C5.7.5.5 5.7.5 12c0 5.1 3.3 9.4 7.9 10.9.6.1.8-.2.8-.6v-2.1c-3.2.7-3.9-1.4-3.9-1.4-.5-1.2-1.2-1.6-1.2-1.6-1-.7.1-.7.1-.7 1.1.1 1.7 1.1 1.7 1.1 1 .1.8 1.4 2.9 1.9.3-.8.6-1.3.6-1.3-2.6-.3-5.3-1.3-5.3-5.8 0-1.3.5-2.4 1.1-3.3 0-.3-.5-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.6 1.6.1 2.9.1 3.2.7.9 1.1 2 1.1 3.3 0 4.5-2.7 5.5-5.3 5.8.4.3.7 1 .7 2v3c0 .3.2.7.8.6A11.5 11.5 0 0 0 23.5 12C23.5 5.7 18.3.5 12 .5Z"/>
                    </svg>
                    Star ARE on GitHub ⭐
                </a>
                <!-- Blog Post -->
                <a href="https://ai.meta.com/research/publications/are-scaling-up-agent-environments-and-evaluations/" target="_blank"
                style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
                        background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
                        color: white; font-weight: 600; padding: 14px 28px;
                        border-radius: 10px; text-decoration: none; font-size: 16px;
                        box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
                        min-width: 220px; text-align: center;">
                    🧑‍🔬 Read the paper
                </a>
                <!-- Demo Button -->
                <a href="https://huggingface.co/spaces/meta-agents-research-environments/demo" target="_blank"
                style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
                        background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
                        color: white; font-weight: 600; padding: 14px 28px;
                        border-radius: 10px; text-decoration: none; font-size: 16px;
                        box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
                        min-width: 220px; text-align: center;">
                    🚀 Try the ARE Demo
                </a>
            </div>
            """
        )

    with gr.Column():
        with gr.Accordion("📙 Citation", open=True):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Only the two textboxes are listed as inputs; add_new_eval's `profile` and
    # `token` parameters are presumably injected by Gradio from the login
    # session based on their type hints — confirm against the Gradio OAuth docs.
    submit_button.click(
        add_new_eval,
        [organisation_tbox, dataset_tbox],
        submission_result,
    )

    # Reload the results dataset and replace the leaderboard table's value
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[leaderboard_table_val],
    )


# Schedule restart_space to run every 3600 seconds (one hour) on a background scheduler
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
# Start the Gradio server on all interfaces, port 7860, with debug mode enabled
demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)