import datetime
import json
from pathlib import Path

import datasets
import gradio as gr
import pandas as pd
import requests
from apscheduler.schedulers.background import BackgroundScheduler
# InfoStrings
from content import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
CONTACT_DATASET,
INTRODUCTION_TEXT,
LEADERBOARD_PATH,
OWNER,
RESULTS_DATASET,
SCENARIO_LIST,
SUBMISSION_TEXT,
TITLE,
)
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from huggingface_hub import create_repo, snapshot_download, upload_folder
from utils import api, Experiment, format_log, TOKEN
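
# The contact dataset is loaded once at startup; add_new_eval reloads a fresh
# copy before each submission check, so this global copy may be slightly stale.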
contact_infos = datasets.load_dataset(
CONTACT_DATASET, token=TOKEN, verification_mode=datasets.VerificationMode.NO_CHECKS
) # download_mode="force_redownload"


def get_display_name(capability: str) -> str:
"""
Convert internal capability names to user-friendly display names.
Args:
capability: Internal capability name from the benchmark
Returns:
User-friendly display name for the leaderboard
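
    Example:
        >>> get_display_name("mini_noise")
        'noise'
        >>> get_display_name("mini_agent2agent")
        'A2A'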
"""
if "noise" in capability:
return "noise"
elif "agent2agent" in capability or "a2a" in capability:
return "A2A"
else:
return capability


def cleanup(row) -> dict:
"""
Transform raw evaluation data into a clean format for the leaderboard display.
Args:
row: Raw evaluation result row from the dataset
Returns:
Dictionary with cleaned and formatted data for leaderboard display
"""
result = {}
# Basic model information
result["Model"] = row["metadata.model"]
result["Provider"] = row["metadata.model_provider"]
result["Total score (%)"] = round(row["statistics.global.macro_success_rate"], 1)
# Define the order of capability columns for consistent display
scenario_order = [
"execution",
"search",
"ambiguity",
"adaptability",
"time",
"mini_noise",
"mini_agent2agent",
]
# Process each capability score with aligned formatting
for capability in scenario_order:
if capability in SCENARIO_LIST:
display_name = get_display_name(capability)
# Extract score and standard error
score = row[f"statistics.per_capability.{capability}.success_rate"]
sem = row[f"statistics.per_capability.{capability}.success_rate_sem"]
# Format with decimal alignment using non-breaking spaces
score_str = f"{score:4.1f}".replace(" ", "\u00A0")
sem_str = f"{sem:.1f}" # No width formatting for SEM to avoid extra spaces
result[f"{display_name} (%)"] = f"{score_str} Β± {sem_str}"
# Add metadata fields
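    # "Number of runs" is the average number of runs per scenario,
    # guarded against division by zero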
result["Number of runs"] = (
row["statistics.global.total_runs"] / row["statistics.global.total_scenarios"]
if row["statistics.global.total_scenarios"] != 0
else 0
)
result["Submitter"] = row["metadata.organisation"]
result["Submission date"] = row["metadata.timestamp"][:10]
return result


def get_dataframe_from_results() -> pd.DataFrame:
"""
Load and process evaluation results from the dataset to create a leaderboard DataFrame.
Retrieves raw evaluation data, processes it through the cleanup function,
and returns a sorted DataFrame ready for leaderboard display.
Returns:
Pandas DataFrame with processed leaderboard data, sorted by total score
Returns empty DataFrame if no data is available
"""
split = "train"
# Load evaluation results dataset
try:
eval_results = datasets.load_dataset(
RESULTS_DATASET,
token=TOKEN,
verification_mode=datasets.VerificationMode.NO_CHECKS,
)
except datasets.data_files.EmptyDatasetError:
eval_results = datasets.DatasetDict()
# Return empty DataFrame if no data available
if not eval_results or split not in eval_results or len(eval_results[split]) == 0:
return pd.DataFrame([])
results = eval_results[split]
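    # flatten() expands nested fields into dot-separated column names,
    # e.g. "metadata" -> "metadata.model"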
local_df = results.flatten()
# Define columns to extract from the raw data
metadata_columns = [
"metadata.model",
"metadata.model_provider",
"metadata.organisation",
"metadata.timestamp",
"metadata.url",
]
global_stats_columns = [
"statistics.global.macro_success_rate",
"statistics.global.total_runs",
"statistics.global.total_scenarios",
]
# Add per-capability statistics columns
capability_columns = []
for capability in SCENARIO_LIST:
capability_columns.extend(
[
f"statistics.per_capability.{capability}.success_rate",
f"statistics.per_capability.{capability}.success_rate_sem",
]
)
# Combine all required columns
columns = metadata_columns + global_stats_columns + capability_columns
# Process the data: select columns, clean up, and remove original columns
local_df = local_df.select_columns(columns)
mapped_df = local_df.map(cleanup, batched=False)
mapped_df = mapped_df.remove_columns(columns)
# Convert to pandas DataFrame and sort by total score (highest first)
df = pd.DataFrame(mapped_df)
df = df.sort_values(by=["Total score (%)"], ascending=False)
return df


# For now there is only a single evaluation result set
eval_dataframe_val = get_dataframe_from_results()


def restart_space():
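    """Restart the Space via the Hub API so the leaderboard reloads fresh results."""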
api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def add_new_eval(
organisation: str,
path_to_repository: str,
profile: gr.OAuthProfile,
token: gr.OAuthToken,
):
# ---- USER CHECKS ----
    # Reject Hugging Face accounts created less than 2 months ago
user_data = requests.get(
f"https://huggingface.co/api/users/{profile.username}/overview"
)
    creation_date = user_data.json()["createdAt"]
if datetime.datetime.now() - datetime.datetime.strptime(
creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
) < datetime.timedelta(days=60):
raise Exception("This account is not authorized to submit on Gaia2.")
    # Enforce at most one submission per user per day
contact_infos = datasets.load_dataset(
CONTACT_DATASET,
token=TOKEN,
verification_mode=datasets.VerificationMode.NO_CHECKS,
)
    user_submission_dates = sorted(
        row["date"]
        for row in contact_infos["train"]
        if row["username"] == profile.username
    )
    today = datetime.datetime.today().strftime("%Y-%m-%d")
    if user_submission_dates and user_submission_dates[-1] == today:
        raise Exception("You already submitted once today, please try again tomorrow.")
# ---- EXPERIMENT MANAGEMENT ----
# Download locally with HF hub
snapshot_path = snapshot_download(
repo_id=path_to_repository, token=token.token, repo_type="dataset"
)
    # Check completeness: every scenario subset must load with datasets
try:
for scenario in SCENARIO_LIST:
# Loading what the user provided
datasets.load_dataset(
snapshot_path,
scenario,
split="test",
verification_mode=datasets.VerificationMode.NO_CHECKS,
)
    except Exception as e:
        print(e)
        raise ValueError(
            f"We cannot load the scenario {scenario} for your dataset ({path_to_repository}). "
            "Please make sure the dataset is accessible and all subsets are there."
        ) from e
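    # Aggregated scores are expected in computed_stats.json at the root of the
    # submitted dataset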
with open(Path(snapshot_path, "computed_stats.json")) as f:
results = json.load(f)
model = results["metadata"]["model"]
results["metadata"]["organisation"] = organisation
results["metadata"]["url"] = path_to_repository
    try:
        # The results dataset is pushed as private below, so pass the token to read it back
        ds = datasets.load_dataset(RESULTS_DATASET, split="train", token=TOKEN)
    except datasets.data_files.EmptyDatasetError:
        ds = datasets.Dataset.from_dict({})
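    # Membership test on the Dataset compares the new results row against every
    # existing row, rejecting exact duplicate submissions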
if results in ds:
raise Exception("This precise model and results file was already submitted")
    ds = ds.add_item(results)
    ds.push_to_hub(RESULTS_DATASET, split="train", private=True, token=TOKEN)
experiment = Experiment(path_to_repository, organisation, model)
    # Save a private copy of the submission to the hub
    private_copy_repo = f"{OWNER}/{experiment}"
    create_repo(
        repo_id=private_copy_repo,
        repo_type="dataset",
        token=TOKEN,
        private=True,
    )
    upload_folder(
        folder_path=snapshot_path,
        repo_id=private_copy_repo,
        repo_type="dataset",
        token=TOKEN,
    )
    print(f"Adding new eval: {experiment}")
# SAVE ALL INFO
contact_info = {
"model": experiment.model,
"path_to_hub": experiment.path_to_hub,
"path_to_hub_private_copy": f"{OWNER}/{str(experiment)}",
"organisation": experiment.organisation,
"date": experiment.cur_date,
"username": profile.username,
"mail": getattr(profile, "email", None),
}
contact_infos["test"] = contact_infos["test"].add_item(contact_info)
contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
    return format_log(
        f"Model {model} submitted by {organisation} successfully.\nPlease wait a couple of minutes and refresh the leaderboard to see your score displayed."
    )


def refresh():
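    """Re-query the results dataset and rebuild the leaderboard DataFrame."""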
return get_dataframe_from_results()


# Custom CSS for sleek styling. Note that Gradio's css= parameter expects raw
# CSS, so the string must not be wrapped in <style> tags.
custom_css = """
/* Global styling */
.gradio-container {
max-width: 1400px !important;
margin: auto;
padding: 20px;
background: linear-gradient(135deg, #f8fbff 0%, #e3f2fd 100%);
min-height: auto !important; /* override HF default */
padding-bottom: 0 !important; /* remove extra bottom padding */
}
html, body, #root {
margin: 0;
padding: 0;
height: auto !important; /* don't lock to viewport height */
min-height: 100%;
overflow-x: hidden !important;
overflow-y: auto !important; /* ensure vertical scroll is possible */
box-sizing: border-box;
}
/* Markdown text styling */
.markdown-text {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
margin: 20px 0;
border-left: 4px solid #0081FB;
font-size: 16px;
line-height: 1.6;
}
/* Button styling */
.gr-button {
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%) !important;
border: none !important;
border-radius: 8px !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
transition: all 0.3s ease !important;
box-shadow: 0 4px 15px rgba(0, 129, 251, 0.3) !important;
}
.gr-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(0, 129, 251, 0.4) !important;
}
/* Input fields styling */
.gr-textbox {
border-radius: 8px !important;
border: 2px solid #e1e5e9 !important;
background: white !important;
transition: all 0.3s ease !important;
}
.gr-textbox:focus {
border-color: #667eea !important;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
}
/* Accordion styling */
.gr-accordion {
background: white !important;
border-radius: 12px !important;
box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
border: none !important;
margin: 15px 0 !important;
}
/* Leaderboard styling */
.leaderboard-container {
background: white !important;
border-radius: 15px !important;
box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
overflow: hidden !important;
margin: 25px 0 !important;
border: none !important;
}
/* Remove any default Gradio gray backgrounds */
.gradio-container .gr-column,
.gradio-container .gr-row {
background: transparent !important;
}
/* Ensure leaderboard table has clean white background */
.leaderboard-container table,
.leaderboard-container .gr-table {
background: white !important;
border: none !important;
}
/* Submission form styling */
.submission-section {
background: white;
padding: 30px;
border-radius: 15px;
box-shadow: 0 6px 25px rgba(0,0,0,0.08);
margin: 25px 0;
}
"""


demo = gr.Blocks(
# css=custom_css,
theme=gr.themes.Soft(
font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
),
)
with demo:
gr.HTML(TITLE)
with gr.Accordion("About", open=True):
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
# Enhanced leaderboard with custom styling
with gr.Column(elem_classes="leaderboard-container"):
# gr.HTML(
# """
# <div style="padding: 20px 20px 0 20px;">
# <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
# πŸ† Gaia2 Leaderboard Rankings
# </h2>
# <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
# Click on column headers to sort β€’ Use filters to narrow results
# </p>
# </div>
# """
# )
leaderboard_table_val = Leaderboard(
value=eval_dataframe_val,
select_columns=SelectColumns(
default_selection=[
"Model",
"Provider",
"Total score (%)",
"execution (%)",
"search (%)",
"ambiguity (%)",
"adaptability (%)",
"time (%)",
"noise (%)",
"A2A (%)",
"Submission date",
],
cant_deselect=[
"Model",
"Provider",
"Total score (%)",
"Submission date",
],
),
search_columns=["Model", "Provider", "Submitter"],
filter_columns=[
"Provider",
ColumnFilter("Model", type="dropdown", label="πŸ” Select Model"),
],
)
# Enhanced submission section
with gr.Column(elem_classes="submission-section"):
gr.HTML(
"""
<h2 style="margin: 0 0 20px 0; font-weight: 700; font-size: 1.8em;">
πŸš€ Submit Your Model
</h2>
"""
)
with gr.Accordion("πŸ“‹ How to submit", open=True):
gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
with gr.Row(equal_height=True):
with gr.Column(scale=1):
gr.LoginButton(size="lg")
with gr.Column(scale=2):
organisation_tbox = gr.Textbox(
label="🏒 Organization",
placeholder="Enter your organization name",
container=True,
)
with gr.Column(scale=3):
dataset_tbox = gr.Textbox(
label="πŸ“Š Hub Dataset Path",
placeholder="username/dataset-name",
container=True,
)
with gr.Column(scale=1):
submit_button = gr.Button("Submit", variant="primary", size="lg")
with gr.Column(scale=1):
refresh_button = gr.Button(
"πŸ”„ Refresh the display", variant="secondary", size="lg"
)
submission_result = gr.Markdown()
with gr.Column():
gr.HTML(
"""
<div style="text-align: center; margin: 20px 0; display: flex; justify-content: center; gap: 50px; flex-wrap: wrap;">
<!-- GitHub Button -->
<a href="https://github.com/facebookresearch/meta-agents-research-environments" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #24292e 0%, #000000 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.3); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="white" viewBox="0 0 24 24">
<path d="M12 .5C5.7.5.5 5.7.5 12c0 5.1 3.3 9.4 7.9 10.9.6.1.8-.2.8-.6v-2.1c-3.2.7-3.9-1.4-3.9-1.4-.5-1.2-1.2-1.6-1.2-1.6-1-.7.1-.7.1-.7 1.1.1 1.7 1.1 1.7 1.1 1 .1.8 1.4 2.9 1.9.3-.8.6-1.3.6-1.3-2.6-.3-5.3-1.3-5.3-5.8 0-1.3.5-2.4 1.1-3.3 0-.3-.5-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.6 1.6.1 2.9.1 3.2.7.9 1.1 2 1.1 3.3 0 4.5-2.7 5.5-5.3 5.8.4.3.7 1 .7 2v3c0 .3.2.7.8.6A11.5 11.5 0 0 0 23.5 12C23.5 5.7 18.3.5 12 .5Z"/>
</svg>
Star ARE on GitHub ⭐
</a>
<!-- Blog Post -->
<a href="https://ai.meta.com/research/publications/are-scaling-up-agent-environments-and-evaluations/" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
πŸ§‘β€πŸ”¬ Read the paper
</a>
<!-- Demo Button -->
<a href="https://huggingface.co/spaces/meta-agents-research-environments/demo" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
πŸš€ Try the ARE Demo
</a>
</div>
"""
)
with gr.Column():
with gr.Accordion("πŸ“™ Citation", open=True):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
elem_id="citation-button",
show_copy_button=True,
)
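
    # The gr.OAuthProfile and gr.OAuthToken arguments of add_new_eval are
    # injected automatically by Gradio from the logged-in session (via the
    # LoginButton above), so only the two textboxes are passed as inputs here.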
submit_button.click(
add_new_eval,
[organisation_tbox, dataset_tbox],
submission_result,
)
refresh_button.click(
refresh,
inputs=[],
outputs=[leaderboard_table_val],
)
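
# Restart the Space every hour so the leaderboard picks up newly pushed results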
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)