Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import datetime | |
| import json | |
| import os | |
| from pathlib import Path | |
| import datasets | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| # InfoStrings | |
| from content import ( | |
| CITATION_BUTTON_LABEL, | |
| CITATION_BUTTON_TEXT, | |
| CONTACT_DATASET, | |
| INTRODUCTION_TEXT, | |
| LEADERBOARD_PATH, | |
| OWNER, | |
| RESULTS_DATASET, | |
| SCENARIO_LIST, | |
| SUBMISSION_TEXT, | |
| TITLE, | |
| ) | |
| from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns | |
| from huggingface_hub import create_repo, snapshot_download, upload_folder | |
| from utils import api, Experiment, format_log, model_hyperlink, TOKEN | |
# Load the submitter contact dataset once at import time.
# NOTE(review): `add_new_eval` reloads this dataset into a local variable of
# the same name, so this module-level copy does not appear to be read there.
contact_infos = datasets.load_dataset(
    CONTACT_DATASET, token=TOKEN, verification_mode=datasets.VerificationMode.NO_CHECKS
)  # download_mode="force_redownload"
def get_display_name(capability: str) -> str:
    """
    Map an internal capability identifier to its leaderboard column label.

    Args:
        capability: Internal capability name from the benchmark

    Returns:
        "noise" when the name contains "noise", "A2A" when it contains
        "agent2agent" or "a2a", and the name itself otherwise
    """
    # The noise test comes first, so a name matching both patterns is "noise".
    if "noise" in capability:
        return "noise"
    if any(marker in capability for marker in ("agent2agent", "a2a")):
        return "A2A"
    return capability
def cleanup(row) -> dict:
    """
    Transform one flattened evaluation row into the leaderboard display format.

    Args:
        row: Flattened evaluation result row, indexed by dotted column names
            such as "metadata.model" or
            "statistics.per_capability.<capability>.success_rate"

    Returns:
        Dictionary holding the display columns: "Model", "Provider",
        "Total score (%)", one "<display name> (%)" entry per known
        capability, "Number of runs", "Submitter" and "Submission date"
    """
    result = {}
    # Basic model information
    result["Model"] = row["metadata.model"]
    result["Provider"] = row["metadata.model_provider"]
    result["Total score (%)"] = round(row["statistics.global.macro_success_rate"], 1)
    # Define the order of capability columns for consistent display
    scenario_order = [
        "execution",
        "search",
        "ambiguity",
        "adaptability",
        "time",
        "mini_noise",
        "mini_agent2agent",
    ]
    # Only capabilities that are both listed above and present in
    # SCENARIO_LIST produce a column.
    for capability in scenario_order:
        if capability in SCENARIO_LIST:
            display_name = get_display_name(capability)
            # Extract score and standard error
            score = row[f"statistics.per_capability.{capability}.success_rate"]
            sem = row[f"statistics.per_capability.{capability}.success_rate_sem"]
            # Pad the score to width 4 and turn the padding into non-breaking
            # spaces so the decimal points stay aligned when rendered.
            score_str = f"{score:4.1f}".replace(" ", "\u00A0")
            sem_str = f"{sem:.1f}"  # No width formatting for SEM to avoid extra spaces
            # The plus-minus sign is written as an escape (like the
            # non-breaking space above) so it cannot be mangled by a
            # mis-decoded source file.
            result[f"{display_name} (%)"] = f"{score_str} \u00B1 {sem_str}"
    # Add metadata fields
    # Runs per scenario; falls back to 0 when there are no scenarios to
    # avoid a division by zero.
    result["Number of runs"] = (
        row["statistics.global.total_runs"] / row["statistics.global.total_scenarios"]
        if row["statistics.global.total_scenarios"] != 0
        else 0
    )
    result["Submitter"] = row["metadata.organisation"]
    # Keep only the first 10 characters of the timestamp
    # (presumably the YYYY-MM-DD date part - confirm the stored format).
    result["Submission date"] = row["metadata.timestamp"][:10]
    return result
def get_dataframe_from_results() -> pd.DataFrame:
    """
    Build the leaderboard DataFrame from the results dataset.

    Loads RESULTS_DATASET, flattens its "train" split, converts every row
    with `cleanup`, and drops the raw columns.

    Returns:
        Pandas DataFrame of display columns sorted by "Total score (%)"
        in descending order, or an empty DataFrame when the dataset is
        empty, lacks the split, or the split has no rows
    """
    split = "train"

    # An empty dataset repository is treated like a dataset without splits.
    try:
        eval_results = datasets.load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            verification_mode=datasets.VerificationMode.NO_CHECKS,
        )
    except datasets.data_files.EmptyDatasetError:
        eval_results = datasets.DatasetDict()

    has_rows = (
        bool(eval_results) and split in eval_results and len(eval_results[split]) > 0
    )
    if not has_rows:
        return pd.DataFrame([])

    flattened = eval_results[split].flatten()

    # Raw columns needed by `cleanup`: metadata, global statistics, then the
    # success rate and its SEM for every capability in SCENARIO_LIST.
    raw_columns = [
        "metadata.model",
        "metadata.model_provider",
        "metadata.organisation",
        "metadata.timestamp",
        "metadata.url",
        "statistics.global.macro_success_rate",
        "statistics.global.total_runs",
        "statistics.global.total_scenarios",
    ]
    raw_columns += [
        f"statistics.per_capability.{capability}.{stat}"
        for capability in SCENARIO_LIST
        for stat in ("success_rate", "success_rate_sem")
    ]

    # Select the raw columns, add the display columns, then drop the raw ones.
    display_rows = (
        flattened.select_columns(raw_columns)
        .map(cleanup, batched=False)
        .remove_columns(raw_columns)
    )

    # Highest total score first.
    return pd.DataFrame(display_rows).sort_values(
        by=["Total score (%)"], ascending=False
    )
# Leaderboard data computed once at import time.
# At the moment there is only one result set to display.
eval_dataframe_val = get_dataframe_from_results()
def restart_space() -> None:
    """Ask the Hub API to restart the Space identified by LEADERBOARD_PATH."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
def add_new_eval(
    organisation: str,
    path_to_repository: str,
    profile: gr.OAuthProfile,
    token: gr.OAuthToken,
):
    """
    Validate a submission, record its results, and archive a private copy.

    Args:
        organisation: Organisation name entered by the submitter
        path_to_repository: Hub dataset repository holding the submission
        profile: OAuth profile of the logged-in user
        token: OAuth token of the logged-in user, used to download the
            submitted dataset

    Returns:
        The success message, passed through `format_log`

    Raises:
        Exception: If the account is less than 60 days old, the user already
            submitted today, or the exact same results were already submitted
        ValueError: If a scenario from SCENARIO_LIST cannot be loaded from
            the submitted dataset
    """
    # ---- USER CHECKS ----
    # Reject accounts created less than 60 days ago.
    user_data = requests.get(
        f"https://huggingface.co/api/users/{profile.username}/overview",
        timeout=30,  # do not let a stalled request hang the submission handler
    )
    creation_date = json.loads(user_data.content)["createdAt"]
    # The trailing "Z" marks the timestamp as UTC, so it is compared against
    # the current UTC time rather than the server's naive local time.
    created_at = datetime.datetime.strptime(
        creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
    ).replace(tzinfo=datetime.timezone.utc)
    account_age = datetime.datetime.now(datetime.timezone.utc) - created_at
    if account_age < datetime.timedelta(days=60):
        raise Exception("This account is not authorized to submit on Gaia2.")

    # Can't submit several times per day
    contact_infos = datasets.load_dataset(
        CONTACT_DATASET,
        token=TOKEN,
        verification_mode=datasets.VerificationMode.NO_CHECKS,
    )
    today = datetime.datetime.today().strftime("%Y-%m-%d")
    # Scan every split: new records are appended to the "test" split at the
    # end of this function, so looking only at "train" would never see them.
    user_submission_dates = sorted(
        row["date"]
        for split_rows in contact_infos.values()
        for row in split_rows
        if row["username"] == profile.username
    )
    if user_submission_dates and user_submission_dates[-1] == today:
        raise Exception("You already submitted once today, please try again tomorrow.")

    # ---- EXPERIMENT MANAGEMENT ----
    # Download locally with HF hub, using the submitter's own token
    snapshot_path = snapshot_download(
        repo_id=path_to_repository, token=token.token, repo_type="dataset"
    )

    # Test completeness with datasets: every scenario must be loadable
    try:
        for scenario in SCENARIO_LIST:
            # Loading what the user provided
            datasets.load_dataset(
                snapshot_path,
                scenario,
                split="test",
                verification_mode=datasets.VerificationMode.NO_CHECKS,
            )
    except Exception as e:
        print(e)
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError(
            f"We cannot load the scenario {scenario} for your dataset ({path_to_repository}). Please make sure the dataset is accessible and all subsets are there."
        ) from e

    with open(Path(snapshot_path, "computed_stats.json"), encoding="utf-8") as f:
        results = json.load(f)
    model = results["metadata"]["model"]
    results["metadata"]["organisation"] = organisation
    results["metadata"]["url"] = path_to_repository

    # The results dataset is pushed as private, so it is read and written
    # with the Space token, as in `get_dataframe_from_results`.
    try:
        ds = datasets.load_dataset(RESULTS_DATASET, split="train", token=TOKEN)
    except datasets.data_files.EmptyDatasetError:
        ds = datasets.Dataset.from_dict({})
    if results in ds:
        raise Exception("This precise model and results file was already submitted")
    ds = ds.add_item(results)
    ds.push_to_hub(RESULTS_DATASET, split="train", private=True, token=TOKEN)

    experiment = Experiment(path_to_repository, organisation, model)
    private_copy_id = f"{OWNER}/{str(experiment)}"

    # Save copy to hub
    create_repo(
        repo_id=private_copy_id,
        repo_type="dataset",
        token=TOKEN,
        private=True,
    )
    upload_folder(
        folder_path=snapshot_path,
        repo_id=private_copy_id,
        repo_type="dataset",
        token=TOKEN,
    )
    print(f"Adding new eval: {str(experiment)}")

    # SAVE ALL INFO
    contact_info = {
        "model": experiment.model,
        "path_to_hub": experiment.path_to_hub,
        "path_to_hub_private_copy": private_copy_id,
        "organisation": experiment.organisation,
        "date": experiment.cur_date,
        "username": profile.username,
        # The profile may not expose an email; store None in that case.
        "mail": getattr(profile, "email", None),
    }
    contact_infos["test"] = contact_infos["test"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
    return format_log(
        f"Model {model} submitted by {organisation} successfully.\nPlease wait a couple minutes and refresh the leaderboard to see your score displayed."
    )
def refresh() -> pd.DataFrame:
    """Reload the results and return a freshly built leaderboard DataFrame."""
    return get_dataframe_from_results()
# Custom CSS for sleek styling of the page: container, markdown panels,
# buttons, text boxes, accordions, leaderboard and submission form.
# NOTE(review): the only visible use, `css=custom_css` in the gr.Blocks(...)
# call, is commented out, so this stylesheet appears to be unused - confirm.
custom_css = """
<style>
/* Global styling */
.gradio-container {
max-width: 1400px !important;
margin: auto;
padding: 20px;
background: linear-gradient(135deg, #f8fbff 0%, #e3f2fd 100%);
min-height: auto !important; /* override HF default */
padding-bottom: 0 !important; /* remove extra bottom padding */
}
html, body, #root {
margin: 0;
padding: 0;
height: auto !important; /* don't lock to viewport height */
min-height: 100%;
overflow-x: hidden !important;
overflow-y: auto !important; /* ensure vertical scroll is possible */
box-sizing: border-box;
}
/* Markdown text styling */
.markdown-text {
background: white;
padding: 25px;
border-radius: 12px;
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
margin: 20px 0;
border-left: 4px solid #0081FB;
font-size: 16px;
line-height: 1.6;
}
/* Button styling */
.gr-button {
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%) !important;
border: none !important;
border-radius: 8px !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
transition: all 0.3s ease !important;
box-shadow: 0 4px 15px rgba(0, 129, 251, 0.3) !important;
}
.gr-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(0, 129, 251, 0.4) !important;
}
/* Input fields styling */
.gr-textbox {
border-radius: 8px !important;
border: 2px solid #e1e5e9 !important;
background: white !important;
transition: all 0.3s ease !important;
}
.gr-textbox:focus {
border-color: #667eea !important;
box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
}
/* Accordion styling */
.gr-accordion {
background: white !important;
border-radius: 12px !important;
box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
border: none !important;
margin: 15px 0 !important;
}
/* Leaderboard styling */
.leaderboard-container {
background: white !important;
border-radius: 15px !important;
box-shadow: 0 8px 32px rgba(0,0,0,0.1) !important;
overflow: hidden !important;
margin: 25px 0 !important;
border: none !important;
}
/* Remove any default Gradio gray backgrounds */
.gradio-container .gr-column,
.gradio-container .gr-row {
background: transparent !important;
}
/* Ensure leaderboard table has clean white background */
.leaderboard-container table,
.leaderboard-container .gr-table {
background: white !important;
border: none !important;
}
/* Submission form styling */
.submission-section {
background: white;
padding: 30px;
border-radius: 15px;
box-shadow: 0 6px 25px rgba(0,0,0,0.08);
margin: 25px 0;
}
</style>
"""
# Root Blocks container of the app, using the Soft theme with a blue primary
# hue and a Roboto / Arial / sans-serif font stack.
# NOTE: the `css=custom_css` argument is commented out, so the custom
# stylesheet is not passed to Blocks here.
demo = gr.Blocks(
    # css=custom_css,
    theme=gr.themes.Soft(
        font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"], primary_hue="blue"
    ),
)
# Page layout and event wiring. Components are created in display order:
# title, "About" panel, leaderboard, submission form, external links,
# citation box.
# NOTE(review): several labels below contain characters such as "π" and
# "β’" that look like mis-decoded emoji / bullet characters - confirm the
# file encoding against the original source before editing these strings.
with demo:
    gr.HTML(TITLE)
    with gr.Accordion("About", open=True):
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Enhanced leaderboard with custom styling
    with gr.Column(elem_classes="leaderboard-container"):
        # Disabled heading block, kept for reference:
        # gr.HTML(
        # """
        # <div style="padding: 20px 20px 0 20px;">
        # <h2 style="margin: 0; font-weight: 700; font-size: 1.8em;">
        # π Gaia2 Leaderboard Rankings
        # </h2>
        # <p style="margin: 10px 0 20px 0; color: #666; font-size: 16px;">
        # Click on column headers to sort β’ Use filters to narrow results
        # </p>
        # </div>
        # """
        # )
        # Leaderboard table, initialised with the DataFrame built at import
        # time. The column names listed here must match the keys produced
        # by `cleanup`.
        leaderboard_table_val = Leaderboard(
            value=eval_dataframe_val,
            select_columns=SelectColumns(
                default_selection=[
                    "Model",
                    "Provider",
                    "Total score (%)",
                    "execution (%)",
                    "search (%)",
                    "ambiguity (%)",
                    "adaptability (%)",
                    "time (%)",
                    "noise (%)",
                    "A2A (%)",
                    "Submission date",
                ],
                cant_deselect=[
                    "Model",
                    "Provider",
                    "Total score (%)",
                    "Submission date",
                ],
            ),
            search_columns=["Model", "Provider", "Submitter"],
            filter_columns=[
                "Provider",
                ColumnFilter("Model", type="dropdown", label="π Select Model"),
            ],
        )
    # Enhanced submission section
    with gr.Column(elem_classes="submission-section"):
        gr.HTML(
            """
<h2 style="margin: 0 0 20px 0; font-weight: 700; font-size: 1.8em;">
π Submit Your Model
</h2>
"""
        )
        with gr.Accordion("π How to submit", open=True):
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        # One row holding the login button, the two text inputs and the
        # submit / refresh buttons.
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                gr.LoginButton(size="lg")
            with gr.Column(scale=2):
                organisation_tbox = gr.Textbox(
                    label="π’ Organization",
                    placeholder="Enter your organization name",
                    container=True,
                )
            with gr.Column(scale=3):
                dataset_tbox = gr.Textbox(
                    label="π Hub Dataset Path",
                    placeholder="username/dataset-name",
                    container=True,
                )
            with gr.Column(scale=1):
                submit_button = gr.Button("Submit", variant="primary", size="lg")
            with gr.Column(scale=1):
                refresh_button = gr.Button(
                    "π Refresh the display", variant="secondary", size="lg"
                )
        # Receives the message returned by `add_new_eval`.
        submission_result = gr.Markdown()
    # External links: GitHub repository, paper, and demo Space.
    with gr.Column():
        gr.HTML(
            """
<div style="text-align: center; margin: 20px 0; display: flex; justify-content: center; gap: 50px; flex-wrap: wrap;">
<!-- GitHub Button -->
<a href="https://github.com/facebookresearch/meta-agents-research-environments" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #24292e 0%, #000000 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.3); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="white" viewBox="0 0 24 24">
<path d="M12 .5C5.7.5.5 5.7.5 12c0 5.1 3.3 9.4 7.9 10.9.6.1.8-.2.8-.6v-2.1c-3.2.7-3.9-1.4-3.9-1.4-.5-1.2-1.2-1.6-1.2-1.6-1-.7.1-.7.1-.7 1.1.1 1.7 1.1 1.7 1.1 1 .1.8 1.4 2.9 1.9.3-.8.6-1.3.6-1.3-2.6-.3-5.3-1.3-5.3-5.8 0-1.3.5-2.4 1.1-3.3 0-.3-.5-1.6.1-3.2 0 0 1-.3 3.3 1.2a11.5 11.5 0 0 1 6 0c2.3-1.5 3.3-1.2 3.3-1.2.6 1.6.1 2.9.1 3.2.7.9 1.1 2 1.1 3.3 0 4.5-2.7 5.5-5.3 5.8.4.3.7 1 .7 2v3c0 .3.2.7.8.6A11.5 11.5 0 0 0 23.5 12C23.5 5.7 18.3.5 12 .5Z"/>
</svg>
Star ARE on GitHub β
</a>
<!-- Blog Post -->
<a href="https://ai.meta.com/research/publications/are-scaling-up-agent-environments-and-evaluations/" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
π§βπ¬ Read the paper
</a>
<!-- Demo Button -->
<a href="https://huggingface.co/spaces/meta-agents-research-environments/demo" target="_blank"
style="display: inline-flex; align-items: center; justify-content: center; gap: 10px;
background: linear-gradient(135deg, #0081FB 0%, #42A5F5 100%);
color: white; font-weight: 600; padding: 14px 28px;
border-radius: 10px; text-decoration: none; font-size: 16px;
box-shadow: 0 4px 12px rgba(0,0,0,0.25); transition: all 0.3s ease;
min-width: 220px; text-align: center;">
π Try the ARE Demo
</a>
</div>
"""
        )
    with gr.Column():
        with gr.Accordion("π Citation", open=True):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )
    # Submit: only the two text boxes are listed as inputs. The `profile`
    # and `token` parameters of `add_new_eval` are presumably injected by
    # Gradio from their OAuth type annotations - confirm against the Gradio
    # OAuth documentation.
    submit_button.click(
        add_new_eval,
        [organisation_tbox, dataset_tbox],
        submission_result,
    )
    # Refresh: rebuild the DataFrame and push it into the leaderboard table.
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[leaderboard_table_val],
    )
# Schedule `restart_space` to run every 3600 seconds (one hour) on a
# background scheduler.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
# Start the Gradio server on all interfaces, port 7860, with debug enabled.
demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)