File size: 21,084 Bytes
8939028
 
 
1971175
8939028
 
d60c9d9
21516b7
8939028
1971175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18f8616
1971175
 
 
 
 
53e8dbb
 
 
 
 
 
 
 
 
1971175
 
 
21516b7
1971175
 
 
 
 
8939028
 
 
1971175
2eee49c
1971175
 
18f8616
1971175
 
 
 
 
 
 
d60c9d9
 
1971175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21516b7
 
 
1971175
 
 
 
 
 
 
 
 
ad0be25
8939028
ad0be25
1971175
 
8939028
21516b7
 
 
 
 
 
8939028
 
21516b7
 
 
 
 
 
8939028
 
 
1971175
 
 
 
 
 
 
 
 
 
 
 
 
8939028
1971175
d60c9d9
 
 
 
 
 
 
 
 
 
1971175
8939028
21516b7
 
 
 
 
 
1971175
8939028
1971175
 
 
 
 
 
 
 
 
8939028
21516b7
 
 
 
 
 
1971175
8939028
 
1971175
21516b7
 
 
 
 
 
1971175
8939028
1971175
 
 
 
 
21516b7
 
 
 
 
 
1971175
 
 
 
 
8939028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21516b7
 
 
 
 
 
8939028
21516b7
 
 
 
 
 
1971175
 
 
8939028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21516b7
 
 
 
 
 
8939028
21516b7
 
 
 
 
 
8939028
 
 
 
 
 
d60c9d9
 
8939028
 
1971175
 
 
 
 
8939028
 
 
21516b7
 
 
 
 
 
 
 
 
 
 
 
 
 
1971175
 
d60c9d9
 
 
 
 
 
1971175
 
50fb0f5
 
20c57a4
0d80b6f
50fb0f5
 
 
 
 
 
 
 
 
 
 
 
9832707
50fb0f5
 
 
 
 
 
 
1971175
9b2d66c
 
 
 
 
 
 
1971175
11de2f8
20c57a4
 
8939028
20c57a4
 
 
 
 
ad0be25
20c57a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d60c9d9
 
 
 
20c57a4
 
 
 
 
 
 
 
 
 
 
 
1971175
20c57a4
2eee49c
1971175
20c57a4
21516b7
 
 
 
 
 
 
 
 
 
 
 
 
1971175
 
21516b7
 
 
 
1971175
 
 
 
 
 
 
 
 
 
ad0be25
20c57a4
 
1971175
21516b7
1971175
8bd1c00
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
import logging
import sys

import matplotlib
from agenteval.cli import SUBMISSION_METADATA_FILENAME
from agenteval.models import SubmissionMetadata
from datasets.exceptions import DataFilesNotFoundError
from gradio_modal import Modal

matplotlib.use('Agg')

import os
import shutil
import tarfile
import tempfile
from datetime import datetime, timedelta, timezone
from email.utils import parseaddr
from pathlib import Path

import gradio as gr
import requests
from agenteval import (
    process_eval_logs,
    upload_folder_to_hf,
)
from agenteval.leaderboard.models import LeaderboardSubmission
from agenteval.leaderboard.upload import sanitize_path_component
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
from datasets.data_files import EmptyDatasetError
from huggingface_hub import HfApi

from config import (
    CONFIG_NAME,
    CONTACT_DATASET,
    EXTRACTED_DATA_DIR,
    IS_INTERNAL,
    LOCAL_DEBUG,
    RESULTS_DATASET,
    SUBMISSION_DATASET,
)
from content import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    SUBMISSION_CONFIRMATION,
    format_error,
    format_log,
    format_warning,
)

# Module-level setup: logger, shared HF API client, upload size cap, workspace dir.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

api = HfApi()  # Shared Hugging Face Hub client used for all uploads in this module.
MAX_UPLOAD_BYTES = 5e9  # Upper bound on total submission size (~5 GB); enforced in checked_upload_folder.
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)  # Ensure the tarball-extraction workspace exists at import time.

# --- Submission Logic (largely unchanged from original, ensure LeaderboardSubmission and other deps are fine) ---
def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV has one
    """Load a dataset via ``load_dataset``, returning an empty DatasetDict on failure.

    Args/kwargs are forwarded verbatim to ``datasets.load_dataset``.

    Returns:
        The loaded DatasetDict, or an empty DatasetDict if the dataset is
        missing, empty, or ill-formed (instead of raising).
    """
    try:
        return load_dataset(*args, **kwargs)
    # All three failure modes get the same fallback, so handle them together:
    # EmptyDatasetError / DataFilesNotFoundError for missing data, ValueError
    # for empty or ill-formed datasets.
    except (EmptyDatasetError, ValueError, DataFilesNotFoundError):
        return DatasetDict()

def checked_upload_folder(
        api_hf: HfApi, # Renamed to avoid conflict with global api
        folder_path: str,
        repo_id: str,
        config_name_ul: str, # Renamed
        split_ul: str, # Renamed
        submission_name_ul: str, # Renamed
) -> str:
    """Upload *folder_path* to the Hub after enforcing the size cap.

    Walks the folder, summing file sizes, and fails fast as soon as the
    running total exceeds MAX_UPLOAD_BYTES.

    Returns:
        The value returned by ``upload_folder_to_hf`` (the upload location).

    Raises:
        ValueError: if the folder's total size exceeds MAX_UPLOAD_BYTES.
    """
    total = 0
    for root, _, files in os.walk(folder_path):
        for f_ul in files: # Renamed
            total += os.path.getsize(os.path.join(root, f_ul))
            if total > MAX_UPLOAD_BYTES:
                # int() needed: MAX_UPLOAD_BYTES is 5e9 (a float), so the old
                # message rendered as "4768.0 MB"; show a clean integer instead.
                raise ValueError(
                    f"Upload too large: exceeds {int(MAX_UPLOAD_BYTES // (1024**2))} MB limit."
                )
    return upload_folder_to_hf(
        api=api_hf, # Use renamed parameter
        folder_path=folder_path,
        repo_id=repo_id,
        config_name=config_name_ul,
        split=split_ul,
        submission_name=submission_name_ul,
    )

def show_loading_spinner():
    """Reveal the loading modal; used as the first step of the submit chain."""
    make_visible = gr.update(visible=True)
    return make_visible

def _failure(message: str):
    """Build the 4-tuple UI response for a rejected submission.

    Returns (error_message, error_modal, success_modal, loading_modal)
    updates: show the error modal with *message*, hide the other two modals.
    """
    return (
        message,                                            # error_message
        gr.update(visible=True),                            # error_modal
        gr.update(visible=False),                           # success_modal
        gr.update(visible=False)                            # loading_modal
    )


def _success():
    """Build the 4-tuple UI response for an accepted submission."""
    return (
        "",                                                 # error_message
        gr.update(visible=False),                           # error_modal
        gr.update(visible=True),                            # success_modal
        gr.update(visible=False)                            # loading_modal
    )


def add_new_eval(
        val_or_test: str,
        agent_name: str | None,
        agent_description: str,
        agent_url: str,
        openness: str | None,
        degree_of_control: str | None,
        path_to_file: tempfile._TemporaryFileWrapper | None,
        username: str,
        role: str,
        email: str,
        email_opt_in: bool,
        profile: gr.OAuthProfile,
):
    """Validate, store, and record a new leaderboard submission.

    Pipeline: input checks -> rate-limit / email / duplicate checks ->
    tarball extraction -> metadata generation -> raw upload to the
    submission dataset -> contact-info persistence.

    Returns:
        A 4-tuple of Gradio updates: (error_message, error_modal,
        success_modal, loading_modal).
    """
    if not agent_name:
        return _failure(format_warning("Please provide an agent name."))

    if path_to_file is None:
        return _failure(format_warning("Please attach a .tar.gz file."))

    logger.info(f"agent {agent_name}: Checking submission")

    # Load current eval_results for submission checks.
    # This is a bit redundant if display part reloads it, but submission needs its own consistent view.
    current_eval_results_for_submission = try_load_dataset_submission(
        RESULTS_DATASET,
        CONFIG_NAME,
        download_mode="force_redownload", # Or a less aggressive mode
        verification_mode=VerificationMode.NO_CHECKS,
    )

    submission_time = datetime.now(timezone.utc)
    if not username or username.strip() == "":
        username = profile.username # Default to HF username

    logger.debug(f"agent {agent_name}: User account age check {profile.username}")
    try:
        # Account age check disabled for launch.
        # https://github.com/allenai/astabench-issues/issues/419
        # if _is_hf_acct_too_new(submission_time, profile.username):
        #     return _failure(format_error("This account is not authorized to submit here (account too new)."))
        pass
    except Exception as e:
        logger.warning(f"Error checking user account age: {e}")
        return _failure(format_error("Could not verify account age. Please try again later."))

    logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}")
    contact_infos = try_load_dataset_submission(
        CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload",
        verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True
    )
    # Rate limit: at most one submission per user per split per 24 hours,
    # keyed on the authenticated HF username (not the display username).
    user_submission_dates = sorted(
        datetime.fromisoformat(row["submit_time"])
        for row in contact_infos.get(val_or_test, []) if row["username_auth"] == profile.username
    )
    if user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)):
        logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently")
        return _failure(format_error("You already submitted once in the last 24h for this split; please try again later."))

    logger.debug(f"agent {agent_name}: Email validation {email}")
    # parseaddr never raises; an address lacking "@" is treated as invalid.
    _, parsed_mail = parseaddr(email)
    if "@" not in parsed_mail:
        return _failure(format_warning("Please provide a valid email address."))

    logger.debug(f"agent {agent_name}: Duplicate submission check")
    if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0:
        existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", [])
        for sub_item in existing_submissions:
            # Case-insensitive match on (agent_name, username) within the split.
            if (sub_item.get("agent_name", "").lower() == agent_name.lower() and
                    sub_item.get("username", "").lower() == username.lower()):
                return _failure(format_warning("This agent name by this user has already been submitted to this split."))

    safe_username = sanitize_path_component(username)
    safe_agent_name = sanitize_path_component(agent_name)
    extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}")

    logger.debug(f"agent {agent_name}: File extraction to {extracted_dir}")
    try:
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        os.makedirs(extracted_dir, exist_ok=True)
        with tarfile.open(path_to_file.name, "r:gz") as tar:
            members_extracted = 0
            for member in tar.getmembers():
                # Only extract regular, non-hidden files, flattened into
                # extracted_dir via basename (also defeats path traversal
                # through crafted member names).
                if not member.isreg():
                    continue
                fname = os.path.basename(member.name)
                if not fname or fname.startswith("."):
                    continue
                fobj = tar.extractfile(member)
                if not fobj:
                    continue
                with open(os.path.join(extracted_dir, fname), "wb") as out:
                    out.write(fobj.read())
                members_extracted += 1
            if members_extracted == 0:
                return _failure(format_error("Submission tarball is empty or contains no valid files."))
    except Exception as e:
        return _failure(format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz."))

    submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}"

    logger.debug(f"agent {agent_name}: Generate submission.json")
    subm_meta = SubmissionMetadata(
        agent_name=agent_name,
        agent_description=agent_description,
        agent_url=agent_url,
        openness=openness,
        tool_usage=degree_of_control,
        username=username,
        submit_time=submission_time,
    )
    with open(os.path.join(extracted_dir, SUBMISSION_METADATA_FILENAME), "w", encoding="utf-8") as fp:
        fp.write(subm_meta.model_dump_json(indent=2))

    logger.info(f"agent {agent_name}: Upload raw (unscored) submission files")
    try:
        checked_upload_folder(api, extracted_dir, SUBMISSION_DATASET, CONFIG_NAME, val_or_test, submission_name)
    except ValueError as e:
        # Size-limit violation raised by checked_upload_folder; message is user-facing.
        return _failure(format_error(str(e)))
    except Exception as e:
        return _failure(format_error(f"Failed to upload raw submission: {e}"))

    logger.info(f"agent {agent_name}: Save contact information")
    contact_info = subm_meta.model_dump()
    contact_info["submit_time"] = submission_time.isoformat()
    contact_info["username_auth"] = profile.username
    contact_info["email"] = email
    contact_info["email_opt_in"] = email_opt_in
    contact_info["role"] = role

    logger.debug(f"agent {agent_name}: Contact info: {contact_info}")
    if val_or_test in contact_infos:
        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    else:
        contact_infos[val_or_test] = Dataset.from_list([contact_info])

    try:
        contact_infos.push_to_hub(CONTACT_DATASET, config_name=CONFIG_NAME)
    except Exception as e:
        return _failure(format_error(f"Submission recorded, but contact info failed to save: {e}"))

    logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.")
    return _success()


def _is_hf_acct_too_new(submission_time: datetime, username: str) -> bool:
    """Return True if the HF account *username* is younger than 60 days.

    Queries the public Hugging Face user-overview API.

    Raises:
        requests.RequestException: on network failure, timeout, or a
        non-2xx response (callers are expected to handle this).
    """
    user_data_resp = requests.get(
        f"https://huggingface.co/api/users/{username}/overview",
        # Without a timeout, requests can block forever and hang the
        # submission flow; bound the wait instead.
        timeout=30,
    )
    user_data_resp.raise_for_status()
    creation_date_str = user_data_resp.json()["createdAt"]
    # API returns e.g. "2022-01-01T12:34:56.789Z"; parse and pin to UTC.
    created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)
    return submission_time - created_at < timedelta(days=60)


# Static HTML snippets rendered by build_page(); tooltip text is embedded in
# the data-tooltip attribute and styled via the Space's CSS.

# Label + hover tooltip for the "Agent Openness" radio group.
openness_label_html = """
<div class="form-label-with-tooltip">
    Agent Openness
    <span class="tooltip-icon" data-tooltip="• Closed: No API or code available
        • API Available: API available, but no code
        • Open Source: Code available, but no weights
        • Open Source + Open Weights: Code and weights available"
    >

    </span>
</div>
"""

# Label + hover tooltip for the "Agent Tooling" radio group.
agent_tooling_label_html = """
<div class="form-label-with-tooltip">
    Agent Tooling
    <span class="tooltip-icon" data-tooltip="• Standard: Only uses tools explicitly provided in state.tools
        • Equivalent: Uses custom tools with identical or more restricted capabilities
        • Fully Custom: Uses tools beyond constraints of Standard or Equivalent"
    >

    </span>
</div>
"""

# Intro/heading shown at the top of the submission page.
heading_html = """
<h2>🚀 Submit an agent for evaluation</h2>
<p>Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.</p>
<h3>How to run an evaluation</h3>
<p>Please follow the steps in our <a href="https://github.com/allenai/asta-bench" target="_blank">README</a>. You’ll upload your run file at the end of this form.</p>
"""

# --- Submission Accordion ---
def build_page():
    """Build the submission page: forms, file upload, modals, and wiring.

    Lays out the submitter-info and agent-info form groups, the tarball
    upload widget, and three modals (loading, error, success), then chains
    show_loading_spinner -> add_new_eval on the submit button click.
    """
    with gr.Column(elem_id="submission-page-container"):
        gr.HTML(heading_html)
        gr.LoginButton()
        # --- Submitter information form ---
        with gr.Group(elem_classes="custom-form-group"):
            gr.HTML(value="""<h2>Submitter Information</h2>""", elem_id="submitter-info-label-html")
            gr.HTML(value="""<h3>Username</h3>""", elem_classes="form-label")
            username_tb = gr.Textbox(label="This will show on the leaderboard. By default, we’ll use your Hugging Face username; but you can enter your organization name instead (e.g., university, company, or lab).")
            gr.HTML(value="""<h3>Role</h3>""", elem_classes="form-label")
            role = gr.Dropdown(label="Please select the role that most closely matches your current position. Helps us improve AstaBench for different user types. Not displayed on the leaderboard.",
                interactive=True,
                choices=[
                    "Undergraduate Student",
                    "Masters Student",
                    "PhD Student",
                    "Postdoctoral Researcher",
                    "Academic Faculty (e.g., Professor, Lecturer)",
                    "Industry Researcher (e.g., Research Scientist, Applied Scientist)",
                    "Engineer or Developer (e.g., Software or ML Engineer)",
                    "Data Scientist or Analyst",
                    "Product or Program Manager",
                    "Startup Founder or Independent Researcher",
                    "Other"
            ])
            gr.HTML(value="""<h3>Contact email</h3>""", elem_classes="form-label")
            mail_tb = gr.Textbox(label="We'll only use your email to communicate about your submission.")
            mail_opt_in = gr.Checkbox(label="I’m open to being contacted by email for user research studies or feedback opportunities.")
        # --- Agent information form ---
        with gr.Group(elem_classes="custom-form-group"):
            gr.HTML(value="""<h2>Agent Information</h2>""", elem_id="agent-info-label-html")
            gr.HTML(value="""<h3>Split</h3>""", elem_classes="form-label")
            # Radio values ("test"/"validation") map to dataset split names used by add_new_eval.
            level_of_test_radio = gr.Radio(choices=[
                ("Test set", "test"),
                ("Validation set", "validation"),
            ], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.")
            gr.HTML(value="""<h3>Agent name</h3>""", elem_classes="form-label")
            agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.")
            gr.HTML(value="""<h3>Agent description</h3>""", elem_classes="form-label")
            agent_desc_tb = gr.Textbox(label="Briefly describe your agent’s approach, core strategies, or what makes it distinct. This description may appear on the leaderboard.")
            gr.HTML(value="""<h3>URL</h3>""", elem_classes="form-label")
            agent_url_tb = gr.Textbox(label="Link to more information about your agent (e.g. GitHub repo, blog post, or website). This optional link may be shown on the leaderboard to let others explore your agent in more depth.")
            gr.HTML(value="""<h3>Agent openness</h3>""", elem_classes="form-label")
            openness_radio = gr.Radio(["Open Source","Open Source Open Weights", "API Available", "Closed"], elem_classes="form-label-fieldset", value=None, label="This affects how your submission is categorized on the leaderboard. Choose based on the availability of your code, model weights, or APIs.")
            gr.HTML(value="""<h3>Agent tooling</h3>""", elem_classes="form-label")
            degree_of_control_radio = gr.Radio(["Standard","Equivalent", "Fully Custom"], elem_classes="form-label-fieldset",value=None, label="Choose based on the tools and the execution environment your agent used during evaluation.")
            gr.HTML(value="""<h3>Submission file</h3>""", elem_classes="form-label")
            gr.HTML("<div id='submission-file-label'>Upload your run file, which is an archive prepared following the instructions in the <a href='https://github.com/allenai/asta-bench?tab=readme-ov-file#submitting-to-the-leaderboard' target='_blank'>README</a> (“Submitting to the Leaderboard”).</div>")
            file_upload_comp = gr.File(
                show_label=False,
                file_types=[".gz", ".tar.gz"],
            )
            submit_eval_button = gr.Button("Submit Evaluation", elem_id="submission-button")
    # Modals for loading spinner, success and error messages
    with Modal(visible=False, elem_id="submission-modal") as loading_modal:
        with gr.Column(elem_id="submission-modal-content"):
            gr.HTML('<div class="spinner-container"><div class="spinner"></div><p>Processing your submission...</p></div>')

    with Modal(visible=False, elem_id="submission-modal") as error_modal:
        with gr.Column(elem_id="submission-modal-content"):
            gr.Markdown("## ⚠️ Error")
            error_message = gr.Markdown()

    with Modal(visible=False, elem_id="submission-modal") as success_modal:
        with gr.Column(elem_id="submission-modal-content"):
            gr.Markdown(SUBMISSION_CONFIRMATION)

    # Show the spinner immediately, then run the (slow) submission pipeline.
    # NOTE(review): add_new_eval's `profile` parameter is not in the inputs
    # list below — Gradio auto-injects gr.OAuthProfile for the logged-in
    # user based on the parameter's type annotation.
    submit_eval_button.click(
        show_loading_spinner,
        None,
        [loading_modal],
    ).then(
        add_new_eval,
        [
            level_of_test_radio,
            agent_name_tb,
            agent_desc_tb,
            agent_url_tb,
            openness_radio,
            degree_of_control_radio,
            file_upload_comp,
            username_tb,
            role,
            mail_tb,
            mail_opt_in
        ],
        [error_message, error_modal, success_modal, loading_modal],
    )
    # hiding this for now till we have the real paper data
    # with gr.Accordion("📙 Citation", open=False):
    #     gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)