# Centered HTML <h1> heading for the leaderboard page.
# NOTE(review): the leading emoji is presumed to be the trophy glyph; the
# character was not recoverable byte-for-byte — confirm against the deployed page.
TITLE = """<h1 align="center" id="space-title">🏆 Online-Mind2Web Leaderboard</h1>"""
# Centered row of project links (Blog | Paper | Code | Data) as raw HTML.
# NOTE(review): the "Paper" anchor still points at the "#" placeholder.
LINKS = """
<div align="center">
<a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
<a href="#">Paper</a> |
<a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
<a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web">Data</a>
</div>
"""
# Markdown blurb introducing the benchmark and its three difficulty levels.
# The step ranges use en dashes (U+2013).
INTRODUCTION_TEXT = """
Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
"""
# Markdown section header and description shown with the leaderboard tables.
# The clause break after "leaderboards" is an em dash (U+2014).
LEADERBOARD_TEXT = """
### Leaderboard
Our goal is to conduct a rigorous assessment of the current state of web agents. We maintain two leaderboards—one for automatic evaluation and another for human evaluation.
Please click "Submission Guideline" for details.
"""
# Markdown summary of what a submission must contain.
# The JSON field names are written in lowercase to match the `result.json`
# example given in the detailed submission instructions.
# NOTE(review): confirm the exact key casing against the evaluation script.
SUBMISSION_TEXT = """
## Submissions
Participants are invited to submit your agent's trajectory to test. The submissions will be evaluated based on our auto-eval.
### Format of submission
Submissions must include a sequence of images (i.e., screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "task", "task_id", and "action_history". You can refer to an example of the submission files.
"""
# Label text for the citation snippet; asks readers to cite both this work
# and the original Mind2Web dataset. Adjacent literals are concatenated by the
# parser, so the value is a single line with no embedded newlines.
CITATION_BUTTON_LABEL = (
    "Copy the following snippet to cite these results. "
    "Note: Online-Mind2Web is derived from the original Mind2Web dataset. "
    "We kindly ask that you cite both the original and this work when using or referencing the data."
)
# BibTeX entries for this work and for the original Mind2Web paper.
# Kept as a raw string so that any backslashes added to the BibTeX later are
# not interpreted as Python escape sequences.
CITATION_BUTTON_TEXT = r"""
@article{xue2025webagents,
title = "An Illusion of Progress? Assessing the Current State of Web Agents",
author = "Xue, Tianci and Qi, Weijian and Shi, Tianneng and Song, Chan Hee and Gou, Boyu and Song, Dawn and Sun, Huan and Su, Yu",
journal = "OSU NLP Blog",
year = "2025",
month = "Mar",
url = "https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4"
}
@inproceedings{deng2023mind2web,
author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
booktitle = {Advances in Neural Information Processing Systems},
editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
pages = {28091--28114},
publisher = {Curran Associates, Inc.},
title = {Mind2Web: Towards a Generalist Agent for the Web},
url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
volume = {36},
year = {2023}
}
"""
# Markdown instructions for submitting results: what to send, the required
# per-task directory layout, and the JSON file formats.
# Blank lines separate the bullet list, fenced blocks and paragraphs so that
# Markdown renderers do not fold the following text into the last list item.
# The `human_label` lines are illustrative placeholders, not literal JSON.
# NOTE(review): the heading emoji is presumed to be the warning sign — confirm.
SUBMIT_INTRODUCTION = """
You should use the script provided in our GitHub repository to obtain automatic evaluation results on your own and submit them along with all trajectories.
To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.

## ⚠️ Please submit the trajectory file with the following format:

The result of each task is stored in a folder named as its `task_id`, containing:

- `trajectory/`: Stores screenshots of each step.
- `result.json`: Task metadata and action history.

Here is an [example](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f) of the format.

**Structure:**
```
main_directory/
└── task_id/
    ├── result.json
    └── trajectory/
        ├── 0_screenshot.png
        ├── 1_screenshot.png
        └── ...
```

**`result.json` format:**
```json
{
    "task_id": 123,
    "task": "abc",
    "action_history": ["abc", "xyz", "..."]
}
```

**`human_result.json` format:**
```json
[
    {
        "task_id": 123,
        "task": "abc",
        "human_label": 0 or 1 (failure or success)
    },
    {
        "task_id": 456,
        "task": "def",
        "human_label": 0 or 1 (failure or success)
    }
]
```

Please email your agent's name, model family, and organization to xue.681@osu.edu, and include the trajectory directory and auto-evaluation result file as attachments (optional: human evaluation result).
"""
# Markdown heading for the dataset-statistics section; ends with a newline.
DATA_DATASET = """## More Statistics for Online-Mind2Web Benchmark
"""
def format_error(msg):
    """Wrap ``msg`` in a centered, red, 20px HTML paragraph.

    ``msg`` is interpolated as-is (no HTML escaping is applied), so any
    markup it contains is emitted verbatim.
    """
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
def format_warning(msg):
    """Wrap ``msg`` in a centered, orange, 20px HTML paragraph.

    ``msg`` is interpolated as-is (no HTML escaping is applied), so any
    markup it contains is emitted verbatim.
    """
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
def format_log(msg):
    """Wrap ``msg`` in a centered, green, 20px HTML paragraph.

    ``msg`` is interpolated as-is (no HTML escaping is applied), so any
    markup it contains is emitted verbatim.
    """
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
def model_hyperlink(link, model_name):
    """Return an HTML anchor that shows ``model_name`` and opens ``link`` in a new tab.

    The anchor is styled with a dotted underline and the ``--link-text-color``
    CSS variable. Both arguments are interpolated as-is (no escaping).
    """
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'