Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

WeijianQi1999 committed 27 days ago

Commit

4fcb84f

1 Parent(s): c941e28

Update Google Computer Use

Files changed (4) hide show

app.py CHANGED Viewed

@@ -34,14 +34,13 @@ def get_dataframe_from_results(eval_path):
     else:
         df = df.sort_values(
         by=["Verified", "Average SR"],
-        ascending=[False, False],
-        kind="mergesort"
     )
     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
         if col in df.columns:
             df[col] = _format_sr_column(df[col])
     return df
 auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
@@ -283,7 +282,7 @@ with demo:
         )
         gr.Markdown("### Visualization")
         gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
-        fig = plot_heatmap_with_performance_bar("./human_label_082325.json")
         gr.Plot(fig)
         gr.Markdown(EVALUATION_DETAILS)

     else:
         df = df.sort_values(
         by=["Verified", "Average SR"],
+        ascending=[False, False],      # False 表示降序；Verified=True 会排到最上面
+        kind="mergesort"              # 稳定排序，保证次序可预期
     )
     for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
         if col in df.columns:
             df[col] = _format_sr_column(df[col])
     return df
 auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
         )
         gr.Markdown("### Visualization")
         gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
+        fig = plot_heatmap_with_performance_bar("./human_label_092925.json")
         gr.Plot(fig)
         gr.Markdown(EVALUATION_DETAILS)

auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv CHANGED Viewed

@@ -9,4 +9,5 @@ ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https:
 ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
 Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
 Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
-Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05

 ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
 Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
 Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
+Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
+Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,55.2,45.9,57.3,2025-09-29,True,,2025-09

human_Mind2Web-Online - Leaderboard_data.csv CHANGED Viewed

@@ -6,4 +6,5 @@ Claude Computer Use 3.5,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,1
 Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
 Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
 ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
-ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23

 Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
 Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
 ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
+ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23
+Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,71.3,55.4,69.0,2025-9-29

human_label_092925.json ADDED Viewed

The diff for this file is too large to render. See raw diff