WeijianQi1999 committed on
Commit
4fcb84f
·
1 Parent(s): c941e28

update google computer use

Browse files
app.py CHANGED
@@ -34,14 +34,13 @@ def get_dataframe_from_results(eval_path):
34
  else:
35
  df = df.sort_values(
36
  by=["Verified", "Average SR"],
37
- ascending=[False, False],
38
- kind="mergesort"
39
  )
40
 
41
  for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
42
  if col in df.columns:
43
  df[col] = _format_sr_column(df[col])
44
-
45
  return df
46
 
47
  auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
@@ -283,7 +282,7 @@ with demo:
283
  )
284
  gr.Markdown("### Visualization")
285
  gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
286
- fig = plot_heatmap_with_performance_bar("./human_label_082325.json")
287
  gr.Plot(fig)
288
  gr.Markdown(EVALUATION_DETAILS)
289
 
 
34
  else:
35
  df = df.sort_values(
36
  by=["Verified", "Average SR"],
37
+ ascending=[False, False], # False 表示降序;Verified=True 会排到最上面
38
+ kind="mergesort" # 稳定排序,保证次序可预期
39
  )
40
 
41
  for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
42
  if col in df.columns:
43
  df[col] = _format_sr_column(df[col])
 
44
  return df
45
 
46
  auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
 
282
  )
283
  gr.Markdown("### Visualization")
284
  gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
285
+ fig = plot_heatmap_with_performance_bar("./human_label_092925.json")
286
  gr.Plot(fig)
287
  gr.Markdown(EVALUATION_DETAILS)
288
 
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv CHANGED
@@ -9,4 +9,5 @@ ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https:
9
  ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
10
  Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
11
  Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
12
- Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
 
 
9
  ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
10
  Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
11
  Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
12
+ Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
13
+ Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,55.2,45.9,57.3,2025-09-29,True,,2025-09
human_Mind2Web-Online - Leaderboard_data.csv CHANGED
@@ -6,4 +6,5 @@ Claude Computer Use 3.5,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,1
6
  Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
7
  Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
8
  ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
9
- ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23
 
 
6
  Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
7
  Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
8
  ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
9
+ ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23
10
+ Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,71.3,55.4,69.0,2025-9-29
human_label_092925.json ADDED
The diff for this file is too large to render. See raw diff