Commit
·
4fcb84f
1
Parent(s):
c941e28
Update Google Computer Use
Browse files
app.py
CHANGED
|
@@ -34,14 +34,13 @@ def get_dataframe_from_results(eval_path):
|
|
| 34 |
else:
|
| 35 |
df = df.sort_values(
|
| 36 |
by=["Verified", "Average SR"],
|
| 37 |
-
ascending=[False, False],
|
| 38 |
-
kind="mergesort"
|
| 39 |
)
|
| 40 |
|
| 41 |
for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
|
| 42 |
if col in df.columns:
|
| 43 |
df[col] = _format_sr_column(df[col])
|
| 44 |
-
|
| 45 |
return df
|
| 46 |
|
| 47 |
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
|
|
@@ -283,7 +282,7 @@ with demo:
|
|
| 283 |
)
|
| 284 |
gr.Markdown("### Visualization")
|
| 285 |
gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
|
| 286 |
-
fig = plot_heatmap_with_performance_bar("./
|
| 287 |
gr.Plot(fig)
|
| 288 |
gr.Markdown(EVALUATION_DETAILS)
|
| 289 |
|
|
|
|
| 34 |
else:
|
| 35 |
df = df.sort_values(
|
| 36 |
by=["Verified", "Average SR"],
|
| 37 |
+
ascending=[False, False], # False means descending order; Verified=True rows sort to the top
|
| 38 |
+
kind="mergesort" # stable sort, so the resulting order is predictable
|
| 39 |
)
|
| 40 |
|
| 41 |
for col in ['Easy', 'Medium', 'Hard', 'Average SR']:
|
| 42 |
if col in df.columns:
|
| 43 |
df[col] = _format_sr_column(df[col])
|
|
|
|
| 44 |
return df
|
| 45 |
|
| 46 |
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
|
|
|
|
| 282 |
)
|
| 283 |
gr.Markdown("### Visualization")
|
| 284 |
gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
|
| 285 |
+
fig = plot_heatmap_with_performance_bar("./human_label_092925.json")
|
| 286 |
gr.Plot(fig)
|
| 287 |
gr.Markdown(EVALUATION_DETAILS)
|
| 288 |
|
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv
CHANGED
|
@@ -9,4 +9,5 @@ ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https:
|
|
| 9 |
ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
|
| 10 |
Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
|
| 11 |
Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
|
| 12 |
-
Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
|
|
|
|
|
|
| 9 |
ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,[Enhans](https://www.enhans.ai/),71.1,52.4,32.4,52.7,2025-8-23,True,,2025-08
|
| 10 |
Eko-V2,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),95.0,76.0,70.0,78.0,2025-5-24,False,Unknown evaluation method,2025-05
|
| 11 |
Eko-V1,Unknown,Fellou,[Fellou](https://fellou.ai/blog/post/eko20-launch/),-,-,-,31.0,2025-5-24,False,Unknown evaluation method,2025-05
|
| 12 |
+
Seed1.5-VL,Seed1.5-VL,ByteDance,[ByteDance](https://arxiv.org/pdf/2505.07062),-,-,-,76.4,2025-5-11,False,Evaluated by WebJudge(GPT-4o),2025-05
|
| 13 |
+
Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,55.2,45.9,57.3,2025-09-29,True,,2025-09
|
human_Mind2Web-Online - Leaderboard_data.csv
CHANGED
|
@@ -6,4 +6,5 @@ Claude Computer Use 3.5,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,56.6,20.3,1
|
|
| 6 |
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
|
| 7 |
Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
|
| 8 |
ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
|
| 9 |
-
ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23
|
|
|
|
|
|
| 6 |
Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,49.4,26.6,6.8,28.0,2025-3-22
|
| 7 |
Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,90.4,49.0,32.4,56.3,2025-4-20
|
| 8 |
ACT-1-20250703,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,65.1,46.2,23.0,45.7,2025-7-16
|
| 9 |
+
ACT-1-20250814,o3-2025-04-16 and Claude-sonnet-4-20250514,Enhans,Enhans,81.9,54.5,35.1,57.3,2025-8-23
|
| 10 |
+
Google Computer Use (09-2025),Gemini 2.5 Computer Use,Google DeepMind,Google DeepMind,77.1,71.3,55.4,69.0,2025-9-29
|
human_label_092925.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|