add new results
Browse files- GenericAgent-Claude-3.7-Sonnet/README.md +44 -0
- GenericAgent-Claude-3.7-Sonnet/webarena.json +16 -0
- GenericAgent-Claude-4-Sonnet/README.md +44 -0
- GenericAgent-Claude-4-Sonnet/miniwob.json +17 -0
- GenericAgent-Claude-4-Sonnet/workarena-l1.json +16 -0
- GenericAgent-Claude-4-Sonnet/workarena-l2.json +16 -0
- GenericAgent-GPT-4_1-Mini/README.md +44 -0
- GenericAgent-GPT-4_1-Mini/webarena.json +16 -0
- GenericAgent-GPT-5-mini/README.md +44 -0
- GenericAgent-GPT-5-mini/miniwob.json +16 -0
- GenericAgent-GPT-5-mini/workarena-l1.json +16 -0
- GenericAgent-GPT-5-mini/workarena-l2.json +16 -0
- GenericAgent-GPT-5-nano/README.md +44 -0
- GenericAgent-GPT-5-nano/miniwob.json +16 -0
- GenericAgent-GPT-5-nano/workarena-l1.json +16 -0
- GenericAgent-GPT-5-nano/workarena-l2.json +16 -0
- GenericAgent-GPT-5/README.md +44 -0
- GenericAgent-GPT-5/miniwob.json +16 -0
- GenericAgent-GPT-5/workarena-l1.json +31 -0
- GenericAgent-GPT-5/workarena-l2.json +16 -0
- GenericAgent-GPT-5/workarena-l3.json +16 -0
- GenericAgent-GPT-oss-120b/README.md +44 -0
- GenericAgent-GPT-oss-120b/miniwob.json +16 -0
- GenericAgent-GPT-oss-120b/workarena-l1.json +16 -0
- GenericAgent-GPT-oss-120b/workarena-l2.json +16 -0
- GenericAgent-GPT-oss-20b/README.md +44 -0
- GenericAgent-GPT-oss-20b/miniwob.json +16 -0
- GenericAgent-GPT-oss-20b/workarena-l1.json +16 -0
- GenericAgent-GPT-oss-20b/workarena-l2.json +16 -0
- results/OrbyAgent-Claude-3.5-Sonnet/README.md +1 -0
GenericAgent-Claude-3.7-Sonnet/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-Claude-3.7-Sonnet
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses Claude-3.7-Sonnet (claude-3-7-sonnet-20250219) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-Claude-3.7-Sonnet/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-claude-3-7-sonnet",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Webarena",
|
| 6 |
+
"score": 0.446,
|
| 7 |
+
"std_err": 0.025,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-Claude-4-Sonnet/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-Claude-4-Sonnet
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses Claude-4-Sonnet (claude-sonnet-4-20250514) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-Claude-4-Sonnet/miniwob.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-claude-sonnet-4",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.707,
|
| 7 |
+
"std_err": 0.018,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
]
|
GenericAgent-Claude-4-Sonnet/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-claude-sonnet-4-20250514",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.633,
|
| 7 |
+
"std_err": 0.027,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-Claude-4-Sonnet/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-claude-sonnet-4-20250514",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.404,
|
| 7 |
+
"std_err": 0.032,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-4_1-Mini/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-4_1-Mini
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-4.1-mini (gpt-4.1-mini-2025-04-14) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-4_1-Mini/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-4.1-mini",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Webarena",
|
| 6 |
+
"score": 0.307,
|
| 7 |
+
"std_err": 0.024,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-mini/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-5-Mini
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-5-mini (gpt-5-mini-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-5-mini/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-mini",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.71,
|
| 7 |
+
"std_err": 0.018,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-mini/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-mini",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.606,
|
| 7 |
+
"std_err": 0.027,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-mini/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-mini",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.477,
|
| 7 |
+
"std_err": 0.033,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-nano/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-5-Nano
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-5-nano (gpt-5-nano-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-5-nano/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-nano",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.648,
|
| 7 |
+
"std_err": 0.019,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-nano/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-nano",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.406,
|
| 7 |
+
"std_err": 0.027,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5-nano/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-nano",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.034,
|
| 7 |
+
"std_err": 0.012,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-5
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-5 (gpt-5-2025-08-07) as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-5/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.715,
|
| 7 |
+
"std_err": 0.018,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5/workarena-l1.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.661,
|
| 7 |
+
"std_err": 0.026,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
| 18 |
+
"study_id": "2025-08-07_21-09-16",
|
| 19 |
+
"benchmark": "Workarena-L1",
|
| 20 |
+
"score": 0.791,
|
| 21 |
+
"std_err": 0.022,
|
| 22 |
+
"benchmark_specific": "No",
|
| 23 |
+
"benchmark_tuned": "No",
|
| 24 |
+
"followed_evaluation_protocol": "No",
|
| 25 |
+
"reproducible": "Yes",
|
| 26 |
+
"comments": "Increased max_steps from 15 to 30",
|
| 27 |
+
"original_or_reproduced": "Original",
|
| 28 |
+
"date_time": "2025-08-07 21:09:16"
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
]
|
GenericAgent-GPT-5/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.694,
|
| 7 |
+
"std_err": 0.03,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-5/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-5-2025-08-07",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L3",
|
| 6 |
+
"score": 0.115,
|
| 7 |
+
"std_err": 0.021,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "No",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "Increased max_steps from 50 to 100",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-120b/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-oss-120b
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-oss-120b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-oss-120b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-openai_gpt-oss-120b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.664,
|
| 7 |
+
"std_err": 0.019,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-120b/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-openai_gpt-oss-120b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.509,
|
| 7 |
+
"std_err": 0.028,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-120b/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-openai_gpt-oss-120b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.115,
|
| 7 |
+
"std_err": 0.021,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-20b/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### GenericAgent-GPT-oss-20b
|
| 2 |
+
|
| 3 |
+
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
+
|
| 5 |
+
It uses gpt-oss-20b as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
+
```python
|
| 7 |
+
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
+
obs=dp.ObsFlags(
|
| 9 |
+
use_html=False,
|
| 10 |
+
use_ax_tree=True,
|
| 11 |
+
use_focused_element=True,
|
| 12 |
+
use_error_logs=True,
|
| 13 |
+
use_history=True,
|
| 14 |
+
use_past_error_logs=False,
|
| 15 |
+
use_action_history=True,
|
| 16 |
+
use_think_history=True,
|
| 17 |
+
use_diff=False,
|
| 18 |
+
html_type="pruned_html",
|
| 19 |
+
use_screenshot=False,
|
| 20 |
+
use_som=False,
|
| 21 |
+
extract_visible_tag=True,
|
| 22 |
+
extract_clickable_tag=True,
|
| 23 |
+
extract_coords="False",
|
| 24 |
+
filter_visible_elements_only=False,
|
| 25 |
+
),
|
| 26 |
+
action=dp.ActionFlags(
|
| 27 |
+
multi_actions=False,
|
| 28 |
+
action_set="bid",
|
| 29 |
+
long_description=False,
|
| 30 |
+
individual_examples=False,
|
| 31 |
+
),
|
| 32 |
+
use_plan=False,
|
| 33 |
+
use_criticise=False,
|
| 34 |
+
use_thinking=True,
|
| 35 |
+
use_memory=False,
|
| 36 |
+
use_concrete_example=True,
|
| 37 |
+
use_abstract_example=True,
|
| 38 |
+
use_hints=True,
|
| 39 |
+
enable_chat=False,
|
| 40 |
+
max_prompt_tokens=40_000,
|
| 41 |
+
be_cautious=True,
|
| 42 |
+
extra_instructions=None,
|
| 43 |
+
)
|
| 44 |
+
```
|
GenericAgent-GPT-oss-20b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-openai_gpt-oss-20b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "MiniWoB",
|
| 6 |
+
"score": 0.64,
|
| 7 |
+
"std_err": 0.019,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-20b/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-oss-20b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L1",
|
| 6 |
+
"score": 0.385,
|
| 7 |
+
"std_err": 0.027,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
GenericAgent-GPT-oss-20b/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "GenericAgent-gpt-oss-20b",
|
| 4 |
+
"study_id": "2025-08-07_21-09-16",
|
| 5 |
+
"benchmark": "Workarena-L2",
|
| 6 |
+
"score": 0.026,
|
| 7 |
+
"std_err": 0.01,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2025-08-07 21:09:16"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/OrbyAgent-Claude-3.5-Sonnet/README.md
CHANGED
|
@@ -5,3 +5,4 @@ This agent is developed by [Orby AI](https://www.orby.ai/).
|
|
| 5 |
The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
|
| 6 |
|
| 7 |
It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
|
|
|
|
|
| 5 |
The agent does not use any benchmark-specific information in the prompts. For WebArena benchmark, we use the original evaluator and task definitions for fair comparison.
|
| 6 |
|
| 7 |
It uses Claude-3.5-sonnet-20241022 as a backend, with both screenshot and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
| 8 |
+
|