Result updates
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +4 -3
- results/Bgym-GPT-3.5/README.md +0 -1
- results/Bgym-GPT-3.5/config.json +0 -4
- results/Bgym-GPT-3.5/miniwob.json +0 -16
- results/Bgym-GPT-3.5/webarena.json +0 -16
- results/Bgym-GPT-3.5/workarena-l1.json +0 -16
- results/Bgym-GPT-3.5/workarena-l2.json +0 -16
- results/Bgym-GPT-4o-V/README.md +0 -1
- results/Bgym-GPT-4o-V/miniwob.json +0 -16
- results/Bgym-GPT-4o-V/webarena.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l1.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l2.json +0 -16
- results/Bgym-GPT-4o-V/workarena-l3.json +0 -16
- results/Bgym-GPT-o1-mini/workarena-l3.json +0 -16
- results/Bgym-Llama-3-70b/README.md +0 -1
- results/Bgym-Llama-3-70b/miniwob.json +0 -16
- results/Bgym-Llama-3-70b/webarena.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l1.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l2.json +0 -16
- results/Bgym-Llama-3-70b/workarena-l3.json +0 -16
- results/Bgym-Mixtral-8x22b/README.md +0 -1
- results/Bgym-Mixtral-8x22b/miniwob.json +0 -16
- results/Bgym-Mixtral-8x22b/webarena.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l2.json +0 -16
- results/Bgym-Mixtral-8x22b/workarena-l3.json +0 -16
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md +0 -0
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json +1 -1
- results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md +0 -0
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json +1 -1
- results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md +0 -0
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json +1 -1
- results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json +1 -1
app.py
CHANGED
|
@@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark):
|
|
| 155 |
html += '<table>'
|
| 156 |
html += '<thead><tr>'
|
| 157 |
for column in df.columns:
|
| 158 |
-
if column
|
| 159 |
-
|
|
|
|
| 160 |
html += '</tr></thead>'
|
| 161 |
html += '<tbody>'
|
| 162 |
for _, row in df.iterrows():
|
|
@@ -169,7 +170,7 @@ def create_html_table_benchmark(df, benchmark):
|
|
| 169 |
summary = sanitize_cell_value(row[column])
|
| 170 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
| 171 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 172 |
-
elif column == "Reproduced_all":
|
| 173 |
continue
|
| 174 |
elif column == "Score":
|
| 175 |
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
|
|
|
| 155 |
html += '<table>'
|
| 156 |
html += '<thead><tr>'
|
| 157 |
for column in df.columns:
|
| 158 |
+
if column == "Reproduced_all" or column == "std_err":
|
| 159 |
+
continue
|
| 160 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
| 161 |
html += '</tr></thead>'
|
| 162 |
html += '<tbody>'
|
| 163 |
for _, row in df.iterrows():
|
|
|
|
| 170 |
summary = sanitize_cell_value(row[column])
|
| 171 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
| 172 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 173 |
+
elif column == "Reproduced_all" or column == "std_err":
|
| 174 |
continue
|
| 175 |
elif column == "Score":
|
| 176 |
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
results/Bgym-GPT-3.5/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
## GPT-3.5 model
|
|
|
|
|
|
results/Bgym-GPT-3.5/config.json
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"agent_name": "GPT-3.5",
|
| 3 |
-
"backend_llm": "GPT-3.5"
|
| 4 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "MiniWoB",
|
| 7 |
-
"score": 43.4,
|
| 8 |
-
"std_err": 0.1,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WebArena",
|
| 7 |
-
"score": 6.7,
|
| 8 |
-
"std_err": 0.2,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score": 6.1,
|
| 8 |
-
"std_err": 0.3,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-3.5/workarena-l2.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
## GPT-4o-V model
|
|
|
|
|
|
results/Bgym-GPT-4o-V/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "MiniWoB",
|
| 7 |
-
"score": 72.5,
|
| 8 |
-
"std_err": 0.5,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WebArena",
|
| 7 |
-
"score": 24.0,
|
| 8 |
-
"std_err": 0.4,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score": 41.8,
|
| 8 |
-
"std_err": 0.4,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l2.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score": 3.8,
|
| 8 |
-
"std_err": 0.6,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-4o-V/workarena-l3.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-GPT-o1-mini/workarena-l3.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-GPT-o1-mini",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
### Llama-3-70B
|
|
|
|
|
|
results/Bgym-Llama-3-70b/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "MiniWoB",
|
| 7 |
-
"score": 68.2,
|
| 8 |
-
"std_err": 0.7,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WebArena",
|
| 7 |
-
"score": 11.0,
|
| 8 |
-
"std_err": 0.3,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score": 17.9,
|
| 7 |
-
"std_err": 0.6,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2021-01-01 12:00:00"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l2.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Llama-3-70b/workarena-l3.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/README.md
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
## Mixtral 8x22B
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "MiniWoB",
|
| 7 |
-
"score": 62.4,
|
| 8 |
-
"std_err": 0.5,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WebArena",
|
| 7 |
-
"score": 12.6,
|
| 8 |
-
"std_err": 0.9,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score": 12.4,
|
| 7 |
-
"std_err": 0.7,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2021-01-04 12:06:00"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l2.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/Bgym-Mixtral-8x22b/workarena-l3.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
-
"study_id": "study_id",
|
| 5 |
-
"date_time": "2021-01-01 12:00:00",
|
| 6 |
-
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "NA",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md
RENAMED
|
File without changes
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "AssistantBench",
|
| 6 |
"score": 5.2,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "AssistantBench",
|
| 6 |
"score": 5.2,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
"score": 69.8,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "MiniWoB",
|
| 6 |
"score": 69.8,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
"score": 36.2,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WebArena",
|
| 6 |
"score": 36.2,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WebLINX",
|
| 6 |
"score": 13.7,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WebLINX",
|
| 6 |
"score": 13.7,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
"score": 56.4,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"benchmark": "WorkArena-L1",
|
| 6 |
"score": 56.4,
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md
RENAMED
|
File without changes
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "AssistantBench",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "AssistantBench",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebLINX",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebLINX",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o-mini",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md
RENAMED
|
File without changes
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "AssistantBench",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "AssistantBench",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebArena",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebLINX",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WebLINX",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "GenericAgent-GPT-4o",
|
| 4 |
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|