Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Community

meghsn committed on Dec 5, 2024

Commit

d5581cc

1 Parent(s): 97d7e59

Result updates

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +4 -3
results/Bgym-GPT-3.5/README.md +0 -1
results/Bgym-GPT-3.5/config.json +0 -4
results/Bgym-GPT-3.5/miniwob.json +0 -16
results/Bgym-GPT-3.5/webarena.json +0 -16
results/Bgym-GPT-3.5/workarena-l1.json +0 -16
results/Bgym-GPT-3.5/workarena-l2.json +0 -16
results/Bgym-GPT-4o-V/README.md +0 -1
results/Bgym-GPT-4o-V/miniwob.json +0 -16
results/Bgym-GPT-4o-V/webarena.json +0 -16
results/Bgym-GPT-4o-V/workarena-l1.json +0 -16
results/Bgym-GPT-4o-V/workarena-l2.json +0 -16
results/Bgym-GPT-4o-V/workarena-l3.json +0 -16
results/Bgym-GPT-o1-mini/workarena-l3.json +0 -16
results/Bgym-Llama-3-70b/README.md +0 -1
results/Bgym-Llama-3-70b/miniwob.json +0 -16
results/Bgym-Llama-3-70b/webarena.json +0 -16
results/Bgym-Llama-3-70b/workarena-l1.json +0 -16
results/Bgym-Llama-3-70b/workarena-l2.json +0 -16
results/Bgym-Llama-3-70b/workarena-l3.json +0 -16
results/Bgym-Mixtral-8x22b/README.md +0 -1
results/Bgym-Mixtral-8x22b/miniwob.json +0 -16
results/Bgym-Mixtral-8x22b/webarena.json +0 -16
results/Bgym-Mixtral-8x22b/workarena-l1.json +0 -16
results/Bgym-Mixtral-8x22b/workarena-l2.json +0 -16
results/Bgym-Mixtral-8x22b/workarena-l3.json +0 -16
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md +0 -0
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json +1 -1
results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md +0 -0
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json +1 -1
results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md +0 -0
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json +1 -1
results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json +1 -1

app.py CHANGED Viewed

@@ -155,8 +155,9 @@ def create_html_table_benchmark(df, benchmark):
     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
-        if column != "Reproduced_all":
-            html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
@@ -169,7 +170,7 @@ def create_html_table_benchmark(df, benchmark):
                     summary = sanitize_cell_value(row[column])
                     details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
-            elif column == "Reproduced_all":
                 continue
             elif column == "Score":
                 score_with_std_err = f'{row[column]} ± {row["std_err"]}'

     html += '<table>'
     html += '<thead><tr>'
     for column in df.columns:
+        if column == "Reproduced_all" or column == "std_err":
+            continue
+        html += f'<th>{sanitize_column_name(column)}</th>'
     html += '</tr></thead>'
     html += '<tbody>'
     for _, row in df.iterrows():
                     summary = sanitize_cell_value(row[column])
                     details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                     html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
+            elif column == "Reproduced_all" or column == "std_err":
                 continue
             elif column == "Score":
                 score_with_std_err = f'{row[column]} ± {row["std_err"]}'

results/Bgym-GPT-3.5/README.md DELETED Viewed

	@@ -1 +0,0 @@
1	- ## GPT-3.5 model

results/Bgym-GPT-3.5/config.json DELETED Viewed

@@ -1,4 +0,0 @@
-{
-    "agent_name": "GPT-3.5",
-    "backend_llm": "GPT-3.5"
-}

results/Bgym-GPT-3.5/miniwob.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "MiniWoB",
-        "score": 43.4,
-        "std_err": 0.1,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-3.5/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WebArena",
-        "score": 6.7,
-        "std_err": 0.2,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-3.5/workarena-l1.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L1",
-        "score": 6.1,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-3.5/workarena-l2.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-3.5",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-4o-V/README.md DELETED Viewed

	@@ -1 +0,0 @@
1	- ## GPT-4o-V model

results/Bgym-GPT-4o-V/miniwob.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-4o-V",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "MiniWoB",
-        "score": 72.5,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-4o-V/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-4o-V",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WebArena",
-        "score": 24.0,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-4o-V/workarena-l1.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-4o-V",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L1",
-        "score": 41.8,
-        "std_err": 0.4,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-4o-V/workarena-l2.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-4o-V",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L2",
-        "score": 3.8,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-4o-V/workarena-l3.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-4o-V",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-GPT-o1-mini/workarena-l3.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-GPT-o1-mini",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Llama-3-70b/README.md DELETED Viewed

	@@ -1 +0,0 @@
1	- ### Llama-3-70B

results/Bgym-Llama-3-70b/miniwob.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "MiniWoB",
-        "score": 68.2,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Llama-3-70b/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WebArena",
-        "score": 11.0,
-        "std_err": 0.3,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Llama-3-70b/workarena-l1.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 17.9,
-        "std_err": 0.6,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-01 12:00:00"
-    }
-]

results/Bgym-Llama-3-70b/workarena-l2.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Llama-3-70b/workarena-l3.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Llama-3-70b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Mixtral-8x22b/README.md DELETED Viewed

	@@ -1 +0,0 @@
1	- ## Mixtral 8x22B

results/Bgym-Mixtral-8x22b/miniwob.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "MiniWoB",
-        "score": 62.4,
-        "std_err": 0.5,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Mixtral-8x22b/webarena.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WebArena",
-        "score": 12.6,
-        "std_err": 0.9,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Mixtral-8x22b/workarena-l1.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "benchmark": "WorkArena-L1",
-        "score": 12.4,
-        "std_err": 0.7,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original",
-        "date_time": "2021-01-04 12:06:00"
-    }
-]

results/Bgym-Mixtral-8x22b/workarena-l2.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L2",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/Bgym-Mixtral-8x22b/workarena-l3.json DELETED Viewed

@@ -1,16 +0,0 @@
-[
-    {
-        "agent_name": "Bgym-Mixtral-8x22b",
-        "study_id": "study_id",
-        "date_time": "2021-01-01 12:00:00",
-        "benchmark": "WorkArena-L3",
-        "score": 0.0,
-        "std_err": 0.0,
-        "benchmark_specific": "No",
-        "benchmark_tuned": "No",
-        "followed_evaluation_protocol": "Yes",
-        "reproducible": "Yes",
-        "comments": "NA",
-        "original_or_reproduced": "Original"
-    }
-]

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/README.md RENAMED Viewed

File without changes

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/assistantbench.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "AssistantBench",
         "score": 5.2,

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "AssistantBench",
         "score": 5.2,

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/miniwob.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "MiniWoB",
         "score": 69.8,

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "MiniWoB",
         "score": 69.8,

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/webarena.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WebArena",
         "score": 36.2,

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WebArena",
         "score": 36.2,

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/weblinx.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WebLINX",
         "score": 13.7,

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WebLINX",
         "score": 13.7,

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l1.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WorkArena-L1",
         "score": 56.4,

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "benchmark": "WorkArena-L1",
         "score": 56.4,

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l2.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

results/{Bgym-Claude-3.5-Sonnet → GenericAgent-Claude-3.5-Sonnet}/workarena-l3.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",

 [
     {
+        "agent_name": "GenericAgent-Claude-3.5-Sonnet",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/README.md RENAMED Viewed

File without changes

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/assistantbench.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "AssistantBench",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "AssistantBench",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/miniwob.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/webarena.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/weblinx.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebLINX",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebLINX",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l1.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l2.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

results/{Bgym-GPT-4o-mini → GenericAgent-GPT-4o-mini}/workarena-l3.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o-mini",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/README.md RENAMED Viewed

File without changes

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/assistantbench.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "AssistantBench",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "AssistantBench",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/miniwob.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "MiniWoB",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/webarena.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebArena",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/weblinx.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebLINX",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WebLINX",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l1.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L1",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l2.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L2",

results/{Bgym-GPT-4o → GenericAgent-GPT-4o}/workarena-l3.json RENAMED Viewed

@@ -1,6 +1,6 @@
 [
     {
-        "agent_name": "Bgym-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",

 [
     {
+        "agent_name": "GenericAgent-GPT-4o",
         "study_id": "study_id",
         "date_time": "2021-01-01 12:00:00",
         "benchmark": "WorkArena-L3",