ZebraLogic

Runtime error

App Files Files Community

yuchenlin committed on Jul 12, 2024

Commit

3d2e59d

1 Parent(s): d74dfe0

explore data

Browse files

Files changed (8) hide show

.gitignore +1 -0
_header.md +2 -2
app.py +48 -9
constants.py +14 -9
data_utils.py +93 -5
eval_utils.py +217 -0
update_data.sh +3 -2
zebra_banner.png +0 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 *.pyc
 *.DS_Store

 *.pyc
 *.DS_Store
+ZeroEval-main/result_dirs/zebra-grid/

_header.md CHANGED Viewed

@@ -1,5 +1,5 @@
 <br/>
-# 🦁 WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild
-[📑 Paper](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X](https://x.com/billyuchenlin/status/1795746137875554531) | [💬 Discussion](https://huggingface.co/spaces/allenai/WildBench/discussions) | ⚙️ **Version**: **V2** | **# Models**: {model_num} | Updated: **{LAST_UPDATED}**

 <br/>
+# 🦓 ZebraLogic Bench: Testing the Limits of LLMs in Logical Reasoning
+[📑 Blog](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X]() | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**

app.py CHANGED Viewed

@@ -18,7 +18,7 @@ import os, uuid
 from utils_display import model_info
 from constants import column_names,  LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
-from data_utils import post_processing
 # get the last updated time from the elo_ranks.all.jsonl file
 LAST_UPDATED = None
@@ -34,6 +34,7 @@ with open("_header.md", "r") as f:
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
 original_df = None
 # available_models = [] # to be filled in later
 available_models = list(model_info.keys())
@@ -89,7 +90,44 @@ def _tab_leaderboard():
         mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
 def _tab_submit():
     pass
@@ -101,13 +139,14 @@ def build_demo():
         gr.HTML(BANNER, elem_id="banner")
         # convert LAST_UPDATED to the PDT time
         LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
-        # header_md_text = HEADER_MD.replace("{model_num}", str(len(original_df["-1"]))).replace("{LAST_UPDATED}", str(LAST_UPDATED))
-        # gr.Markdown(header_md_text, elem_classes="markdown-text")
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                 _tab_leaderboard()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                 _tab_submit()
@@ -129,7 +168,7 @@ def build_demo():
 def data_load(result_file):
-    global original_df
     print(f"Loading {result_file}")
     column_names_main = column_names.copy()
     # column_names_main.update({})
@@ -137,15 +176,15 @@ def data_load(result_file):
     click_url = True
     # read json file from the result_file
     with open(result_file, "r") as f:
-        data = json.load(f)
     # floatify the data, if possible
-    for d in data:
         for k, v in d.items():
             try:
                 d[k] = float(v)
             except:
                 pass
-    original_df = pd.DataFrame(data)
     original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
     # print(original_df.columns)

 from utils_display import model_info
 from constants import column_names,  LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
+from data_utils import post_processing, get_random_item
 # get the last updated time from the elo_ranks.all.jsonl file
 LAST_UPDATED = None
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
+raw_data = None
 original_df = None
 # available_models = [] # to be filled in later
 available_models = list(model_info.keys())
         mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
def sample_explore_item(model_name, size_H, size_W, greedy_or_sample):
    """Sample one puzzle/prediction pair and render it as four markdown strings.

    Args:
        model_name: Model to sample from, or "random".
        size_H: Number of houses as selected in the UI, or "random".
        size_W: Number of features as selected in the UI, or "random".
        greedy_or_sample: Decoding mode selected in the UI; only logged here.

    Returns:
        A 4-tuple ``(puzzle_md, reasoning_md, prediction_md, eval_md)``.
        Every element is "No item found" when nothing could be sampled.
    """
    print(model_name, size_H, size_W, greedy_or_sample)
    explore_item = get_random_item(model_name, size_H, size_W)
    if explore_item is None:
        return ("No item found",) * 4

    # The sampled item carries the concrete model name (relevant for "random").
    sampled_model = explore_item['Model']
    puzzle_id = explore_item['id']

    # Demote the clue heading and keep the puzzle's line breaks in markdown.
    puzzle_body = explore_item['puzzle'].replace("## Clues", "### Clues").replace("\n", "<br>")
    puzzle_md = f"### Puzzle [{puzzle_id}]:\n\n" + puzzle_body

    model_reasoning_md = f"### {sampled_model}'s Reasoning:\n\n {explore_item['reasoning']}"

    prediction_header = f"### {sampled_model}'s Prediction:\n\n {explore_item['solution']}"
    model_prediction_md = prediction_header + "\n\n" + explore_item['solution_table_md']

    correct = explore_item['correct_cells']
    total = explore_item['total_cells']
    puzzle_solved = correct == total
    cell_acc = correct / total * 100
    model_eval_md = f"### Evaluation:\n\n  **Total Cells**: {total} | **Correct Cells**: {correct} | **Puzzle solved**: {puzzle_solved} | **Cell Acc**: {cell_acc:.2f}%"

    return puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md
def _tab_explore():
    """Build the "Explore" tab: selectors, a sample button and four output panes.

    The model dropdown is filled from the module-level ``raw_data`` (the rows
    of the summary file). Clicking the button calls ``sample_explore_item``
    and writes its four markdown strings into the panes.
    """
    # The summary may hold several rows per model (one per decoding mode), so
    # de-duplicate while keeping the original order. Fall back to an empty
    # list when the data has not been loaded yet instead of crashing.
    model_names = list(dict.fromkeys(item["Model"] for item in (raw_data or [])))
    size_choices = ["random"] + [f"{i}" for i in range(2, 7)]

    with gr.Row():
        model_selection = gr.Dropdown(choices=["random"] + model_names, label="Model: ", elem_id="select-models", value="random", interactive=True)
        size_H_selection = gr.Dropdown(choices=list(size_choices), label="Num of Houses", elem_id="select-H", value="random", interactive=True)
        size_W_selection = gr.Dropdown(choices=list(size_choices), label="Num of Features", elem_id="select-W", value="random", interactive=True)
        with gr.Column(scale=1):
            greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
            explore_button = gr.Button("Sample", elem_id="explore-button")

    # Placeholder panes; their content is replaced on every button click.
    puzzle_md = gr.Markdown("\n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
    model_reasoning_md = gr.Markdown("\n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
    model_prediction_md = gr.Markdown("\n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
    model_eval_md = gr.Markdown("\n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")

    explore_button.click(fn=sample_explore_item,
                         inputs=[model_selection, size_H_selection, size_W_selection, greedy_or_sample],
                         outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md])
 def _tab_submit():
     pass
         gr.HTML(BANNER, elem_id="banner")
         # convert LAST_UPDATED to the PDT time
         LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
+        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
         with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                 _tab_leaderboard()
+            with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
+                _tab_explore()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                 _tab_submit()
 def data_load(result_file):
+    global raw_data, original_df
     print(f"Loading {result_file}")
     column_names_main = column_names.copy()
     # column_names_main.update({})
     click_url = True
     # read json file from the result_file
     with open(result_file, "r") as f:
+        raw_data = json.load(f)
     # floatify the data, if possible
+    for d in raw_data:
         for k, v in d.items():
             try:
                 d[k] = float(v)
             except:
                 pass
+    original_df = pd.DataFrame(raw_data)
     original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
     # print(original_df.columns)

constants.py CHANGED Viewed

@@ -5,20 +5,17 @@ DEFAULT_K = "∞"
 # DEFAULT_K = "1500"
 banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
-BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 50vw; min-width: 300px; max-width: 800px;border: 3px solid gray; border-color: gray black;"> </div>'
 TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
 WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
-CITATION_TEXT = """@article{lin2024wildbench,
-    title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
-    author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},
-    year={2024},
-    eprint={2406.04770},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL},
-    url={https://arxiv.org/abs/2406.04770}
 }
 """
@@ -279,5 +276,13 @@ button.selected[role="tab"][aria-selected="true"] {
     font-size: 12pt;
     font-decoration: bold;
 }
 """

 # DEFAULT_K = "1500"
 banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
+BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
 WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
+CITATION_TEXT = """@article{tbd,
+    title={tbd},
+    author={tbd},
+    journal={tbd},
+    year={2024}
 }
 """
     font-size: 12pt;
     font-decoration: bold;
 }
+.box_md{
+    border: 1px solid #000000;
+    border-radius: 10px;
+    padding: 5px;
+    font-size: 12pt;
+    margin: 5px;
+}
 """

data_utils.py CHANGED Viewed

@@ -11,12 +11,13 @@ import math
 import json
 from tqdm import tqdm
 import numpy as np
-id_to_data = None
-model_len_info = None
-bench_data = None
-eval_results = None
-score_eval_results = None
 # Formats the columns
 def formatter(x):
@@ -41,3 +42,90 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column
         df.sort_values(by=rank_column, inplace=True, ascending=False)
     return df

 import json
 from tqdm import tqdm
 import numpy as np
+import os
+from eval_utils import *
+summary_file = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
+result_dir = "ZeroEval-main/result_dirs/zebra-grid/"
+results_by_model = {}
 # Formats the columns
 def formatter(x):
         df.sort_values(by=rank_column, inplace=True, ascending=False)
     return df
def load_all_data():
    """Load every model's raw results into ``results_by_model``.

    Model names are read from ``summary_file``. Each model's result file is
    looked up in ``result_dir`` and downloaded from the ZeroEval GitHub
    repository when it is missing or empty. A model whose download fails is
    reported and skipped, so one unavailable file does not block the rest.
    """
    # Local imports keep this block self-contained.
    import urllib.parse
    import urllib.request

    base_url = "https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/"

    with open(summary_file, "r") as f:
        model_summary = json.load(f)

    os.makedirs(result_dir, exist_ok=True)

    # The summary can list a model more than once; load each file only once.
    for model_name in dict.fromkeys(model["Model"] for model in model_summary):
        output_file = os.path.join(result_dir, f"{model_name}.json")
        # An empty file is the leftover of an earlier failed download.
        if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            download_url = base_url + urllib.parse.quote(f"{model_name}.json")
            partial_file = output_file + ".part"
            try:
                # Download to a side file first so that an interrupted
                # transfer never looks like a valid cached result.
                urllib.request.urlretrieve(download_url, partial_file)
            except OSError as err:  # URLError / HTTPError are OSError subclasses
                print(f"Failed to download {model_name}.json: {err}")
                if os.path.exists(partial_file):
                    os.remove(partial_file)
                continue
            os.replace(partial_file, output_file)
            print(f"Downloaded {model_name}.json")
        with open(output_file, "r") as f:
            print(f"Loading {output_file}")
            results_by_model[model_name] = json.load(f)
def get_random_item(model_name="random", size_H="random", size_W="random"):
    """Pick a random puzzle with a parseable prediction for the Explore tab.

    Args:
        model_name: Key of ``results_by_model`` to sample from, or "random".
        size_H: Number of houses (str or int), or "random" for 2..6.
        size_W: Number of features (str or int), or "random" for 2..6.

    Returns:
        A dict with the keys ``id``, ``Model``, ``size``, ``puzzle``,
        ``solution``, ``reasoning``, ``solution_table_md``, ``correct_cells``
        and ``total_cells``; or ``None`` when the model is unknown or no item
        of the requested size has a usable prediction.
    """
    # Local import: the module's import block is not fully visible from here.
    import random

    if not results_by_model:
        load_all_data()
    if not results_by_model:
        print("No item found!")
        return None

    if model_name == "random":
        model_name = random.choice(list(results_by_model))
    data = results_by_model.get(model_name)
    if not data:
        print("No item found!")
        return None

    size_H_choice = random.choice(range(2, 7)) if size_H == "random" else size_H
    size_W_choice = random.choice(range(2, 7)) if size_W == "random" else size_W
    size_prefix = f"{size_H_choice}*{size_W_choice}"

    selected_item = None
    prediction_table = None
    prediction_reasoning = None
    # random.sample returns a shuffled copy, leaving the cached list untouched.
    for item in random.sample(data, len(data)):
        if not item["size"].startswith(size_prefix):
            continue
        outputs = item.get("output") or []
        if not outputs or not isinstance(outputs[0], str):
            continue
        prediction_json = extract_last_complete_json(outputs[0])
        if prediction_json is None or "solution" not in prediction_json:
            continue
        candidate_table = prediction_json["solution"]
        # The table rendering below reads its header from "House 1", so skip
        # predictions that do not have that shape.
        if not isinstance(candidate_table, dict) or not isinstance(candidate_table.get("House 1"), dict):
            continue
        selected_item = item
        prediction_table = candidate_table
        prediction_reasoning = prediction_json.get("reasoning", "")
        break

    if selected_item is None:
        print("No item found!")
        return None

    explore_item = {
        "id": selected_item["id"],
        "Model": model_name,
        "size": selected_item["size"],
        "puzzle": selected_item["puzzle"],
        "solution": prediction_table,
        "reasoning": prediction_reasoning,
    }

    # Render the predicted grid as a markdown table, one row per house.
    headers = ["Houses"] + list(prediction_table["House 1"].keys())
    rows = []
    for house_number in range(1, len(prediction_table) + 1):
        house = prediction_table.get(f"House {house_number}")
        if not isinstance(house, dict):
            house = {}  # missing or malformed house: render blank cells
        rows.append([house_number] + [house.get(feature, "") for feature in headers[1:]])
    explore_item["solution_table_md"] = tabulate(rows, headers=headers, tablefmt="github")

    this_total_cells, this_correct_cells = eval_each_puzzle(explore_item["id"], prediction_table)
    explore_item["correct_cells"] = this_correct_cells
    explore_item["total_cells"] = this_total_cells
    return explore_item
+if __name__ == "__main__":
+    load_all_data()
+    print("All data downloaded!")
+    print(json.dumps(get_random_item(model_name="gemini-1.5-pro", size_H="2", size_W="5"), indent=2))

eval_utils.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import json
+from collections import defaultdict
+import os
+from tabulate import tabulate
+from datasets import load_dataset
+private_solutions = {}
def load_private_solutions():
    """Fill ``private_solutions`` with the reference solution of every puzzle.

    Reads the "test" split of the "grid_mode" configuration of
    ``allenai/ZebraLogicBench-private`` and maps each item's ``id`` to its
    ``solution``. Returns ``None``; the module-level dict is updated in place.
    """
    global private_solutions
    dataset = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
    private_solutions.update({entry["id"]: entry["solution"] for entry in dataset})
def load_model_results(run_name_folders):
    """Map ``"<model>%<run_name>"`` to the path of each model's result file.

    Args:
        run_name_folders: Dict from run name (e.g. "greedy") to a folder that
            contains one ``<model>.json`` file per model.

    Returns:
        Dict from ``"<model>%<run_name>"`` to the file path. Entries that are
        not ``.json`` files are ignored, and folders that do not exist are
        reported and skipped.
    """
    suffix = ".json"
    model_results = {}
    for run_name, folder in run_name_folders.items():
        if not os.path.isdir(folder):
            print(f"Skipping missing folder: {folder}")
            continue
        # Sorted so the result order does not depend on the file system.
        for filename in sorted(os.listdir(folder)):
            filepath = os.path.join(folder, filename)
            if not filename.endswith(suffix) or not os.path.isfile(filepath):
                continue
            # Strip only the trailing extension; a model name may itself
            # contain ".json" elsewhere.
            model_name = filename[:-len(suffix)]
            model_results[f"{model_name}%{run_name}"] = filepath
    return model_results
def extract_last_complete_json(s):
    """Return the last brace-balanced JSON object embedded in ``s``.

    The text is scanned for balanced ``{...}`` spans. Of the outermost
    balanced spans, the last one is parsed with newlines removed. An opening
    brace that is never closed does not hide complete objects that follow it.

    Args:
        s: Raw model output; may be empty or ``None``.

    Returns:
        The parsed object, or ``None`` when there is no balanced span or the
        last one is not valid JSON.
    """
    if not s:
        return None

    open_positions = []   # indices of '{' still waiting for their '}'
    balanced_spans = []   # outermost balanced (start, end) slices, in text order
    for index, char in enumerate(s):
        if char == '{':
            open_positions.append(index)
        elif char == '}' and open_positions:
            start = open_positions.pop()
            # Spans recorded after `start` lie inside the span that just
            # closed; keep only the enclosing one.
            while balanced_spans and balanced_spans[-1][0] > start:
                balanced_spans.pop()
            balanced_spans.append((start, index + 1))

    if not balanced_spans:
        return None

    start, end = balanced_spans[-1]
    try:
        # Newlines are removed before parsing, as in the original code.
        return json.loads(s[start:end].replace("\n", ""))
    except json.JSONDecodeError:
        return None
def _normalize_predicted_cell(value):
    """Return a predicted cell as a lowercase, stripped string, or ``None``.

    A list is represented by its first element. ``None``, empty values and
    anything that is not a string count as "no usable prediction".
    """
    if isinstance(value, list):
        value = value[0] if value else None
    if not isinstance(value, str) or not value:
        return None
    return value.lower().strip()


def eval_each_puzzle(id, prediction_table):
    """Count how many cells of one predicted grid match the private solution.

    Args:
        id: Puzzle id used as key into ``private_solutions`` (the name shadows
            the builtin but is kept for existing callers).
        prediction_table: Predicted grid, ``{"House N": {feature: value}}``.
            A value may be a string or a list whose first element is used.

    Returns:
        ``(total_cells, correct_cells)``. Missing houses or features,
        malformed rows and unusable cell values count as incorrect.

    Raises:
        KeyError: If ``id`` has no private solution.
    """
    global private_solutions
    if not private_solutions:
        load_private_solutions()
    solution = private_solutions[id]

    columns = solution["header"]
    assert columns[0] == "House"

    # Rebuild the reference grid in the same shape as the prediction; column 0
    # of every row holds the house itself and is not scored.
    solution_table = {}
    for i, row in enumerate(solution["rows"]):
        solution_table[f'House {i+1}'] = {columns[j]: row[j] for j in range(1, len(columns))}
    this_total_cells = len(solution_table) * (len(columns) - 1)

    if not isinstance(prediction_table, dict):
        prediction_table = {}

    this_correct_cells = 0
    for house, truth_row in solution_table.items():
        predicted_row = prediction_table.get(house)
        if not isinstance(predicted_row, dict):
            continue
        for column, truth in truth_row.items():
            predicted_cell = _normalize_predicted_cell(predicted_row.get(column))
            if predicted_cell is None:
                continue
            if truth.lower().strip() == predicted_cell:
                this_correct_cells += 1
    return this_total_cells, this_correct_cells
def eval_model(model, filepath):
    """Score one model's result file against the private solutions.

    Args:
        model: Key of the form ``"<model name>%<run name>"``.
        filepath: JSON file with a list of items, each holding ``id``,
            ``size`` and ``output`` (a list whose first element is the raw
            model response).

    Returns:
        Dict with the keys "Model", "Mode", "Puzzle Acc", "Cell Acc",
        "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles" and
        "Reason Lens". Ratios are strings with two decimals; a ratio whose
        denominator is zero is reported as "0.00".

    Raises:
        ValueError: If a predicted cell is neither ``None``, a list nor a str.
    """
    global private_solutions
    if not private_solutions:
        load_private_solutions()

    def _fmt_ratio(numerator, denominator, scale=100):
        # Empty files or empty size buckets must not raise ZeroDivisionError.
        if not denominator:
            return "0.00"
        return f"{numerator / denominator * scale:.2f}"

    with open(filepath, "r") as f:
        print(f"Processing {filepath}")
        data = json.load(f)

    solved_puzzles = 0
    num_total_puzzles = len(data)
    correct_cells = 0
    total_cells = 0
    no_answer = 0
    num_total_puzzles_by_size = defaultdict(int)
    solved_puzzles_by_size = defaultdict(int)
    reason_lens = []

    for item in data:
        solution = private_solutions[item["id"]]
        size = item["size"]
        num_total_puzzles_by_size[size] += 1

        # Rebuild the reference grid; column 0 of every row is the house.
        columns = solution["header"]
        assert columns[0] == "House"
        solution_table = {}
        for i, row in enumerate(solution["rows"]):
            solution_table[f'House {i+1}'] = {columns[j]: row[j] for j in range(1, len(columns))}
        this_total_cells = len(solution_table) * (len(columns) - 1)
        total_cells += this_total_cells

        # Parse the prediction out of the raw model output.
        prediction_json = extract_last_complete_json(item["output"][0])
        if prediction_json is None or "solution" not in prediction_json:
            # Unparseable answers still count towards the cell total above.
            no_answer += 1
            continue
        reason = prediction_json.get("reasoning", "")
        prediction_table = prediction_json["solution"]
        reason_lens.append(len(reason))

        this_correct_cells = 0
        for house in solution_table:
            for column in solution_table[house]:
                if house in prediction_table and column in prediction_table[house]:
                    truth_cell = solution_table[house][column].lower().strip()
                    predicted = prediction_table[house][column]
                    if predicted is None:
                        continue
                    if isinstance(predicted, list):
                        if not predicted:
                            continue
                        predicted_cell = predicted[0].lower().strip()
                    elif isinstance(predicted, str):
                        if not predicted:
                            continue
                        predicted_cell = predicted.lower().strip()
                    else:
                        raise ValueError(f"Unknown type: {type(predicted)}")
                    if truth_cell == predicted_cell:
                        this_correct_cells += 1
        correct_cells += this_correct_cells

        # A puzzle is solved only when every cell is right.
        if this_correct_cells == this_total_cells:
            solved_puzzles += 1
            solved_puzzles_by_size[size] += 1

    easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3']
    hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
    easy_solved_puzzles = sum(solved_puzzles_by_size[size] for size in easy_sizes)
    easy_total_puzzles = sum(num_total_puzzles_by_size[size] for size in easy_sizes)
    hard_solved_puzzles = sum(solved_puzzles_by_size[size] for size in hard_sizes)
    hard_total_puzzles = sum(num_total_puzzles_by_size[size] for size in hard_sizes)

    # partition tolerates a key without "%" (the mode is then empty).
    model_name, _, mode = model.partition("%")
    result = {}
    result["Model"] = model_name
    result["Mode"] = mode
    result["Puzzle Acc"] = _fmt_ratio(solved_puzzles, num_total_puzzles)
    result["Cell Acc"] = _fmt_ratio(correct_cells, total_cells)
    result["No answer"] = _fmt_ratio(no_answer, num_total_puzzles)
    result["Easy Puzzle Acc"] = _fmt_ratio(easy_solved_puzzles, easy_total_puzzles)
    result["Hard Puzzle Acc"] = _fmt_ratio(hard_solved_puzzles, hard_total_puzzles)
    result["Total Puzzles"] = num_total_puzzles
    # Mean character length of the "reasoning" field over parsed answers.
    result["Reason Lens"] = _fmt_ratio(sum(reason_lens), len(reason_lens), scale=1)
    return result
def gen_results(run_name_folders, output_file="result_dirs/zebra-grid.summary.json"):
    """Evaluate every model, print a ranking table and write a JSON summary.

    Args:
        run_name_folders: Dict from run name to the folder holding that run's
            per-model result files (see ``load_model_results``).
        output_file: Where the summary is written. Defaults to the path that
            was previously hard-coded.
    """
    model_results = load_model_results(run_name_folders)
    columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]

    rows = [eval_model(model_name, filepath) for model_name, filepath in model_results.items()]
    # Best puzzle accuracy first; the values are formatted strings.
    rows = sorted(rows, key=lambda x: -float(x["Puzzle Acc"]))

    # tabulate is given a list of lists in column order.
    table_data = [[row[col] for col in columns] for row in rows]
    print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w") as f:
        json.dump(rows, f, indent=2)
+if __name__ == "__main__":
+    run_name_folders = {
+        "greedy": "result_dirs/zebra-grid",
+        "sampling": "result_dirs/zebra-grid/sampling",
+    }
+    load_private_solutions()
+    gen_results(run_name_folders)

update_data.sh CHANGED Viewed

@@ -1,4 +1,5 @@
 # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
-mkdir -p ZeroEval-main/result_dirs
-wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json

 # Download the summary file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
 # and save it to ZeroEval-main/result_dirs/zebra-grid.summary.json; also fetch the per-model results for deepseek-chat.
+mkdir -p ZeroEval-main/result_dirs/zebra-grid/
+wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
+wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json

zebra_banner.png CHANGED Viewed