Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	explore data
Browse files- .gitignore +1 -0
 - _header.md +2 -2
 - app.py +48 -9
 - constants.py +14 -9
 - data_utils.py +93 -5
 - eval_utils.py +217 -0
 - update_data.sh +3 -2
 - zebra_banner.png +0 -0
 
    	
        .gitignore
    CHANGED
    
    | 
         @@ -1,3 +1,4 @@ 
     | 
|
| 1 | 
         | 
| 2 | 
         
             
            *.pyc 
         
     | 
| 3 | 
         
             
            *.DS_Store
         
     | 
| 
         | 
| 
         | 
|
| 1 | 
         | 
| 2 | 
         
             
            *.pyc 
         
     | 
| 3 | 
         
             
            *.DS_Store
         
     | 
| 4 | 
         
            +
            ZeroEval-main/result_dirs/zebra-grid/
         
     | 
    	
        _header.md
    CHANGED
    
    | 
         @@ -1,5 +1,5 @@ 
     | 
|
| 1 | 
         
             
            <br/>
         
     | 
| 2 | 
         | 
| 3 | 
         
            -
            #  
     | 
| 4 | 
         
            -
            [📑  
     | 
| 5 | 
         | 
| 
         | 
|
| 1 | 
         
             
            <br/>
         
     | 
| 2 | 
         | 
| 3 | 
         
            +
            # 🦓 ZebraLogic Bench: Testing the Limits of LLMs in Logical Reasoning
         
     | 
| 4 | 
         
            +
            [📑 Blog](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X]() | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
         
     | 
| 5 | 
         | 
    	
        app.py
    CHANGED
    
    | 
         @@ -18,7 +18,7 @@ import os, uuid 
     | 
|
| 18 | 
         
             
            from utils_display import model_info
         
     | 
| 19 | 
         
             
            from constants import column_names,  LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
         
     | 
| 20 | 
         
             
            import pytz
         
     | 
| 21 | 
         
            -
            from data_utils import post_processing
         
     | 
| 22 | 
         | 
| 23 | 
         
             
            # get the last updated time from the elo_ranks.all.jsonl file
         
     | 
| 24 | 
         
             
            LAST_UPDATED = None 
         
     | 
| 
         @@ -34,6 +34,7 @@ with open("_header.md", "r") as f: 
     | 
|
| 34 | 
         
             
            with open("_metrics.md", "r") as f:
         
     | 
| 35 | 
         
             
                METRICS_MD = f.read()
         
     | 
| 36 | 
         | 
| 
         | 
|
| 37 | 
         
             
            original_df = None  
         
     | 
| 38 | 
         
             
            # available_models = [] # to be filled in later
         
     | 
| 39 | 
         
             
            available_models = list(model_info.keys()) 
         
     | 
| 
         @@ -89,7 +90,44 @@ def _tab_leaderboard(): 
     | 
|
| 89 | 
         
             
                    mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
         
     | 
| 90 | 
         | 
| 91 | 
         | 
| 92 | 
         
            -
             
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 93 | 
         
             
            def _tab_submit():
         
     | 
| 94 | 
         
             
                pass
         
     | 
| 95 | 
         | 
| 
         @@ -101,13 +139,14 @@ def build_demo(): 
     | 
|
| 101 | 
         
             
                    gr.HTML(BANNER, elem_id="banner")
         
     | 
| 102 | 
         
             
                    # convert LAST_UPDATED to the PDT time 
         
     | 
| 103 | 
         
             
                    LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
         
     | 
| 104 | 
         
            -
                     
     | 
| 105 | 
         
            -
                     
     | 
| 106 | 
         | 
| 107 | 
         
             
                    with gr.Tabs(elem_classes="tab-buttons") as tabs: 
         
     | 
| 108 | 
         
             
                        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
         
     | 
| 109 | 
         
             
                            _tab_leaderboard() 
         
     | 
| 110 | 
         
            -
             
     | 
| 
         | 
|
| 111 | 
         
             
                        with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
         
     | 
| 112 | 
         
             
                            _tab_submit() 
         
     | 
| 113 | 
         | 
| 
         @@ -129,7 +168,7 @@ def build_demo(): 
     | 
|
| 129 | 
         | 
| 130 | 
         | 
| 131 | 
         
             
            def data_load(result_file):
         
     | 
| 132 | 
         
            -
                global original_df
         
     | 
| 133 | 
         
             
                print(f"Loading {result_file}")
         
     | 
| 134 | 
         
             
                column_names_main = column_names.copy()
         
     | 
| 135 | 
         
             
                # column_names_main.update({})
         
     | 
| 
         @@ -137,15 +176,15 @@ def data_load(result_file): 
     | 
|
| 137 | 
         
             
                click_url = True 
         
     | 
| 138 | 
         
             
                # read json file from the result_file 
         
     | 
| 139 | 
         
             
                with open(result_file, "r") as f:
         
     | 
| 140 | 
         
            -
                     
     | 
| 141 | 
         
             
                # floatify the data, if possible
         
     | 
| 142 | 
         
            -
                for d in  
     | 
| 143 | 
         
             
                    for k, v in d.items():
         
     | 
| 144 | 
         
             
                        try:
         
     | 
| 145 | 
         
             
                            d[k] = float(v)
         
     | 
| 146 | 
         
             
                        except:
         
     | 
| 147 | 
         
             
                            pass
         
     | 
| 148 | 
         
            -
                original_df = pd.DataFrame( 
     | 
| 149 | 
         
             
                original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
         
     | 
| 150 | 
         
             
                # print(original_df.columns) 
         
     | 
| 151 | 
         | 
| 
         | 
|
| 18 | 
         
             
            from utils_display import model_info
         
     | 
| 19 | 
         
             
            from constants import column_names,  LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
         
     | 
| 20 | 
         
             
            import pytz
         
     | 
| 21 | 
         
            +
            from data_utils import post_processing, get_random_item
         
     | 
| 22 | 
         | 
| 23 | 
         
             
            # get the last updated time from the elo_ranks.all.jsonl file
         
     | 
| 24 | 
         
             
            LAST_UPDATED = None 
         
     | 
| 
         | 
|
| 34 | 
         
             
            with open("_metrics.md", "r") as f:
         
     | 
| 35 | 
         
             
                METRICS_MD = f.read()
         
     | 
| 36 | 
         | 
| 37 | 
         
            +
            raw_data = None 
         
     | 
| 38 | 
         
             
            original_df = None  
         
     | 
| 39 | 
         
             
            # available_models = [] # to be filled in later
         
     | 
| 40 | 
         
             
            available_models = list(model_info.keys()) 
         
     | 
| 
         | 
|
| 90 | 
         
             
                    mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
         
     | 
| 91 | 
         | 
| 92 | 
         | 
| 93 | 
         
            +
            def sample_explore_item(model_name, size_H, size_W, greedy_or_sample):
         
     | 
| 94 | 
         
            +
                print(model_name, size_H, size_W, greedy_or_sample)
         
     | 
| 95 | 
         
            +
                explore_item = get_random_item(model_name, size_H, size_W)
         
     | 
| 96 | 
         
            +
                if explore_item is None:
         
     | 
| 97 | 
         
            +
                    return "No item found", "No item found", "No item found", "No item found"
         
     | 
| 98 | 
         
            +
                model_name = explore_item['Model']
         
     | 
| 99 | 
         
            +
                example_id = explore_item['id']
         
     | 
| 100 | 
         
            +
                puzzle_md = f"### Puzzle [{example_id}]:\n\n" + explore_item['puzzle'].replace("## Clues", "### Clues").replace("\n", "<br>")
         
     | 
| 101 | 
         
            +
                model_reasoning_md = f"### {model_name}'s Reasoning:\n\n {explore_item['reasoning']}"
         
     | 
| 102 | 
         
            +
                model_prediction_md = f"### {model_name}'s Prediction:\n\n {explore_item['solution']}"  + "\n\n" + explore_item['solution_table_md']
         
     | 
| 103 | 
         
            +
                puzzle_solved = explore_item['correct_cells'] == explore_item['total_cells']
         
     | 
| 104 | 
         
            +
                cell_acc = explore_item["correct_cells"] / explore_item["total_cells"] * 100
         
     | 
| 105 | 
         
            +
                model_eval_md = f"### Evaluation:\n\n  **Total Cells**: {explore_item['total_cells']} | **Correct Cells**: {explore_item['correct_cells']} | **Puzzle solved**: {puzzle_solved} | **Cell Acc**: {cell_acc:.2f}%"
         
     | 
| 106 | 
         
            +
                return puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md
         
     | 
| 107 | 
         
            +
             
     | 
| 108 | 
         
            +
             
     | 
| 109 | 
         
            +
            def _tab_explore():
         
     | 
| 110 | 
         
            +
                global raw_data
         
     | 
| 111 | 
         
            +
                model_names = [item["Model"] for item in raw_data]
         
     | 
| 112 | 
         
            +
                with gr.Row():
         
     | 
| 113 | 
         
            +
                    model_selection = gr.Dropdown(choices = ["random"] + model_names, label="Model: ", elem_id="select-models", value="random", interactive=True)
         
     | 
| 114 | 
         
            +
                    size_H_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Houses", elem_id="select-H", value="random", interactive=True)
         
     | 
| 115 | 
         
            +
                    size_W_selection = gr.Dropdown(choices = ["random"] + [f"{i}" for i in range(2,7)], label="Num of Features", elem_id="select-W", value="random", interactive=True)
         
     | 
| 116 | 
         
            +
                    with gr.Column(scale=1):
         
     | 
| 117 | 
         
            +
                        greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
         
     | 
| 118 | 
         
            +
                        explore_button = gr.Button("Sample", elem_id="explore-button")
         
     | 
| 119 | 
         
            +
                
         
     | 
| 120 | 
         
            +
                puzzle_md = gr.Markdown("\n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
         
     | 
| 121 | 
         
            +
                model_reasoning_md = gr.Markdown("\n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
         
     | 
| 122 | 
         
            +
                model_prediction_md = gr.Markdown("\n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
         
     | 
| 123 | 
         
            +
                model_eval_md = gr.Markdown("\n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
         
     | 
| 124 | 
         
            +
                
         
     | 
| 125 | 
         
            +
                explore_button.click(fn=sample_explore_item, 
         
     | 
| 126 | 
         
            +
                                     inputs=[model_selection, size_H_selection, size_W_selection, greedy_or_sample], 
         
     | 
| 127 | 
         
            +
                                     outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md])
         
     | 
| 128 | 
         
            +
             
     | 
| 129 | 
         
            +
             
     | 
| 130 | 
         
            +
             
     | 
| 131 | 
         
             
            def _tab_submit():
         
     | 
| 132 | 
         
             
                pass
         
     | 
| 133 | 
         | 
| 
         | 
|
| 139 | 
         
             
                    gr.HTML(BANNER, elem_id="banner")
         
     | 
| 140 | 
         
             
                    # convert LAST_UPDATED to the PDT time 
         
     | 
| 141 | 
         
             
                    LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
         
     | 
| 142 | 
         
            +
                    header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
         
     | 
| 143 | 
         
            +
                    gr.Markdown(header_md_text, elem_classes="markdown-text") 
         
     | 
| 144 | 
         | 
| 145 | 
         
             
                    with gr.Tabs(elem_classes="tab-buttons") as tabs: 
         
     | 
| 146 | 
         
             
                        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
         
     | 
| 147 | 
         
             
                            _tab_leaderboard() 
         
     | 
| 148 | 
         
            +
                        with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
         
     | 
| 149 | 
         
            +
                            _tab_explore()
         
     | 
| 150 | 
         
             
                        with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
         
     | 
| 151 | 
         
             
                            _tab_submit() 
         
     | 
| 152 | 
         | 
| 
         | 
|
| 168 | 
         | 
| 169 | 
         | 
| 170 | 
         
             
            def data_load(result_file):
         
     | 
| 171 | 
         
            +
                global raw_data, original_df
         
     | 
| 172 | 
         
             
                print(f"Loading {result_file}")
         
     | 
| 173 | 
         
             
                column_names_main = column_names.copy()
         
     | 
| 174 | 
         
             
                # column_names_main.update({})
         
     | 
| 
         | 
|
| 176 | 
         
             
                click_url = True 
         
     | 
| 177 | 
         
             
                # read json file from the result_file 
         
     | 
| 178 | 
         
             
                with open(result_file, "r") as f:
         
     | 
| 179 | 
         
            +
                    raw_data = json.load(f)
         
     | 
| 180 | 
         
             
                # floatify the data, if possible
         
     | 
| 181 | 
         
            +
                for d in raw_data:
         
     | 
| 182 | 
         
             
                    for k, v in d.items():
         
     | 
| 183 | 
         
             
                        try:
         
     | 
| 184 | 
         
             
                            d[k] = float(v)
         
     | 
| 185 | 
         
             
                        except:
         
     | 
| 186 | 
         
             
                            pass
         
     | 
| 187 | 
         
            +
                original_df = pd.DataFrame(raw_data)
         
     | 
| 188 | 
         
             
                original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
         
     | 
| 189 | 
         
             
                # print(original_df.columns) 
         
     | 
| 190 | 
         | 
    	
        constants.py
    CHANGED
    
    | 
         @@ -5,20 +5,17 @@ DEFAULT_K = "∞" 
     | 
|
| 5 | 
         
             
            # DEFAULT_K = "1500"
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
         
     | 
| 8 | 
         
            -
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width:  
     | 
| 9 | 
         | 
| 10 | 
         
             
            TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
         
     | 
| 13 | 
         | 
| 14 | 
         
            -
            CITATION_TEXT = """@article{ 
     | 
| 15 | 
         
            -
                title={ 
     | 
| 16 | 
         
            -
                author={ 
     | 
| 17 | 
         
            -
                 
     | 
| 18 | 
         
            -
                 
     | 
| 19 | 
         
            -
                archivePrefix={arXiv},
         
     | 
| 20 | 
         
            -
                primaryClass={cs.CL},
         
     | 
| 21 | 
         
            -
                url={https://arxiv.org/abs/2406.04770}
         
     | 
| 22 | 
         
             
            }
         
     | 
| 23 | 
         
             
            """
         
     | 
| 24 | 
         | 
| 
         @@ -279,5 +276,13 @@ button.selected[role="tab"][aria-selected="true"] { 
     | 
|
| 279 | 
         
             
                font-size: 12pt; 
         
     | 
| 280 | 
         
             
                font-decoration: bold;
         
     | 
| 281 | 
         
             
            }
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 282 | 
         
             
            """
         
     | 
| 283 | 
         | 
| 
         | 
|
| 5 | 
         
             
            # DEFAULT_K = "1500"
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
         
     | 
| 8 | 
         
            +
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
         
     | 
| 13 | 
         | 
| 14 | 
         
            +
            CITATION_TEXT = """@article{tbd,
         
     | 
| 15 | 
         
            +
                title={tbd},
         
     | 
| 16 | 
         
            +
                author={tbd},
         
     | 
| 17 | 
         
            +
                journal={tbd},
         
     | 
| 18 | 
         
            +
                year={2024}
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 19 | 
         
             
            }
         
     | 
| 20 | 
         
             
            """
         
     | 
| 21 | 
         | 
| 
         | 
|
| 276 | 
         
             
                font-size: 12pt; 
         
     | 
| 277 | 
         
             
                font-decoration: bold;
         
     | 
| 278 | 
         
             
            }
         
     | 
| 279 | 
         
            +
             
     | 
| 280 | 
         
            +
            .box_md{
         
     | 
| 281 | 
         
            +
                border: 1px solid #000000;
         
     | 
| 282 | 
         
            +
                border-radius: 10px;
         
     | 
| 283 | 
         
            +
                padding: 5px;
         
     | 
| 284 | 
         
            +
                font-size: 12pt;
         
     | 
| 285 | 
         
            +
                margin: 5px;
         
     | 
| 286 | 
         
            +
            }
         
     | 
| 287 | 
         
             
            """
         
     | 
| 288 | 
         | 
    	
        data_utils.py
    CHANGED
    
    | 
         @@ -11,12 +11,13 @@ import math 
     | 
|
| 11 | 
         
             
            import json 
         
     | 
| 12 | 
         
             
            from tqdm import tqdm
         
     | 
| 13 | 
         
             
            import numpy as np
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 14 | 
         | 
| 15 | 
         
            -
             
     | 
| 16 | 
         
            -
             
     | 
| 17 | 
         
            -
             
     | 
| 18 | 
         
            -
            eval_results = None 
         
     | 
| 19 | 
         
            -
            score_eval_results = None  
         
     | 
| 20 | 
         | 
| 21 | 
         
             
            # Formats the columns
         
     | 
| 22 | 
         
             
            def formatter(x):
         
     | 
| 
         @@ -41,3 +42,90 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column 
     | 
|
| 41 | 
         
             
                    df.sort_values(by=rank_column, inplace=True, ascending=False)
         
     | 
| 42 | 
         
             
                return df
         
     | 
| 43 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 11 | 
         
             
            import json 
         
     | 
| 12 | 
         
             
            from tqdm import tqdm
         
     | 
| 13 | 
         
             
            import numpy as np
         
     | 
| 14 | 
         
            +
            import os 
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            from eval_utils import *
         
     | 
| 17 | 
         | 
| 18 | 
         
            +
            summary_file = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
         
     | 
| 19 | 
         
            +
            result_dir = "ZeroEval-main/result_dirs/zebra-grid/"
         
     | 
| 20 | 
         
            +
            results_by_model = {}
         
     | 
| 
         | 
|
| 
         | 
|
| 21 | 
         | 
| 22 | 
         
             
            # Formats the columns
         
     | 
| 23 | 
         
             
            def formatter(x):
         
     | 
| 
         | 
|
| 42 | 
         
             
                    df.sort_values(by=rank_column, inplace=True, ascending=False)
         
     | 
| 43 | 
         
             
                return df
         
     | 
| 44 | 
         | 
| 45 | 
         
            +
             
     | 
| 46 | 
         
            +
            def load_all_data():
         
     | 
| 47 | 
         
            +
                global summary_file, result_dir
         
     | 
| 48 | 
         
            +
                with open(summary_file, "r") as f:
         
     | 
| 49 | 
         
            +
                    model_summary = json.load(f)
         
     | 
| 50 | 
         
            +
                model_names = [model["Model"] for model in model_summary]
         
     | 
| 51 | 
         
            +
                for model_name in model_names:
         
     | 
| 52 | 
         
            +
                    download_url = f"https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         
     | 
| 53 | 
         
            +
                    output_file = os.path.join(result_dir, f"{model_name}.json")
         
     | 
| 54 | 
         
            +
                    # mkdir -p result_dir if not exists 
         
     | 
| 55 | 
         
            +
                    os.makedirs(result_dir, exist_ok=True)
         
     | 
| 56 | 
         
            +
                    if not os.path.exists(output_file):
         
     | 
| 57 | 
         
            +
                        os.system(f"wget {download_url} -O {output_file}")
         
     | 
| 58 | 
         
            +
                        print(f"Downloaded {model_name}.json")
         
     | 
| 59 | 
         
            +
                    with open(output_file, "r") as f:
         
     | 
| 60 | 
         
            +
                        print(f"Loading {output_file}")
         
     | 
| 61 | 
         
            +
                        results_by_model[model_name] = json.load(f) 
         
     | 
| 62 | 
         
            +
                
         
     | 
| 63 | 
         
            +
            def get_random_item(model_name="random", size_H="random", size_W="random"):
         
     | 
| 64 | 
         
            +
                global summary_file, result_dir, results_by_model
         
     | 
| 65 | 
         
            +
                if results_by_model is None or len(results_by_model) == 0:
         
     | 
| 66 | 
         
            +
                    load_all_data()
         
     | 
| 67 | 
         
            +
                if model_name == "random":
         
     | 
| 68 | 
         
            +
                    model_name = random.choice(list(results_by_model.keys()))
         
     | 
| 69 | 
         
            +
                data = results_by_model[model_name]
         
     | 
| 70 | 
         
            +
                random.shuffle(data)
         
     | 
| 71 | 
         
            +
                selected_item = None
         
     | 
| 72 | 
         
            +
                prediction_table = None  
         
     | 
| 73 | 
         
            +
                prediction_reasoning = None 
         
     | 
| 74 | 
         
            +
                id_to_item = {}
         
     | 
| 75 | 
         
            +
                for item in data:
         
     | 
| 76 | 
         
            +
                    id_to_item[item["id"]] = item
         
     | 
| 77 | 
         
            +
                
         
     | 
| 78 | 
         
            +
                if size_H == "random":
         
     | 
| 79 | 
         
            +
                    size_H_choice =  random.choice(list(range(2, 7)))
         
     | 
| 80 | 
         
            +
                else:
         
     | 
| 81 | 
         
            +
                    size_H_choice = size_H
         
     | 
| 82 | 
         
            +
                if size_W == "random":
         
     | 
| 83 | 
         
            +
                    size_W_choice =  random.choice(list(range(2, 7)))
         
     | 
| 84 | 
         
            +
                else:
         
     | 
| 85 | 
         
            +
                    size_W_choice = size_W
         
     | 
| 86 | 
         
            +
                ok_ids = [id for id in id_to_item if id_to_item[id]["size"].startswith(f"{size_H_choice}*{size_W_choice}")] 
         
     | 
| 87 | 
         
            +
                for ok_id in ok_ids:
         
     | 
| 88 | 
         
            +
                    item = id_to_item[ok_id] 
         
     | 
| 89 | 
         
            +
                    prediction_str = item["output"][0]
         
     | 
| 90 | 
         
            +
                    prediction_json = extract_last_complete_json(prediction_str)
         
     | 
| 91 | 
         
            +
                    if prediction_json is None or "solution" not in prediction_json:  
         
     | 
| 92 | 
         
            +
                        continue 
         
     | 
| 93 | 
         
            +
                    prediction_reasoning = prediction_json.get("reasoning", "")
         
     | 
| 94 | 
         
            +
                    prediction_table = prediction_json["solution"]
         
     | 
| 95 | 
         
            +
                    if prediction_table is not None:
         
     | 
| 96 | 
         
            +
                        selected_item = item
         
     | 
| 97 | 
         
            +
                        break 
         
     | 
| 98 | 
         
            +
             
     | 
| 99 | 
         
            +
                if selected_item is None:
         
     | 
| 100 | 
         
            +
                    # selected_item = random.choice(data)
         
     | 
| 101 | 
         
            +
                    print("No item found!")
         
     | 
| 102 | 
         
            +
                    return None 
         
     | 
| 103 | 
         
            +
             
     | 
| 104 | 
         
            +
                explore_item = {}
         
     | 
| 105 | 
         
            +
                explore_item["id"] = selected_item["id"]
         
     | 
| 106 | 
         
            +
                explore_item["Model"] = model_name
         
     | 
| 107 | 
         
            +
                explore_item["size"] = selected_item["size"]
         
     | 
| 108 | 
         
            +
                explore_item["puzzle"] = selected_item["puzzle"]
         
     | 
| 109 | 
         
            +
                explore_item["solution"] = prediction_table
         
     | 
| 110 | 
         
            +
                explore_item["reasoning"] = prediction_reasoning
         
     | 
| 111 | 
         
            +
                headers = ["Houses"] + list(prediction_table["House 1"].keys())
         
     | 
| 112 | 
         
            +
                rows = []
         
     | 
| 113 | 
         
            +
                for row_id in range(len(prediction_table)):
         
     | 
| 114 | 
         
            +
                    row = [row_id+1] 
         
     | 
| 115 | 
         
            +
                    for feature in headers[1:]:
         
     | 
| 116 | 
         
            +
                        row.append(prediction_table[f"House {row_id+1}"][feature])
         
     | 
| 117 | 
         
            +
                    rows.append(row)
         
     | 
| 118 | 
         
            +
                table_md = tabulate(rows, headers=headers, tablefmt="github")
         
     | 
| 119 | 
         
            +
                explore_item["solution_table_md"] = table_md
         
     | 
| 120 | 
         
            +
             
     | 
| 121 | 
         
            +
                this_total_cells, this_correct_cells = eval_each_puzzle(explore_item["id"], prediction_table)
         
     | 
| 122 | 
         
            +
                # print(table_md)
         
     | 
| 123 | 
         
            +
                explore_item["correct_cells"] = this_correct_cells
         
     | 
| 124 | 
         
            +
                explore_item["total_cells"] = this_total_cells
         
     | 
| 125 | 
         
            +
                return explore_item
         
     | 
| 126 | 
         
            +
             
     | 
| 127 | 
         
            +
             
     | 
| 128 | 
         
            +
            if __name__ == "__main__":
                # Script entry point: load the data, then print one item returned by
                # get_random_item as indented JSON.
                load_all_data()
                print("All data downloaded!")
                sample_query = {"model_name": "gemini-1.5-pro", "size_H": "2", "size_W": "5"}
                sampled_item = get_random_item(**sample_query)
                print(json.dumps(sampled_item, indent=2))
         
     | 
    	
        eval_utils.py
    ADDED
    
    | 
         @@ -0,0 +1,217 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            import json 
         
     | 
| 2 | 
         
            +
            from collections import defaultdict
         
     | 
| 3 | 
         
            +
            import os 
         
     | 
| 4 | 
         
            +
            from tabulate import tabulate
         
     | 
| 5 | 
         
            +
            from datasets import load_dataset
         
     | 
| 6 | 
         
            +
             
     | 
| 7 | 
         
            +
            # Cache of reference solutions, keyed by puzzle id; filled by
            # load_private_solutions().
            private_solutions = {}


            def load_private_solutions():
                """Fill ``private_solutions`` from the private ZebraLogicBench dataset.

                Reads the "test" split of the "grid_mode" configuration of
                ``allenai/ZebraLogicBench-private`` and stores each item's "solution"
                under its "id". Returns None.
                """
                dataset = load_dataset("allenai/ZebraLogicBench-private", "grid_mode", split="test")
                private_solutions.update((row["id"], row["solution"]) for row in dataset)
         
     | 
| 15 | 
         
            +
             
     | 
| 16 | 
         
            +
            def load_model_results(run_name_folders):
                """Map every JSON result file to a ``"<model>%<run_name>"`` key.

                Args:
                    run_name_folders: dict mapping a run name (e.g. "greedy") to the
                        folder that holds that run's ``<model>.json`` result files.

                Returns:
                    dict mapping ``"<model>%<run_name>"`` to the path of the result
                    file. Only direct entries of each folder whose name ends in
                    ``.json`` are included.

                Raises:
                    OSError: if a folder cannot be listed (propagated from os.listdir).
                """
                suffix = ".json"
                model_results = {}
                for run_name, folder in run_name_folders.items():
                    # Sorted so the mapping order does not depend on the file system.
                    for filename in sorted(os.listdir(folder)):
                        if not filename.endswith(suffix):
                            continue
                        # Strip only the trailing extension; str.replace would also
                        # remove ".json" occurring in the middle of a file name.
                        model_name = filename[: -len(suffix)]
                        model_results[f"{model_name}%{run_name}"] = os.path.join(folder, filename)
                return model_results
         
     | 
| 28 | 
         
            +
             
         
     | 
| 29 | 
         
            +
            def extract_last_complete_json(s):
                """Decode the last top-level, brace-balanced ``{...}`` span found in ``s``.

                Args:
                    s: text (e.g. a raw model output) that may contain JSON objects
                        surrounded by other text.

                Returns:
                    The decoded object, or None when ``s`` has no balanced span or the
                    last balanced span is not valid JSON. Newline characters are removed
                    from the span before decoding.

                NOTE: braces are matched character by character without regard to JSON
                string literals, so a "{" or "}" inside a string value shifts the span
                boundaries.
                """
                open_positions = []  # indices of '{' characters that are still unmatched
                last_json_str = None

                for i, char in enumerate(s):
                    if char == '{':
                        open_positions.append(i)
                    elif char == '}' and open_positions:
                        start = open_positions.pop()
                        if not open_positions:
                            # The outermost brace just closed: this is a complete span.
                            last_json_str = s[start:i + 1]

                if not last_json_str:
                    return None
                try:
                    return json.loads(last_json_str.replace("\n", ""))
                except json.JSONDecodeError:
                    return None
         
     | 
| 56 | 
         
            +
             
     | 
| 57 | 
         
            +
            def eval_each_puzzle(id, prediction_table):
                """Score one predicted grid against the stored reference solution.

                Args:
                    id: puzzle id used to look up the reference in ``private_solutions``
                        (loaded on first use when the cache is empty).
                    prediction_table: the predicted solution, expected to look like
                        ``{"House 1": {column: value}, ...}`` where a value is a string
                        or a list whose first element is the answer.

                Returns:
                    ``(total_cells, correct_cells)``. Cells are compared after
                    lower-casing and stripping. Missing, empty or malformed predicted
                    cells (and a malformed table or row) count as incorrect.

                Raises:
                    KeyError: if ``id`` has no reference solution.
                """
                if not private_solutions:
                    load_private_solutions()
                solution = private_solutions[id]
                columns = solution["header"]
                assert columns[0] == "House"
                solution_table = {
                    f"House {i + 1}": {columns[j]: row[j] for j in range(1, len(columns))}
                    for i, row in enumerate(solution["rows"])
                }
                this_total_cells = len(solution["rows"]) * (len(columns) - 1)

                def normalize(cell):
                    # Return the comparable text of a predicted cell, or None when the
                    # cell holds no usable answer (None, empty, or an unsupported type).
                    if isinstance(cell, list):
                        if not cell:
                            return None
                        cell = cell[0]
                    elif not cell:
                        return None
                    if not isinstance(cell, str):
                        return None
                    return cell.lower().strip()

                # Anything but a dict cannot hold per-house rows, so it scores zero.
                predicted_rows = prediction_table if isinstance(prediction_table, dict) else {}
                this_correct_cells = 0
                for house, truth_row in solution_table.items():
                    predicted_row = predicted_rows.get(house)
                    if not isinstance(predicted_row, dict):
                        continue
                    for column, truth in truth_row.items():
                        if column not in predicted_row:
                            continue
                        predicted_cell = normalize(predicted_row[column])
                        if predicted_cell is not None and predicted_cell == truth.lower().strip():
                            this_correct_cells += 1
                return this_total_cells, this_correct_cells
         
     | 
| 87 | 
         
            +
             
     | 
| 88 | 
         
            +
            def eval_model(model, filepath):
                """Evaluate one model's result file and summarise its accuracy.

                Args:
                    model: key of the form ``"<model name>%<mode>"``; the first
                        ``%``-separated field is reported as "Model" and the second as
                        "Mode" ("" when there is no ``%``).
                    filepath: path to a JSON file holding a list of items, each with
                        "id", "size" and "output" (only ``output[0]`` is scored).

                Returns:
                    dict with the keys "Model", "Mode", "Puzzle Acc", "Cell Acc",
                    "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles"
                    and "Reason Lens". Percentages and the mean reasoning length (taken
                    over the puzzles that produced an answer) are strings with two
                    decimals; a ratio whose denominator is zero is reported as "0.00".

                Raises:
                    KeyError: if an item id has no reference solution.
                    ValueError: if a predicted cell is neither a string nor a list.
                """
                # Same lazy loading as eval_each_puzzle, so direct calls also work.
                if not private_solutions:
                    load_private_solutions()
                with open(filepath, "r", encoding="utf-8") as f:
                    print(f"Processing {filepath}")
                    data = json.load(f)

                easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3']
                hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']

                solved_puzzles = 0
                num_total_puzzles = len(data)
                correct_cells = 0
                total_cells = 0
                no_answer = 0
                num_total_puzzles_by_size = defaultdict(int)
                solved_puzzles_by_size = defaultdict(int)
                reason_lens = []

                for item in data:
                    solution = private_solutions[item["id"]]
                    size = item["size"]
                    num_total_puzzles_by_size[size] += 1

                    # Turn the reference rows into {"House k": {column: value}}.
                    columns = solution["header"]
                    assert columns[0] == "House"
                    solution_table = {
                        f"House {i + 1}": {columns[j]: row[j] for j in range(1, len(columns))}
                        for i, row in enumerate(solution["rows"])
                    }
                    this_total_cells = len(solution["rows"]) * (len(columns) - 1)
                    total_cells += this_total_cells

                    # Read and parse the prediction from the model output.
                    prediction_json = extract_last_complete_json(item["output"][0])
                    if prediction_json is None or prediction_json.get("solution") is None:
                        # Unparseable output, or a missing / null solution.
                        no_answer += 1
                        continue
                    reason = prediction_json.get("reasoning") or ""
                    reason_lens.append(len(reason))
                    prediction_table = prediction_json["solution"]
                    if not isinstance(prediction_table, dict):
                        # Not a per-house mapping: no cell can match.
                        prediction_table = {}

                    this_correct_cells = 0
                    for house, truth_row in solution_table.items():
                        predicted_row = prediction_table.get(house)
                        if not isinstance(predicted_row, dict):
                            continue
                        for column, truth in truth_row.items():
                            if column not in predicted_row:
                                continue
                            truth_cell = truth.lower().strip()
                            value = predicted_row[column]
                            if value is None or (isinstance(value, (str, list, dict)) and not value):
                                continue
                            if isinstance(value, list):
                                predicted_cell = value[0].lower().strip()
                            elif isinstance(value, str):
                                predicted_cell = value.lower().strip()
                            else:
                                raise ValueError(f"Unknown type: {type(value)}")
                            if truth_cell == predicted_cell:
                                this_correct_cells += 1
                    correct_cells += this_correct_cells

                    # A puzzle is solved only when every cell is right.
                    if this_correct_cells == this_total_cells:
                        solved_puzzles += 1
                        solved_puzzles_by_size[size] += 1

                easy_solved_puzzles = sum(solved_puzzles_by_size.get(size, 0) for size in easy_sizes)
                easy_total_puzzles = sum(num_total_puzzles_by_size.get(size, 0) for size in easy_sizes)
                hard_solved_puzzles = sum(solved_puzzles_by_size.get(size, 0) for size in hard_sizes)
                hard_total_puzzles = sum(num_total_puzzles_by_size.get(size, 0) for size in hard_sizes)

                name_parts = model.split("%")
                result = {}
                result["Model"] = name_parts[0]
                result["Mode"] = name_parts[1] if len(name_parts) > 1 else ""
                result["Puzzle Acc"] = _format_ratio(solved_puzzles, num_total_puzzles, 100)
                result["Cell Acc"] = _format_ratio(correct_cells, total_cells, 100)
                result["No answer"] = _format_ratio(no_answer, num_total_puzzles, 100)
                result["Easy Puzzle Acc"] = _format_ratio(easy_solved_puzzles, easy_total_puzzles, 100)
                result["Hard Puzzle Acc"] = _format_ratio(hard_solved_puzzles, hard_total_puzzles, 100)
                result["Total Puzzles"] = num_total_puzzles
                result["Reason Lens"] = _format_ratio(sum(reason_lens), len(reason_lens))
                return result


            def _format_ratio(numerator, denominator, scale=1):
                """Format ``numerator / denominator * scale`` with two decimals ("0.00" if the denominator is zero)."""
                if not denominator:
                    return "0.00"
                return f"{numerator / denominator * scale:.2f}"
         
     | 
| 187 | 
         
            +
             
     | 
| 188 | 
         
            +
             
     | 
| 189 | 
         
            +
            def gen_results(run_name_folders):
                """Evaluate every result file, print a leaderboard and save it as JSON.

                Args:
                    run_name_folders: dict mapping a run name to the folder with that
                        run's result files (passed to ``load_model_results``).

                Side effects:
                    Prints a table sorted by descending "Puzzle Acc" and writes the same
                    rows to ``result_dirs/zebra-grid.summary.json`` (relative to the
                    current working directory). Returns None.
                """
                model_results = load_model_results(run_name_folders)

                columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
                rows = [eval_model(model_name, filepath) for model_name, filepath in model_results.items()]

                # Sort the rows by puzzle accuracy, best first.
                rows = sorted(rows, key=lambda x: -float(x["Puzzle Acc"]))
                # tabulate expects one list of cell values per row.
                table_data = [[row[col] for col in columns] for row in rows]

                print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))

                # Create the output folder if needed so the evaluation work is not lost
                # to a FileNotFoundError at the very end.
                summary_path = "result_dirs/zebra-grid.summary.json"
                os.makedirs(os.path.dirname(summary_path), exist_ok=True)
                with open(summary_path, "w") as f:
                    json.dump(rows, f, indent=2)
         
     | 
| 209 | 
         
            +
             
     | 
| 210 | 
         
            +
             
     | 
| 211 | 
         
            +
            if __name__ == "__main__":
                # Script entry point: evaluate the result folder of each run after loading
                # the reference solutions.
                run_name_folders = dict(
                    greedy="result_dirs/zebra-grid",
                    sampling="result_dirs/zebra-grid/sampling",
                )
                load_private_solutions()
                gen_results(run_name_folders)
         
     | 
    	
        update_data.sh
    CHANGED
    
    | 
         @@ -1,4 +1,5 @@ 
     | 
|
| 1 | 
         
             
            # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
         
     | 
| 2 | 
         
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 3 | 
         
            -
            mkdir -p ZeroEval-main/result_dirs
         
     | 
| 4 | 
         
            -
            wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 
         | 
| 
         | 
|
| 1 | 
         
             
            # download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
         
     | 
| 2 | 
         
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 3 | 
         
            +
            mkdir -p ZeroEval-main/result_dirs/zebra-grid/
         
     | 
| 4 | 
         
            +
            wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 5 | 
         
            +
            wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
         
     | 
    	
        zebra_banner.png
    CHANGED
    
    
												 
											 | 
										
												 
									 |