modify paper names and paths to datasets

This commit updates the paper title and arXiv link to the new ZebraLogic paper (arxiv.org/abs/2502.01100) and repoints the truth-solution dataset references to WildEval/ZebraLogic; most of the remaining hunks only strip trailing whitespace.

Files changed:
- README.md: +5 -4
- _header.md: +1 -2
- app.py: +36 -36
- constants.py: +31 -32
- eval_utils.py: +1 -1
README.md CHANGED

```diff
@@ -10,12 +10,12 @@ pinned: true
 fullWidth: true
 hf_oauth: true
 api: false
-tags: 
+tags:
     - leaderboard
-datasets: 
+datasets:
     - allenai/ZebraLogicBench
-    - 
+    - WildEval/ZebraLogic
-models: 
+models:
     - Qwen/Qwen2-72B-Instruct
     - Qwen/Qwen1.5-72B-Chat
     - Qwen/Qwen1.5-7B-Chat
@@ -58,3 +58,4 @@ models:
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 Paper: arxiv.org/abs/2406.04770
+Paper: arxiv.org/abs/2502.01100
```

(The removed dataset entry on old line 17 was truncated in this capture and is left as-is.)
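A quick way to sanity-check the two dataset paths in the updated front matter is to load them with the `datasets` library. The `grid_mode` config and `test` split below are taken from the `eval_utils.py` change later in this commit for WildEval/ZebraLogic; whether allenai/ZebraLogicBench exposes the same config name is an assumption here, and WildEval/ZebraLogic is gated, so access must be requested on the Hub first:

```python
from datasets import load_dataset

# Public benchmark referenced in the README front matter.
public_data = load_dataset("allenai/ZebraLogicBench", "grid_mode", split="test")  # config name assumed

# Truth-solution dataset added by this commit (gated; request access first).
solutions = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")

print(len(public_data), len(solutions))
```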
    	
_header.md CHANGED

```diff
@@ -1,6 +1,5 @@
 <br/>
 
-# 🦓 ZebraLogic: 
+# 🦓 ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning
 <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
 [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
- 
```

(The rest of the removed title on old line 3 was truncated in this capture and is left as-is.)
    	
app.py CHANGED

```diff
@@ -12,16 +12,16 @@ import pandas as pd
 from pathlib import Path
 import json
 from constants import *
-from datetime import datetime, timezone 
+from datetime import datetime, timezone
 # from datasets import Dataset, load_dataset, concatenate_datasets
-import os, uuid 
+import os, uuid
 from utils_display import model_info
 from constants import column_names,  LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
 import pytz
 from data_utils import post_processing, get_random_item
 
 # get the last updated time from the elo_ranks.all.jsonl file
-LAST_UPDATED = None 
+LAST_UPDATED = None
 # with open("_intro.md", "r") as f:
 #     INTRO_MD = f.read()
 INTRO_MD = ""
@@ -33,11 +33,11 @@ with open("_header.md", "r") as f:
 
 with open("_metrics.md", "r") as f:
     METRICS_MD = f.read()
- 
+
-raw_data = None 
+raw_data = None
-original_df = None 
+original_df = None
 # available_models = [] # to be filled in later
-available_models = list(model_info.keys()) 
+available_models = list(model_info.keys())
 
 def df_filters(mode_selection_radio, show_open_source_model_only):
     global original_df
@@ -59,19 +59,19 @@ def _gstr(text):
 
 def _tab_leaderboard():
     global original_df, available_models
-    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"): 
+    # with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
     if True:
-        default_main_df = original_df.copy() 
+        default_main_df = original_df.copy()
         # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-        # default_main_df_no_task = default_main_df.copy() 
+        # default_main_df_no_task = default_main_df.copy()
         default_mode = "greedy"
         default_main_df = df_filters(default_mode, False)
-        with gr.Row(): 
+        with gr.Row():
-            with gr.Column(scale=5): 
+            with gr.Column(scale=5):
                 mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
         # with gr.Row():
         #     with gr.Column(scale=2):
- 
+
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
             datatype= ["number", "markdown", "markdown", "number"],
@@ -83,7 +83,7 @@ def _tab_leaderboard():
             column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
             wrap=True
             # min_width=60,
-        ) 
+        )
         # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
@@ -121,14 +121,14 @@ def _tab_explore():
             # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
             gr.Markdown("### 🚀 Click below to sample a puzzle. ⬇️ ")
             explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
- 
+
     puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
     model_reasoning_md = gr.Markdown("### 🤖 Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
     model_prediction_md = gr.Markdown("### 💬 Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
     turht_solution_md = gr.Markdown("### ✅ Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
     model_eval_md = gr.Markdown("### 🆚 Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
-    explore_button.click(fn=sample_explore_item, 
+    explore_button.click(fn=sample_explore_item,
-                         inputs=[model_selection, size_H_selection, size_W_selection], 
+                         inputs=[model_selection, size_H_selection, size_W_selection],
                          outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, turht_solution_md])
 
 
@@ -136,8 +136,8 @@ def _tab_explore():
 def _tab_submit():
     markdown_text = """
     Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
-    If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py) 
-    and apply for the access for the [private dataset](https://huggingface.co/datasets/
+    If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
+    and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
     """
 
     gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
@@ -149,33 +149,33 @@ def build_demo():
 
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.HTML(BANNER, elem_id="banner")
-        # convert LAST_UPDATED to the PDT time 
+        # convert LAST_UPDATED to the PDT time
         LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
         header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
-        gr.Markdown(header_md_text, elem_classes="markdown-text") 
+        gr.Markdown(header_md_text, elem_classes="markdown-text")
 
-        with gr.Tabs(elem_classes="tab-buttons") as tabs: 
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-                _tab_leaderboard() 
+                _tab_leaderboard()
             with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
                 _tab_explore()
             with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
-                _tab_submit() 
+                _tab_submit()
 
             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
- 
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
                 gr.Textbox(
-                    value=CITATION_TEXT, 
+                    value=CITATION_TEXT,
                     lines=7,
                     label="Copy the BibTeX snippet to cite this source",
                     elem_id="citation-button",
                     show_copy_button=True)
                 # ).style(show_copy_button=True)
 
-    return demo 
+    return demo
 
 
@@ -184,11 +184,11 @@ def data_load(result_file):
     print(f"Loading {result_file}")
     column_names_main = column_names.copy()
     # column_names_main.update({})
-    main_ordered_columns = ORDERED_COLUMN_NAMES 
+    main_ordered_columns = ORDERED_COLUMN_NAMES
-    # filter the data with Total Puzzles == 1000 
+    # filter the data with Total Puzzles == 1000
- 
+
-    click_url = True 
+    click_url = True
-    # read json file from the result_file 
+    # read json file from the result_file
     with open(result_file, "r") as f:
         raw_data = json.load(f)
     # floatify the data, if possible
@@ -201,16 +201,16 @@ def data_load(result_file):
     original_df = pd.DataFrame(raw_data)
     original_df = original_df[original_df["Total Puzzles"] == 1000]
     original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
-    # print(original_df.columns) 
+    # print(original_df.columns)
- 
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
     parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
- 
+
     args = parser.parse_args()
-    data_load(args.result_file) 
+    data_load(args.result_file)
     print(original_df)
     demo = build_demo()
     demo.launch(share=args.share, height=3000, width="100%")
```
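For reference, here is a minimal sketch of the loading-and-filtering step that `data_load` performs, with the input schema (a JSON list of per-model records carrying `Total Puzzles` and `Puzzle Acc` fields) inferred from the code above; the repo's `post_processing` helper and display-column handling are left out:

```python
import json
import pandas as pd

def load_summary(result_file: str) -> pd.DataFrame:
    """Sketch of app.py's data_load: load a summary file, keep fully evaluated models."""
    with open(result_file, "r") as f:
        raw_data = json.load(f)  # a list of per-model result dicts
    df = pd.DataFrame(raw_data)
    # Keep only models evaluated on the full 1000-puzzle set, as app.py does.
    df = df[df["Total Puzzles"] == 1000]
    # Order by the leaderboard's RANKING_COLUMN ("Puzzle Acc").
    return df.sort_values("Puzzle Acc", ascending=False).reset_index(drop=True)

# e.g. load_summary("ZeroEval-main/result_dirs/zebra-grid.summary.json")
```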
    	
constants.py CHANGED

```diff
@@ -8,15 +8,15 @@ banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_ba
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
 
 # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
- 
+
 
 CITATION_TEXT = """
 
-@
-    title={ZebraLogic: 
-    author={Bill Yuchen Lin and Ronan Le Bras and Peter Clark and Yejin Choi},
-    
-    
+@article{zebralogic2025,
+    title={ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning},
+    author={Bill Yuchen Lin and Ronan Le Bras and Kyle Richardson and Ashish Sabharwal and Radha Poovendran and Peter Clark and Yejin Choi},
+    year={2025},
+    url={https://arxiv.org/abs/2502.01100},
 }
 
 
@@ -27,15 +27,15 @@ CITATION_TEXT = """
   volume={36},
   year={2024}
 }
- 
+
 """
 
 # make column_names as an ordered dict
- 
+
 
 
 column_names = OrderedDict({
-    "Model": "Model", 
+    "Model": "Model",
     "Mode": "Mode",
     "Puzzle Acc": "Puzzle Acc",
     "Cell Acc": "Cell Acc",
@@ -48,29 +48,29 @@ column_names = OrderedDict({
 
 
 
-LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**. 
+LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
 """
 
 # **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
-# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three. 
+# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
 # **WB Score** individually scores each model based on checklists.
 # Evaluator is GPT-4-Turbo.
-LEADERBOARD_REMARKS_MAIN = """ 
+LEADERBOARD_REMARKS_MAIN = """
 """
- 
+
 RANKING_COLUMN = "Puzzle Acc"
 
 ORDERED_COLUMN_NAMES = [
-    "Model", 
+    "Model",
     "Mode",
     "Puzzle Acc",
     "Easy Puzzle Acc",
     "Hard Puzzle Acc",
     "Cell Acc",
-    "No answer", 
+    "No answer",
 ]
 
- 
+
 js_light = """
 function refresh() {
     const url = new URL(window.location);
@@ -110,15 +110,15 @@ function refresh() {
 
 js_code = """
 function scroll_top() {
-    console.log("Hello from Gradio!"); 
+    console.log("Hello from Gradio!");
     const bubbles = document.querySelectorAll('.bubble-wrap');
     bubbles.forEach((bubble, index) => {
         setTimeout(() => {
             bubble.scrollTop = 0;
         }, index * 100); // Delay of 100ms between each iteration
     });
- 
+
-} 
+}
 """
 
 
@@ -126,7 +126,7 @@ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtW
 
 css = """
 
- 
+
 
 code {
     font-size: large;
@@ -179,17 +179,17 @@ td {
 .chat-common{
     height: auto;
     max-height: 400px;
-    min-height: 100px; 
+    min-height: 100px;
 }
 .chat-specific{
     height: auto;
     max-height: 600px;
-    min-height: 200px; 
+    min-height: 200px;
 }
 #od-benchmark-tab-table-button{
     font-size: 15pt;
     font-weight: bold;
-} 
+}
 
 .btn_boderline{
     border: 1px solid #000000;
@@ -197,7 +197,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold; 
+    font-weight: bold;
 }
 
 .btn_boderline_next{
@@ -206,7 +206,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold; 
+    font-weight: bold;
 }
 
 .btn_boderline_gray{
@@ -215,7 +215,7 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: italic; 
+    font-weight: italic;
 }
 .btn_boderline_selected{
     border: 2px solid purple;
@@ -224,12 +224,12 @@ td {
     padding: 5px;
     margin: 5px;
     font-size: 15pt;
-    font-weight: bold; 
+    font-weight: bold;
 }
 .accordion-label button span{
     font-size: 14pt;
     font-weight: bold;
-} 
+}
 
 #show-task-categorized span{
     font-size: 13pt;
@@ -269,7 +269,7 @@ button.selected[role="tab"][aria-selected="true"] {
 .plotly-plot{
     height: auto;
     max-height: 600px;
-    min-height: 600px; 
+    min-height: 600px;
 }
 
 #length-margin-radio{
@@ -279,12 +279,12 @@ button.selected[role="tab"][aria-selected="true"] {
 }
 
 #show-task-categorized{
-    font-size: 12pt; 
+    font-size: 12pt;
     font-decoration: bold;
 }
 
 #show-open-source-models{
-    font-size: 12pt; 
+    font-size: 12pt;
     font-decoration: bold;
 }
 
@@ -296,4 +296,3 @@ button.selected[role="tab"][aria-selected="true"] {
     margin: 5px;
 }
 """
- 
```

(The removed lines of the old BibTeX entry on lines 15-19 were truncated in this capture and are left as-is.)
    	
eval_utils.py CHANGED

```diff
@@ -8,7 +8,7 @@ private_solutions = {}
 
 def load_private_solutions():
     global private_solutions
-    private_zebra_data = load_dataset("
+    private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
     for item in private_zebra_data:
         private_solutions[item["id"]] = item["solution"] 
     return 
```

(The removed line's original dataset path was truncated in this capture and is left as-is.)
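Given the updated loader, a local evaluation against the truth solutions could look like the sketch below (access to WildEval/ZebraLogic must be requested first, as the submit tab notes). The structure of `item["solution"]` is not shown in this diff, so the exact-match check and the `score_predictions` helper are assumptions, not ZeroEval's actual scoring code — see zebra_grid_eval.py for that:

```python
from datasets import load_dataset

# Same call that eval_utils.py makes after this commit (dataset access is gated).
private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
private_solutions = {item["id"]: item["solution"] for item in private_zebra_data}

def score_predictions(predictions: dict) -> float:
    """Hypothetical helper: fraction of puzzles whose predicted solution
    exactly matches the truth (a 'Puzzle Acc'-style metric)."""
    correct = sum(
        1 for pid, pred in predictions.items()
        if pred == private_solutions.get(pid)
    )
    return correct / len(predictions) if predictions else 0.0
```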
 
			

