	remove winrates and update the length penalty method
Files changed:

- app.py +51 -22
- compute_model_output_len.py +5 -2
- constants.py +1 -1
- data_dir/battle_outcome.png +0 -0
- data_dir/battle_outcome_2d.png +0 -0
- data_dir/bootstrap_elo_estimates.png +0 -0
- data_dir/elo_ranks.L=0.0.jsonl +24 -0
- data_dir/elo_ranks.L=0.1.jsonl +24 -0
- data_dir/elo_ranks.L=0.2.jsonl +24 -0
- data_dir/elo_ranks.L=0.3.jsonl +24 -0
- data_dir/elo_ranks.L=0.4.jsonl +24 -0
- data_dir/elo_ranks.L=0.5.jsonl +24 -0
- data_dir/elo_ranks.L=0.6.jsonl +24 -0
- data_dir/elo_ranks.L=0.7.jsonl +24 -0
- data_dir/elo_ranks.L=0.8.jsonl +24 -0
- data_dir/elo_ranks.L=0.9.jsonl +24 -0
- data_dir/elo_ranks.L=1.0.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.0.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.1.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.2.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.3.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.4.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.5.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.6.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.7.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.8.jsonl +24 -0
- data_dir/elo_ranks.all.L=0.9.jsonl +24 -0
- data_dir/elo_ranks.all.L=1.0.jsonl +24 -0
- data_dir/elo_ranks.all.jsonl +24 -22
- data_dir/elo_ranks.jsonl +24 -22
- data_dir/elo_ranks.length_ablation.all.jsonl +24 -22
- data_dir/elo_ranks.length_ablation.jsonl +24 -22
- data_dir/elo_ranks.skip_empty.all.jsonl +24 -22
- data_dir/elo_ranks.skip_empty.jsonl +24 -22
- data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl +24 -22
- data_dir/elo_ranks.skip_empty.length_ablation.jsonl +24 -22
- data_dir/elo_ranks.test.jsonl +23 -0
- data_dir/pairwise_win_fractions.pkl +2 -2
- data_dir/pairwise_win_fractions.png +0 -0
- data_utils.py +22 -15
- model_info.json +5 -2
- model_len_info.json +100 -17
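
The heart of the change is how the length penalty is applied: rather than rescaling Elo at display time, app.py now loads Elo tables pre-computed for each penalty value L in {0.0, 0.1, ..., 1.0} (the new data_dir/elo_ranks*.L=*.jsonl files above) and hands them to apply_length_penalty together with mode="v2". The v2 logic itself lives in data_utils.py, whose diff is not shown here; the sketch below is only a guess at the lookup it might perform, and the function name apply_length_penalty_v2_sketch is hypothetical.

# Hypothetical sketch, not the actual data_utils.py code: assume "v2" simply
# selects the Elo table that was pre-computed offline for the chosen penalty L.
def apply_length_penalty_v2_sketch(LP_original_dfs, length_penalty):
    key = f"{round(length_penalty, 1):.1f}"  # slider steps of 0.1 map to keys "0.0" ... "1.0"
    if key not in LP_original_dfs:
        raise KeyError(f"no pre-computed Elo table for L={key}")
    return LP_original_dfs[key].copy()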
    	
app.py CHANGED

@@ -32,7 +32,9 @@ with open("_about_us.md", "r") as f:
 with open("_header.md", "r") as f:
     HEADER_MD = f.read()
 
+LP_MODE = "v2"
 original_df, ablation_df = None, None
+LP_original_dfs = {}
 eval_results = load_eval_results()
 
 available_models = [] # to be filled in later

@@ -68,31 +70,33 @@ def display_chat_history(model_selections, task_selections):
 
 
 def slider_change_main(length_penalty):
-    global original_df, ablation_df
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
+    global original_df, ablation_df, LP_MODE
+    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
     adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
     adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    adjusted_df = add_winrates(adjusted_df)
-    adjusted_df = adjusted_df.drop(columns=["Length"])
+    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
+    # adjusted_df = adjusted_df.drop(columns=["Length"])
     return adjusted_df
 
 def slider_change_full(length_penalty, show_winrate):
-    global original_df, ablation_df
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
+    global original_df, ablation_df, LP_MODE
+    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
     # sort the model by the "Task-Avg Elo" column
-    adjusted_df = adjusted_df.sort_values(by="…
+    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
     adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
     if show_winrate == "none":
         return adjusted_df
     elif show_winrate == "gpt-3.5":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5")
+        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
     elif show_winrate == "gpt-4":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
+        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
     return adjusted_df
 
 seafoam = Seafoam()
 def build_demo(TYPES):
     global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
+    global LP_original_dfs, LP_MODE
+
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
     # with gr.Blocks(theme=seafoam, css=css) as demo:
         gr.HTML(BANNER, elem_id="banner")

@@ -106,14 +110,16 @@ def build_demo(TYPES):
 
             with gr.TabItem("Main Table", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
                 # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
-                default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
                 default_main_df = default_main_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-                default_main_df = …
-                default_main_df = default_main_df…
+                default_main_df = default_main_df.sort_values(by="Overall Elo", ascending=False)
+                # default_main_df = add_winrates(default_main_df, LP=DEFAULT_LP)
+                # default_main_df = default_main_df.drop(columns=["Overall Elo"])
+                # default_main_df = default_main_df.drop(columns=["Length"])
                 # TODO: add the win rate for GPT-4 and GPT-3.5T
                 with gr.Row():
                     with gr.Column(scale=4):
-                        gr.Markdown("**…
+                        gr.Markdown("**Task-Avg Elo**: Compute Elo on subsets of each task type and then take their avg. | **Win Rates**: [Estimated by Elo differences](https://www.hexwiki.net/index.php/Elo_rating#Definition). | **Length penalty**: Models w/ longer outputs are penalized. (Plz check 📖 **Details**.)", elem_classes="markdown-text-small top-left-LP")
                     with gr.Column(scale=0.8):
                         length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
                 # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)

@@ -130,16 +136,17 @@ def build_demo(TYPES):
                 length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
 
 
-            with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1):
+            with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1, visible=False):
                 with gr.Row():
                     with gr.Column(scale=4):
                         gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                     with gr.Column(scale=0.8):
                         length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
                 # do not show the "# battles" column here
+                default_full_df = default_full_df.sort_values(by="Overall Elo", ascending=False)
                 default_full_df = default_full_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5")
+                default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5", LP=DEFAULT_LP)
 
                 leaderboard_table_full = gr.components.Dataframe(
                     value=default_full_df,

@@ -155,16 +162,18 @@ def build_demo(TYPES):
                 length_penlty_slider_full.change(fn=slider_change_full, inputs=[length_penlty_slider_full, show_winrate], outputs=[leaderboard_table_full])
 
 
-            with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2):
+            with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2, visible=False):
                 with gr.Row():
                     with gr.Column(scale=4):
                         gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                     with gr.Column(scale=0.8):
                         length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
                 # do not show the "# battles" column here
+                default_full_df = default_full_df.sort_values(by="Overall Elo", ascending=False)
                 default_full_df = default_full_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4")
+                default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4", LP=DEFAULT_LP)
+
                 leaderboard_table_full = gr.components.Dataframe(
                     value=default_full_df,
                     datatype=TYPES,

@@ -185,7 +194,7 @@ def build_demo(TYPES):
                         gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                     with gr.Column(scale=0.8):
                         length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
                 # do not show the "# battles" column here
                 default_full_df = default_full_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
                 leaderboard_table_full = gr.components.Dataframe(

@@ -427,6 +436,7 @@ if __name__ == "__main__":
     parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
     parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
+
     args = parser.parse_args()
 
     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

@@ -435,17 +445,32 @@ if __name__ == "__main__":
     ablation_df = pd.read_json(args.length_balation_file, lines=True)
     skip_empty_original_df = pd.read_json(args.skip_empty_result_file , lines=True)
     skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
+
+
+    for i in range(0, 11):
+        if i == 0:
+            L = "0.0"
+        elif 1 <= i <= 9:
+            L = f"0.{i}"
+        elif i == 10:
+            L = "1.0"
+        result_file_path = args.result_file.replace(".jsonl", f".L={L}.jsonl")
+        LP_original_dfs[L] = pd.read_json(result_file_path, lines=True)
+
+
 
 
     # available_models = sorted(list(set(list(original_df["model name "]))))
     available_models = list(model_info.keys())
+    model_len_info = json.load(open("model_len_info.json", "r"))
+
     # remove the rows where the model name is not in the available_models
     original_df = original_df[original_df["model name "].isin(available_models)]
     ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
    skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
    skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
 
-
+
 
     original_df = post_processing(original_df, model_len_info)
     ablation_df = post_processing(ablation_df, model_len_info)

@@ -453,7 +478,11 @@ if __name__ == "__main__":
     skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
 
 
-
+    for LP, LP_origin_df in LP_original_dfs.items():
+        LP_original_dfs[LP] = LP_origin_df[LP_origin_df["model name "].isin(available_models)]
+        LP_original_dfs[LP] = post_processing(LP_original_dfs[LP], model_len_info)
+
+
 
     TYPES = ["markdown", "number"]
 
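For reference, the per-L loading loop added near the bottom of app.py resolves to eleven sibling files of --result_file. A standalone illustration, assuming --result_file keeps its apparent default of data_dir/elo_ranks.all.jsonl (consistent with the elo_ranks.all.L=*.jsonl files added in this commit):

# Paths generated by the new loading loop (assumption: args.result_file == "data_dir/elo_ranks.all.jsonl").
result_file = "data_dir/elo_ranks.all.jsonl"
keys = [f"{i / 10:.1f}" for i in range(11)]  # "0.0", "0.1", ..., "1.0"
paths = [result_file.replace(".jsonl", f".L={k}.jsonl") for k in keys]
print(paths[0])   # data_dir/elo_ranks.all.L=0.0.jsonl
print(paths[-1])  # data_dir/elo_ranks.all.L=1.0.jsonl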
    	
compute_model_output_len.py CHANGED

@@ -8,6 +8,7 @@ length_info = {}
 for model_name in tqdm(list(model_info.keys())):
     result = load_infer_results(model_name)
     lens = []
+    cnt_empty = 0
     for item in result:
         o = item["output"]
         if type(o) == list:

@@ -16,9 +17,11 @@ for model_name in tqdm(list(model_info.keys())):
             L = len(o.strip())
         if L > 0:
             lens.append(L)
+        else:
+            cnt_empty += 1
     avg_len = sum(lens) / len(lens)
-    print(f"{model_name}: {avg_len}")
-    length_info[model_name] = avg_len
+    print(f"{model_name}: {avg_len}; {cnt_empty} empty outputs.")
+    length_info[model_name] = {"avg_len": avg_len, "empty_output": cnt_empty, "num_samples": len(result)}
 
 with open("model_len_info.json", "w") as f:
     json.dump(length_info, f, indent=2)
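With this change, each entry in model_len_info.json grows from a single average length into a small record. A sketch of the new shape (the numbers here are made up for illustration):

# Example model_len_info.json entry after this commit; the values are invented.
example_entry = {
    "gpt-3.5-turbo-0125": {
        "avg_len": 1725.4,    # mean character length over non-empty outputs
        "empty_output": 3,    # outputs that were empty after stripping
        "num_samples": 1024,  # total outputs inspected for this model
    }
}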
    	
constants.py CHANGED

@@ -1,6 +1,6 @@
 from pathlib import Path
 
-DEFAULT_LP = 0.…
+DEFAULT_LP = 0.5
 
 banner_url = "https://allenai.github.io/WildBench/gray_banner.png" # the same repo here.
 BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'
    	
data_dir/battle_outcome.png CHANGED (binary image)
data_dir/battle_outcome_2d.png CHANGED (binary image)
data_dir/bootstrap_elo_estimates.png CHANGED (binary image)
    	
data_dir/elo_ranks.L=0.0.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1293, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1153, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1134, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1117, "# battles": 2058}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1099, "# battles": 1484}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1099, "# battles": 2519}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1068, "# battles": 3619}
+{"model name ": "gemini-1.0-pro", "elo overall": 1067, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1053, "# battles": 2606}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1027, "# battles": 2144}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1022, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1004, "# battles": 2091}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 990, "# battles": 3630}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 981, "# battles": 2094}
+{"model name ": "zephyr-7b-beta", "elo overall": 977, "# battles": 3543}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 973, "# battles": 14196}
+{"model name ": "gemma-7b-it", "elo overall": 947, "# battles": 2728}
+{"model name ": "command", "elo overall": 941, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 884, "# battles": 2461}
+{"model name ": "gemma-2b-it", "elo overall": 859, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 853, "# battles": 2406}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 843, "# battles": 2715}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 819, "# battles": 2659}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 804, "# battles": 2366}
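Each per-L file is plain JSONL with one model per line, so it can be loaded the same way app.py loads it; for example (column names taken from the records above):

# Load one of the newly added per-L Elo tables, mirroring the pd.read_json calls in app.py.
import pandas as pd

df = pd.read_json("data_dir/elo_ranks.L=0.0.jsonl", lines=True)
print(df.columns.tolist())  # ['model name ', 'elo overall', '# battles']
print(df.sort_values(by="elo overall", ascending=False).head(3))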
    	
data_dir/elo_ranks.L=0.1.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1207, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1130, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1112, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1088, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1070, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1068, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1056, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1051, "# battles": 1484}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1047, "# battles": 3619}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1025, "# battles": 2144}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1014, "# battles": 14196}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1006, "# battles": 2091}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 990, "# battles": 1532}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 984, "# battles": 2094}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 966, "# battles": 3630}
+{"model name ": "gemma-7b-it", "elo overall": 954, "# battles": 2728}
+{"model name ": "zephyr-7b-beta", "elo overall": 950, "# battles": 3543}
+{"model name ": "command", "elo overall": 938, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 931, "# battles": 2461}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 909, "# battles": 2406}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 908, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 905, "# battles": 2689}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 865, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 831, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.2.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1170, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1126, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1109, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1089, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1071, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1057, "# battles": 2606}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "# battles": 3619}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1044, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1041, "# battles": 1484}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1038, "# battles": 14196}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1009, "# battles": 2144}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 991, "# battles": 2091}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 984, "# battles": 1532}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 969, "# battles": 2094}
+{"model name ": "gemma-7b-it", "elo overall": 968, "# battles": 2728}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 965, "# battles": 3630}
+{"model name ": "zephyr-7b-beta", "elo overall": 948, "# battles": 3543}
+{"model name ": "command", "elo overall": 944, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 938, "# battles": 2461}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 925, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 925, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 920, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 880, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 849, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.3.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1145, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1122, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1107, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1092, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1074, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1053, "# battles": 14196}
+{"model name ": "Yi-34B-Chat", "elo overall": 1048, "# battles": 2606}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1042, "# battles": 3619}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1037, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1034, "# battles": 1484}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 997, "# battles": 2144}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 983, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 980, "# battles": 2091}
+{"model name ": "gemma-7b-it", "elo overall": 977, "# battles": 2728}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 966, "# battles": 3630}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 957, "# battles": 2094}
+{"model name ": "command", "elo overall": 946, "# battles": 1939}
+{"model name ": "zephyr-7b-beta", "elo overall": 944, "# battles": 3543}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 942, "# battles": 2461}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 938, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 937, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 926, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 889, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 863, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.4.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1127, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1119, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1095, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1079, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1064, "# battles": 14196}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1041, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1040, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1032, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1029, "# battles": 1484}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 987, "# battles": 2144}
+{"model name ": "gemma-7b-it", "elo overall": 986, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 983, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 970, "# battles": 2091}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 967, "# battles": 3630}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 949, "# battles": 2715}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 947, "# battles": 2094}
+{"model name ": "command", "elo overall": 947, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 944, "# battles": 2461}
+{"model name ": "gemma-2b-it", "elo overall": 944, "# battles": 2689}
+{"model name ": "zephyr-7b-beta", "elo overall": 939, "# battles": 3543}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 930, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 895, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 873, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.5.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1118, "# battles": 3658}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1115, "# battles": 5781}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1098, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1083, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1074, "# battles": 14196}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1043, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1034, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1029, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1026, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 993, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "# battles": 1532}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 980, "# battles": 2144}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 968, "# battles": 3630}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 962, "# battles": 2091}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 957, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 950, "# battles": 2689}
+{"model name ": "command", "elo overall": 947, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "# battles": 2461}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 940, "# battles": 2094}
+{"model name ": "zephyr-7b-beta", "elo overall": 936, "# battles": 3543}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 932, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 898, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 880, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.6.jsonl ADDED

@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1118, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "# battles": 2791}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1106, "# battles": 5781}
+{"model name ": "mistral-large-2402", "elo overall": 1102, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1087, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1083, "# battles": 14196}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1029, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1025, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1024, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 998, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 984, "# battles": 1532}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 974, "# battles": 2144}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 969, "# battles": 3630}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 964, "# battles": 2715}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 955, "# battles": 2091}
+{"model name ": "gemma-2b-it", "elo overall": 954, "# battles": 2689}
+{"model name ": "command", "elo overall": 948, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "# battles": 2461}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 933, "# battles": 2094}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 933, "# battles": 2406}
+{"model name ": "zephyr-7b-beta", "elo overall": 931, "# battles": 3543}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 900, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 884, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.7.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1119, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1107, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1104, "# battles": 2058}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1100, "# battles": 5781}
+{"model name ": "gemini-1.0-pro", "elo overall": 1091, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1090, "# battles": 14196}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1026, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1022, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1022, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 1001, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "# battles": 1532}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 969, "# battles": 3630}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 969, "# battles": 2144}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 969, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 957, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 951, "# battles": 2091}
+{"model name ": "command", "elo overall": 947, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "# battles": 2461}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 934, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 928, "# battles": 2094}
+{"model name ": "zephyr-7b-beta", "elo overall": 927, "# battles": 3543}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 901, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 886, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.8.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1120, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1110, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1107, "# battles": 2058}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1097, "# battles": 14196}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1096, "# battles": 5781}
+{"model name ": "gemini-1.0-pro", "elo overall": 1095, "# battles": 2035}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1046, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1023, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1020, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1020, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 1005, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "# battles": 1532}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 973, "# battles": 2715}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 969, "# battles": 3630}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 965, "# battles": 2144}
+{"model name ": "gemma-2b-it", "elo overall": 958, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 947, "# battles": 2091}
+{"model name ": "command", "elo overall": 946, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "# battles": 2461}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 934, "# battles": 2406}
+{"model name ": "zephyr-7b-beta", "elo overall": 924, "# battles": 3543}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 923, "# battles": 2094}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 901, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 887, "# battles": 2659}
    	
data_dir/elo_ranks.L=0.9.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1121, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1112, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1111, "# battles": 2058}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1105, "# battles": 14196}
+{"model name ": "gemini-1.0-pro", "elo overall": 1099, "# battles": 2035}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1093, "# battles": 5781}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1048, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1021, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1018, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1018, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 1008, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "# battles": 1532}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 978, "# battles": 2715}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 968, "# battles": 3630}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 961, "# battles": 2144}
+{"model name ": "gemma-2b-it", "elo overall": 959, "# battles": 2689}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "# battles": 2461}
+{"model name ": "command", "elo overall": 945, "# battles": 1939}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 944, "# battles": 2091}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 933, "# battles": 2406}
+{"model name ": "zephyr-7b-beta", "elo overall": 922, "# battles": 3543}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 919, "# battles": 2094}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 900, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 887, "# battles": 2659}
    	
data_dir/elo_ranks.L=1.0.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "claude-3-opus-20240229", "elo overall": 1123, "# battles": 3658}
+{"model name ": "mistral-large-2402", "elo overall": 1114, "# battles": 2058}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1114, "# battles": 2791}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1112, "# battles": 14196}
+{"model name ": "gemini-1.0-pro", "elo overall": 1103, "# battles": 2035}
+{"model name ": "gpt-4-0125-preview", "elo overall": 1092, "# battles": 5781}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1050, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1019, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1018, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1017, "# battles": 1484}
+{"model name ": "gemma-7b-it", "elo overall": 1010, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "# battles": 1532}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 981, "# battles": 2715}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 967, "# battles": 3630}
+{"model name ": "gemma-2b-it", "elo overall": 960, "# battles": 2689}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 957, "# battles": 2144}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "# battles": 2461}
+{"model name ": "command", "elo overall": 944, "# battles": 1939}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 940, "# battles": 2091}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 932, "# battles": 2406}
+{"model name ": "zephyr-7b-beta", "elo overall": 919, "# battles": 3543}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 914, "# battles": 2094}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 899, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 886, "# battles": 2659}
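The `elo_ranks.L=*.jsonl` files added above all share one flat schema per line: a model name (the `"model name "` key really does carry a trailing space), an overall Elo, and a battle count. As a minimal sketch of how these per-L tables could be read back to see how a model's Elo shifts across the length-penalty sweep (the helper name below is illustrative and not part of the repo; it only assumes the files shown above under `data_dir/`):

```python
import pandas as pd

# Illustrative helper (not repo code): load one data_dir/elo_ranks.L=<L>.jsonl table.
def load_elo_ranks(L: float) -> pd.DataFrame:
    df = pd.read_json(f"data_dir/elo_ranks.L={L}.jsonl", lines=True)
    # The source key "model name " has a trailing space; normalize it here.
    return df.rename(columns={"model name ": "model"}).set_index("model")

# Track one model across the sweep; e.g. in the tables above gpt-3.5-turbo-0125
# rises from 1083 at L=0.6 to 1112 at L=1.0.
sweep = {
    L: load_elo_ranks(L).loc["gpt-3.5-turbo-0125", "elo overall"]
    for L in [0.6, 0.7, 0.8, 0.9, 1.0]
}
print(sweep)
```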
    	
data_dir/elo_ranks.all.L=0.0.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1293, "Information seeking": 1267, "Creative Writing": 1248, "Coding & Debugging": 1366, "Reasoning": 1353, "Editing": 1210, "Math": 1275, "Planning": 1294, "Brainstorming": 1311, "Role playing": 1231, "Advice seeking": 1287, "Data Analysis": 1277, "Others": 1066, "average": 1265.4166666666667, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1153, "Information seeking": 1137, "Creative Writing": 1073, "Coding & Debugging": 1303, "Reasoning": 1218, "Editing": 1166, "Math": 1207, "Planning": 1188, "Brainstorming": 1131, "Role playing": 982, "Advice seeking": 1140, "Data Analysis": 1183, "Others": 1036, "average": 1147.0, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1134, "Information seeking": 1104, "Creative Writing": 1077, "Coding & Debugging": 1247, "Reasoning": 1171, "Editing": 1155, "Math": 1144, "Planning": 1167, "Brainstorming": 1096, "Role playing": 995, "Advice seeking": 1089, "Data Analysis": 1174, "Others": 1021, "average": 1120.0, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1117, "Information seeking": 1092, "Creative Writing": 1106, "Coding & Debugging": 1162, "Reasoning": 1127, "Editing": 1106, "Math": 1090, "Planning": 1082, "Brainstorming": 1078, "Role playing": 1056, "Advice seeking": 1070, "Data Analysis": 1103, "Others": 1020, "average": 1091.0, "# battles": 2058}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1099, "Information seeking": 1086, "Creative Writing": 1084, "Coding & Debugging": 1126, "Reasoning": 1059, "Editing": 1036, "Math": 1074, "Planning": 1048, "Brainstorming": 1036, "Role playing": 1082, "Advice seeking": 1019, "Data Analysis": 1044, "Others": 989, "average": 1056.9166666666667, "# battles": 1484}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1099, "Information seeking": 1078, "Creative Writing": 1139, "Coding & Debugging": 1136, "Reasoning": 1045, "Editing": 1106, "Math": 1017, "Planning": 1079, "Brainstorming": 1073, "Role playing": 1121, "Advice seeking": 1065, "Data Analysis": 1059, "Others": 1008, "average": 1077.1666666666667, "# battles": 2519}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1068, "Information seeking": 1073, "Creative Writing": 1046, "Coding & Debugging": 1115, "Reasoning": 1056, "Editing": 1013, "Math": 1059, "Planning": 1001, "Brainstorming": 1025, "Role playing": 1084, "Advice seeking": 1003, "Data Analysis": 1057, "Others": 994, "average": 1043.8333333333333, "# battles": 3619}
+{"model name ": "gemini-1.0-pro", "elo overall": 1067, "Information seeking": 1086, "Creative Writing": 1044, "Coding & Debugging": 1088, "Reasoning": 1075, "Editing": 1026, "Math": 1056, "Planning": 1070, "Brainstorming": 1026, "Role playing": 1025, "Advice seeking": 1075, "Data Analysis": 1091, "Others": 1006, "average": 1055.6666666666667, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1053, "Information seeking": 1068, "Creative Writing": 1099, "Coding & Debugging": 924, "Reasoning": 1067, "Editing": 1040, "Math": 998, "Planning": 1088, "Brainstorming": 1143, "Role playing": 1066, "Advice seeking": 1088, "Data Analysis": 937, "Others": 1001, "average": 1043.25, "# battles": 2606}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1027, "Information seeking": 1091, "Creative Writing": 1065, "Coding & Debugging": 866, "Reasoning": 1028, "Editing": 985, "Math": 962, "Planning": 1007, "Brainstorming": 1058, "Role playing": 1070, "Advice seeking": 1041, "Data Analysis": 943, "Others": 1022, "average": 1011.5, "# battles": 2144}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1022, "Information seeking": 1020, "Creative Writing": 984, "Coding & Debugging": 1065, "Reasoning": 1010, "Editing": 985, "Math": 1042, "Planning": 996, "Brainstorming": 962, "Role playing": 1013, "Advice seeking": 991, "Data Analysis": 1025, "Others": 1009, "average": 1008.5, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1004, "Information seeking": 1052, "Creative Writing": 1051, "Coding & Debugging": 835, "Reasoning": 974, "Editing": 981, "Math": 936, "Planning": 982, "Brainstorming": 1023, "Role playing": 1045, "Advice seeking": 1007, "Data Analysis": 920, "Others": 1015, "average": 985.0833333333334, "# battles": 2091}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 990, "Information seeking": 997, "Creative Writing": 1017, "Coding & Debugging": 933, "Reasoning": 989, "Editing": 968, "Math": 967, "Planning": 959, "Brainstorming": 934, "Role playing": 1068, "Advice seeking": 972, "Data Analysis": 927, "Others": 988, "average": 976.5833333333334, "# battles": 3630}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 981, "Information seeking": 1028, "Creative Writing": 1024, "Coding & Debugging": 830, "Reasoning": 951, "Editing": 961, "Math": 898, "Planning": 990, "Brainstorming": 997, "Role playing": 1052, "Advice seeking": 1024, "Data Analysis": 929, "Others": 1012, "average": 974.6666666666666, "# battles": 2094}
+{"model name ": "zephyr-7b-beta", "elo overall": 977, "Information seeking": 995, "Creative Writing": 987, "Coding & Debugging": 982, "Reasoning": 958, "Editing": 983, "Math": 925, "Planning": 961, "Brainstorming": 938, "Role playing": 1021, "Advice seeking": 936, "Data Analysis": 1003, "Others": 958, "average": 970.5833333333334, "# battles": 3543}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 973, "Information seeking": 931, "Creative Writing": 931, "Coding & Debugging": 1149, "Reasoning": 1015, "Editing": 992, "Math": 1147, "Planning": 981, "Brainstorming": 930, "Role playing": 920, "Advice seeking": 957, "Data Analysis": 1068, "Others": 980, "average": 1000.0833333333334, "# battles": 14196}
+{"model name ": "gemma-7b-it", "elo overall": 947, "Information seeking": 930, "Creative Writing": 935, "Coding & Debugging": 981, "Reasoning": 955, "Editing": 919, "Math": 984, "Planning": 980, "Brainstorming": 982, "Role playing": 939, "Advice seeking": 978, "Data Analysis": 974, "Others": 990, "average": 962.25, "# battles": 2728}
+{"model name ": "command", "elo overall": 941, "Information seeking": 932, "Creative Writing": 935, "Coding & Debugging": 957, "Reasoning": 920, "Editing": 934, "Math": 925, "Planning": 976, "Brainstorming": 995, "Role playing": 941, "Advice seeking": 961, "Data Analysis": 954, "Others": 978, "average": 950.6666666666666, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 884, "Information seeking": 896, "Creative Writing": 898, "Coding & Debugging": 813, "Reasoning": 898, "Editing": 939, "Math": 892, "Planning": 902, "Brainstorming": 939, "Role playing": 898, "Advice seeking": 942, "Data Analysis": 923, "Others": 1000, "average": 911.6666666666666, "# battles": 2461}
+{"model name ": "gemma-2b-it", "elo overall": 859, "Information seeking": 835, "Creative Writing": 895, "Coding & Debugging": 871, "Reasoning": 816, "Editing": 878, "Math": 880, "Planning": 895, "Brainstorming": 925, "Role playing": 896, "Advice seeking": 881, "Data Analysis": 893, "Others": 977, "average": 886.8333333333334, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 853, "Information seeking": 872, "Creative Writing": 865, "Coding & Debugging": 790, "Reasoning": 880, "Editing": 956, "Math": 908, "Planning": 895, "Brainstorming": 892, "Role playing": 863, "Advice seeking": 938, "Data Analysis": 877, "Others": 1006, "average": 895.1666666666666, "# battles": 2406}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 843, "Information seeking": 844, "Creative Writing": 863, "Coding & Debugging": 803, "Reasoning": 837, "Editing": 871, "Math": 873, "Planning": 819, "Brainstorming": 870, "Role playing": 904, "Advice seeking": 839, "Data Analysis": 866, "Others": 971, "average": 863.3333333333334, "# battles": 2715}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 819, "Information seeking": 778, "Creative Writing": 798, "Coding & Debugging": 930, "Reasoning": 787, "Editing": 870, "Math": 885, "Planning": 802, "Brainstorming": 773, "Role playing": 883, "Advice seeking": 815, "Data Analysis": 912, "Others": 962, "average": 849.5833333333334, "# battles": 2659}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 804, "Information seeking": 802, "Creative Writing": 833, "Coding & Debugging": 737, "Reasoning": 801, "Editing": 916, "Math": 849, "Planning": 832, "Brainstorming": 854, "Role playing": 848, "Advice seeking": 884, "Data Analysis": 859, "Others": 995, "average": 850.8333333333334, "# battles": 2366}
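The `elo_ranks.all.*` variants extend the flat schema with one Elo column per task category plus an `average` field, which appears to be the plain mean of the 12 per-task Elos rather than of `"elo overall"`: for gpt-4-0125-preview above, the 12 task values sum to 15185, and 15185 / 12 = 1265.4166..., matching the stored 1265.4166666666667. A small sanity-check sketch under that assumption (not repo code):

```python
import json

# Assumption being checked: in elo_ranks.all.* files, "average" is the mean of the
# 12 per-task Elo columns; all other keys are metadata.
META = {"model name ", "elo overall", "average", "# battles"}

with open("data_dir/elo_ranks.all.L=0.0.jsonl") as f:
    for line in f:
        row = json.loads(line)
        task_elos = [v for k, v in row.items() if k not in META]
        assert abs(sum(task_elos) / len(task_elos) - row["average"]) < 1e-6
```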
    	
data_dir/elo_ranks.all.L=0.1.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1207, "Information seeking": 1190, "Creative Writing": 1190, "Coding & Debugging": 1209, "Reasoning": 1253, "Editing": 1170, "Math": 1207, "Planning": 1223, "Brainstorming": 1248, "Role playing": 1161, "Advice seeking": 1222, "Data Analysis": 1182, "Others": 1060, "average": 1192.9166666666667, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1130, "Information seeking": 1129, "Creative Writing": 1090, "Coding & Debugging": 1156, "Reasoning": 1177, "Editing": 1131, "Math": 1164, "Planning": 1166, "Brainstorming": 1144, "Role playing": 1028, "Advice seeking": 1127, "Data Analysis": 1104, "Others": 1033, "average": 1120.75, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1112, "Information seeking": 1099, "Creative Writing": 1095, "Coding & Debugging": 1118, "Reasoning": 1137, "Editing": 1109, "Math": 1108, "Planning": 1137, "Brainstorming": 1109, "Role playing": 1044, "Advice seeking": 1087, "Data Analysis": 1104, "Others": 1022, "average": 1097.4166666666667, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1088, "Information seeking": 1080, "Creative Writing": 1087, "Coding & Debugging": 1093, "Reasoning": 1112, "Editing": 1087, "Math": 1079, "Planning": 1078, "Brainstorming": 1068, "Role playing": 1049, "Advice seeking": 1063, "Data Analysis": 1060, "Others": 1017, "average": 1072.75, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1070, "Information seeking": 1095, "Creative Writing": 1058, "Coding & Debugging": 1049, "Reasoning": 1071, "Editing": 1020, "Math": 1057, "Planning": 1073, "Brainstorming": 1036, "Role playing": 1042, "Advice seeking": 1080, "Data Analysis": 1061, "Others": 1009, "average": 1054.25, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1068, "Information seeking": 1062, "Creative Writing": 1096, "Coding & Debugging": 1026, "Reasoning": 1064, "Editing": 1058, "Math": 1011, "Planning": 1088, "Brainstorming": 1112, "Role playing": 1071, "Advice seeking": 1078, "Data Analysis": 1009, "Others": 1003, "average": 1056.5, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1056, "Information seeking": 1048, "Creative Writing": 1091, "Coding & Debugging": 1044, "Reasoning": 1022, "Editing": 1077, "Math": 989, "Planning": 1057, "Brainstorming": 1059, "Role playing": 1065, "Advice seeking": 1053, "Data Analysis": 1009, "Others": 1002, "average": 1043.0, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1051, "Information seeking": 1053, "Creative Writing": 1043, "Coding & Debugging": 1045, "Reasoning": 1037, "Editing": 1018, "Math": 1057, "Planning": 1027, "Brainstorming": 1018, "Role playing": 1041, "Advice seeking": 1007, "Data Analysis": 1013, "Others": 988, "average": 1028.9166666666667, "# battles": 1484}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1047, "Information seeking": 1055, "Creative Writing": 1041, "Coding & Debugging": 1047, "Reasoning": 1046, "Editing": 1018, "Math": 1040, "Planning": 1015, "Brainstorming": 1028, "Role playing": 1060, "Advice seeking": 1015, "Data Analysis": 1015, "Others": 984, "average": 1030.3333333333333, "# battles": 3619}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1025, "Information seeking": 1049, "Creative Writing": 1021, "Coding & Debugging": 990, "Reasoning": 1041, "Editing": 995, "Math": 984, "Planning": 1011, "Brainstorming": 1029, "Role playing": 1025, "Advice seeking": 1031, "Data Analysis": 1019, "Others": 1021, "average": 1018.0, "# battles": 2144}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1014, "Information seeking": 991, "Creative Writing": 1010, "Coding & Debugging": 1078, "Reasoning": 1031, "Editing": 1018, "Math": 1143, "Planning": 1004, "Brainstorming": 983, "Role playing": 1019, "Advice seeking": 983, "Data Analysis": 1035, "Others": 1024, "average": 1026.5833333333333, "# battles": 14196}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1006, "Information seeking": 1019, "Creative Writing": 1016, "Coding & Debugging": 962, "Reasoning": 981, "Editing": 997, "Math": 957, "Planning": 992, "Brainstorming": 996, "Role playing": 1008, "Advice seeking": 993, "Data Analysis": 981, "Others": 1004, "average": 992.1666666666666, "# battles": 2091}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 990, "Information seeking": 995, "Creative Writing": 963, "Coding & Debugging": 1000, "Reasoning": 994, "Editing": 971, "Math": 1024, "Planning": 978, "Brainstorming": 954, "Role playing": 991, "Advice seeking": 971, "Data Analysis": 1000, "Others": 1006, "average": 987.25, "# battles": 1532}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 984, "Information seeking": 1001, "Creative Writing": 993, "Coding & Debugging": 952, "Reasoning": 956, "Editing": 984, "Math": 914, "Planning": 992, "Brainstorming": 978, "Role playing": 1006, "Advice seeking": 1005, "Data Analysis": 983, "Others": 1010, "average": 981.1666666666666, "# battles": 2094}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 966, "Information seeking": 982, "Creative Writing": 985, "Coding & Debugging": 899, "Reasoning": 974, "Editing": 948, "Math": 955, "Planning": 943, "Brainstorming": 924, "Role playing": 1020, "Advice seeking": 963, "Data Analysis": 903, "Others": 980, "average": 956.3333333333334, "# battles": 3630}
+{"model name ": "gemma-7b-it", "elo overall": 954, "Information seeking": 956, "Creative Writing": 957, "Coding & Debugging": 928, "Reasoning": 973, "Editing": 925, "Math": 987, "Planning": 989, "Brainstorming": 1010, "Role playing": 954, "Advice seeking": 993, "Data Analysis": 951, "Others": 992, "average": 967.9166666666666, "# battles": 2728}
+{"model name ": "zephyr-7b-beta", "elo overall": 950, "Information seeking": 983, "Creative Writing": 961, "Coding & Debugging": 911, "Reasoning": 941, "Editing": 946, "Math": 916, "Planning": 934, "Brainstorming": 922, "Role playing": 989, "Advice seeking": 931, "Data Analysis": 953, "Others": 961, "average": 945.6666666666666, "# battles": 3543}
+{"model name ": "command", "elo overall": 938, "Information seeking": 948, "Creative Writing": 951, "Coding & Debugging": 910, "Reasoning": 921, "Editing": 939, "Math": 921, "Planning": 967, "Brainstorming": 998, "Role playing": 956, "Advice seeking": 973, "Data Analysis": 924, "Others": 980, "average": 949.0, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 931, "Information seeking": 916, "Creative Writing": 923, "Coding & Debugging": 971, "Reasoning": 927, "Editing": 970, "Math": 920, "Planning": 935, "Brainstorming": 944, "Role playing": 921, "Advice seeking": 949, "Data Analysis": 1000, "Others": 991, "average": 947.25, "# battles": 2461}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 909, "Information seeking": 902, "Creative Writing": 898, "Coding & Debugging": 956, "Reasoning": 917, "Editing": 980, "Math": 931, "Planning": 928, "Brainstorming": 900, "Role playing": 896, "Advice seeking": 946, "Data Analysis": 961, "Others": 1001, "average": 934.6666666666666, "# battles": 2406}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 908, "Information seeking": 897, "Creative Writing": 904, "Coding & Debugging": 967, "Reasoning": 890, "Editing": 921, "Math": 916, "Planning": 872, "Brainstorming": 896, "Role playing": 940, "Advice seeking": 869, "Data Analysis": 965, "Others": 971, "average": 917.3333333333334, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 905, "Information seeking": 890, "Creative Writing": 936, "Coding & Debugging": 872, "Reasoning": 877, "Editing": 910, "Math": 930, "Planning": 912, "Brainstorming": 960, "Role playing": 938, "Advice seeking": 929, "Data Analysis": 923, "Others": 984, "average": 921.75, "# battles": 2689}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 865, "Information seeking": 839, "Creative Writing": 874, "Coding & Debugging": 921, "Reasoning": 840, "Editing": 943, "Math": 900, "Planning": 866, "Brainstorming": 872, "Role playing": 883, "Advice seeking": 896, "Data Analysis": 944, "Others": 989, "average": 897.25, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 831, "Information seeking": 813, "Creative Writing": 819, "Coding & Debugging": 890, "Reasoning": 817, "Editing": 866, "Math": 891, "Planning": 820, "Brainstorming": 807, "Role playing": 897, "Advice seeking": 836, "Data Analysis": 896, "Others": 969, "average": 860.0833333333334, "# battles": 2659}
    	
data_dir/elo_ranks.all.L=0.2.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1170, "Information seeking": 1151, "Creative Writing": 1159, "Coding & Debugging": 1174, "Reasoning": 1203, "Editing": 1149, "Math": 1173, "Planning": 1188, "Brainstorming": 1210, "Role playing": 1128, "Advice seeking": 1180, "Data Analysis": 1153, "Others": 1054, "average": 1160.1666666666667, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1126, "Information seeking": 1132, "Creative Writing": 1097, "Coding & Debugging": 1134, "Reasoning": 1166, "Editing": 1113, "Math": 1154, "Planning": 1159, "Brainstorming": 1146, "Role playing": 1048, "Advice seeking": 1124, "Data Analysis": 1095, "Others": 1031, "average": 1116.5833333333333, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1109, "Information seeking": 1098, "Creative Writing": 1099, "Coding & Debugging": 1101, "Reasoning": 1126, "Editing": 1094, "Math": 1096, "Planning": 1126, "Brainstorming": 1114, "Role playing": 1058, "Advice seeking": 1090, "Data Analysis": 1091, "Others": 1023, "average": 1093.0, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1089, "Information seeking": 1084, "Creative Writing": 1088, "Coding & Debugging": 1098, "Reasoning": 1112, "Editing": 1083, "Math": 1078, "Planning": 1082, "Brainstorming": 1066, "Role playing": 1053, "Advice seeking": 1069, "Data Analysis": 1062, "Others": 1016, "average": 1074.25, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1071, "Information seeking": 1095, "Creative Writing": 1057, "Coding & Debugging": 1053, "Reasoning": 1074, "Editing": 1024, "Math": 1061, "Planning": 1081, "Brainstorming": 1038, "Role playing": 1038, "Advice seeking": 1082, "Data Analysis": 1063, "Others": 1009, "average": 1056.25, "# battles": 2035}
+{"model name ": "Yi-34B-Chat", "elo overall": 1057, "Information seeking": 1049, "Creative Writing": 1085, "Coding & Debugging": 1019, "Reasoning": 1044, "Editing": 1057, "Math": 998, "Planning": 1071, "Brainstorming": 1091, "Role playing": 1062, "Advice seeking": 1063, "Data Analysis": 1000, "Others": 1002, "average": 1045.0833333333333, "# battles": 2606}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "Information seeking": 1049, "Creative Writing": 1036, "Coding & Debugging": 1047, "Reasoning": 1042, "Editing": 1019, "Math": 1037, "Planning": 1019, "Brainstorming": 1029, "Role playing": 1051, "Advice seeking": 1015, "Data Analysis": 1015, "Others": 979, "average": 1028.1666666666667, "# battles": 3619}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1044, "Information seeking": 1038, "Creative Writing": 1078, "Coding & Debugging": 1035, "Reasoning": 1015, "Editing": 1070, "Math": 979, "Planning": 1051, "Brainstorming": 1053, "Role playing": 1051, "Advice seeking": 1049, "Data Analysis": 1002, "Others": 997, "average": 1034.8333333333333, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1041, "Information seeking": 1044, "Creative Writing": 1028, "Coding & Debugging": 1037, "Reasoning": 1033, "Editing": 1014, "Math": 1053, "Planning": 1018, "Brainstorming": 1010, "Role playing": 1028, "Advice seeking": 1003, "Data Analysis": 1010, "Others": 989, "average": 1022.25, "# battles": 1484}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1038, "Information seeking": 1021, "Creative Writing": 1033, "Coding & Debugging": 1091, "Reasoning": 1052, "Editing": 1031, "Math": 1156, "Planning": 1024, "Brainstorming": 1008, "Role playing": 1042, "Advice seeking": 1007, "Data Analysis": 1053, "Others": 1045, "average": 1046.9166666666667, "# battles": 14196}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1009, "Information seeking": 1025, "Creative Writing": 1000, "Coding & Debugging": 985, "Reasoning": 1029, "Editing": 987, "Math": 982, "Planning": 998, "Brainstorming": 1011, "Role playing": 1008, "Advice seeking": 1018, "Data Analysis": 1014, "Others": 1018, "average": 1006.25, "# battles": 2144}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 991, "Information seeking": 998, "Creative Writing": 1000, "Coding & Debugging": 959, "Reasoning": 970, "Editing": 991, "Math": 957, "Planning": 979, "Brainstorming": 982, "Role playing": 993, "Advice seeking": 977, "Data Analysis": 978, "Others": 1001, "average": 982.0833333333334, "# battles": 2091}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 984, "Information seeking": 990, "Creative Writing": 960, "Coding & Debugging": 1000, "Reasoning": 991, "Editing": 971, "Math": 1022, "Planning": 978, "Brainstorming": 952, "Role playing": 987, "Advice seeking": 967, "Data Analysis": 1001, "Others": 1002, "average": 985.0833333333334, "# battles": 1532}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 969, "Information seeking": 979, "Creative Writing": 976, "Coding & Debugging": 949, "Reasoning": 944, "Editing": 985, "Math": 914, "Planning": 979, "Brainstorming": 961, "Role playing": 990, "Advice seeking": 990, "Data Analysis": 977, "Others": 1009, "average": 971.0833333333334, "# battles": 2094}
+{"model name ": "gemma-7b-it", "elo overall": 968, "Information seeking": 967, "Creative Writing": 972, "Coding & Debugging": 937, "Reasoning": 986, "Editing": 946, "Math": 995, "Planning": 1009, "Brainstorming": 1028, "Role playing": 957, "Advice seeking": 1003, "Data Analysis": 957, "Others": 991, "average": 979.0, "# battles": 2728}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 965, "Information seeking": 979, "Creative Writing": 977, "Coding & Debugging": 908, "Reasoning": 975, "Editing": 942, "Math": 951, "Planning": 946, "Brainstorming": 927, "Role playing": 1010, "Advice seeking": 963, "Data Analysis": 904, "Others": 977, "average": 954.9166666666666, "# battles": 3630}
+{"model name ": "zephyr-7b-beta", "elo overall": 948, "Information seeking": 985, "Creative Writing": 954, "Coding & Debugging": 904, "Reasoning": 942, "Editing": 938, "Math": 914, "Planning": 933, "Brainstorming": 918, "Role playing": 986, "Advice seeking": 936, "Data Analysis": 949, "Others": 960, "average": 943.25, "# battles": 3543}
+{"model name ": "command", "elo overall": 944, "Information seeking": 956, "Creative Writing": 956, "Coding & Debugging": 914, "Reasoning": 926, "Editing": 945, "Math": 924, "Planning": 959, "Brainstorming": 995, "Role playing": 965, "Advice seeking": 981, "Data Analysis": 930, "Others": 981, "average": 952.6666666666666, "# battles": 1939}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 938, "Information seeking": 925, "Creative Writing": 935, "Coding & Debugging": 977, "Reasoning": 931, "Editing": 972, "Math": 926, "Planning": 940, "Brainstorming": 944, "Role playing": 933, "Advice seeking": 949, "Data Analysis": 1003, "Others": 991, "average": 952.1666666666666, "# battles": 2461}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 925, "Information seeking": 920, "Creative Writing": 923, "Coding & Debugging": 974, "Reasoning": 911, "Editing": 937, "Math": 925, "Planning": 891, "Brainstorming": 914, "Role playing": 949, "Advice seeking": 886, "Data Analysis": 975, "Others": 973, "average": 931.5, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 925, "Information seeking": 904, "Creative Writing": 949, "Coding & Debugging": 898, "Reasoning": 903, "Editing": 926, "Math": 961, "Planning": 928, "Brainstorming": 976, "Role playing": 948, "Advice seeking": 943, "Data Analysis": 942, "Others": 985, "average": 938.5833333333334, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 920, "Information seeking": 913, "Creative Writing": 914, "Coding & Debugging": 966, "Reasoning": 925, "Editing": 982, "Math": 933, "Planning": 934, "Brainstorming": 906, "Role playing": 910, "Advice seeking": 947, "Data Analysis": 968, "Others": 1003, "average": 941.75, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 880, "Information seeking": 855, "Creative Writing": 890, "Coding & Debugging": 933, "Reasoning": 852, "Editing": 947, "Math": 909, "Planning": 874, "Brainstorming": 885, "Role playing": 902, "Advice seeking": 896, "Data Analysis": 951, "Others": 991, "average": 907.0833333333334, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 849, "Information seeking": 840, "Creative Writing": 833, "Coding & Debugging": 903, "Reasoning": 843, "Editing": 877, "Math": 901, "Planning": 844, "Brainstorming": 828, "Role playing": 903, "Advice seeking": 859, "Data Analysis": 906, "Others": 974, "average": 875.9166666666666, "# battles": 2659}
    	
data_dir/elo_ranks.all.L=0.3.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1145, "Information seeking": 1125, "Creative Writing": 1139, "Coding & Debugging": 1149, "Reasoning": 1168, "Editing": 1136, "Math": 1148, "Planning": 1160, "Brainstorming": 1184, "Role playing": 1107, "Advice seeking": 1147, "Data Analysis": 1133, "Others": 1051, "average": 1137.25, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1122, "Information seeking": 1132, "Creative Writing": 1100, "Coding & Debugging": 1120, "Reasoning": 1157, "Editing": 1098, "Math": 1147, "Planning": 1152, "Brainstorming": 1146, "Role playing": 1057, "Advice seeking": 1118, "Data Analysis": 1090, "Others": 1030, "average": 1112.25, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1107, "Information seeking": 1100, "Creative Writing": 1102, "Coding & Debugging": 1092, "Reasoning": 1118, "Editing": 1088, "Math": 1089, "Planning": 1121, "Brainstorming": 1117, "Role playing": 1064, "Advice seeking": 1090, "Data Analysis": 1086, "Others": 1026, "average": 1091.0833333333333, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1092, "Information seeking": 1090, "Creative Writing": 1089, "Coding & Debugging": 1101, "Reasoning": 1112, "Editing": 1083, "Math": 1079, "Planning": 1088, "Brainstorming": 1065, "Role playing": 1059, "Advice seeking": 1076, "Data Analysis": 1063, "Others": 1016, "average": 1076.75, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1074, "Information seeking": 1097, "Creative Writing": 1059, "Coding & Debugging": 1057, "Reasoning": 1078, "Editing": 1028, "Math": 1063, "Planning": 1090, "Brainstorming": 1044, "Role playing": 1038, "Advice seeking": 1087, "Data Analysis": 1064, "Others": 1008, "average": 1059.4166666666667, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1053, "Information seeking": 1041, "Creative Writing": 1048, "Coding & Debugging": 1101, "Reasoning": 1068, "Editing": 1042, "Math": 1166, "Planning": 1041, "Brainstorming": 1026, "Role playing": 1054, "Advice seeking": 1025, "Data Analysis": 1068, "Others": 1062, "average": 1061.8333333333333, "# battles": 14196}
+{"model name ": "Yi-34B-Chat", "elo overall": 1048, "Information seeking": 1039, "Creative Writing": 1073, "Coding & Debugging": 1013, "Reasoning": 1030, "Editing": 1054, "Math": 986, "Planning": 1057, "Brainstorming": 1077, "Role playing": 1054, "Advice seeking": 1049, "Data Analysis": 995, "Others": 1002, "average": 1035.75, "# battles": 2606}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1042, "Information seeking": 1049, "Creative Writing": 1034, "Coding & Debugging": 1047, "Reasoning": 1041, "Editing": 1019, "Math": 1035, "Planning": 1022, "Brainstorming": 1031, "Role playing": 1048, "Advice seeking": 1016, "Data Analysis": 1015, "Others": 976, "average": 1027.75, "# battles": 3619}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1037, "Information seeking": 1031, "Creative Writing": 1068, "Coding & Debugging": 1029, "Reasoning": 1009, "Editing": 1065, "Math": 970, "Planning": 1050, "Brainstorming": 1049, "Role playing": 1042, "Advice seeking": 1045, "Data Analysis": 999, "Others": 994, "average": 1029.25, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1034, "Information seeking": 1039, "Creative Writing": 1018, "Coding & Debugging": 1032, "Reasoning": 1032, "Editing": 1011, "Math": 1052, "Planning": 1012, "Brainstorming": 1007, "Role playing": 1018, "Advice seeking": 1002, "Data Analysis": 1010, "Others": 988, "average": 1018.4166666666666, "# battles": 1484}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 997, "Information seeking": 1006, "Creative Writing": 986, "Coding & Debugging": 982, "Reasoning": 1019, "Editing": 982, "Math": 981, "Planning": 986, "Brainstorming": 998, "Role playing": 997, "Advice seeking": 1006, "Data Analysis": 1012, "Others": 1016, "average": 997.5833333333334, "# battles": 2144}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 983, "Information seeking": 986, "Creative Writing": 960, "Coding & Debugging": 1001, "Reasoning": 991, "Editing": 974, "Math": 1020, "Planning": 980, "Brainstorming": 954, "Role playing": 986, "Advice seeking": 967, "Data Analysis": 1002, "Others": 1002, "average": 985.25, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 980, "Information seeking": 981, "Creative Writing": 989, "Coding & Debugging": 956, "Reasoning": 961, "Editing": 985, "Math": 959, "Planning": 968, "Brainstorming": 970, "Role playing": 985, "Advice seeking": 964, "Data Analysis": 975, "Others": 997, "average": 974.1666666666666, "# battles": 2091}
+{"model name ": "gemma-7b-it", "elo overall": 977, "Information seeking": 975, "Creative Writing": 985, "Coding & Debugging": 944, "Reasoning": 996, "Editing": 962, "Math": 1004, "Planning": 1024, "Brainstorming": 1041, "Role playing": 963, "Advice seeking": 1012, "Data Analysis": 960, "Others": 991, "average": 988.0833333333334, "# battles": 2728}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 966, "Information seeking": 982, "Creative Writing": 974, "Coding & Debugging": 915, "Reasoning": 978, "Editing": 937, "Math": 948, "Planning": 948, "Brainstorming": 932, "Role playing": 1007, "Advice seeking": 967, "Data Analysis": 904, "Others": 974, "average": 955.5, "# battles": 3630}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 957, "Information seeking": 963, "Creative Writing": 964, "Coding & Debugging": 947, "Reasoning": 933, "Editing": 984, "Math": 913, "Planning": 967, "Brainstorming": 948, "Role playing": 979, "Advice seeking": 977, "Data Analysis": 972, "Others": 1007, "average": 962.8333333333334, "# battles": 2094}
+{"model name ": "command", "elo overall": 946, "Information seeking": 960, "Creative Writing": 960, "Coding & Debugging": 913, "Reasoning": 928, "Editing": 945, "Math": 926, "Planning": 952, "Brainstorming": 991, "Role playing": 972, "Advice seeking": 985, "Data Analysis": 931, "Others": 981, "average": 953.6666666666666, "# battles": 1939}
+{"model name ": "zephyr-7b-beta", "elo overall": 944, "Information seeking": 984, "Creative Writing": 947, "Coding & Debugging": 898, "Reasoning": 941, "Editing": 932, "Math": 910, "Planning": 931, "Brainstorming": 915, "Role playing": 983, "Advice seeking": 939, "Data Analysis": 945, "Others": 958, "average": 940.25, "# battles": 3543}
+{"model name ": "Llama-2-70b-chat-hf", "elo overall": 942, "Information seeking": 930, "Creative Writing": 940, "Coding & Debugging": 980, "Reasoning": 932, "Editing": 971, "Math": 930, "Planning": 942, "Brainstorming": 942, "Role playing": 937, "Advice seeking": 949, "Data Analysis": 1005, "Others": 992, "average": 954.1666666666666, "# battles": 2461}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 938, "Information seeking": 936, "Creative Writing": 937, "Coding & Debugging": 980, "Reasoning": 928, "Editing": 949, "Math": 933, "Planning": 907, "Brainstorming": 930, "Role playing": 955, "Advice seeking": 900, "Data Analysis": 983, "Others": 974, "average": 942.6666666666666, "# battles": 2715}
+{"model name ": "gemma-2b-it", "elo overall": 937, "Information seeking": 914, "Creative Writing": 958, "Coding & Debugging": 915, "Reasoning": 921, "Editing": 936, "Math": 980, "Planning": 937, "Brainstorming": 986, "Role playing": 956, "Advice seeking": 952, "Data Analysis": 955, "Others": 986, "average": 949.6666666666666, "# battles": 2689}
+{"model name ": "Llama-2-13b-chat-hf", "elo overall": 926, "Information seeking": 918, "Creative Writing": 924, "Coding & Debugging": 971, "Reasoning": 932, "Editing": 983, "Math": 934, "Planning": 936, "Brainstorming": 911, "Role playing": 918, "Advice seeking": 947, "Data Analysis": 970, "Others": 1004, "average": 945.6666666666666, "# battles": 2406}
+{"model name ": "Llama-2-7b-chat-hf", "elo overall": 889, "Information seeking": 865, "Creative Writing": 900, "Coding & Debugging": 941, "Reasoning": 861, "Editing": 950, "Math": 914, "Planning": 877, "Brainstorming": 890, "Role playing": 912, "Advice seeking": 896, "Data Analysis": 954, "Others": 992, "average": 912.6666666666666, "# battles": 2366}
+{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 863, "Information seeking": 857, "Creative Writing": 845, "Coding & Debugging": 912, "Reasoning": 863, "Editing": 885, "Math": 907, "Planning": 862, "Brainstorming": 846, "Role playing": 907, "Advice seeking": 880, "Data Analysis": 910, "Others": 976, "average": 887.5, "# battles": 2659}
    	
data_dir/elo_ranks.all.L=0.4.jsonl ADDED
@@ -0,0 +1,24 @@
+{"model name ": "gpt-4-0125-preview", "elo overall": 1127, "Information seeking": 1105, "Creative Writing": 1126, "Coding & Debugging": 1134, "Reasoning": 1143, "Editing": 1127, "Math": 1130, "Planning": 1141, "Brainstorming": 1167, "Role playing": 1091, "Advice seeking": 1122, "Data Analysis": 1119, "Others": 1049, "average": 1121.1666666666667, "# battles": 5781}
+{"model name ": "claude-3-opus-20240229", "elo overall": 1119, "Information seeking": 1132, "Creative Writing": 1102, "Coding & Debugging": 1110, "Reasoning": 1149, "Editing": 1088, "Math": 1143, "Planning": 1147, "Brainstorming": 1148, "Role playing": 1064, "Advice seeking": 1114, "Data Analysis": 1086, "Others": 1029, "average": 1109.3333333333333, "# battles": 3658}
+{"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "Information seeking": 1101, "Creative Writing": 1105, "Coding & Debugging": 1084, "Reasoning": 1112, "Editing": 1082, "Math": 1081, "Planning": 1118, "Brainstorming": 1120, "Role playing": 1066, "Advice seeking": 1089, "Data Analysis": 1084, "Others": 1027, "average": 1089.0833333333333, "# battles": 2791}
+{"model name ": "mistral-large-2402", "elo overall": 1095, "Information seeking": 1096, "Creative Writing": 1091, "Coding & Debugging": 1107, "Reasoning": 1114, "Editing": 1085, "Math": 1081, "Planning": 1095, "Brainstorming": 1065, "Role playing": 1064, "Advice seeking": 1080, "Data Analysis": 1066, "Others": 1016, "average": 1080.0, "# battles": 2058}
+{"model name ": "gemini-1.0-pro", "elo overall": 1079, "Information seeking": 1099, "Creative Writing": 1063, "Coding & Debugging": 1062, "Reasoning": 1080, "Editing": 1033, "Math": 1065, "Planning": 1097, "Brainstorming": 1050, "Role playing": 1039, "Advice seeking": 1091, "Data Analysis": 1066, "Others": 1008, "average": 1062.75, "# battles": 2035}
+{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1064, "Information seeking": 1057, "Creative Writing": 1060, "Coding & Debugging": 1110, "Reasoning": 1082, "Editing": 1051, "Math": 1177, "Planning": 1055, "Brainstorming": 1039, "Role playing": 1063, "Advice seeking": 1039, "Data Analysis": 1079, "Others": 1078, "average": 1074.1666666666667, "# battles": 14196}
+{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1041, "Information seeking": 1049, "Creative Writing": 1034, "Coding & Debugging": 1048, "Reasoning": 1041, "Editing": 1018, "Math": 1036, "Planning": 1023, "Brainstorming": 1032, "Role playing": 1048, "Advice seeking": 1018, "Data Analysis": 1016, "Others": 974, "average": 1028.0833333333333, "# battles": 3619}
+{"model name ": "Yi-34B-Chat", "elo overall": 1040, "Information seeking": 1030, "Creative Writing": 1066, "Coding & Debugging": 1007, "Reasoning": 1019, "Editing": 1052, "Math": 976, "Planning": 1046, "Brainstorming": 1066, "Role playing": 1047, "Advice seeking": 1037, "Data Analysis": 988, "Others": 1001, "average": 1027.9166666666667, "# battles": 2606}
+{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1032, "Information seeking": 1025, "Creative Writing": 1061, "Coding & Debugging": 1025, "Reasoning": 1006, "Editing": 1061, "Math": 965, "Planning": 1049, "Brainstorming": 1047, "Role playing": 1038, "Advice seeking": 1043, "Data Analysis": 997, "Others": 992, "average": 1025.75, "# battles": 2519}
+{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1029, "Information seeking": 1035, "Creative Writing": 1012, "Coding & Debugging": 1030, "Reasoning": 1031, "Editing": 1008, "Math": 1052, "Planning": 1008, "Brainstorming": 1002, "Role playing": 1012, "Advice seeking": 1002, "Data Analysis": 1010, "Others": 988, "average": 1015.8333333333334, "# battles": 1484}
+{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 987, "Information seeking": 992, "Creative Writing": 976, "Coding & Debugging": 978, "Reasoning": 1011, "Editing": 978, "Math": 981, "Planning": 975, "Brainstorming": 986, "Role playing": 988, "Advice seeking": 996, "Data Analysis": 1011, "Others": 1014, "average": 990.5, "# battles": 2144}
+{"model name ": "gemma-7b-it", "elo overall": 986, "Information seeking": 982, "Creative Writing": 996, "Coding & Debugging": 948, "Reasoning": 1005, "Editing": 974, "Math": 1011, "Planning": 1036, "Brainstorming": 1051, "Role playing": 969, "Advice seeking": 1019, "Data Analysis": 965, "Others": 990, "average": 995.5, "# battles": 2728}
+{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 983, "Information seeking": 984, "Creative Writing": 960, "Coding & Debugging": 1002, "Reasoning": 990, "Editing": 975, "Math": 1021, "Planning": 980, "Brainstorming": 957, "Role playing": 985, "Advice seeking": 963, "Data Analysis": 1003, "Others": 1002, "average": 985.1666666666666, "# battles": 1532}
+{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 970, "Information seeking": 968, "Creative Writing": 981, "Coding & Debugging": 954, "Reasoning": 953, "Editing": 981, "Math": 958, "Planning": 959, "Brainstorming": 959, "Role playing": 979, "Advice seeking": 953, "Data Analysis": 972, "Others": 996, "average": 967.75, "# battles": 2091}
+{"model name ": "tulu-2-dpo-70b", "elo overall": 967, "Information seeking": 984, "Creative Writing": 973, "Coding & Debugging": 918, "Reasoning": 980, "Editing": 934, "Math": 945, "Planning": 948, "Brainstorming": 935, "Role playing": 1005, "Advice seeking": 970, "Data Analysis": 905, "Others": 971, "average": 955.6666666666666, "# battles": 3630}
+{"model name ": "vicuna-13b-v1.5", "elo overall": 949, "Information seeking": 949, "Creative Writing": 949, "Coding & Debugging": 985, "Reasoning": 940, "Editing": 960, "Math": 938, "Planning": 921, "Brainstorming": 942, "Role playing": 961, "Advice seeking": 914, "Data Analysis": 989, "Others": 974, "average": 951.8333333333334, "# battles": 2715}
+{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 947, "Information seeking": 950, "Creative Writing": 956, "Coding & Debugging": 944, "Reasoning": 923, "Editing": 982, "Math": 913, "Planning": 956, "Brainstorming": 936, "Role playing": 971, "Advice seeking": 966, "Data Analysis": 970, "Others": 1007, "average": 956.1666666666666, "# battles": 2094}
+{"model name ": "command", "elo overall": 947, "Information seeking": 963, "Creative Writing": 962, "Coding & Debugging": 911, "Reasoning": 928, "Editing": 942, "Math": 927, "Planning": 946, "Brainstorming": 989, "Role playing": 974, "Advice seeking": 987, "Data Analysis": 932, "Others": 980, "average": 953.4166666666666, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 944, "Information seeking": 933, "Creative Writing": 942, "Coding & Debugging": 982, "Reasoning": 933, "Editing": 971, "Math": 932, "Planning": 942, "Brainstorming": 939, "Role playing": 941, "Advice seeking": 949, "Data Analysis": 1006, "Others": 992, "average": 955.1666666666666, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 944, "Information seeking": 921, "Creative Writing": 964, "Coding & Debugging": 923, "Reasoning": 933, "Editing": 943, "Math": 994, "Planning": 943, "Brainstorming": 990, "Role playing": 960, "Advice seeking": 957, "Data Analysis": 963, "Others": 986, "average": 956.4166666666666, "# battles": 2689}
         | 
| 21 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 939, "Information seeking": 983, "Creative Writing": 941, "Coding & Debugging": 893, "Reasoning": 941, "Editing": 926, "Math": 907, "Planning": 927, "Brainstorming": 911, "Role playing": 983, "Advice seeking": 944, "Data Analysis": 939, "Others": 956, "average": 937.5833333333334, "# battles": 3543}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 930, "Information seeking": 921, "Creative Writing": 930, "Coding & Debugging": 974, "Reasoning": 938, "Editing": 984, "Math": 935, "Planning": 938, "Brainstorming": 913, "Role playing": 920, "Advice seeking": 948, "Data Analysis": 970, "Others": 1005, "average": 948.0, "# battles": 2406}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 895, "Information seeking": 870, "Creative Writing": 906, "Coding & Debugging": 948, "Reasoning": 869, "Editing": 951, "Math": 918, "Planning": 880, "Brainstorming": 893, "Role playing": 917, "Advice seeking": 898, "Data Analysis": 956, "Others": 992, "average": 916.5, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 873, "Information seeking": 870, "Creative Writing": 852, "Coding & Debugging": 918, "Reasoning": 877, "Editing": 893, "Math": 911, "Planning": 877, "Brainstorming": 859, "Role playing": 909, "Advice seeking": 896, "Data Analysis": 911, "Others": 978, "average": 895.9166666666666, "# battles": 2659}
         | 
    	
        data_dir/elo_ranks.all.L=0.5.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1118, "Information seeking": 1133, "Creative Writing": 1105, "Coding & Debugging": 1102, "Reasoning": 1143, "Editing": 1083, "Math": 1140, "Planning": 1145, "Brainstorming": 1151, "Role playing": 1068, "Advice seeking": 1112, "Data Analysis": 1082, "Others": 1028, "average": 1107.6666666666667, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1115, "Information seeking": 1091, "Creative Writing": 1116, "Coding & Debugging": 1123, "Reasoning": 1124, "Editing": 1119, "Math": 1116, "Planning": 1128, "Brainstorming": 1155, "Role playing": 1080, "Advice seeking": 1105, "Data Analysis": 1108, "Others": 1050, "average": 1109.5833333333333, "# battles": 5781}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "Information seeking": 1101, "Creative Writing": 1107, "Coding & Debugging": 1080, "Reasoning": 1108, "Editing": 1079, "Math": 1076, "Planning": 1114, "Brainstorming": 1124, "Role playing": 1070, "Advice seeking": 1087, "Data Analysis": 1081, "Others": 1029, "average": 1088.0, "# battles": 2791}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1098, "Information seeking": 1099, "Creative Writing": 1091, "Coding & Debugging": 1111, "Reasoning": 1118, "Editing": 1086, "Math": 1082, "Planning": 1101, "Brainstorming": 1066, "Role playing": 1067, "Advice seeking": 1084, "Data Analysis": 1070, "Others": 1015, "average": 1082.5, "# battles": 2058}
         | 
| 5 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1083, "Information seeking": 1102, "Creative Writing": 1067, "Coding & Debugging": 1067, "Reasoning": 1083, "Editing": 1038, "Math": 1066, "Planning": 1105, "Brainstorming": 1055, "Role playing": 1041, "Advice seeking": 1095, "Data Analysis": 1068, "Others": 1008, "average": 1066.25, "# battles": 2035}
         | 
| 6 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1074, "Information seeking": 1070, "Creative Writing": 1070, "Coding & Debugging": 1119, "Reasoning": 1095, "Editing": 1058, "Math": 1189, "Planning": 1067, "Brainstorming": 1048, "Role playing": 1071, "Advice seeking": 1052, "Data Analysis": 1089, "Others": 1092, "average": 1085.0, "# battles": 14196}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1043, "Information seeking": 1050, "Creative Writing": 1035, "Coding & Debugging": 1051, "Reasoning": 1041, "Editing": 1017, "Math": 1037, "Planning": 1026, "Brainstorming": 1034, "Role playing": 1049, "Advice seeking": 1019, "Data Analysis": 1018, "Others": 972, "average": 1029.0833333333333, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1034, "Information seeking": 1023, "Creative Writing": 1062, "Coding & Debugging": 1002, "Reasoning": 1010, "Editing": 1050, "Math": 970, "Planning": 1035, "Brainstorming": 1056, "Role playing": 1041, "Advice seeking": 1028, "Data Analysis": 984, "Others": 1001, "average": 1021.8333333333334, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1029, "Information seeking": 1021, "Creative Writing": 1055, "Coding & Debugging": 1023, "Reasoning": 1003, "Editing": 1059, "Math": 962, "Planning": 1049, "Brainstorming": 1045, "Role playing": 1034, "Advice seeking": 1041, "Data Analysis": 996, "Others": 989, "average": 1023.0833333333334, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1026, "Information seeking": 1031, "Creative Writing": 1006, "Coding & Debugging": 1029, "Reasoning": 1030, "Editing": 1007, "Math": 1051, "Planning": 1005, "Brainstorming": 1000, "Role playing": 1008, "Advice seeking": 1003, "Data Analysis": 1010, "Others": 987, "average": 1013.9166666666666, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 993, "Information seeking": 987, "Creative Writing": 1006, "Coding & Debugging": 951, "Reasoning": 1011, "Editing": 984, "Math": 1017, "Planning": 1046, "Brainstorming": 1062, "Role playing": 973, "Advice seeking": 1026, "Data Analysis": 968, "Others": 990, "average": 1001.75, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 983, "Creative Writing": 961, "Coding & Debugging": 1003, "Reasoning": 991, "Editing": 976, "Math": 1022, "Planning": 979, "Brainstorming": 959, "Role playing": 987, "Advice seeking": 961, "Data Analysis": 1004, "Others": 1002, "average": 985.6666666666666, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 980, "Information seeking": 981, "Creative Writing": 968, "Coding & Debugging": 976, "Reasoning": 1003, "Editing": 974, "Math": 982, "Planning": 966, "Brainstorming": 976, "Role playing": 982, "Advice seeking": 989, "Data Analysis": 1009, "Others": 1013, "average": 984.9166666666666, "# battles": 2144}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 968, "Information seeking": 987, "Creative Writing": 972, "Coding & Debugging": 919, "Reasoning": 979, "Editing": 930, "Math": 941, "Planning": 949, "Brainstorming": 938, "Role playing": 1005, "Advice seeking": 971, "Data Analysis": 905, "Others": 967, "average": 955.25, "# battles": 3630}
         | 
| 15 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 962, "Information seeking": 956, "Creative Writing": 974, "Coding & Debugging": 952, "Reasoning": 946, "Editing": 976, "Math": 955, "Planning": 951, "Brainstorming": 950, "Role playing": 976, "Advice seeking": 944, "Data Analysis": 969, "Others": 995, "average": 962.0, "# battles": 2091}
         | 
| 16 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 957, "Information seeking": 960, "Creative Writing": 959, "Coding & Debugging": 989, "Reasoning": 949, "Editing": 969, "Math": 942, "Planning": 930, "Brainstorming": 951, "Role playing": 965, "Advice seeking": 923, "Data Analysis": 994, "Others": 973, "average": 958.6666666666666, "# battles": 2715}
         | 
| 17 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 950, "Information seeking": 926, "Creative Writing": 969, "Coding & Debugging": 929, "Reasoning": 942, "Editing": 949, "Math": 1004, "Planning": 948, "Brainstorming": 995, "Role playing": 963, "Advice seeking": 961, "Data Analysis": 968, "Others": 985, "average": 961.5833333333334, "# battles": 2689}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 947, "Information seeking": 966, "Creative Writing": 964, "Coding & Debugging": 908, "Reasoning": 929, "Editing": 942, "Math": 927, "Planning": 942, "Brainstorming": 987, "Role playing": 975, "Advice seeking": 990, "Data Analysis": 932, "Others": 980, "average": 953.5, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "Information seeking": 934, "Creative Writing": 942, "Coding & Debugging": 983, "Reasoning": 933, "Editing": 972, "Math": 934, "Planning": 941, "Brainstorming": 936, "Role playing": 943, "Advice seeking": 947, "Data Analysis": 1006, "Others": 991, "average": 955.1666666666666, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 940, "Information seeking": 939, "Creative Writing": 949, "Coding & Debugging": 942, "Reasoning": 913, "Editing": 980, "Math": 912, "Planning": 947, "Brainstorming": 926, "Role playing": 966, "Advice seeking": 957, "Data Analysis": 967, "Others": 1007, "average": 950.4166666666666, "# battles": 2094}
         | 
| 21 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 936, "Information seeking": 982, "Creative Writing": 937, "Coding & Debugging": 887, "Reasoning": 940, "Editing": 921, "Math": 904, "Planning": 926, "Brainstorming": 908, "Role playing": 983, "Advice seeking": 946, "Data Analysis": 936, "Others": 954, "average": 935.3333333333334, "# battles": 3543}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 932, "Information seeking": 924, "Creative Writing": 932, "Coding & Debugging": 977, "Reasoning": 943, "Editing": 983, "Math": 936, "Planning": 939, "Brainstorming": 914, "Role playing": 920, "Advice seeking": 948, "Data Analysis": 971, "Others": 1006, "average": 949.4166666666666, "# battles": 2406}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 898, "Information seeking": 874, "Creative Writing": 908, "Coding & Debugging": 953, "Reasoning": 874, "Editing": 953, "Math": 921, "Planning": 882, "Brainstorming": 895, "Role playing": 921, "Advice seeking": 901, "Data Analysis": 958, "Others": 993, "average": 919.4166666666666, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 880, "Information seeking": 880, "Creative Writing": 858, "Coding & Debugging": 921, "Reasoning": 889, "Editing": 898, "Math": 913, "Planning": 887, "Brainstorming": 868, "Role playing": 912, "Advice seeking": 909, "Data Analysis": 911, "Others": 979, "average": 902.0833333333334, "# battles": 2659}
         | 
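Each of these per-L leaderboard files is plain JSONL (one model record per line), so it can be inspected directly. Below is a minimal sketch, assuming pandas is available and the file path matches the one added in this commit; the column names simply mirror the JSON keys shown above, including the trailing space in "model name ".

```python
import pandas as pd

# Load the length-penalized leaderboard for L=0.5 (path as added in this commit).
df = pd.read_json("data_dir/elo_ranks.all.L=0.5.jsonl", lines=True)

# Keep the headline columns and sort by overall Elo, as in the leaderboard view.
# Note: the key really is "model name " with a trailing space.
cols = ["model name ", "elo overall", "average", "# battles"]
print(df[cols].sort_values(by="elo overall", ascending=False).to_string(index=False))
```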
    	
        data_dir/elo_ranks.all.L=0.6.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1118, "Information seeking": 1135, "Creative Writing": 1107, "Coding & Debugging": 1095, "Reasoning": 1140, "Editing": 1079, "Math": 1138, "Planning": 1145, "Brainstorming": 1153, "Role playing": 1073, "Advice seeking": 1110, "Data Analysis": 1080, "Others": 1029, "average": 1107.0, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1106, "Information seeking": 1102, "Creative Writing": 1109, "Coding & Debugging": 1077, "Reasoning": 1105, "Editing": 1076, "Math": 1074, "Planning": 1113, "Brainstorming": 1127, "Role playing": 1073, "Advice seeking": 1086, "Data Analysis": 1079, "Others": 1030, "average": 1087.5833333333333, "# battles": 2791}
         | 
| 3 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1106, "Information seeking": 1081, "Creative Writing": 1108, "Coding & Debugging": 1116, "Reasoning": 1110, "Editing": 1113, "Math": 1106, "Planning": 1118, "Brainstorming": 1146, "Role playing": 1072, "Advice seeking": 1092, "Data Analysis": 1100, "Others": 1052, "average": 1101.1666666666667, "# battles": 5781}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1102, "Information seeking": 1103, "Creative Writing": 1093, "Coding & Debugging": 1115, "Reasoning": 1121, "Editing": 1087, "Math": 1083, "Planning": 1107, "Brainstorming": 1068, "Role playing": 1069, "Advice seeking": 1088, "Data Analysis": 1074, "Others": 1015, "average": 1085.25, "# battles": 2058}
         | 
| 5 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1087, "Information seeking": 1105, "Creative Writing": 1070, "Coding & Debugging": 1071, "Reasoning": 1087, "Editing": 1041, "Math": 1069, "Planning": 1111, "Brainstorming": 1059, "Role playing": 1043, "Advice seeking": 1099, "Data Analysis": 1070, "Others": 1007, "average": 1069.3333333333333, "# battles": 2035}
         | 
| 6 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1083, "Information seeking": 1081, "Creative Writing": 1079, "Coding & Debugging": 1128, "Reasoning": 1104, "Editing": 1064, "Math": 1200, "Planning": 1077, "Brainstorming": 1056, "Role playing": 1078, "Advice seeking": 1062, "Data Analysis": 1098, "Others": 1104, "average": 1094.25, "# battles": 14196}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "Information seeking": 1052, "Creative Writing": 1036, "Coding & Debugging": 1052, "Reasoning": 1042, "Editing": 1016, "Math": 1039, "Planning": 1028, "Brainstorming": 1036, "Role playing": 1049, "Advice seeking": 1020, "Data Analysis": 1020, "Others": 970, "average": 1030.0, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1029, "Information seeking": 1019, "Creative Writing": 1057, "Coding & Debugging": 998, "Reasoning": 1003, "Editing": 1049, "Math": 962, "Planning": 1027, "Brainstorming": 1050, "Role playing": 1036, "Advice seeking": 1021, "Data Analysis": 980, "Others": 1000, "average": 1016.8333333333334, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1025, "Information seeking": 1018, "Creative Writing": 1051, "Coding & Debugging": 1022, "Reasoning": 1001, "Editing": 1057, "Math": 957, "Planning": 1048, "Brainstorming": 1044, "Role playing": 1031, "Advice seeking": 1040, "Data Analysis": 995, "Others": 988, "average": 1021.0, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1024, "Information seeking": 1027, "Creative Writing": 1003, "Coding & Debugging": 1028, "Reasoning": 1030, "Editing": 1006, "Math": 1051, "Planning": 1003, "Brainstorming": 1000, "Role playing": 1004, "Advice seeking": 1002, "Data Analysis": 1009, "Others": 987, "average": 1012.5, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 998, "Information seeking": 991, "Creative Writing": 1012, "Coding & Debugging": 953, "Reasoning": 1016, "Editing": 991, "Math": 1023, "Planning": 1054, "Brainstorming": 1070, "Role playing": 976, "Advice seeking": 1031, "Data Analysis": 970, "Others": 990, "average": 1006.4166666666666, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 984, "Information seeking": 983, "Creative Writing": 961, "Coding & Debugging": 1005, "Reasoning": 993, "Editing": 978, "Math": 1023, "Planning": 979, "Brainstorming": 961, "Role playing": 987, "Advice seeking": 959, "Data Analysis": 1006, "Others": 1002, "average": 986.4166666666666, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 974, "Information seeking": 972, "Creative Writing": 961, "Coding & Debugging": 973, "Reasoning": 997, "Editing": 970, "Math": 981, "Planning": 959, "Brainstorming": 968, "Role playing": 977, "Advice seeking": 983, "Data Analysis": 1008, "Others": 1012, "average": 980.0833333333334, "# battles": 2144}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 969, "Information seeking": 988, "Creative Writing": 970, "Coding & Debugging": 919, "Reasoning": 979, "Editing": 926, "Math": 939, "Planning": 949, "Brainstorming": 941, "Role playing": 1005, "Advice seeking": 972, "Data Analysis": 903, "Others": 963, "average": 954.5, "# battles": 3630}
         | 
| 15 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 964, "Information seeking": 968, "Creative Writing": 966, "Coding & Debugging": 993, "Reasoning": 957, "Editing": 976, "Math": 946, "Planning": 938, "Brainstorming": 959, "Role playing": 969, "Advice seeking": 931, "Data Analysis": 998, "Others": 972, "average": 964.4166666666666, "# battles": 2715}
         | 
| 16 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 955, "Information seeking": 946, "Creative Writing": 967, "Coding & Debugging": 950, "Reasoning": 940, "Editing": 971, "Math": 953, "Planning": 943, "Brainstorming": 942, "Role playing": 971, "Advice seeking": 935, "Data Analysis": 967, "Others": 994, "average": 956.5833333333334, "# battles": 2091}
         | 
| 17 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 954, "Information seeking": 930, "Creative Writing": 973, "Coding & Debugging": 933, "Reasoning": 949, "Editing": 954, "Math": 1012, "Planning": 951, "Brainstorming": 999, "Role playing": 967, "Advice seeking": 964, "Data Analysis": 972, "Others": 985, "average": 965.75, "# battles": 2689}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 948, "Information seeking": 967, "Creative Writing": 965, "Coding & Debugging": 904, "Reasoning": 929, "Editing": 942, "Math": 925, "Planning": 938, "Brainstorming": 985, "Role playing": 977, "Advice seeking": 990, "Data Analysis": 932, "Others": 980, "average": 952.8333333333334, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "Information seeking": 934, "Creative Writing": 942, "Coding & Debugging": 985, "Reasoning": 932, "Editing": 971, "Math": 934, "Planning": 940, "Brainstorming": 934, "Role playing": 944, "Advice seeking": 946, "Data Analysis": 1006, "Others": 991, "average": 954.9166666666666, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 933, "Information seeking": 929, "Creative Writing": 943, "Coding & Debugging": 939, "Reasoning": 907, "Editing": 978, "Math": 912, "Planning": 939, "Brainstorming": 918, "Role playing": 963, "Advice seeking": 948, "Data Analysis": 964, "Others": 1007, "average": 945.5833333333334, "# battles": 2094}
         | 
| 21 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 933, "Information seeking": 925, "Creative Writing": 932, "Coding & Debugging": 979, "Reasoning": 946, "Editing": 983, "Math": 935, "Planning": 938, "Brainstorming": 914, "Role playing": 918, "Advice seeking": 948, "Data Analysis": 971, "Others": 1007, "average": 949.6666666666666, "# battles": 2406}
         | 
| 22 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 931, "Information seeking": 981, "Creative Writing": 933, "Coding & Debugging": 879, "Reasoning": 938, "Editing": 917, "Math": 900, "Planning": 923, "Brainstorming": 903, "Role playing": 981, "Advice seeking": 947, "Data Analysis": 932, "Others": 951, "average": 932.0833333333334, "# battles": 3543}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 900, "Information seeking": 875, "Creative Writing": 910, "Coding & Debugging": 957, "Reasoning": 877, "Editing": 954, "Math": 923, "Planning": 882, "Brainstorming": 896, "Role playing": 922, "Advice seeking": 902, "Data Analysis": 958, "Others": 994, "average": 920.8333333333334, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 884, "Information seeking": 887, "Creative Writing": 860, "Coding & Debugging": 923, "Reasoning": 896, "Editing": 900, "Math": 913, "Planning": 895, "Brainstorming": 874, "Role playing": 912, "Advice seeking": 919, "Data Analysis": 910, "Others": 979, "average": 905.6666666666666, "# battles": 2659}
         | 
    	
        data_dir/elo_ranks.all.L=0.7.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1119, "Information seeking": 1137, "Creative Writing": 1111, "Coding & Debugging": 1091, "Reasoning": 1138, "Editing": 1076, "Math": 1138, "Planning": 1145, "Brainstorming": 1156, "Role playing": 1076, "Advice seeking": 1108, "Data Analysis": 1077, "Others": 1029, "average": 1106.8333333333333, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1107, "Information seeking": 1104, "Creative Writing": 1113, "Coding & Debugging": 1075, "Reasoning": 1103, "Editing": 1075, "Math": 1071, "Planning": 1114, "Brainstorming": 1132, "Role playing": 1076, "Advice seeking": 1086, "Data Analysis": 1079, "Others": 1031, "average": 1088.25, "# battles": 2791}
         | 
| 3 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1104, "Information seeking": 1107, "Creative Writing": 1094, "Coding & Debugging": 1119, "Reasoning": 1123, "Editing": 1090, "Math": 1085, "Planning": 1113, "Brainstorming": 1070, "Role playing": 1071, "Advice seeking": 1092, "Data Analysis": 1077, "Others": 1015, "average": 1088.0, "# battles": 2058}
         | 
| 4 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1100, "Information seeking": 1075, "Creative Writing": 1103, "Coding & Debugging": 1111, "Reasoning": 1100, "Editing": 1110, "Math": 1097, "Planning": 1110, "Brainstorming": 1141, "Role playing": 1066, "Advice seeking": 1083, "Data Analysis": 1095, "Others": 1054, "average": 1095.4166666666667, "# battles": 5781}
         | 
| 5 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1091, "Information seeking": 1109, "Creative Writing": 1074, "Coding & Debugging": 1076, "Reasoning": 1090, "Editing": 1045, "Math": 1071, "Planning": 1116, "Brainstorming": 1063, "Role playing": 1046, "Advice seeking": 1102, "Data Analysis": 1072, "Others": 1007, "average": 1072.5833333333333, "# battles": 2035}
         | 
| 6 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1090, "Information seeking": 1090, "Creative Writing": 1086, "Coding & Debugging": 1136, "Reasoning": 1113, "Editing": 1070, "Math": 1212, "Planning": 1086, "Brainstorming": 1063, "Role playing": 1083, "Advice seeking": 1071, "Data Analysis": 1106, "Others": 1114, "average": 1102.5, "# battles": 14196}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1044, "Information seeking": 1054, "Creative Writing": 1037, "Coding & Debugging": 1054, "Reasoning": 1045, "Editing": 1017, "Math": 1041, "Planning": 1029, "Brainstorming": 1038, "Role playing": 1049, "Advice seeking": 1021, "Data Analysis": 1023, "Others": 967, "average": 1031.25, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1026, "Information seeking": 1014, "Creative Writing": 1054, "Coding & Debugging": 996, "Reasoning": 997, "Editing": 1048, "Math": 956, "Planning": 1022, "Brainstorming": 1045, "Role playing": 1033, "Advice seeking": 1014, "Data Analysis": 977, "Others": 1000, "average": 1013.0, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1022, "Information seeking": 1015, "Creative Writing": 1047, "Coding & Debugging": 1023, "Reasoning": 999, "Editing": 1055, "Math": 953, "Planning": 1048, "Brainstorming": 1043, "Role playing": 1029, "Advice seeking": 1039, "Data Analysis": 993, "Others": 986, "average": 1019.1666666666666, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1022, "Information seeking": 1024, "Creative Writing": 1000, "Coding & Debugging": 1028, "Reasoning": 1029, "Editing": 1006, "Math": 1052, "Planning": 1001, "Brainstorming": 999, "Role playing": 1002, "Advice seeking": 1003, "Data Analysis": 1007, "Others": 985, "average": 1011.3333333333334, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 1001, "Information seeking": 994, "Creative Writing": 1018, "Coding & Debugging": 953, "Reasoning": 1020, "Editing": 997, "Math": 1027, "Planning": 1060, "Brainstorming": 1077, "Role playing": 977, "Advice seeking": 1035, "Data Analysis": 972, "Others": 990, "average": 1010.0, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 983, "Creative Writing": 962, "Coding & Debugging": 1007, "Reasoning": 994, "Editing": 981, "Math": 1024, "Planning": 980, "Brainstorming": 962, "Role playing": 987, "Advice seeking": 957, "Data Analysis": 1007, "Others": 1002, "average": 987.1666666666666, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 969, "Information seeking": 990, "Creative Writing": 969, "Coding & Debugging": 916, "Reasoning": 979, "Editing": 921, "Math": 935, "Planning": 949, "Brainstorming": 941, "Role playing": 1005, "Advice seeking": 971, "Data Analysis": 901, "Others": 961, "average": 953.1666666666666, "# battles": 3630}
         | 
| 14 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 969, "Information seeking": 964, "Creative Writing": 954, "Coding & Debugging": 972, "Reasoning": 993, "Editing": 967, "Math": 980, "Planning": 954, "Brainstorming": 961, "Role playing": 973, "Advice seeking": 980, "Data Analysis": 1006, "Others": 1012, "average": 976.3333333333334, "# battles": 2144}
         | 
| 15 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 969, "Information seeking": 974, "Creative Writing": 972, "Coding & Debugging": 996, "Reasoning": 963, "Editing": 982, "Math": 948, "Planning": 945, "Brainstorming": 965, "Role playing": 972, "Advice seeking": 937, "Data Analysis": 1001, "Others": 971, "average": 968.8333333333334, "# battles": 2715}
         | 
| 16 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 957, "Information seeking": 932, "Creative Writing": 976, "Coding & Debugging": 935, "Reasoning": 955, "Editing": 958, "Math": 1017, "Planning": 953, "Brainstorming": 1002, "Role playing": 970, "Advice seeking": 966, "Data Analysis": 975, "Others": 984, "average": 968.5833333333334, "# battles": 2689}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 951, "Information seeking": 939, "Creative Writing": 961, "Coding & Debugging": 949, "Reasoning": 936, "Editing": 967, "Math": 952, "Planning": 938, "Brainstorming": 934, "Role playing": 968, "Advice seeking": 929, "Data Analysis": 965, "Others": 993, "average": 952.5833333333334, "# battles": 2091}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 947, "Information seeking": 968, "Creative Writing": 965, "Coding & Debugging": 902, "Reasoning": 929, "Editing": 940, "Math": 924, "Planning": 936, "Brainstorming": 984, "Role playing": 978, "Advice seeking": 990, "Data Analysis": 932, "Others": 979, "average": 952.25, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 946, "Information seeking": 935, "Creative Writing": 942, "Coding & Debugging": 986, "Reasoning": 930, "Editing": 970, "Math": 933, "Planning": 938, "Brainstorming": 932, "Role playing": 944, "Advice seeking": 943, "Data Analysis": 1005, "Others": 990, "average": 954.0, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 934, "Information seeking": 925, "Creative Writing": 933, "Coding & Debugging": 980, "Reasoning": 948, "Editing": 982, "Math": 934, "Planning": 937, "Brainstorming": 913, "Role playing": 917, "Advice seeking": 946, "Data Analysis": 971, "Others": 1007, "average": 949.4166666666666, "# battles": 2406}
         | 
| 21 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 928, "Information seeking": 921, "Creative Writing": 938, "Coding & Debugging": 937, "Reasoning": 901, "Editing": 976, "Math": 910, "Planning": 932, "Brainstorming": 911, "Role playing": 961, "Advice seeking": 942, "Data Analysis": 962, "Others": 1008, "average": 941.5833333333334, "# battles": 2094}
         | 
| 22 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 927, "Information seeking": 979, "Creative Writing": 929, "Coding & Debugging": 871, "Reasoning": 935, "Editing": 913, "Math": 897, "Planning": 921, "Brainstorming": 898, "Role playing": 980, "Advice seeking": 948, "Data Analysis": 928, "Others": 948, "average": 928.9166666666666, "# battles": 3543}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 901, "Information seeking": 876, "Creative Writing": 910, "Coding & Debugging": 959, "Reasoning": 878, "Editing": 956, "Math": 925, "Planning": 882, "Brainstorming": 896, "Role playing": 921, "Advice seeking": 904, "Data Analysis": 958, "Others": 995, "average": 921.6666666666666, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 886, "Information seeking": 891, "Creative Writing": 861, "Coding & Debugging": 925, "Reasoning": 902, "Editing": 900, "Math": 913, "Planning": 901, "Brainstorming": 879, "Role playing": 913, "Advice seeking": 927, "Data Analysis": 908, "Others": 979, "average": 908.25, "# battles": 2659}
         | 
    	
        data_dir/elo_ranks.all.L=0.8.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1120, "Information seeking": 1140, "Creative Writing": 1114, "Coding & Debugging": 1087, "Reasoning": 1136, "Editing": 1074, "Math": 1137, "Planning": 1145, "Brainstorming": 1158, "Role playing": 1079, "Advice seeking": 1108, "Data Analysis": 1075, "Others": 1030, "average": 1106.9166666666667, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1110, "Information seeking": 1106, "Creative Writing": 1116, "Coding & Debugging": 1075, "Reasoning": 1101, "Editing": 1075, "Math": 1069, "Planning": 1114, "Brainstorming": 1136, "Role playing": 1079, "Advice seeking": 1086, "Data Analysis": 1079, "Others": 1032, "average": 1089.0, "# battles": 2791}
         | 
| 3 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1107, "Information seeking": 1110, "Creative Writing": 1095, "Coding & Debugging": 1122, "Reasoning": 1126, "Editing": 1092, "Math": 1087, "Planning": 1118, "Brainstorming": 1072, "Role playing": 1073, "Advice seeking": 1096, "Data Analysis": 1080, "Others": 1016, "average": 1090.5833333333333, "# battles": 2058}
         | 
| 4 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1097, "Information seeking": 1099, "Creative Writing": 1093, "Coding & Debugging": 1144, "Reasoning": 1121, "Editing": 1075, "Math": 1225, "Planning": 1095, "Brainstorming": 1069, "Role playing": 1089, "Advice seeking": 1078, "Data Analysis": 1114, "Others": 1123, "average": 1110.4166666666667, "# battles": 14196}
         | 
| 5 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1096, "Information seeking": 1070, "Creative Writing": 1099, "Coding & Debugging": 1109, "Reasoning": 1093, "Editing": 1106, "Math": 1092, "Planning": 1105, "Brainstorming": 1137, "Role playing": 1061, "Advice seeking": 1076, "Data Analysis": 1091, "Others": 1057, "average": 1091.3333333333333, "# battles": 5781}
         | 
| 6 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1095, "Information seeking": 1113, "Creative Writing": 1078, "Coding & Debugging": 1080, "Reasoning": 1092, "Editing": 1048, "Math": 1074, "Planning": 1121, "Brainstorming": 1066, "Role playing": 1049, "Advice seeking": 1105, "Data Analysis": 1075, "Others": 1008, "average": 1075.75, "# battles": 2035}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1046, "Information seeking": 1057, "Creative Writing": 1039, "Coding & Debugging": 1056, "Reasoning": 1046, "Editing": 1017, "Math": 1043, "Planning": 1030, "Brainstorming": 1039, "Role playing": 1050, "Advice seeking": 1022, "Data Analysis": 1025, "Others": 965, "average": 1032.4166666666667, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1023, "Information seeking": 1012, "Creative Writing": 1051, "Coding & Debugging": 993, "Reasoning": 992, "Editing": 1046, "Math": 950, "Planning": 1018, "Brainstorming": 1041, "Role playing": 1031, "Advice seeking": 1010, "Data Analysis": 975, "Others": 1000, "average": 1009.9166666666666, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1020, "Information seeking": 1013, "Creative Writing": 1043, "Coding & Debugging": 1023, "Reasoning": 997, "Editing": 1053, "Math": 948, "Planning": 1048, "Brainstorming": 1042, "Role playing": 1027, "Advice seeking": 1038, "Data Analysis": 991, "Others": 985, "average": 1017.3333333333334, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1020, "Information seeking": 1022, "Creative Writing": 998, "Coding & Debugging": 1028, "Reasoning": 1029, "Editing": 1006, "Math": 1052, "Planning": 1001, "Brainstorming": 998, "Role playing": 1000, "Advice seeking": 1003, "Data Analysis": 1007, "Others": 984, "average": 1010.6666666666666, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 1005, "Information seeking": 998, "Creative Writing": 1024, "Coding & Debugging": 953, "Reasoning": 1024, "Editing": 1002, "Math": 1031, "Planning": 1065, "Brainstorming": 1083, "Role playing": 980, "Advice seeking": 1039, "Data Analysis": 974, "Others": 990, "average": 1013.5833333333334, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 982, "Creative Writing": 963, "Coding & Debugging": 1008, "Reasoning": 994, "Editing": 984, "Math": 1024, "Planning": 980, "Brainstorming": 963, "Role playing": 988, "Advice seeking": 956, "Data Analysis": 1009, "Others": 1002, "average": 987.75, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 973, "Information seeking": 980, "Creative Writing": 976, "Coding & Debugging": 998, "Reasoning": 969, "Editing": 985, "Math": 950, "Planning": 950, "Brainstorming": 971, "Role playing": 975, "Advice seeking": 941, "Data Analysis": 1003, "Others": 970, "average": 972.3333333333334, "# battles": 2715}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 969, "Information seeking": 991, "Creative Writing": 968, "Coding & Debugging": 915, "Reasoning": 980, "Editing": 917, "Math": 933, "Planning": 948, "Brainstorming": 941, "Role playing": 1004, "Advice seeking": 971, "Data Analysis": 899, "Others": 957, "average": 952.0, "# battles": 3630}
         | 
| 15 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 965, "Information seeking": 958, "Creative Writing": 948, "Coding & Debugging": 971, "Reasoning": 989, "Editing": 964, "Math": 980, "Planning": 949, "Brainstorming": 955, "Role playing": 969, "Advice seeking": 976, "Data Analysis": 1006, "Others": 1011, "average": 973.0, "# battles": 2144}
         | 
| 16 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 958, "Information seeking": 934, "Creative Writing": 980, "Coding & Debugging": 936, "Reasoning": 959, "Editing": 960, "Math": 1021, "Planning": 954, "Brainstorming": 1005, "Role playing": 971, "Advice seeking": 968, "Data Analysis": 978, "Others": 984, "average": 970.8333333333334, "# battles": 2689}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 947, "Information seeking": 933, "Creative Writing": 956, "Coding & Debugging": 948, "Reasoning": 931, "Editing": 964, "Math": 949, "Planning": 932, "Brainstorming": 928, "Role playing": 964, "Advice seeking": 924, "Data Analysis": 964, "Others": 993, "average": 948.8333333333334, "# battles": 2091}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 946, "Information seeking": 967, "Creative Writing": 965, "Coding & Debugging": 899, "Reasoning": 928, "Editing": 938, "Math": 922, "Planning": 932, "Brainstorming": 982, "Role playing": 979, "Advice seeking": 989, "Data Analysis": 931, "Others": 979, "average": 950.9166666666666, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "Information seeking": 936, "Creative Writing": 941, "Coding & Debugging": 986, "Reasoning": 930, "Editing": 969, "Math": 933, "Planning": 936, "Brainstorming": 930, "Role playing": 943, "Advice seeking": 941, "Data Analysis": 1005, "Others": 989, "average": 953.25, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 934, "Information seeking": 925, "Creative Writing": 933, "Coding & Debugging": 981, "Reasoning": 950, "Editing": 981, "Math": 933, "Planning": 935, "Brainstorming": 912, "Role playing": 916, "Advice seeking": 946, "Data Analysis": 971, "Others": 1008, "average": 949.25, "# battles": 2406}
         | 
| 21 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 924, "Information seeking": 978, "Creative Writing": 925, "Coding & Debugging": 864, "Reasoning": 933, "Editing": 909, "Math": 894, "Planning": 919, "Brainstorming": 892, "Role playing": 979, "Advice seeking": 948, "Data Analysis": 923, "Others": 946, "average": 925.8333333333334, "# battles": 3543}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 923, "Information seeking": 913, "Creative Writing": 934, "Coding & Debugging": 935, "Reasoning": 895, "Editing": 975, "Math": 909, "Planning": 925, "Brainstorming": 904, "Role playing": 958, "Advice seeking": 938, "Data Analysis": 960, "Others": 1008, "average": 937.8333333333334, "# battles": 2094}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 901, "Information seeking": 875, "Creative Writing": 909, "Coding & Debugging": 961, "Reasoning": 879, "Editing": 956, "Math": 925, "Planning": 881, "Brainstorming": 895, "Role playing": 920, "Advice seeking": 905, "Data Analysis": 957, "Others": 996, "average": 921.5833333333334, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 887, "Information seeking": 895, "Creative Writing": 860, "Coding & Debugging": 926, "Reasoning": 906, "Editing": 899, "Math": 912, "Planning": 905, "Brainstorming": 881, "Role playing": 913, "Advice seeking": 932, "Data Analysis": 906, "Others": 978, "average": 909.4166666666666, "# battles": 2659}
         | 
    	
        data_dir/elo_ranks.all.L=0.9.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1121, "Information seeking": 1143, "Creative Writing": 1117, "Coding & Debugging": 1084, "Reasoning": 1134, "Editing": 1073, "Math": 1136, "Planning": 1145, "Brainstorming": 1162, "Role playing": 1083, "Advice seeking": 1108, "Data Analysis": 1074, "Others": 1031, "average": 1107.5, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1112, "Information seeking": 1108, "Creative Writing": 1120, "Coding & Debugging": 1074, "Reasoning": 1100, "Editing": 1075, "Math": 1067, "Planning": 1115, "Brainstorming": 1141, "Role playing": 1082, "Advice seeking": 1086, "Data Analysis": 1079, "Others": 1033, "average": 1090.0, "# battles": 2791}
         | 
| 3 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1111, "Information seeking": 1114, "Creative Writing": 1096, "Coding & Debugging": 1127, "Reasoning": 1130, "Editing": 1094, "Math": 1089, "Planning": 1125, "Brainstorming": 1074, "Role playing": 1074, "Advice seeking": 1099, "Data Analysis": 1084, "Others": 1016, "average": 1093.5, "# battles": 2058}
         | 
| 4 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1105, "Information seeking": 1107, "Creative Writing": 1099, "Coding & Debugging": 1151, "Reasoning": 1128, "Editing": 1080, "Math": 1239, "Planning": 1102, "Brainstorming": 1074, "Role playing": 1094, "Advice seeking": 1085, "Data Analysis": 1121, "Others": 1132, "average": 1117.6666666666667, "# battles": 14196}
         | 
| 5 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1099, "Information seeking": 1117, "Creative Writing": 1081, "Coding & Debugging": 1084, "Reasoning": 1096, "Editing": 1050, "Math": 1076, "Planning": 1126, "Brainstorming": 1070, "Role playing": 1051, "Advice seeking": 1108, "Data Analysis": 1079, "Others": 1008, "average": 1078.8333333333333, "# battles": 2035}
         | 
| 6 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1093, "Information seeking": 1067, "Creative Writing": 1096, "Coding & Debugging": 1107, "Reasoning": 1087, "Editing": 1104, "Math": 1089, "Planning": 1101, "Brainstorming": 1135, "Role playing": 1058, "Advice seeking": 1070, "Data Analysis": 1089, "Others": 1061, "average": 1088.6666666666667, "# battles": 5781}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1048, "Information seeking": 1060, "Creative Writing": 1041, "Coding & Debugging": 1058, "Reasoning": 1048, "Editing": 1017, "Math": 1045, "Planning": 1032, "Brainstorming": 1041, "Role playing": 1051, "Advice seeking": 1022, "Data Analysis": 1027, "Others": 962, "average": 1033.6666666666667, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1021, "Information seeking": 1010, "Creative Writing": 1050, "Coding & Debugging": 992, "Reasoning": 989, "Editing": 1046, "Math": 946, "Planning": 1015, "Brainstorming": 1038, "Role playing": 1030, "Advice seeking": 1006, "Data Analysis": 974, "Others": 1000, "average": 1008.0, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1018, "Information seeking": 1011, "Creative Writing": 1041, "Coding & Debugging": 1024, "Reasoning": 996, "Editing": 1053, "Math": 945, "Planning": 1049, "Brainstorming": 1042, "Role playing": 1025, "Advice seeking": 1037, "Data Analysis": 991, "Others": 984, "average": 1016.5, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1018, "Information seeking": 1020, "Creative Writing": 995, "Coding & Debugging": 1029, "Reasoning": 1029, "Editing": 1006, "Math": 1052, "Planning": 999, "Brainstorming": 998, "Role playing": 999, "Advice seeking": 1002, "Data Analysis": 1006, "Others": 983, "average": 1009.8333333333334, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 1008, "Information seeking": 1000, "Creative Writing": 1029, "Coding & Debugging": 953, "Reasoning": 1028, "Editing": 1007, "Math": 1035, "Planning": 1070, "Brainstorming": 1089, "Role playing": 982, "Advice seeking": 1043, "Data Analysis": 976, "Others": 990, "average": 1016.8333333333334, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 982, "Creative Writing": 963, "Coding & Debugging": 1009, "Reasoning": 994, "Editing": 987, "Math": 1024, "Planning": 980, "Brainstorming": 963, "Role playing": 990, "Advice seeking": 955, "Data Analysis": 1010, "Others": 1002, "average": 988.25, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 978, "Information seeking": 984, "Creative Writing": 982, "Coding & Debugging": 1001, "Reasoning": 974, "Editing": 989, "Math": 952, "Planning": 955, "Brainstorming": 976, "Role playing": 977, "Advice seeking": 946, "Data Analysis": 1005, "Others": 968, "average": 975.75, "# battles": 2715}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 968, "Information seeking": 991, "Creative Writing": 967, "Coding & Debugging": 914, "Reasoning": 980, "Editing": 913, "Math": 931, "Planning": 946, "Brainstorming": 941, "Role playing": 1003, "Advice seeking": 970, "Data Analysis": 896, "Others": 954, "average": 950.5, "# battles": 3630}
         | 
| 15 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 961, "Information seeking": 952, "Creative Writing": 943, "Coding & Debugging": 969, "Reasoning": 986, "Editing": 962, "Math": 979, "Planning": 945, "Brainstorming": 951, "Role playing": 966, "Advice seeking": 972, "Data Analysis": 1005, "Others": 1010, "average": 970.0, "# battles": 2144}
         | 
| 16 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 959, "Information seeking": 934, "Creative Writing": 982, "Coding & Debugging": 936, "Reasoning": 961, "Editing": 961, "Math": 1026, "Planning": 955, "Brainstorming": 1008, "Role playing": 972, "Advice seeking": 970, "Data Analysis": 979, "Others": 984, "average": 972.3333333333334, "# battles": 2689}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "Information seeking": 935, "Creative Writing": 941, "Coding & Debugging": 987, "Reasoning": 928, "Editing": 968, "Math": 932, "Planning": 935, "Brainstorming": 927, "Role playing": 942, "Advice seeking": 938, "Data Analysis": 1005, "Others": 988, "average": 952.1666666666666, "# battles": 2461}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 945, "Information seeking": 967, "Creative Writing": 965, "Coding & Debugging": 896, "Reasoning": 928, "Editing": 935, "Math": 922, "Planning": 930, "Brainstorming": 982, "Role playing": 980, "Advice seeking": 988, "Data Analysis": 930, "Others": 978, "average": 950.0833333333334, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 944, "Information seeking": 927, "Creative Writing": 952, "Coding & Debugging": 947, "Reasoning": 928, "Editing": 961, "Math": 947, "Planning": 927, "Brainstorming": 922, "Role playing": 962, "Advice seeking": 919, "Data Analysis": 962, "Others": 992, "average": 945.5, "# battles": 2091}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 933, "Information seeking": 924, "Creative Writing": 932, "Coding & Debugging": 982, "Reasoning": 951, "Editing": 980, "Math": 931, "Planning": 934, "Brainstorming": 910, "Role playing": 914, "Advice seeking": 946, "Data Analysis": 971, "Others": 1008, "average": 948.5833333333334, "# battles": 2406}
         | 
| 21 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 922, "Information seeking": 977, "Creative Writing": 921, "Coding & Debugging": 856, "Reasoning": 930, "Editing": 905, "Math": 890, "Planning": 916, "Brainstorming": 888, "Role playing": 979, "Advice seeking": 948, "Data Analysis": 919, "Others": 943, "average": 922.6666666666666, "# battles": 3543}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 919, "Information seeking": 906, "Creative Writing": 930, "Coding & Debugging": 933, "Reasoning": 890, "Editing": 973, "Math": 907, "Planning": 919, "Brainstorming": 897, "Role playing": 956, "Advice seeking": 934, "Data Analysis": 958, "Others": 1008, "average": 934.25, "# battles": 2094}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 900, "Information seeking": 873, "Creative Writing": 908, "Coding & Debugging": 962, "Reasoning": 878, "Editing": 956, "Math": 925, "Planning": 878, "Brainstorming": 894, "Role playing": 918, "Advice seeking": 904, "Data Analysis": 957, "Others": 996, "average": 920.75, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 887, "Information seeking": 896, "Creative Writing": 859, "Coding & Debugging": 926, "Reasoning": 908, "Editing": 899, "Math": 909, "Planning": 907, "Brainstorming": 883, "Role playing": 911, "Advice seeking": 936, "Data Analysis": 903, "Others": 977, "average": 909.5, "# battles": 2659}
         | 
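Since the commit adds one such file per length-penalty value (L=0.0 through L=1.0 in steps of 0.1), the effect of the penalty on a single model can be read off by sweeping over the files. A rough sketch under the same assumptions (pandas installed, data_dir layout as in this diff); the model name is just an example taken from the records above.

```python
import pandas as pd

model = "gpt-4-0125-preview"  # any model name present in the files above

# Track how the overall Elo of one model moves as the length penalty L increases.
for i in range(11):
    L = round(i * 0.1, 1)
    df = pd.read_json(f"data_dir/elo_ranks.all.L={L}.jsonl", lines=True)
    row = df.loc[df["model name "] == model].iloc[0]
    print(f"L={L}: elo overall = {row['elo overall']}")
```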
    	
        data_dir/elo_ranks.all.L=1.0.jsonl
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1123, "Information seeking": 1146, "Creative Writing": 1120, "Coding & Debugging": 1082, "Reasoning": 1135, "Editing": 1072, "Math": 1136, "Planning": 1146, "Brainstorming": 1165, "Role playing": 1087, "Advice seeking": 1107, "Data Analysis": 1074, "Others": 1031, "average": 1108.4166666666667, "# battles": 3658}
         | 
| 2 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1114, "Information seeking": 1118, "Creative Writing": 1098, "Coding & Debugging": 1132, "Reasoning": 1133, "Editing": 1096, "Math": 1091, "Planning": 1131, "Brainstorming": 1076, "Role playing": 1075, "Advice seeking": 1102, "Data Analysis": 1088, "Others": 1016, "average": 1096.3333333333333, "# battles": 2058}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1114, "Information seeking": 1111, "Creative Writing": 1123, "Coding & Debugging": 1072, "Reasoning": 1100, "Editing": 1075, "Math": 1067, "Planning": 1115, "Brainstorming": 1146, "Role playing": 1085, "Advice seeking": 1087, "Data Analysis": 1080, "Others": 1034, "average": 1091.25, "# battles": 2791}
         | 
| 4 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1112, "Information seeking": 1115, "Creative Writing": 1106, "Coding & Debugging": 1158, "Reasoning": 1135, "Editing": 1084, "Math": 1252, "Planning": 1109, "Brainstorming": 1080, "Role playing": 1099, "Advice seeking": 1092, "Data Analysis": 1129, "Others": 1142, "average": 1125.0833333333333, "# battles": 14196}
         | 
| 5 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1103, "Information seeking": 1121, "Creative Writing": 1085, "Coding & Debugging": 1088, "Reasoning": 1098, "Editing": 1052, "Math": 1079, "Planning": 1131, "Brainstorming": 1073, "Role playing": 1052, "Advice seeking": 1112, "Data Analysis": 1081, "Others": 1008, "average": 1081.6666666666667, "# battles": 2035}
         | 
| 6 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1092, "Information seeking": 1065, "Creative Writing": 1095, "Coding & Debugging": 1107, "Reasoning": 1083, "Editing": 1103, "Math": 1085, "Planning": 1098, "Brainstorming": 1134, "Role playing": 1055, "Advice seeking": 1066, "Data Analysis": 1087, "Others": 1064, "average": 1086.8333333333333, "# battles": 5781}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1050, "Information seeking": 1063, "Creative Writing": 1042, "Coding & Debugging": 1061, "Reasoning": 1050, "Editing": 1017, "Math": 1047, "Planning": 1034, "Brainstorming": 1043, "Role playing": 1053, "Advice seeking": 1022, "Data Analysis": 1029, "Others": 960, "average": 1035.0833333333333, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1019, "Information seeking": 1008, "Creative Writing": 1049, "Coding & Debugging": 990, "Reasoning": 985, "Editing": 1046, "Math": 942, "Planning": 1012, "Brainstorming": 1036, "Role playing": 1029, "Advice seeking": 1002, "Data Analysis": 972, "Others": 1000, "average": 1005.9166666666666, "# battles": 2606}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1018, "Information seeking": 1010, "Creative Writing": 1039, "Coding & Debugging": 1025, "Reasoning": 995, "Editing": 1051, "Math": 941, "Planning": 1049, "Brainstorming": 1042, "Role playing": 1025, "Advice seeking": 1036, "Data Analysis": 990, "Others": 983, "average": 1015.5, "# battles": 2519}
         | 
| 10 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1017, "Information seeking": 1019, "Creative Writing": 993, "Coding & Debugging": 1030, "Reasoning": 1030, "Editing": 1006, "Math": 1053, "Planning": 998, "Brainstorming": 996, "Role playing": 997, "Advice seeking": 1002, "Data Analysis": 1006, "Others": 982, "average": 1009.3333333333334, "# battles": 1484}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 1010, "Information seeking": 1002, "Creative Writing": 1033, "Coding & Debugging": 953, "Reasoning": 1031, "Editing": 1012, "Math": 1038, "Planning": 1074, "Brainstorming": 1094, "Role playing": 984, "Advice seeking": 1046, "Data Analysis": 977, "Others": 990, "average": 1019.5, "# battles": 2728}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 982, "Creative Writing": 964, "Coding & Debugging": 1011, "Reasoning": 994, "Editing": 989, "Math": 1024, "Planning": 981, "Brainstorming": 963, "Role playing": 991, "Advice seeking": 954, "Data Analysis": 1011, "Others": 1001, "average": 988.75, "# battles": 1532}
         | 
| 13 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 981, "Information seeking": 988, "Creative Writing": 986, "Coding & Debugging": 1003, "Reasoning": 977, "Editing": 992, "Math": 953, "Planning": 959, "Brainstorming": 981, "Role playing": 979, "Advice seeking": 949, "Data Analysis": 1008, "Others": 967, "average": 978.5, "# battles": 2715}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 967, "Information seeking": 991, "Creative Writing": 965, "Coding & Debugging": 911, "Reasoning": 980, "Editing": 908, "Math": 928, "Planning": 945, "Brainstorming": 940, "Role playing": 1004, "Advice seeking": 969, "Data Analysis": 894, "Others": 951, "average": 948.8333333333334, "# battles": 3630}
         | 
| 15 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 960, "Information seeking": 934, "Creative Writing": 983, "Coding & Debugging": 935, "Reasoning": 964, "Editing": 962, "Math": 1030, "Planning": 955, "Brainstorming": 1011, "Role playing": 972, "Advice seeking": 972, "Data Analysis": 981, "Others": 984, "average": 973.5833333333334, "# battles": 2689}
         | 
| 16 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 957, "Information seeking": 947, "Creative Writing": 939, "Coding & Debugging": 968, "Reasoning": 982, "Editing": 960, "Math": 979, "Planning": 942, "Brainstorming": 946, "Role playing": 962, "Advice seeking": 969, "Data Analysis": 1006, "Others": 1010, "average": 967.5, "# battles": 2144}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "Information seeking": 934, "Creative Writing": 940, "Coding & Debugging": 987, "Reasoning": 926, "Editing": 968, "Math": 931, "Planning": 933, "Brainstorming": 924, "Role playing": 940, "Advice seeking": 936, "Data Analysis": 1005, "Others": 987, "average": 950.9166666666666, "# battles": 2461}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 944, "Information seeking": 966, "Creative Writing": 964, "Coding & Debugging": 893, "Reasoning": 926, "Editing": 933, "Math": 921, "Planning": 927, "Brainstorming": 981, "Role playing": 981, "Advice seeking": 988, "Data Analysis": 929, "Others": 978, "average": 948.9166666666666, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 940, "Information seeking": 921, "Creative Writing": 947, "Coding & Debugging": 945, "Reasoning": 924, "Editing": 958, "Math": 945, "Planning": 923, "Brainstorming": 917, "Role playing": 959, "Advice seeking": 915, "Data Analysis": 960, "Others": 991, "average": 942.0833333333334, "# battles": 2091}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 932, "Information seeking": 923, "Creative Writing": 931, "Coding & Debugging": 983, "Reasoning": 953, "Editing": 979, "Math": 929, "Planning": 932, "Brainstorming": 908, "Role playing": 912, "Advice seeking": 945, "Data Analysis": 970, "Others": 1008, "average": 947.75, "# battles": 2406}
         | 
| 21 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 919, "Information seeking": 976, "Creative Writing": 917, "Coding & Debugging": 848, "Reasoning": 928, "Editing": 902, "Math": 886, "Planning": 913, "Brainstorming": 883, "Role playing": 978, "Advice seeking": 948, "Data Analysis": 915, "Others": 940, "average": 919.5, "# battles": 3543}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 914, "Information seeking": 899, "Creative Writing": 927, "Coding & Debugging": 931, "Reasoning": 885, "Editing": 972, "Math": 905, "Planning": 914, "Brainstorming": 890, "Role playing": 954, "Advice seeking": 931, "Data Analysis": 956, "Others": 1007, "average": 930.9166666666666, "# battles": 2094}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 899, "Information seeking": 871, "Creative Writing": 907, "Coding & Debugging": 962, "Reasoning": 877, "Editing": 958, "Math": 923, "Planning": 875, "Brainstorming": 893, "Role playing": 916, "Advice seeking": 904, "Data Analysis": 956, "Others": 996, "average": 919.8333333333334, "# battles": 2366}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 886, "Information seeking": 897, "Creative Writing": 858, "Coding & Debugging": 925, "Reasoning": 910, "Editing": 899, "Math": 907, "Planning": 908, "Brainstorming": 883, "Role playing": 909, "Advice seeking": 939, "Data Analysis": 900, "Others": 977, "average": 909.3333333333334, "# battles": 2659}
         | 
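
For reference, each per-penalty leaderboard added in this commit (data_dir/elo_ranks.L=*.jsonl and data_dir/elo_ranks.all.L=*.jsonl) is a JSON-Lines table with the keys shown above, including the trailing space in the "model name " key. A minimal sketch for loading one of them with pandas, assuming only the file layout visible in this diff:

import pandas as pd

# Load one per-penalty leaderboard (path taken from the file list in this commit).
df = pd.read_json("data_dir/elo_ranks.all.L=1.0.jsonl", lines=True)
# The model-name key carries a trailing space in these files.
df = df.rename(columns={"model name ": "model name"})
top = df.sort_values("elo overall", ascending=False)
print(top[["model name", "elo overall", "average", "# battles"]].head(10).to_string(index=False))
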
    	
        data_dir/elo_ranks.all.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1293, "Information seeking": 1267, "Creative Writing": 1248, "Coding & Debugging": 1366, "Reasoning": 1353, "Editing": 1210, "Math": 1275, "Planning": 1294, "Brainstorming": 1311, "Role playing": 1231, "Advice seeking": 1287, "Data Analysis": 1277, "Others": 1066, "average": 1265.4166666666667, "# battles": 5781}
         | 
| 2 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1153, "Information seeking": 1137, "Creative Writing": 1073, "Coding & Debugging": 1303, "Reasoning": 1218, "Editing": 1166, "Math": 1207, "Planning": 1188, "Brainstorming": 1131, "Role playing": 982, "Advice seeking": 1140, "Data Analysis": 1183, "Others": 1036, "average": 1147.0, "# battles": 3658}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1134, "Information seeking": 1104, "Creative Writing": 1077, "Coding & Debugging": 1247, "Reasoning": 1171, "Editing": 1155, "Math": 1144, "Planning": 1167, "Brainstorming": 1096, "Role playing": 995, "Advice seeking": 1089, "Data Analysis": 1174, "Others": 1021, "average": 1120.0, "# battles": 2791}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1117, "Information seeking": 1092, "Creative Writing": 1106, "Coding & Debugging": 1162, "Reasoning": 1127, "Editing": 1106, "Math": 1090, "Planning": 1082, "Brainstorming": 1078, "Role playing": 1056, "Advice seeking": 1070, "Data Analysis": 1103, "Others": 1020, "average": 1091.0, "# battles": 2058}
         | 
| 5 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1099, "Information seeking": 1086, "Creative Writing": 1084, "Coding & Debugging": 1126, "Reasoning": 1059, "Editing": 1036, "Math": 1074, "Planning": 1048, "Brainstorming": 1036, "Role playing": 1082, "Advice seeking": 1019, "Data Analysis": 1044, "Others": 989, "average": 1056.9166666666667, "# battles": 1484}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1099, "Information seeking": 1078, "Creative Writing": 1139, "Coding & Debugging": 1136, "Reasoning": 1045, "Editing": 1106, "Math": 1017, "Planning": 1079, "Brainstorming": 1073, "Role playing": 1121, "Advice seeking": 1065, "Data Analysis": 1059, "Others": 1008, "average": 1077.1666666666667, "# battles": 2519}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1068, "Information seeking": 1073, "Creative Writing": 1046, "Coding & Debugging": 1115, "Reasoning": 1056, "Editing": 1013, "Math": 1059, "Planning": 1001, "Brainstorming": 1025, "Role playing": 1084, "Advice seeking": 1003, "Data Analysis": 1057, "Others": 994, "average": 1043.8333333333333, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1067, "Information seeking": 1086, "Creative Writing": 1044, "Coding & Debugging": 1088, "Reasoning": 1075, "Editing": 1026, "Math": 1056, "Planning": 1070, "Brainstorming": 1026, "Role playing": 1025, "Advice seeking": 1075, "Data Analysis": 1091, "Others": 1006, "average": 1055.6666666666667, "# battles": 2035}
         | 
| 9 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1053, "Information seeking": 1068, "Creative Writing": 1099, "Coding & Debugging": 924, "Reasoning": 1067, "Editing": 1040, "Math": 998, "Planning": 1088, "Brainstorming": 1143, "Role playing": 1066, "Advice seeking": 1088, "Data Analysis": 937, "Others": 1001, "average": 1043.25, "# battles": 2606}
         | 
| 10 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1027, "Information seeking": 1091, "Creative Writing": 1065, "Coding & Debugging": 866, "Reasoning": 1028, "Editing": 985, "Math": 962, "Planning": 1007, "Brainstorming": 1058, "Role playing": 1070, "Advice seeking": 1041, "Data Analysis": 943, "Others": 1022, "average": 1011.5, "# battles": 2144}
         | 
| 11 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1022, "Information seeking": 1020, "Creative Writing": 984, "Coding & Debugging": 1065, "Reasoning": 1010, "Editing": 985, "Math": 1042, "Planning": 996, "Brainstorming": 962, "Role playing": 1013, "Advice seeking": 991, "Data Analysis": 1025, "Others": 1009, "average": 1008.5, "# battles": 1532}
         | 
| 12 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1004, "Information seeking": 1052, "Creative Writing": 1051, "Coding & Debugging": 835, "Reasoning": 974, "Editing": 981, "Math": 936, "Planning": 982, "Brainstorming": 1023, "Role playing": 1045, "Advice seeking": 1007, "Data Analysis": 920, "Others": 1015, "average": 985.0833333333334, "# battles": 2091}
         | 
| 13 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 990, "Information seeking": 997, "Creative Writing": 1017, "Coding & Debugging": 933, "Reasoning": 989, "Editing": 968, "Math": 967, "Planning": 959, "Brainstorming": 934, "Role playing": 1068, "Advice seeking": 972, "Data Analysis": 927, "Others": 988, "average": 976.5833333333334, "# battles": 3630}
         | 
| 14 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 981, "Information seeking": 1028, "Creative Writing": 1024, "Coding & Debugging": 830, "Reasoning": 951, "Editing": 961, "Math": 898, "Planning": 990, "Brainstorming": 997, "Role playing": 1052, "Advice seeking": 1024, "Data Analysis": 929, "Others": 1012, "average": 974.6666666666666, "# battles": 2094}
         | 
| 15 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 977, "Information seeking": 995, "Creative Writing": 987, "Coding & Debugging": 982, "Reasoning": 958, "Editing": 983, "Math": 925, "Planning": 961, "Brainstorming": 938, "Role playing": 1021, "Advice seeking": 936, "Data Analysis": 1003, "Others": 958, "average": 970.5833333333334, "# battles": 3543}
         | 
| 16 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 973, "Information seeking": 931, "Creative Writing": 931, "Coding & Debugging": 1149, "Reasoning": 1015, "Editing": 992, "Math": 1147, "Planning": 981, "Brainstorming": 930, "Role playing": 920, "Advice seeking": 957, "Data Analysis": 1068, "Others": 980, "average": 1000.0833333333334, "# battles": 14196}
         | 
| 17 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 947, "Information seeking": 930, "Creative Writing": 935, "Coding & Debugging": 981, "Reasoning": 955, "Editing": 919, "Math": 984, "Planning": 980, "Brainstorming": 982, "Role playing": 939, "Advice seeking": 978, "Data Analysis": 974, "Others": 990, "average": 962.25, "# battles": 2728}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 941, "Information seeking": 932, "Creative Writing": 935, "Coding & Debugging": 957, "Reasoning": 920, "Editing": 934, "Math": 925, "Planning": 976, "Brainstorming": 995, "Role playing": 941, "Advice seeking": 961, "Data Analysis": 954, "Others": 978, "average": 950.6666666666666, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 884, "Information seeking": 896, "Creative Writing": 898, "Coding & Debugging": 813, "Reasoning": 898, "Editing": 939, "Math": 892, "Planning": 902, "Brainstorming": 939, "Role playing": 898, "Advice seeking": 942, "Data Analysis": 923, "Others": 1000, "average": 911.6666666666666, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 859, "Information seeking": 835, "Creative Writing": 895, "Coding & Debugging": 871, "Reasoning": 816, "Editing": 878, "Math": 880, "Planning": 895, "Brainstorming": 925, "Role playing": 896, "Advice seeking": 881, "Data Analysis": 893, "Others": 977, "average": 886.8333333333334, "# battles": 2689}
         | 
| 21 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 853, "Information seeking": 872, "Creative Writing": 865, "Coding & Debugging": 790, "Reasoning": 880, "Editing": 956, "Math": 908, "Planning": 895, "Brainstorming": 892, "Role playing": 863, "Advice seeking": 938, "Data Analysis": 877, "Others": 1006, "average": 895.1666666666666, "# battles": 2406}
         | 
| 22 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 843, "Information seeking": 844, "Creative Writing": 863, "Coding & Debugging": 803, "Reasoning": 837, "Editing": 871, "Math": 873, "Planning": 819, "Brainstorming": 870, "Role playing": 904, "Advice seeking": 839, "Data Analysis": 866, "Others": 971, "average": 863.3333333333334, "# battles": 2715}
         | 
| 23 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 819, "Information seeking": 778, "Creative Writing": 798, "Coding & Debugging": 930, "Reasoning": 787, "Editing": 870, "Math": 885, "Planning": 802, "Brainstorming": 773, "Role playing": 883, "Advice seeking": 815, "Data Analysis": 912, "Others": 962, "average": 849.5833333333334, "# battles": 2659}
         | 
| 24 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 804, "Information seeking": 802, "Creative Writing": 833, "Coding & Debugging": 737, "Reasoning": 801, "Editing": 916, "Math": 849, "Planning": 832, "Brainstorming": 854, "Role playing": 848, "Advice seeking": 884, "Data Analysis": 859, "Others": 995, "average": 850.8333333333334, "# battles": 2366}
         | 
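
In the ".all" variants, the "average" field is the unweighted mean of the twelve per-task Elo columns (for example, 1265.4166... for gpt-4-0125-preview above is the mean of its twelve task scores). A small consistency check, assuming the same data_dir layout shown in this commit:

import pandas as pd

TASKS = ["Information seeking", "Creative Writing", "Coding & Debugging", "Reasoning",
         "Editing", "Math", "Planning", "Brainstorming", "Role playing",
         "Advice seeking", "Data Analysis", "Others"]

df = pd.read_json("data_dir/elo_ranks.all.jsonl", lines=True)
# "average" should equal the mean of the 12 per-task Elo columns for every model.
assert ((df[TASKS].mean(axis=1) - df["average"]).abs() < 1e-6).all()
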
    	
        data_dir/elo_ranks.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1293, "# battles": 5781}
         | 
| 2 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1153, "# battles": 3658}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1134, "# battles": 2791}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1117, "# battles": 2058}
         | 
| 5 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1099, "# battles": 1484}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1099, "# battles": 2519}
         | 
| 7 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1068, "# battles": 3619}
         | 
| 8 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1067, "# battles": 2035}
         | 
| 9 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1053, "# battles": 2606}
         | 
| 10 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1027, "# battles": 2144}
         | 
| 11 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1022, "# battles": 1532}
         | 
| 12 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1004, "# battles": 2091}
         | 
| 13 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 990, "# battles": 3630}
         | 
| 14 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 981, "# battles": 2094}
         | 
| 15 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 977, "# battles": 3543}
         | 
| 16 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 973, "# battles": 14196}
         | 
| 17 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 947, "# battles": 2728}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 941, "# battles": 1939}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 884, "# battles": 2461}
         | 
| 20 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 859, "# battles": 2689}
         | 
| 21 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 853, "# battles": 2406}
         | 
| 22 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 843, "# battles": 2715}
         | 
| 23 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 819, "# battles": 2659}
         | 
| 24 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 804, "# battles": 2366}
         | 
    	
        data_dir/elo_ranks.length_ablation.all.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1187, "Information seeking": 1216, "Creative Writing": 1152, "Coding & Debugging": 1214, "Reasoning": 1245, "Editing": 1105, "Math": 1204, "Planning": 1171, "Brainstorming": 1148, "Role playing": 1176, "Advice seeking": 1223, "Data Analysis": 1183, "Others": 1027, "average": 1172.0, "# battles": 6611}
         | 
| 2 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1089, "Information seeking": 1167, "Creative Writing": 1151, "Coding & Debugging": 920, "Reasoning": 1065, "Editing": 1045, "Math": 985, "Planning": 1087, "Brainstorming": 1122, "Role playing": 1126, "Advice seeking": 1088, "Data Analysis": 947, "Others": 1016, "average": 1059.9166666666667, "# battles": 2144}
         | 
| 3 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1086, "Information seeking": 1076, "Creative Writing": 1101, "Coding & Debugging": 1102, "Reasoning": 1054, "Editing": 1055, "Math": 1078, "Planning": 1035, "Brainstorming": 1031, "Role playing": 1110, "Advice seeking": 1033, "Data Analysis": 1071, "Others": 1017, "average": 1063.5833333333333, "# battles": 2519}
         | 
| 4 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1080, "Information seeking": 1154, "Creative Writing": 1114, "Coding & Debugging": 916, "Reasoning": 1082, "Editing": 999, "Math": 995, "Planning": 1089, "Brainstorming": 1120, "Role playing": 1121, "Advice seeking": 1106, "Data Analysis": 980, "Others": 1007, "average": 1056.9166666666667, "# battles": 2094}
         | 
| 5 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1078, "Information seeking": 1061, "Creative Writing": 1096, "Coding & Debugging": 1097, "Reasoning": 1016, "Editing": 1029, "Math": 1018, "Planning": 1043, "Brainstorming": 1039, "Role playing": 1087, "Advice seeking": 1013, "Data Analysis": 1035, "Others": 1003, "average": 1044.75, "# battles": 1484}
         | 
| 6 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1075, "Information seeking": 1152, "Creative Writing": 1111, "Coding & Debugging": 906, "Reasoning": 1060, "Editing": 1026, "Math": 994, "Planning": 1070, "Brainstorming": 1120, "Role playing": 1092, "Advice seeking": 1114, "Data Analysis": 971, "Others": 1031, "average": 1053.9166666666667, "# battles": 2091}
         | 
| 7 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 1066, "Information seeking": 1026, "Creative Writing": 1082, "Coding & Debugging": 1128, "Reasoning": 1050, "Editing": 1092, "Math": 1055, "Planning": 1064, "Brainstorming": 1066, "Role playing": 1041, "Advice seeking": 990, "Data Analysis": 1095, "Others": 1010, "average": 1058.25, "# battles": 3543}
         | 
| 8 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1037, "Information seeking": 1038, "Creative Writing": 1020, "Coding & Debugging": 1047, "Reasoning": 1015, "Editing": 995, "Math": 1006, "Planning": 1020, "Brainstorming": 1000, "Role playing": 1021, "Advice seeking": 1045, "Data Analysis": 1012, "Others": 1001, "average": 1018.3333333333334, "# battles": 1532}
         | 
| 9 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1036, "Information seeking": 1068, "Creative Writing": 1052, "Coding & Debugging": 946, "Reasoning": 1084, "Editing": 988, "Math": 1063, "Planning": 1081, "Brainstorming": 1114, "Role playing": 1045, "Advice seeking": 1096, "Data Analysis": 968, "Others": 1002, "average": 1042.25, "# battles": 2606}
         | 
| 10 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 1031, "Information seeking": 1010, "Creative Writing": 1052, "Coding & Debugging": 1034, "Reasoning": 1017, "Editing": 1056, "Math": 1047, "Planning": 1024, "Brainstorming": 1003, "Role playing": 1058, "Advice seeking": 1015, "Data Analysis": 1035, "Others": 1025, "average": 1031.3333333333333, "# battles": 3630}
         | 
| 11 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1013, "Information seeking": 1007, "Creative Writing": 1000, "Coding & Debugging": 1047, "Reasoning": 1008, "Editing": 999, "Math": 1012, "Planning": 967, "Brainstorming": 979, "Role playing": 1028, "Advice seeking": 976, "Data Analysis": 1027, "Others": 1023, "average": 1006.0833333333334, "# battles": 3620}
         | 
| 12 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1009, "Information seeking": 981, "Creative Writing": 947, "Coding & Debugging": 1143, "Reasoning": 1043, "Editing": 1073, "Math": 1071, "Planning": 1035, "Brainstorming": 942, "Role playing": 903, "Advice seeking": 987, "Data Analysis": 1083, "Others": 995, "average": 1016.9166666666666, "# battles": 2791}
         | 
| 13 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1004, "Information seeking": 968, "Creative Writing": 943, "Coding & Debugging": 1169, "Reasoning": 1043, "Editing": 1095, "Math": 1034, "Planning": 1018, "Brainstorming": 940, "Role playing": 881, "Advice seeking": 1006, "Data Analysis": 1083, "Others": 1004, "average": 1015.3333333333334, "# battles": 4488}
         | 
| 14 | 
            +
            {"model name ": "command", "elo overall": 996, "Information seeking": 965, "Creative Writing": 973, "Coding & Debugging": 1064, "Reasoning": 995, "Editing": 1008, "Math": 1001, "Planning": 1064, "Brainstorming": 1027, "Role playing": 956, "Advice seeking": 974, "Data Analysis": 1024, "Others": 1003, "average": 1004.5, "# battles": 1939}
         | 
| 15 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 987, "Information seeking": 953, "Creative Writing": 991, "Coding & Debugging": 1009, "Reasoning": 975, "Editing": 1002, "Math": 997, "Planning": 937, "Brainstorming": 996, "Role playing": 977, "Advice seeking": 951, "Data Analysis": 1007, "Others": 1001, "average": 983.0, "# battles": 2058}
         | 
| 16 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 958, "Information seeking": 943, "Creative Writing": 947, "Coding & Debugging": 993, "Reasoning": 974, "Editing": 968, "Math": 980, "Planning": 932, "Brainstorming": 951, "Role playing": 966, "Advice seeking": 949, "Data Analysis": 1004, "Others": 996, "average": 966.9166666666666, "# battles": 2036}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 951, "Information seeking": 972, "Creative Writing": 972, "Coding & Debugging": 859, "Reasoning": 984, "Editing": 974, "Math": 954, "Planning": 987, "Brainstorming": 1019, "Role playing": 975, "Advice seeking": 1023, "Data Analysis": 926, "Others": 1018, "average": 971.9166666666666, "# battles": 2461}
         | 
| 18 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 937, "Information seeking": 956, "Creative Writing": 959, "Coding & Debugging": 837, "Reasoning": 943, "Editing": 983, "Math": 987, "Planning": 970, "Brainstorming": 1001, "Role playing": 971, "Advice seeking": 1009, "Data Analysis": 920, "Others": 1001, "average": 961.4166666666666, "# battles": 2406}
         | 
| 19 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 931, "Information seeking": 922, "Creative Writing": 884, "Coding & Debugging": 1025, "Reasoning": 918, "Editing": 893, "Math": 944, "Planning": 891, "Brainstorming": 881, "Role playing": 951, "Advice seeking": 921, "Data Analysis": 994, "Others": 999, "average": 935.25, "# battles": 2729}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 926, "Information seeking": 947, "Creative Writing": 954, "Coding & Debugging": 817, "Reasoning": 942, "Editing": 967, "Math": 933, "Planning": 972, "Brainstorming": 980, "Role playing": 954, "Advice seeking": 985, "Data Analysis": 914, "Others": 1002, "average": 947.25, "# battles": 2366}
         | 
| 21 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 926, "Information seeking": 878, "Creative Writing": 928, "Coding & Debugging": 1006, "Reasoning": 895, "Editing": 973, "Math": 967, "Planning": 900, "Brainstorming": 890, "Role playing": 959, "Advice seeking": 869, "Data Analysis": 1005, "Others": 984, "average": 937.8333333333334, "# battles": 2660}
         | 
| 22 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 894, "Information seeking": 898, "Creative Writing": 895, "Coding & Debugging": 934, "Reasoning": 851, "Editing": 908, "Math": 839, "Planning": 929, "Brainstorming": 909, "Role playing": 912, "Advice seeking": 901, "Data Analysis": 912, "Others": 990, "average": 906.5, "# battles": 2689}
         | 
| 23 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 860, "Information seeking": 849, "Creative Writing": 865, "Coding & Debugging": 829, "Reasoning": 867, "Editing": 880, "Math": 923, "Planning": 864, "Brainstorming": 888, "Role playing": 903, "Advice seeking": 888, "Data Analysis": 870, "Others": 997, "average": 885.25, "# battles": 2715}
         | 
| 24 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 843, "Information seeking": 784, "Creative Writing": 805, "Coding & Debugging": 966, "Reasoning": 869, "Editing": 898, "Math": 906, "Planning": 856, "Brainstorming": 837, "Role playing": 784, "Advice seeking": 836, "Data Analysis": 933, "Others": 852, "average": 860.5, "# battles": 14196}
         | 
    	
        data_dir/elo_ranks.length_ablation.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1187, "# battles": 6611}
         | 
| 2 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1089, "# battles": 2144}
         | 
| 3 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1086, "# battles": 2519}
         | 
| 4 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1080, "# battles": 2094}
         | 
| 5 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1078, "# battles": 1484}
         | 
| 6 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1075, "# battles": 2091}
         | 
| 7 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 1066, "# battles": 3543}
         | 
| 8 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1037, "# battles": 1532}
         | 
| 9 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1036, "# battles": 2606}
         | 
| 10 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 1031, "# battles": 3630}
         | 
| 11 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1013, "# battles": 3620}
         | 
| 12 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1009, "# battles": 2791}
         | 
| 13 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1004, "# battles": 4488}
         | 
| 14 | 
            +
            {"model name ": "command", "elo overall": 996, "# battles": 1939}
         | 
| 15 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 987, "# battles": 2058}
         | 
| 16 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 958, "# battles": 2036}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 951, "# battles": 2461}
         | 
| 18 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 937, "# battles": 2406}
         | 
| 19 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 931, "# battles": 2729}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 926, "# battles": 2366}
         | 
| 21 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 926, "# battles": 2660}
         | 
| 22 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 894, "# battles": 2689}
         | 
| 23 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 860, "# battles": 2715}
         | 
| 24 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 843, "# battles": 14196}
         | 
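
The standard tables (elo_ranks*.jsonl) and the length-ablation tables (elo_ranks.length_ablation*.jsonl) rank the same models under two scoring regimes, and the per-L snapshots above apply a length-penalty coefficient L to the same data. The exact penalty method is defined elsewhere in this commit and is not reproduced here; the sketch below shows only one plausible approach (a linear blend of the two tables), and blend_elo is a hypothetical helper name, not a function from this repo:

import pandas as pd

def blend_elo(standard_path: str, ablation_path: str, L: float) -> pd.Series:
    # Assumed illustration: mix the overall Elo from the standard table with the
    # overall Elo from the length-ablation table using a coefficient L in [0, 1].
    # This is not the repo's actual formula, only a stand-in for the idea.
    std = pd.read_json(standard_path, lines=True).set_index("model name ")["elo overall"]
    abl = pd.read_json(ablation_path, lines=True).set_index("model name ")["elo overall"]
    return ((1 - L) * std + L * abl).sort_values(ascending=False)

print(blend_elo("data_dir/elo_ranks.jsonl",
                "data_dir/elo_ranks.length_ablation.jsonl", L=0.5).head(10))
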
    	
        data_dir/elo_ranks.skip_empty.all.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1283, "Information seeking": 1262, "Creative Writing": 1250, "Coding & Debugging": 1319, "Reasoning": 1345, "Editing": 1200, "Math": 1268, "Planning": 1284, "Brainstorming": 1310, "Role playing": 1226, "Advice seeking": 1285, "Data Analysis": 1245, "Others": 1068, "average": 1255.1666666666667, "# battles": 5713}
         | 
| 2 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1146, "Information seeking": 1137, "Creative Writing": 1068, "Coding & Debugging": 1259, "Reasoning": 1218, "Editing": 1152, "Math": 1196, "Planning": 1181, "Brainstorming": 1132, "Role playing": 982, "Advice seeking": 1134, "Data Analysis": 1146, "Others": 1035, "average": 1136.6666666666667, "# battles": 3494}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1119, "Information seeking": 1099, "Creative Writing": 1074, "Coding & Debugging": 1202, "Reasoning": 1166, "Editing": 1144, "Math": 1137, "Planning": 1158, "Brainstorming": 1095, "Role playing": 993, "Advice seeking": 1086, "Data Analysis": 1140, "Others": 1020, "average": 1109.5, "# battles": 2665}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1108, "Information seeking": 1085, "Creative Writing": 1107, "Coding & Debugging": 1120, "Reasoning": 1123, "Editing": 1095, "Math": 1085, "Planning": 1079, "Brainstorming": 1078, "Role playing": 1063, "Advice seeking": 1072, "Data Analysis": 1069, "Others": 1020, "average": 1083.0, "# battles": 1971}
         | 
| 5 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1089, "Information seeking": 1075, "Creative Writing": 1101, "Coding & Debugging": 1050, "Reasoning": 1090, "Editing": 1067, "Math": 1023, "Planning": 1119, "Brainstorming": 1145, "Role playing": 1065, "Advice seeking": 1103, "Data Analysis": 1017, "Others": 1002, "average": 1071.4166666666667, "# battles": 2292}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1085, "Information seeking": 1075, "Creative Writing": 1132, "Coding & Debugging": 1089, "Reasoning": 1033, "Editing": 1098, "Math": 1005, "Planning": 1069, "Brainstorming": 1075, "Role playing": 1120, "Advice seeking": 1059, "Data Analysis": 1031, "Others": 1007, "average": 1066.0833333333333, "# battles": 2461}
         | 
| 7 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1082, "Information seeking": 1111, "Creative Writing": 1069, "Coding & Debugging": 1059, "Reasoning": 1075, "Editing": 1019, "Math": 1053, "Planning": 1070, "Brainstorming": 1040, "Role playing": 1054, "Advice seeking": 1084, "Data Analysis": 1067, "Others": 1008, "average": 1059.0833333333333, "# battles": 1852}
         | 
| 8 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1081, "Information seeking": 1081, "Creative Writing": 1085, "Coding & Debugging": 1075, "Reasoning": 1048, "Editing": 1022, "Math": 1070, "Planning": 1038, "Brainstorming": 1038, "Role playing": 1077, "Advice seeking": 1016, "Data Analysis": 1021, "Others": 989, "average": 1046.6666666666667, "# battles": 1428}
         | 
| 9 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1069, "Information seeking": 1103, "Creative Writing": 1069, "Coding & Debugging": 987, "Reasoning": 1064, "Editing": 1007, "Math": 989, "Planning": 1034, "Brainstorming": 1062, "Role playing": 1072, "Advice seeking": 1046, "Data Analysis": 1026, "Others": 1022, "average": 1040.0833333333333, "# battles": 1882}
         | 
| 10 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1052, "Information seeking": 1070, "Creative Writing": 1045, "Coding & Debugging": 1067, "Reasoning": 1049, "Editing": 1005, "Math": 1046, "Planning": 990, "Brainstorming": 1024, "Role playing": 1080, "Advice seeking": 998, "Data Analysis": 1025, "Others": 993, "average": 1032.6666666666667, "# battles": 3551}
         | 
| 11 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1035, "Information seeking": 1064, "Creative Writing": 1050, "Coding & Debugging": 935, "Reasoning": 996, "Editing": 1004, "Math": 955, "Planning": 1005, "Brainstorming": 1024, "Role playing": 1043, "Advice seeking": 1012, "Data Analysis": 984, "Others": 1014, "average": 1007.1666666666666, "# battles": 1838}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1010, "Information seeking": 1014, "Creative Writing": 980, "Coding & Debugging": 1017, "Reasoning": 1008, "Editing": 973, "Math": 1033, "Planning": 985, "Brainstorming": 963, "Role playing": 1010, "Advice seeking": 991, "Data Analysis": 1000, "Others": 1009, "average": 998.5833333333334, "# battles": 1470}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1009, "Information seeking": 1034, "Creative Writing": 1025, "Coding & Debugging": 918, "Reasoning": 973, "Editing": 989, "Math": 908, "Planning": 1011, "Brainstorming": 997, "Role playing": 1049, "Advice seeking": 1037, "Data Analysis": 983, "Others": 1012, "average": 994.6666666666666, "# battles": 1838}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 979, "Information seeking": 991, "Creative Writing": 1007, "Coding & Debugging": 881, "Reasoning": 972, "Editing": 960, "Math": 961, "Planning": 943, "Brainstorming": 933, "Role playing": 1064, "Advice seeking": 968, "Data Analysis": 897, "Others": 986, "average": 963.5833333333334, "# battles": 3535}
         | 
| 15 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 965, "Information seeking": 988, "Creative Writing": 990, "Coding & Debugging": 925, "Reasoning": 942, "Editing": 971, "Math": 920, "Planning": 950, "Brainstorming": 937, "Role playing": 1016, "Advice seeking": 932, "Data Analysis": 966, "Others": 956, "average": 957.75, "# battles": 3486}
         | 
| 16 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 960, "Information seeking": 925, "Creative Writing": 927, "Coding & Debugging": 1093, "Reasoning": 1002, "Editing": 976, "Math": 1137, "Planning": 969, "Brainstorming": 926, "Role playing": 921, "Advice seeking": 950, "Data Analysis": 1018, "Others": 984, "average": 985.6666666666666, "# battles": 13526}
         | 
| 17 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 927, "Information seeking": 929, "Creative Writing": 929, "Coding & Debugging": 926, "Reasoning": 945, "Editing": 900, "Math": 974, "Planning": 964, "Brainstorming": 983, "Role playing": 938, "Advice seeking": 972, "Data Analysis": 947, "Others": 989, "average": 949.6666666666666, "# battles": 2638}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 920, "Information seeking": 939, "Creative Writing": 940, "Coding & Debugging": 892, "Reasoning": 908, "Editing": 923, "Math": 914, "Planning": 968, "Brainstorming": 992, "Role playing": 939, "Advice seeking": 956, "Data Analysis": 901, "Others": 979, "average": 937.5833333333334, "# battles": 1861}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 906, "Information seeking": 898, "Creative Writing": 897, "Coding & Debugging": 919, "Reasoning": 915, "Editing": 967, "Math": 904, "Planning": 923, "Brainstorming": 938, "Role playing": 899, "Advice seeking": 953, "Data Analysis": 1000, "Others": 1000, "average": 934.4166666666666, "# battles": 2153}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 872, "Information seeking": 870, "Creative Writing": 864, "Coding & Debugging": 888, "Reasoning": 901, "Editing": 980, "Math": 924, "Planning": 912, "Brainstorming": 889, "Role playing": 862, "Advice seeking": 954, "Data Analysis": 940, "Others": 1006, "average": 915.8333333333334, "# battles": 2095}
         | 
| 21 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 851, "Information seeking": 846, "Creative Writing": 861, "Coding & Debugging": 893, "Reasoning": 849, "Editing": 882, "Math": 893, "Planning": 830, "Brainstorming": 867, "Role playing": 899, "Advice seeking": 842, "Data Analysis": 928, "Others": 969, "average": 879.9166666666666, "# battles": 2381}
         | 
| 22 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 841, "Information seeking": 827, "Creative Writing": 894, "Coding & Debugging": 802, "Reasoning": 796, "Editing": 871, "Math": 871, "Planning": 878, "Brainstorming": 922, "Role playing": 898, "Advice seeking": 870, "Data Analysis": 856, "Others": 975, "average": 871.6666666666666, "# battles": 2613}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 815, "Information seeking": 803, "Creative Writing": 831, "Coding & Debugging": 817, "Reasoning": 815, "Editing": 937, "Math": 867, "Planning": 849, "Brainstorming": 854, "Role playing": 849, "Advice seeking": 890, "Data Analysis": 916, "Others": 995, "average": 868.5833333333334, "# battles": 2092}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 791, "Information seeking": 778, "Creative Writing": 794, "Coding & Debugging": 872, "Reasoning": 774, "Editing": 856, "Math": 872, "Planning": 785, "Brainstorming": 773, "Role playing": 880, "Advice seeking": 803, "Data Analysis": 875, "Others": 963, "average": 835.4166666666666, "# battles": 2595}
         | 
    	
        data_dir/elo_ranks.skip_empty.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1283, "# battles": 5713}
         | 
| 2 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1146, "# battles": 3494}
         | 
| 3 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1119, "# battles": 2665}
         | 
| 4 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1108, "# battles": 1971}
         | 
| 5 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1089, "# battles": 2292}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1085, "# battles": 2461}
         | 
| 7 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1082, "# battles": 1852}
         | 
| 8 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1081, "# battles": 1428}
         | 
| 9 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1069, "# battles": 1882}
         | 
| 10 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1052, "# battles": 3551}
         | 
| 11 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1035, "# battles": 1838}
         | 
| 12 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1010, "# battles": 1470}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1009, "# battles": 1838}
         | 
| 14 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 979, "# battles": 3535}
         | 
| 15 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 965, "# battles": 3486}
         | 
| 16 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 960, "# battles": 13526}
         | 
| 17 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 927, "# battles": 2638}
         | 
| 18 | 
            +
            {"model name ": "command", "elo overall": 920, "# battles": 1861}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 906, "# battles": 2153}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 872, "# battles": 2095}
         | 
| 21 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 851, "# battles": 2381}
         | 
| 22 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 841, "# battles": 2613}
         | 
| 23 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 815, "# battles": 2092}
         | 
| 24 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 791, "# battles": 2595}
         | 
    	
        data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
(previous version: 22 rows whose Elo values are truncated in this diff view and cannot be recovered)
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1172, "Information seeking": 1212, "Creative Writing": 1153, "Coding & Debugging": 1155, "Reasoning": 1234, "Editing": 1089, "Math": 1190, "Planning": 1161, "Brainstorming": 1149, "Role playing": 1174, "Advice seeking": 1221, "Data Analysis": 1142, "Others": 1024, "average": 1158.6666666666667, "# battles": 6543}
         | 
| 2 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1141, "Information seeking": 1184, "Creative Writing": 1150, "Coding & Debugging": 1049, "Reasoning": 1108, "Editing": 1068, "Math": 1010, "Planning": 1122, "Brainstorming": 1125, "Role playing": 1127, "Advice seeking": 1100, "Data Analysis": 1032, "Others": 1015, "average": 1090.8333333333333, "# battles": 1882}
         | 
| 3 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1126, "Information seeking": 1164, "Creative Writing": 1116, "Coding & Debugging": 1035, "Reasoning": 1114, "Editing": 1026, "Math": 1011, "Planning": 1114, "Brainstorming": 1119, "Role playing": 1119, "Advice seeking": 1119, "Data Analysis": 1050, "Others": 1005, "average": 1082.6666666666667, "# battles": 1838}
         | 
| 4 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1124, "Information seeking": 1164, "Creative Writing": 1114, "Coding & Debugging": 1031, "Reasoning": 1089, "Editing": 1055, "Math": 1014, "Planning": 1102, "Brainstorming": 1119, "Role playing": 1090, "Advice seeking": 1121, "Data Analysis": 1043, "Others": 1031, "average": 1081.0833333333333, "# battles": 1838}
         | 
| 5 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1090, "Information seeking": 1076, "Creative Writing": 1055, "Coding & Debugging": 1118, "Reasoning": 1111, "Editing": 1014, "Math": 1097, "Planning": 1120, "Brainstorming": 1116, "Role playing": 1049, "Advice seeking": 1109, "Data Analysis": 1069, "Others": 1002, "average": 1078.0, "# battles": 2292}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1068, "Information seeking": 1073, "Creative Writing": 1097, "Coding & Debugging": 1041, "Reasoning": 1043, "Editing": 1041, "Math": 1068, "Planning": 1017, "Brainstorming": 1032, "Role playing": 1106, "Advice seeking": 1030, "Data Analysis": 1033, "Others": 1017, "average": 1049.8333333333333, "# battles": 2461}
         | 
| 7 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1063, "Information seeking": 1059, "Creative Writing": 1093, "Coding & Debugging": 1037, "Reasoning": 1007, "Editing": 1013, "Math": 1009, "Planning": 1034, "Brainstorming": 1039, "Role playing": 1086, "Advice seeking": 1011, "Data Analysis": 1010, "Others": 1001, "average": 1033.25, "# battles": 1428}
         | 
| 8 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 1051, "Information seeking": 1020, "Creative Writing": 1079, "Coding & Debugging": 1059, "Reasoning": 1038, "Editing": 1081, "Math": 1040, "Planning": 1046, "Brainstorming": 1066, "Role playing": 1043, "Advice seeking": 989, "Data Analysis": 1052, "Others": 1010, "average": 1043.5833333333333, "# battles": 3486}
         | 
| 9 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1018, "Information seeking": 1037, "Creative Writing": 1019, "Coding & Debugging": 993, "Reasoning": 1005, "Editing": 983, "Math": 994, "Planning": 1012, "Brainstorming": 1002, "Role playing": 1026, "Advice seeking": 1043, "Data Analysis": 989, "Others": 1001, "average": 1008.6666666666666, "# battles": 1470}
         | 
| 10 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 1015, "Information seeking": 1003, "Creative Writing": 1045, "Coding & Debugging": 972, "Reasoning": 997, "Editing": 1040, "Math": 1034, "Planning": 1001, "Brainstorming": 1002, "Role playing": 1058, "Advice seeking": 1008, "Data Analysis": 1001, "Others": 1023, "average": 1015.3333333333334, "# battles": 3535}
         | 
| 11 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 991, "Information seeking": 1000, "Creative Writing": 1001, "Coding & Debugging": 981, "Reasoning": 992, "Editing": 987, "Math": 996, "Planning": 954, "Brainstorming": 979, "Role playing": 1026, "Advice seeking": 969, "Data Analysis": 985, "Others": 1024, "average": 991.1666666666666, "# battles": 3552}
         | 
| 12 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 991, "Information seeking": 976, "Creative Writing": 946, "Coding & Debugging": 1088, "Reasoning": 1035, "Editing": 1055, "Math": 1063, "Planning": 1017, "Brainstorming": 940, "Role playing": 902, "Advice seeking": 981, "Data Analysis": 1046, "Others": 994, "average": 1003.5833333333334, "# battles": 2665}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 986, "Information seeking": 981, "Creative Writing": 975, "Coding & Debugging": 993, "Reasoning": 1012, "Editing": 1010, "Math": 972, "Planning": 1016, "Brainstorming": 1019, "Role playing": 976, "Advice seeking": 1036, "Data Analysis": 1016, "Others": 1018, "average": 1002.0, "# battles": 2153}
         | 
| 14 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 985, "Information seeking": 959, "Creative Writing": 941, "Coding & Debugging": 1111, "Reasoning": 1030, "Editing": 1075, "Math": 1016, "Planning": 1003, "Brainstorming": 942, "Role playing": 876, "Advice seeking": 997, "Data Analysis": 1041, "Others": 1004, "average": 999.5833333333334, "# battles": 4324}
         | 
| 15 | 
            +
            {"model name ": "command", "elo overall": 976, "Information seeking": 961, "Creative Writing": 975, "Coding & Debugging": 999, "Reasoning": 978, "Editing": 1005, "Math": 994, "Planning": 1057, "Brainstorming": 1025, "Role playing": 954, "Advice seeking": 975, "Data Analysis": 972, "Others": 1003, "average": 991.5, "# battles": 1861}
         | 
| 16 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 969, "Information seeking": 966, "Creative Writing": 958, "Coding & Debugging": 960, "Reasoning": 969, "Editing": 1012, "Math": 1002, "Planning": 1005, "Brainstorming": 999, "Role playing": 969, "Advice seeking": 1022, "Data Analysis": 1001, "Others": 1002, "average": 988.75, "# battles": 2095}
         | 
| 17 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 966, "Information seeking": 950, "Creative Writing": 995, "Coding & Debugging": 947, "Reasoning": 967, "Editing": 992, "Math": 988, "Planning": 926, "Brainstorming": 996, "Role playing": 979, "Advice seeking": 952, "Data Analysis": 963, "Others": 1002, "average": 971.4166666666666, "# battles": 1971}
         | 
| 18 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 956, "Information seeking": 961, "Creative Writing": 964, "Coding & Debugging": 945, "Reasoning": 970, "Editing": 960, "Math": 976, "Planning": 924, "Brainstorming": 960, "Role playing": 984, "Advice seeking": 950, "Data Analysis": 968, "Others": 998, "average": 963.3333333333334, "# battles": 1853}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 953, "Information seeking": 951, "Creative Writing": 955, "Coding & Debugging": 921, "Reasoning": 962, "Editing": 994, "Math": 957, "Planning": 994, "Brainstorming": 981, "Role playing": 958, "Advice seeking": 1001, "Data Analysis": 995, "Others": 1002, "average": 972.5833333333334, "# battles": 2092}
         | 
| 20 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 902, "Information seeking": 872, "Creative Writing": 926, "Coding & Debugging": 934, "Reasoning": 882, "Editing": 955, "Math": 964, "Planning": 874, "Brainstorming": 885, "Role playing": 958, "Advice seeking": 858, "Data Analysis": 965, "Others": 984, "average": 921.4166666666666, "# battles": 2596}
         | 
| 21 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 901, "Information seeking": 911, "Creative Writing": 883, "Coding & Debugging": 967, "Reasoning": 900, "Editing": 873, "Math": 936, "Planning": 874, "Brainstorming": 880, "Role playing": 947, "Advice seeking": 909, "Data Analysis": 957, "Others": 998, "average": 919.5833333333334, "# battles": 2639}
         | 
| 22 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 876, "Information seeking": 853, "Creative Writing": 866, "Coding & Debugging": 936, "Reasoning": 884, "Editing": 898, "Math": 950, "Planning": 881, "Brainstorming": 889, "Role playing": 905, "Advice seeking": 893, "Data Analysis": 943, "Others": 994, "average": 907.6666666666666, "# battles": 2381}
         | 
| 23 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 870, "Information seeking": 887, "Creative Writing": 895, "Coding & Debugging": 857, "Reasoning": 825, "Editing": 895, "Math": 826, "Planning": 912, "Brainstorming": 908, "Role playing": 912, "Advice seeking": 889, "Data Analysis": 870, "Others": 990, "average": 888.8333333333334, "# battles": 2613}
         | 
| 24 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 812, "Information seeking": 775, "Creative Writing": 803, "Coding & Debugging": 881, "Reasoning": 845, "Editing": 874, "Math": 883, "Planning": 830, "Brainstorming": 836, "Role playing": 784, "Advice seeking": 824, "Data Analysis": 851, "Others": 855, "average": 836.75, "# battles": 13526}
         | 
    	
        data_dir/elo_ranks.skip_empty.length_ablation.jsonl
    CHANGED
    
    | @@ -1,22 +1,24 @@ | |
| 1 | 
            -
            {"model name ": "gpt-4-0125-preview", "elo overall":  | 
| 2 | 
            -
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall":  | 
| 3 | 
            -
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall":  | 
| 4 | 
            -
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall":  | 
| 5 | 
            -
            {"model name ": "Yi-34B-Chat", "elo overall":  | 
| 6 | 
            -
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall":  | 
| 7 | 
            -
            {"model name ": " | 
| 8 | 
            -
            {"model name ": " | 
| 9 | 
            -
            {"model name ": " | 
| 10 | 
            -
            {"model name ": " | 
| 11 | 
            -
            {"model name ": " | 
| 12 | 
            -
            {"model name ": " | 
| 13 | 
            -
            {"model name ": " | 
| 14 | 
            -
            {"model name ": " | 
| 15 | 
            -
            {"model name ": " | 
| 16 | 
            -
            {"model name ": " | 
| 17 | 
            -
            {"model name ": " | 
| 18 | 
            -
            {"model name ": " | 
| 19 | 
            -
            {"model name ": " | 
| 20 | 
            -
            {"model name ": " | 
| 21 | 
            -
            {"model name ": " | 
| 22 | 
            -
            {"model name ": " | 
|  | |
|  | 
|  | |
| 1 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1172, "# battles": 6543}
         | 
| 2 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 1141, "# battles": 1882}
         | 
| 3 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 1126, "# battles": 1838}
         | 
| 4 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 1124, "# battles": 1838}
         | 
| 5 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1090, "# battles": 2292}
         | 
| 6 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1068, "# battles": 2461}
         | 
| 7 | 
            +
            {"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1063, "# battles": 1428}
         | 
| 8 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 1051, "# battles": 3486}
         | 
| 9 | 
            +
            {"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 1018, "# battles": 1470}
         | 
| 10 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 1015, "# battles": 3535}
         | 
| 11 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 991, "# battles": 3552}
         | 
| 12 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 991, "# battles": 2665}
         | 
| 13 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 986, "# battles": 2153}
         | 
| 14 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 985, "# battles": 4324}
         | 
| 15 | 
            +
            {"model name ": "command", "elo overall": 976, "# battles": 1861}
         | 
| 16 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 969, "# battles": 2095}
         | 
| 17 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 966, "# battles": 1971}
         | 
| 18 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 956, "# battles": 1853}
         | 
| 19 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 953, "# battles": 2092}
         | 
| 20 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 902, "# battles": 2596}
         | 
| 21 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 901, "# battles": 2639}
         | 
| 22 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 876, "# battles": 2381}
         | 
| 23 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 870, "# battles": 2613}
         | 
| 24 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 812, "# battles": 13526}
         | 
    	
        data_dir/elo_ranks.test.jsonl
    ADDED
    
    | @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            {"model name ": "Qwen1.5-72B-Chat", "elo overall": 1131, "# battles": 3117}
         | 
| 2 | 
            +
            {"model name ": "gpt-4-0125-preview", "elo overall": 1130, "# battles": 5854}
         | 
| 3 | 
            +
            {"model name ": "claude-3-opus-20240229", "elo overall": 1120, "# battles": 3780}
         | 
| 4 | 
            +
            {"model name ": "claude-3-sonnet-20240229", "elo overall": 1107, "# battles": 2856}
         | 
| 5 | 
            +
            {"model name ": "mistral-large-2402", "elo overall": 1099, "# battles": 2119}
         | 
| 6 | 
            +
            {"model name ": "gemini-1.0-pro", "elo overall": 1072, "# battles": 1984}
         | 
| 7 | 
            +
            {"model name ": "gpt-3.5-turbo-0125", "elo overall": 1062, "# battles": 14279}
         | 
| 8 | 
            +
            {"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1041, "# battles": 3583}
         | 
| 9 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1030, "# battles": 2537}
         | 
| 10 | 
            +
            {"model name ": "Yi-34B-Chat", "elo overall": 1028, "# battles": 2591}
         | 
| 11 | 
            +
            {"model name ": "gemma-7b-it", "elo overall": 983, "# battles": 2693}
         | 
| 12 | 
            +
            {"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 976, "# battles": 2089}
         | 
| 13 | 
            +
            {"model name ": "tulu-2-dpo-70b", "elo overall": 967, "# battles": 3605}
         | 
| 14 | 
            +
            {"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 959, "# battles": 2064}
         | 
| 15 | 
            +
            {"model name ": "command", "elo overall": 943, "# battles": 1907}
         | 
| 16 | 
            +
            {"model name ": "gemma-2b-it", "elo overall": 941, "# battles": 2653}
         | 
| 17 | 
            +
            {"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 938, "# battles": 2055}
         | 
| 18 | 
            +
            {"model name ": "zephyr-7b-beta", "elo overall": 936, "# battles": 3511}
         | 
| 19 | 
            +
            {"model name ": "vicuna-13b-v1.5", "elo overall": 936, "# battles": 2666}
         | 
| 20 | 
            +
            {"model name ": "Llama-2-70b-chat-hf", "elo overall": 930, "# battles": 2571}
         | 
| 21 | 
            +
            {"model name ": "Llama-2-13b-chat-hf", "elo overall": 917, "# battles": 2507}
         | 
| 22 | 
            +
            {"model name ": "Llama-2-7b-chat-hf", "elo overall": 879, "# battles": 2479}
         | 
| 23 | 
            +
            {"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 870, "# battles": 2658}
         | 
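For readers inspecting these leaderboard files directly: every elo_ranks*.jsonl file above is plain JSON Lines, one record per model, using the keys exactly as shown (including the trailing space in the "model name " key). Below is a minimal loading sketch; the helper name is illustrative, and the newly added test file is used only as an example path.

```python
import json
import pandas as pd

def load_elo_ranks(path: str = "data_dir/elo_ranks.test.jsonl") -> pd.DataFrame:
    """Read one elo_ranks JSONL file into a DataFrame sorted by overall Elo."""
    with open(path) as f:
        records = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(records)
    # Note the trailing space in the "model name " key used by these data files.
    return df.sort_values(by="elo overall", ascending=False).reset_index(drop=True)

if __name__ == "__main__":
    print(load_elo_ranks()[["model name ", "elo overall", "# battles"]].head())
```

The .all. variants share the same record shape, with one additional Elo column per task type plus an "average" field.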
    	
        data_dir/pairwise_win_fractions.pkl
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
            -
            size  | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:75a83042bb54bc81d4bb63cebfe75fe04c1ca11f715783379750165a0968f5ca
         | 
| 3 | 
            +
            size 12668
         | 
    	
        data_dir/pairwise_win_fractions.png
    CHANGED
    
    	
        data_utils.py
    CHANGED
    
    | @@ -6,11 +6,12 @@ from constants import column_names, all_task_types | |
| 6 | 
             
            from utils_display import make_clickable_model
         | 
| 7 | 
             
            import random 
         | 
| 8 | 
             
            disable_progress_bar()
         | 
|  | |
| 9 |  | 
| 10 | 
             
            id_to_data = None 
         | 
| 11 | 
             
            model_len_info = None 
         | 
| 12 |  | 
| 13 | 
            -
            def estimated_win_rate(elo_a, elo_b):
         | 
| 14 | 
             
                """
         | 
| 15 | 
             
                Calculate the estimated win rate for player A against player B using their Elo ratings.
         | 
| 16 |  | 
| @@ -18,7 +19,7 @@ def estimated_win_rate(elo_a, elo_b): | |
| 18 | 
             
                :param elo_b: Elo rating of player B
         | 
| 19 | 
             
                :return: Estimated win rate for player A
         | 
| 20 | 
             
                """
         | 
| 21 | 
            -
                exponent = (elo_b - elo_a) / 400
         | 
| 22 | 
             
                probability_a_wins = 1 / (1 + 10 ** exponent)
         | 
| 23 | 
             
                return (1-probability_a_wins)*100
         | 
| 24 |  | 
| @@ -33,9 +34,9 @@ def formatter(x): | |
| 33 | 
             
                return x
         | 
| 34 |  | 
| 35 |  | 
| 36 | 
            -
            def add_winrates(current_df):
         | 
| 37 | 
             
                df = current_df.copy()
         | 
| 38 | 
            -
                elo_column = " | 
| 39 |  | 
| 40 | 
             
                # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
         | 
| 41 | 
             
                model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
         | 
| @@ -45,8 +46,8 @@ def add_winrates(current_df): | |
| 45 |  | 
| 46 |  | 
| 47 | 
             
                # Calculate the win rate of "gpt-4-0125-preview" against all models
         | 
| 48 | 
            -
                df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x)).apply(formatter)    
         | 
| 49 | 
            -
                df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x)).apply(formatter)    
         | 
| 50 | 
             
                # apply the formatter for the two new columns 
         | 
| 51 | 
             
                cols = list(df.columns)
         | 
| 52 | 
             
                cols.remove("# battles"); cols.append("# battles")
         | 
| @@ -54,18 +55,18 @@ def add_winrates(current_df): | |
| 54 | 
             
                df = df[cols]
         | 
| 55 | 
             
                return df
         | 
| 56 |  | 
| 57 | 
            -
            def add_winrates_tasks(current_df, ref="gpt-4"):
         | 
| 58 | 
             
                new_df = current_df.copy()
         | 
| 59 | 
             
                for t in all_task_types:
         | 
| 60 | 
             
                    column = column_names[t]
         | 
| 61 | 
             
                    model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
         | 
| 62 | 
            -
                    new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x)).apply(formatter)
         | 
| 63 | 
             
                return new_df
         | 
| 64 |  | 
| 65 |  | 
| 66 | 
             
            def post_processing(df, model_len_info):
         | 
| 67 | 
             
                if model_len_info:
         | 
| 68 | 
            -
                    df["Length"] = df["model name "].apply(lambda x: model_len_info[x])
         | 
| 69 |  | 
| 70 | 
             
                for col in df.columns:
         | 
| 71 | 
             
                    if col == "model name ":
         | 
| @@ -73,13 +74,16 @@ def post_processing(df, model_len_info): | |
| 73 | 
             
                    else:
         | 
| 74 | 
             
                        df[col] = df[col].apply(formatter) # For numerical values 
         | 
| 75 | 
             
                df.rename(columns=column_names, inplace=True)
         | 
| 76 | 
            -
                df.sort_values(by=" | 
| 77 | 
             
                # put the "Overall Elo" and "Task-Avg Elo" column to the front
         | 
| 78 | 
             
                # add the length info
         | 
| 79 | 
            -
                df = df[["Model", " | 
| 80 | 
             
                return df
         | 
| 81 |  | 
| 82 | 
            -
            def apply_length_penalty(original_df, ablation_df, length_penalty=0.2):
         | 
|  | |
|  | |
|  | |
| 83 | 
             
                original_df = original_df.copy()
         | 
| 84 | 
             
                ablation_df = ablation_df.copy()
         | 
| 85 | 
             
                # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
         | 
| @@ -92,11 +96,14 @@ def apply_length_penalty(original_df, ablation_df, length_penalty=0.2): | |
| 92 | 
             
                        # assert that the model names are the same in both dataframes
         | 
| 93 | 
             
                        assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
         | 
| 94 | 
             
                        original_df[col] = original_df[col].astype(float)
         | 
| 95 | 
            -
                         | 
|  | |
|  | |
|  | |
|  | |
| 96 | 
             
                # post_processing
         | 
| 97 | 
             
                original_df = post_processing(original_df, model_len_info=None)
         | 
| 98 | 
            -
                return original_df
         | 
| 99 | 
            -
             | 
| 100 |  | 
| 101 | 
             
            def load_benchdata():
         | 
| 102 | 
             
                print("Loading WildBench data...")
         | 
|  | |
| 6 | 
             
            from utils_display import make_clickable_model
         | 
| 7 | 
             
            import random 
         | 
| 8 | 
             
            disable_progress_bar()
         | 
| 9 | 
            +
            import math 
         | 
| 10 |  | 
| 11 | 
             
            id_to_data = None 
         | 
| 12 | 
             
            model_len_info = None 
         | 
| 13 |  | 
| 14 | 
            +
            def estimated_win_rate(elo_a, elo_b, LP=0):
         | 
| 15 | 
             
                """
         | 
| 16 | 
             
                Calculate the estimated win rate for player A against player B using their Elo ratings.
         | 
| 17 |  | 
|  | |
| 19 | 
             
                :param elo_b: Elo rating of player B
         | 
| 20 | 
             
                :return: Estimated win rate for player A
         | 
| 21 | 
             
                """
         | 
| 22 | 
            +
                exponent = (elo_b - elo_a)*(10**LP) / 400
         | 
| 23 | 
             
                probability_a_wins = 1 / (1 + 10 ** exponent)
         | 
| 24 | 
             
                return (1-probability_a_wins)*100
         | 
| 25 |  | 
|  | |
| 34 | 
             
                return x
         | 
| 35 |  | 
| 36 |  | 
| 37 | 
            +
            def add_winrates(current_df, LP=0):
         | 
| 38 | 
             
                df = current_df.copy()
         | 
| 39 | 
            +
                elo_column = "Task-Avg Elo" 
         | 
| 40 |  | 
| 41 | 
             
                # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
         | 
| 42 | 
             
                model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
         | 
|  | |
| 46 |  | 
| 47 |  | 
| 48 | 
             
                # Calculate the win rate of "gpt-4-0125-preview" against all models
         | 
| 49 | 
            +
                df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)    
         | 
| 50 | 
            +
                df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)    
         | 
| 51 | 
             
                # apply the formatter for the two new columns 
         | 
| 52 | 
             
                cols = list(df.columns)
         | 
| 53 | 
             
                cols.remove("# battles"); cols.append("# battles")
         | 
|  | |
| 55 | 
             
                df = df[cols]
         | 
| 56 | 
             
                return df
         | 
| 57 |  | 
| 58 | 
            +
            def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
         | 
| 59 | 
             
                new_df = current_df.copy()
         | 
| 60 | 
             
                for t in all_task_types:
         | 
| 61 | 
             
                    column = column_names[t]
         | 
| 62 | 
             
                    model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
         | 
| 63 | 
            +
                    new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
         | 
| 64 | 
             
                return new_df
         | 
| 65 |  | 
| 66 |  | 
| 67 | 
             
            def post_processing(df, model_len_info):
         | 
| 68 | 
             
                if model_len_info:
         | 
| 69 | 
            +
                    df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])
         | 
| 70 |  | 
| 71 | 
             
                for col in df.columns:
         | 
| 72 | 
             
                    if col == "model name ":
         | 
|  | |
| 74 | 
             
                    else:
         | 
| 75 | 
             
                        df[col] = df[col].apply(formatter) # For numerical values 
         | 
| 76 | 
             
                df.rename(columns=column_names, inplace=True)
         | 
| 77 | 
            +
                df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
         | 
| 78 | 
             
                # put the "Overall Elo" and "Task-Avg Elo" column to the front
         | 
| 79 | 
             
                # add the length info
         | 
| 80 | 
            +
                df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
         | 
| 81 | 
             
                return df
         | 
| 82 |  | 
| 83 | 
            +
            def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
         | 
| 84 | 
            +
                if mode == 'v2' and LP_original_dfs is not None:
         | 
| 85 | 
            +
                    L = f"{length_penalty:.1f}"
         | 
| 86 | 
            +
                    return LP_original_dfs[L]
         | 
| 87 | 
             
                original_df = original_df.copy()
         | 
| 88 | 
             
                ablation_df = ablation_df.copy()
         | 
| 89 | 
             
                # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
         | 
|  | |
| 96 | 
             
                        # assert that the model names are the same in both dataframes
         | 
| 97 | 
             
                        assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
         | 
| 98 | 
             
                        original_df[col] = original_df[col].astype(float)
         | 
| 99 | 
            +
                        if mode == "v1":
         | 
| 100 | 
            +
                            original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
         | 
| 101 | 
            +
                        elif mode == "v1.1":
         | 
| 102 | 
            +
                            diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] 
         | 
| 103 | 
            +
                            original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
         | 
| 104 | 
             
                # post_processing
         | 
| 105 | 
             
                original_df = post_processing(original_df, model_len_info=None)
         | 
| 106 | 
            +
                return original_df 
         | 
|  | |
| 107 |  | 
| 108 | 
             
            def load_benchdata():
         | 
| 109 | 
             
                print("Loading WildBench data...")
         | 
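To make the updated length-penalty method easier to follow, here is a standalone sketch (illustrative helper name and numbers, not repo code) of the per-cell arithmetic behind the two ablation-based modes of apply_length_penalty. As written in the diff, the v1 and v1.1 branches are algebraically equivalent, since elo*(1-LP) + (elo - ablation)*LP simplifies to elo - ablation*LP; the v2 mode skips this arithmetic and instead returns a precomputed leaderboard keyed by the penalty value formatted to one decimal place. The updated estimated_win_rate also gains an LP argument that scales the Elo gap by 10**LP before the logistic transform.

```python
def penalized_elo(elo: float, ablation_elo: float, length_penalty: float = 0.2,
                  mode: str = "v1") -> float:
    """Sketch of the per-cell adjustment used by the ablation-based modes."""
    if mode == "v1":
        # subtract a fraction of the length-ablation Elo
        return elo - ablation_elo * length_penalty
    if mode == "v1.1":
        # blend the raw Elo with its gap to the length-ablation Elo
        diff = elo - ablation_elo
        return elo * (1 - length_penalty) + diff * length_penalty
    raise ValueError(f"unsupported mode: {mode}")

# Both branches reduce to elo - ablation_elo * length_penalty:
print(penalized_elo(1200, 1100, 0.2, "v1"))    # 980.0
print(penalized_elo(1200, 1100, 0.2, "v1.1"))  # 980.0
```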
    	
        model_info.json
    CHANGED
    
    | @@ -4,7 +4,9 @@ | |
| 4 | 
             
                "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
         | 
| 5 | 
             
                "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
         | 
| 6 | 
             
                "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
         | 
|  | |
| 7 | 
             
                "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
         | 
|  | |
| 8 | 
             
                "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
         | 
| 9 | 
             
                "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
         | 
| 10 | 
             
                "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
         | 
| @@ -15,5 +17,6 @@ | |
| 15 | 
             
                "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
         | 
| 16 | 
             
                "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
         | 
| 17 | 
             
                "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
         | 
| 18 | 
            -
                "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"}
         | 
| 19 | 
            -
            }
         | 
|  | 
|  | |
| 4 | 
             
                "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
         | 
| 5 | 
             
                "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
         | 
| 6 | 
             
                "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
         | 
| 7 | 
            +
                "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
         | 
| 8 | 
             
                "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
         | 
| 9 | 
            +
                "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
         | 
| 10 | 
             
                "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
         | 
| 11 | 
             
                "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
         | 
| 12 | 
             
                "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
         | 
|  | |
| 17 | 
             
                "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
         | 
| 18 | 
             
                "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
         | 
| 19 | 
             
                "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
         | 
| 20 | 
            +
                "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
         | 
| 21 | 
            +
                "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"}
         | 
| 22 | 
            +
            }
         | 
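The two entries added to model_info.json follow the same schema as the existing ones: a pretty_name for display and an hf_model_id that is either a Hugging Face repo id or a plain URL for API-only models. A minimal sketch of consuming an entry (an illustrative helper, not the repo's make_clickable_model):

```python
import json

# Illustrative helper: render a model entry from model_info.json as a markdown
# link, resolving bare repo ids against huggingface.co and passing full URLs
# through unchanged.
with open("model_info.json") as f:
    model_info = json.load(f)

def model_link(name: str) -> str:
    info = model_info[name]
    target = info["hf_model_id"]
    url = target if target.startswith("http") else f"https://huggingface.co/{target}"
    return f"[{info['pretty_name']}]({url})"

print(model_link("Nous-Hermes-2-Mixtral-8x7B-DPO"))
print(model_link("zephyr-7b-gemma-v0.1"))
```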
    	
        model_len_info.json
    CHANGED
    
    | @@ -1,19 +1,102 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
            -
              "Llama-2-13b-chat-hf.nosp":  | 
| 3 | 
            -
             | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
               | 
| 7 | 
            -
              " | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
               | 
| 12 | 
            -
              " | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
               | 
| 17 | 
            -
              " | 
| 18 | 
            -
             | 
| 19 | 
             
            }
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
            +
              "Llama-2-13b-chat-hf.nosp": {
         | 
| 3 | 
            +
                "avg_len": 2943.346238938053,
         | 
| 4 | 
            +
                "empty_output": 120,
         | 
| 5 | 
            +
                "num_samples": 1024
         | 
| 6 | 
            +
              },
         | 
| 7 | 
            +
              "Llama-2-70b-chat-hf.nosp": {
         | 
| 8 | 
            +
                "avg_len": 3077.0840707964603,
         | 
| 9 | 
            +
                "empty_output": 120,
         | 
| 10 | 
            +
                "num_samples": 1024
         | 
| 11 | 
            +
              },
         | 
| 12 | 
            +
              "Llama-2-7b-chat-hf.nosp": {
         | 
| 13 | 
            +
                "avg_len": 2965.4059734513276,
         | 
| 14 | 
            +
                "empty_output": 120,
         | 
| 15 | 
            +
                "num_samples": 1024
         | 
| 16 | 
            +
              },
         | 
| 17 | 
            +
              "Llama-2-7b-chat-hf": {
         | 
| 18 | 
            +
                "avg_len": 2137.34,
         | 
| 19 | 
            +
                "empty_output": 124,
         | 
| 20 | 
            +
                "num_samples": 1024
         | 
| 21 | 
            +
              },
         | 
| 22 | 
            +
              "Mistral-7B-Instruct-v0.1": {
         | 
| 23 | 
            +
                "avg_len": 2208.8115234375,
         | 
| 24 | 
            +
                "empty_output": 0,
         | 
| 25 | 
            +
                "num_samples": 1024
         | 
| 26 | 
            +
              },
         | 
| 27 | 
            +
              "Mistral-7B-Instruct-v0.2": {
         | 
| 28 | 
            +
                "avg_len": 2852.33203125,
         | 
| 29 | 
            +
                "empty_output": 0,
         | 
| 30 | 
            +
                "num_samples": 1024
         | 
| 31 | 
            +
              },
         | 
| 32 | 
            +
              "Mixtral-8x7B-Instruct-v0.1": {
         | 
| 33 | 
            +
                "avg_len": 2483.9638671875,
         | 
| 34 | 
            +
                "empty_output": 0,
         | 
| 35 | 
            +
                "num_samples": 1024
         | 
| 36 | 
            +
              },
         | 
| 37 | 
            +
              "Nous-Hermes-2-Mixtral-8x7B-DPO": {
         | 
| 38 | 
            +
                "avg_len": 2878.79296875,
         | 
| 39 | 
            +
                "empty_output": 0,
         | 
| 40 | 
            +
                "num_samples": 1024
         | 
| 41 | 
            +
              },
         | 
| 42 | 
            +
              "Yi-34B-Chat": {
         | 
| 43 | 
            +
                "avg_len": 2899.1797133406835,
         | 
| 44 | 
            +
                "empty_output": 117,
         | 
| 45 | 
            +
                "num_samples": 1024
         | 
| 46 | 
            +
              },
         | 
| 47 | 
            +
              "gemini-1.0-pro": {
         | 
| 48 | 
            +
                "avg_len": 2407.559462254395,
         | 
| 49 | 
            +
                "empty_output": 57,
         | 
| 50 | 
            +
                "num_samples": 1024
         | 
| 51 | 
            +
              },
         | 
| 52 | 
            +
              "gemma-7b-it": {
         | 
| 53 | 
            +
                "avg_len": 1960.829244357213,
         | 
| 54 | 
            +
                "empty_output": 5,
         | 
| 55 | 
            +
                "num_samples": 1024
         | 
| 56 | 
            +
              },
         | 
| 57 | 
            +
              "gpt-3.5-turbo-0125": {
         | 
| 58 | 
            +
                "avg_len": 1725.7216796875,
         | 
| 59 | 
            +
                "empty_output": 0,
         | 
| 60 | 
            +
                "num_samples": 1024
         | 
| 61 | 
            +
              },
         | 
| 62 | 
            +
              "gpt-4-0125-preview": {
         | 
| 63 | 
            +
                "avg_len": 3190.716796875,
         | 
| 64 | 
            +
                "empty_output": 0,
         | 
| 65 | 
            +
                "num_samples": 1024
         | 
| 66 | 
            +
              },
         | 
| 67 | 
            +
              "tulu-2-dpo-70b": {
         | 
| 68 | 
            +
                "avg_len": 2630.2337917485265,
         | 
| 69 | 
            +
                "empty_output": 6,
         | 
| 70 | 
            +
                "num_samples": 1024
         | 
| 71 | 
            +
              },
         | 
| 72 | 
            +
              "vicuna-13b-v1.5": {
         | 
| 73 | 
            +
                "avg_len": 1864.2749445676275,
         | 
| 74 | 
            +
                "empty_output": 122,
         | 
| 75 | 
            +
                "num_samples": 1024
         | 
| 76 | 
            +
              },
         | 
| 77 | 
            +
              "zephyr-7b-beta": {
         | 
| 78 | 
            +
                "avg_len": 3011.2529296875,
         | 
| 79 | 
            +
                "empty_output": 0,
         | 
| 80 | 
            +
                "num_samples": 1024
         | 
| 81 | 
            +
              },
         | 
| 82 | 
            +
              "mistral-large-2402": {
         | 
| 83 | 
            +
                "avg_len": 2352.189453125,
         | 
| 84 | 
            +
                "empty_output": 0,
         | 
| 85 | 
            +
                "num_samples": 1024
         | 
| 86 | 
            +
              },
         | 
| 87 | 
            +
              "claude-3-opus-20240229": {
         | 
| 88 | 
            +
                "avg_len": 2460.330078125,
         | 
| 89 | 
            +
                "empty_output": 0,
         | 
| 90 | 
            +
                "num_samples": 1024
         | 
| 91 | 
            +
              },
         | 
| 92 | 
            +
              "claude-3-sonnet-20240229": {
         | 
| 93 | 
            +
                "avg_len": 2456.21484375,
         | 
| 94 | 
            +
                "empty_output": 0,
         | 
| 95 | 
            +
                "num_samples": 1024
         | 
| 96 | 
            +
              },
         | 
| 97 | 
            +
              "zephyr-7b-gemma-v0.1": {
         | 
| 98 | 
            +
                "avg_len": 2551.9842983316976,
         | 
| 99 | 
            +
                "empty_output": 5,
         | 
| 100 | 
            +
                "num_samples": 1024
         | 
| 101 | 
            +
              }
         | 
| 102 | 
             
            }
         | 
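model_len_info.json now stores a small dict per model (avg_len, empty_output, num_samples) rather than a bare average length, which is why post_processing above indexes model_len_info[x]["avg_len"]. A short sketch of consuming the new structure (the summary printout is illustrative):

```python
import json

# Read the restructured model_len_info.json and summarise each model's average
# output length and empty-output rate.
with open("model_len_info.json") as f:
    model_len_info = json.load(f)

for model, stats in sorted(model_len_info.items(), key=lambda kv: kv[1]["avg_len"]):
    empty_rate = stats["empty_output"] / stats["num_samples"]
    print(f"{model}: avg_len={stats['avg_len']:.0f}, "
          f"empty outputs={stats['empty_output']} ({empty_rate:.1%})")
```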
 
			
