Update space
Browse files- app.py +79 -32
 - src/populate.py +24 -7
 
    	
        app.py
    CHANGED
    
    | 
         @@ -183,29 +183,6 @@ with demo: 
     | 
|
| 183 | 
         
             
                            )
         
     | 
| 184 | 
         
             
                        )
         
     | 
| 185 | 
         | 
| 186 | 
         
            -
                    with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
         
     | 
| 187 | 
         
            -
                        DESCRIPTION_TEXT = """
         
     | 
| 188 | 
         
            -
                        Overall dimension measures the comprehensive performance of LLMs across diverse tasks. 
         
     | 
| 189 | 
         
            -
                        We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), 
         
     | 
| 190 | 
         
            -
                        covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
         
     | 
| 191 | 
         
            -
                        """
         
     | 
| 192 | 
         
            -
                        gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
         
     | 
| 193 | 
         
            -
                        
         
     | 
| 194 | 
         
            -
                        with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"): 
         
     | 
| 195 | 
         
            -
                            leaderboard = overall_leaderboard(
         
     | 
| 196 | 
         
            -
                                get_model_leaderboard_df(
         
     | 
| 197 | 
         
            -
                                    model_result_path,
         
     | 
| 198 | 
         
            -
                                    benchmark_cols=[
         
     | 
| 199 | 
         
            -
                                        AutoEvalColumn.rank_overall.name,
         
     | 
| 200 | 
         
            -
                                        AutoEvalColumn.model.name, 
         
     | 
| 201 | 
         
            -
                                        AutoEvalColumn.score_overall.name,
         
     | 
| 202 | 
         
            -
                                        AutoEvalColumn.sd_overall.name,
         
     | 
| 203 | 
         
            -
                                        AutoEvalColumn.license.name,
         
     | 
| 204 | 
         
            -
                                        AutoEvalColumn.organization.name,
         
     | 
| 205 | 
         
            -
                                        AutoEvalColumn.knowledge_cutoff.name,
         
     | 
| 206 | 
         
            -
                                        ],
         
     | 
| 207 | 
         
            -
                                    rank_col=[AutoEvalColumn.rank_overall.name],
         
     | 
| 208 | 
         
            -
                                ))
         
     | 
| 209 | 
         | 
| 210 | 
         | 
| 211 | 
         
             
                    with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
         
     | 
| 
         @@ -232,11 +209,18 @@ with demo: 
     | 
|
| 232 | 
         
             
                                    model_result_path,
         
     | 
| 233 | 
         
             
                                    benchmark_cols=[
         
     | 
| 234 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 235 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 236 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 237 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 238 | 
         
             
                                        ],
         
     | 
| 239 | 
         
            -
                                    rank_col=[],
         
     | 
| 240 | 
         
             
                                )
         
     | 
| 241 | 
         
             
                            )
         
     | 
| 242 | 
         | 
| 
         @@ -292,6 +276,21 @@ with demo: 
     | 
|
| 292 | 
         
             
                                )
         
     | 
| 293 | 
         
             
                            )
         
     | 
| 294 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 295 | 
         
             
                    with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
         
     | 
| 296 | 
         
             
                        DESCRIPTION_TEXT = """
         
     | 
| 297 | 
         
             
                        Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs. 
         
     | 
| 
         @@ -323,10 +322,16 @@ with demo: 
     | 
|
| 323 | 
         
             
                                    model_result_path,
         
     | 
| 324 | 
         
             
                                    benchmark_cols=[
         
     | 
| 325 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 326 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 327 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 328 | 
         
             
                                        ],
         
     | 
| 329 | 
         
            -
                                    rank_col=[],
         
     | 
| 330 | 
         
             
                                )
         
     | 
| 331 | 
         
             
                            )
         
     | 
| 332 | 
         | 
| 
         @@ -364,6 +369,19 @@ with demo: 
     | 
|
| 364 | 
         
             
                                )
         
     | 
| 365 | 
         
             
                            )
         
     | 
| 366 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 367 | 
         
             
                    with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
         
     | 
| 368 | 
         
             
                        CURRENT_TEXT = """
         
     | 
| 369 | 
         
             
                        Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
         
     | 
| 
         @@ -385,9 +403,14 @@ with demo: 
     | 
|
| 385 | 
         
             
                                    model_result_path,
         
     | 
| 386 | 
         
             
                                    benchmark_cols=[
         
     | 
| 387 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 388 | 
         
            -
                                        AutoEvalColumn. 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 389 | 
         
             
                                        ],
         
     | 
| 390 | 
         
            -
                                    rank_col=[],
         
     | 
| 391 | 
         
             
                                )
         
     | 
| 392 | 
         
             
                            )
         
     | 
| 393 | 
         | 
| 
         @@ -468,6 +491,30 @@ with demo: 
     | 
|
| 468 | 
         | 
| 469 | 
         | 
| 470 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 471 | 
         | 
| 472 | 
         | 
| 473 | 
         
             
                    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
         
     | 
| 
         | 
|
| 183 | 
         
             
                            )
         
     | 
| 184 | 
         
             
                        )
         
     | 
| 185 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 186 | 
         | 
| 187 | 
         | 
| 188 | 
         
             
                    with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
         
     | 
| 
         | 
|
| 209 | 
         
             
                                    model_result_path,
         
     | 
| 210 | 
         
             
                                    benchmark_cols=[
         
     | 
| 211 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 212 | 
         
            +
                                        AutoEvalColumn.license.name,
         
     | 
| 213 | 
         
            +
                                        AutoEvalColumn.organization.name,
         
     | 
| 214 | 
         
            +
                                        AutoEvalColumn.knowledge_cutoff.name,
         
     | 
| 215 | 
         
            +
             
     | 
| 216 | 
         
            +
                                        AutoEvalColumn.score_math_algebra.name,
         
     | 
| 217 | 
         
            +
                                        AutoEvalColumn.score_math_geometry.name,
         
     | 
| 218 | 
         
            +
                                        AutoEvalColumn.score_math_probability.name,
         
     | 
| 219 | 
         
            +
                                        # AutoEvalColumn.rank_math_algebra.name,
         
     | 
| 220 | 
         
            +
                                        # AutoEvalColumn.rank_math_geometry.name,
         
     | 
| 221 | 
         
            +
                                        # AutoEvalColumn.rank_math_probability.name,
         
     | 
| 222 | 
         
             
                                        ],
         
     | 
| 223 | 
         
            +
                                    rank_col=['sort_by_score'],
         
     | 
| 224 | 
         
             
                                )
         
     | 
| 225 | 
         
             
                            )
         
     | 
| 226 | 
         | 
| 
         | 
|
| 276 | 
         
             
                                )
         
     | 
| 277 | 
         
             
                            )
         
     | 
| 278 | 
         | 
| 279 | 
         
            +
                            
         
     | 
| 280 | 
         
            +
                        # with gr.TabItem("Sort_by_rank", elem_id="math_sort_by_rank_subtab", id=4, elem_classes="subtab"): 
         
     | 
| 281 | 
         
            +
                        #     leaderboard = overall_leaderboard(
         
     | 
| 282 | 
         
            +
                        #         get_model_leaderboard_df(
         
     | 
| 283 | 
         
            +
                        #             model_result_path,
         
     | 
| 284 | 
         
            +
                        #             benchmark_cols=[
         
     | 
| 285 | 
         
            +
                        #                 AutoEvalColumn.model.name, 
         
     | 
| 286 | 
         
            +
                        #                 AutoEvalColumn.rank_math_algebra.name,
         
     | 
| 287 | 
         
            +
                        #                 AutoEvalColumn.rank_math_geometry.name,
         
     | 
| 288 | 
         
            +
                        #                 AutoEvalColumn.rank_math_probability.name,
         
     | 
| 289 | 
         
            +
                        #                 ],
         
     | 
| 290 | 
         
            +
                        #             rank_col=[],
         
     | 
| 291 | 
         
            +
                        #         )
         
     | 
| 292 | 
         
            +
                        #     )        
         
     | 
| 293 | 
         
            +
                            
         
     | 
| 294 | 
         
             
                    with gr.TabItem("🧠 Reasoning", elem_id="reasonong-tab-table", id=3):
         
     | 
| 295 | 
         
             
                        DESCRIPTION_TEXT = """
         
     | 
| 296 | 
         
             
                        Reasoning is a broad domain for evaluating LLMs, but traditional tasks like commonsense reasoning have become less effective in differentiating modern LLMs. 
         
     | 
| 
         | 
|
| 322 | 
         
             
                                    model_result_path,
         
     | 
| 323 | 
         
             
                                    benchmark_cols=[
         
     | 
| 324 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 325 | 
         
            +
                                        AutoEvalColumn.license.name,
         
     | 
| 326 | 
         
            +
                                        AutoEvalColumn.organization.name,
         
     | 
| 327 | 
         
            +
                                        AutoEvalColumn.knowledge_cutoff.name,
         
     | 
| 328 | 
         
            +
             
     | 
| 329 | 
         
            +
                                        AutoEvalColumn.score_reason_logical.name,
         
     | 
| 330 | 
         
            +
                                        AutoEvalColumn.score_reason_social.name,
         
     | 
| 331 | 
         
            +
                                        # AutoEvalColumn.rank_reason_logical.name,
         
     | 
| 332 | 
         
            +
                                        # AutoEvalColumn.rank_reason_social.name,
         
     | 
| 333 | 
         
             
                                        ],
         
     | 
| 334 | 
         
            +
                                    rank_col=['sort_by_score'],
         
     | 
| 335 | 
         
             
                                )
         
     | 
| 336 | 
         
             
                            )
         
     | 
| 337 | 
         | 
| 
         | 
|
| 369 | 
         
             
                                )
         
     | 
| 370 | 
         
             
                            )
         
     | 
| 371 | 
         | 
| 372 | 
         
            +
                        # with gr.TabItem("Sort_by_rank", elem_id="reasoning_sort_by_rank_subtab", id=3, elem_classes="subtab"): 
         
     | 
| 373 | 
         
            +
                        #     leaderboard = overall_leaderboard(
         
     | 
| 374 | 
         
            +
                        #         get_model_leaderboard_df(
         
     | 
| 375 | 
         
            +
                        #             model_result_path,
         
     | 
| 376 | 
         
            +
                        #             benchmark_cols=[
         
     | 
| 377 | 
         
            +
                        #                 AutoEvalColumn.model.name, 
         
     | 
| 378 | 
         
            +
                        #                 AutoEvalColumn.rank_reason_logical.name,
         
     | 
| 379 | 
         
            +
                        #                 AutoEvalColumn.rank_reason_social.name,
         
     | 
| 380 | 
         
            +
                        #                 ],
         
     | 
| 381 | 
         
            +
                        #             rank_col=[],
         
     | 
| 382 | 
         
            +
                        #         )
         
     | 
| 383 | 
         
            +
                        #     )
         
     | 
| 384 | 
         
            +
                            
         
     | 
| 385 | 
         
             
                    with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
         
     | 
| 386 | 
         
             
                        CURRENT_TEXT = """
         
     | 
| 387 | 
         
             
                        Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
         
     | 
| 
         | 
|
| 403 | 
         
             
                                    model_result_path,
         
     | 
| 404 | 
         
             
                                    benchmark_cols=[
         
     | 
| 405 | 
         
             
                                        AutoEvalColumn.model.name, 
         
     | 
| 406 | 
         
            +
                                        AutoEvalColumn.license.name,
         
     | 
| 407 | 
         
            +
                                        AutoEvalColumn.organization.name,
         
     | 
| 408 | 
         
            +
                                        AutoEvalColumn.knowledge_cutoff.name,
         
     | 
| 409 | 
         
            +
             
     | 
| 410 | 
         
            +
                                        AutoEvalColumn.score_chemistry.name,
         
     | 
| 411 | 
         
            +
                                        # AutoEvalColumn.rank_chemistry.name,
         
     | 
| 412 | 
         
             
                                        ],
         
     | 
| 413 | 
         
            +
                                    rank_col=['sort_by_score'],
         
     | 
| 414 | 
         
             
                                )
         
     | 
| 415 | 
         
             
                            )
         
     | 
| 416 | 
         | 
| 
         | 
|
| 491 | 
         | 
| 492 | 
         | 
| 493 | 
         | 
| 494 | 
         
            +
                    with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
         
     | 
| 495 | 
         
            +
                        DESCRIPTION_TEXT = """
         
     | 
| 496 | 
         
            +
                        Overall dimension measures the comprehensive performance of LLMs across diverse tasks. 
         
     | 
| 497 | 
         
            +
                        We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685), 
         
     | 
| 498 | 
         
            +
                        covering a wide range of domains, including writing, roleplay, extraction, reasoning, math, coding, knowledge I (STEM), and knowledge II (humanities/social science).
         
     | 
| 499 | 
         
            +
                        """
         
     | 
| 500 | 
         
            +
                        gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
         
     | 
| 501 | 
         
            +
                        
         
     | 
| 502 | 
         
            +
                        with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"): 
         
     | 
| 503 | 
         
            +
                            leaderboard = overall_leaderboard(
         
     | 
| 504 | 
         
            +
                                get_model_leaderboard_df(
         
     | 
| 505 | 
         
            +
                                    model_result_path,
         
     | 
| 506 | 
         
            +
                                    benchmark_cols=[
         
     | 
| 507 | 
         
            +
                                        AutoEvalColumn.rank_overall.name,
         
     | 
| 508 | 
         
            +
                                        AutoEvalColumn.model.name, 
         
     | 
| 509 | 
         
            +
                                        AutoEvalColumn.score_overall.name,
         
     | 
| 510 | 
         
            +
                                        AutoEvalColumn.sd_overall.name,
         
     | 
| 511 | 
         
            +
                                        AutoEvalColumn.license.name,
         
     | 
| 512 | 
         
            +
                                        AutoEvalColumn.organization.name,
         
     | 
| 513 | 
         
            +
                                        AutoEvalColumn.knowledge_cutoff.name,
         
     | 
| 514 | 
         
            +
                                        ],
         
     | 
| 515 | 
         
            +
                                    rank_col=[AutoEvalColumn.rank_overall.name],
         
     | 
| 516 | 
         
            +
                                ))
         
     | 
| 517 | 
         
            +
             
     | 
| 518 | 
         | 
| 519 | 
         | 
| 520 | 
         
             
                    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=6):
         
     | 
    	
        src/populate.py
    CHANGED
    
    | 
         @@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis 
     | 
|
| 24 | 
         | 
| 25 | 
         
             
                # if there is one col in rank_col, this is an isolated dimension to rank by
         
     | 
| 26 | 
         
             
                # sort by that selected column and remove NaN values
         
     | 
| 27 | 
         
            -
                if rank_col: 
         
     | 
| 28 | 
         
             
                    # df = df.dropna(subset=benchmark_cols)
         
     | 
| 29 | 
         
             
                    df = df.dropna(subset=rank_col)
         
     | 
| 30 | 
         
             
                    df = df.fillna(0.00)
         
     | 
| 
         @@ -32,8 +32,29 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis 
     | 
|
| 32 | 
         
             
                    df = df.sort_values(by=[rank_col[0]], ascending=True)
         
     | 
| 33 | 
         
             
                    # print(rank_col, benchmark_cols)
         
     | 
| 34 | 
         
             
                    # print(df.head())
         
     | 
| 35 | 
         
            -
             
     | 
| 36 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 37 | 
         
             
                    avg_rank = df.iloc[:, 1:].mean(axis=1)
         
     | 
| 38 | 
         
             
                    df["Average Rank"] = avg_rank.round(decimals=4)
         
     | 
| 39 | 
         
             
                    df = df.sort_values(by=["Average Rank"], ascending=True)        
         
     | 
| 
         @@ -46,10 +67,6 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis 
     | 
|
| 46 | 
         
             
                    df.insert(0, 'Rank', rank)
         
     | 
| 47 | 
         | 
| 48 | 
         | 
| 49 | 
         
            -
                for col in benchmark_cols:
         
     | 
| 50 | 
         
            -
                    if 'Std dev' in col or 'Score' in col:
         
     | 
| 51 | 
         
            -
                        df[col] = (df[col]).map('{:.2f}'.format)
         
     | 
| 52 | 
         
            -
                        df[col] = df[col].round(decimals=2)
         
     | 
| 53 | 
         | 
| 54 | 
         | 
| 55 | 
         
             
                # for col in benchmark_cols:
         
     | 
| 
         | 
|
| 24 | 
         | 
| 25 | 
         
             
                # if there is one col in rank_col, this is an isolated dimension to rank by
         
     | 
| 26 | 
         
             
                # sort by that selected column and remove NaN values
         
     | 
| 27 | 
         
            +
                if rank_col and rank_col[0] != "sort_by_score": 
         
     | 
| 28 | 
         
             
                    # df = df.dropna(subset=benchmark_cols)
         
     | 
| 29 | 
         
             
                    df = df.dropna(subset=rank_col)
         
     | 
| 30 | 
         
             
                    df = df.fillna(0.00)
         
     | 
| 
         | 
|
| 32 | 
         
             
                    df = df.sort_values(by=[rank_col[0]], ascending=True)
         
     | 
| 33 | 
         
             
                    # print(rank_col, benchmark_cols)
         
     | 
| 34 | 
         
             
                    # print(df.head())
         
     | 
| 35 | 
         
            +
                    
         
     | 
| 36 | 
         
            +
                    for col in benchmark_cols:
         
     | 
| 37 | 
         
            +
                        if 'Std dev' in col or 'Score' in col:
         
     | 
| 38 | 
         
            +
                            df[col] = (df[col]).map('{:.2f}'.format)
         
     | 
| 39 | 
         
            +
                            df[col] = df[col].round(decimals=2)
         
     | 
| 40 | 
         
            +
                            
         
     | 
| 41 | 
         
            +
                elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
         
     | 
| 42 | 
         
            +
                    offset_idx = 4
         
     | 
| 43 | 
         
            +
                    avg_scores = df.iloc[:, offset_idx:].mean(axis=1)
         
     | 
| 44 | 
         
            +
                    df.insert(1, "Average Score", avg_scores) 
         
     | 
| 45 | 
         
            +
                    
         
     | 
| 46 | 
         
            +
                    df["Average Score"] = avg_scores.round(decimals=4)
         
     | 
| 47 | 
         
            +
                    df = df.sort_values(by=["Average Score"], ascending=False)
         
     | 
| 48 | 
         
            +
                    df["Average Score"] = df["Average Score"].map('{:.2f}'.format)
         
     | 
| 49 | 
         
            +
                    
         
     | 
| 50 | 
         
            +
                    df = df.drop(columns=benchmark_cols[offset_idx:])
         
     | 
| 51 | 
         
            +
                    # print(benchmark_cols)
         
     | 
| 52 | 
         
            +
                    # print(df.head())
         
     | 
| 53 | 
         
            +
                    # insert a rank column
         
     | 
| 54 | 
         
            +
                    rank = np.arange(1, len(df)+1)
         
     | 
| 55 | 
         
            +
                    df.insert(0, 'Rank', rank) 
         
     | 
| 56 | 
         
            +
                
         
     | 
| 57 | 
         
            +
                else:  # when rank_col is empty, sort by the average of all benchmark columns except the first one
         
     | 
| 58 | 
         
             
                    avg_rank = df.iloc[:, 1:].mean(axis=1)
         
     | 
| 59 | 
         
             
                    df["Average Rank"] = avg_rank.round(decimals=4)
         
     | 
| 60 | 
         
             
                    df = df.sort_values(by=["Average Rank"], ascending=True)        
         
     | 
| 
         | 
|
| 67 | 
         
             
                    df.insert(0, 'Rank', rank)
         
     | 
| 68 | 
         | 
| 69 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 70 | 
         | 
| 71 | 
         | 
| 72 | 
         
             
                # for col in benchmark_cols:
         
     |