Update space

- app.py +43 -12
- src/display/utils.py +4 -0
- src/leaderboard/read_evals.py +4 -0
- src/populate.py +2 -2

app.py CHANGED

@@ -104,7 +104,8 @@ def init_leaderboard(dataframe):
 # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
 # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
 # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
-model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+# model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)

@@ -131,17 +132,33 @@ with demo:
     gr.HTML(TITLE)
     gr.HTML(SUB_TITLE)
     gr.HTML(EXTERNAL_LINKS)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    # gr.HTML('<p style="font-size:15px;">This is a larger text using HTML in Markdown.</p>')
+    INTRODUCTION_TEXT_FONT_SIZE = 16
+    INTRODUCTION_TEXT = (
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        '<strong>Decentralized Arena</strong> automates, scales, and accelerates "<a href="https://lmarena.ai/">Chatbot Arena</a>" '
+        'for large language model (LLM) evaluation across diverse, fine-grained dimensions, '
+        'such as mathematics (algebra, geometry, probability), logical reasoning, social reasoning, biology, chemistry, and more. '
+        'The evaluation is decentralized and democratic, with all participating LLMs assessing each other to ensure unbiased and fair results. '
+        'With a 95% correlation to Chatbot Arena\'s overall rankings, the system is fully transparent and reproducible.'
+        '</p>'
+        f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+        'We actively invite <b>model developers</b> to participate and expedite their benchmarking efforts '
+        'and encourage <b>data stakeholders</b> to freely define and evaluate dimensions of interest for their own objectives.'
+        '</p>'
+    )
+    gr.HTML(INTRODUCTION_TEXT)

     with gr.Tabs(elem_classes="tab-buttons") as tabs:

         with gr.TabItem("π Overview", elem_id="llm-benchmark-tab-table", id=0):

             DESCRIPTION_TEXT = """
-            Total #models: 53 (Last updated: 2024-10-
+            Total #models: 53 (Last updated: 2024-10-09)

-            This page
-            (
+            This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
+            (Missing values are due to slow or problematic model responses, which will be fixed soon.)
             """
             gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

@@ -158,6 +175,7 @@ with demo:
                         AutoEvalColumn.rank_reason_logical.name,
                         AutoEvalColumn.rank_reason_social.name,
                         AutoEvalColumn.rank_chemistry.name,
+                        AutoEvalColumn.rank_cpp.name,
                         ],
                     rank_col=[],
                 )

@@ -374,19 +392,31 @@ with demo:
                 """
                 gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-                with gr.TabItem("
+                with gr.TabItem("β C++", elem_id="cpp_subtab", id=0, elem_classes="subtab"):
+
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.rank_cpp.name,
+                                AutoEvalColumn.model.name,
+                                AutoEvalColumn.score_cpp.name,
+                                # AutoEvalColumn.sd_cpp.name,
+                                AutoEvalColumn.license.name,
+                                AutoEvalColumn.organization.name,
+                                AutoEvalColumn.knowledge_cutoff.name,
+                            ],
+                            rank_col=[AutoEvalColumn.rank_cpp.name],
+                        )
+                    )

-                with gr.TabItem("
+                with gr.TabItem("π Python", elem_id="python_subtab", id=1, elem_classes="subtab"):
                     CURRENT_TEXT = """
                     # Coming soon!
                     """
                     gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-                with gr.TabItem("
+                with gr.TabItem("β Java", elem_id="java_subtab", id=2, elem_classes="subtab"):
                     CURRENT_TEXT = """
                     # Coming soon!
                     """

@@ -395,6 +425,7 @@ with demo:

+
         with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=6):
             ABOUT_TEXT = """
             # About Us
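For context on the pattern this commit repeats: the new C++ subtab is the same construction as the Overview table, differing only in the rank/score column names. A minimal sketch of how the per-dimension subtabs could share one helper, assuming the names used in the diff (`overall_leaderboard`, `get_model_leaderboard_df`, `AutoEvalColumn`, `model_result_path`); the `dimension_tab` helper itself is hypothetical:

    import gradio as gr

    def dimension_tab(label, elem_id, tab_id, rank_attr, score_attr):
        # Hypothetical helper mirroring the C++ block added in this commit;
        # all other names come from the diff itself.
        with gr.TabItem(label, elem_id=elem_id, id=tab_id, elem_classes="subtab"):
            return overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        getattr(AutoEvalColumn, rank_attr).name,   # e.g. "Rank (C++)"
                        AutoEvalColumn.model.name,
                        getattr(AutoEvalColumn, score_attr).name,  # e.g. "Score (C++)"
                        AutoEvalColumn.license.name,
                        AutoEvalColumn.organization.name,
                        AutoEvalColumn.knowledge_cutoff.name,
                    ],
                    rank_col=[getattr(AutoEvalColumn, rank_attr).name],
                )
            )

Under that assumption, the C++ block above reduces to `dimension_tab("β C++", "cpp_subtab", 0, "rank_cpp", "score_cpp")`, and the Python and Java placeholders could adopt the same call once their results land.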
    	
src/display/utils.py CHANGED

@@ -89,6 +89,10 @@ auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_fa
 auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
 auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

+auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
+auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
+auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
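These triples follow the leaderboard-template convention of building `AutoEvalColumn` dynamically. A self-contained toy sketch of that pattern, assuming the standard setup (the `make_dataclass` call and the real `ColumnContent` class live elsewhere in the file, not in this diff; the stand-in below only mimics them):

    from dataclasses import dataclass, field, make_dataclass

    @dataclass(frozen=True)
    class ColumnContent:  # stand-in for the real class, matching its usage here
        name: str
        type: str
        displayed_by_default: bool
        never_hidden: bool = False

    auto_eval_column_dict = []
    auto_eval_column_dict.append(["rank_cpp", ColumnContent,
        field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])

    # Compile the accumulated (name, type, field) triples into a frozen
    # dataclass and instantiate it so columns are reachable as attributes.
    AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)()
    print(AutoEvalColumn.rank_cpp.name)  # -> Rank (C++)

This is why app.py can refer to the new columns as `AutoEvalColumn.rank_cpp.name` and `AutoEvalColumn.score_cpp.name`.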
    	
src/leaderboard/read_evals.py CHANGED

@@ -189,6 +189,10 @@ class ModelResult:
             AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
             AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,

+            AutoEvalColumn.score_cpp.name: self.results.get("CPP").get("Average Score", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.sd_cpp.name: self.results.get("CPP").get("Standard Deviation", None) if self.results.get("CPP") else None,
+            AutoEvalColumn.rank_cpp.name: self.results.get("CPP").get("Rank", None) if self.results.get("CPP") else None,
+
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.organization.name: self.org,
             AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
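The chained `.get("CPP")` guards imply a per-dimension entry shape like the sketch below (the values are illustrative, not real results); `dim_metric` is a hypothetical helper that would collapse the repetition:

    # Entry shape implied by the lookups above (illustrative values only):
    example_results = {
        "CPP": {
            "Average Score": 0.87,       # -> Score (C++)
            "Standard Deviation": 0.03,  # -> Std dev (C++)
            "Rank": 5,                   # -> Rank (C++)
        },
    }

    def dim_metric(results, dim, key):
        # Hypothetical helper collapsing the repeated
        # `results.get(dim).get(key, None) if results.get(dim) else None` pattern.
        entry = results.get(dim)
        return entry.get(key) if entry else None

    assert dim_metric(example_results, "CPP", "Rank") == 5
    assert dim_metric(example_results, "Chemistry", "Rank") is None  # missing dims stay None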
    	
src/populate.py CHANGED

@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
     if rank_col:  # if there is one col in rank_col, sort by that column and remove NaN values
         df = df.dropna(subset=benchmark_cols)
         df = df.sort_values(by=[rank_col[0]], ascending=True)
-        # print(rank_col)
+        # print(rank_col, benchmark_cols)
     else:
         # when rank_col is empty, sort by averaging all the benchmarks except the first column
         avg_rank = df.iloc[:, 1:].mean(axis=1)

@@ -43,7 +43,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
         # print(col)
         # if 'Std dev' in col or 'Score' in col:
         if 'Std dev' in col or 'Score' in col:
-            if "Chemistry" in col:
+            if "Chemistry" in col or "C++" in col:
                 df[col] = (df[col]).map('{:.2f}'.format)
             else:
                 df[col] = (df[col]*100).map('{:.2f}'.format)
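A runnable sketch of the widened formatting branch with made-up values: Chemistry and C++ scores are printed on their native scale, while other dimensions are assumed to store fractions that get rescaled to percentages (the scale difference is inferred from the code, not stated in the diff):

    import pandas as pd

    # Made-up values: C++/Chemistry scores are assumed already on display
    # scale, other dimensions are fractions rescaled to percentages.
    df = pd.DataFrame({"Score (C++)": [1.234], "Score (Math)": [0.5678]})
    for col in df.columns:
        if "Std dev" in col or "Score" in col:
            if "Chemistry" in col or "C++" in col:
                df[col] = df[col].map("{:.2f}".format)          # 1.234  -> "1.23"
            else:
                df[col] = (df[col] * 100).map("{:.2f}".format)  # 0.5678 -> "56.78"
    print(df)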