update

Files changed:
- app.py (+80 -30)
- src/display/utils.py (+1 -0)
- src/leaderboard/read_evals.py (+9 -0)
- src/results/models_2024-11-08-08:36:00.464224.json (+0 -0)

app.py
CHANGED
@@ -100,7 +100,8 @@ def init_leaderboard(dataframe):
     )
 
 # model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
-model_result_path = "./src/results/models_2024-10-24-08:08:59.127307.json"
+# model_result_path = "./src/results/models_2024-10-24-08:08:59.127307.json"
+model_result_path = "./src/results/models_2024-11-08-08:36:00.464224.json"
 # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
@@ -192,7 +193,8 @@ with demo:
 
             TEXT = (
                 f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
-                '<b>Total #models: 57 (Last updated: 2024-10-21)</b>'
+                # '<b>Total #models: 57 (Last updated: 2024-10-21)</b>'
+                '<b>Total #models: 62 (Last updated: 2024-11-08)</b>'
                 '</p>'
                 f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
                 'This page prvovides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks or scores.'
@@ -218,6 +220,9 @@ with demo:
                         AutoEvalColumn.rank_reason_logical.name,
                         AutoEvalColumn.rank_reason_social.name,
                         AutoEvalColumn.rank_chemistry.name,
+                        AutoEvalColumn.rank_biology.name,
+                        AutoEvalColumn.rank_physics.name,
+
                        AutoEvalColumn.rank_overall.name,
                         # AutoEvalColumn.rank_cpp.name,
                         ],
@@ -242,6 +247,9 @@ with demo:
                         AutoEvalColumn.score_reason_logical.name,
                         AutoEvalColumn.score_reason_social.name,
                         AutoEvalColumn.score_chemistry.name,
+                        AutoEvalColumn.score_biology.name,
+                        AutoEvalColumn.score_physics.name,
+
                         AutoEvalColumn.score_overall.name,
                         # AutoEvalColumn.score_cpp.name,
 
@@ -278,11 +286,19 @@ with demo:
 
             TEXT = (
                 f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
-                '…'
+                'Algebra, Geometry, and Probability are the current three main math domains in the leaderboard. '
+                'To mitigate the potential impact of data contimination, we have carefully selected the datasets from various sources. '
+                'We prioritize <b>recent math datasets</b> and focus on <b>college and beyond level</b> math questions. '
+                'The current datasets include</b>'
+                '<a href="https://arxiv.org/abs/2103.03874">MATH</a>, '
+                '<a href="https://github.com/openai/prm800k/tree/main/prm800k/math_splits">MATH-500</a>, '
+                '<a href="https://omni-math.github.io/">Omni</a>, '
+                '<a href="https://arxiv.org/abs/1905.13319">MathQA</a>, '
+                '<a href="https://arxiv.org/abs/2405.12209">MathBench</a>, '
+                '<a href="https://arxiv.org/abs/2307.10635">SciBench</a>, and more! '
                 '</p>'
                 f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
-                '…'
-                '(Missing values are due to the slow or problemtic model responses to be fixed soom.)'
+                'We plan to include more math domains, such as calculus, number theory, and more in the future. '
                 '</p>'
                 # '<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
                 # 'We present '
@@ -534,18 +550,19 @@ with demo:
                         get_model_leaderboard_df(
                             model_result_path,
                             benchmark_cols=[
-                                AutoEvalColumn.model.name, 
-
-                                AutoEvalColumn.…
-                                AutoEvalColumn.…
-                                AutoEvalColumn.…
-
-                                AutoEvalColumn.…
+                                AutoEvalColumn.model.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
+                                AutoEvalColumn.rank_chemistry.name,
+                                AutoEvalColumn.rank_biology.name,
+                                AutoEvalColumn.rank_physics.name,
                                 ],
-                            rank_col=['sort_by_rank', …],
+                            rank_col=['sort_by_rank', 1, 4, 'Science'],
                         )
                     )
-
+
+
                     with gr.TabItem("⭐ Sort by Score", elem_id="science_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
                         leaderboard = overall_leaderboard(
                             get_model_leaderboard_df(
@@ -553,14 +570,15 @@ with demo:
                             benchmark_cols=[
                                 AutoEvalColumn.model.name, 
 
-                                AutoEvalColumn.license.name,
-                                AutoEvalColumn.organization.name,
-                                AutoEvalColumn.knowledge_cutoff.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
 
-                                AutoEvalColumn.score_chemistry.name,
-                                …
+                                AutoEvalColumn.score_chemistry.name,
+                                AutoEvalColumn.score_biology.name,
+                                AutoEvalColumn.score_physics.name,
                                 ],
-                            rank_col=['sort_by_score', 4, …],
+                            rank_col=['sort_by_score', 1, 4, 'Science'], # two numbers are index to select the columns to average and sort
                         )
                     )
 
@@ -583,18 +601,50 @@ with demo:
                                 )
                             )
 
-                        with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=2, elem_classes="subtab"):
-                            CURRENT_TEXT = """
-                            # Coming soon!
-                            """
-                            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
 
                         with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=3, elem_classes="subtab"):
-                            CURRENT_TEXT = """
-                            # Coming soon!
-                            """
-                            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+                            # CURRENT_TEXT = """
+                            # # Coming soon!
+                            # """
+                            # gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+                            leaderboard = overall_leaderboard(
+                                get_model_leaderboard_df(
+                                    model_result_path,
+                                    benchmark_cols=[
+                                        AutoEvalColumn.rank_biology.name,
+                                        AutoEvalColumn.model.name,
+                                        AutoEvalColumn.score_biology.name,
+                                        # AutoEvalColumn.sd_reason_social.name,
+                                        AutoEvalColumn.license.name,
+                                        AutoEvalColumn.organization.name,
+                                        AutoEvalColumn.knowledge_cutoff.name,
+                                        ],
+                                    rank_col=[AutoEvalColumn.rank_biology.name],
+                                )
+                            )
+
+
+                        with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=2, elem_classes="subtab"):
+                            # CURRENT_TEXT = """
+                            # # Coming soon!
+                            # """
+                            # gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
+                            leaderboard = overall_leaderboard(
+                                get_model_leaderboard_df(
+                                    model_result_path,
+                                    benchmark_cols=[
+                                        AutoEvalColumn.rank_physics.name,
+                                        AutoEvalColumn.model.name,
+                                        AutoEvalColumn.score_physics.name,
+                                        # AutoEvalColumn.sd_reason_social.name,
+                                        AutoEvalColumn.license.name,
+                                        AutoEvalColumn.organization.name,
+                                        AutoEvalColumn.knowledge_cutoff.name,
+                                        ],
+                                    rank_col=[AutoEvalColumn.rank_physics.name],
+                                )
+                            )
+
 
 
                     with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
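For context on the new Science overview tabs: the inline comment added in this commit says the two numbers in the `rank_col` spec "are index to select the columns to average and sort". `get_model_leaderboard_df` itself is not part of this diff, so the following is only a minimal sketch of how a spec like `['sort_by_rank', 1, 4, 'Science']` could be interpreted; the helper name `apply_rank_col` and the exact slicing and sorting behaviour are assumptions, not code from the repository.

```python
import pandas as pd

def apply_rank_col(df: pd.DataFrame, rank_col: list) -> pd.DataFrame:
    # Sketch only: reads ['sort_by_rank', 1, 4, 'Science'] as "average
    # df.columns[1:4] into a new aggregate column labelled 'Science', then sort".
    mode, start, stop, label = rank_col
    subject_cols = df.columns[start:stop]           # e.g. the three per-subject rank columns
    agg = "Rank" if mode == "sort_by_rank" else "Score"
    df[f"{agg} ({label})"] = df[subject_cols].mean(axis=1)
    ascending = mode == "sort_by_rank"              # lower rank is better, higher score is better
    return df.sort_values(f"{agg} ({label})", ascending=ascending).reset_index(drop=True)
```

The single-subject Biology and Physics tabs instead pass `rank_col=[AutoEvalColumn.rank_biology.name]` (or the Physics equivalent), which presumably just sorts the table by that existing rank column rather than building an aggregate.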
    	
src/display/utils.py
CHANGED

@@ -101,6 +101,7 @@ auto_eval_column_dict.append(["sd_biology", ColumnContent, field(default_factory
 auto_eval_column_dict.append(["rank_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Biology)", "number", True))])
 
 
+
 auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
 auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
 auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
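These utils.py entries follow the common leaderboard-template pattern in which each `[attribute, ColumnContent, field(...)]` triple is later folded into a frozen dataclass, so that e.g. `AutoEvalColumn.rank_biology.name` resolves to the display label "Rank (Biology)" referenced in app.py above. A minimal, self-contained sketch of that assumed wiring (the `make_dataclass` step is not shown in this diff):

```python
from dataclasses import dataclass, field, make_dataclass

@dataclass
class ColumnContent:
    name: str                       # label shown as the column header
    type: str                       # e.g. "number" or "str"
    displayed_by_default: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["rank_biology", ColumnContent,
     field(default_factory=lambda: ColumnContent("Rank (Biology)", "number", True))]
)

# Assumed template step: fold the list into a frozen dataclass and instantiate it,
# giving attribute-style access to every column definition.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)()

print(AutoEvalColumn.rank_biology.name)   # "Rank (Biology)"
```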
    	
src/leaderboard/read_evals.py
CHANGED

@@ -188,6 +188,15 @@ class ModelResult:
             AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
             AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
             AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
+
+            AutoEvalColumn.score_biology.name: self.results.get("Biology").get("Average Score", None) if self.results.get("Biology") else None,
+            AutoEvalColumn.sd_biology.name: self.results.get("Biology").get("Standard Deviation", None) if self.results.get("Biology") else None,
+            AutoEvalColumn.rank_biology.name: self.results.get("Biology").get("Rank", None) if self.results.get("Biology") else None,
+
+            AutoEvalColumn.score_physics.name: self.results.get("Physics").get("Average Score", None) if self.results.get("Physics") else None,
+            AutoEvalColumn.sd_physics.name: self.results.get("Physics").get("Standard Deviation", None) if self.results.get("Physics") else None,
+            AutoEvalColumn.rank_physics.name: self.results.get("Physics").get("Rank", None) if self.results.get("Physics") else None,
+
 
             AutoEvalColumn.score_cpp.name: self.results.get("CPP").get("Average Score", None) if self.results.get("CPP") else None,
             AutoEvalColumn.sd_cpp.name: self.results.get("CPP").get("Standard Deviation", None) if self.results.get("CPP") else None,
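The new read_evals.py entries assume each per-model record in the results JSON carries a subject block ("Biology", "Physics", ...) with "Average Score", "Standard Deviation", and "Rank" keys, falling back to None when a subject is missing. A small sketch of that assumed data shape and of the repeated guard pattern; `subject_stats` is a hypothetical helper, not part of the commit, and the numbers are placeholders:

```python
# Assumed shape of one model's entry in the results JSON (values are placeholders):
results = {
    "Chemistry": {"Average Score": 60.0, "Standard Deviation": 2.0, "Rank": 10},
    "Biology":   {"Average Score": 70.0, "Standard Deviation": 1.5, "Rank": 8},
    "Physics":   {"Average Score": 65.0, "Standard Deviation": 2.5, "Rank": 9},
    # a missing subject (e.g. "CPP") simply yields None for score, sd, and rank
}

def subject_stats(results: dict, subject: str):
    # Equivalent to the repeated pattern in the diff:
    #   results.get(subject).get(key, None) if results.get(subject) else None
    block = results.get(subject) or {}
    return (block.get("Average Score"),
            block.get("Standard Deviation"),
            block.get("Rank"))

print(subject_stats(results, "Biology"))   # (70.0, 1.5, 8)
print(subject_stats(results, "CPP"))       # (None, None, None)
```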
    	
src/results/models_2024-11-08-08:36:00.464224.json
ADDED

The diff for this file is too large to render (see the raw diff).