Spaces:

Metric-AI
/

ArmBench-LLM

Running

App Files Files Community

Bagratuni committed on Mar 12

Commit

c633720

1 Parent(s): 6b4ef20

commit

Browse files

Files changed (5) hide show

app.py +13 -4
data_handler.py → data/data_handler.py +1 -5
model_handler.py → data/model_handler.py +16 -8
model_results.json → data/model_results.json +64 -0
data/required_categories.py +24 -0

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-from model_handler import ModelHandler
-from data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
 global_unified_exam_df = None
 global_mmlu_df = None
@@ -96,8 +96,17 @@ def main():
                             ]
                         }
                         ```
-                    3. **Submit your model**:
-                        - Add the `arm_bench` tag and the `result.json` file to your model card.
                         - Click on the "Refresh Data" button in this app, and you will see your model's results.
                     """
                 )

 import gradio as gr
 import pandas as pd
 import plotly.express as px
+from data.model_handler import ModelHandler
+from data.data_handler import unified_exam_result_table, mmlu_result_table, unified_exam_chart, mmlu_chart
 global_unified_exam_df = None
 global_mmlu_df = None
                             ]
                         }
                         ```
+                    3. **Important Notes**:
+                        - For **`mmlu_results`**:
+                            - The following categories must be included in the `mmlu_results` for the model to be considered valid:
+                                - "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering", "Health", "History", "Law", "Math", "Other", "Philosophy", "Physics", "Psychology", "Average"
+                            - If any of these categories are missing, the model will not be added to the evaluation.
+                        - For **`unified_exam_results`**:
+                            - The following categories must be included in the `unified_exam_results` for the model to be considered valid:
+                                - "Average", "Armenian language and literature", "Armenian history", "Mathematics"
+                            - If any of these categories are missing, the model will not be added to the evaluation.
+                    4. **Submit your model**:
+                        - Add the `Arm-LLM-Benchmark` tag and the `result.json` file to your model card.
                         - Click on the "Refresh Data" button in this app, and you will see your model's results.
                     """
                 )

data_handler.py → data/data_handler.py RENAMED Viewed

@@ -1,12 +1,10 @@
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-from model_handler import ModelHandler
 def unified_exam_result_table(unified_exam_df):
     df = unified_exam_df.copy()
-    numeric_columns = df.select_dtypes(include=["number"])
-    df["Average"] = numeric_columns.mean(axis=1)
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()
@@ -18,8 +16,6 @@ def unified_exam_result_table(unified_exam_df):
 def mmlu_result_table(mmlu_df):
     df = mmlu_df.copy()
-    numeric_columns = df.select_dtypes(include=["number"])
-    df["Average"] = numeric_columns.mean(axis=1)
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()

 import gradio as gr
 import pandas as pd
 import plotly.express as px
+from data.model_handler import ModelHandler
 def unified_exam_result_table(unified_exam_df):
     df = unified_exam_df.copy()
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()
 def mmlu_result_table(mmlu_df):
     df = mmlu_df.copy()
     df = df.sort_values(by='Average', ascending=False).reset_index(drop=True)
     df.insert(0, 'Rank', range(1, len(df) + 1))
     cols = df.columns.tolist()

model_handler.py → data/model_handler.py RENAMED Viewed

@@ -5,8 +5,10 @@ from typing import Any, Dict
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
 class ModelHandler:
-    def __init__(self, model_infos_path="model_results.json"):
         self.api = HfApi()
         self.model_infos_path = model_infos_path
         self.model_infos = self._load_model_infos()
@@ -23,7 +25,7 @@ class ModelHandler:
             json.dump(self.model_infos, f, indent=4)
     def get_arm_bench_data(self):
-        models = self.api.list_models(filter="arm_llm")
         model_names = {model["model_name"] for model in self.model_infos}
         repositories = [model.modelId for model in models]
@@ -63,16 +65,22 @@ class ModelHandler:
             if mmlu_results:
                 mmlu_row = {"Model": model_name}
-                for result in mmlu_results:
-                    mmlu_row[result["category"]] = result["score"]
-                mmlu_data.append(mmlu_row)
             if unified_exam_results:
                 unified_exam_row = {"Model": model_name}
-                for result in unified_exam_results:
-                    unified_exam_row[result["category"]] = result["score"]
-                unified_exam_data.append(unified_exam_row)
         mmlu_df = pd.DataFrame(mmlu_data)
         unified_exam_df = pd.DataFrame(unified_exam_data)

 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
+from data.required_categories import required_mmlu_categories, required_unified_exam_categories
 class ModelHandler:
+    def __init__(self, model_infos_path="data\model_results.json"):
         self.api = HfApi()
         self.model_infos_path = model_infos_path
         self.model_infos = self._load_model_infos()
             json.dump(self.model_infos, f, indent=4)
     def get_arm_bench_data(self):
+        models = self.api.list_models(filter="Arm-LLM-Benchmark")
         model_names = {model["model_name"] for model in self.model_infos}
         repositories = [model.modelId for model in models]
             if mmlu_results:
                 mmlu_row = {"Model": model_name}
+                mmlu_categories = {result["category"] for result in mmlu_results}
+                if all(category in mmlu_categories for category in required_mmlu_categories):
+                    for result in mmlu_results:
+                        mmlu_row[result["category"]] = result["score"]
+                    mmlu_data.append(mmlu_row)
             if unified_exam_results:
                 unified_exam_row = {"Model": model_name}
+                unified_exam_categories = {result["category"] for result in unified_exam_results}
+                if all(category in unified_exam_categories for category in required_unified_exam_categories):
+                    for result in unified_exam_results:
+                        unified_exam_row[result["category"]] = result["score"]
+                    unified_exam_data.append(unified_exam_row)
         mmlu_df = pd.DataFrame(mmlu_data)
         unified_exam_df = pd.DataFrame(unified_exam_data)

model_results.json → data/model_results.json RENAMED Viewed

@@ -4,6 +4,10 @@
         "results": {
             "mmlu_results": [],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 10.5
@@ -23,6 +27,10 @@
         "model_name": "claude-3-5-sonnet-20241022",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.8667
@@ -81,6 +89,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 10.0
@@ -100,6 +112,10 @@
         "model_name": "gemini-2.0-flash",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.85
@@ -158,6 +174,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 5.5
@@ -177,6 +197,10 @@
         "model_name": "gpt-4o",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.8667
@@ -235,6 +259,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 6.75
@@ -255,6 +283,10 @@
         "results": {
             "mmlu_results": [],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 7.25
@@ -274,6 +306,10 @@
         "model_name": "gemini-1.5-flash",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.75
@@ -332,6 +368,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 4.75
@@ -351,6 +391,10 @@
         "model_name": "DeepSeek-V3",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.8167
@@ -409,6 +453,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 5.25
@@ -428,6 +476,10 @@
         "model_name": "Meta-Llama-3.3-70B-Instruct",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.7333
@@ -486,6 +538,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 4.5
@@ -505,6 +561,10 @@
         "model_name": "claude-3-5-haiku-20241022",
         "results": {
             "mmlu_results": [
                 {
                     "category": "Biology",
                     "score": 0.75
@@ -563,6 +623,10 @@
                 }
             ],
             "unified_exam_results": [
                 {
                     "category": "Armenian language and literature",
                     "score": 5.0

         "results": {
             "mmlu_results": [],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 11.0833
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 10.5
         "model_name": "claude-3-5-sonnet-20241022",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.6958
+                },
                 {
                     "category": "Biology",
                     "score": 0.8667
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 10.6667
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 10.0
         "model_name": "gemini-2.0-flash",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.7247
+                },
                 {
                     "category": "Biology",
                     "score": 0.85
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 9.8333
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 5.5
         "model_name": "gpt-4o",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.6758
+                },
                 {
                     "category": "Biology",
                     "score": 0.8667
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 8.9167
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 6.75
         "results": {
             "mmlu_results": [],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 8.6667
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 7.25
         "model_name": "gemini-1.5-flash",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.5592
+                },
                 {
                     "category": "Biology",
                     "score": 0.75
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 7.8333
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 4.75
         "model_name": "DeepSeek-V3",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.6633
+                },
                 {
                     "category": "Biology",
                     "score": 0.8167
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 7.5
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 5.25
         "model_name": "Meta-Llama-3.3-70B-Instruct",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.5139
+                },
                 {
                     "category": "Biology",
                     "score": 0.7333
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 7.0833
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 4.5
         "model_name": "claude-3-5-haiku-20241022",
         "results": {
             "mmlu_results": [
+                {
+                    "category": "Average",
+                    "score": 0.5198
+                },
                 {
                     "category": "Biology",
                     "score": 0.75
                 }
             ],
             "unified_exam_results": [
+                {
+                    "category": "Average",
+                    "score": 6.5
+                },
                 {
                     "category": "Armenian language and literature",
                     "score": 5.0

data/required_categories.py ADDED Viewed

	@@ -0,0 +1,24 @@

+required_unified_exam_categories = [
+    "Average",
+    "Armenian language and literature",
+    "Armenian history",
+    "Mathematics"
+]
+required_mmlu_categories = [
+    "Biology",
+    "Business",
+    "Chemistry",
+    "Computer Science",
+    "Economics",
+    "Engineering",
+    "Health",
+    "History",
+    "Law",
+    "Math",
+    "Other",
+    "Philosophy",
+    "Physics",
+    "Psychology",
+    "Average"
+]