categories rename
- app.py +16 -2
- src/about.py +6 -8
- src/display/utils.py +26 -0
- src/leaderboard/processor.py +1 -1
- src/populate.py +1 -0
- src/submission/submit.py +1 -0
app.py
CHANGED

@@ -32,6 +32,7 @@ from src.display.utils import (
     CATEGORIES,
     TEST_TYPES,
     ModelType,
+    Mode,
     Precision,
     WeightType,
     GuardModelType,
@@ -394,6 +395,7 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
+    mode: str,
     submission_file: tempfile._TemporaryFileWrapper,
     version: str,
     guard_model_type: GuardModelType
@@ -410,6 +412,9 @@ def submit_results(
     if not model_type:
         return styled_error("Please select a model type")

+    if not mode:
+        return styled_error("Please select an inference mode")
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")

@@ -421,6 +426,7 @@ def submit_results(
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
+        "mode": mode,
         "version": version,
         "guard_model_type": guard_model_type
     }
@@ -809,8 +815,8 @@ with demo:
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

             with gr.Row():
-                with gr.Column(scale=3):
-                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+                # with gr.Column(scale=3):
+                #     gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                 with gr.Column(scale=1):
                     # Add version selector specifically for the submission tab
                     submission_version_selector = gr.Dropdown(
@@ -825,6 +831,13 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    mode_selector = gr.Dropdown(
+                        choices=[m.name for m in Mode],
+                        label="Mode",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
@@ -876,6 +889,7 @@ with demo:
             precision,
             weight_type,
             model_type,
+            mode_selector,
             file_input,
             submission_version_selector,
             guard_model_type
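In short, app.py now threads an inference mode from the new dropdown through validation and into the submission metadata. A minimal sketch of that flow, trimmed to just the new check (names follow the diff; everything else is elided, and styled_error is stood in for by a plain dict):

# Minimal sketch: the Mode dropdown hands submit_results the selected
# name as a plain string ("CoT" or "Strict"); an empty selection is
# rejected before the metadata dict is built.
def submit_results(model_name: str, model_type: str, mode: str) -> dict:
    if not mode:
        return {"error": "Please select an inference mode"}
    return {"model_name": model_name, "model_type": model_type, "mode": mode}

print(submit_results("my-guard-model", "open", "CoT"))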
src/about.py
CHANGED

@@ -20,8 +20,6 @@ across multiple categories and test scenarios.
 """

 LLM_BENCHMARKS_TEXT = """
-## GuardBench evaluation methodology
-
 GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.

 Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.
@@ -30,15 +28,15 @@ We track how accurate they are, how often they make mistakes, and how fast they
 """

 EVALUATION_QUEUE_TEXT = """
-##
+## Submit Your Model

-To
+To add your model to the GuardBench leaderboard:

-
-
-
+Run your evaluation using the GuardBench framework at https://github.com/whitecircle-ai/guard-bench
+Upload your run results in .jsonl format using this form
+Once validated, your model will appear on the leaderboard.

-
+✉️✨ Ready? Upload your results below!
 """

 CITATION_BUTTON_LABEL = "Cite GuardBench"
src/display/utils.py
CHANGED

@@ -7,6 +7,16 @@ from enum import Enum, auto
 from typing import List, Optional


+class Mode(Enum):
+    """Inference mode for the guard model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
 class ModelType(Enum):
     """Model types for the leaderboard."""
     Unknown = auto()
@@ -86,6 +96,11 @@ class GuardBenchColumn:
         never_hidden=True,
         displayed_by_default=True
     ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
@@ -333,6 +348,17 @@ GUARDBENCH_COLUMN = GuardBenchColumn()
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
 DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+
+# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
+def reorder_display_cols():
+    cols = DISPLAY_COLS
+    if 'model_name' in cols and 'mode' in cols:
+        cols.remove('mode')
+        model_name_index = cols.index('model_name')
+        cols.insert(model_name_index + 1, 'mode')
+    return cols
+DISPLAY_COLS = reorder_display_cols()
+
 METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
 HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
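For reference, the new Mode enum and the DISPLAY_COLS reordering can be exercised in isolation. A standalone sketch; the column list here is illustrative, since the real one is derived from GuardBenchColumn fields:

from enum import Enum, auto

class Mode(Enum):
    """Inference mode for the guard model."""
    CoT = auto()    # Chain of Thought
    Strict = auto()

    def __str__(self):
        return self.name

print([str(m) for m in Mode])  # ['CoT', 'Strict'] -- what the submission dropdown lists

# Illustrative stand-in for DISPLAY_COLS: the reorder step moves 'mode'
# directly after 'model_name' no matter where field order placed it.
cols = ["mode", "model_name", "model_type", "f1"]
cols.remove("mode")
cols.insert(cols.index("model_name") + 1, "mode")
print(cols)  # ['model_name', 'mode', 'model_type', 'f1']

One design note: reorder_display_cols mutates the module-level DISPLAY_COLS in place and then returns it, so the reassignment is effectively a no-op; operating on a copy would avoid the aliasing, though the resulting list is the same either way.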
src/leaderboard/processor.py
CHANGED

@@ -175,7 +175,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     }

     # Add additional metadata fields if present
-    for key in ["base_model", "revision", "precision", "weight_type"]:
+    for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
         if key in entry:
             row[key] = entry[key]

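The processor change only widens the metadata-copy loop, which already skips keys a given entry lacks. A standalone illustration (entry values are hypothetical):

entry = {"base_model": "some-base", "mode": "CoT"}  # hypothetical entry
row = {}
for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
    if key in entry:  # absent keys (revision, precision, ...) are skipped
        row[key] = entry[key]
print(row)  # {'base_model': 'some-base', 'mode': 'CoT'}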
src/populate.py
CHANGED

@@ -141,6 +141,7 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     "model_name": entry.get("model_name", "Unknown Model"),
     "model_type": entry.get("model_type", "Unknown"),
     "guard_model_type": entry.get("guard_model_type", "Unknown"),
+    "mode": entry.get("mode", "Strict"),
     "submission_date": entry.get("submission_date", ""),
     "version": entry.get("version", version),
     "base_model": entry.get("base_model", ""),
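Entries submitted before this change carry no "mode" key, and the .get() fallback here labels them as "Strict" rather than leaving the column blank. A quick illustration with a hypothetical legacy entry:

legacy_entry = {"model_name": "old-model"}  # hypothetical pre-"mode" entry
print(legacy_entry.get("mode", "Strict"))   # -> "Strict"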
src/submission/submit.py
CHANGED

@@ -175,6 +175,7 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     "model_name": metadata.get("model_name"),  # Use original model name
     "model_type": metadata.get("model_type"),
     "guard_model_type": str(metadata.get("guard_model_type")).lower(),
+    "mode": metadata.get("mode"),
     "base_model": metadata.get("base_model"),
     "revision": metadata.get("revision"),
     "precision": metadata.get("precision"),
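On the write side there is no fallback: submit_results has already rejected an empty mode, so the stored value is whatever was selected. A sketch with hypothetical metadata, keeping the existing lowercasing of guard_model_type for contrast:

metadata = {  # hypothetical submission metadata
    "model_name": "my-guard-model",
    "guard_model_type": "LLAMA_GUARD",
    "mode": "CoT",
}
entry = {
    "model_name": metadata.get("model_name"),
    "guard_model_type": str(metadata.get("guard_model_type")).lower(),
    "mode": metadata.get("mode"),  # stored as-is, unlike guard_model_type
}
print(entry)  # {'model_name': 'my-guard-model', 'guard_model_type': 'llama_guard', 'mode': 'CoT'}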