categories rename
- app.py +16 -2
- src/about.py +6 -8
- src/display/utils.py +26 -0
- src/leaderboard/processor.py +1 -1
- src/populate.py +1 -0
- src/submission/submit.py +1 -0
app.py
CHANGED

@@ -32,6 +32,7 @@ from src.display.utils import (
     CATEGORIES,
     TEST_TYPES,
     ModelType,
+    Mode,
     Precision,
     WeightType,
     GuardModelType,
@@ -394,6 +395,7 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
+    mode: str,
     submission_file: tempfile._TemporaryFileWrapper,
     version: str,
     guard_model_type: GuardModelType
@@ -410,6 +412,9 @@ def submit_results(
     if not model_type:
         return styled_error("Please select a model type")

+    if not mode:
+        return styled_error("Please select an inference mode")
+
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")

@@ -421,6 +426,7 @@ def submit_results(
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
+        "mode": mode,
         "version": version,
         "guard_model_type": guard_model_type
     }
@@ -809,8 +815,8 @@ with demo:
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

             with gr.Row():
-                with gr.Column(scale=3):
-                    gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+                # with gr.Column(scale=3):
+                #     gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                 with gr.Column(scale=1):
                     # Add version selector specifically for the submission tab
                     submission_version_selector = gr.Dropdown(
@@ -825,6 +831,13 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    mode_selector = gr.Dropdown(
+                        choices=[m.name for m in Mode],
+                        label="Mode",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     model_type = gr.Dropdown(
                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
@@ -876,6 +889,7 @@ with demo:
             precision,
             weight_type,
             model_type,
+            mode_selector,
             file_input,
             submission_version_selector,
             guard_model_type
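In short, app.py now threads an inference mode from the new dropdown through validation and into the submission metadata. A minimal sketch of that flow, trimmed to just the new check (names follow the diff; everything else is elided, and styled_error is stood in for by a plain dict):

# Minimal sketch: the Mode dropdown hands submit_results the selected
# name as a plain string ("CoT" or "Strict"); an empty selection is
# rejected before the metadata dict is built.
def submit_results(model_name: str, model_type: str, mode: str) -> dict:
    if not mode:
        return {"error": "Please select an inference mode"}
    return {"model_name": model_name, "model_type": model_type, "mode": mode}

print(submit_results("my-guard-model", "open", "CoT"))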
src/about.py
CHANGED

@@ -20,8 +20,6 @@ across multiple categories and test scenarios.
 """

 LLM_BENCHMARKS_TEXT = """
-## GuardBench evaluation methodology
-
 GuardBench checks how well models handle safety challenges — from misinformation and self-harm to sexual content and corruption.

 Models are tested with regular and adversarial prompts to see if they can avoid saying harmful things.
@@ -30,15 +28,15 @@ We track how accurate they are, how often they make mistakes, and how fast they
 """

 EVALUATION_QUEUE_TEXT = """
-##
+## Submit Your Model

-To
+To add your model to the GuardBench leaderboard:

-
-
-
+Run your evaluation using the GuardBench framework at https://github.com/whitecircle-ai/guard-bench
+Upload your run results in .jsonl format using this form
+Once validated, your model will appear on the leaderboard.

-
+✉️✨ Ready? Upload your results below!
 """

 CITATION_BUTTON_LABEL = "Cite GuardBench"
src/display/utils.py
CHANGED

@@ -7,6 +7,16 @@ from enum import Enum, auto
 from typing import List, Optional


+class Mode(Enum):
+    """Inference mode for the guard model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
 class ModelType(Enum):
     """Model types for the leaderboard."""
     Unknown = auto()
@@ -86,6 +96,11 @@ class GuardBenchColumn:
         never_hidden=True,
         displayed_by_default=True
     ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
@@ -333,6 +348,17 @@ GUARDBENCH_COLUMN = GuardBenchColumn()
 COLS = [f.name for f in fields(GUARDBENCH_COLUMN)]
 DISPLAY_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                 if getattr(GUARDBENCH_COLUMN, f.name).displayed_by_default]
+
+# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
+def reorder_display_cols():
+    cols = DISPLAY_COLS
+    if 'model_name' in cols and 'mode' in cols:
+        cols.remove('mode')
+        model_name_index = cols.index('model_name')
+        cols.insert(model_name_index + 1, 'mode')
+    return cols
+DISPLAY_COLS = reorder_display_cols()
+
 METRIC_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
                if getattr(GUARDBENCH_COLUMN, f.name).type == "number"]
 HIDDEN_COLS = [getattr(GUARDBENCH_COLUMN, f.name).name for f in fields(GUARDBENCH_COLUMN)
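For reference, the new Mode enum and the DISPLAY_COLS reordering can be exercised in isolation. A standalone sketch; the column list here is illustrative, since the real one is derived from GuardBenchColumn fields:

from enum import Enum, auto

class Mode(Enum):
    """Inference mode for the guard model."""
    CoT = auto()    # Chain of Thought
    Strict = auto()

    def __str__(self):
        return self.name

print([str(m) for m in Mode])  # ['CoT', 'Strict'] -- what the submission dropdown lists

# Illustrative stand-in for DISPLAY_COLS: the reorder step moves 'mode'
# directly after 'model_name' no matter where field order placed it.
cols = ["mode", "model_name", "model_type", "f1"]
cols.remove("mode")
cols.insert(cols.index("model_name") + 1, "mode")
print(cols)  # ['model_name', 'mode', 'model_type', 'f1']

One design note: reorder_display_cols mutates the module-level DISPLAY_COLS in place and then returns it, so the reassignment is effectively a no-op; operating on a copy would avoid the aliasing, though the resulting list is the same either way.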
src/leaderboard/processor.py
CHANGED

@@ -175,7 +175,7 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     }

     # Add additional metadata fields if present
-    for key in ["base_model", "revision", "precision", "weight_type"]:
+    for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
         if key in entry:
             row[key] = entry[key]

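The processor change only widens the metadata-copy loop, which already skips keys a given entry lacks. A standalone illustration (entry values are hypothetical):

entry = {"base_model": "some-base", "mode": "CoT"}  # hypothetical entry
row = {}
for key in ["base_model", "revision", "precision", "weight_type", "mode"]:
    if key in entry:  # absent keys (revision, precision, ...) are skipped
        row[key] = entry[key]
print(row)  # {'base_model': 'some-base', 'mode': 'CoT'}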
src/populate.py
CHANGED

@@ -141,6 +141,7 @@ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     "model_name": entry.get("model_name", "Unknown Model"),
     "model_type": entry.get("model_type", "Unknown"),
     "guard_model_type": entry.get("guard_model_type", "Unknown"),
+    "mode": entry.get("mode", "Strict"),
     "submission_date": entry.get("submission_date", ""),
     "version": entry.get("version", version),
     "base_model": entry.get("base_model", ""),
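Entries submitted before this change carry no "mode" key, and the .get() fallback here labels them as "Strict" rather than leaving the column blank. A quick illustration with a hypothetical legacy entry:

legacy_entry = {"model_name": "old-model"}  # hypothetical pre-"mode" entry
print(legacy_entry.get("mode", "Strict"))   # -> "Strict"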
src/submission/submit.py
CHANGED

@@ -175,6 +175,7 @@ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     "model_name": metadata.get("model_name"),  # Use original model name
     "model_type": metadata.get("model_type"),
     "guard_model_type": str(metadata.get("guard_model_type")).lower(),
+    "mode": metadata.get("mode"),
     "base_model": metadata.get("base_model"),
     "revision": metadata.get("revision"),
     "precision": metadata.get("precision"),
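On the write side there is no fallback: submit_results has already rejected an empty mode, so the stored value is whatever was selected. A sketch with hypothetical metadata, keeping the existing lowercasing of guard_model_type for contrast:

metadata = {  # hypothetical submission metadata
    "model_name": "my-guard-model",
    "guard_model_type": "LLAMA_GUARD",
    "mode": "CoT",
}
entry = {
    "model_name": metadata.get("model_name"),
    "guard_model_type": str(metadata.get("guard_model_type")).lower(),
    "mode": metadata.get("mode"),  # stored as-is, unlike guard_model_type
}
print(entry)  # {'model_name': 'my-guard-model', 'guard_model_type': 'llama_guard', 'mode': 'CoT'}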