CodeReviewBench

Sleeping

App Files Files Community

apsys committed on Mar 31

Commit

b1cb07d

1 Parent(s): dbfaa11

works

Browse files

Files changed (46) hide show

.gitignore +1 -0
app.py +285 -97
guard-bench-submodule +1 -1
logs/guardbench_20250331_202524_f070fad2.log +19 -0
logs/guardbench_20250331_203348_6e24b81a.log +19 -0
logs/guardbench_20250331_204307_8d77ec17.log +23 -0
logs/guardbench_20250331_204836_3421e73f.log +16 -0
logs/guardbench_20250331_205152_1140353b.log +19 -0
logs/guardbench_20250331_205420_d59a424e.log +61 -0
logs/guardbench_20250331_205853_bf4fa85f.log +2 -0
logs/guardbench_20250331_210025_ba691e37.log +2 -0
logs/guardbench_20250331_210109_f46123b9.log +2 -0
logs/guardbench_20250331_210119_f0e10b9b.log +2 -0
logs/guardbench_20250331_210208_029d8318.log +2 -0
logs/guardbench_20250331_210459_437adc64.log +2 -0
logs/guardbench_20250331_210550_0358568e.log +2 -0
logs/guardbench_20250331_210641_91d1d06b.log +2 -0
logs/guardbench_20250331_210920_cd35c4f0.log +2 -0
logs/guardbench_20250331_211215_a8cf06a3.log +2 -0
logs/guardbench_20250331_211255_d32ee1a4.log +2 -0
logs/guardbench_20250331_211332_24223f1a.log +2 -0
logs/guardbench_20250331_211545_244674df.log +2 -0
logs/guardbench_20250331_211735_6d503239.log +2 -0
logs/guardbench_20250331_211916_3e96de42.log +2 -0
logs/guardbench_20250331_212027_f9d450e9.log +5 -0
logs/guardbench_20250331_212722_655f3190.log +2 -0
logs/guardbench_20250331_213207_74f9e2de.log +5 -0
logs/guardbench_20250331_213331_f72d2f6a.log +5 -0
logs/guardbench_20250331_214118_0da0491f.log +50 -0
logs/guardbench_20250331_214511_0a5acf8b.log +5 -0
logs/guardbench_20250331_214841_4df080f3.log +5 -0
logs/guardbench_20250331_215007_9c98c60a.log +5 -0
logs/guardbench_20250331_215514_ad36e4b4.log +50 -0
logs/guardbench_20250331_220348_0cb4d8e9.log +50 -0
logs/guardbench_20250331_220638_21aa20f2.log +50 -0
logs/guardbench_20250331_221124_3ffa908f.log +50 -0
logs/guardbench_20250331_221755_fc667123.log +50 -0
logs/guardbench_20250331_222103_90a3095d.log +50 -0
logs/guardbench_20250331_222531_b0fff871.log +50 -0
logs/guardbench_20250331_223148_4e22eb66.log +50 -0
requirements.txt +6 -5
src/display/css_html_js.py +11 -3
src/display/utils.py +164 -22
src/leaderboard/processor.py +57 -39
src/populate.py +143 -208
src/submission/submit.py +166 -68

.gitignore CHANGED Viewed

@@ -20,6 +20,7 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
 # Environment variables
 .env

 *.egg-info/
 .installed.cfg
 *.egg
+.gradio/
 # Environment variables
 .env

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ import logging
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from src.about import (
@@ -30,7 +32,8 @@ from src.display.utils import (
     TEST_TYPES,
     ModelType,
     Precision,
-    WeightType
 )
 from src.display.formatting import styled_message, styled_error, styled_warning
 from src.envs import (
@@ -41,7 +44,7 @@ from src.envs import (
     TOKEN,
     DATA_PATH
 )
-from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df
 from src.submission.submit import process_submission
 # Configure logging
@@ -64,6 +67,7 @@ except Exception as e:
     logger.error(f"Error loading leaderboard data: {e}")
     LEADERBOARD_DF = pd.DataFrame()
 def init_leaderboard(dataframe):
     """
@@ -100,7 +104,8 @@ def submit_results(
     weight_type: str,
     model_type: str,
     submission_file: tempfile._TemporaryFileWrapper,
-    version: str
 ):
     """
     Handle submission of results with model metadata.
@@ -125,7 +130,8 @@ def submit_results(
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
-        "version": version
     }
     # Process the submission
@@ -150,10 +156,22 @@ def refresh_data(version=CURRENT_VERSION):
     global LEADERBOARD_DF
     try:
         logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
-        LEADERBOARD_DF = get_leaderboard_df(version=version)
-        logger.info("Scheduled refresh of leaderboard data completed")
     except Exception as e:
         logger.error(f"Error in scheduled refresh: {e}")
     return LEADERBOARD_DF
@@ -166,111 +184,281 @@ def update_leaderboards(version):
     return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
-# Create Gradio app
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    with gr.Row():
-        with gr.Column(scale=3):
-            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-        with gr.Column(scale=1):
-            version_selector = gr.Dropdown(
-                choices=BENCHMARK_VERSIONS,
-                label="Benchmark Version",
-                value=CURRENT_VERSION,
-                interactive=True,
-                elem_classes="version-selector"
-            )
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
-            refresh_button = gr.Button("Refresh Leaderboard")
-            # Create tabs for each category
-            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
-                # First tab for average metrics across all categories
-                with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
-                    leaderboard = init_leaderboard(LEADERBOARD_DF)
-                # Create a tab for each category
-                for category in CATEGORIES:
-                    with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
-                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
-                        category_leaderboard = init_leaderboard(category_df)
-            # Refresh button functionality
-            refresh_button.click(
-                fn=lambda: [
-                    init_leaderboard(get_leaderboard_df(version=version_selector.value)),
-                    *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
-                ],
-                inputs=[],
-                outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
-            )
-        with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
-            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
                         interactive=True,
                     )
-                    weight_type = gr.Dropdown(
-                        choices=[i.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            with gr.Row():
-                file_input = gr.File(
-                    label="Upload JSONL Results File",
-                    file_types=[".jsonl"]
                 )
-            submit_button = gr.Button("Submit Results")
-            result_output = gr.Markdown()
-            submit_button.click(
-                fn=submit_results,
-                inputs=[
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                    file_input,
-                    version_selector
-                ],
-                outputs=result_output
-            )
     # Version selector functionality
     version_selector.change(

 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
 from apscheduler.schedulers.background import BackgroundScheduler
 from src.about import (
     TEST_TYPES,
     ModelType,
     Precision,
+    WeightType,
+    GuardModelType
 )
 from src.display.formatting import styled_message, styled_error, styled_warning
 from src.envs import (
     TOKEN,
     DATA_PATH
 )
+from src.populate import get_leaderboard_df, get_category_leaderboard_df
 from src.submission.submit import process_submission
 # Configure logging
     logger.error(f"Error loading leaderboard data: {e}")
     LEADERBOARD_DF = pd.DataFrame()
+print(DISPLAY_COLS)
 def init_leaderboard(dataframe):
     """
     weight_type: str,
     model_type: str,
     submission_file: tempfile._TemporaryFileWrapper,
+    version: str,
+    guard_model_type: GuardModelType
 ):
     """
     Handle submission of results with model metadata.
         "precision": precision,
         "weight_type": weight_type,
         "model_type": model_type,
+        "version": version,
+        "guard_model_type": guard_model_type
     }
     # Process the submission
     global LEADERBOARD_DF
     try:
         logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
+        new_df = get_leaderboard_df(version=version)
+        if new_df is not None and not new_df.empty:
+            LEADERBOARD_DF = new_df
+            logger.info("Scheduled refresh of leaderboard data completed")
+        else:
+            logger.warning("Refresh returned empty data, keeping existing data")
+            # If empty, create a dataframe with correct columns
+            if LEADERBOARD_DF is None or LEADERBOARD_DF.empty:
+                columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
+                LEADERBOARD_DF = pd.DataFrame(columns=columns)
     except Exception as e:
         logger.error(f"Error in scheduled refresh: {e}")
+        # Ensure we have at least an empty dataframe with correct columns
+        if LEADERBOARD_DF is None or LEADERBOARD_DF.empty:
+            columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
+            LEADERBOARD_DF = pd.DataFrame(columns=columns)
     return LEADERBOARD_DF
     return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
+def create_performance_plot(selected_models, category, metric="f1_binary", version=CURRENT_VERSION):
+    """
+    Create a radar plot comparing model performance for selected models.
+    """
+    if category == "📊 Overall Performance":
+        df = get_leaderboard_df(version=version)
+    else:
+        df = get_category_leaderboard_df(category, version=version)
+    if df.empty:
+        return go.Figure()
+    # Filter for selected models
+    df = df[df['model_name'].isin(selected_models)]
+    # Get the relevant metric columns
+    metric_cols = [col for col in df.columns if metric in col]
+    # Create figure
+    fig = go.Figure()
+    # Custom colors for different models
+    colors = ['#8FCCCC', '#C2A4B6', '#98B4A6', '#B68F7C']  # Pale Cyan, Pale Pink, Pale Green, Pale Orange
+    # Add traces for each model
+    for idx, model in enumerate(selected_models):
+        model_data = df[df['model_name'] == model]
+        if not model_data.empty:
+            values = model_data[metric_cols].values[0].tolist()
+            # Add the first value again at the end to complete the polygon
+            values = values + [values[0]]
+            # Clean up test type names
+            categories = [col.replace(f'_{metric}', '') for col in metric_cols]
+            # Add the first category again at the end to complete the polygon
+            categories = categories + [categories[0]]
+            fig.add_trace(go.Scatterpolar(
+                r=values,
+                theta=categories,
+                name=model,
+                line_color=colors[idx % len(colors)],
+                fill='toself'
+            ))
+    # Update layout with all settings at once
+    fig.update_layout(
+        paper_bgcolor='#000000',
+        plot_bgcolor='#000000',
+        font={'color': '#ffffff'},
+        title={
+            'text': f'{category} - {metric.upper()} Score Comparison',
+            'font': {'color': '#ffffff', 'size': 24}
+        },
+        polar=dict(
+            bgcolor='#000000',
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1],
+                gridcolor='#333333',
+                linecolor='#333333',
+                tickfont={'color': '#ffffff'},
+            ),
+            angularaxis=dict(
+                gridcolor='#333333',
+                linecolor='#333333',
+                tickfont={'color': '#ffffff'},
+            )
+        ),
+        height=600,
+        showlegend=True,
+        legend=dict(
+            yanchor="top",
+            y=0.99,
+            xanchor="right",
+            x=0.99,
+            bgcolor='rgba(0,0,0,0.5)',
+            font={'color': '#ffffff'}
+        )
+    )
+    return fig
+def update_model_choices(version):
+    """
+    Update the list of available models for the given version.
+    """
+    df = get_leaderboard_df(version=version)
+    if df.empty:
+        return []
+    return sorted(df['model_name'].unique().tolist())
+def update_visualization(selected_models, selected_category, selected_metric, version):
+    """
+    Update the visualization based on user selections.
+    """
+    if not selected_models:
+        return go.Figure()
+    return create_performance_plot(selected_models, selected_category, selected_metric, version)
+# Create Gradio app
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Row():
+        tabs = gr.Tabs(elem_classes="tab-buttons")
+        with tabs:
+            with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
+                with gr.Row():
+                    refresh_button = gr.Button("Refresh Leaderboard", scale=3)
+                    version_selector = gr.Dropdown(
+                        choices=BENCHMARK_VERSIONS,
+                        label="Benchmark Version",
+                        value=CURRENT_VERSION,
                         interactive=True,
+                        elem_classes="version-selector",
+                        scale=1
                     )
+                # Create tabs for each category
+                with gr.Tabs(elem_classes="category-tabs") as category_tabs:
+                    # First tab for average metrics across all categories
+                    with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
+                        leaderboard = init_leaderboard(LEADERBOARD_DF)
+                    # Create a tab for each category
+                    for category in CATEGORIES:
+                        with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
+                            category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
+                            category_leaderboard = init_leaderboard(category_df)
+                # Refresh button functionality
+                refresh_button.click(
+                    fn=lambda: [
+                        init_leaderboard(get_leaderboard_df(version=version_selector.value)),
+                        *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
+                    ],
+                    inputs=[],
+                    outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+                )
+            with gr.TabItem("📊 Visualize", elem_id="guardbench-viz-tab", id=1):
+                with gr.Row():
+                    with gr.Column():
+                        viz_version_selector = gr.Dropdown(
+                            choices=BENCHMARK_VERSIONS,
+                            label="Benchmark Version",
+                            value=CURRENT_VERSION,
+                            interactive=True
+                        )
+                        model_selector = gr.Dropdown(
+                            choices=update_model_choices(CURRENT_VERSION),
+                            label="Select Models to Compare",
+                            multiselect=True,
+                            interactive=True
+                        )
+                    with gr.Column():
+                        # Add Overall Performance to categories
+                        viz_categories = ["📊 Overall Performance"] + CATEGORIES
+                        category_selector = gr.Dropdown(
+                            choices=viz_categories,
+                            label="Select Category",
+                            value=viz_categories[0],
+                            interactive=True
+                        )
+                        metric_selector = gr.Dropdown(
+                            choices=["f1_binary", "precision_binary", "recall_binary"],
+                            label="Select Metric",
+                            value="f1_binary",
+                            interactive=True
+                        )
+                plot_output = gr.Plot()
+                # Update visualization when any selector changes
+                for control in [viz_version_selector, model_selector, category_selector, metric_selector]:
+                    control.change(
+                        fn=update_visualization,
+                        inputs=[model_selector, category_selector, metric_selector, viz_version_selector],
+                        outputs=plot_output
                     )
+                # Update model choices when version changes
+                viz_version_selector.change(
+                    fn=update_model_choices,
+                    inputs=[viz_version_selector],
+                    outputs=[model_selector]
                 )
+            with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=2):
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=3):
+                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
+                    with gr.Column(scale=1):
+                        # Add version selector specifically for the submission tab
+                        submission_version_selector = gr.Dropdown(
+                            choices=BENCHMARK_VERSIONS,
+                            label="Benchmark Version",
+                            value=CURRENT_VERSION,
+                            interactive=True,
+                            elem_classes="version-selector"
+                        )
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                        model_type = gr.Dropdown(
+                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                            label="Model type",
+                            multiselect=False,
+                            value=None,
+                            interactive=True,
+                        )
+                        guard_model_type = gr.Dropdown(
+                            choices=[t.name for t in GuardModelType],
+                            label="Guard model type",
+                            multiselect=False,
+                            value=GuardModelType.LLM_REGEXP.name,
+                            interactive=True,
+                        )
+                    with gr.Column():
+                        precision = gr.Dropdown(
+                            choices=[i.name for i in Precision if i != Precision.Unknown],
+                            label="Precision",
+                            multiselect=False,
+                            value="float16",
+                            interactive=True,
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=[i.name for i in WeightType],
+                            label="Weights type",
+                            multiselect=False,
+                            value="Original",
+                            interactive=True,
+                        )
+                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                with gr.Row():
+                    file_input = gr.File(
+                        label="Upload JSONL Results File",
+                        file_types=[".jsonl"]
+                    )
+                submit_button = gr.Button("Submit Results")
+                result_output = gr.Markdown()
+                submit_button.click(
+                    fn=submit_results,
+                    inputs=[
+                        model_name_textbox,
+                        base_model_name_textbox,
+                        revision_name_textbox,
+                        precision,
+                        weight_type,
+                        model_type,
+                        file_input,
+                        submission_version_selector,
+                        guard_model_type
+                    ],
+                    outputs=result_output
+                )
     # Version selector functionality
     version_selector.change(

guard-bench-submodule CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~0a9f48bcedd0ccb6b5cf59ff7ed1186e32a5dc17~~


1	+ Subproject commit 34b40c5c6c766632f460ce7d7f1895881a866d83

logs/guardbench_20250331_202524_f070fad2.log ADDED Viewed

	@@ -0,0 +1,19 @@

+2025-03-31 20:25:27,855 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:25:28,116 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:25:28,184 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:28,348 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:28,514 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:28,624 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:28,739 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:28,869 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:29,008 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:29,568 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:29,687 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:29,796 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:29,908 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:25:30,148 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 20:29:44,676 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
+2025-03-31 20:29:44,708 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 20:29:44,923 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_203348_6e24b81a.log ADDED Viewed

	@@ -0,0 +1,19 @@

+2025-03-31 20:33:49,499 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:33:49,728 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:33:49,795 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:49,946 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:50,278 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:50,398 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:50,624 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:50,735 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:50,853 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:51,000 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:51,211 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:51,471 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:51,591 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:33:51,921 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:33:51,921 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:33:51,922 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 20:37:48,565 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
+2025-03-31 20:37:48,595 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 20:37:48,765 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_204307_8d77ec17.log ADDED Viewed

	@@ -0,0 +1,23 @@

+2025-03-31 20:43:08,422 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:43:08,623 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:43:08,693 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:08,832 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:08,948 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,071 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,188 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,383 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,494 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,604 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:09,803 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:10,013 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:10,123 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:43:10,578 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:43:10,579 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:43:10,579 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 20:46:56,010 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
+2025-03-31 20:46:56,040 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 20:46:57,488 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 20:46:57,488 - guardbench.evaluator - INFO - Starting evaluation for model: chatgpt-4o-latest_(CoT)
+2025-03-31 20:46:57,488 - guardbench.evaluator - INFO - Using cached results for model: chatgpt-4o-latest_(CoT)
+2025-03-31 20:46:57,489 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 20:46:57,582 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_204836_3421e73f.log ADDED Viewed

	@@ -0,0 +1,16 @@

+2025-03-31 20:48:37,607 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:48:37,906 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:48:37,976 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:38,156 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:38,310 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:38,484 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:38,744 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:38,898 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:39,059 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:39,327 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:39,480 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:39,670 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:39,832 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:48:40,214 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_205152_1140353b.log ADDED Viewed

	@@ -0,0 +1,19 @@

+2025-03-31 20:51:53,395 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:51:53,703 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:51:53,774 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,056 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,148 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,235 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,326 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,417 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,705 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,797 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,901 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:54,993 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:55,092 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:51:55,407 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 20:53:44,802 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
+2025-03-31 20:53:44,829 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 20:53:44,996 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_205420_d59a424e.log ADDED Viewed

	@@ -0,0 +1,61 @@

+2025-03-31 20:54:21,474 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 20:54:21,606 - __main__ - INFO - Loaded leaderboard with 0 entries
+2025-03-31 20:54:21,675 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:21,785 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:21,881 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:21,977 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,074 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,169 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,293 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,394 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,505 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,594 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,685 - __main__ - WARNING - Initializing empty leaderboard
+2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 20:54:22,997 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 20:55:51,877 - __main__ - INFO - Received submission for model chatgpt-4o-latest (CoT): /tmp/gradio/a1f2d3a725f7b441a1fbfdac8e51dfd3bf7bbb4ab2d1c20362cfa130f4bdda6d/chatgpt-4o-latest CoT.jsonl
+2025-03-31 20:55:51,906 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 20:55:52,929 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 20:55:52,929 - guardbench.evaluator - INFO - Starting evaluation for model: chatgpt-4o-latest_(CoT)
+2025-03-31 20:55:52,929 - guardbench.evaluator - INFO - Using cached results for model: chatgpt-4o-latest_(CoT)
+2025-03-31 20:55:52,966 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 20:55:52,970 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 20:55:53,073 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 20:55:53,076 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 20:55:53,175 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 20:55:53,178 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 20:55:53,281 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 20:55:53,284 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 20:55:53,386 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 20:55:53,390 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 20:55:53,487 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 20:55:53,491 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 20:55:53,592 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 20:55:53,596 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 20:55:53,698 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 20:55:53,701 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 20:55:54,267 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 20:55:54,271 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 20:55:54,371 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 20:55:54,375 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 20:55:54,431 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 20:55:54,434 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 20:55:54,534 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 20:55:54,537 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 20:55:54,634 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 20:55:54,638 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 20:55:54,738 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 20:55:54,741 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 20:55:54,841 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 20:55:54,844 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 20:55:54,945 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 20:55:54,949 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 20:55:55,049 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 20:55:55,052 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 20:55:55,152 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 20:55:55,156 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 20:55:55,260 - guardbench.evaluator - INFO - Updated leaderboard for model: chatgpt-4o-latest_(CoT) from cached results
+2025-03-31 20:55:55,262 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: chatgpt-4o-latest_(CoT)
+2025-03-31 20:55:56,838 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 20:55:57,001 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_205853_bf4fa85f.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 20:58:54,582 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 20:58:54,675 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210025_ba691e37.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:00:26,667 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:00:26,793 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210109_f46123b9.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:01:10,756 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:01:10,828 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210119_f0e10b9b.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:01:20,172 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:01:20,242 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210208_029d8318.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:02:09,399 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:02:09,500 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210459_437adc64.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:04:59,995 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:05:00,111 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210550_0358568e.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:05:51,686 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:05:51,759 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210641_91d1d06b.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:06:42,594 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:06:42,717 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_210920_cd35c4f0.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:09:21,490 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:09:21,613 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211215_a8cf06a3.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:12:16,593 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:12:16,704 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211255_d32ee1a4.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:12:56,431 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:12:56,505 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211332_24223f1a.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:13:33,061 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:13:33,189 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211545_244674df.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:15:46,264 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:15:46,382 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211735_6d503239.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:17:36,939 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:17:37,057 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_211916_3e96de42.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:19:17,112 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:19:17,224 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_212027_f9d450e9.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:20:28,558 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:20:28,829 - __main__ - INFO - Loaded leaderboard with 1 entries
+2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:20:30,189 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_212722_655f3190.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 2025-03-31 21:27:23,630 - __main__ - INFO - Initializing leaderboard data...
2	+ 2025-03-31 21:27:23,765 - __main__ - INFO - Loaded leaderboard with 1 entries

logs/guardbench_20250331_213207_74f9e2de.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:32:08,256 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:32:08,412 - __main__ - INFO - Loaded leaderboard with 1 entries
+2025-03-31 21:32:09,385 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:32:09,385 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:32:09,386 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_213331_f72d2f6a.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:33:32,956 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:33:33,112 - __main__ - INFO - Loaded leaderboard with 1 entries
+2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:33:34,050 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_214118_0da0491f.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 21:41:19,264 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:41:19,411 - __main__ - INFO - Loaded leaderboard with 1 entries
+2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:41:20,492 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 21:44:25,074 - __main__ - INFO - Received submission for model gpt-4o-mini (CoT): /tmp/gradio/35fc6ab7ba3af1e1b210ed2851ec70f52004490c3534b64bfd8e4830f5cccea0/gpt-4o-mini CoT.jsonl
+2025-03-31 21:44:25,100 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 21:44:26,183 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 21:44:26,183 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini_(CoT)
+2025-03-31 21:44:26,183 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini_(CoT)
+2025-03-31 21:44:26,214 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 21:44:26,218 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 21:44:26,486 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 21:44:26,490 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 21:44:26,594 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 21:44:26,597 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 21:44:26,700 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 21:44:26,703 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 21:44:26,806 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 21:44:26,810 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 21:44:26,908 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 21:44:26,912 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 21:44:27,015 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 21:44:27,018 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 21:44:27,130 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 21:44:27,134 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 21:44:27,201 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 21:44:27,205 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 21:44:27,326 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 21:44:27,330 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 21:44:27,962 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 21:44:27,966 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 21:44:28,070 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 21:44:28,074 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 21:44:28,175 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 21:44:28,179 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 21:44:28,282 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 21:44:28,286 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 21:44:28,386 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 21:44:28,390 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 21:44:28,489 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 21:44:28,493 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 21:44:28,594 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 21:44:28,598 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 21:44:28,702 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 21:44:28,705 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 21:44:28,813 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini_(CoT) from cached results
+2025-03-31 21:44:28,815 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini_(CoT)
+2025-03-31 21:44:30,083 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 21:44:30,284 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_214511_0a5acf8b.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:45:12,578 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:45:12,654 - __main__ - INFO - Loaded leaderboard with 2 entries
+2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:45:13,819 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_214841_4df080f3.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:48:42,942 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:48:43,058 - __main__ - INFO - Loaded leaderboard with 2 entries
+2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:48:43,995 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_215007_9c98c60a.log ADDED Viewed

	@@ -0,0 +1,5 @@

+2025-03-31 21:50:08,436 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:50:08,552 - __main__ - INFO - Loaded leaderboard with 2 entries
+2025-03-31 21:50:09,654 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:50:09,654 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:50:09,655 - apscheduler.scheduler - INFO - Scheduler started

logs/guardbench_20250331_215514_ad36e4b4.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 21:55:15,352 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 21:55:15,454 - __main__ - INFO - Loaded leaderboard with 2 entries
+2025-03-31 21:55:16,351 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 21:55:16,352 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 21:55:16,352 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 21:57:09,862 - __main__ - INFO - Received submission for model gpt-4o-mini: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 21:57:10,863 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 21:57:11,927 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 21:57:11,927 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini
+2025-03-31 21:57:11,927 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini
+2025-03-31 21:57:11,947 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 21:57:11,951 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 21:57:12,054 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 21:57:12,058 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 21:57:12,155 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 21:57:12,158 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 21:57:12,215 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 21:57:12,219 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 21:57:12,319 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 21:57:12,322 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 21:57:12,423 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 21:57:12,426 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 21:57:12,526 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 21:57:12,530 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 21:57:12,631 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 21:57:12,634 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 21:57:12,736 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 21:57:12,740 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 21:57:12,840 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 21:57:12,843 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 21:57:12,943 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 21:57:12,946 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 21:57:13,045 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 21:57:13,049 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 21:57:13,147 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 21:57:13,151 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 21:57:13,701 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 21:57:13,705 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 21:57:13,805 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 21:57:13,809 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 21:57:13,905 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 21:57:13,909 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 21:57:14,008 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 21:57:14,011 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 21:57:14,113 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 21:57:14,117 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 21:57:14,219 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini from cached results
+2025-03-31 21:57:14,220 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini
+2025-03-31 21:57:15,528 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 21:57:15,650 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_220348_0cb4d8e9.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:03:49,677 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:03:49,841 - __main__ - INFO - Loaded leaderboard with 3 entries
+2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:03:51,379 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:04:45,096 - __main__ - INFO - Received submission for model gpt4omini-TEST: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:04:45,574 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:04:46,470 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:04:46,470 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST
+2025-03-31 22:04:46,470 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST
+2025-03-31 22:04:46,490 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:04:46,493 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:04:46,595 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:04:46,600 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:04:46,702 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:04:46,706 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:04:46,804 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:04:46,808 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:04:46,914 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:04:46,917 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:04:47,019 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:04:47,023 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:04:47,128 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:04:47,132 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:04:47,237 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:04:47,240 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:04:47,355 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:04:47,359 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:04:47,462 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:04:47,466 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:04:47,573 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:04:47,576 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:04:47,694 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:04:47,697 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:04:47,804 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:04:47,808 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:04:47,913 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:04:47,917 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:04:48,022 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:04:48,026 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:04:48,605 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:04:48,609 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:04:48,667 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:04:48,671 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:04:48,771 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:04:48,775 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:04:48,882 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST from cached results
+2025-03-31 22:04:48,883 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST
+2025-03-31 22:04:50,345 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:04:50,514 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_220638_21aa20f2.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:06:39,559 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:06:39,705 - __main__ - INFO - Loaded leaderboard with 4 entries
+2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:06:41,404 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:07:29,463 - __main__ - INFO - Received submission for model gpt4omini-TEST2: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:07:30,063 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:07:31,596 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:07:31,597 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST2
+2025-03-31 22:07:31,597 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST2
+2025-03-31 22:07:31,616 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:07:31,620 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:07:31,723 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:07:31,727 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:07:31,828 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:07:31,831 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:07:31,934 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:07:31,937 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:07:32,040 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:07:32,043 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:07:32,144 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:07:32,148 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:07:32,703 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:07:32,706 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:07:32,808 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:07:32,812 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:07:32,912 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:07:32,916 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:07:33,018 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:07:33,022 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:07:33,123 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:07:33,127 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:07:33,231 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:07:33,234 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:07:33,336 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:07:33,339 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:07:33,438 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:07:33,441 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:07:33,545 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:07:33,548 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:07:33,647 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:07:33,650 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:07:33,707 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:07:33,711 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:07:33,807 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:07:33,811 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:07:33,915 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST2 from cached results
+2025-03-31 22:07:33,917 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST2
+2025-03-31 22:07:36,275 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:07:36,423 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_221124_3ffa908f.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:11:25,101 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:11:25,216 - __main__ - INFO - Loaded leaderboard with 5 entries
+2025-03-31 22:11:26,084 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:11:26,084 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:11:26,085 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:12:12,091 - __main__ - INFO - Received submission for model gpt4omini-TEST5: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:12:12,831 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:12:14,244 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:12:14,244 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST5
+2025-03-31 22:12:14,244 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST5
+2025-03-31 22:12:14,263 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:12:14,267 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:12:14,831 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:12:14,835 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:12:14,939 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:12:14,942 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:12:15,044 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:12:15,047 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:12:15,149 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:12:15,152 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:12:15,255 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:12:15,258 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:12:15,361 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:12:15,364 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:12:15,463 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:12:15,466 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:12:15,569 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:12:15,573 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:12:15,676 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:12:15,680 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:12:15,782 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:12:15,785 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:12:15,843 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:12:15,847 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:12:15,949 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:12:15,953 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:12:16,053 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:12:16,056 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:12:16,156 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:12:16,160 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:12:16,263 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:12:16,267 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:12:16,369 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:12:16,373 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:12:16,474 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:12:16,478 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:12:16,583 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST5 from cached results
+2025-03-31 22:12:16,584 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST5
+2025-03-31 22:12:18,047 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:12:18,174 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_221755_fc667123.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:17:56,280 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:17:56,406 - __main__ - INFO - Loaded leaderboard with 6 entries
+2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:17:57,983 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:18:49,068 - __main__ - INFO - Received submission for model gpt4omini-TEST27: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:18:49,596 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:18:50,543 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:18:50,543 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST27
+2025-03-31 22:18:50,543 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST27
+2025-03-31 22:18:50,563 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:18:50,567 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:18:50,668 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:18:50,672 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:18:50,769 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:18:50,772 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:18:50,873 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:18:50,877 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:18:50,978 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:18:50,982 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:18:51,085 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:18:51,088 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:18:51,191 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:18:51,195 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:18:51,297 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:18:51,300 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:18:51,398 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:18:51,401 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:18:51,503 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:18:51,507 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:18:51,607 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:18:51,611 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:18:51,712 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:18:51,716 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:18:51,817 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:18:51,821 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:18:51,923 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:18:51,926 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:18:52,476 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:18:52,480 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:18:52,582 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:18:52,586 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:18:52,686 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:18:52,690 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:18:52,747 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:18:52,750 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:18:52,851 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST27 from cached results
+2025-03-31 22:18:52,852 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST27
+2025-03-31 22:18:54,180 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:18:54,298 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_222103_90a3095d.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:21:04,147 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:21:04,266 - __main__ - INFO - Loaded leaderboard with 7 entries
+2025-03-31 22:21:06,082 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:21:06,083 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:21:06,083 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:21:41,064 - __main__ - INFO - Received submission for model gpt-4o-mini-TEST6: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:21:42,056 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:21:42,922 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:21:42,922 - guardbench.evaluator - INFO - Starting evaluation for model: gpt-4o-mini-TEST6
+2025-03-31 22:21:42,922 - guardbench.evaluator - INFO - Using cached results for model: gpt-4o-mini-TEST6
+2025-03-31 22:21:42,942 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:21:42,945 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:21:43,044 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:21:43,048 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:21:43,151 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:21:43,155 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:21:43,257 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:21:43,260 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:21:43,360 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:21:43,363 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:21:43,461 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:21:43,465 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:21:43,563 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:21:43,567 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:21:44,119 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:21:44,123 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:21:44,223 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:21:44,226 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:21:44,323 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:21:44,326 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:21:44,383 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:21:44,387 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:21:44,487 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:21:44,490 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:21:44,590 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:21:44,593 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:21:44,694 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:21:44,698 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:21:44,799 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:21:44,802 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:21:44,903 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:21:44,907 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:21:45,008 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:21:45,011 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:21:45,112 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:21:45,116 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:21:45,222 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt-4o-mini-TEST6 from cached results
+2025-03-31 22:21:45,223 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt-4o-mini-TEST6
+2025-03-31 22:21:47,120 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:21:47,363 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_222531_b0fff871.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:25:32,811 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:25:32,908 - __main__ - INFO - Loaded leaderboard with 8 entries
+2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:25:34,603 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:26:13,775 - __main__ - INFO - Received submission for model gpt4omini-TEST7: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:26:14,580 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:26:15,600 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:26:15,600 - guardbench.evaluator - INFO - Starting evaluation for model: gpt4omini-TEST7
+2025-03-31 22:26:15,601 - guardbench.evaluator - INFO - Using cached results for model: gpt4omini-TEST7
+2025-03-31 22:26:15,620 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:26:15,624 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:26:15,727 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:26:15,731 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:26:16,284 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:26:16,287 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:26:16,389 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:26:16,392 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:26:16,488 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:26:16,491 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:26:16,593 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:26:16,597 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:26:16,696 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:26:16,700 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:26:16,758 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:26:16,761 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:26:16,863 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:26:16,866 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:26:16,964 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:26:16,968 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:26:17,069 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:26:17,073 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:26:17,172 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:26:17,176 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:26:17,276 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:26:17,279 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:26:17,381 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:26:17,384 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:26:17,480 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:26:17,484 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:26:17,583 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:26:17,586 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:26:17,687 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:26:17,691 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:26:17,793 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:26:17,796 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:26:17,903 - guardbench.evaluator - INFO - Updated leaderboard for model: gpt4omini-TEST7 from cached results
+2025-03-31 22:26:17,905 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: gpt4omini-TEST7
+2025-03-31 22:26:19,582 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:26:19,716 - __main__ - INFO - Refreshed leaderboard data after submission

logs/guardbench_20250331_223148_4e22eb66.log ADDED Viewed

	@@ -0,0 +1,50 @@

+2025-03-31 22:31:49,992 - __main__ - INFO - Initializing leaderboard data...
+2025-03-31 22:31:50,115 - __main__ - INFO - Loaded leaderboard with 9 entries
+2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts
+2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Added job "<lambda>" to job store "default"
+2025-03-31 22:31:51,543 - apscheduler.scheduler - INFO - Scheduler started
+2025-03-31 22:32:19,107 - __main__ - INFO - Received submission for model got-r0mini-8: /tmp/gradio/26f1c6837517a59736a02ffe486b3504336116347e339745a7973e9412dad4db/gpt-4o-mini-3.jsonl
+2025-03-31 22:32:19,583 - guardbench.context - INFO - Loading dataset from: whitecircle-ai/guardbench_dataset_1k_public
+2025-03-31 22:32:20,839 - guardbench.context - INFO - Successfully loaded dataset with 980 examples
+2025-03-31 22:32:20,839 - guardbench.evaluator - INFO - Starting evaluation for model: got-r0mini-8
+2025-03-31 22:32:20,839 - guardbench.evaluator - INFO - Using cached results for model: got-r0mini-8
+2025-03-31 22:32:20,857 - guardbench.evaluator - INFO - Processing cached results for category: Political Corruption and Legal Evasion
+2025-03-31 22:32:20,861 - guardbench.evaluator - INFO - Length Political Corruption and Legal Evasion - 30
+2025-03-31 22:32:20,960 - guardbench.evaluator - INFO - Processing cached results for category: Creative Content Involving Illicit Themes
+2025-03-31 22:32:20,963 - guardbench.evaluator - INFO - Length Creative Content Involving Illicit Themes - 30
+2025-03-31 22:32:21,062 - guardbench.evaluator - INFO - Processing cached results for category: Financial Fraud and Unethical Business
+2025-03-31 22:32:21,065 - guardbench.evaluator - INFO - Length Financial Fraud and Unethical Business - 30
+2025-03-31 22:32:21,165 - guardbench.evaluator - INFO - Processing cached results for category: Manipulation, Deception, and Misinformation
+2025-03-31 22:32:21,168 - guardbench.evaluator - INFO - Length Manipulation, Deception, and Misinformation - 30
+2025-03-31 22:32:21,266 - guardbench.evaluator - INFO - Processing cached results for category: Drug– and Substance–Related Activities
+2025-03-31 22:32:21,269 - guardbench.evaluator - INFO - Length Drug– and Substance–Related Activities - 30
+2025-03-31 22:32:21,367 - guardbench.evaluator - INFO - Processing cached results for category: AI Manipulation and Jailbreaking
+2025-03-31 22:32:21,371 - guardbench.evaluator - INFO - Length AI Manipulation and Jailbreaking - 30
+2025-03-31 22:32:21,469 - guardbench.evaluator - INFO - Processing cached results for category: Sexual Content and Violence
+2025-03-31 22:32:21,473 - guardbench.evaluator - INFO - Length Sexual Content and Violence - 29
+2025-03-31 22:32:21,568 - guardbench.evaluator - INFO - Processing cached results for category: Cybercrime, Hacking, and Digital Exploits
+2025-03-31 22:32:21,571 - guardbench.evaluator - INFO - Length Cybercrime, Hacking, and Digital Exploits - 30
+2025-03-31 22:32:21,669 - guardbench.evaluator - INFO - Processing cached results for category: Safe Prompts
+2025-03-31 22:32:21,673 - guardbench.evaluator - INFO - Length Safe Prompts - 490
+2025-03-31 22:32:22,215 - guardbench.evaluator - INFO - Processing cached results for category: Child Exploitation and Abuse
+2025-03-31 22:32:22,219 - guardbench.evaluator - INFO - Length Child Exploitation and Abuse - 30
+2025-03-31 22:32:22,318 - guardbench.evaluator - INFO - Processing cached results for category: Labor Exploitation and Human Trafficking
+2025-03-31 22:32:22,321 - guardbench.evaluator - INFO - Length Labor Exploitation and Human Trafficking - 30
+2025-03-31 22:32:22,418 - guardbench.evaluator - INFO - Processing cached results for category: Self–Harm and Suicidal Ideation
+2025-03-31 22:32:22,422 - guardbench.evaluator - INFO - Length Self–Harm and Suicidal Ideation - 13
+2025-03-31 22:32:22,476 - guardbench.evaluator - INFO - Processing cached results for category: Criminal, Violent, and Terrorist Activity
+2025-03-31 22:32:22,480 - guardbench.evaluator - INFO - Length Criminal, Violent, and Terrorist Activity - 30
+2025-03-31 22:32:22,578 - guardbench.evaluator - INFO - Processing cached results for category: Hate Speech, Extremism, and Discrimination
+2025-03-31 22:32:22,581 - guardbench.evaluator - INFO - Length Hate Speech, Extremism, and Discrimination - 29
+2025-03-31 22:32:22,678 - guardbench.evaluator - INFO - Processing cached results for category: Environmental and Industrial Harm
+2025-03-31 22:32:22,681 - guardbench.evaluator - INFO - Length Environmental and Industrial Harm - 30
+2025-03-31 22:32:22,779 - guardbench.evaluator - INFO - Processing cached results for category: Animal Cruelty and Exploitation
+2025-03-31 22:32:22,783 - guardbench.evaluator - INFO - Length Animal Cruelty and Exploitation - 30
+2025-03-31 22:32:22,884 - guardbench.evaluator - INFO - Processing cached results for category: Academic Dishonesty and Cheating
+2025-03-31 22:32:22,888 - guardbench.evaluator - INFO - Length Academic Dishonesty and Cheating - 29
+2025-03-31 22:32:22,984 - guardbench.evaluator - INFO - Processing cached results for category: Weapon, Explosives, and Hazardous Materials
+2025-03-31 22:32:22,987 - guardbench.evaluator - INFO - Length Weapon, Explosives, and Hazardous Materials - 30
+2025-03-31 22:32:23,092 - guardbench.evaluator - INFO - Updated leaderboard for model: got-r0mini-8 from cached results
+2025-03-31 22:32:23,093 - guardbench.evaluator - INFO - Evaluation from cached results completed for model: got-r0mini-8
+2025-03-31 22:32:25,575 - __main__ - INFO - Refreshing leaderboard data after submission for version v0...
+2025-03-31 22:32:25,790 - __main__ - INFO - Refreshed leaderboard data after submission

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
-gradio>=3.50.2
-huggingface_hub>=0.19.0
-datasets>=2.16.0
 pandas>=2.0.0
 python-dotenv>=1.0.0
-apscheduler>=3.10.1
-gradio-leaderboard

+gradio>=4.0.0
 pandas>=2.0.0
+huggingface_hub>=0.20.0
+datasets>=2.0.0
+apscheduler>=3.10.0
 python-dotenv>=1.0.0
+plotly>=5.18.0
+gradio-leaderboard>=0.1.0

src/display/css_html_js.py CHANGED Viewed

@@ -45,11 +45,9 @@ custom_css = """
 }
 .version-selector {
-    margin-top: 10px;
     padding: 5px;
-    border: 1px solid #e0e0e0;
     border-radius: 5px;
-    background-color: #f9f9f9;
 }
 .version-selector label {
@@ -61,4 +59,14 @@ custom_css = """
     border-color: #2196F3;
     border-radius: 4px;
 }
 """

 }
 .version-selector {
+    margin: 0 !important;
     padding: 5px;
     border-radius: 5px;
 }
 .version-selector label {
     border-color: #2196F3;
     border-radius: 4px;
 }
+/* Make sure the version selector is properly aligned with refresh button */
+.version-selector > .block {
+    padding: 0 !important;
+}
+.version-selector > .block > .wrap {
+    position: relative;
+    top: -5px;
+}
 """

src/display/utils.py CHANGED Viewed

@@ -26,6 +26,20 @@ class ModelType(Enum):
             return "API"
         return "Unknown"
 class Precision(Enum):
     """Model precision types."""
@@ -65,75 +79,203 @@ class ColumnInfo:
 @dataclass
 class GuardBenchColumn:
     """Columns for the GuardBench leaderboard."""
     model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_name",
         display_name="Model",
         never_hidden=True,
         displayed_by_default=True
     ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
         displayed_by_default=True
     ))
-    # Metrics for all categories
     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_prompts_f1",
         display_name="Default Prompts F1",
         type="number",
         displayed_by_default=True
     ))
     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_prompts_f1",
         display_name="Jailbreaked Prompts F1",
         type="number",
         displayed_by_default=True
     ))
     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_answers_f1",
         display_name="Default Answers F1",
         type="number",
         displayed_by_default=True
     ))
     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_answers_f1",
         display_name="Jailbreaked Answers F1",
         type="number",
         displayed_by_default=True
     ))
-    # Average metrics
-    average_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
-        name="average_f1",
-        display_name="Average F1",
         type="number",
-        displayed_by_default=True,
-        never_hidden=True
     ))
-    average_recall: ColumnInfo = field(default_factory=lambda: ColumnInfo(
-        name="average_recall",
-        display_name="Average Recall",
         type="number",
         displayed_by_default=False
     ))
-    average_precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
-        name="average_precision",
-        display_name="Average Precision",
         type="number",
         displayed_by_default=False
     ))
-    # Additional metadata
-    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
-        name="submission_date",
-        display_name="Submission Date",
         displayed_by_default=False
     ))

             return "API"
         return "Unknown"
+class GuardModelType(str, Enum):
+    """Guard model types for the leaderboard.
+    Each member mixes in ``str``, so it compares equal to its lowercase
+    string value (for example ``"llama_guard"``).
+    """
+    LLAMA_GUARD = "llama_guard"
+    PROMPT_GUARD_CLF = "prompt_guard_clf"
+    ATLA_SELENE = "atla_selene"
+    GEMMA_SHIELD = "gemma_shield"
+    LLM_REGEXP = "llm_regexp"
+    LLM_SO = "llm_so"
+    def __str__(self):
+        """String representation of the guard model type."""
+        # Yields the member NAME (e.g. "LLAMA_GUARD"), not the lowercase value.
+        # NOTE(review): confirm callers of str() expect the upper-case name.
+        return self.name
 class Precision(Enum):
     """Model precision types."""
 @dataclass
 class GuardBenchColumn:
     """Columns for the GuardBench leaderboard."""
+    # Core metadata
     model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_name",
         display_name="Model",
         never_hidden=True,
         displayed_by_default=True
     ))
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
         displayed_by_default=True
     ))
+    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="submission_date",
+        display_name="Submission Date",
+        displayed_by_default=False
+    ))
+    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="version",
+        display_name="Version",
+        displayed_by_default=False
+    ))
+    guard_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="guard_model_type",
+        display_name="Guard Model Type",
+        displayed_by_default=True
+    ))
+    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="base_model",
+        display_name="Base Model",
+        displayed_by_default=False
+    ))
+    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="revision",
+        display_name="Revision",
+        displayed_by_default=False
+    ))
+    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="precision",
+        display_name="Precision",
+        displayed_by_default=False
+    ))
+    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="weight_type",
+        display_name="Weight Type",
+        displayed_by_default=False
+    ))
+    # Default prompts metrics
+    default_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_prompts_f1_binary",
+        display_name="Default Prompts F1 Binary",
+        type="number",
+        displayed_by_default=False
+    ))
     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_prompts_f1",
         display_name="Default Prompts F1",
         type="number",
         displayed_by_default=True
     ))
+    default_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_prompts_recall_binary",
+        display_name="Default Prompts Recall",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_prompts_precision_binary",
+        display_name="Default Prompts Precision",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_prompts_error_ratio",
+        display_name="Default Prompts Error Ratio",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_prompts_avg_runtime_ms",
+        display_name="Default Prompts Avg Runtime (ms)",
+        type="number",
+        displayed_by_default=False
+    ))
+    # Jailbreaked prompts metrics
+    jailbreaked_prompts_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_prompts_f1_binary",
+        display_name="Jailbreaked Prompts F1 Binary",
+        type="number",
+        displayed_by_default=False
+    ))
     jailbreaked_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_prompts_f1",
         display_name="Jailbreaked Prompts F1",
         type="number",
         displayed_by_default=True
     ))
+    jailbreaked_prompts_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_prompts_recall_binary",
+        display_name="Jailbreaked Prompts Recall",
+        type="number",
+        displayed_by_default=False
+    ))
+    jailbreaked_prompts_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_prompts_precision_binary",
+        display_name="Jailbreaked Prompts Precision",
+        type="number",
+        displayed_by_default=False
+    ))
+    jailbreaked_prompts_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_prompts_error_ratio",
+        display_name="Jailbreaked Prompts Error Ratio",
+        type="number",
+        displayed_by_default=False
+    ))
+    jailbreaked_prompts_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_prompts_avg_runtime_ms",
+        display_name="Jailbreaked Prompts Avg Runtime (ms)",
+        type="number",
+        displayed_by_default=False
+    ))
+    # Default answers metrics
+    default_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_answers_f1_binary",
+        display_name="Default Answers F1 Binary",
+        type="number",
+        displayed_by_default=False
+    ))
     default_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_answers_f1",
         display_name="Default Answers F1",
         type="number",
         displayed_by_default=True
     ))
+    default_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_answers_recall_binary",
+        display_name="Default Answers Recall",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_answers_precision_binary",
+        display_name="Default Answers Precision",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_answers_error_ratio",
+        display_name="Default Answers Error Ratio",
+        type="number",
+        displayed_by_default=False
+    ))
+    default_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="default_answers_avg_runtime_ms",
+        display_name="Default Answers Avg Runtime (ms)",
+        type="number",
+        displayed_by_default=False
+    ))
+    # Jailbreaked answers metrics
+    jailbreaked_answers_f1_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_answers_f1_binary",
+        display_name="Jailbreaked Answers F1 Binary",
+        type="number",
+        displayed_by_default=False
+    ))
     jailbreaked_answers_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="jailbreaked_answers_f1",
         display_name="Jailbreaked Answers F1",
         type="number",
         displayed_by_default=True
     ))
+    jailbreaked_answers_recall_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_answers_recall_binary",
+        display_name="Jailbreaked Answers Recall",
         type="number",
+        displayed_by_default=False
     ))
+    jailbreaked_answers_precision_binary: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_answers_precision_binary",
+        display_name="Jailbreaked Answers Precision",
         type="number",
         displayed_by_default=False
     ))
+    jailbreaked_answers_error_ratio: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_answers_error_ratio",
+        display_name="Jailbreaked Answers Error Ratio",
         type="number",
         displayed_by_default=False
     ))
+    jailbreaked_answers_avg_runtime_ms: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="jailbreaked_answers_avg_runtime_ms",
+        display_name="Jailbreaked Answers Avg Runtime (ms)",
+        type="number",
         displayed_by_default=False
     ))

src/leaderboard/processor.py CHANGED Viewed

@@ -103,7 +103,8 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             "model_name": model_name,
             "model_type": entry.get("model_type", "Unknown"),
             "submission_date": entry.get("submission_date", ""),
-            "version": entry.get("version", "v0")
         }
         # Add additional metadata fields if present
@@ -111,50 +112,67 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
             if key in entry:
                 row[key] = entry[key]
-        # Add average metrics
         avg_metrics = entry.get("avg_metrics", {})
-        for test_type in TEST_TYPES:
-            if test_type in avg_metrics:
-                for metric in METRICS:
-                    if metric in avg_metrics[test_type]:
-                        col_name = f"{test_type}_{metric}"
-                        row[col_name] = avg_metrics[test_type][metric]
-        # Calculate overall averages for key metrics
-        f1_values = []
-        recall_values = []
-        precision_values = []
-        for test_type in TEST_TYPES:
-            if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
-                f1_values.append(avg_metrics[test_type]["f1_binary"])
-            if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
-                recall_values.append(avg_metrics[test_type]["recall_binary"])
-            if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
-                precision_values.append(avg_metrics[test_type]["precision_binary"])
-        # Add overall averages
-        if f1_values:
-            row["average_f1"] = sum(f1_values) / len(f1_values)
-        if recall_values:
-            row["average_recall"] = sum(recall_values) / len(recall_values)
-        if precision_values:
-            row["average_precision"] = sum(precision_values) / len(precision_values)
-        # Add specific test type F1 scores for display
-        if "default_prompts" in avg_metrics and "f1_binary" in avg_metrics["default_prompts"]:
-            row["default_prompts_f1"] = avg_metrics["default_prompts"]["f1_binary"]
-        if "jailbreaked_prompts" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_prompts"]:
-            row["jailbreaked_prompts_f1"] = avg_metrics["jailbreaked_prompts"]["f1_binary"]
-        if "default_answers" in avg_metrics and "f1_binary" in avg_metrics["default_answers"]:
-            row["default_answers_f1"] = avg_metrics["default_answers"]["f1_binary"]
-        if "jailbreaked_answers" in avg_metrics and "f1_binary" in avg_metrics["jailbreaked_answers"]:
-            row["jailbreaked_answers_f1"] = avg_metrics["jailbreaked_answers"]["f1_binary"]
         rows.append(row)
     # Create DataFrame and sort by average F1 score
     df = pd.DataFrame(rows)
     if not df.empty and "average_f1" in df.columns:
         df = df.sort_values(by="average_f1", ascending=False)

             "model_name": model_name,
             "model_type": entry.get("model_type", "Unknown"),
             "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0"),
+            "guard_model_type": entry.get("guard_model_type", "llm_regexp").lower()
         }
         # Add additional metadata fields if present
             if key in entry:
                 row[key] = entry[key]
+        # CASE 1: Metrics are flat in the root
+        for key, value in entry.items():
+            if any(test_type in key for test_type in TEST_TYPES) or key in ["average_f1", "average_recall", "average_precision"]:
+                row[key] = value
+        # CASE 2: Metrics are in avg_metrics structure
         avg_metrics = entry.get("avg_metrics", {})
+        if avg_metrics:
+            for test_type in TEST_TYPES:
+                if test_type in avg_metrics:
+                    metrics = avg_metrics[test_type]
+                    for metric in METRICS:
+                        if metric in metrics:
+                            col_name = f"{test_type}_{metric}"
+                            row[col_name] = metrics[metric]
+                            # Also add non-binary version for F1 scores
+                            if metric == "f1_binary":
+                                row[f"{test_type}_f1"] = metrics[metric]
+            # Calculate averages if not present
+            if "average_f1" not in row:
+                f1_values = []
+                for test_type in TEST_TYPES:
+                    if test_type in avg_metrics and "f1_binary" in avg_metrics[test_type]:
+                        f1_values.append(avg_metrics[test_type]["f1_binary"])
+                if f1_values:
+                    row["average_f1"] = sum(f1_values) / len(f1_values)
+            if "average_recall" not in row:
+                recall_values = []
+                for test_type in TEST_TYPES:
+                    if test_type in avg_metrics and "recall_binary" in avg_metrics[test_type]:
+                        recall_values.append(avg_metrics[test_type]["recall_binary"])
+                if recall_values:
+                    row["average_recall"] = sum(recall_values) / len(recall_values)
+            if "average_precision" not in row:
+                precision_values = []
+                for test_type in TEST_TYPES:
+                    if test_type in avg_metrics and "precision_binary" in avg_metrics[test_type]:
+                        precision_values.append(avg_metrics[test_type]["precision_binary"])
+                if precision_values:
+                    row["average_precision"] = sum(precision_values) / len(precision_values)
         rows.append(row)
     # Create DataFrame and sort by average F1 score
     df = pd.DataFrame(rows)
+    # Ensure all expected columns exist
+    for test_type in TEST_TYPES:
+        if f"{test_type}_f1" not in df.columns:
+            df[f"{test_type}_f1"] = None
+        if f"{test_type}_f1_binary" not in df.columns:
+            df[f"{test_type}_f1_binary"] = None
+        if f"{test_type}_recall_binary" not in df.columns:
+            df[f"{test_type}_recall_binary"] = None
+        if f"{test_type}_precision_binary" not in df.columns:
+            df[f"{test_type}_precision_binary"] = None
     if not df.empty and "average_f1" in df.columns:
         df = df.sort_values(by="average_f1", ascending=False)

src/populate.py CHANGED Viewed

@@ -6,277 +6,212 @@ import json
 import os
 import pandas as pd
 import tempfile
-from typing import Dict, Tuple, List
-from glob import glob
-from huggingface_hub import snapshot_download, hf_hub_download, HfApi
 from datasets import load_dataset
 from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
-from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
-from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
-def get_versioned_leaderboard_file(version="v0"):
     """
-    Get the versioned leaderboard file path.
     """
-    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
-    return f"{base_name}_{version}{ext}"
-def download_leaderboard_data(version="v0") -> bool:
     """
-    Download the latest leaderboard data from HuggingFace.
-    Args:
-        version: The dataset version to download
     """
     try:
-        # Create a temporary directory to download the submissions
-        temp_dir = os.path.join(CACHE_PATH, f"temp_submissions_{version}")
-        os.makedirs(temp_dir, exist_ok=True)
-        # Get the versioned leaderboard file
-        leaderboard_file = get_versioned_leaderboard_file(version)
-        # Download the entire repository
-        try:
-            snapshot_path = snapshot_download(
-                repo_id=RESULTS_DATASET_ID,
-                repo_type="dataset",
-                local_dir=temp_dir,
-                token=TOKEN,
-                ignore_patterns=["*.md", ".*"],
-                etag_timeout=30
-            )
-            # Process all submission files
-            all_entries = []
-            submission_files = []
-            # Look for submission files in the submissions directory
-            submissions_dir = os.path.join(snapshot_path, "submissions")
-            version_submissions_dir = os.path.join(snapshot_path, f"submissions_{version}")
-            # Check both standard and versioned submission directories
-            if os.path.exists(submissions_dir):
-                submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
-            if os.path.exists(version_submissions_dir):
-                submission_files.extend(glob(os.path.join(version_submissions_dir, "*.jsonl")))
-            # Also look for any versioned JSONL files in the root
-            submission_files.extend(glob(os.path.join(snapshot_path, f"*_{version}.jsonl")))
-            # If we're looking for v0 and no versioned files found, use generic ones
-            if version == "v0" and not submission_files:
-                submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
-            # Process each submission file
-            for file_path in submission_files:
-                entries, _ = process_jsonl_submission(file_path)
-                # Filter entries to those that match the version or don't have version specified
-                filtered_entries = [
-                    entry for entry in entries
-                    if entry.get("version", "v0") == version or "version" not in entry
-                ]
-                all_entries.extend(filtered_entries)
-            # Create leaderboard data structure
-            leaderboard_data = {
-                "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat(),
-                "version": version
-            }
-            # Save to local file
-            save_leaderboard_data(leaderboard_data, leaderboard_file)
-            return True
-        except Exception as e:
-            print(f"Error downloading repository: {e}")
-            # If we can't download the repository, try to download individual files
             try:
-                api = HfApi(token=TOKEN)
-                files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
-                # Look for versioned and regular files
-                submission_files = [
-                    f for f in files
-                    if (f.endswith(f'_{version}.jsonl') or
-                        f.startswith(f'submissions_{version}/') or
-                        (version == "v0" and f.endswith('.jsonl')))
-                ]
-                all_entries = []
-                for file_path in submission_files:
-                    try:
-                        local_path = hf_hub_download(
-                            repo_id=RESULTS_DATASET_ID,
-                            filename=file_path,
-                            repo_type="dataset",
-                            token=TOKEN
-                        )
-                        entries, _ = process_jsonl_submission(local_path)
-                        # Filter entries to those that match the version or don't have version specified
-                        filtered_entries = [
-                            entry for entry in entries
-                            if entry.get("version", "v0") == version or "version" not in entry
-                        ]
-                        all_entries.extend(filtered_entries)
-                    except Exception as file_error:
-                        print(f"Error downloading file {file_path}: {file_error}")
-                # Create leaderboard data structure
-                leaderboard_data = {
-                    "entries": all_entries,
-                    "last_updated": pd.Timestamp.now().isoformat(),
-                    "version": version
-                }
-                # Save to local file
-                save_leaderboard_data(leaderboard_data, leaderboard_file)
-                return True
-            except Exception as list_error:
-                print(f"Error listing repository files: {list_error}")
-            # If we can't download anything, create an empty leaderboard
-            if not os.path.exists(leaderboard_file):
-                empty_data = {
-                    "entries": [],
-                    "last_updated": pd.Timestamp.now().isoformat(),
-                    "version": version
-                }
-                save_leaderboard_data(empty_data, leaderboard_file)
-            return False
     except Exception as e:
-        print(f"Error downloading leaderboard data: {e}")
-        # Ensure we have at least an empty leaderboard file
-        leaderboard_file = get_versioned_leaderboard_file(version)
-        if not os.path.exists(leaderboard_file):
-            empty_data = {
-                "entries": [],
-                "last_updated": pd.Timestamp.now().isoformat(),
-                "version": version
-            }
-            save_leaderboard_data(empty_data, leaderboard_file)
-        return False
 def get_leaderboard_df(version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data as a DataFrame.
-    Args:
-        version: The dataset version to retrieve
     """
-    # Try to download the latest data
-    download_leaderboard_data(version=version)
-    # Load from local file
-    leaderboard_file = get_versioned_leaderboard_file(version)
-    leaderboard_data = load_leaderboard_data(leaderboard_file)
     # Convert to DataFrame
-    df = leaderboard_to_dataframe(leaderboard_data)
-    return df
 def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data filtered by a specific category.
-    Args:
-        category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
-        version: The dataset version to retrieve
-    Returns:
-        DataFrame with metrics for the specified category
     """
-    # Load the leaderboard data
-    leaderboard_file = get_versioned_leaderboard_file(version)
-    leaderboard_data = load_leaderboard_data(leaderboard_file)
     # Filter entries to only include those with data for the specified category
     filtered_entries = []
     for entry in leaderboard_data.get("entries", []):
-        # Check if the entry has data for this category
-        if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
-            # Create a new entry with just the overall info and this category's metrics
-            filtered_entry = {
-                "model_name": entry.get("model_name", "Unknown Model"),
-                "model_type": entry.get("model_type", "Unknown"),
-                "submission_date": entry.get("submission_date", ""),
-                "version": entry.get("version", version),
-            }
-            # Extract metrics for this category
             category_metrics = entry["per_category_metrics"][category]
-            # Add metrics for each test type
-            for test_type in category_metrics:
-                if test_type and isinstance(category_metrics[test_type], dict):
-                    for metric, value in category_metrics[test_type].items():
                         col_name = f"{test_type}_{metric}"
                         filtered_entry[col_name] = value
-            # Calculate average F1 for this category
             f1_values = []
-            for test_type in category_metrics:
-                if test_type and isinstance(category_metrics[test_type], dict) and "f1_binary" in category_metrics[test_type]:
-                    f1_values.append(category_metrics[test_type]["f1_binary"])
             if f1_values:
                 filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
-            # Add specific test type F1 scores for display
-            for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
-                if test_type in category_metrics and "f1_binary" in category_metrics[test_type]:
-                    filtered_entry[f"{test_type}_f1"] = category_metrics[test_type]["f1_binary"]
             filtered_entries.append(filtered_entry)
     # Create a new leaderboard data structure with the filtered entries
     filtered_leaderboard = {
         "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat()),
         "version": version
     }
     # Convert to DataFrame
-    df = leaderboard_to_dataframe(filtered_leaderboard)
-    return df
 def get_detailed_model_data(model_name: str, version="v0") -> Dict:
     """
     Get detailed data for a specific model.
-    Args:
-        model_name: The name of the model to get data for
-        version: The dataset version to retrieve
     """
-    leaderboard_file = get_versioned_leaderboard_file(version)
-    leaderboard_data = load_leaderboard_data(leaderboard_file)
-    for entry in leaderboard_data.get("entries", []):
-        # Check both the model name and version
-        entry_version = entry.get("version", "v0")
-        if entry.get("model_name") == model_name and (entry_version == version or entry_version is None):
-            return entry
     return {}

 import os
 import pandas as pd
 import tempfile
+from typing import Dict, List, Optional
+from datetime import datetime
+from huggingface_hub import hf_hub_download, HfApi
 from datasets import load_dataset
 from src.display.utils import GUARDBENCH_COLUMN, DISPLAY_COLS, CATEGORIES
+from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
+from src.leaderboard.processor import leaderboard_to_dataframe
+def get_latest_leaderboard(version="v0") -> Optional[Dict]:
     """
+    Get the latest leaderboard data from HuggingFace dataset.
+    Args:
+        version: Version tag interpolated into the remote filename
+            ``leaderboards/leaderboard_{version}.json``.
+    Returns:
+        The parsed JSON content of that file, or None if downloading,
+        opening or parsing it raised any exception.
     """
+    try:
+        # Try to download the leaderboard file
+        leaderboard_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"leaderboards/leaderboard_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        # NOTE(review): no encoding is passed to open(), so the platform
+        # default applies; confirm the JSON is always readable that way.
+        with open(leaderboard_path, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        # Every failure is printed to stdout and collapsed to None.
+        print(f"Error downloading leaderboard: {e}")
+        return None
+def get_model_entry(model_name: str, version="v0") -> Optional[Dict]:
     """
+    Get a specific model's entry from the entries folder.
+    Args:
+        model_name: Model name; "/" and " " are replaced by "_" before it
+            is used in the remote filename.
+        version: Version tag appended to the remote filename.
+    Returns:
+        The parsed JSON content of
+        ``entries/entry_{model_name_safe}_{version}.json``, or None if
+        downloading, opening or parsing it raised any exception.
     """
     try:
+        # Sanitise the name so it can be embedded in a single path segment.
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        entry_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"entries/entry_{model_name_safe}_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        # NOTE(review): no encoding is passed to open(), so the platform
+        # default applies; confirm the JSON is always readable that way.
+        with open(entry_path, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        # Every failure is printed to stdout and collapsed to None.
+        print(f"Error downloading model entry: {e}")
+        return None
+def get_all_entries(version="v0") -> List[Dict]:
+    """
+    Get all model entries from the entries folder.
+    Args:
+        version: Version tag; only files ending in ``_{version}.json`` are
+            loaded.
+    Returns:
+        A list with the parsed JSON content of every matching entry file
+        that could be downloaded and parsed. Returns an empty list if an
+        exception escapes the per-file handling (e.g. listing fails).
+    """
+    try:
+        api = HfApi(token=TOKEN)
+        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+        # Keep only files under "entries/" whose name ends with this version tag.
+        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+        entries = []
+        for entry_file in entry_files:
             try:
+                entry_path = hf_hub_download(
+                    repo_id=RESULTS_DATASET_ID,
+                    filename=entry_file,
+                    repo_type="dataset",
+                    token=TOKEN
+                )
+                with open(entry_path, 'r') as f:
+                    entry_data = json.load(f)
+                    entries.append(entry_data)
+            except Exception as e:
+                # A single bad entry is reported and skipped; the loop continues.
+                print(f"Error loading entry {entry_file}: {e}")
+        return entries
     except Exception as e:
+        # Failure outside the per-file handler: report it and return no entries.
+        print(f"Error listing entries: {e}")
+        return []
 def get_leaderboard_df(version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data as a DataFrame.
     """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
     # Convert to DataFrame
+    return leaderboard_to_dataframe(leaderboard_data)
 def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data filtered by a specific category.
     """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
     # Filter entries to only include those with data for the specified category
     filtered_entries = []
     for entry in leaderboard_data.get("entries", []):
+        # Copy all base fields
+        filtered_entry = {
+            "model_name": entry.get("model_name", "Unknown Model"),
+            "model_type": entry.get("model_type", "Unknown"),
+            "guard_model_type": entry.get("guard_model_type", "Unknown"),
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", version),
+            "base_model": entry.get("base_model", ""),
+            "revision": entry.get("revision", ""),
+            "precision": entry.get("precision", ""),
+            "weight_type": entry.get("weight_type", "")
+        }
+        if "per_category_metrics" in entry and category in entry["per_category_metrics"]:
             category_metrics = entry["per_category_metrics"][category]
+            # Add all metrics for each test type
+            for test_type, metrics in category_metrics.items():
+                if isinstance(metrics, dict):
+                    for metric, value in metrics.items():
                         col_name = f"{test_type}_{metric}"
                         filtered_entry[col_name] = value
+                        # Also add the non-binary version for F1 scores
+                        if metric == "f1_binary":
+                            filtered_entry[f"{test_type}_f1"] = value
+            # Calculate averages
             f1_values = []
+            recall_values = []
+            precision_values = []
+            for test_type in ["default_prompts", "jailbreaked_prompts", "default_answers", "jailbreaked_answers"]:
+                if test_type in category_metrics and isinstance(category_metrics[test_type], dict):
+                    if "f1_binary" in category_metrics[test_type]:
+                        f1_values.append(category_metrics[test_type]["f1_binary"])
+                    if "recall_binary" in category_metrics[test_type]:
+                        recall_values.append(category_metrics[test_type]["recall_binary"])
+                    if "precision_binary" in category_metrics[test_type]:
+                        precision_values.append(category_metrics[test_type]["precision_binary"])
+            # Add overall averages
             if f1_values:
                 filtered_entry["average_f1"] = sum(f1_values) / len(f1_values)
+            if recall_values:
+                filtered_entry["average_recall"] = sum(recall_values) / len(recall_values)
+            if precision_values:
+                filtered_entry["average_precision"] = sum(precision_values) / len(precision_values)
             filtered_entries.append(filtered_entry)
     # Create a new leaderboard data structure with the filtered entries
     filtered_leaderboard = {
         "entries": filtered_entries,
+        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
         "version": version
     }
+    print(filtered_leaderboard)
     # Convert to DataFrame
+    return leaderboard_to_dataframe(filtered_leaderboard)
 def get_detailed_model_data(model_name: str, version="v0") -> Dict:
     """
     Get detailed data for a specific model.
     """
+    # Try to get model's entry directly first
+    entry = get_model_entry(model_name, version)
+    if entry:
+        return entry
+    # If no direct entry found, try looking in the leaderboard
+    leaderboard_data = get_latest_leaderboard(version)
+    if leaderboard_data:
+        for entry in leaderboard_data.get("entries", []):
+            if entry.get("model_name") == model_name:
+                return entry
     return {}

src/submission/submit.py CHANGED Viewed

@@ -5,16 +5,19 @@ Handle submissions to the GuardBench leaderboard.
 import json
 import os
 import tempfile
-import uuid
 from datetime import datetime
 from typing import Dict, List, Tuple
 from huggingface_hub import HfApi
-from datasets import load_dataset, Dataset
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, RESULTS_DATASET_ID, TOKEN
-from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard, load_leaderboard_data
 def validate_submission(file_path: str) -> Tuple[bool, str]:
@@ -25,99 +28,194 @@ def validate_submission(file_path: str) -> Tuple[bool, str]:
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
-        # Additional validation could be added here
         return True, "Submission is valid"
     except Exception as e:
         return False, f"Error validating submission: {e}"
-def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str, version="v0") -> Tuple[bool, str]:
     """
-    Submit results to a HuggingFace dataset repository as individual files.
-    Args:
-        file_path: Path to the submission file
-        metadata: Metadata to include with the submission
-        dataset_id: The dataset repository ID
-        token: HuggingFace API token
-        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     try:
-        # Process the submission file to validate
-        entries, message = process_jsonl_submission(file_path)
-        if not entries:
-            return False, message
-        # Generate a unique submission ID
-        model_name = metadata.get("model_name", "unknown")
         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
-        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-        submission_id = f"{model_name_safe}_{timestamp}"
-        # Create an API instance
-        api = HfApi(token=token)
-        # Create a temporary file with metadata added
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
-            # Add metadata to each entry
-            for entry in entries:
-                # If the entry already has a model_name, don't override it
-                if "model_name" not in entry:
-                    entry["model_name"] = metadata.get("model_name")
-                # Add other metadata if not present
-                for key, value in metadata.items():
-                    if key != "model_name" and key not in entry:
-                        entry[key] = value
-                # Ensure version is set
-                entry["version"] = version
-                # Write to temp file
-                temp_file.write(json.dumps(entry) + "\n")
             temp_path = temp_file.name
-        # Upload the file to the version-specific directory
-        submission_path = f"submissions_{version}/{submission_id}_{version}.jsonl" if version != "v0" else f"submissions/{submission_id}.jsonl"
         api.upload_file(
             path_or_fileobj=temp_path,
-            path_in_repo=submission_path,
-            repo_id=dataset_id,
             repo_type="dataset",
-            commit_message=f"Add submission for {model_name} (version {version})"
         )
-        # Clean up the temporary file
         os.unlink(temp_path)
-        return True, f"Successfully uploaded submission for {model_name} to {dataset_id} (version {version})"
     except Exception as e:
-        return False, f"Error submitting to dataset: {e}"
 def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     """
     Process a submission to the GuardBench leaderboard.
-    Args:
-        file_path: Path to the submission file
-        metadata: Metadata to include with the submission
-        version: The version of the benchmark used (e.g., "v0", "v1")
     """
-    # Validate submission file
-    is_valid, validation_message = validate_submission(file_path)
-    if not is_valid:
-        return styled_error(validation_message)
-    # Add version to metadata
-    metadata["version"] = version
-    # Submit to HuggingFace dataset repository
-    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN, version=version)
-    if not success:
-        return styled_error(message)
-    return styled_message(f"Submission successful! {message}")

 import json
 import os
 import tempfile
 from datetime import datetime
 from typing import Dict, List, Tuple
+import shutil
 from huggingface_hub import HfApi
+from datasets import load_dataset
+from src.display.formatting import styled_error, styled_message
+from src.envs import RESULTS_DATASET_ID, TOKEN
+from src.leaderboard.processor import process_jsonl_submission
+from guardbench.evaluator import Evaluator
+from guardbench.context import GuardbenchContext
+from guardbench.models_config import ModelType
 def validate_submission(file_path: str) -> Tuple[bool, str]:
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
         return True, "Submission is valid"
     except Exception as e:
         return False, f"Error validating submission: {e}"
def submit_entry_to_hub(entry: Dict, model_name: str, version="v0") -> Tuple[bool, str]:
    """
    Submit a model's evaluation entry to the HuggingFace dataset.

    The entry is serialized to a temporary JSON file and uploaded to
    ``entries/entry_{name}_{version}.json`` in ``RESULTS_DATASET_ID``, where
    ``name`` is ``model_name`` with ``/`` and spaces replaced by underscores.

    Args:
        entry: JSON-serializable evaluation entry.
        model_name: Original model name (used for the path and commit message).
        version: Benchmark version the entry belongs to.

    Returns:
        ``(True, message)`` on success, ``(False, error_message)`` when
        serialization or the upload fails. Exceptions are not propagated.
    """
    temp_path = None
    try:
        # Create safe model name for file path
        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
        entry_path = f"entries/entry_{model_name_safe}_{version}.json"

        # delete=False: the file must outlive the `with` block so that
        # upload_file can read it by path; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            temp_path = temp_file.name
            json.dump(entry, temp_file, indent=2)

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=entry_path,
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Add evaluation entry for {model_name} (version {version})"
        )
        return True, f"Successfully uploaded evaluation entry for {model_name}"
    except Exception as e:
        return False, f"Error submitting entry to dataset: {e}"
    finally:
        # Always remove the temp file, including when dump/upload failed.
        # A failed removal must not turn a successful upload into an error.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_path}: {cleanup_error}")
def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
    """
    Submit updated leaderboard to the HuggingFace dataset.

    Wraps ``entries`` together with the current timestamp and ``version`` and
    uploads the result to ``leaderboards/leaderboard_{version}.json`` in
    ``RESULTS_DATASET_ID``.

    Args:
        entries: JSON-serializable model entries making up the leaderboard.
        version: Benchmark version of the leaderboard being written.

    Returns:
        ``(True, message)`` on success, ``(False, error_message)`` when
        serialization or the upload fails. Exceptions are not propagated.
    """
    temp_path = None
    try:
        leaderboard_data = {
            "entries": entries,
            "last_updated": datetime.now().isoformat(),
            "version": version
        }

        # delete=False: the file must outlive the `with` block so that
        # upload_file can read it by path; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
            temp_path = temp_file.name
            json.dump(leaderboard_data, temp_file, indent=2)

        api = HfApi(token=TOKEN)
        api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo=f"leaderboards/leaderboard_{version}.json",
            repo_id=RESULTS_DATASET_ID,
            repo_type="dataset",
            commit_message=f"Update leaderboard for version {version}"
        )
        return True, "Leaderboard updated successfully"
    except Exception as e:
        return False, f"Error updating leaderboard: {e}"
    finally:
        # Always remove the temp file, including when dump/upload failed.
        # A failed removal must not turn a successful upload into an error.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"Error removing temporary file {temp_path}: {cleanup_error}")
 def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     """
     Process a submission to the GuardBench leaderboard.
     """
+    try:
+        # Validate submission
+        is_valid, validation_message = validate_submission(file_path)
+        if not is_valid:
+            return styled_error(validation_message)
+        # Get GuardBench results directory path
+        guardbench_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "guard-bench-submodule")
+        results_dir = os.path.join(guardbench_dir, "results")
+        os.makedirs(results_dir, exist_ok=True)
+        # Copy submission to GuardBench results directory
+        model_name = metadata.get("model_name", "unknown")
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        guard_model_type = metadata.get("guard_model_type", "unknown")
+        target_file = os.path.join(results_dir + "/guardbench_dataset_1k_public", f"{model_name_safe}.jsonl")
+        # Upload raw submission file
+        api = HfApi(token=TOKEN)
+        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=submission_path,
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Add raw submission for {model_name}"
+        )
+        shutil.copy2(file_path, target_file)
+        try:
+            # Initialize GuardBench context
+            ctx = GuardbenchContext()
+            # Set results directory
+            ctx.results_dir = results_dir
+            # Set bench name from the results directory
+            ctx.bench_name = "guardbench_dataset_1k_public"
+            # Load dataset
+            ctx.load_dataset("whitecircle-ai/guardbench_dataset_1k_public")
+            # Mark as initialized
+            ctx.is_initialized = True
+            evaluator = Evaluator(ctx, force=True, using_cached=True)
+            # Run evaluation and get entry
+            evaluator.evaluate_model(model_name_safe, str(guard_model_type).lower())
+            # Get the entry from results
+            with open(os.path.join(results_dir + "/" + ctx.bench_name, "leaderboard.json"), 'r') as f:
+                results_data = json.load(f)
+                model_entry = next(
+                    (entry for entry in results_data.get("entries", [])
+                     if entry.get("model_name") == model_name_safe),
+                    None
+                )
+            if not model_entry:
+                return styled_error("No evaluation results found")
+            # Add metadata to entry
+            model_entry.update({
+                "model_name": metadata.get("model_name"),  # Use original model name
+                "model_type": metadata.get("model_type"),
+                "guard_model_type": str(metadata.get("guard_model_type")).lower(),
+                "base_model": metadata.get("base_model"),
+                "revision": metadata.get("revision"),
+                "precision": metadata.get("precision"),
+                "weight_type": metadata.get("weight_type"),
+                "version": version,
+                "submission_date": datetime.now().isoformat()
+            })
+            # Submit entry to entries folder
+            success, message = submit_entry_to_hub(model_entry, model_name, version)
+            if not success:
+                return styled_error(message)
+            # Get all entries from HF dataset
+            api = HfApi(token=TOKEN)
+            files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+            entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+            all_entries = []
+            for entry_file in entry_files:
+                try:
+                    entry_path = api.hf_hub_download(
+                        repo_id=RESULTS_DATASET_ID,
+                        filename=entry_file,
+                        repo_type="dataset",
+                    )
+                    with open(entry_path, 'r') as f:
+                        entry_data = json.load(f)
+                        all_entries.append(entry_data)
+                except Exception as e:
+                    print(f"Error loading entry {entry_file}: {e}")
+            # Update leaderboard with all entries
+            success, message = submit_leaderboard_to_hub(all_entries, version)
+            if not success:
+                return styled_error(message)
+            return styled_message(f"Submission successful! Model evaluated and leaderboard updated.")
+        except Exception as eval_error:
+            return styled_error(f"Error during evaluation: {eval_error}")
+    except Exception as e:
+        return styled_error(f"Error processing submission: {e}")
+    finally:
+        # Clean up temporary files
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+            if os.path.exists(target_file):
+                os.remove(target_file)
+        except:
+            pass