| """ | |
| GuardBench Leaderboard Application | |
| """ | |
| import os | |
| import json | |
| import tempfile | |
| import logging | |
| import gradio as gr | |
| from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns | |
| import pandas as pd | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| from src.about import ( | |
| CITATION_BUTTON_LABEL, | |
| CITATION_BUTTON_TEXT, | |
| EVALUATION_QUEUE_TEXT, | |
| INTRODUCTION_TEXT, | |
| LLM_BENCHMARKS_TEXT, | |
| TITLE, | |
| ) | |
| from src.display.css_html_js import custom_css | |
| from src.display.utils import ( | |
| GUARDBENCH_COLUMN, | |
| DISPLAY_COLS, | |
| METRIC_COLS, | |
| HIDDEN_COLS, | |
| NEVER_HIDDEN_COLS, | |
| CATEGORIES, | |
| TEST_TYPES, | |
| ModelType, | |
| Precision, | |
| WeightType | |
| ) | |
| from src.display.formatting import styled_message, styled_error, styled_warning | |
| from src.envs import ( | |
| ADMIN_USERNAME, | |
| ADMIN_PASSWORD, | |
| RESULTS_DATASET_ID, | |
| SUBMITTER_TOKEN, | |
| TOKEN, | |
| DATA_PATH | |
| ) | |
| from src.populate import get_leaderboard_df, download_leaderboard_data, get_category_leaderboard_df | |
| from src.submission.submit import process_submission | |

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ensure data directory exists
os.makedirs(DATA_PATH, exist_ok=True)

# Available benchmark versions
BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"
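# Additional benchmark releases can be appended here (e.g. "v1"); the
# version dropdown below is populated directly from BENCHMARK_VERSIONS.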

# Initialize leaderboard data
try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    logger.error(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()


def init_leaderboard(dataframe):
    """
    Initialize the leaderboard component.
    """
    if dataframe is None or dataframe.empty:
        # Create an empty dataframe with the right columns
        columns = [getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS]
        dataframe = pd.DataFrame(columns=columns)
        logger.warning("Initializing empty leaderboard")

    return Leaderboard(
        value=dataframe,
        datatype=[getattr(GUARDBENCH_COLUMN, col).type for col in DISPLAY_COLS],
        select_columns=SelectColumns(
            default_selection=[getattr(GUARDBENCH_COLUMN, col).name for col in DISPLAY_COLS],
            cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
            label="Select Columns to Display:",
        ),
        search_columns=[GUARDBENCH_COLUMN.model_name.name],
        hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
        filter_columns=[
            ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),
        ],
        interactive=False,
    )
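
# Note: the event handlers wired up in the UI below refresh these views by
# returning fresh Leaderboard instances from init_leaderboard rather than
# mutating the existing components in place.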


def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    submission_file: tempfile._TemporaryFileWrapper,
    version: str,
):
    """
    Handle submission of results with model metadata.
    """
    if submission_file is None:
        return styled_error("No submission file provided")
    if not model_name:
        return styled_error("Model name is required")
    if not model_type:
        return styled_error("Please select a model type")

    file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    # Add metadata to the submission
    metadata = {
        "model_name": model_name,
        "base_model": base_model,
        "revision": revision if revision else "main",
        "precision": precision,
        "weight_type": weight_type,
        "model_type": model_type,
        "version": version,
    }

    # Process the submission
    result = process_submission(file_path, metadata, version=version)

    # Refresh the leaderboard data
    global LEADERBOARD_DF
    try:
        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")

    return result
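
# process_submission is expected to return a user-facing status message
# (see the styled_* formatting helpers imported above), which is rendered
# into the Submit tab's result_output component below.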


def refresh_data(version=CURRENT_VERSION):
    """
    Refresh the leaderboard data from HuggingFace.
    """
    global LEADERBOARD_DF
    try:
        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Scheduled refresh of leaderboard data completed")
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")
    return LEADERBOARD_DF


def update_leaderboards(version):
    """
    Update all leaderboard components with data for the selected version.
    """
    new_df = get_leaderboard_df(version=version)
    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
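
# The list returned by update_leaderboards must stay aligned with the
# `outputs` wiring below: the overall leaderboard first, followed by one
# leaderboard per entry in CATEGORIES, in the same order.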


# Create Gradio app
demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)

    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Column(scale=1):
            version_selector = gr.Dropdown(
                choices=BENCHMARK_VERSIONS,
                label="Benchmark Version",
                value=CURRENT_VERSION,
                interactive=True,
                elem_classes="version-selector",
            )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
            refresh_button = gr.Button("Refresh Leaderboard")

            # Keep direct references to the per-category leaderboards so the
            # event handlers below can target them without digging through
            # category_tabs.children.
            category_leaderboards = []

            # Create tabs for each category
            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                # First tab for average metrics across all categories
                with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                    leaderboard = init_leaderboard(LEADERBOARD_DF)

                # Create a tab for each category
                for category in CATEGORIES:
                    with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                        category_leaderboards.append(init_leaderboard(category_df))

            # Refresh button functionality: the selected version is read via
            # `inputs` because `version_selector.value` only holds the
            # component's initial value, not the live selection.
            refresh_button.click(
                fn=update_leaderboards,
                inputs=[version_selector],
                outputs=[leaderboard] + category_leaderboards,
            )

        with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            with gr.Row():
                file_input = gr.File(
                    label="Upload JSONL Results File",
                    file_types=[".jsonl"],
                )

            submit_button = gr.Button("Submit Results")
            result_output = gr.Markdown()

            submit_button.click(
                fn=submit_results,
                inputs=[
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    file_input,
                    version_selector,
                ],
                outputs=result_output,
            )

    # Version selector functionality
    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=[leaderboard] + category_leaderboards,
    )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=10,
                elem_id="citation-button",
                show_copy_button=True,
            )

        with gr.Accordion("ℹ️ Dataset Information", open=False):
            dataset_info = gr.Markdown(f"""
## Dataset Information

Results are stored in the HuggingFace dataset:
[{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})

Last updated: {pd.Timestamp.now(tz='UTC').strftime("%Y-%m-%d %H:%M:%S UTC")}
""")


scheduler = BackgroundScheduler()
scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
scheduler.start()
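# Note: the scheduled job rebinds the module-level LEADERBOARD_DF but does
# not push updates into the already-rendered UI; visitors get fresh data
# through the refresh button or the version selector.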

# Launch the app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)