import json
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
from pydantic import BaseModel, field_validator

# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")

# Initialize with default data
DEFAULT_DATA = [{
    "model_name": "example/model",
    "bleu": 0.5,
    "llm_pass_1": 0.5,
    "llm_pass_5": 0.5,
    "llm_pass_10": 0.5,
    "metrics": {
        "readability": 5, "relevance": 5, "explanation_clarity": 5,
        "problem_identification": 5, "actionability": 5, "completeness": 5,
        "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
    }
}]
# --------------- Data models ---------------
class Metrics(BaseModel):
    readability: int
    relevance: int
    explanation_clarity: int
    problem_identification: int
    actionability: int
    completeness: int
    specificity: int
    contextual_adequacy: int
    consistency: int
    brevity: int

    @field_validator("*")
    @classmethod
    def metric_range(cls, v: int):
        if not 0 <= v <= 10:
            raise ValueError("Multi-metric scores must be between 0 and 10")
        return v


class LeaderboardEntry(BaseModel):
    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    @field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    @classmethod
    def score_range(cls, v: float):
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores must be between 0 and 1")
        return v
# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    """Load leaderboard data with persistent storage support."""
    if not LEADERBOARD_PATH.exists():
        # Create default example data
        _save_leaderboard(DEFAULT_DATA)
        return DEFAULT_DATA
    try:
        with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
        return data.get("leaderboard", [])
    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        return []


def _save_leaderboard(data: List[Dict]):
    """Save leaderboard data to persistent storage."""
    try:
        to_store = {"leaderboard": data}
        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
            json.dump(to_store, f, indent=2)
    except Exception as e:
        print(f"Error saving leaderboard: {e}")
# --------------- Table data functions ---------------
def _table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get main metrics table data, sorted by Pass@1."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    # Sort a copy so the shared state list is not mutated in place
    data = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
    table_rows = []
    for entry in data:
        row = [
            entry["model_name"],
            entry["bleu"],
            entry["llm_pass_1"],
            entry["llm_pass_5"],
            entry["llm_pass_10"],
        ]
        table_rows.append(row)
    return table_rows


def _multimetric_table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get multi-metric table data, sorted by Pass@1."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    data = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
    table_rows = []
    for entry in data:
        row = [
            entry["model_name"],
            entry["metrics"]["readability"],
            entry["metrics"]["relevance"],
            entry["metrics"]["explanation_clarity"],
            entry["metrics"]["problem_identification"],
            entry["metrics"]["actionability"],
            entry["metrics"]["completeness"],
            entry["metrics"]["specificity"],
            entry["metrics"]["contextual_adequacy"],
            entry["metrics"]["consistency"],
            entry["metrics"]["brevity"],
        ]
        table_rows.append(row)
    return table_rows
# --------------- Gradio callbacks ---------------
def submit_model(
    current_data: List[Dict],
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: int,
    relevance: int,
    explanation_clarity: int,
    problem_identification: int,
    actionability: int,
    completeness: int,
    specificity: int,
    contextual_adequacy: int,
    consistency: int,
    brevity: int,
):
    """Validate and append a new model entry to the leaderboard."""
    if not model_name or not model_name.strip():
        return current_data, _table_data(current_data), _multimetric_table_data(current_data), "Submission failed: model name must not be empty."
    try:
        entry = LeaderboardEntry(
            model_name=model_name.strip(),
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        return current_data, _table_data(current_data), _multimetric_table_data(current_data), f"Submission failed: {e}"

    # Use current data from state
    data = current_data.copy() if current_data else []
    # Replace the existing entry for this model, if any
    data = [d for d in data if d["model_name"] != entry.model_name]
    data.append(entry.model_dump())
    _save_leaderboard(data)
    return data, _table_data(data), _multimetric_table_data(data), "Submission recorded!"
# --------------- Interface ---------------
with gr.Blocks(title="CodeReview Leaderboard") as demo:
    gr.Markdown(
        "# CodeReview Leaderboard\n"
        "Submit your model results below. The leaderboard is sorted by **Pass@1**."
    )

    # Initialize table data
    initial_leaderboard_data = _load_leaderboard()
    initial_data = _table_data(initial_leaderboard_data)
    initial_multimetric_data = _multimetric_table_data(initial_leaderboard_data)

    # State to store leaderboard data
    leaderboard_state = gr.State(value=initial_leaderboard_data)

    leaderboard_df = gr.Dataframe(
        headers=["Model", "BLEU", "Pass@1", "Pass@5", "Pass@10"],
        value=initial_data,
        label="Main Metrics Leaderboard",
        interactive=False,
    )
    multimetric_df = gr.Dataframe(
        headers=[
            "Model", "Readability", "Relevance", "Explanation Clarity",
            "Problem Identification", "Actionability", "Completeness",
            "Specificity", "Contextual Adequacy", "Consistency", "Brevity",
        ],
        value=initial_multimetric_data,
        label="Multi-Metric Scores",
        interactive=False,
    )
| gr.Markdown("## π Submit new model results") | |
| with gr.Accordion("Submission form", open=False): | |
| with gr.Row(): | |
| model_name_inp = gr.Text(label="Model name (org/model)", value="") | |
| bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0) | |
| pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0) | |
| pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0) | |
| pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0) | |
| gr.Markdown("### Multi-metric subjective scores (0 β 10)") | |
| with gr.Row(): | |
| readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability") | |
| relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance") | |
| explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity") | |
| problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification") | |
| actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability") | |
| completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness") | |
| specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity") | |
| contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy") | |
| consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency") | |
| brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity") | |
| submit_btn = gr.Button("Submit") | |
| status_markdown = gr.Markdown("") | |
    submit_btn.click(
        fn=submit_model,
        inputs=[
            leaderboard_state,
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_state, leaderboard_df, multimetric_df, status_markdown],
        api_name="submit_model",
    )
# ----------------- Launch -----------------
if __name__ == "__main__":
    demo.queue().launch()

# For HF Spaces runtime (gradio SDK), expose `demo`
app = demo
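
# A programmatic submission can go through the named API endpoint registered
# above. A minimal sketch with gradio_client ("your-org/codereview-leaderboard"
# is a placeholder Space id, and all values shown are illustrative; the gr.State
# input is managed server-side, so only the visible inputs are passed):
#
#   from gradio_client import Client
#   client = Client("your-org/codereview-leaderboard")
#   client.predict(
#       "my-org/my-model",              # model_name
#       0.42, 0.31, 0.55, 0.63,         # BLEU, Pass@1, Pass@5, Pass@10
#       7, 8, 6, 7, 8, 7, 6, 7, 8, 5,   # ten multi-metric scores (0 to 10)
#       api_name="/submit_model",
#   )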