Commit 527d3c4 · Alex committed · 1 parent: 9d40219
app fixed
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title:
+title: CodeReview Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo

@@ -7,40 +7,36 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: mit
-short_description:
+short_description: CodeReview Leaderboard for evaluating code review models
 sdk_version: 5.19.0
+storage: persistent
 ---
 
-# 
-
-
-
-
-
-
-"config": {
-    "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-    "model_name": "path of the model on the hub: org/model",
-    "model_sha": "revision on the hub",
-},
-"results": {
-    "task_name": {
-        "metric_name": score,
-    },
-    "task_name2": {
-        "metric_name": score,
-    }
-}
-}
-```
+# CodeReview Leaderboard
+
+A leaderboard for evaluating code review models with BLEU, Pass@K metrics, and multi-dimensional subjective scores.
+
+## Metrics
+
+### Main Metrics (0-1 scale)
 
-
+- **BLEU**: Text similarity score
+- **Pass@1, Pass@5, Pass@10**: LLM-based exact match at different attempts
 
-
+### Multi-Metrics (0-10 scale)
 
-
+- Readability, Relevance, Explanation Clarity
+- Problem Identification, Actionability, Completeness
+- Specificity, Contextual Adequacy, Consistency, Brevity
+
+## Submission
+
+Submit your model results through the web interface or via API:
+
+```bash
+curl -X POST https://kenkaneki--codereviewbench.hf.space/api/submit_model \
+  -H "Content-Type: application/json" \
+  -d '{"data": ["org/model", 0.68, 0.73, 0.82, 0.87, 8, 7, 8, 7, 6, 7, 6, 7, 6, 5]}'
+```
 
-
-  - the main table' columns names and properties in `src/display/utils.py`
-  - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-  - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+Results are sorted by **Pass@1** in descending order.
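Note: the curl command added to the README can also be issued from Python. The sketch below is not part of this commit; it simply mirrors the same request, assuming the Space keeps exposing `/api/submit_model` with a `{"data": [...]}` payload and that the array order follows the README listing (BLEU, Pass@1, Pass@5, Pass@10, then the ten 0-10 multi-metric scores).

```python
# Sketch only: mirrors the curl example from the README diff above.
# The endpoint path and field order are taken from the README, not confirmed
# beyond what this commit shows.
import requests

payload = {
    "data": [
        "org/model",                    # model id on the Hub
        0.68, 0.73, 0.82, 0.87,         # BLEU, Pass@1, Pass@5, Pass@10 (0-1 scale)
        8, 7, 8, 7, 6, 7, 6, 7, 6, 5,   # ten multi-metric scores (0-10 scale)
    ]
}

resp = requests.post(
    "https://kenkaneki--codereviewbench.hf.space/api/submit_model",
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```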
app.py CHANGED

@@ -7,7 +7,6 @@ from pydantic import BaseModel, Field, field_validator
 
 # --------------- Configuration ---------------
 LEADERBOARD_PATH = Path("leaderboard_data.json")
-DEFAULT_MODEL_NAME = "example/model"
 
 # --------------- Data models ---------------
 class Metrics(BaseModel):

@@ -47,44 +46,47 @@ class LeaderboardEntry(BaseModel):
 # --------------- Persistence helpers ---------------
 
 def _load_leaderboard() -> List[Dict]:
+    """Load leaderboard data with persistent storage support."""
     if not LEADERBOARD_PATH.exists():
+        # Create default example data
+        default_data = [{
+            "model_name": "example/model",
+            "bleu": 0.5,
+            "llm_pass_1": 0.5,
+            "llm_pass_5": 0.5,
+            "llm_pass_10": 0.5,
+            "metrics": {
+                "readability": 5, "relevance": 5, "explanation_clarity": 5,
+                "problem_identification": 5, "actionability": 5, "completeness": 5,
+                "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
+            }
+        }]
+        _save_leaderboard(default_data)
+        return default_data
+
+    try:
+        with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
+            data = json.load(f)
+        return data.get("leaderboard", [])
+    except Exception as e:
+        print(f"Error loading leaderboard: {e}")
         return []
-    with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
-        data = json.load(f)
-    return data.get("leaderboard", [])
 
 
 def _save_leaderboard(data: List[Dict]):
-
-
-
-
-
-
-
-
-    """Flatten nested metrics so that every metric is a column."""
-    flat = {
-        "Model": entry["model_name"],
-        "BLEU": entry["bleu"],
-        "Pass@1": entry["llm_pass_1"],
-        "Pass@5": entry["llm_pass_5"],
-        "Pass@10": entry["llm_pass_10"],
-        "Readability": entry["metrics"]["readability"],
-        "Relevance": entry["metrics"]["relevance"],
-        "Explanation Clarity": entry["metrics"]["explanation_clarity"],
-        "Problem Identification": entry["metrics"]["problem_identification"],
-        "Actionability": entry["metrics"]["actionability"],
-        "Completeness": entry["metrics"]["completeness"],
-        "Specificity": entry["metrics"]["specificity"],
-        "Contextual Adequacy": entry["metrics"]["contextual_adequacy"],
-        "Consistency": entry["metrics"]["consistency"],
-        "Brevity": entry["metrics"]["brevity"],
-    }
-    return flat
+    """Save leaderboard data to persistent storage."""
+    try:
+        to_store = {"leaderboard": data}
+        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
+            json.dump(to_store, f, indent=2)
+    except Exception as e:
+        print(f"Error saving leaderboard: {e}")
+
 
+# --------------- Table data functions ---------------
 
 def _table_data() -> List[List]:
+    """Get main metrics table data."""
     data = _load_leaderboard()
     if not data:
         return []

@@ -104,6 +106,7 @@ def _table_data() -> List[List]:
 
 
 def _multimetric_table_data() -> List[List]:
+    """Get multi-metric table data."""
     data = _load_leaderboard()
     if not data:
         return []

@@ -184,34 +187,6 @@ def submit_model(
 with gr.Blocks(title="CodeReview Leaderboard") as demo:
     gr.Markdown("""# 🏆 CodeReview Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
 
-    # Create initial example data if file doesn't exist
-    if not LEADERBOARD_PATH.exists():
-        example_data = {
-            "leaderboard": [
-                {
-                    "model_name": "example/model",
-                    "bleu": 0.5,
-                    "llm_pass_1": 0.5,
-                    "llm_pass_5": 0.5,
-                    "llm_pass_10": 0.5,
-                    "metrics": {
-                        "readability": 5,
-                        "relevance": 5,
-                        "explanation_clarity": 5,
-                        "problem_identification": 5,
-                        "actionability": 5,
-                        "completeness": 5,
-                        "specificity": 5,
-                        "contextual_adequacy": 5,
-                        "consistency": 5,
-                        "brevity": 5
-                    }
-                }
-            ]
-        }
-        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
-            json.dump(example_data, f, indent=2)
-
     # Initialize table data
     initial_data = _table_data()
     initial_multimetric_data = _multimetric_table_data()
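For reference, the diff above shows both shapes of a leaderboard record: the flat row submitted through the API (the `data` array in the README) and the nested entry that `_save_leaderboard` persists in `leaderboard_data.json`. The sketch below is not part of the commit, and the real conversion lives in `submit_model`, which this diff does not show; it only illustrates one plausible mapping, assuming the row order matches the README metric listing.

```python
# Illustrative only: submit_model() is not shown in this diff, so the exact
# conversion is an assumption. Entry keys match the default data created in
# _load_leaderboard(); the row order is inferred from the README.
row = ["org/model", 0.68, 0.73, 0.82, 0.87, 8, 7, 8, 7, 6, 7, 6, 7, 6, 5]

multi_metric_keys = [
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
]

entry = {
    "model_name": row[0],
    "bleu": row[1],
    "llm_pass_1": row[2],
    "llm_pass_5": row[3],
    "llm_pass_10": row[4],
    "metrics": dict(zip(multi_metric_keys, row[5:])),
}

# _save_leaderboard() wraps the entries as {"leaderboard": [...]} before
# writing JSON, and _load_leaderboard() reads that wrapper back.
```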