| """ | |
| Process CodeReview Bench leaderboard data and submissions. | |
| """ | |
| import json | |
| import os | |
| import pandas as pd | |
| from datetime import datetime | |
| from typing import Dict, List, Tuple, Optional | |
| import numpy as np | |
| from src.display.utils import ( | |
| CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES, | |
| MULTIMETRIC_METRICS, EXACT_MATCH_METRICS | |
| ) | |
def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
    """
    Process a JSONL submission file for CodeReview Bench.

    Args:
        file_path: Path to the JSONL submission file

    Returns:
        Tuple of (entries_list, message)
    """
    try:
        entries = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)

                    # Validate required fields
                    required_fields = ['model_name', 'programming_language', 'comment_language']
                    missing_fields = [field for field in required_fields if field not in entry]
                    if missing_fields:
                        return [], f"Missing required fields {missing_fields} in line {line_num}"

                    # Validate metrics exist
                    has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
                    has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
                    if not has_multimetric and not has_exact_match:
                        return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"

                    entries.append(entry)
                except json.JSONDecodeError as e:
                    return [], f"Invalid JSON in line {line_num}: {e}"

        if not entries:
            return [], "No valid entries found in submission file"

        return entries, f"Successfully processed {len(entries)} entries"
    except Exception as e:
        return [], f"Error processing submission: {e}"

def calculate_overall_score(entry: Dict) -> float:
    """
    Calculate overall score for a CodeReview Bench entry.

    Args:
        entry: Dictionary containing model evaluation results

    Returns:
        Overall score as float
    """
    # Calculate multimetric average
    multimetric_scores = []
    for metric in MULTIMETRIC_METRICS:
        if metric in entry and isinstance(entry[metric], (int, float)):
            multimetric_scores.append(entry[metric])
    multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0

    # Calculate exact match average
    exact_match_scores = []
    for metric in EXACT_MATCH_METRICS:
        if metric in entry and isinstance(entry[metric], (int, float)):
            exact_match_scores.append(entry[metric])
    exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0

    # Weighted combination (can be adjusted based on requirements)
    overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
    return overall_score

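# Worked example (illustrative numbers): with a multimetric average of 0.80 and
# an exact-match average of 0.50, the weighted overall score is
# 0.80 * 0.7 + 0.50 * 0.3 = 0.71.
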
def load_leaderboard_data(file_path: str) -> Dict:
    """
    Load the leaderboard data from a JSON file.
    """
    if not os.path.exists(file_path):
        version = "v0"
        if "_v" in file_path:
            version = file_path.split("_")[-1].split(".")[0]
        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}

    with open(file_path, 'r') as f:
        data = json.load(f)

    # Ensure version field exists
    if "version" not in data:
        version = "v0"
        if "_v" in file_path:
            version = file_path.split("_")[-1].split(".")[0]
        data["version"] = version

    return data

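# Example of the version parsing above (hypothetical file name):
# "leaderboard_v2.json" -> split("_")[-1] == "v2.json" -> split(".")[0] == "v2".
# A path without "_v" in it falls back to "v0".
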
def save_leaderboard_data(data: Dict, file_path: str) -> None:
    """
    Save the leaderboard data to a JSON file.
    """
    # Ensure the directory exists; skip when the path has no directory
    # component, since os.makedirs("") would raise an error
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Update the last_updated timestamp
    data["last_updated"] = datetime.now().isoformat()

    # Ensure version is set
    if "version" not in data:
        version = "v0"
        if "_v" in file_path:
            version = file_path.split("_")[-1].split(".")[0]
        data["version"] = version

    with open(file_path, 'w') as f:
        json.dump(data, f, indent=2)

def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
    """
    Convert leaderboard data to a pandas DataFrame for display.
    """
    rows = []
    for entry in leaderboard_data.get("entries", []):
        model_name = entry.get("model_name", "Unknown Model")

        # Extract basic metadata
        row = {
            "model_name": model_name,
            "model_type": entry.get("model_type", "Unknown"),
            "mode": entry.get("mode", "Strict"),
            "submission_date": entry.get("submission_date", ""),
            "version": entry.get("version", "v0"),
            "review_model_type": entry.get("review_model_type", "custom").lower()
        }

        # Add additional metadata fields if present
        for key in ["base_model", "revision", "precision", "weight_type", "topic",
                    "programming_language", "comment_language"]:
            if key in entry:
                row[key] = entry[key]

        # Add multimetric scores
        for metric in MULTIMETRIC_METRICS:
            row[metric] = entry[metric] if metric in entry else pd.NA

        # Add exact match metrics
        for metric in EXACT_MATCH_METRICS:
            row[metric] = entry[metric] if metric in entry else pd.NA

        # Calculate aggregated metrics
        multimetric_scores = [entry[metric] for metric in MULTIMETRIC_METRICS
                              if metric in entry and pd.notna(entry[metric])]
        exact_match_scores = [entry[metric] for metric in EXACT_MATCH_METRICS
                              if metric in entry and pd.notna(entry[metric])]
        row["multimetric_average"] = np.mean(multimetric_scores) if multimetric_scores else pd.NA
        row["exact_match_average"] = np.mean(exact_match_scores) if exact_match_scores else pd.NA

        # Calculate overall score
        row["overall_score"] = calculate_overall_score(entry)

        # Add language-specific metrics if available
        for lang in COMMENT_LANGUAGES:
            for metric in ["readability", "relevance", "overall_score"]:
                lang_key = f"{lang}_{metric}"
                row[lang_key] = entry[lang_key] if lang_key in entry else pd.NA

        # Add evaluation count
        row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))

        rows.append(row)

    # Create DataFrame and sort by overall score
    df = pd.DataFrame(rows)

    # Ensure all expected columns exist
    for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
        if metric not in df.columns:
            df[metric] = pd.NA

    # Sort by overall score (descending)
    if not df.empty:
        df = df.sort_values(by="overall_score", ascending=False, na_position='last')

    # Ensure summary columns exist
    summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
    for col in summary_cols:
        if col not in df.columns:
            df[col] = pd.NA

    return df

def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
    """
    Add new entries to the leaderboard, replacing any with the same model name and version.
    """
    # Create a mapping of existing entries by model name and version
    existing_entries = {
        (entry["model_name"], entry.get("version", "v0")): i
        for i, entry in enumerate(leaderboard_data.get("entries", []))
    }

    # Process each new entry
    for new_entry in new_entries:
        model_name = new_entry.get("model_name")
        version = new_entry.get("version", "v0")

        # Add calculated metrics
        new_entry["overall_score"] = calculate_overall_score(new_entry)

        # Calculate averages
        multimetric_scores = [new_entry[metric] for metric in MULTIMETRIC_METRICS
                              if metric in new_entry and pd.notna(new_entry[metric])]
        exact_match_scores = [new_entry[metric] for metric in EXACT_MATCH_METRICS
                              if metric in new_entry and pd.notna(new_entry[metric])]
        if multimetric_scores:
            new_entry["multimetric_average"] = np.mean(multimetric_scores)
        if exact_match_scores:
            new_entry["exact_match_average"] = np.mean(exact_match_scores)

        if (model_name, version) in existing_entries:
            # Replace existing entry
            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
        else:
            # Add new entry
            leaderboard_data.setdefault("entries", []).append(new_entry)

    # Update the last_updated timestamp
    leaderboard_data["last_updated"] = datetime.now().isoformat()

    return leaderboard_data
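
# Minimal end-to-end sketch of how the functions above fit together.
# Illustrative only: the file paths below are hypothetical, and the flow is an
# assumption about how the Space wires these helpers together.
if __name__ == "__main__":
    entries, message = process_jsonl_submission("submissions/example.jsonl")
    print(message)
    if entries:
        leaderboard = load_leaderboard_data("data/leaderboard_v1.json")
        leaderboard = add_entries_to_leaderboard(leaderboard, entries)
        save_leaderboard_data(leaderboard, "data/leaderboard_v1.json")
        print(leaderboard_to_dataframe(leaderboard).head())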