merge_resolve
- .env.template +6 -0
- .gitignore +44 -5
- .gitmodules +3 -0
- .gradio/certificate.pem +31 -0
- README.md +130 -108
- app.py +1086 -335
- example_submission.jsonl +4 -0
- gradio_test.ipynb +32 -0
- leaderboard_data.json +28 -19
- requirements.txt +8 -19
- src/about.py +44 -32
- src/display/css_html_js.py +62 -271
- src/display/formatting.py +56 -167
- src/display/utils.py +412 -287
- src/envs.py +20 -99
- src/leaderboard/processor.py +258 -293
- src/populate.py +188 -0
- src/submission/submit.py +178 -380
.env.template
ADDED
@@ -0,0 +1,6 @@
+HF_TOKEN="your_huggingface_write_token"
+OWNER="your_huggingface_username_or_org"
+RESULTS_DATASET_ID="your_username/guardbench-results"
+SUBMITTER_TOKEN="your_secret_submission_token"
+ADMIN_USERNAME="admin"
+ADMIN_PASSWORD="password" # Change this!
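These variables feed src/envs.py, which app.py imports from (TOKEN, RESULTS_DATASET_ID, SUBMITTER_TOKEN, ADMIN_USERNAME, ADMIN_PASSWORD, DATA_PATH). A minimal sketch of how that module might read them, assuming python-dotenv; the DATA_PATH default is an assumption, since src/envs.py itself is not fully shown in this view:

```python
# Hypothetical sketch of src/envs.py; names match the imports used in app.py.
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # pick up a local .env copied from .env.template

TOKEN = os.getenv("HF_TOKEN")                    # HF write token
OWNER = os.getenv("OWNER")                       # username or org
RESULTS_DATASET_ID = os.getenv("RESULTS_DATASET_ID")
SUBMITTER_TOKEN = os.getenv("SUBMITTER_TOKEN")   # shared submission secret
ADMIN_USERNAME = os.getenv("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD")     # change the template default!
DATA_PATH = os.getenv("DATA_PATH", "data")       # assumed default, not in the diff
```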
.gitignore
CHANGED
@@ -1,13 +1,52 @@
-
-venv/
+# Python
 __pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+.venv/
+*.egg-info/
+.installed.cfg
+*.egg
+.gradio/
+
+# Environment variables
 .env
-
-
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
 .vscode/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
 
+# Hugging Face cache
 eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
-
+
+# Data files
+data/
+
+# Versioned leaderboard files
+data/leaderboard_v*.json
.gitmodules
ADDED
@@ -0,0 +1,3 @@
+[submodule "guard-bench-submodule"]
+	path = guard-bench-submodule
+	url = https://github.com/whitecircle-ai/circle-guard-bench.git
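Note that a plain `git clone` of the Space will leave `guard-bench-submodule` empty; clone with `git clone --recurse-submodules`, or run `git submodule update --init` after cloning, to fetch it.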
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
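The vendored certificate's base64 payload decodes to "ISRG Root X1" / "Internet Security Research Group", i.e. the Let's Encrypt root CA that Gradio appears to save under .gradio/ for share links. A quick way to confirm what was committed, assuming the `cryptography` package is installed:

```python
# Inspect the committed PEM; the subject should name the ISRG Root X1 CA.
from cryptography import x509

with open(".gradio/certificate.pem", "rb") as f:
    cert = x509.load_pem_x509_certificate(f.read())

print(cert.subject.rfc4514_string())  # expect CN=ISRG Root X1,O=Internet Security Research Group,C=US
print(cert.not_valid_after)           # validity horizon of the root
```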
README.md
CHANGED
@@ -1,136 +1,158 @@
 ---
-title:
-emoji:
-colorFrom:
+title: CircleGuardBench
+emoji: ⚪
+colorFrom: gray
 colorTo: indigo
 sdk: gradio
+sdk_version: 4.44.1
 app_file: app.py
 pinned: true
+short_description: First benchmark testing LLM guards on safety and accuracy.
+models:
+- AtlaAI/Selene-1-Mini-Llama-3.1-8B
+- google/gemma-3-12b-it
+- google/gemma-3-4b-it
+- meta-llama/Llama-3.1-8B-Instruct
+- meta-llama/Llama-3.2-3B-Instruct
+- meta-llama/Llama-4-Maverick-17B-128E-Instruct
+- meta-llama/Llama-4-Scout-17B-16E-Instruct
+- meta-llama/Llama-Guard-3-1B
+- meta-llama/Llama-Guard-3-8B
+- meta-llama/Llama-Guard-4-12B
+- mistralai/Ministral-8B-Instruct-2410
+- mistralai/Mistral-Small-3.1-24B-Instruct-2503
+- Qwen/Qwen2.5-7B-Instruct
+- Qwen/Qwen3-0.6B
+- Qwen/Qwen3-1.7B
+- Qwen/Qwen3-4B
+- Qwen/Qwen3-8B
+
 ---
 
+# CodeReview Bench Leaderboard
+
+<<<<<<< HEAD
 A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
+=======
+A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
+>>>>>>> f990f507d1e99e7867021841fa223fe6ca8f653b
+
+## Features
+
+- **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
+- **Dual Language Comments**: Supports both Russian and English comment languages
+- **Comprehensive Metrics**:
+  - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
+  - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
+- **Interactive Visualization**: Compare model performance across categories with radar plots
+- **Easy Submission**: Submit your model results via web interface
+
+## Metrics
+
+### LLM-based Multimetric
+
+- **Readability**: How easy the review is to understand
+- **Relevance**: How relevant the review is to the code
+- **Explanation Clarity**: How clear the explanations are
+- **Problem Identification**: How well problems are identified
+- **Actionability**: How actionable the suggestions are
+- **Completeness**: How complete the review is
+- **Specificity**: How specific the feedback is
+- **Contextual Adequacy**: How well the review fits the context
+- **Consistency**: How consistent the review style is
+- **Brevity**: How concise the review is
+
+### Exact-Match Metrics
+
+- **Pass@1**: Percentage of correct reviews on first attempt
+- **Pass@5**: Percentage of correct reviews in top 5 attempts
+- **Pass@10**: Percentage of correct reviews in top 10 attempts
+- **BLEU@10**: BLEU score for top 10 review candidates
+
+## Programming Languages Supported
+
+- Python
+- JavaScript
+- Java
+- C++
+- C#
+- TypeScript
+- Go
+- Rust
+- Swift
+- Kotlin
+- Ruby
+- PHP
+- C
+- Scala
+- R
+- Dart
+- Other
+
+## Comment Languages
+
+- Russian (ru)
+- English (en)
+
+## Example Categories
+
+- Bug Fix
+- Code Style
+- Performance
+- Security
+- Refactoring
+- Documentation
+- Testing
+- Architecture
+- Other
+
+## Installation
 
 ```bash
 pip install -r requirements.txt
 ```
 
+## Usage
 
 ```bash
 python app.py
 ```
 
-### 3. Analytics & Insights
-
-- Visit the **📈 Analytics** tab to see:
-  - Recent submission history
-  - Language performance comparisons
-  - Category performance analysis
-  - Trends and patterns
-
-### 4. Data Export
-
-- Use the **ℹ️ About** tab to export data in JSON or CSV format
-- Full leaderboard data available for research and analysis
-
-## 🏗️ Architecture
-
-### Directory Structure
-
-```
-├── src/
-│   ├── about.py           # About page content
-│   ├── envs.py            # Environment configuration
-│   ├── display/           # Display utilities
-│   │   ├── css_html_js.py # Styling and themes
-│   │   ├── formatting.py  # Data formatting
-│   │   └── utils.py       # Display utilities
-│   ├── leaderboard/       # Leaderboard processing
-│   │   └── processor.py   # Data operations
-│   └── submission/        # Submission handling
-│       └── submit.py      # Submission validation
-├── data/                  # Data storage
-│   ├── leaderboard_data.json # Main leaderboard
-│   └── submissions.json   # Submission log
-├── app.py                 # Main application
-└── requirements.txt       # Dependencies
-```
-
-- **SubmissionHandler**: Manages model submissions with IP tracking and validation
-- **Display Utils**: Provides filtering, formatting, and table generation
-- **Dark Theme**: Custom CSS for modern, accessible interface
-
-## 🎨 Features Inspired by CodeReviewBench
+## Submission Format
+
+Submit your results as a JSONL file where each line contains:
+
+```json
+{
+  "model_name": "your-model-name",
+  "programming_language": "python",
+  "comment_language": "en",
+  "readability": 8.5,
+  "relevance": 9.0,
+  "explanation_clarity": 7.8,
+  "problem_identification": 8.2,
+  "actionability": 8.7,
+  "completeness": 8.0,
+  "specificity": 7.5,
+  "contextual_adequacy": 8.3,
+  "consistency": 8.8,
+  "brevity": 7.2,
+  "pass_at_1": 0.75,
+  "pass_at_5": 0.88,
+  "pass_at_10": 0.92,
+  "bleu_at_10": 0.65,
+  "total_evaluations": 100
+}
 ```
 
+## Environment Variables
+
+Set the following environment variables:
+
+## Citation
+
+<<<<<<< HEAD
 - **Multi-tab Interface**: Organized navigation with dedicated sections
 - **Advanced Filtering**: Real-time filtering by multiple criteria
 - **Dark Theme**: Modern, GitHub-inspired dark interface
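The submission format above lends itself to a quick pre-flight check before uploading. A minimal sketch (not part of this diff) that validates each line of a JSONL file against the documented fields; the 0-10 and 0-1 value ranges are assumptions inferred from the example values, and `example_submission.jsonl` is the file added in this commit:

```python
# Hypothetical pre-flight validator for the JSONL submission format above.
# Field names come from the README; the value ranges are assumptions.
import json

SCORES_0_10 = [
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
]
FRACTIONS_0_1 = ["pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10"]

def validate_line(line: str) -> dict:
    """Parse one JSONL line and check it against the documented schema."""
    record = json.loads(line)
    required = ["model_name", "programming_language", "comment_language",
                "total_evaluations", *SCORES_0_10, *FRACTIONS_0_1]
    missing = [key for key in required if key not in record]
    if missing:
        raise ValueError(f"missing fields: {missing}")
    for key in SCORES_0_10:      # assumed 0-10 LLM-judge scale
        if not 0 <= float(record[key]) <= 10:
            raise ValueError(f"{key} out of range: {record[key]}")
    for key in FRACTIONS_0_1:    # assumed 0-1 fractions
        if not 0 <= float(record[key]) <= 1:
            raise ValueError(f"{key} out of range: {record[key]}")
    return record

with open("example_submission.jsonl", encoding="utf-8") as f:
    rows = [validate_line(line) for line in f if line.strip()]
print(f"{len(rows)} valid submission rows")
```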
app.py
CHANGED
@@ -3,363 +3,1114 @@ CodeReview Leaderboard - Inspired by CodeReviewBench
 A comprehensive leaderboard for code review generation models
 """
 
 import gradio as gr
-from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
 from src.display.utils import (
 )
-    "taxonomy_category": "All"
-}
-
-    stats_text = f"""
-## 📊 Current Statistics
-- **Total Models**: {stats['total_models']}
-- **Total Submissions**: {stats['total_submissions']}
-- **Average Pass@1**: {stats['avg_pass_1']:.3f}
-- **Best Model**: {stats['best_model']}
-- **Languages Covered**: {stats['languages_covered']}
-- **Categories Covered**: {stats['categories_covered']}
 """
-
-            info="Filter by programming language"
-        )
-        comment_lang_filter = gr.Dropdown(
-            choices=COMMENT_LANGUAGES,
-            value="All",
-            label="🌍 Comment Language",
-            info="Filter by comment language"
-        )
-        taxonomy_filter = gr.Dropdown(
-            choices=TAXONOMY_CATEGORIES,
-            value="All",
-            label="🏷️ Taxonomy Category",
-            info="Filter by review category"
-        )
-
-                form_components["taxonomy_category"],
-                form_components["bleu"],
-                form_components["pass1"],
-                form_components["pass5"],
-                form_components["pass10"],
-                form_components["readability"],
-                form_components["relevance"],
-                form_components["explanation_clarity"],
-                form_components["problem_identification"],
-                form_components["actionability"],
-                form_components["completeness"],
-                form_components["specificity"],
-                form_components["contextual_adequacy"],
-                form_components["consistency"],
-                form_components["brevity"],
-            ],
-            outputs=[
-                leaderboard_state,
-                main_leaderboard,
-                quality_metrics,
-                form_components["status_msg"],
-                stats_display
-            ]
-        )
-
-        # Analytics Tab
-        with gr.Tab("📈 Analytics"):
-            with gr.Row():
-                analytics_prog_lang = gr.Dropdown(
-                    choices=PROGRAMMING_LANGUAGES,
-                    value="All",
-                    label="Programming Language"
-                )
-
-            export_btn = gr.Button("📥 Export Data")
-
-            export_output = gr.Textbox(
-                label="Export Output",
-                lines=10,
-                max_lines=20,
-                show_copy_button=True
-            )
-
-    # Footer
-    gr.HTML(FOOTER_HTML)
-
-    # Initialize with data
-    initial_main, initial_quality, initial_stats = update_leaderboard_tables()
-
-    # Update tables when filters change
-    filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
-    filter_outputs = [main_leaderboard, quality_metrics, stats_display]
-
-    for filter_input in filter_inputs:
-        filter_input.change(
-            fn=update_leaderboard_tables,
-            inputs=filter_inputs,
-            outputs=filter_outputs
-        )
-
-    # Refresh button
-    refresh_btn.click(
-        fn=refresh_data,
-        outputs=filter_outputs
-    )
-
-    # Analytics updates
-    analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
-
-    def update_analytics(prog_lang, comment_lang, taxonomy):
-        """Update analytics tables"""
-        data = processor.load_leaderboard_data()
-
-        # Get submission history
-        history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
-
-        # Get language performance
-        lang_perf = []
-        for lang in PROGRAMMING_LANGUAGES[1:]:
-            lang_data = [d for d in data if d.get("programming_language") == lang]
-            if lang_data:
-                avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
-                best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
-                lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
-
-        # Get category performance
-        cat_perf = []
-        for cat in TAXONOMY_CATEGORIES[1:]:
-            cat_data = [d for d in data if d.get("taxonomy_category") == cat]
-            if cat_data:
-                avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
-                best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
-                cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
-
-        return history, lang_perf, cat_perf
-
-    for analytics_input in analytics_inputs:
-        analytics_input.change(
-            fn=update_analytics,
-            inputs=analytics_inputs,
-            outputs=[submission_history, language_analysis, category_analysis]
-        )
-
-    # Export functionality
-    def export_data(format_type):
-        """Export leaderboard data"""
-        return processor.export_data(format_type.lower())
-
-    export_btn.click(
-        fn=export_data,
-        inputs=[export_format],
-        outputs=[export_output]
-    )
-
-    # Set initial values
-    demo.load(
-        fn=lambda: (initial_main, initial_quality, initial_stats),
-        outputs=[main_leaderboard, quality_metrics, stats_display]
-    )
 A comprehensive leaderboard for code review generation models
 """
 
+import os
+import json
+import tempfile
+import logging
 import gradio as gr
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from apscheduler.schedulers.background import BackgroundScheduler
+import numpy as np
+from gradio.themes.utils import fonts, colors
+from dataclasses import fields, dataclass
 
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
 )
+from src.display.css_html_js import custom_css
 from src.display.utils import (
+    CODEREVIEW_COLUMN,
+    DISPLAY_COLS,
+    METRIC_COLS,
+    HIDDEN_COLS,
+    NEVER_HIDDEN_COLS,
+    CATEGORIES,
+    COMMENT_LANGUAGES,
+    EXAMPLE_CATEGORIES,
+    TOPICS,
+    ModelType,
+    Mode,
+    Precision,
+    WeightType,
+    ReviewModelType,
+    get_all_column_choices,
+    get_default_visible_columns,
 )
+from src.display.formatting import styled_message, styled_error, styled_warning
+from src.envs import (
+    ADMIN_USERNAME,
+    ADMIN_PASSWORD,
+    RESULTS_DATASET_ID,
+    SUBMITTER_TOKEN,
+    TOKEN,
+    DATA_PATH,
+)
+from src.populate import get_leaderboard_df, get_category_leaderboard_df
+from src.submission.submit import process_submission
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Ensure data directory exists
+os.makedirs(DATA_PATH, exist_ok=True)
+
+# Available benchmark versions
+BENCHMARK_VERSIONS = ["v0"]
+CURRENT_VERSION = "v0"
+
+# Initialize leaderboard data
+try:
+    logger.info("Initializing leaderboard data...")
+    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
+    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
+except Exception as e:
+    logger.error(f"Error loading leaderboard data: {e}")
+    LEADERBOARD_DF = pd.DataFrame()
+
+custom_theme = gr.themes.Default(
+    primary_hue=colors.slate,
+    secondary_hue=colors.slate,
+    neutral_hue=colors.neutral,
+    font=(fonts.GoogleFont("Inter"), "sans-serif"),
+).set(
+    # font_size="16px",
+    body_background_fill="#0f0f10",
+    body_background_fill_dark="#0f0f10",
+    body_text_color="#f4f4f5",
+    body_text_color_subdued="#a1a1aa",
+    block_background_fill="#1e1e1e",  # Cooler Grey
+    block_border_color="#333333",  # Cooler Grey
+    block_shadow="none",
+    # Swapped primary and secondary button styles
+    button_primary_background_fill="#121212",  # Changed to specific color for Refresh button
+    button_primary_text_color="#f4f4f5",
+    button_primary_border_color="#333333",  # Keep border grey or change to #121212?
+    button_secondary_background_fill="#f4f4f5",
+    button_secondary_text_color="#0f0f10",
+    button_secondary_border_color="#f4f4f5",
+    input_background_fill="#1e1e1e",  # Cooler Grey
+    input_border_color="#333333",  # Cooler Grey
+    input_placeholder_color="#71717a",
+    table_border_color="#333333",  # Cooler Grey
+    table_even_background_fill="#2d2d2d",  # Cooler Grey (Slightly lighter)
+    table_odd_background_fill="#1e1e1e",  # Cooler Grey
+    table_text_color="#f4f4f5",
+    link_text_color="#ffffff",
+    border_color_primary="#333333",  # Cooler Grey
+    background_fill_secondary="#333333",  # Cooler Grey
+    color_accent="#f4f4f5",
+    border_color_accent="#333333",  # Cooler Grey
+    button_primary_background_fill_hover="#424242",  # Cooler Grey
+    block_title_text_color="#f4f4f5",
+    accordion_text_color="#f4f4f5",
+    panel_background_fill="#1e1e1e",  # Cooler Grey
+    panel_border_color="#333333",  # Cooler Grey
+    # Explicitly setting primary/secondary/accent colors/borders
+    background_fill_primary="#0f0f10",
+    background_fill_primary_dark="#0f0f10",
+    background_fill_secondary_dark="#333333",  # Cooler Grey
+    border_color_primary_dark="#333333",  # Cooler Grey
+    border_color_accent_dark="#333333",  # Cooler Grey
+    border_color_accent_subdued="#424242",  # Cooler Grey
+    border_color_accent_subdued_dark="#424242",  # Cooler Grey
+    color_accent_soft="#a1a1aa",
+    color_accent_soft_dark="#a1a1aa",
+    # Explicitly setting input hover/focus states
+    input_background_fill_dark="#1e1e1e",  # Cooler Grey
+    input_background_fill_focus="#424242",  # Cooler Grey
+    input_background_fill_focus_dark="#424242",  # Cooler Grey
+    input_background_fill_hover="#2d2d2d",  # Cooler Grey
+    input_background_fill_hover_dark="#2d2d2d",  # Cooler Grey
+    input_border_color_dark="#333333",  # Cooler Grey
+    input_border_color_focus="#f4f4f5",
+    input_border_color_focus_dark="#f4f4f5",
+    input_border_color_hover="#424242",  # Cooler Grey
+    input_border_color_hover_dark="#424242",  # Cooler Grey
+    input_placeholder_color_dark="#71717a",
+    # Explicitly set dark variants for table backgrounds
+    table_even_background_fill_dark="#2d2d2d",  # Cooler Grey
+    table_odd_background_fill_dark="#1e1e1e",  # Cooler Grey
+    # Explicitly set dark text variants
+    body_text_color_dark="#f4f4f5",
+    body_text_color_subdued_dark="#a1a1aa",
+    block_title_text_color_dark="#f4f4f5",
+    accordion_text_color_dark="#f4f4f5",
+    table_text_color_dark="#f4f4f5",
+    # Explicitly set dark panel/block variants
+    panel_background_fill_dark="#1e1e1e",  # Cooler Grey
+    panel_border_color_dark="#333333",  # Cooler Grey
+    block_background_fill_dark="#1e1e1e",  # Cooler Grey
+    block_border_color_dark="#333333",  # Cooler Grey
+)
+
+
+@dataclass
+class ColumnInfo:
+    """Information about a column in the leaderboard."""
+
+    name: str
+    display_name: str
+    type: str = "text"
+    hidden: bool = False
+    never_hidden: bool = False
+    displayed_by_default: bool = True
+
+
+def update_column_choices(df):
+    """Update column choices based on what's actually in the dataframe"""
+    if df is None or df.empty:
+        return get_all_column_choices()
+
+    # Get columns that actually exist in the dataframe
+    existing_columns = list(df.columns)
+
+    # Get all possible columns with their display names
+    all_columns = get_all_column_choices()
+
+    # Filter to only include columns that exist in the dataframe
+    valid_columns = [
+        (col_name, display_name)
+        for col_name, display_name in all_columns
+        if col_name in existing_columns
+    ]
+
+    # Return default if there are no valid columns
+    if not valid_columns:
+        return get_all_column_choices()
+
+    return valid_columns
+
+
+# Update the column_selector initialization
+def get_initial_columns():
+    """Get initial columns to show in the dropdown"""
+    try:
+        # Get available columns in the main dataframe
+        available_cols = list(LEADERBOARD_DF.columns)
+        logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
+
+        # If dataframe is empty, use default visible columns
+        if not available_cols:
+            return get_default_visible_columns()
+
+        # Get default visible columns that actually exist in the dataframe
+        valid_defaults = [
+            col for col in get_default_visible_columns() if col in available_cols
+        ]
+
+        # If none of the defaults exist, return all available columns
+        if not valid_defaults:
+            return available_cols
+
+        return valid_defaults
+    except Exception as e:
+        logger.error(f"Error getting initial columns: {e}")
+        return get_default_visible_columns()
+
+
+def init_leaderboard(dataframe, visible_columns=None):
+    """
+    Initialize a standard Gradio Dataframe component for the leaderboard.
+    """
+    if dataframe is None or dataframe.empty:
+        # Create an empty dataframe with the right columns
+        columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
+        dataframe = pd.DataFrame(columns=columns)
+        logger.warning("Initializing empty leaderboard")
+
+    # Lowercase model_name for display
+    if "model_name" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["model_name"] = dataframe["model_name"].str.lower()
+
+    if "model_type" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
+
+    if "review_model_type" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
+
+    # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
+
+    # Determine which columns to display
+    display_column_names = [
+        getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
+    ]
+    hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
+
+    # Columns that should always be shown
+    always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
+
+    # Use provided visible columns if specified, otherwise use default
+    if visible_columns is None:
+        # Determine which columns to show initially
+        visible_columns = [
+            col for col in display_column_names if col not in hidden_column_names
+        ]
+
+    # Always include the never-hidden columns
+    for col in always_visible:
+        if col not in visible_columns and col in dataframe.columns:
+            visible_columns.append(col)
+
+    # Make sure we only include columns that actually exist in the dataframe
+    visible_columns = [col for col in visible_columns if col in dataframe.columns]
+
+    # Map GuardBench column types to Gradio's expected datatype strings
+    # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
+    type_mapping = {
+        "text": "str",
+        "number": "number",
+        "bool": "bool",
+        "date": "date",
+        "markdown": "markdown",
+        "html": "html",
+        "image": "image",
+    }
+
+    # Create a list of datatypes in the format Gradio expects
+    datatypes = []
+    for col in visible_columns:
+        # Find the corresponding CODEREVIEW_COLUMN entry
+        col_type = None
+        for display_col in DISPLAY_COLS:
+            if getattr(CODEREVIEW_COLUMN, display_col).name == col:
+                orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
+                # Map to Gradio's expected types
+                col_type = type_mapping.get(orig_type, "str")
+                break
+
+        # Default to 'str' if type not found or not mappable
+        if col_type is None:
+            col_type = "str"
+
+        datatypes.append(col_type)
+
+    # Create a dummy column for search functionality if it doesn't exist
+    if "search_dummy" not in dataframe.columns:
+        dataframe["search_dummy"] = dataframe.apply(
+            lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
+            axis=1,
+        )
+
+    # Select only the visible columns for display
+    visible_columns.remove("model_name")
+
+    visible_columns = ["model_name"] + visible_columns
+    display_df = dataframe[visible_columns].copy()
+
+    # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
+    # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
+    # print(f"-------------------------------------------------------------")
+
+    # Round numeric columns to 3 decimal places for display
+    numeric_cols = display_df.select_dtypes(include=np.number).columns
+    for col in numeric_cols:
+        # Avoid rounding integer columns like counts
+        if not pd.api.types.is_integer_dtype(display_df[col]):
+            # Format floats to exactly 3 decimal places, preserving trailing zeros
+            display_df[col] = display_df[col].apply(
+                lambda x: f"{x:.3f}" if pd.notna(x) else None
+            )
+
+    column_info_map = {
+        f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
+    }
+    column_mapping = {
+        col: column_info_map.get(col, ColumnInfo(col, col)).display_name
+        for col in visible_columns
 }
+
+    # Rename columns in the DataFrame
+    display_df.rename(columns=column_mapping, inplace=True)
+
+    # Apply styling - note: styling might need adjustment if it relies on column names
+    styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
+        subset=["Model"], **{"width": "200px"}
 )
+
+    return gr.Dataframe(
+        value=styler,
+        datatype=datatypes,
+        interactive=False,
+        wrap=True,
+        height=2500,
+        elem_id="leaderboard-table",
+        row_count=len(display_df),
 )
+
+
+def search_filter_leaderboard(
+    df, search_query="", comment_languages=None, version=CURRENT_VERSION
+):
     """
+    Filter the leaderboard based on search query and comment languages.
+    """
+    if df is None or df.empty:
+        return df
+
+    filtered_df = df.copy()
+
+    # Add search dummy column if it doesn't exist
+    if "search_dummy" not in filtered_df.columns:
+        filtered_df["search_dummy"] = filtered_df.apply(
+            lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
+            axis=1,
+        )
+
+    # Apply comment language filter (assuming there's a comment_language column in the data)
+    if comment_languages and len(comment_languages) > 0:
+        # Look for a comment language column in the dataframe
+        comment_lang_cols = [col for col in filtered_df.columns if 'comment_language' in col.lower()]
+        if comment_lang_cols:
+            filtered_df = filtered_df[
+                filtered_df[comment_lang_cols[0]].isin(comment_languages)
+            ]
+
+    # Apply search query
+    if search_query:
+        search_terms = [
+            term.strip() for term in search_query.split(";") if term.strip()
+        ]
+        if search_terms:
+            combined_mask = None
+            for term in search_terms:
+                mask = filtered_df["search_dummy"].str.contains(
+                    term, case=False, na=False
+                )
+                if combined_mask is None:
+                    combined_mask = mask
+                else:
+                    combined_mask = combined_mask | mask
+
+            if combined_mask is not None:
+                filtered_df = filtered_df[combined_mask]
+
+    # Drop the search dummy column before returning
+    visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
+    return filtered_df[visible_columns]
 
+
+def refresh_data_with_filters(
+    version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
+):
+    """
+    Refresh the leaderboard data and update all components with filtering.
+    Ensures we handle cases where dataframes might have limited columns.
+    """
+    global LEADERBOARD_DF
+    try:
+        logger.info(f"Performing refresh of leaderboard data with filters...")
+        # Get new data
+        main_df = get_leaderboard_df(version=version)
+        LEADERBOARD_DF = main_df
+        category_dfs = [
+            get_category_leaderboard_df(category, version=version)
+            for category in CATEGORIES
+        ]
+        selected_columns = [
+            x.lower()
+            .replace(" ", "_")
+            .replace("(", "")
+            .replace(")", "")
+            .replace("_recall", "_recall_binary")
+            .replace("_precision", "_precision_binary")
+            for x in selected_columns
+        ]
+
+        # Log the actual columns we have
+        logger.info(f"Main dataframe columns: {list(main_df.columns)}")
+
+        # Apply filters to each dataframe
+        filtered_main_df = search_filter_leaderboard(
+            main_df, search_query, comment_languages, version
+        )
+        filtered_category_dfs = [
+            search_filter_leaderboard(df, search_query, comment_languages, version)
+            for df in category_dfs
+        ]
+
+        # Get available columns from the dataframe
+        available_columns = list(filtered_main_df.columns)
+
+        # Filter selected columns to only those available in the data
+        if selected_columns:
+            # Convert display names to internal names first
+            internal_selected_columns = [
+                x.lower()
+                .replace(" ", "_")
+                .replace("(", "")
+                .replace(")", "")
+                .replace("_recall", "_recall_binary")
+                .replace("_precision", "_precision_binary")
+                for x in selected_columns
+            ]
+            valid_selected_columns = [
+                col for col in internal_selected_columns if col in available_columns
+            ]
+            if not valid_selected_columns and "model_name" in available_columns:
+                # Fallback if conversion/filtering leads to empty selection
+                valid_selected_columns = ["model_name"] + [
+                    col
+                    for col in get_default_visible_columns()
+                    if col in available_columns
+                ]
+        else:
+            # If no columns were selected in the dropdown, use default visible columns that exist
+            valid_selected_columns = [
+                col for col in get_default_visible_columns() if col in available_columns
+            ]
+
+        # Initialize dataframes for display with valid selected columns
+        main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
+
+        # For category dataframes, get columns that actually exist in each one
+        category_dataframes = []
+        for df in filtered_category_dfs:
+            df_columns = list(df.columns)
+            df_valid_columns = [
+                col for col in valid_selected_columns if col in df_columns
+            ]
+            if not df_valid_columns and "model_name" in df_columns:
+                df_valid_columns = ["model_name"] + get_default_visible_columns()
+            category_dataframes.append(init_leaderboard(df, df_valid_columns))
+
+        return main_dataframe, *category_dataframes
+
+    except Exception as e:
+        logger.error(f"Error in refresh with filters: {e}")
+        # Return the current leaderboards on error
+        return leaderboard, *[
+            tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
+        ]
+
+
+def submit_results(
+    model_name: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    weight_type: str,
+    model_type: str,
+    mode: str,
+    submission_file: tempfile._TemporaryFileWrapper,
+    version: str,
+    review_model_type: ReviewModelType,
+    programming_language: str,
+    comment_language: str,
 ):
+    """
+    Handle submission of results with model metadata.
+    """
+    if submission_file is None:
+        return styled_error("No submission file provided")
+
+    if not model_name:
+        return styled_error("Model name is required")
+
+    if not model_type:
+        return styled_error("Please select a model type")
+
+    if not mode:
+        return styled_error("Please select an inference mode")
+
+    file_path = submission_file.name
+    logger.info(f"Received submission for model {model_name}: {file_path}")
+
+    # Add metadata to the submission
+    metadata = {
+        "model_name": model_name,
+        "base_model": base_model,
+        "revision": revision if revision else "main",
+        "precision": precision,
+        "weight_type": weight_type,
+        "model_type": model_type,
+        "mode": mode,
+        "version": version,
+        "review_model_type": review_model_type,
+        "programming_language": programming_language,
+        "comment_language": comment_language,
+    }
+
+    # Process the submission
+    result = process_submission(file_path, metadata, version=version)
+
+    # Refresh the leaderboard data
+    global LEADERBOARD_DF
+    try:
+        logger.info(
+            f"Refreshing leaderboard data after submission for version {version}..."
 )
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
+        logger.info("Refreshed leaderboard data after submission")
+    except Exception as e:
+        logger.error(f"Error refreshing leaderboard data: {e}")
+
+    return result
+
+
+def refresh_data(version=CURRENT_VERSION):
+    """
+    Refresh the leaderboard data and update all components.
+    """
+    try:
+        logger.info(f"Performing scheduled refresh of leaderboard data...")
+        # Get new data
+        main_df = get_leaderboard_df(version=version)
+        category_dfs = [
+            get_category_leaderboard_df(category, version=version)
+            for category in CATEGORIES
+        ]
+
+        # For gr.Dataframe, we return the actual dataframes
+        return main_df, *category_dfs
+
+    except Exception as e:
+        logger.error(f"Error in scheduled refresh: {e}")
+        return None, *[None for _ in CATEGORIES]
+
+
+def update_leaderboards(version):
+    """
+    Update all leaderboard components with data for the selected version.
+    """
+    try:
+        new_df = get_leaderboard_df(version=version)
+        category_dfs = [
+            get_category_leaderboard_df(category, version=version)
+            for category in CATEGORIES
+        ]
+        return new_df, *category_dfs
+    except Exception as e:
+        logger.error(f"Error updating leaderboards for version {version}: {e}")
+        return None, *[None for _ in CATEGORIES]
+
+
+def create_performance_plot(
+    selected_models, category, metric="f1_binary", version=CURRENT_VERSION
+):
+    """
+    Create a radar plot comparing model performance for selected models.
+    """
+    if category == "All Results":
+        df = get_leaderboard_df(version=version)
 else:
+        df = get_category_leaderboard_df(category, version=version)
+
+    if df.empty:
+        return go.Figure()
+
+    # Lowercase model_name in df and selected_models
+    df = df.copy()
+    df["model_name"] = df["model_name"].str.lower()
+    selected_models = [m.lower() for m in selected_models]
+    df = df[df["model_name"].isin(selected_models)]
+    metric_cols = [col for col in df.columns if metric in col]
+    fig = go.Figure()
+    colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
+    for idx, model in enumerate(selected_models):
+        model_data = df[df["model_name"] == model]
+        if not model_data.empty:
+            values = model_data[metric_cols].values[0].tolist()
+            values = values + [values[0]]
+            categories = [col.replace(f"_{metric}", "") for col in metric_cols]
+            # Replace 'jailbreaked' with 'jailbroken' in categories
+            categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
+            categories = categories + [categories[0]]
+            fig.add_trace(
+                go.Scatterpolar(
+                    r=values,
+                    theta=categories,
+                    name=model,
+                    line_color=colors[idx % len(colors)],
+                    fill="toself",
 )
+            )
+    fig.update_layout(
+        paper_bgcolor="#000000",
+        plot_bgcolor="#000000",
+        font={"color": "#ffffff"},
+        title={
+            "text": f"{category} - {metric.upper()} Score Comparison",
+            "font": {"color": "#ffffff", "size": 24},
+        },
+        polar=dict(
+            bgcolor="#000000",
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1],
+                gridcolor="#333333",
+                linecolor="#333333",
+                tickfont={"color": "#ffffff"},
+            ),
+            angularaxis=dict(
+                gridcolor="#333333",
+                linecolor="#333333",
+                tickfont={"color": "#ffffff"},
+            ),
+        ),
+        height=600,
+        showlegend=True,
+        legend=dict(
+            yanchor="top",
+            y=0.99,
+            xanchor="right",
+            x=0.99,
+            bgcolor="rgba(0,0,0,0.5)",
+            font={"color": "#ffffff"},
+        ),
+    )
+    return fig
+
+
+def update_model_choices(version):
+    """
+    Update the list of available models for the given version.
+    """
+    df = get_leaderboard_df(version=version)
+    if df.empty:
+        return []
+    return sorted(df["model_name"].str.lower().unique().tolist())
+
+
+def update_visualization(selected_models, selected_category, selected_metric, version):
+    """
+    Update the visualization based on user selections.
+    """
+    if not selected_models:
+        return go.Figure()
+    return create_performance_plot(
+        selected_models, selected_category, selected_metric, version
+    )
+
+
+# Create Gradio app
+demo = gr.Blocks(css=custom_css, theme=custom_theme)
+
+CATEGORY_DISPLAY_MAP = {
+    "Python": "Python",
+    "Java": "Java",
+    "Scala": "Scala",
+    "Go": "Go"
+}
+# Create reverse mapping for lookups
+CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
+
+with demo:
+    gr.HTML(TITLE)
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+    with gr.Row():
+        tabs = gr.Tabs(elem_classes="tab-buttons")
+
+    with tabs:
+        with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
+            with gr.Row():
+                version_selector = gr.Dropdown(
+                    choices=BENCHMARK_VERSIONS,
+                    label="Benchmark Version",
+                    value=CURRENT_VERSION,
+                    interactive=True,
+                    elem_classes="version-selector",
+                    scale=1,
+                    visible=False,
+                )
+
+            with gr.Row():
+                search_input = gr.Textbox(
+                    placeholder="Search by models (use ; to split)",
+                    label="Search",
+                    elem_id="search-bar",
+                    scale=2,
+                )
+                comment_language_filter = gr.Dropdown(
+                    choices=["en", "ru"],
+                    label="Comment Language",
+                    multiselect=True,
+                    value=[],
+                    interactive=True,
+                    scale=1,
+                )
+                programming_language_filter = gr.Dropdown(
+                    choices=["Python", "Java", "Scala", "Go"],
+                    label="Programming Language",
+                    multiselect=True,
+                    value=[],
+                    interactive=True,
+                    scale=1,
+                )
+            with gr.Row():
+                topic_filter = gr.Dropdown(
+                    choices=TOPICS,
+                    label="Topic",
+                    multiselect=True,
+                    value=[],
+                    interactive=True,
+                    scale=2,
+                )
+                column_selector = gr.Dropdown(
+                    choices=get_all_column_choices(),
+                    label="Columns",
+                    multiselect=True,
+                    value=get_initial_columns(),
+                    interactive=True,
+                    visible=False,
+                    scale=1,
+                )
+            with gr.Row():
+                refresh_button = gr.Button(
+                    "Refresh", scale=0, elem_id="refresh-button"
+                )
+
+            # Create tabs for each category
+            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
+                # First tab for average metrics across all categories
+                with gr.TabItem("All Results", elem_id="overall-tab"):
+                    leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+                # Create a tab for each category using display names
+                for category in CATEGORIES:
+                    display_name = CATEGORY_DISPLAY_MAP.get(category, category)
+                    elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
|
| 787 |
+
with gr.TabItem(display_name, elem_id=elem_id):
|
| 788 |
+
category_df = get_category_leaderboard_df(
|
| 789 |
+
category, version=CURRENT_VERSION
|
| 790 |
+
)
|
| 791 |
+
category_leaderboard = init_leaderboard(category_df)
|
| 792 |
+
|
| 793 |
+
# Connect search and filter inputs to update function
|
| 794 |
+
def update_with_search_filters(
|
| 795 |
+
version=CURRENT_VERSION,
|
| 796 |
+
search_query="",
|
| 797 |
+
comment_languages=None,
|
| 798 |
+
selected_columns=None,
|
| 799 |
+
):
|
| 800 |
+
"""
|
| 801 |
+
Update the leaderboards with search and filter settings.
|
| 802 |
+
"""
|
| 803 |
+
return refresh_data_with_filters(
|
| 804 |
+
version, search_query, comment_languages, selected_columns
|
| 805 |
+
)
|
| 806 |
+
|
| 807 |
+
# Refresh button functionality
|
| 808 |
+
def refresh_and_update(
|
| 809 |
+
version, search_query, comment_languages, selected_columns
|
| 810 |
+
):
|
| 811 |
+
"""
|
| 812 |
+
Refresh data, update LEADERBOARD_DF, and return updated components.
|
| 813 |
+
"""
|
| 814 |
+
global LEADERBOARD_DF
|
| 815 |
+
main_df = get_leaderboard_df(version=version)
|
| 816 |
+
LEADERBOARD_DF = main_df # Update the global DataFrame
|
| 817 |
+
return refresh_data_with_filters(
|
| 818 |
+
version, search_query, comment_languages, selected_columns
|
| 819 |
+
)
|
| 820 |
+
|
| 821 |
+
refresh_button.click(
|
| 822 |
+
fn=refresh_and_update,
|
| 823 |
+
inputs=[
|
| 824 |
+
version_selector,
|
| 825 |
+
search_input,
|
| 826 |
+
comment_language_filter,
|
| 827 |
+
column_selector,
|
| 828 |
+
],
|
| 829 |
+
outputs=[leaderboard]
|
| 830 |
+
+ [
|
| 831 |
+
category_tabs.children[i].children[0]
|
| 832 |
+
for i in range(1, len(CATEGORIES) + 1)
|
| 833 |
+
],
|
| 834 |
)
|
| 835 |
+
# Search input functionality
|
| 836 |
+
search_input.change(
|
| 837 |
+
fn=refresh_data_with_filters,
|
| 838 |
+
inputs=[
|
| 839 |
+
version_selector,
|
| 840 |
+
search_input,
|
| 841 |
+
comment_language_filter,
|
| 842 |
+
column_selector,
|
| 843 |
+
],
|
| 844 |
+
outputs=[leaderboard]
|
| 845 |
+
+ [
|
| 846 |
+
category_tabs.children[i].children[0]
|
| 847 |
+
for i in range(1, len(CATEGORIES) + 1)
|
| 848 |
+
],
|
| 849 |
)
|
| 850 |
+
|
| 851 |
+
# Comment language filter functionality
|
| 852 |
+
comment_language_filter.change(
|
| 853 |
+
fn=refresh_data_with_filters,
|
| 854 |
+
inputs=[
|
| 855 |
+
version_selector,
|
| 856 |
+
search_input,
|
| 857 |
+
comment_language_filter,
|
| 858 |
+
column_selector,
|
| 859 |
+
],
|
| 860 |
+
outputs=[leaderboard]
|
| 861 |
+
+ [
|
| 862 |
+
category_tabs.children[i].children[0]
|
| 863 |
+
for i in range(1, len(CATEGORIES) + 1)
|
| 864 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 865 |
)
|
| 866 |
+
|
| 867 |
+
# Version selector functionality
|
| 868 |
+
version_selector.change(
|
| 869 |
+
fn=refresh_data_with_filters,
|
| 870 |
+
inputs=[
|
| 871 |
+
version_selector,
|
| 872 |
+
search_input,
|
| 873 |
+
comment_language_filter,
|
| 874 |
+
column_selector,
|
| 875 |
+
],
|
| 876 |
+
outputs=[leaderboard]
|
| 877 |
+
+ [
|
| 878 |
+
category_tabs.children[i].children[0]
|
| 879 |
+
for i in range(1, len(CATEGORIES) + 1)
|
| 880 |
+
],
|
| 881 |
)
|
| 882 |
+
|
| 883 |
+
# Update the update_columns function to handle updating all tabs at once
|
| 884 |
+
def update_columns(selected_columns):
|
| 885 |
+
"""
|
| 886 |
+
Update all leaderboards to show the selected columns.
|
| 887 |
+
Ensures all selected columns are preserved in the update.
|
| 888 |
+
|
| 889 |
+
"""
|
| 890 |
+
|
| 891 |
+
try:
|
| 892 |
+
logger.info(f"Updating columns to show: {selected_columns}")
|
| 893 |
+
|
| 894 |
+
# If no columns are selected, use default visible columns
|
| 895 |
+
if not selected_columns or len(selected_columns) == 0:
|
| 896 |
+
selected_columns = get_default_visible_columns()
|
| 897 |
+
logger.info(
|
| 898 |
+
f"No columns selected, using defaults: {selected_columns}"
|
| 899 |
+
)
|
| 900 |
+
|
| 901 |
+
# Convert display names to internal names
|
| 902 |
+
internal_selected_columns = [
|
| 903 |
+
x.lower()
|
| 904 |
+
.replace(" ", "_")
|
| 905 |
+
.replace("(", "")
|
| 906 |
+
.replace(")", "")
|
| 907 |
+
.replace("_recall", "_recall_binary")
|
| 908 |
+
.replace("_precision", "_precision_binary")
|
| 909 |
+
for x in selected_columns
|
| 910 |
+
]
|
| 911 |
+
|
| 912 |
+
# Get the current data with ALL columns preserved
|
| 913 |
+
main_df = get_leaderboard_df(version=version_selector.value)
|
| 914 |
+
|
| 915 |
+
# Get category dataframes with ALL columns preserved
|
| 916 |
+
category_dfs = [
|
| 917 |
+
get_category_leaderboard_df(
|
| 918 |
+
category, version=version_selector.value
|
| 919 |
+
)
|
| 920 |
+
for category in CATEGORIES
|
| 921 |
+
]
|
| 922 |
+
|
| 923 |
+
# Log columns for debugging
|
| 924 |
+
logger.info(f"Main dataframe columns: {list(main_df.columns)}")
|
| 925 |
+
logger.info(
|
| 926 |
+
f"Selected columns (internal): {internal_selected_columns}"
|
| 927 |
+
)
|
| 928 |
+
|
| 929 |
+
# IMPORTANT: Make sure model_name is always included
|
| 930 |
+
if (
|
| 931 |
+
"model_name" in main_df.columns
|
| 932 |
+
and "model_name" not in internal_selected_columns
|
| 933 |
+
):
|
| 934 |
+
internal_selected_columns = [
|
| 935 |
+
"model_name"
|
| 936 |
+
] + internal_selected_columns
|
| 937 |
+
|
| 938 |
+
# Initialize the main leaderboard with the selected columns
|
| 939 |
+
# We're passing the internal_selected_columns directly to preserve the selection
|
| 940 |
+
main_leaderboard = init_leaderboard(
|
| 941 |
+
main_df, internal_selected_columns
|
| 942 |
+
)
|
| 943 |
+
|
| 944 |
+
# Initialize category dataframes with the same selected columns
|
| 945 |
+
# This ensures consistency across all tabs
|
| 946 |
+
category_leaderboards = []
|
| 947 |
+
for df in category_dfs:
|
| 948 |
+
# Use the same selected columns for each category
|
| 949 |
+
# init_leaderboard will automatically handle filtering to columns that exist
|
| 950 |
+
category_leaderboards.append(
|
| 951 |
+
init_leaderboard(df, internal_selected_columns)
|
| 952 |
+
)
|
| 953 |
+
|
| 954 |
+
return main_leaderboard, *category_leaderboards
|
| 955 |
+
|
| 956 |
+
except Exception as e:
|
| 957 |
+
logger.error(f"Error updating columns: {e}")
|
| 958 |
+
import traceback
|
| 959 |
+
|
| 960 |
+
logger.error(traceback.format_exc())
|
| 961 |
+
return leaderboard, *[
|
| 962 |
+
tab.children[0]
|
| 963 |
+
for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
|
| 964 |
+
]
|
| 965 |
+
|
| 966 |
+
# Connect column selector to update function
|
| 967 |
+
column_selector.change(
|
| 968 |
+
fn=update_columns,
|
| 969 |
+
inputs=[column_selector],
|
| 970 |
+
outputs=[leaderboard]
|
| 971 |
+
+ [
|
| 972 |
+
category_tabs.children[i].children[0]
|
| 973 |
+
for i in range(1, len(CATEGORIES) + 1)
|
| 974 |
+
],
|
| 975 |
)
|
| 976 |
+
|
| 977 |
+
# with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
|
| 978 |
+
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 979 |
+
|
| 980 |
+
with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
|
| 981 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 982 |
+
|
| 983 |
+
with gr.Row():
|
| 984 |
+
# with gr.Column(scale=3):
|
| 985 |
+
# gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
|
| 986 |
+
with gr.Column(scale=1):
|
| 987 |
+
# Add version selector specifically for the submission tab
|
| 988 |
+
submission_version_selector = gr.Dropdown(
|
| 989 |
+
choices=BENCHMARK_VERSIONS,
|
| 990 |
+
label="Benchmark Version",
|
| 991 |
+
value=CURRENT_VERSION,
|
| 992 |
+
interactive=True,
|
| 993 |
+
elem_classes="version-selector",
|
| 994 |
+
visible=False,
|
| 995 |
+
)
|
| 996 |
+
|
| 997 |
+
with gr.Row():
|
| 998 |
+
with gr.Column():
|
| 999 |
+
model_name_textbox = gr.Textbox(label="Model name")
|
| 1000 |
+
mode_selector = gr.Dropdown(
|
| 1001 |
+
choices=[m.name for m in Mode],
|
| 1002 |
+
label="Mode",
|
| 1003 |
+
multiselect=False,
|
| 1004 |
+
value=None,
|
| 1005 |
+
interactive=True,
|
| 1006 |
+
)
|
| 1007 |
+
revision_name_textbox = gr.Textbox(
|
| 1008 |
+
label="Revision commit", placeholder="main"
|
| 1009 |
+
)
|
| 1010 |
+
model_type = gr.Dropdown(
|
| 1011 |
+
choices=[
|
| 1012 |
+
t.to_str("-")
|
| 1013 |
+
for t in ModelType
|
| 1014 |
+
if t != ModelType.Unknown and t != ModelType.ClosedSource
|
| 1015 |
+
],
|
| 1016 |
+
label="Model type",
|
| 1017 |
+
multiselect=False,
|
| 1018 |
+
value=None,
|
| 1019 |
+
interactive=True,
|
| 1020 |
+
)
|
| 1021 |
+
review_model_type = gr.Dropdown(
|
| 1022 |
+
choices=[t.name for t in ReviewModelType],
|
| 1023 |
+
label="Review model type",
|
| 1024 |
+
multiselect=False,
|
| 1025 |
+
value=ReviewModelType.CUSTOM.name,
|
| 1026 |
+
interactive=True,
|
| 1027 |
+
)
|
| 1028 |
+
programming_language_selector = gr.Dropdown(
|
| 1029 |
+
choices=["Python", "Java", "Scala", "Go"],
|
| 1030 |
+
label="Programming Language",
|
| 1031 |
+
multiselect=False,
|
| 1032 |
+
value=None,
|
| 1033 |
+
interactive=True,
|
| 1034 |
+
)
|
| 1035 |
+
comment_language_selector = gr.Dropdown(
|
| 1036 |
+
choices=["en", "ru"],
|
| 1037 |
+
label="Comment Language",
|
| 1038 |
+
multiselect=False,
|
| 1039 |
+
value="en",
|
| 1040 |
+
interactive=True,
|
| 1041 |
+
)
|
| 1042 |
+
|
| 1043 |
+
with gr.Column():
|
| 1044 |
+
precision = gr.Dropdown(
|
| 1045 |
+
choices=[
|
| 1046 |
+
i.name for i in Precision if i != Precision.Unknown
|
| 1047 |
+
],
|
| 1048 |
+
label="Precision",
|
| 1049 |
+
multiselect=False,
|
| 1050 |
+
value="float16",
|
| 1051 |
+
interactive=True,
|
| 1052 |
+
)
|
| 1053 |
+
weight_type = gr.Dropdown(
|
| 1054 |
+
choices=[i.name for i in WeightType],
|
| 1055 |
+
label="Weights type",
|
| 1056 |
+
multiselect=False,
|
| 1057 |
+
value="Original",
|
| 1058 |
+
interactive=True,
|
| 1059 |
+
)
|
| 1060 |
+
base_model_name_textbox = gr.Textbox(
|
| 1061 |
+
label="Base model (for delta or adapter weights)"
|
| 1062 |
+
)
|
| 1063 |
+
|
| 1064 |
+
with gr.Row():
|
| 1065 |
+
file_input = gr.File(
|
| 1066 |
+
label="Upload JSONL Results File", file_types=[".jsonl"]
|
| 1067 |
)
|
| 1068 |
+
|
| 1069 |
+
submit_button = gr.Button("Submit Results")
|
| 1070 |
+
result_output = gr.Markdown()
|
| 1071 |
+
|
| 1072 |
+
submit_button.click(
|
| 1073 |
+
fn=submit_results,
|
| 1074 |
+
inputs=[
|
| 1075 |
+
model_name_textbox,
|
| 1076 |
+
base_model_name_textbox,
|
| 1077 |
+
revision_name_textbox,
|
| 1078 |
+
precision,
|
| 1079 |
+
weight_type,
|
| 1080 |
+
model_type,
|
| 1081 |
+
mode_selector,
|
| 1082 |
+
file_input,
|
| 1083 |
+
submission_version_selector,
|
| 1084 |
+
review_model_type,
|
| 1085 |
+
programming_language_selector,
|
| 1086 |
+
comment_language_selector,
|
| 1087 |
+
],
|
| 1088 |
+
outputs=result_output,
|
| 1089 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1090 |
|
| 1091 |
+
# Version selector functionality
|
| 1092 |
+
version_selector.change(
|
| 1093 |
+
fn=update_leaderboards,
|
| 1094 |
+
inputs=[version_selector],
|
| 1095 |
+
outputs=[leaderboard]
|
| 1096 |
+
+ [
|
| 1097 |
+
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
| 1098 |
+
],
|
| 1099 |
+
).then(
|
| 1100 |
+
lambda version: refresh_data_with_filters(version),
|
| 1101 |
+
inputs=[version_selector],
|
| 1102 |
+
outputs=[leaderboard]
|
| 1103 |
+
+ [
|
| 1104 |
+
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
| 1105 |
+
],
|
| 1106 |
)
|
| 1107 |
|
| 1108 |
+
|
| 1109 |
+
# Set up the scheduler to refresh data periodically
|
| 1110 |
+
scheduler = BackgroundScheduler()
|
| 1111 |
+
scheduler.add_job(refresh_data, "interval", minutes=30)
|
| 1112 |
+
scheduler.start()
|
| 1113 |
+
|
| 1114 |
+
# Launch the app
|
| 1115 |
+
if __name__ == "__main__":
|
| 1116 |
+
demo.launch()
|
example_submission.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n    return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
+{"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
+{"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n    if (n <= 1) n\n    else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "This implementation is inefficient due to its exponential complexity. Memoization or an iterative approach is recommended.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
+{"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n    var x int = 5\n    var y int = 10\n    fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
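A quick way to inspect a file in this format (illustrative, not part of the commit) is pandas' line-delimited JSON reader; the aggregation below just averages one of the exact-match metrics per model.

import pandas as pd

# lines=True parses one JSON object per line (the JSONL convention)
df = pd.read_json("example_submission.jsonl", lines=True)
print(df.groupby("model_name")["pass_at_1"].mean().sort_values(ascending=False))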
gradio_test.ipynb
ADDED
@@ -0,0 +1,32 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "agent_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
leaderboard_data.json
CHANGED
@@ -1,23 +1,32 @@
 {
-    "
+    "entries": [
     {
-        "model_name": "
-        "
-        "
-        "
-        "
-        "
+        "model_name": "GPT-4-CodeReview",
+        "model_type": "LLM",
+        "mode": "Strict",
+        "review_model_type": "gpt-4",
+        "programming_language": "Python",
+        "comment_language": "en",
+        "topic": "Code Reliability",
+        "submission_date": "2024-10-06T12:00:00Z",
+        "version": "v0",
+        "readability": 8.5,
+        "relevance": 9.0,
+        "explanation_clarity": 7.8,
+        "problem_identification": 8.2,
+        "actionability": 8.7,
+        "completeness": 8.0,
+        "specificity": 7.5,
+        "contextual_adequacy": 8.3,
+        "consistency": 8.8,
+        "brevity": 7.2,
+        "pass_at_1": 0.75,
+        "pass_at_5": 0.88,
+        "pass_at_10": 0.92,
+        "bleu_at_10": 0.65,
+        "total_evaluations": 100
     }
-    ]
+    ],
+    "last_updated": "2024-10-06T12:00:00Z",
+    "version": "v0"
 }
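For reference (illustrative, not part of the commit), the new schema can be read with the standard library alone:

import json

with open("leaderboard_data.json", encoding="utf-8") as f:
    board = json.load(f)

# Each entry carries per-model metrics; the top-level keys track versioning.
for entry in board["entries"]:
    print(entry["model_name"], entry["pass_at_1"])
print("last updated:", board["last_updated"])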
requirements.txt
CHANGED
@@ -1,19 +1,8 @@
-matplotlib
-numpy
-pandas>=1.3.0
-python-dateutil
-tqdm
-transformers
-tokenizers>=0.15.0
-sentencepiece
-fastapi
-uvicorn
-pydantic>=2.0.0
+gradio==4.44.1
+pandas>=2.0.0
+huggingface_hub>=0.20.0
+datasets>=2.0.0
+apscheduler>=3.10.0
+python-dotenv>=1.0.0
+plotly>=5.18.0
+pydantic==2.10.6
src/about.py
CHANGED
@@ -1,48 +1,60 @@
 """
-
+Text content for the CodeReview Bench Leaderboard.
 """
 
-TITLE = "
+TITLE = """
+<div style="text-align: center; margin-bottom: 1rem">
+    <h1>CodeReview Bench Leaderboard</h1>
+</div>
+"""
 
 INTRODUCTION_TEXT = """
-
-A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
-
-- **Taxonomy Categories**: Performance across different types of code review feedback
-
--
-- **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
-- **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
-
-✨ **Comment Language Support**: Filter by the natural language of code comments
-✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
-✨ **IP-based Submissions**: Secure submission system with IP tracking
-✨ **Dark Theme**: Modern, eye-friendly interface
+## Introduction
+
+CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
+This leaderboard tracks model performance across various programming languages and review criteria,
+including readability, relevance, explanation clarity, and actionability.
+
+Models are evaluated on their ability to provide high-quality code reviews that are helpful,
+accurate, and actionable across multiple programming languages and review categories.
 """
 
-##
-
-2. **Format**: Provide scores in the specified format ranges
-3. **Reproducibility**: Include model details and evaluation setup
-4. **Quality Metrics**: Rate your model across all 10 quality dimensions
-5. **Metadata**: Specify programming language, comment language, and taxonomy focus
-"""
+LLM_BENCHMARKS_TEXT = """
+CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
+
+It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
+
+The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
+
+Learn more about automated code review evaluation and best practices.
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## Submit Your Model
+
+To add your model to the CodeReview Bench leaderboard:
+
+1. Run your evaluation using the CodeReview Bench framework
+2. Upload your results in .jsonl format using this form.
+3. Once validated, your model will appear on the leaderboard.
+
+### Requirements:
+- Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
+- Submissions should cover multiple programming languages where applicable
+- Both Russian and English comment languages are supported
+
+### ✉️✨ Ready? Upload your results below!
+"""
+
+CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
+
+CITATION_BUTTON_TEXT = """
+@misc{codereviewbench2025,
+    author = {CodeReview Bench Team},
+    title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
+    year = {2025},
+    publisher = {GitHub},
+    journal = {GitHub repository},
+    howpublished = {\\url{https://github.com/your-org/codereview-bench}}
+}
+"""
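A minimal pre-upload check matching the requirements above might look like the sketch below. It is hypothetical: the field list is taken from example_submission.jsonl, and the authoritative server-side validation lives in src/submission/submit.py and may be stricter.

import json

# Fields assumed from example_submission.jsonl; adjust to the real validator.
REQUIRED = {"model_name", "programming_language", "comment_language",
            "pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10"}

def check_submission(path: str) -> bool:
    """Return True if every JSONL record carries the expected fields."""
    ok = True
    with open(path, encoding="utf-8") as f:
        for n, line in enumerate(f, 1):
            if not line.strip():
                continue  # skip blank lines
            missing = REQUIRED - json.loads(line).keys()
            if missing:
                print(f"line {n}: missing {sorted(missing)}")
                ok = False
    return ok

print(check_submission("example_submission.jsonl"))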
src/display/css_html_js.py
CHANGED
@@ -1,306 +1,97 @@
 """
-
+CSS and styling for the CodeReview Bench Leaderboard.
 """
 
-:
-    --text-primary: #e6edf3;
-    --text-secondary: #7d8590;
-    --border-color: #30363d;
-    --accent-color: #ffffff;
-    --accent-hover: #f0f0f0;
-    --danger-color: #da3633;
-    --warning-color: #d29922;
-    --info-color: #1f6feb;
-}
-
-    background:
-    color:
-}
-
-    color: var(--text-primary) !important;
-}
-
-.
-}
-
-    background:
-}
-
-.
-    color:
-    border: none !important;
-    padding: 12px 24px !important;
-    transition: all 0.2s ease !important;
-}
-
-.
-    background: var(--bg-tertiary) !important;
-}
-
-.
-    border-bottom: 2px solid var(--accent-color) !important;
-}
-
-    border:
-    border-radius: 8px !important;
-    overflow: hidden !important;
-}
-
-.
-}
-
-.
-    border-bottom: 2px solid var(--border-color) !important;
-    padding: 12px !important;
-    font-weight: 600 !important;
-}
-
-    border-bottom: 1px solid var(--border-color) !important;
-    padding: 10px 12px !important;
-}
-
-.
-}
-
-/*
-.gradio-
-}
-
-}
-
-    color: var(--bg-primary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-    padding: 8px 16px !important;
-    font-weight: 500 !important;
-    transition: all 0.2s ease !important;
-}
-
-.gradio-container button:hover {
-    background: var(--accent-hover) !important;
-    transform: translateY(-1px) !important;
-    color: var(--bg-primary) !important;
-}
-
-.gradio-container button:active {
-    transform: translateY(0) !important;
-}
-
-/* Dropdowns */
-.gradio-container .dropdown {
-    background: var(--bg-tertiary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-}
-
-.gradio-container .dropdown-menu {
-    background: var(--bg-secondary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
-}
-
-.gradio-container .dropdown-menu .dropdown-item {
-    color: var(--text-primary) !important;
-    padding: 8px 12px !important;
-}
-
-.gradio-container .dropdown-menu .dropdown-item:hover {
-    background: var(--bg-tertiary) !important;
-}
-
-/* Sliders */
-.gradio-container .slider {
-    background: var(--bg-tertiary) !important;
-}
-
-.gradio-container .slider input[type="range"] {
-    background: var(--bg-tertiary) !important;
-}
-
-.gradio-container .slider input[type="range"]::-webkit-slider-thumb {
-    background: var(--accent-color) !important;
-    border: 2px solid var(--bg-primary) !important;
-    border-radius: 50% !important;
-    width: 18px !important;
-    height: 18px !important;
-}
-
-.gradio-container .slider input[type="range"]::-webkit-slider-track {
-    background: var(--border-color) !important;
-    border-radius: 4px !important;
-    height: 6px !important;
-}
-
-/* Accordions */
-.gradio-container .accordion {
-    background: var(--bg-secondary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 8px !important;
-    margin: 16px 0 !important;
-}
-
-.gradio-container .accordion-header {
-    background: var(--bg-tertiary) !important;
-    color: var(--text-primary) !important;
-    padding: 16px !important;
-    border-bottom: 1px solid var(--border-color) !important;
-    cursor: pointer !important;
-    font-weight: 500 !important;
-}
-
-.gradio-container .accordion-header:hover {
-    background: var(--bg-primary) !important;
-}
-
-/* Status messages */
-.gradio-container .success {
-    background: rgba(255, 255, 255, 0.1) !important;
-    color: var(--text-primary) !important;
-    border: 1px solid var(--accent-color) !important;
-    border-radius: 6px !important;
-    padding: 12px 16px !important;
-    margin: 8px 0 !important;
-}
-
-.gradio-container .error {
-    background: rgba(218, 54, 51, 0.1) !important;
-    color: var(--danger-color) !important;
-    border: 1px solid var(--danger-color) !important;
-    border-radius: 6px !important;
-    padding: 12px 16px !important;
-    margin: 8px 0 !important;
-}
-
-/* Responsive design */
-@media (max-width: 768px) {
-    .gradio-container {
-        padding: 16px !important;
-    }
-
-    .gradio-container .tab-nav button {
-        padding: 8px 16px !important;
-        font-size: 14px !important;
-    }
-
-    .gradio-container .dataframe {
-        font-size: 14px !important;
-    }
-}
-"""
-
-# Custom JavaScript for enhanced functionality
-CUSTOM_JS = """
-// Enhanced table sorting and filtering
-function enhanceTable() {
-    const tables = document.querySelectorAll('.dataframe table');
-    tables.forEach(table => {
-        // Add sorting functionality
-        const headers = table.querySelectorAll('th');
-        headers.forEach((header, index) => {
-            header.style.cursor = 'pointer';
-            header.addEventListener('click', () => sortTable(table, index));
-        });
-    });
-}
-
-function sortTable(table, columnIndex) {
-    const tbody = table.querySelector('tbody');
-    const rows = Array.from(tbody.querySelectorAll('tr'));
-
-    rows.sort((a, b) => {
-        const aText = a.cells[columnIndex].textContent.trim();
-        const bText = b.cells[columnIndex].textContent.trim();
-
-        // Try to parse as numbers first
-        const aNum = parseFloat(aText);
-        const bNum = parseFloat(bText);
-
-        if (!isNaN(aNum) && !isNaN(bNum)) {
-            return bNum - aNum; // Descending for numbers
-        }
-
-        return aText.localeCompare(bText); // Ascending for text
-    });
-
-    rows.forEach(row => tbody.appendChild(row));
-}
-
-// Auto-refresh functionality
-function autoRefresh() {
-    setInterval(() => {
-        const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
-        if (refreshBtn) {
-            refreshBtn.click();
-        }
-    }, 30000); // Refresh every 30 seconds
-}
-
-// Initialize enhancements
-document.addEventListener('DOMContentLoaded', function() {
-    enhanceTable();
-    autoRefresh();
-});
-"""
-
-# HTML components
-HEADER_HTML = """
-<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
-    <h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
-        🏆 CodeReview Leaderboard
-    </h1>
-    <p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
-        Benchmarking code review generation models across languages and categories
-    </p>
-</div>
-"""
-
-FOOTER_HTML = """
-<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
-    <p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
-        Built with ❤️ for the code review community |
-        <a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
-            GitHub
-        </a>
-    </p>
-</div>
-"""
+custom_css = """
+.markdown-text {
+    font-size: 16px !important;
+    text-align: justify !important;
+    line-height: 1.0 !important;
+    margin-top: 10px !important;
+    margin-bottom: 10px !important;
+}
+
+.tab-buttons button.selected {
+    border-color: #f4f4f5 !important;
+    background: #3f3f46 !important;
+    color: #f4f4f5 !important;
+}
+
+#citation-button textarea {
+    font-family: monospace !important;
+}
+
+.leaderboard-container {
+    margin-top: 20px;
+}
+
+.category-header {
+    font-weight: bold;
+    background-color: #f5f5f5;
+    padding: 10px;
+    margin-top: 15px;
+    border-radius: 5px;
+}
+
+.metric-name {
+    font-weight: bold;
+    color: #a1a1aa !important;
+}
+
+.model-name {
+    font-weight: bold;
+}
+
+.model-link:hover {
+    text-decoration: underline;
+    color: #ffffff !important;
+}
+
+.version-selector {
+    margin: 0 !important;
+    padding: 5px;
+    border-radius: 5px;
+}
+
+.version-selector label {
+    font-weight: bold;
+    color: #f4f4f5 !important;
+}
+
+.version-selector select {
+    border-color: #3f3f46 !important;
+    border-radius: 5px;
+}
+
+/* Make sure the version selector is properly aligned with refresh button */
+.version-selector > .block {
+    padding: 0 !important;
+}
+
+.version-selector > .block > .wrap {
+    position: relative;
+    top: -5px;
+}
+
+/* Force background/border for common layout containers */
+.gradio-row > .block,
+.gradio-column > .block,
+.form,
+.panel {
+    /* background: #18181b !important; */ /* Removed background override */
+    border-color: #27272a80 !important; /* Made border color semi-transparent */
+    border-width: 1px !important; /* Ensure border is visible */
+    border-style: solid !important;
+}
+
+/* Target the specific file upload component area */
+.gradio-file .wrap {
+    /* background: #18181b !important; */ /* Removed background override */
+    border-color: #27272a !important;
+}
+
+#refresh-button {
+    margin-top: 5px !important;
+    margin-bottom: 5px !important;
+}
+"""
src/display/formatting.py
CHANGED
@@ -1,182 +1,71 @@
 """
-Formatting utilities for
+Formatting utilities for the GuardBench Leaderboard.
 """
 
-import
-
-from datetime import datetime, timezone
-
-def format_score(score: float, precision: int = 3) -> str:
-    """Format a score with specified precision"""
-    if isinstance(score, (int, float)):
-        return f"{score:.{precision}f}"
-    return str(score)
-
-def
-    """
-
-    return
-
-def format_model_name(name: str) -> str:
-    """Format model name for display"""
-    # Remove common prefixes and make more readable
-    name = name.strip()
-    if "/" in name:
-        org, model = name.split("/", 1)
-        return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
-    return f"<strong>{name}</strong>"
-
-def
-    """
-
-    except:
-        return timestamp
-
-def format_ip_address(ip: str) -> str:
-    """Format IP address for display (partial masking)"""
-    if not ip:
-        return "Unknown"
-
-    # Mask part of IP for privacy
-    parts = ip.split(".")
-    if len(parts) == 4:
-        return f"{parts[0]}.{parts[1]}.{parts[2]}.xxx"
-    return "xxx.xxx.xxx.xxx"
-
-def
-    """
-
-        color = "#ffffff"  # White
-    elif score >= 6:
-        color = "#d0d0d0"  # Light gray
-    elif score >= 4:
-        color = "#a0a0a0"  # Gray
-    else:
-        color = "#707070"  # Dark gray
-
-    return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
-
-def format_language_badge(language: str) -> str:
-    """Format programming language as a badge"""
-    if not language or language == "All":
-        return language
-
-    # Language-specific colors
-    colors = {
-        "Python": "#3776ab",
-        "JavaScript": "#f7df1e",
-        "Java": "#ed8b00",
-        "C++": "#00599c",
-        "C#": "#239120",
-        "Go": "#00add8",
-        "Rust": "#ce422b",
-        "TypeScript": "#3178c6",
-        "PHP": "#777bb4",
-        "Ruby": "#cc342d",
-        "Swift": "#fa7343",
-        "Kotlin": "#7f52ff",
-        "Scala": "#dc322f",
-        "R": "#276dc3",
-        "MATLAB": "#e16737"
-    }
-
-    color = colors.get(language, "#6c757d")
-    return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
-
-def
-    """
-
-        "Bug Detection": "#dc3545",
-        "Code Style": "#6f42c1",
-        "Performance": "#fd7e14",
-        "Security": "#e83e8c",
-        "Maintainability": "#ffffff",
-        "Documentation": "#17a2b8",
-        "Testing": "#ffffff",
-        "Architecture": "#6c757d",
-        "Best Practices": "#007bff",
-        "Refactoring": "#ffc107"
-    }
-
-    color = colors.get(category, "#6c757d")
-    return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
-
-def format_comment_language_flag(language: str) -> str:
-    """Format comment language with flag emoji"""
-    if not language or language == "All":
-        return language
-
-    # Language-specific flags
-    flags = {
-        "English": "🇺🇸",
-        "Chinese": "🇨🇳",
-        "Spanish": "🇪🇸",
-        "French": "🇫🇷",
-        "German": "🇩🇪",
-        "Japanese": "🇯🇵",
-        "Korean": "🇰🇷",
-        "Russian": "🇷🇺",
-        "Portuguese": "🇵🇹",
-        "Italian": "🇮🇹",
-        "Dutch": "🇳🇱"
-    }
-
-    flag = flags.get(language, "🌐")
-    return f"{flag} {language}"
-
-def
-    """
-
-    #
-
-    text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)
-
-    return text
-
-def truncate_text(text: str, max_length: int = 50) -> str:
-    """Truncate text with ellipsis"""
-    if not isinstance(text, str):
-        text = str(text)
-
-    if len(text) <= max_length:
-        return text
-
-    return text[:max_length-3] + "..."
-
-def
-    """
-
-    #
-
-    else:
-        return sanitize_html(str(value))
+import pandas as pd
+import numpy as np
+
+
+def make_clickable_model(model_name: str) -> str:
+    """
+    Create a clickable link for a model name.
+    """
+    return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
+
+
+def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
+    """
+    Check if a row has no NaN values in the specified columns.
+    """
+    return ~df[columns].isna().any(axis=1)
+
+
+def format_percentage(value: float) -> str:
+    """
+    Format a value as a percentage.
+    """
+    if pd.isna(value):
+        return "N/A"
+    return f"{value * 100:.2f}%"
+
+
+def format_number(value: float, precision: int = 2) -> str:
+    """
+    Format a number with specified precision.
+    """
+    if pd.isna(value):
+        return "N/A"
+    return f"{value:.{precision}f}"
+
+
+def styled_message(message: str) -> str:
+    """
+    Format a success message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
+        ✅ {message}
+    </div>
+    """
+
+
+def styled_warning(message: str) -> str:
+    """
+    Format a warning message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
+        ⚠️ {message}
+    </div>
+    """
+
+
+def styled_error(message: str) -> str:
+    """
+    Format an error message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
+        ❌ {message}
+    </div>
+    """
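Since these helpers return raw HTML strings, they can feed a gr.Markdown output directly. A hypothetical usage inside a submit handler (not from the diff):

from src.display.formatting import styled_error, styled_message

def handle_submit(model_name: str) -> str:
    # The returned HTML renders as-is in a gr.Markdown output component.
    if not model_name:
        return styled_error("Model name is required.")
    return styled_message(f"Results for {model_name} received.")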
src/display/utils.py
CHANGED
|
@@ -1,292 +1,417 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
import
|
| 7 |
-
from
|
| 8 |
-
from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
|
| 9 |
-
from src.display.formatting import format_table_cell, format_timestamp
|
| 10 |
-
|
| 11 |
-
def filter_leaderboard_data(
|
| 12 |
-
data: List[Dict],
|
| 13 |
-
programming_language: str = "All",
|
| 14 |
-
comment_language: str = "All",
|
| 15 |
-
taxonomy_category: str = "All",
|
| 16 |
-
sort_by: str = "llm_pass_1",
|
| 17 |
-
sort_order: str = "desc"
|
| 18 |
-
) -> List[Dict]:
|
| 19 |
-
"""Filter and sort leaderboard data based on criteria"""
|
| 20 |
-
|
| 21 |
-
if not data:
|
| 22 |
-
return []
|
| 23 |
-
|
| 24 |
-
# Apply filters
|
| 25 |
-
filtered_data = data.copy()
|
| 26 |
-
|
| 27 |
-
if programming_language != "All":
|
| 28 |
-
filtered_data = [
|
| 29 |
-
entry for entry in filtered_data
|
| 30 |
-
if entry.get("programming_language", "").lower() == programming_language.lower()
|
| 31 |
-
]
|
| 32 |
-
|
| 33 |
-
if comment_language != "All":
|
| 34 |
-
filtered_data = [
|
| 35 |
-
entry for entry in filtered_data
|
| 36 |
-
if entry.get("comment_language", "").lower() == comment_language.lower()
|
| 37 |
-
]
|
| 38 |
-
|
| 39 |
-
if taxonomy_category != "All":
|
| 40 |
-
filtered_data = [
|
| 41 |
-
entry for entry in filtered_data
|
| 42 |
-
if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
|
| 43 |
-
]
|
| 44 |
-
|
| 45 |
-
# Sort data
|
| 46 |
-
reverse = sort_order.lower() == "desc"
|
| 47 |
-
|
| 48 |
-
try:
|
| 49 |
-
if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
|
| 50 |
-
filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
|
| 51 |
-
elif sort_by in QUALITY_METRICS:
|
| 52 |
-
filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
|
| 53 |
-
else:
|
| 54 |
-
filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
|
| 55 |
-
except Exception as e:
|
| 56 |
-
print(f"Error sorting data: {e}")
|
| 57 |
-
# Default sort by pass@1
|
| 58 |
-
filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)
|
| 59 |
-
|
| 60 |
-
return filtered_data
|
| 61 |
-
|
| 62 |
-
def get_main_leaderboard_data(
|
| 63 |
-
data: List[Dict],
|
| 64 |
-
programming_language: str = "All",
|
| 65 |
-
comment_language: str = "All",
|
| 66 |
-
taxonomy_category: str = "All",
|
| 67 |
-
sort_by: str = "llm_pass_1"
|
| 68 |
-
) -> List[List[str]]:
|
| 69 |
-
"""Get formatted main leaderboard table data"""
|
| 70 |
-
|
| 71 |
-
filtered_data = filter_leaderboard_data(
|
| 72 |
-
data, programming_language, comment_language, taxonomy_category, sort_by
|
| 73 |
-
)
|
| 74 |
-
|
| 75 |
-
table_rows = []
|
| 76 |
-
for entry in filtered_data:
|
| 77 |
-
row = [
|
| 78 |
-
format_table_cell(entry.get("model_name", ""), "model"),
|
| 79 |
-
format_table_cell(entry.get("programming_language", ""), "programming language"),
|
| 80 |
-
format_table_cell(entry.get("comment_language", ""), "comment language"),
|
| 81 |
-
format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
|
| 82 |
-
format_table_cell(entry.get("bleu", 0), "bleu"),
|
| 83 |
-
format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
|
| 84 |
-
format_table_cell(entry.get("llm_pass_5", 0), "pass@5"),
|
| 85 |
-
format_table_cell(entry.get("llm_pass_10", 0), "pass@10"),
|
| 86 |
-
]
|
| 87 |
-
table_rows.append(row)
|
| 88 |
-
|
| 89 |
-
return table_rows
|
| 90 |
-
|
| 91 |
-
def get_quality_metrics_data(
|
| 92 |
-
data: List[Dict],
|
| 93 |
-
programming_language: str = "All",
|
| 94 |
-
comment_language: str = "All",
|
| 95 |
-
taxonomy_category: str = "All",
|
| 96 |
-
sort_by: str = "llm_pass_1"
|
| 97 |
-
) -> List[List[str]]:
|
| 98 |
-
"""Get formatted quality metrics table data"""
|
| 99 |
-
|
| 100 |
-
filtered_data = filter_leaderboard_data(
|
| 101 |
-
data, programming_language, comment_language, taxonomy_category, sort_by
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
table_rows = []
|
| 105 |
-
for entry in filtered_data:
|
| 106 |
-
metrics = entry.get("metrics", {})
|
| 107 |
-
row = [format_table_cell(entry.get("model_name", ""), "model")]
|
| 108 |
-
|
| 109 |
-
for metric in QUALITY_METRICS:
|
| 110 |
-
formatted_value = format_table_cell(metrics.get(metric, 0), metric.replace("_", " "))
|
| 111 |
-
row.append(formatted_value)
|
| 112 |
-
|
| 113 |
-
table_rows.append(row)
|
| 114 |
-
|
| 115 |
-
-    return table_rows
-
-def get_submission_history_data(
-    data: List[Dict],
-    programming_language: str = "All",
-    comment_language: str = "All",
-    taxonomy_category: str = "All",
-    limit: int = 50
-) -> List[List[str]]:
-    """Get formatted submission history data"""
-
-    filtered_data = filter_leaderboard_data(
-        data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
-    )
-
-    # Limit results
-    filtered_data = filtered_data[:limit]
-
-    table_rows = []
-    for entry in filtered_data:
-        row = [
-            format_table_cell(entry.get("model_name", ""), "model"),
-            format_table_cell(entry.get("programming_language", ""), "programming language"),
-            format_table_cell(entry.get("comment_language", ""), "comment language"),
-            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
-            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
-            format_timestamp(entry.get("submission_date", "")),
-            entry.get("submission_ip", "").split(".")[0] + ".xxx.xxx.xxx" if entry.get("submission_ip") else "Unknown"
-        ]
-        table_rows.append(row)
-
-    return table_rows
 
-def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
-    """Get summary statistics for the leaderboard"""
-
-    if not data:
-        return {
-            "total_models": 0,
-            "total_submissions": 0,
-            "avg_pass_1": 0,
-            "best_model": "None",
-            "languages_covered": 0,
-            "categories_covered": 0
-        }
-
-    # Calculate statistics
-    total_models = len(set(entry.get("model_name", "") for entry in data))
-    total_submissions = len(data)
-
-    pass_1_scores = [entry.get("llm_pass_1", 0) for entry in data if entry.get("llm_pass_1") is not None]
-    avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
-
-    best_entry = max(data, key=lambda x: x.get("llm_pass_1", 0)) if data else None
-    best_model = best_entry.get("model_name", "None") if best_entry else "None"
-
-    languages_covered = len(set(entry.get("programming_language", "") for entry in data if entry.get("programming_language")))
-    categories_covered = len(set(entry.get("taxonomy_category", "") for entry in data if entry.get("taxonomy_category")))
-
-    return {
-        "total_models": total_models,
-        "total_submissions": total_submissions,
-        "avg_pass_1": avg_pass_1,
-        "best_model": best_model,
-        "languages_covered": languages_covered,
-        "categories_covered": categories_covered
-    }
-
-def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
-    """Validate submission data"""
-
-    required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
-
-    # Check required fields
-    for field in required_fields:
-        if not data.get(field):
-            return False, f"Missing required field: {field}"
-
-    # Validate scores
-    score_fields = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
-    for field in score_fields:
-        value = data.get(field)
-        if value is None:
-            return False, f"Missing score: {field}"
-        if not isinstance(value, (int, float)):
-            return False, f"Invalid score format: {field}"
-        if not 0 <= value <= 1:
-            return False, f"Score out of range (0-1): {field}"
-
-    # Validate metrics
-    metrics = data.get("metrics", {})
-    for metric in QUALITY_METRICS:
-        value = metrics.get(metric)
-        if value is None:
-            return False, f"Missing metric: {metric}"
-        if not isinstance(value, (int, float)):
-            return False, f"Invalid metric format: {metric}"
-        if not 0 <= value <= 10:
-            return False, f"Metric out of range (0-10): {metric}"
-
-    # Validate language and category choices
-    if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
-        return False, "Invalid programming language"
-
-    if data.get("comment_language") not in COMMENT_LANGUAGES:
-        return False, "Invalid comment language"
-
-    if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
-        return False, "Invalid taxonomy category"
-
-    return True, "Valid submission"
 
-
-"""
 """
+Utility classes and functions for the CodeReview Bench Leaderboard display.
 """
 
+from dataclasses import dataclass, field, fields
+from enum import Enum, auto
+from typing import List, Optional
 
+class Mode(Enum):
+    """Inference mode for the review model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
+class ModelType(Enum):
+    """Model types for the leaderboard."""
+    Unknown = auto()
+    OpenSource = auto()
+    ClosedSource = auto()
+    API = auto()
+
+    def to_str(self, separator: str = "-") -> str:
+        """Convert enum to string with separator."""
+        if self == ModelType.Unknown:
+            return "Unknown"
+        elif self == ModelType.OpenSource:
+            return f"Open{separator}Source"
+        elif self == ModelType.ClosedSource:
+            return f"Closed{separator}Source"
+        elif self == ModelType.API:
+            return "API"
+        return "Unknown"
+
+
+class ReviewModelType(str, Enum):
+    """Review model types for the leaderboard."""
+    GPT_4 = "gpt-4"
+    GPT_3_5 = "gpt-3.5-turbo"
+    CLAUDE = "claude"
+    LLAMA = "llama"
+    GEMINI = "gemini"
+    CUSTOM = "custom"
+
+    def __str__(self):
+        """String representation of the review model type."""
+        return self.value
+
+
+class Precision(Enum):
+    """Model precision types."""
+    Unknown = auto()
+    float16 = auto()
+    bfloat16 = auto()
+    float32 = auto()
+    int8 = auto()
+    int4 = auto()
+    NA = auto()
+
+    def __str__(self):
+        """String representation of the precision type."""
+        return self.name
+
+
+class WeightType(Enum):
+    """Model weight types."""
+    Original = auto()
+    Delta = auto()
+    Adapter = auto()
+
+    def __str__(self):
+        """String representation of the weight type."""
+        return self.name
+
+
+@dataclass
+class ColumnInfo:
+    """Information about a column in the leaderboard."""
+    name: str
+    display_name: str
+    type: str = "text"
+    hidden: bool = False
+    never_hidden: bool = False
+    displayed_by_default: bool = True
+
+
+@dataclass
+class CodeReviewBenchColumn:
+    """Columns for the CodeReview Bench leaderboard."""
+    # Core metadata
+    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="model_name",
+        display_name="Model",
+        never_hidden=True,
+        displayed_by_default=True
+    ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
+    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="model_type",
+        display_name="Access_Type",
+        displayed_by_default=True
+    ))
+    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="submission_date",
+        display_name="Submission_Date",
+        displayed_by_default=False
+    ))
+    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="version",
+        display_name="Version",
+        displayed_by_default=False
+    ))
+    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="review_model_type",
+        display_name="Type",
+        displayed_by_default=False
+    ))
+    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="base_model",
+        display_name="Base Model",
+        displayed_by_default=False
+    ))
+    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="revision",
+        display_name="Revision",
+        displayed_by_default=False
+    ))
+    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="precision",
+        display_name="Precision",
+        displayed_by_default=False
+    ))
+    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="weight_type",
+        display_name="Weight Type",
+        displayed_by_default=False
+    ))
+    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="topic",
+        display_name="Topic",
+        displayed_by_default=True
+    ))
+
+    # LLM-based multimetric scores
+    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="readability",
+        display_name="Readability",
+        type="number",
+        displayed_by_default=True
+    ))
+    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="relevance",
+        display_name="Relevance",
+        type="number",
+        displayed_by_default=True
+    ))
+    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="explanation_clarity",
+        display_name="Explanation_Clarity",
+        type="number",
+        displayed_by_default=True
+    ))
+    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="problem_identification",
+        display_name="Problem_Identification",
+        type="number",
+        displayed_by_default=True
+    ))
+    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="actionability",
+        display_name="Actionability",
+        type="number",
+        displayed_by_default=True
+    ))
+    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="completeness",
+        display_name="Completeness",
+        type="number",
+        displayed_by_default=True
+    ))
+    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="specificity",
+        display_name="Specificity",
+        type="number",
+        displayed_by_default=True
+    ))
+    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="contextual_adequacy",
+        display_name="Contextual_Adequacy",
+        type="number",
+        displayed_by_default=True
+    ))
+    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="consistency",
+        display_name="Consistency",
+        type="number",
+        displayed_by_default=True
+    ))
+    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="brevity",
+        display_name="Brevity",
+        type="number",
+        displayed_by_default=True
+    ))
+
+    # LLM-based-exact-match metrics
+    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_1",
+        display_name="Pass@1",
+        type="number",
+        displayed_by_default=True
+    ))
+    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_5",
+        display_name="Pass@5",
+        type="number",
+        displayed_by_default=True
+    ))
+    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_10",
+        display_name="Pass@10",
+        type="number",
+        displayed_by_default=True
+    ))
+    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="bleu_at_10",
+        display_name="BLEU@10",
+        type="number",
+        displayed_by_default=True
+    ))
+
+    # Overall aggregated metrics
+    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="overall_score",
+        display_name="Overall_Score",
+        type="number",
+        displayed_by_default=True
+    ))
+    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="multimetric_average",
+        display_name="Multimetric_Average",
+        type="number",
+        displayed_by_default=True
+    ))
+    exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="exact_match_average",
+        display_name="Exact_Match_Average",
+        type="number",
+        displayed_by_default=True
+    ))
+    total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="total_evaluations",
+        display_name="Total_Evaluations",
+        type="number",
+        displayed_by_default=True
+    ))
+
+    # Language-specific metrics (Russian)
+    ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="ru_readability",
+        display_name="RU_Readability",
+        type="number",
+        displayed_by_default=False
+    ))
+    ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="ru_relevance",
+        display_name="RU_Relevance",
+        type="number",
+        displayed_by_default=False
+    ))
+    ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="ru_overall_score",
+        display_name="RU_Overall_Score",
+        type="number",
+        displayed_by_default=False
+    ))
+
+    # Language-specific metrics (English)
+    en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="en_readability",
+        display_name="EN_Readability",
+        type="number",
+        displayed_by_default=False
+    ))
+    en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="en_relevance",
+        display_name="EN_Relevance",
+        type="number",
+        displayed_by_default=False
+    ))
+    en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="en_overall_score",
+        display_name="EN_Overall_Score",
+        type="number",
+        displayed_by_default=False
+    ))
+
+
+# Create instances for easy access
+CODEREVIEW_COLUMN = CodeReviewBenchColumn()
+
+# Extract column lists for different views
+COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
+DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
+                if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
+
+# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
+def reorder_display_cols():
+    cols = DISPLAY_COLS
+    if 'model_name' in cols and 'mode' in cols:
+        cols.remove('mode')
+        model_name_index = cols.index('model_name')
+        cols.insert(model_name_index + 1, 'mode')
+    return cols
+DISPLAY_COLS = reorder_display_cols()
+
+METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
+               if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
+HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
+               if getattr(CODEREVIEW_COLUMN, f.name).hidden]
+NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
+                     if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
+
+# Categories for CodeReview Bench (Programming Languages)
+CATEGORIES = [
+    'Python',
+    'Java',
+    'Scala',
+    'Go'
+]
+
+# Language taxonomies for CodeReview Bench
+COMMENT_LANGUAGES = [
+    'ru',  # Russian
+    'en'   # English
+]
+
+# Topics for CodeReview Bench
+TOPICS = [
+    'Code Reliability',
+    'Coding Standards',
+    'Code Organization',
+    'Performance Issues',
+    'Validation',
+    'Variables'
+]
+
+# Example categories
+EXAMPLE_CATEGORIES = [
+    'Bug_Fix',
+    'Code_Style',
+    'Performance',
+    'Security',
+    'Refactoring',
+    'Documentation',
+    'Testing',
+    'Architecture',
+    'Other'
+]
+
+# Metrics for CodeReview Bench
+MULTIMETRIC_METRICS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity"
+]
+
+EXACT_MATCH_METRICS = [
+    "pass_at_1",
+    "pass_at_5",
+    "pass_at_10",
+    "bleu_at_10"
+]
+
+def get_all_column_choices():
+    """
+    Get all available column choices for the multiselect dropdown.
+
+    Returns:
+        List of tuples with (column_name, display_name) for all columns.
+    """
+    column_choices = []
+
+    default_visible_columns = get_default_visible_columns()
+
+    for f in fields(CODEREVIEW_COLUMN):
+        column_info = getattr(CODEREVIEW_COLUMN, f.name)
+        # Create a tuple with both the internal name and display name
+        if column_info.name not in default_visible_columns:
+            column_choices.append((column_info.name, column_info.display_name))
+
+    return column_choices
+
+def get_default_visible_columns():
+    """
+    Get the list of column names that should be visible by default.
+
+    Returns:
+        List of column names that are displayed by default.
+    """
+    return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
+            if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
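A quick sanity check of the new column utilities (a sketch, not part of the commit; it assumes the Space's root directory is on PYTHONPATH so that src.display.utils imports, and the expected values follow directly from the definitions above):

from src.display.utils import (
    Mode, ModelType, DISPLAY_COLS,
    get_all_column_choices, get_default_visible_columns,
)

print(str(Mode.CoT))                     # "CoT"
print(ModelType.OpenSource.to_str(" "))  # "Open Source"

# reorder_display_cols() keeps 'mode' directly after 'model_name'
assert DISPLAY_COLS[:2] == ["model_name", "mode"]

# The column picker offers exactly the columns hidden by default
choices = {name for name, _ in get_all_column_choices()}
assert choices.isdisjoint(get_default_visible_columns())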
src/envs.py
CHANGED
@@ -1,106 +1,27 @@
-"""
-Environment configuration and constants
-"""
-
 import os
-from pathlib import Path
-
-# Data paths
-DATA_DIR = Path("data")
-LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
-SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
-
-# Create data directory if it doesn't exist
-DATA_DIR.mkdir(exist_ok=True)
-
-# Programming languages supported
-PROGRAMMING_LANGUAGES = [
-    "All",
-    "Python",
-    "JavaScript",
-    "Java",
-    "C++",
-    "C#",
-    "Go",
-    "Rust",
-    "TypeScript",
-    "PHP",
-    "Ruby",
-    "Swift",
-    "Kotlin",
-    "Scala",
-    "R",
-    "MATLAB",
-    "Other"
-]
 
-# Comment languages supported
-COMMENT_LANGUAGES = [
-    "All",
-    "English",
-    "Chinese",
-    "Spanish",
-    "French",
-    "German",
-    "Japanese",
-    "Korean",
-    "Russian",
-    "Portuguese",
-    "Italian",
-    "Dutch",
-    "Other"
-]
 
-# Taxonomy categories
-TAXONOMY_CATEGORIES = [
-    "Security",
-    "Maintainability",
-    "Documentation",
-    "Testing",
-    "Architecture",
-    "Best Practices",
-    "Refactoring",
-    "Other"
-]
 
-# Quality metrics
-QUALITY_METRICS = [
-    "readability",
-    "relevance",
-    "explanation_clarity",
-    "problem_identification",
-    "actionability",
-    "completeness",
-    "specificity",
-    "contextual_adequacy",
-    "consistency",
-    "brevity"
-]
 
-# Default leaderboard data
-DEFAULT_DATA = [{
-    "model_name": "example/model",
-    "programming_language": "Python",
-    "comment_language": "English",
-    "taxonomy_category": "Bug Detection",
-    "bleu": 0.5,
-    "llm_pass_1": 0.5,
-    "llm_pass_5": 0.5,
-    "llm_pass_10": 0.5,
-    "metrics": {
-        "readability": 5, "relevance": 5, "explanation_clarity": 5,
-        "problem_identification": 5, "actionability": 5, "completeness": 5,
-        "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
-    },
-    "submission_ip": "127.0.0.1",
-    "submission_date": "2024-01-01T00:00:00Z"
-}]
+from huggingface_hub import HfApi
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Hugging Face configuration
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+OWNER = os.environ.get("OWNER", "codereview-bench")  # Change to your org
+SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
+ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
+ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
+
+# Repository IDs
+REPO_ID = f"{OWNER}/codereview-bench"
+RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
+
+# Cache paths
+CACHE_PATH = os.getenv("HF_HOME", ".")
+DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+
+# Local data paths
+LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
+
+# HF API instance
+API = HfApi(token=TOKEN)
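For reference, a minimal sketch of how this configuration resolves at import time (the environment values here are placeholders, not real credentials; in the Space, load_dotenv() would pick them up from a local .env file instead):

import os

os.environ.setdefault("OWNER", "my-org")     # placeholder
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder, not a real token

from src.envs import REPO_ID, RESULTS_DATASET_ID, LEADERBOARD_FILE

print(REPO_ID)             # my-org/codereview-bench
print(RESULTS_DATASET_ID)  # my-org/codereview-bench-results unless overridden
print(LEADERBOARD_FILE)    # <project root>/data/leaderboard.json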
src/leaderboard/processor.py
CHANGED
@@ -1,306 +1,271 @@
 """
-
+Process CodeReview Bench leaderboard data and submissions.
 """
 
 import json
-import
-
-from datetime import datetime
-from
-
-        if not self.leaderboard_path.exists():
-            self.save_leaderboard_data(DEFAULT_DATA)
 
-        """Load leaderboard data from storage"""
-        try:
-            with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            return data.get("leaderboard", [])
-        except Exception as e:
-            print(f"Error loading leaderboard: {e}")
-            return DEFAULT_DATA.copy()
 
-                "last_updated": datetime.now(timezone.utc).isoformat(),
-                "total_entries": len(data)
-            }
-
-            with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
-                json.dump(to_store, f, indent=2, ensure_ascii=False)
-
-            return True
-        except Exception as e:
-            print(f"Error saving leaderboard: {e}")
-            return False
 
-        """Load submission log from storage"""
-        try:
-            with open(self.submissions_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            return data.get("submissions", [])
-        except Exception as e:
-            print(f"Error loading submission log: {e}")
-            return []
 
-        try:
-            to_store = {
-                "submissions": submissions,
-                "last_updated": datetime.now(timezone.utc).isoformat(),
-                "total_submissions": len(submissions)
-            }
-
-            with open(self.submissions_path, 'w', encoding='utf-8') as f:
-                json.dump(to_store, f, indent=2, ensure_ascii=False)
-
-            return True
-        except Exception as e:
-            print(f"Error saving submission log: {e}")
-            return False
 
 else:
-            ])
-
-            return {
-                **basic_stats,
-                "recent_submissions_7d": recent_submissions,
-                "total_logged_submissions": len(submissions),
-                "last_updated": datetime.now(timezone.utc).isoformat()
-            }
-
-        except Exception as e:
-            print(f"Error getting leaderboard stats: {e}")
-            return {}
-
-    def backup_data(self) -> bool:
-        """Create backup of current data"""
-        try:
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            backup_dir = Path("backups")
-            backup_dir.mkdir(exist_ok=True)
-
-            # Backup leaderboard
-            if self.leaderboard_path.exists():
-                backup_path = backup_dir / f"leaderboard_{timestamp}.json"
-                with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
-                    dst.write(src.read())
-
-            # Backup submissions
-            if self.submissions_path.exists():
-                backup_path = backup_dir / f"submissions_{timestamp}.json"
-                with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
-                    dst.write(src.read())
-
-            return True
-
-        except Exception as e:
-            print(f"Error creating backup: {e}")
-            return False
-
-    def export_data(self, format_type: str = "json") -> str:
-        """Export leaderboard data in specified format"""
-        try:
-            from src.display.utils import export_leaderboard_data
-
-            data = self.load_leaderboard_data()
-            return export_leaderboard_data(data, format_type)
-
-        except Exception as e:
-            print(f"Error exporting data: {e}")
-            return f"Export failed: {str(e)}"
-
-    def validate_data_integrity(self) -> Dict[str, Any]:
-        """Validate data integrity and return report"""
-        try:
-            data = self.load_leaderboard_data()
-            submissions = self.load_submission_log()
-
-            issues = []
-
-            # Check for duplicate models
-            model_names = [entry.get("model_name") for entry in data]
-            duplicates = [name for name in model_names if model_names.count(name) > 1]
-            if duplicates:
-                issues.append(f"Duplicate models found: {set(duplicates)}")
-
-            # Check for missing required fields
-            required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
-            for i, entry in enumerate(data):
-                missing = [field for field in required_fields if not entry.get(field)]
-                if missing:
-                    issues.append(f"Entry {i}: Missing fields {missing}")
-
-            # Check score ranges
-            for i, entry in enumerate(data):
-                scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
-                for score in scores:
-                    value = entry.get(score)
-                    if value is not None and (value < 0 or value > 1):
-                        issues.append(f"Entry {i}: {score} out of range: {value}")
-
-            return {
-                "is_valid": len(issues) == 0,
-                "issues": issues,
-                "total_entries": len(data),
-                "total_submissions": len(submissions),
-                "check_date": datetime.now(timezone.utc).isoformat()
-            }
-
-        except Exception as e:
-            return {
-                "is_valid": False,
-                "issues": [f"Validation failed: {str(e)}"],
-                "total_entries": 0,
-                "total_submissions": 0,
-                "check_date": datetime.now(timezone.utc).isoformat()
-            }
+import os
+import pandas as pd
+from datetime import datetime
+from typing import Dict, List, Tuple, Optional
+import numpy as np
+
+from src.display.utils import (
+    CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
+    MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
+)
+
+
+def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
+    """
+    Process a JSONL submission file for CodeReview Bench.
+
+    Args:
+        file_path: Path to the JSONL submission file
+
+    Returns:
+        Tuple of (entries_list, message)
+    """
+    try:
+        entries = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    entry = json.loads(line)
+
+                    # Validate required fields
+                    required_fields = ['model_name', 'programming_language', 'comment_language']
+                    missing_fields = [field for field in required_fields if field not in entry]
+                    if missing_fields:
+                        return [], f"Missing required fields {missing_fields} in line {line_num}"
+
+                    # Validate metrics exist
+                    has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
+                    has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
+
+                    if not has_multimetric and not has_exact_match:
+                        return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
+
+                    entries.append(entry)
+
+                except json.JSONDecodeError as e:
+                    return [], f"Invalid JSON in line {line_num}: {e}"
+
+        if not entries:
+            return [], "No valid entries found in submission file"
+
+        return entries, f"Successfully processed {len(entries)} entries"
+
+    except Exception as e:
+        return [], f"Error processing submission: {e}"
+
+
+def calculate_overall_score(entry: Dict) -> float:
+    """
+    Calculate overall score for a CodeReview Bench entry.
+
+    Args:
+        entry: Dictionary containing model evaluation results
+
+    Returns:
+        Overall score as float
+    """
+    # Calculate multimetric average
+    multimetric_scores = []
+    for metric in MULTIMETRIC_METRICS:
+        if metric in entry and isinstance(entry[metric], (int, float)):
+            multimetric_scores.append(entry[metric])
+
+    multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
+
+    # Calculate exact match average
+    exact_match_scores = []
+    for metric in EXACT_MATCH_METRICS:
+        if metric in entry and isinstance(entry[metric], (int, float)):
+            exact_match_scores.append(entry[metric])
+
+    exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
+
+    # Weighted combination (can be adjusted based on requirements)
+    overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
+
+    return overall_score
+
+
+def load_leaderboard_data(file_path: str) -> Dict:
+    """
+    Load the leaderboard data from a JSON file.
+    """
+    if not os.path.exists(file_path):
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
+
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+
+    # Ensure version field exists
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
+    return data
+
+
+def save_leaderboard_data(data: Dict, file_path: str) -> None:
+    """
+    Save the leaderboard data to a JSON file.
+    """
+    # Ensure the directory exists
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    # Update the last_updated timestamp
+    data["last_updated"] = datetime.now().isoformat()
+
+    # Ensure version is set
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
+    with open(file_path, 'w') as f:
+        json.dump(data, f, indent=2)
+
+
+def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
+    """
+    Convert leaderboard data to a pandas DataFrame for display.
+    """
+    rows = []
+
+    for entry in leaderboard_data.get("entries", []):
+        model_name = entry.get("model_name", "Unknown Model")
+
+        # Extract basic metadata
+        row = {
+            "model_name": model_name,
+            "model_type": entry.get("model_type", "Unknown"),
+            "mode": entry.get("mode", "Strict"),
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0"),
+            "review_model_type": entry.get("review_model_type", "custom").lower()
+        }
+
+        # Add additional metadata fields if present
+        for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
+            if key in entry:
+                row[key] = entry[key]
+
+        # Add multimetric scores
+        for metric in MULTIMETRIC_METRICS:
+            if metric in entry:
+                row[metric] = entry[metric]
+            else:
+                row[metric] = pd.NA
+
+        # Add exact match metrics
+        for metric in EXACT_MATCH_METRICS:
+            if metric in entry:
+                row[metric] = entry[metric]
+            else:
+                row[metric] = pd.NA
+
+        # Calculate aggregated metrics
+        multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
+        exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
+
+        if multimetric_scores:
+            row["multimetric_average"] = np.mean(multimetric_scores)
+        else:
+            row["multimetric_average"] = pd.NA
+
+        if exact_match_scores:
+            row["exact_match_average"] = np.mean(exact_match_scores)
+        else:
+            row["exact_match_average"] = pd.NA
+
+        # Calculate overall score
+        row["overall_score"] = calculate_overall_score(entry)
+
+        # Add language-specific metrics if available
+        for lang in COMMENT_LANGUAGES:
+            for metric in ["readability", "relevance", "overall_score"]:
+                lang_key = f"{lang}_{metric}"
+                if lang_key in entry:
+                    row[lang_key] = entry[lang_key]
+                else:
+                    row[lang_key] = pd.NA
+
+        # Add evaluation count
+        row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
+
+        rows.append(row)
+
+    # Create DataFrame and sort by overall score
+    df = pd.DataFrame(rows)
+
+    # Ensure all expected columns exist
+    for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
+        if metric not in df.columns:
+            df[metric] = pd.NA
+
+    # Sort by overall score (descending)
+    if not df.empty:
+        df = df.sort_values(by="overall_score", ascending=False, na_position='last')
+
+    # Ensure summary columns exist
+    summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
+    for col in summary_cols:
+        if col not in df.columns:
+            df[col] = pd.NA
+
+    return df
+
+
+def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
+    """
+    Add new entries to the leaderboard, replacing any with the same model name.
+    """
+    # Create a mapping of existing entries by model name and version
+    existing_entries = {
+        (entry["model_name"], entry.get("version", "v0")): i
+        for i, entry in enumerate(leaderboard_data.get("entries", []))
+    }
+
+    # Process each new entry
+    for new_entry in new_entries:
+        model_name = new_entry.get("model_name")
+        version = new_entry.get("version", "v0")
+
+        # Add calculated metrics
+        new_entry["overall_score"] = calculate_overall_score(new_entry)
+
+        # Calculate averages
+        multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
+        exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
+
+        if multimetric_scores:
+            new_entry["multimetric_average"] = np.mean(multimetric_scores)
+        if exact_match_scores:
+            new_entry["exact_match_average"] = np.mean(exact_match_scores)
+
+        if (model_name, version) in existing_entries:
+            # Replace existing entry
+            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
+        else:
+            # Add new entry
+            if "entries" not in leaderboard_data:
+                leaderboard_data["entries"] = []
+            leaderboard_data["entries"].append(new_entry)
+
+    # Update the last_updated timestamp
+    leaderboard_data["last_updated"] = datetime.now().isoformat()
+
+    return leaderboard_data
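To illustrate the scoring pipeline above, a small round-trip sketch (entry values are invented for illustration; field names follow the definitions in src/display/utils.py). Note that the fixed 0.7/0.3 weighting mixes the 0-10 multimetric scale with the 0-1 exact-match scale, which the in-code comment flags as adjustable:

from src.leaderboard.processor import (
    add_entries_to_leaderboard, calculate_overall_score, leaderboard_to_dataframe,
)

entry = {
    "model_name": "example/model",
    "mode": "Strict",
    "readability": 8.0,   # multimetric, 0-10 scale
    "relevance": 7.0,
    "pass_at_1": 0.40,    # exact-match, 0-1 scale
    "bleu_at_10": 0.25,
}

# 0.7 * mean(8.0, 7.0) + 0.3 * mean(0.40, 0.25) = 5.25 + 0.0975 = 5.3475
print(calculate_overall_score(entry))

board = add_entries_to_leaderboard({"entries": []}, [entry])
df = leaderboard_to_dataframe(board)
print(df[["model_name", "overall_score", "multimetric_average"]])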
src/populate.py
ADDED
@@ -0,0 +1,188 @@
+"""
+Populate the CodeReview Bench leaderboard from HuggingFace datasets.
+"""
+
+import json
+import os
+import pandas as pd
+import tempfile
+from typing import Dict, List, Optional
+from datetime import datetime
+import numpy as np
+
+from huggingface_hub import hf_hub_download, HfApi
+from datasets import load_dataset
+
+from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
+from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
+from src.leaderboard.processor import leaderboard_to_dataframe
+
+
+def get_latest_leaderboard(version="v0") -> Optional[Dict]:
+    """
+    Get the latest leaderboard data from HuggingFace dataset.
+    Fallback to local JSON file if HF download fails or is unavailable.
+    """
+    # First try to fetch from HuggingFace Hub
+    try:
+        leaderboard_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"leaderboards/leaderboard_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        with open(leaderboard_path, 'r') as f:
+            return json.load(f)
+    except Exception as hf_err:
+        print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
+
+    # Fallback: attempt to load a local leaderboard_data.json located at the project root
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    local_path_candidates = [
+        os.path.join(project_root, "leaderboard_data.json"),  # legacy path in root
+        os.path.join(project_root, "data", "leaderboard.json"),  # path defined in envs.py
+    ]
+
+    for local_path in local_path_candidates:
+        if os.path.exists(local_path):
+            try:
+                with open(local_path, 'r') as f:
+                    return json.load(f)
+            except Exception as local_err:
+                print(f"Error loading local leaderboard file {local_path}: {local_err}")
+
+    # If nothing found, return None
+    return None
+
+
+def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
+    """
+    Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
+    """
+    try:
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
+        entry_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        with open(entry_path, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"Error downloading model entry: {e}")
+        return None
+
+
+def get_all_entries(version="v0") -> List[Dict]:
+    """
+    Get all entries from the HuggingFace dataset.
+    """
+    try:
+        api = HfApi(token=TOKEN)
+        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+
+        all_entries = []
+        for entry_file in entry_files:
+            try:
+                entry_path = hf_hub_download(
+                    repo_id=RESULTS_DATASET_ID,
+                    filename=entry_file,
+                    repo_type="dataset",
+                    token=TOKEN
+                )
+                with open(entry_path, 'r') as f:
+                    entry_data = json.load(f)
+                    all_entries.append(entry_data)
+            except Exception as e:
+                print(f"Error loading entry {entry_file}: {e}")
+
+        return all_entries
+    except Exception as e:
+        print(f"Error getting all entries: {e}")
+        return []
+
+
+def get_leaderboard_df(version="v0") -> pd.DataFrame:
+    """
+    Get the leaderboard data as a DataFrame.
+    """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
+
+    # Convert to DataFrame
+    return leaderboard_to_dataframe(leaderboard_data)
+
+
+def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
+    """
+    Get the leaderboard data filtered by a specific programming language category.
+    """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
+
+    # Filter entries to only include those with data for the specified programming language
+    filtered_entries = []
+    for entry in leaderboard_data.get("entries", []):
+        # Check if entry has data for this programming language
+        programming_language = entry.get("programming_language", "").lower()
+        if programming_language == category.lower() or category.lower() == "other":
+            # For "other" category, include entries that don't match any specific language
+            if category.lower() == "other":
+                if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]:  # Exclude "Other" from check
+                    filtered_entries.append(entry)
+            else:
+                filtered_entries.append(entry)
+
+    # Create a new leaderboard data structure with the filtered entries
+    filtered_leaderboard = {
+        "entries": filtered_entries,
+        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
+        "version": version
+    }
+
+    # Convert to DataFrame
+    return leaderboard_to_dataframe(filtered_leaderboard)
+
+
+def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
+    """
+    Get detailed data for a specific model and mode.
+    """
+    entry = get_model_entry(model_name, mode, version)
+    if entry:
+        return entry
+    leaderboard_data = get_latest_leaderboard(version)
+    if leaderboard_data:
+        for entry in leaderboard_data.get("entries", []):
+            if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
+                return entry
+    return {}
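The typical read path, sketched (this assumes RESULTS_DATASET_ID points at a populated results dataset; otherwise the local fallback or the empty-DataFrame branch above runs):

from src.populate import get_leaderboard_df, get_category_leaderboard_df

df = get_leaderboard_df(version="v0")
print(df.shape)  # empty frame with DISPLAY_COLS as columns when no data is found

python_df = get_category_leaderboard_df("Python", version="v0")
print(python_df[["model_name", "overall_score"]].head())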
src/submission/submit.py
CHANGED
@@ -1,386 +1,184 @@
 """
-
 """
 
-import
-import
-
-from datetime import datetime
-from
-from src.leaderboard.processor import LeaderboardProcessor
-from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
 
-        taxonomy_category: str,
-        bleu: float,
-        llm_pass_1: float,
-        llm_pass_5: float,
-        llm_pass_10: float,
-        readability: int,
-        relevance: int,
-        explanation_clarity: int,
-        problem_identification: int,
-        actionability: int,
-        completeness: int,
-        specificity: int,
-        contextual_adequacy: int,
-        consistency: int,
-        brevity: int,
-    ) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
-        """Handle model submission with full validation"""
-
-        try:
-            # Get client IP
-            client_ip = self.get_client_ip(request)
-
-            # Check rate limiting
-            rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
-            if not rate_ok:
-                return current_data, [], [], f"❌ {rate_msg}"
-
-            # Validate model name
-            name_valid, name_msg = self.validate_model_name(model_name)
-            if not name_valid:
-                return current_data, [], [], f"❌ {name_msg}"
-
-            # Validate scores
-            scores = {
-                "bleu": bleu,
-                "llm_pass_1": llm_pass_1,
-                "llm_pass_5": llm_pass_5,
-                "llm_pass_10": llm_pass_10
-            }
-            scores_valid, scores_msg = self.validate_scores(scores)
-            if not scores_valid:
-                return current_data, [], [], f"❌ {scores_msg}"
-
-            # Validate metrics
-            metrics = {
-                "readability": readability,
-                "relevance": relevance,
-                "explanation_clarity": explanation_clarity,
-                "problem_identification": problem_identification,
-                "actionability": actionability,
-                "completeness": completeness,
-                "specificity": specificity,
-                "contextual_adequacy": contextual_adequacy,
-                "consistency": consistency,
-                "brevity": brevity,
-            }
-            metrics_valid, metrics_msg = self.validate_metrics(metrics)
-            if not metrics_valid:
-                return current_data, [], [], f"❌ {metrics_msg}"
-
-            # Create submission data
-            submission_data = {
-                "model_name": model_name.strip(),
-                "programming_language": programming_language,
-                "comment_language": comment_language,
-                "taxonomy_category": taxonomy_category,
-                "bleu": bleu,
-                "llm_pass_1": llm_pass_1,
-                "llm_pass_5": llm_pass_5,
-                "llm_pass_10": llm_pass_10,
-                "metrics": metrics
-            }
-
-            # Submit to processor
-            success, message = self.processor.add_submission(submission_data, client_ip)
-
-            if success:
-                # Load updated data
-                updated_data = self.processor.load_leaderboard_data()
-
-                # Format tables
-                main_table = get_main_leaderboard_data(updated_data)
-                quality_table = get_quality_metrics_data(updated_data)
-
                return updated_data, main_table, quality_table, message
-            else:
-                return current_data, [], [], message
-
-        except Exception as e:
-            print(f"Error in submission: {e}")
-            return current_data, [], [], f"❌ Submission failed: {str(e)}"
-
-    def get_submission_form_components(self):
-        """Create gradio components for submission form"""
 
-        ""
-        )
-
-        )
-
-            minimum=0.0,
-            maximum=1.0,
-            step=0.001,
-            info="Success rate in 10 attempts"
-        )
-
-        gr.Markdown("### 📋 Quality Metrics (0 - 10)")
-        with gr.Row():
-            readability = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Readability",
-                info="How readable are the generated reviews?"
-            )
-            relevance = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Relevance",
-                info="How relevant to the code changes?"
-            )
-            explanation_clarity = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Explanation Clarity",
-                info="How clear are the explanations?"
-
)
|
| 299 |
-
problem_identification = gr.Slider(
|
| 300 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 301 |
-
label="Problem Identification",
|
| 302 |
-
info="How well does it identify issues?"
|
| 303 |
-
)
|
| 304 |
-
actionability = gr.Slider(
|
| 305 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 306 |
-
label="Actionability",
|
| 307 |
-
info="How actionable are the suggestions?"
|
| 308 |
-
)
|
| 309 |
-
|
| 310 |
-
with gr.Row():
|
| 311 |
-
completeness = gr.Slider(
|
| 312 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 313 |
-
label="Completeness",
|
| 314 |
-
info="How complete are the reviews?"
|
| 315 |
-
)
|
| 316 |
-
specificity = gr.Slider(
|
| 317 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 318 |
-
label="Specificity",
|
| 319 |
-
info="How specific are the comments?"
|
| 320 |
-
)
|
| 321 |
-
contextual_adequacy = gr.Slider(
|
| 322 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 323 |
-
label="Contextual Adequacy",
|
| 324 |
-
info="How well does it understand context?"
|
| 325 |
-
)
|
| 326 |
-
consistency = gr.Slider(
|
| 327 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 328 |
-
label="Consistency",
|
| 329 |
-
info="How consistent across reviews?"
|
| 330 |
-
)
|
| 331 |
-
brevity = gr.Slider(
|
| 332 |
-
minimum=0, maximum=10, value=5, step=1,
|
| 333 |
-
label="Brevity",
|
| 334 |
-
info="How concise are the reviews?"
|
| 335 |
-
)
|
| 336 |
-
|
| 337 |
-
submit_btn = gr.Button("🚀 Submit Model", variant="primary")
|
| 338 |
-
status_msg = gr.Markdown("")
|
| 339 |
-
|
| 340 |
-
# Return all components for use in the main app
|
| 341 |
-
return {
|
| 342 |
-
"model_name": model_name,
|
| 343 |
-
"programming_language": programming_language,
|
| 344 |
-
"comment_language": comment_language,
|
| 345 |
-
"taxonomy_category": taxonomy_category,
|
| 346 |
-
"bleu": bleu,
|
| 347 |
-
"pass1": pass1,
|
| 348 |
-
"pass5": pass5,
|
| 349 |
-
"pass10": pass10,
|
| 350 |
-
"readability": readability,
|
| 351 |
-
"relevance": relevance,
|
| 352 |
-
"explanation_clarity": explanation_clarity,
|
| 353 |
-
"problem_identification": problem_identification,
|
| 354 |
-
"actionability": actionability,
|
| 355 |
-
"completeness": completeness,
|
| 356 |
-
"specificity": specificity,
|
| 357 |
-
"contextual_adequacy": contextual_adequacy,
|
| 358 |
-
"consistency": consistency,
|
| 359 |
-
"brevity": brevity,
|
| 360 |
-
"submit_btn": submit_btn,
|
| 361 |
-
"status_msg": status_msg,
|
| 362 |
-
}
|
| 363 |
-
|
| 364 |
-
def get_submission_history(self, ip_address: str) -> List[List[str]]:
|
| 365 |
-
"""Get submission history for display"""
|
| 366 |
try:
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
row = [
|
| 372 |
-
sub.get("model_name", ""),
|
| 373 |
-
sub.get("programming_language", ""),
|
| 374 |
-
sub.get("comment_language", ""),
|
| 375 |
-
sub.get("taxonomy_category", ""),
|
| 376 |
-
f"{sub.get('scores', {}).get('llm_pass_1', 0):.3f}",
|
| 377 |
-
sub.get("submission_date", "").split("T")[0] if sub.get("submission_date") else "",
|
| 378 |
-
sub.get("status", "")
|
| 379 |
-
]
|
| 380 |
-
table_data.append(row)
|
| 381 |
-
|
| 382 |
-
return table_data
|
| 383 |
-
|
| 384 |
-
except Exception as e:
|
| 385 |
-
print(f"Error getting submission history: {e}")
|
| 386 |
-
return []
|
 """
+Handle submissions to the CodeReview Bench leaderboard.
 """
 
+import json
+import os
+import tempfile
+from datetime import datetime
+from typing import Dict, List, Tuple
 
+from huggingface_hub import HfApi
+from datasets import load_dataset
+
+from src.display.formatting import styled_error, styled_message
+from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
+from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
+
+
+def validate_submission(file_path: str) -> Tuple[bool, str]:
+    """
+    Validate a submission file.
+    """
+    try:
+        entries, message = process_jsonl_submission(file_path)
+        if not entries:
+            return False, message
+        return True, "Submission is valid"
+    except Exception as e:
+        return False, f"Error validating submission: {e}"
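Before uploading, the same validator can be exercised locally; a minimal sketch, assuming a JSONL results file on disk (the filename is illustrative):

# Illustrative pre-flight check using validate_submission as defined above.
ok, msg = validate_submission("example_submission.jsonl")
if not ok:
    raise SystemExit(f"Submission rejected: {msg}")
print(msg)  # "Submission is valid"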
+
+
+def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
+    """
+    Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
+    """
+    try:
+        # Create safe model name for file path
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
+
+        # Create entry path in entries folder
+        entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
+
+        # Save entry to temporary file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
+            json.dump(entry, temp_file, indent=2)
+            temp_path = temp_file.name
+
+        # Upload file
+        api = HfApi(token=TOKEN)
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo=entry_path,
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
+        )
+
+        os.unlink(temp_path)
+        return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
+    except Exception as e:
+        return False, f"Error submitting entry to dataset: {e}"
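The entry path convention above flattens the model name and mode into a single filename; a quick sketch of the resulting path for a hypothetical model, mirroring the sanitization in submit_entry_to_hub:

# Hypothetical inputs; the sanitization is copied from submit_entry_to_hub.
model_name_safe = "meta-llama/Llama-3-8B".replace("/", "_").replace(" ", "_")
mode_safe = str("strict").replace("/", "_").replace(" ", "_").lower()
print(f"entries/entry_{model_name_safe}_{mode_safe}_v0.json")
# -> entries/entry_meta-llama_Llama-3-8B_strict_v0.json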
+
+
+def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
+    """
+    Submit updated leaderboard to the HuggingFace dataset.
+    """
+    try:
+        # Create leaderboard data
+        leaderboard_data = {
+            "entries": entries,
+            "last_updated": datetime.now().isoformat(),
+            "version": version
+        }
+
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
+            json.dump(leaderboard_data, temp_file, indent=2)
+            temp_path = temp_file.name
+
+        # Upload file
+        api = HfApi(token=TOKEN)
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo=f"leaderboards/leaderboard_{version}.json",
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Update leaderboard for version {version}"
+        )
+
+        os.unlink(temp_path)
+        return True, "Leaderboard updated successfully"
+    except Exception as e:
+        return False, f"Error updating leaderboard: {e}"
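The uploaded leaderboard file is plain JSON with exactly the three top-level keys built above; a sketch of its shape, where entry fields beyond model_name and mode depend on the evaluation output and are illustrative:

# Shape of leaderboards/leaderboard_v0.json as written by submit_leaderboard_to_hub.
leaderboard_data = {
    "entries": [
        {"model_name": "org/my-model", "mode": "default"},  # plus per-metric fields
    ],
    "last_updated": "2025-01-01T12:00:00",  # datetime.now().isoformat()
    "version": "v0",
}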
+
+
+def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
+    """
+    Process a submission to the CodeReview Bench leaderboard.
+    """
+    try:
+        # Validate submission
+        is_valid, validation_message = validate_submission(file_path)
+        if not is_valid:
+            return styled_error(validation_message)
+
+        # Process the submission entries
+        entries, message = process_jsonl_submission(file_path)
+        if not entries:
+            return styled_error(f"Failed to process submission: {message}")
+
+        # Upload raw submission file
+        model_name = metadata.get("model_name", "unknown")
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
 
+        api = HfApi(token=TOKEN)
+        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=submission_path,
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Add raw submission for {model_name}"
+        )
+
+        # Process entries and add metadata
+        processed_entries = []
+        for entry in entries:
+            # Add metadata to entry
+            entry.update({
+                "model_name": metadata.get("model_name"),
+                "model_type": metadata.get("model_type"),
+                "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
+                "mode": metadata.get("mode"),
+                "base_model": metadata.get("base_model"),
+                "revision": metadata.get("revision"),
+                "precision": metadata.get("precision"),
+                "weight_type": metadata.get("weight_type"),
+                "version": version,
+                "submission_date": datetime.now().isoformat()
+            })
+            processed_entries.append(entry)
+
+        # Submit entries to entries folder
+        for entry in processed_entries:
+            success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
+            if not success:
+                return styled_error(message)
+
+        # Get all entries from HF dataset and update leaderboard
+        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+
+        all_entries = []
+        for entry_file in entry_files:
+            try:
+                entry_path = api.hf_hub_download(
+                    repo_id=RESULTS_DATASET_ID,
+                    filename=entry_file,
+                    repo_type="dataset",
+                )
+                with open(entry_path, 'r') as f:
+                    entry_data = json.load(f)
+                all_entries.append(entry_data)
+            except Exception as e:
+                print(f"Error loading entry {entry_file}: {e}")
+
+        # Update leaderboard with all entries
+        success, message = submit_leaderboard_to_hub(all_entries, version)
+        if not success:
+            return styled_error(message)
+
+        return styled_message("Submission successful! Model evaluated and leaderboard updated.")
+
+    except Exception as e:
+        return styled_error(f"Error processing submission: {e}")
+    finally:
+        # Clean up temporary files if they exist
         try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+        except:
+            pass
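Taken together, a hedged sketch of how a caller (for example, the Gradio submit handler in app.py) might drive process_submission; the path and metadata values below are illustrative, but the keys match the metadata.get(...) calls above:

# Illustrative end-to-end call. Note that process_submission deletes the
# submission file in its finally block, so pass a disposable copy.
metadata = {
    "model_name": "org/my-model",
    "model_type": "open-source",
    "review_model_type": "custom",
    "mode": "default",
    "base_model": "org/base-model",
    "revision": "main",
    "precision": "bf16",
    "weight_type": "Original",
}
html = process_submission("/tmp/results.jsonl", metadata, version="v0")
print(html)  # styled_message(...) on success, styled_error(...) on failure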