make_leaderboard #1, opened by kenkaneki
- .env.template +0 -6
- .gitignore +5 -44
- .gitmodules +0 -3
- .gradio/certificate.pem +0 -31
- README.md +28 -209
- SUBMISSION_EXAMPLE.md +0 -266
- app.py +156 -1068
- data/.gitkeep +0 -1
- data/leaderboard_data.json +0 -30
- data/submissions.json +0 -5
- example_submission.jsonl +0 -4
- gradio_test.ipynb +0 -32
- leaderboard_data.json +0 -32
- requirements.txt +16 -8
- src/__init__.py +0 -1
- src/about.py +54 -41
- src/display/__init__.py +0 -1
- src/display/css_html_js.py +74 -66
- src/display/formatting.py +15 -59
- src/display/utils.py +90 -397
- src/envs.py +16 -18
- src/leaderboard/__init__.py +0 -1
- src/leaderboard/processor.py +0 -271
- src/leaderboard/read_evals.py +196 -0
- src/populate.py +54 -184
- src/submission/__init__.py +0 -1
- src/submission/check_validity.py +99 -0
- src/submission/submit.py +114 -179
.env.template
DELETED
@@ -1,6 +0,0 @@
-HF_TOKEN="your_huggingface_write_token"
-OWNER="your_huggingface_username_or_org"
-RESULTS_DATASET_ID="your_username/guardbench-results"
-SUBMITTER_TOKEN="your_secret_submission_token"
-ADMIN_USERNAME="admin"
-ADMIN_PASSWORD="password" # Change this!
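For reference, the template above documented the configuration the old app expected from its environment. Below is a minimal sketch, assuming python-dotenv, of how such a template is typically loaded; the variable names come from the deleted file, but the loading code is an illustration and not the repository's actual `src/envs.py`.

```python
# Hypothetical loader for the variables documented in .env.template.
# Assumes python-dotenv is installed; the real src/envs.py may differ.
import os
from dotenv import load_dotenv

load_dotenv()  # pick up a local .env file if one exists

HF_TOKEN = os.getenv("HF_TOKEN")                    # write token used to push results
OWNER = os.getenv("OWNER")                          # Hugging Face user or org owning the Space
RESULTS_DATASET_ID = os.getenv("RESULTS_DATASET_ID")
SUBMITTER_TOKEN = os.getenv("SUBMITTER_TOKEN")      # shared secret for submissions
ADMIN_USERNAME = os.getenv("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD")        # change from the template default
```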
.gitignore
CHANGED
@@ -1,52 +1,13 @@
-
+auto_evals/
+venv/
 __pycache__/
-*.py[cod]
-*$py.class
-*.so
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-.venv/
-*.egg-info/
-.installed.cfg
-*.egg
-.gradio/
-
-# Environment variables
 .env
-
-
-venv/
-ENV/
-
-# IDE
-.idea/
+.ipynb_checkpoints
+*ipynb
 .vscode/
-*.swp
-*.swo
-
-# OS
-.DS_Store
-Thumbs.db
 
-# Hugging Face cache
 eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
-
-# Data files
-data/
-
-# Versioned leaderboard files
-data/leaderboard_v*.json
+logs/
.gitmodules
DELETED
@@ -1,3 +0,0 @@
-[submodule "guard-bench-submodule"]
-path = guard-bench-submodule
-url = https://github.com/whitecircle-ai/circle-guard-bench.git
.gradio/certificate.pem
DELETED
@@ -1,31 +0,0 @@
------BEGIN CERTIFICATE-----
-MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
-TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
-cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
-WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
-ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
-MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
-h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
-0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
-A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
-T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
-B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
-B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
-KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
-OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
-jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
-qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
-rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
-HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
-hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
-ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
-3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
-NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
-ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
-TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
-jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
-oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
-4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
-mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
-emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
------END CERTIFICATE-----
README.md
CHANGED
@@ -1,227 +1,46 @@
 ---
 title: CodeReviewBench
-emoji:
-colorFrom:
+emoji: 🥇
+colorFrom: green
 colorTo: indigo
 sdk: gradio
-
-sdk_version: 4.44.1
 app_file: app.py
 pinned: true
-
-
-
-- openai/gpt-4o
-- claude-3-7-sonnet
-- deepseek/deepseek-r1
-
+license: mit
+short_description: Results of the benchmark presented in the CodeReviewBench paper
+sdk_version: 5.19.0
 ---
 
+# Start the configuration
-#
-
-A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
-## Features
-
-- **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
-- **Dual Language Comments**: Supports both Russian and English comment languages
-- **Comprehensive Metrics**:
-  - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
-  - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
-- **Interactive Visualization**: Compare model performance across categories with radar plots
-- **Easy Submission**: Submit your model results via web interface
-
-## Metrics
-
-### LLM-based Multimetric
-
-- **Readability**: How easy the review is to understand
-- **Relevance**: How relevant the review is to the code
-- **Explanation Clarity**: How clear the explanations are
-- **Problem Identification**: How well problems are identified
-- **Actionability**: How actionable the suggestions are
-- **Completeness**: How complete the review is
-- **Specificity**: How specific the feedback is
-- **Contextual Adequacy**: How well the review fits the context
-- **Consistency**: How consistent the review style is
-- **Brevity**: How concise the review is
-
-### Exact-Match Metrics
-
-- **Pass@1**: Percentage of correct reviews on first attempt
-- **Pass@5**: Percentage of correct reviews in top 5 attempts
-- **Pass@10**: Percentage of correct reviews in top 10 attempts
-- **BLEU@10**: BLEU score for top 10 review candidates
-
-## Programming Languages Supported
-
-- Python
-- JavaScript
-- Java
-- C++
-- C#
-- TypeScript
-- Go
-- Rust
-- Swift
-- Kotlin
-- Ruby
-- PHP
-- C
-- Scala
-- R
-- Dart
-- Other
-
-## Comment Languages
-
-- Russian (ru)
-- English (en)
 
+Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-
-- Bug Fix
-- Code Style
-- Performance
-- Security
-- Refactoring
-- Documentation
-- Testing
-- Architecture
-- Other
-
-## Installation
-
-```bash
-pip install -r requirements.txt
-```
-
-## Usage
-
-```bash
-python app.py
-```
-
-## Submission Format
-
-Submit your results as a JSONL file where each line contains:
 
+Results files should have the following format and be stored as json files:
 ```json
 {
-
-
-
-
-
-
-
-
-
-
-
-
-
-  "pass_at_1": 0.75,
-  "pass_at_5": 0.88,
-  "pass_at_10": 0.92,
-  "bleu_at_10": 0.65,
-  "total_evaluations": 100
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
 }
 ```
 
+Request files are created automatically by this tool.
-
-
-Set the following environment variables:
-
-
-## Citation
 
+If you encounter problems on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
-
-- **Multi-tab Interface**: Organized navigation with dedicated sections
-- **Advanced Filtering**: Real-time filtering by multiple criteria
-- **Dark Theme**: Modern, GitHub-inspired dark interface
-- **IP-based Submissions**: Secure submission tracking
-- **Comprehensive Analytics**: Detailed performance insights
-- **Data Export**: Multiple export formats
-- **Rate Limiting**: Anti-spam protection
 
+# Code logic for more complex edits
-
-
-- **Modular Architecture**: Clean separation of concerns
-- **Type Safety**: Full type annotations throughout
-- **Error Handling**: Comprehensive error handling and logging
-- **Data Validation**: Multi-layer validation with Pydantic
-- **Performance**: Optimized data processing and display
-
-## 📈 Metrics & Evaluation
-
-### Performance Metrics
-
-- **BLEU**: Text similarity score (0.0-1.0)
-- **Pass@1**: Success rate in single attempt (0.0-1.0)
-- **Pass@5**: Success rate in 5 attempts (0.0-1.0)
-- **Pass@10**: Success rate in 10 attempts (0.0-1.0)
-
-### Quality Dimensions
-
-1. **Readability**: How clear and readable are the reviews?
-2. **Relevance**: How relevant to the code changes?
-3. **Explanation Clarity**: How well does it explain issues?
-4. **Problem Identification**: How effectively does it identify problems?
-5. **Actionability**: How actionable are the suggestions?
-6. **Completeness**: How thorough are the reviews?
-7. **Specificity**: How specific are the comments?
-8. **Contextual Adequacy**: How well does it understand context?
-9. **Consistency**: How consistent across different reviews?
-10. **Brevity**: How concise without losing important information?
-
-## 🔒 Security Features
-
-### Rate Limiting
-
-- **5 submissions per IP per 24 hours**
-- **Automatic IP tracking and logging**
-- **Graceful error handling for rate limits**
-
-### Data Validation
-
-- **Model name format validation**
-- **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
-- **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
-- **Required field validation**
-
-### Audit Trail
-
-- **Complete submission logging**
-- **IP address tracking (partially masked for privacy)**
-- **Timestamp recording**
-- **Data integrity checks**
-
-## 🤝 Contributing
-
-1. Fork the repository
-2. Create a feature branch
-3. Make your changes
-4. Add tests if applicable
-5. Submit a pull request
-
-## 📄 License
-
-This project is licensed under the MIT License - see the LICENSE file for details.
-
-## 🙏 Acknowledgments
-
-- Inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench)
-- Built with [Gradio](https://gradio.app/) for the web interface
-- Thanks to the open-source community for tools and inspiration
-
-## 📞 Support
-
-For questions, issues, or contributions:
-
-- Open an issue on GitHub
-- Check the documentation
-- Contact the maintainers
-
----
 
-
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
SUBMISSION_EXAMPLE.md
DELETED
@@ -1,266 +0,0 @@
-# 📝 Model Submission Example
-
-This guide shows you exactly how to submit your code review model to the leaderboard.
-
-## 🚀 Step-by-Step Submission Process
-
-### 1. **Access the Submission Form**
-
-- Open the CodeReview Leaderboard in your browser
-- Navigate to the **📝 Submit Model** tab
-- Click on the "📝 Submit New Model Results" accordion to expand the form
-
-### 2. **Fill in Basic Information**
-
-#### **Model Name** ✨
-
-```
-Example: microsoft/CodeT5-base
-Format: organization/model-name
-```
-
-#### **Programming Language** 🔍
-
-```
-Select: Python
-(or Java, JavaScript, C++, Go, Rust, etc.)
-```
-
-#### **Comment Language** 🌍
-
-```
-Select: English
-(or Chinese, Spanish, French, German, etc.)
-```
-
-#### **Taxonomy Category** 🏷️
-
-```
-Select: Bug Detection
-(or Security, Performance, Code Style, etc.)
-```
-
-### 3. **Performance Scores** (0.0 - 1.0)
-
-#### **BLEU Score**
-
-```
-Example: 0.742
-Range: 0.0 to 1.0
-Description: Measures similarity between generated and reference reviews
-```
-
-#### **Pass@1**
-
-```
-Example: 0.685
-Range: 0.0 to 1.0
-Description: Success rate when model gets 1 attempt
-```
-
-#### **Pass@5**
-
-```
-Example: 0.834
-Range: 0.0 to 1.0
-Description: Success rate when model gets 5 attempts
-```
-
-#### **Pass@10**
-
-```
-Example: 0.901
-Range: 0.0 to 1.0
-Description: Success rate when model gets 10 attempts
-```
-
-### 4. **Quality Metrics** (0 - 10)
-
-Rate your model across these 10 dimensions:
-
-#### **Readability: 8**
-
-```
-How clear and readable are the generated code reviews?
-Scale: 0 (unreadable) to 10 (very clear)
-```
-
-#### **Relevance: 7**
-
-```
-How relevant are the reviews to the actual code changes?
-Scale: 0 (irrelevant) to 10 (highly relevant)
-```
-
-#### **Explanation Clarity: 8**
-
-```
-How well does the model explain identified issues?
-Scale: 0 (unclear) to 10 (very clear explanations)
-```
-
-#### **Problem Identification: 7**
-
-```
-How effectively does it identify real code problems?
-Scale: 0 (misses issues) to 10 (finds all problems)
-```
-
-#### **Actionability: 6**
-
-```
-How actionable and useful are the suggestions?
-Scale: 0 (not actionable) to 10 (very actionable)
-```
-
-#### **Completeness: 7**
-
-```
-How thorough and complete are the reviews?
-Scale: 0 (incomplete) to 10 (comprehensive)
-```
-
-#### **Specificity: 6**
-
-```
-How specific are the comments and suggestions?
-Scale: 0 (too generic) to 10 (very specific)
-```
-
-#### **Contextual Adequacy: 7**
-
-```
-How well does it understand the code context?
-Scale: 0 (ignores context) to 10 (perfect context understanding)
-```
-
-#### **Consistency: 6**
-
-```
-How consistent is the model across different code reviews?
-Scale: 0 (inconsistent) to 10 (very consistent)
-```
-
-#### **Brevity: 5**
-
-```
-How concise are the reviews without losing important information?
-Scale: 0 (too verbose/too brief) to 10 (perfect length)
-```
-
-### 5. **Submit Your Model**
-
-- Click the **🚀 Submit Model** button
-- Wait for validation and processing
-- Check for success/error message
-
-## 📋 Complete Example Submission
-
-Here's a real example of submitting the CodeT5-base model:
-
-```yaml
-Model Information:
-  Model Name: "microsoft/CodeT5-base"
-  Programming Language: "Python"
-  Comment Language: "English"
-  Taxonomy Category: "Bug Detection"
-
-Performance Scores:
-  BLEU Score: 0.742
-  Pass@1: 0.685
-  Pass@5: 0.834
-  Pass@10: 0.901
-
-Quality Metrics:
-  Readability: 8
-  Relevance: 7
-  Explanation Clarity: 8
-  Problem Identification: 7
-  Actionability: 6
-  Completeness: 7
-  Specificity: 6
-  Contextual Adequacy: 7
-  Consistency: 6
-  Brevity: 5
-```
-
-## 🔒 Security & Rate Limiting
-
-### **IP-based Rate Limiting**
-
-- **5 submissions per IP address per 24 hours**
-- Submissions are tracked by your IP address
-- Rate limit resets every 24 hours
-
-### **Validation Rules**
-
-- Model name must follow `organization/model` format
-- All performance scores must be between 0.0 and 1.0
-- All quality metrics must be between 0 and 10
-- Pass@1 ≤ Pass@5 ≤ Pass@10 (logical consistency)
-
-## ✅ After Submission
-
-### **Immediate Feedback**
-
-You'll see one of these messages:
-
-#### **Success ✅**
-
-```
-✅ Submission recorded successfully!
-```
-
-#### **Error Examples ❌**
-
-```
-❌ Rate limit exceeded: 5/5 submissions in 24 hours
-❌ Model name contains invalid characters
-❌ Pass@1 score cannot be higher than Pass@5
-❌ Score BLEU out of range: 1.2 (must be between 0 and 1)
-```
-
-### **View Your Results**
-
-- Your model will appear in the **🏆 Leaderboard** tab
-- Use filters to find your specific submission
-- Check the **📈 Analytics** tab for submission history
-
-## 🎯 Tips for Better Submissions
-
-### **Model Naming**
-
-```
-✅ Good: "microsoft/CodeT5-base"
-✅ Good: "facebook/bart-large"
-✅ Good: "my-org/custom-model-v2"
-❌ Bad: "my model"
-❌ Bad: "model@v1.0"
-```
-
-### **Performance Scores**
-
-- Be honest and accurate with your evaluations
-- Use proper evaluation methodology
-- Ensure Pass@k scores are logically consistent
-- Document your evaluation process
-
-### **Quality Metrics**
-
-- Rate based on actual model performance
-- Consider multiple test cases
-- Be objective in your assessment
-- Document your rating criteria
-
-## 🤝 Need Help?
-
-If you encounter issues:
-
-1. Check the error message for specific guidance
-2. Verify all fields are filled correctly
-3. Ensure you haven't exceeded rate limits
-4. Contact maintainers if problems persist
-
----
-
-**Ready to submit your model? Head to the 📝 Submit Model tab and follow this guide!** 🚀
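The validation rules listed in the deleted guide (organization/model naming, 0.0-1.0 performance scores, 0-10 quality metrics, and Pass@1 ≤ Pass@5 ≤ Pass@10) can be captured in a short check like the sketch below. It is an illustration of those rules with invented names, not the repository's actual `src/submission/check_validity.py`.

```python
# Hedged sketch of the validation rules the guide describes; all field names are illustrative.
import re

def validate_submission(sub: dict) -> list[str]:
    """Return a list of human-readable problems; an empty list means the entry looks valid."""
    errors = []
    if not re.fullmatch(r"[\w.\-]+/[\w.\-]+", sub.get("model_name", "")):
        errors.append("Model name must follow organization/model format")
    for key in ("bleu", "pass_at_1", "pass_at_5", "pass_at_10"):
        value = sub.get(key)
        if value is None or not 0.0 <= value <= 1.0:
            errors.append(f"Score {key} out of range: {value} (must be between 0 and 1)")
    if not errors and not (sub["pass_at_1"] <= sub["pass_at_5"] <= sub["pass_at_10"]):
        errors.append("Pass@1 score cannot be higher than Pass@5 or Pass@10")
    for key in ("readability", "relevance", "brevity"):  # quality metrics, rated 0-10
        value = sub.get(key)
        if value is not None and not 0 <= value <= 10:
            errors.append(f"Quality metric {key} must be between 0 and 10")
    return errors

print(validate_submission({"model_name": "microsoft/CodeT5-base",
                           "bleu": 0.742, "pass_at_1": 0.685,
                           "pass_at_5": 0.834, "pass_at_10": 0.901,
                           "readability": 8}))
```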
app.py
CHANGED
@@ -1,20 +1,8 @@
-"""
-CodeReview Leaderboard - Inspired by CodeReviewBench
-A comprehensive leaderboard for code review generation models
-"""
-
-import os
-import json
-import tempfile
-import logging
 import gradio as gr
 import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
 from apscheduler.schedulers.background import BackgroundScheduler
-
-from gradio.themes.utils import fonts, colors
-from dataclasses import fields, dataclass
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -26,1091 +14,191 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    CATEGORIES,
-    COMMENT_LANGUAGES,
-    EXAMPLE_CATEGORIES,
-    TOPICS,
     ModelType,
-    Precision,
     WeightType,
-    get_all_column_choices,
-    get_default_visible_columns,
-)
-from src.display.formatting import styled_message, styled_error, styled_warning
-from src.envs import (
-    ADMIN_USERNAME,
-    ADMIN_PASSWORD,
-    RESULTS_DATASET_ID,
-    SUBMITTER_TOKEN,
-    TOKEN,
-    DATA_PATH,
-)
-from src.populate import get_leaderboard_df, get_category_leaderboard_df
-from src.submission.submit import process_submission

The rest of this hunk deletes the remainder of the old single-file implementation: the logging configuration and data-directory setup, the CURRENT_VERSION constant and initial leaderboard load, the custom dark Gradio theme definition, the ColumnInfo dataclass and its column-selection helpers (update_column_choices, get_initial_columns), the init_leaderboard table builder, search_filter_leaderboard, refresh_data_with_filters, submit_results, refresh_data, the Plotly radar-plot comparison code with update_model_choices and update_visualization, the CATEGORY_DISPLAY_MAP, and the Blocks UI wiring (version selector, search box, comment-language / programming-language / topic / column filters, per-category tabs, the refresh, search, filter and version callbacks, and update_columns). The kept lines show the new app still building the `demo` Blocks page with `gr.HTML(TITLE)` and its row containers.
|
| 912 |
-
# Get the current data with ALL columns preserved
|
| 913 |
-
main_df = get_leaderboard_df(version=version_selector.value)
|
| 914 |
-
|
| 915 |
-
# Get category dataframes with ALL columns preserved
|
| 916 |
-
category_dfs = [
|
| 917 |
-
get_category_leaderboard_df(
|
| 918 |
-
category, version=version_selector.value
|
| 919 |
)
|
| 920 |
-
for category in CATEGORIES
|
| 921 |
-
]
|
| 922 |
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
)
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
):
|
| 934 |
-
internal_selected_columns = [
|
| 935 |
-
"model_name"
|
| 936 |
-
] + internal_selected_columns
|
| 937 |
-
|
| 938 |
-
# Initialize the main leaderboard with the selected columns
|
| 939 |
-
# We're passing the internal_selected_columns directly to preserve the selection
|
| 940 |
-
main_leaderboard = init_leaderboard(
|
| 941 |
-
main_df, internal_selected_columns
|
| 942 |
-
)
|
| 943 |
-
|
| 944 |
-
# Initialize category dataframes with the same selected columns
|
| 945 |
-
# This ensures consistency across all tabs
|
| 946 |
-
category_leaderboards = []
|
| 947 |
-
for df in category_dfs:
|
| 948 |
-
# Use the same selected columns for each category
|
| 949 |
-
# init_leaderboard will automatically handle filtering to columns that exist
|
| 950 |
-
category_leaderboards.append(
|
| 951 |
-
init_leaderboard(df, internal_selected_columns)
|
| 952 |
)
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
# Connect column selector to update function
|
| 967 |
-
column_selector.change(
|
| 968 |
-
fn=update_columns,
|
| 969 |
-
inputs=[column_selector],
|
| 970 |
-
outputs=[leaderboard]
|
| 971 |
-
+ [
|
| 972 |
-
category_tabs.children[i].children[0]
|
| 973 |
-
for i in range(1, len(CATEGORIES) + 1)
|
| 974 |
-
],
|
| 975 |
-
)
|
| 976 |
-
|
| 977 |
-
# with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
|
| 978 |
-
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 979 |
-
|
| 980 |
-
with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
|
| 981 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 982 |
-
|
| 983 |
-
with gr.Row():
|
| 984 |
-
# with gr.Column(scale=3):
|
| 985 |
-
# gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
|
| 986 |
-
with gr.Column(scale=1):
|
| 987 |
-
# Add version selector specifically for the submission tab
|
| 988 |
-
submission_version_selector = gr.Dropdown(
|
| 989 |
-
choices=BENCHMARK_VERSIONS,
|
| 990 |
-
label="Benchmark Version",
|
| 991 |
-
value=CURRENT_VERSION,
|
| 992 |
-
interactive=True,
|
| 993 |
-
elem_classes="version-selector",
|
| 994 |
-
visible=False,
|
| 995 |
-
)
|
| 996 |
-
|
| 997 |
-
with gr.Row():
|
| 998 |
-
with gr.Column():
|
| 999 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
| 1000 |
-
mode_selector = gr.Dropdown(
|
| 1001 |
-
choices=[m.name for m in Mode],
|
| 1002 |
-
label="Mode",
|
| 1003 |
-
multiselect=False,
|
| 1004 |
-
value=None,
|
| 1005 |
-
interactive=True,
|
| 1006 |
-
)
|
| 1007 |
-
revision_name_textbox = gr.Textbox(
|
| 1008 |
-
label="Revision commit", placeholder="main"
|
| 1009 |
-
)
|
| 1010 |
-
model_type = gr.Dropdown(
|
| 1011 |
-
choices=[
|
| 1012 |
-
t.to_str("-")
|
| 1013 |
-
for t in ModelType
|
| 1014 |
-
if t != ModelType.Unknown and t != ModelType.ClosedSource
|
| 1015 |
-
],
|
| 1016 |
-
label="Model type",
|
| 1017 |
-
multiselect=False,
|
| 1018 |
-
value=None,
|
| 1019 |
-
interactive=True,
|
| 1020 |
-
)
|
| 1021 |
-
review_model_type = gr.Dropdown(
|
| 1022 |
-
choices=[t.name for t in ReviewModelType],
|
| 1023 |
-
label="Review model type",
|
| 1024 |
-
multiselect=False,
|
| 1025 |
-
value=ReviewModelType.CUSTOM.name,
|
| 1026 |
-
interactive=True,
|
| 1027 |
-
)
|
| 1028 |
-
programming_language_selector = gr.Dropdown(
|
| 1029 |
-
choices=["Python", "Java", "Scala", "Go"],
|
| 1030 |
-
label="Programming Language",
|
| 1031 |
-
multiselect=False,
|
| 1032 |
-
value=None,
|
| 1033 |
-
interactive=True,
|
| 1034 |
-
)
|
| 1035 |
-
comment_language_selector = gr.Dropdown(
|
| 1036 |
-
choices=["en", "ru"],
|
| 1037 |
-
label="Comment Language",
|
| 1038 |
-
multiselect=False,
|
| 1039 |
-
value="en",
|
| 1040 |
-
interactive=True,
|
| 1041 |
-
)
|
| 1042 |
-
|
| 1043 |
-
with gr.Column():
|
| 1044 |
-
precision = gr.Dropdown(
|
| 1045 |
-
choices=[
|
| 1046 |
-
i.name for i in Precision if i != Precision.Unknown
|
| 1047 |
-
],
|
| 1048 |
-
label="Precision",
|
| 1049 |
-
multiselect=False,
|
| 1050 |
-
value="float16",
|
| 1051 |
-
interactive=True,
|
| 1052 |
-
)
|
| 1053 |
-
weight_type = gr.Dropdown(
|
| 1054 |
-
choices=[i.name for i in WeightType],
|
| 1055 |
-
label="Weights type",
|
| 1056 |
-
multiselect=False,
|
| 1057 |
-
value="Original",
|
| 1058 |
-
interactive=True,
|
| 1059 |
-
)
|
| 1060 |
-
base_model_name_textbox = gr.Textbox(
|
| 1061 |
-
label="Base model (for delta or adapter weights)"
|
| 1062 |
-
)
|
| 1063 |
-
|
| 1064 |
-
with gr.Row():
|
| 1065 |
-
file_input = gr.File(
|
| 1066 |
-
label="Upload JSONL Results File", file_types=[".jsonl"]
|
| 1067 |
)
|
| 1068 |
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
lambda version: refresh_data_with_filters(version),
|
| 1101 |
-
inputs=[version_selector],
|
| 1102 |
-
outputs=[leaderboard]
|
| 1103 |
-
+ [
|
| 1104 |
-
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
| 1105 |
-
],
|
| 1106 |
-
)
|
| 1107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
|
| 1109 |
-
# Set up the scheduler to refresh data periodically
|
| 1110 |
scheduler = BackgroundScheduler()
|
| 1111 |
-
scheduler.add_job(
|
| 1112 |
scheduler.start()
|
| 1113 |
-
|
| 1114 |
-
# Launch the app
|
| 1115 |
-
if __name__ == "__main__":
|
| 1116 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download

 from src.about import (
     CITATION_BUTTON_LABEL,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
     ModelType,
+    fields,
     WeightType,
+    Precision
 )
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval


+def restart_space():
+    API.restart_space(repo_id=REPO_ID)

+### Space initialisation
 try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
+except Exception:
+    restart_space()


+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
             ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
             ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
     )


+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)

+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
                 with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )

+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
                     )

+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )

+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )

 scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
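The rewritten app.py keeps itself fresh by scheduling a full Space restart every 30 minutes, which re-runs the `snapshot_download` calls on startup. A minimal, self-contained sketch of that APScheduler pattern (the `restart_space` body below is a stand-in print, not the real `API.restart_space` call used in the Space):

```python
import time

from apscheduler.schedulers.background import BackgroundScheduler


def restart_space():
    # Stand-in for API.restart_space(repo_id=REPO_ID) in the actual app.
    print("restarting space...")


scheduler = BackgroundScheduler()
# Same pattern as app.py: run restart_space every 1800 seconds (30 minutes).
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# In the Space, demo.queue(...).launch() blocks here; a sleep stands in for it.
time.sleep(5)
scheduler.shutdown()
```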
data/.gitkeep
DELETED
@@ -1 +0,0 @@
-# Keep this directory in git
data/leaderboard_data.json
DELETED
@@ -1,30 +0,0 @@
-{
-  "leaderboard": [
-    {
-      "model_name": "example/model",
-      "programming_language": "Python",
-      "comment_language": "English",
-      "taxonomy_category": "Bug Detection",
-      "bleu": 0.5,
-      "llm_pass_1": 0.5,
-      "llm_pass_5": 0.5,
-      "llm_pass_10": 0.5,
-      "metrics": {
-        "readability": 5,
-        "relevance": 5,
-        "explanation_clarity": 5,
-        "problem_identification": 5,
-        "actionability": 5,
-        "completeness": 5,
-        "specificity": 5,
-        "contextual_adequacy": 5,
-        "consistency": 5,
-        "brevity": 5
-      },
-      "submission_ip": "127.0.0.1",
-      "submission_date": "2024-01-01T00:00:00Z"
-    }
-  ],
-  "last_updated": "2025-07-03T13:10:47.434623+00:00",
-  "total_entries": 1
-}
data/submissions.json
DELETED
@@ -1,5 +0,0 @@
-{
-  "submissions": [],
-  "last_updated": "2025-07-03T13:10:47.435548+00:00",
-  "total_submissions": 0
-}
example_submission.jsonl
DELETED
@@ -1,4 +0,0 @@
-{"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n    return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
-{"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
-{"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n    if (n <= 1) n\n    else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
-{"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n    var x int = 5\n    var y int = 10\n    fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
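The deleted example_submission.jsonl above documents the per-record format the old Space accepted: one JSON object per line with submission metadata, per-dimension scores, and pass@k/BLEU fields. A minimal, hypothetical reader for that format (stdlib only; the required-field set is taken from the example rows above and is an assumption, not the removed validator):

```python
import json
from pathlib import Path

# Assumed minimal schema, derived from the example rows above.
REQUIRED_FIELDS = {
    "model_name", "programming_language", "comment_language", "topic",
    "pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10",
}


def load_submissions(path: str) -> list:
    """Parse a JSONL submission file and check that each row has the expected keys."""
    rows = []
    for i, line in enumerate(Path(path).read_text(encoding="utf-8").splitlines(), start=1):
        if not line.strip():
            continue  # skip blank lines
        row = json.loads(line)
        missing = REQUIRED_FIELDS - row.keys()
        if missing:
            raise ValueError(f"line {i}: missing fields {sorted(missing)}")
        rows.append(row)
    return rows
```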
gradio_test.ipynb
DELETED
@@ -1,32 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "agent_env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
leaderboard_data.json
DELETED
@@ -1,32 +0,0 @@
-{
-  "entries": [
-    {
-      "model_name": "GPT-4-CodeReview",
-      "model_type": "LLM",
-      "mode": "Strict",
-      "review_model_type": "gpt-4",
-      "programming_language": "Python",
-      "comment_language": "en",
-      "topic": "Code Reliability",
-      "submission_date": "2024-10-06T12:00:00Z",
-      "version": "v0",
-      "readability": 8.5,
-      "relevance": 9.0,
-      "explanation_clarity": 7.8,
-      "problem_identification": 8.2,
-      "actionability": 8.7,
-      "completeness": 8.0,
-      "specificity": 7.5,
-      "contextual_adequacy": 8.3,
-      "consistency": 8.8,
-      "brevity": 7.2,
-      "pass_at_1": 0.75,
-      "pass_at_5": 0.88,
-      "pass_at_10": 0.92,
-      "bleu_at_10": 0.65,
-      "total_evaluations": 100
-    }
-  ],
-  "last_updated": "2024-10-06T12:00:00Z",
-  "version": "v0"
-}
requirements.txt
CHANGED
@@ -1,8 +1,16 @@
+APScheduler
+black
+datasets
+gradio
+gradio[oauth]
+gradio_leaderboard==0.0.13
+gradio_client
+huggingface-hub>=0.18.0
+matplotlib
+numpy
+pandas
+python-dateutil
+tqdm
+transformers
+tokenizers>=0.15.0
+sentencepiece
src/__init__.py
DELETED
@@ -1 +0,0 @@
-# CodeReview Leaderboard - Source Module
src/about.py
CHANGED
@@ -1,59 +1,72 @@
-"""
-INTRODUCTION_TEXT = """
-## Introduction
-Models are evaluated on their ability to provide high-quality code reviews that are helpful,
-accurate, and actionable across multiple programming languages and review categories.
-"""
-LLM_BENCHMARKS_TEXT = """
-CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
 """
 EVALUATION_QUEUE_TEXT = """
-##
-3. Once validated, your model will appear on the leaderboard.
-###
-- Submissions should cover multiple programming languages where applicable
-- Both Russian and English comment languages are supported
 """
-CITATION_BUTTON_LABEL = "
-CITATION_BUTTON_TEXT = """
-@misc{codereviewbench2025,
-    author = {CodeReview Bench Team},
-    title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
-    year = {2025},
-    publisher = {GitHub},
-    journal = {GitHub repository},
-    howpublished = {\\url{https://github.com/your-org/codereview-bench}}
-}
 """
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+NUM_FEWSHOT = 0  # Change with your few shot
+# ---------------------------------------------------
+
+
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+Intro text
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+
+## Reproducibility
+To reproduce our results, here is the commands you can run:
+
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## Some good practices before submitting a model
+
+### 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
 """
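The new src/about.py is the stock demo-leaderboard template: each `Tasks` member wraps a `Task(benchmark, metric, col_name)` record that downstream code uses to pick a metric out of the per-task results JSON and to label its leaderboard column. A small sketch of how such entries are typically iterated (a standalone copy of the dataclass and enum rather than an import, so it runs anywhere):

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # task key in the results json
    metric: str      # metric key in the results json
    col_name: str    # column label shown on the leaderboard


class Tasks(Enum):
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")


# Downstream code usually derives the benchmark columns from the enum like this:
benchmark_cols = [task.value.col_name for task in Tasks]
print(benchmark_cols)  # ['ANLI', 'LogiQA']
```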
src/display/__init__.py
DELETED
@@ -1 +0,0 @@
-# Display utilities module
src/display/css_html_js.py
CHANGED
@@ -1,97 +1,105 @@
-"""
-CSS and styling for the CodeReview Bench Leaderboard.
-"""
-
 custom_css = """
 .markdown-text {
     font-size: 16px !important;
-    text-align: justify !important;
-    line-height: 1.0 !important;
-    margin-top: 10px !important;
-    margin-bottom: 10px !important;
 }
-    background: #3f3f46 !important;
-    color: #f4f4f5 !important;
-#citation-button
-    font-
-    padding: 10px;
-    margin-top: 15px;
-    border-radius: 5px;
-    color: #a1a1aa !important;
-    padding: 5px;
-    border-radius: 5px;
-    top: -5px;
-.form,
-.panel {
-    /* background: #18181b !important; */ /* Removed background override */
-    border-color: #27272a80 !important; /* Made border color semi-transparent */
-    border-width: 1px !important; /* Ensure border is visible */
-    border-style: solid !important;
-}
-.gradio-file .wrap {
-    /* background: #18181b !important; */ /* Removed background override */
-    border-color: #27272a !important;
-}
-    margin-
 """
+
 custom_css = """
+
 .markdown-text {
     font-size: 16px !important;
 }

+#models-to-add-text {
+    font-size: 18px !important;
 }

+#citation-button span {
+    font-size: 16px !important;
 }

+#citation-button textarea {
+    font-size: 16px !important;
 }

+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
 }

+#leaderboard-table {
+    margin-top: 15px
 }

+#leaderboard-table-lite {
+    margin-top: 15px
 }

+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
 }
+
+#search-bar {
+    padding: 0px;
 }

+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+#leaderboard-table td:nth-child(2),
+#leaderboard-table th:nth-child(2) {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
 }

+.tab-buttons button {
+    font-size: 20px;
 }

+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
 }

+#scale-logo .download {
+    display: none;
 }
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
 }
+#filter_type label {
+    display: flex;
 }
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
 }
 """
+
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = Object.fromEntries(params);
+        return url_params;
+    }
+    """
src/display/formatting.py
CHANGED
@@ -1,71 +1,27 @@
-"""
-
-import pandas as pd
-import numpy as np
-
-def make_clickable_model(model_name: str) -> str:
-    """
-    Create a clickable link for a model name.
-    """
-    return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
-
-def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
-    """
-    Check if a row has no NaN values in the specified columns.
-    """
-    return ~df[columns].isna().any(axis=1)
-
-def format_percentage(value: float) -> str:
-    """
-    Format a value as a percentage.
-    """
-    if pd.isna(value):
-        return "N/A"
-    return f"{value * 100:.2f}%"
-
-def format_number(value: float, precision: int = 2) -> str:
-    """
-    Format a number with specified precision.
-    """
-    if pd.isna(value):
-        return "N/A"
-    return f"{value:.{precision}f}"
-
-def styled_message(message: str) -> str:
-    """
-    Format a success message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
-    ✅ {message}
-    </div>
-    """
-
-def styled_warning(message: str) -> str:
-    """
-    Format a warning message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
-    ⚠️ {message}
-    </div>
-    """
-
-def styled_error(message: str) -> str:
-    """
-    Format an error message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
-    ❌ {message}
-    </div>
-    """
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)


+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)


+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
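The new formatting helpers are plain string builders, so they are easy to exercise in isolation. A quick usage sketch (copies of the two link helpers from the file above):

```python
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


# The leaderboard stores this HTML string in the model column, so the
# rendered table shows "org/model" as a link to its Hub page.
print(make_clickable_model("org/model"))
```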
src/display/utils.py
CHANGED
@@ -1,417 +1,110 @@
-"""
-
-from enum import Enum, auto
-from typing import List, Optional
-
-    CoT = auto()  # Chain of Thought
-    Strict = auto()
-
-    def __str__(self):
-        """String representation of the mode."""
-        return self.name
-
-class ModelType(Enum):
-    """Model types for the leaderboard."""
-    Unknown = auto()
-    OpenSource = auto()
-    ClosedSource = auto()
-    API = auto()
-
-    def to_str(self, separator: str = "-") -> str:
-        """Convert enum to string with separator."""
-        if self == ModelType.Unknown:
-            return "Unknown"
-        elif self == ModelType.OpenSource:
-            return f"Open{separator}Source"
-        elif self == ModelType.ClosedSource:
-            return f"Closed{separator}Source"
-        elif self == ModelType.API:
-            return "API"
-        return "Unknown"
-
-class ReviewModelType(str, Enum):
-    """Review model types for the leaderboard."""
-    GPT_4 = "gpt-4"
-    GPT_3_5 = "gpt-3.5-turbo"
-    CLAUDE = "claude"
-    LLAMA = "llama"
-    GEMINI = "gemini"
-    CUSTOM = "custom"
-
-    def __str__(self):
-        """String representation of the review model type."""
-        return self.value
-
-class Precision(Enum):
-    """Model precision types."""
-    Unknown = auto()
-    float16 = auto()
-    bfloat16 = auto()
-    float32 = auto()
-    int8 = auto()
-    int4 = auto()
-    NA = auto()
-
-    def __str__(self):
-        """String representation of the precision type."""
-        return self.name
-
-class WeightType(Enum):
-    """Model weight types."""
-    Original = auto()
-    Delta = auto()
-    Adapter = auto()
-
-    def __str__(self):
-        """String representation of the weight type."""
-        return self.name

 @dataclass
-    """Information about a column in the leaderboard."""
     name: str
     hidden: bool = False
     never_hidden: bool = False
-    displayed_by_default: bool = True

 @dataclass
-    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="model_name", display_name="Model", never_hidden=True, displayed_by_default=True))
-    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="mode", display_name="Mode", displayed_by_default=True))
-    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="model_type", display_name="Access_Type", displayed_by_default=True))
-    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="submission_date", display_name="Submission_Date", displayed_by_default=False))
-    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="version", display_name="Version", displayed_by_default=False))
-    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="review_model_type", display_name="Type", displayed_by_default=False))
-    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="base_model", display_name="Base Model", displayed_by_default=False))
-    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="revision", display_name="Revision", displayed_by_default=False))
-    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="precision", display_name="Precision", displayed_by_default=False))
-    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="weight_type", display_name="Weight Type", displayed_by_default=False))
-    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="topic", display_name="Topic", displayed_by_default=True))
-
-    # LLM-based multimetric scores
-    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="readability", display_name="Readability", type="number", displayed_by_default=True))
-    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="relevance", display_name="Relevance", type="number", displayed_by_default=True))
-    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="explanation_clarity", display_name="Explanation_Clarity", type="number", displayed_by_default=True))
-    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="problem_identification", display_name="Problem_Identification", type="number", displayed_by_default=True))
-    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="actionability", display_name="Actionability", type="number", displayed_by_default=True))
-    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="completeness", display_name="Completeness", type="number", displayed_by_default=True))
-    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="specificity", display_name="Specificity", type="number", displayed_by_default=True))
-    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="contextual_adequacy", display_name="Contextual_Adequacy", type="number", displayed_by_default=True))
-    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="consistency", display_name="Consistency", type="number", displayed_by_default=True))
-    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="brevity", display_name="Brevity", type="number", displayed_by_default=True))
-
-    # LLM-based-exact-match metrics
-    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_1", display_name="Pass@1", type="number", displayed_by_default=True))
-    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_5", display_name="Pass@5", type="number", displayed_by_default=True))
-    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_10", display_name="Pass@10", type="number", displayed_by_default=True))
-    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="bleu_at_10", display_name="BLEU@10", type="number", displayed_by_default=True))
-
-    # Overall aggregated metrics
-    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="overall_score", display_name="Overall_Score", type="number", displayed_by_default=True))
-    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="multimetric_average", display_name="Multimetric_Average", type="number", displayed_by_default=True))
-    exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="exact_match_average", display_name="Exact_Match_Average", type="number", displayed_by_default=True))
-    total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="total_evaluations", display_name="Total_Evaluations", type="number", displayed_by_default=True))
-
-    # Language-specific metrics (Russian)
-    ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_readability", display_name="RU_Readability", type="number", displayed_by_default=False))
-    ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_relevance", display_name="RU_Relevance", type="number", displayed_by_default=False))
-    ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_overall_score", display_name="RU_Overall_Score", type="number", displayed_by_default=False))
-
-    # Language-specific metrics (English)
-    en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_readability", display_name="EN_Readability", type="number", displayed_by_default=False))
-    en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_relevance", display_name="EN_Relevance", type="number", displayed_by_default=False))
-    en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_overall_score", display_name="EN_Overall_Score", type="number", displayed_by_default=False))
-
-# Create instances for easy access
-CODEREVIEW_COLUMN = CodeReviewBenchColumn()
-
-# Extract column lists for different views
-COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
-DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-                if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
-
-# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
-def reorder_display_cols():
-    cols = DISPLAY_COLS
-    if 'model_name' in cols and 'mode' in cols:
-        cols.remove('mode')
-        model_name_index = cols.index('model_name')
-        cols.insert(model_name_index + 1, 'mode')
-    return cols
-DISPLAY_COLS = reorder_display_cols()
-
-METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-               if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
-HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-               if getattr(CODEREVIEW_COLUMN, f.name).hidden]
-NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-                     if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
-
-# Categories for CodeReview Bench (Programming Languages)
-CATEGORIES = ['Python', 'Java', 'Scala', 'Go']
-
-# Language taxonomies for CodeReview Bench
-COMMENT_LANGUAGES = ['ru', 'en']  # Russian, English
-
-# Topics for CodeReview Bench
-TOPICS = ['Code Reliability', 'Coding Standards', 'Code Organization', 'Performance Issues', 'Validation', 'Variables']
-
-# Example categories
-EXAMPLE_CATEGORIES = ['Bug_Fix', 'Code_Style', 'Performance', 'Security', 'Refactoring', 'Documentation', 'Testing', 'Architecture', 'Other']
-
-# Metrics for CodeReview Bench
-MULTIMETRIC_METRICS = ["readability", "relevance", "explanation_clarity", "problem_identification", "actionability", "completeness", "specificity", "contextual_adequacy", "consistency", "brevity"]
-
-        # Create a tuple with both the internal name and display name
-        if column_info.name not in default_visible_columns:
-            column_choices.append((column_info.name, column_info.display_name))
-
-    """
-    Get the list of column names that should be visible by default.
-
-    Returns:
-        List of column names that are displayed by default.
-    """
-    return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-            if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
+from dataclasses import dataclass, make_dataclass
+from enum import Enum

+import pandas as pd

+from src.about import Tasks

+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


+# These classes are for user facing column names,
+# to avoid having to change them all around the code
+# when a modif is needed
 @dataclass
+class ColumnContent:
     name: str
+    type: str
+    displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False

+## Leaderboard columns
+
auto_eval_column_dict = []
|
| 25 |
+
# Init
|
| 26 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
+
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 28 |
+
#Scores
|
| 29 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
+
for task in Tasks:
|
| 31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 32 |
+
# Model information
|
| 33 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 35 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
| 36 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
| 37 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
| 38 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
| 39 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
| 40 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
| 41 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
| 42 |
+
|
| 43 |
+
# We use make dataclass to dynamically fill the scores from Tasks
|
| 44 |
+
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
| 45 |
+
|
| 46 |
+
## For the queue columns in the submission tab
|
| 47 |
+
@dataclass(frozen=True)
|
| 48 |
+
class EvalQueueColumn: # Queue column
|
| 49 |
+
model = ColumnContent("model", "markdown", True)
|
| 50 |
+
revision = ColumnContent("revision", "str", True)
|
| 51 |
+
private = ColumnContent("private", "bool", True)
|
| 52 |
+
precision = ColumnContent("precision", "str", True)
|
| 53 |
+
weight_type = ColumnContent("weight_type", "str", "Original")
|
| 54 |
+
status = ColumnContent("status", "str", True)
|
| 55 |
+
|
| 56 |
+
## All the model information that we might need
|
| 57 |
@dataclass
|
| 58 |
+
class ModelDetails:
|
| 59 |
+
name: str
|
| 60 |
+
display_name: str = ""
|
| 61 |
+
symbol: str = "" # emoji
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
class ModelType(Enum):
|
| 65 |
+
PT = ModelDetails(name="pretrained", symbol="🟢")
|
| 66 |
+
FT = ModelDetails(name="fine-tuned", symbol="🔶")
|
| 67 |
+
IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
|
| 68 |
+
RL = ModelDetails(name="RL-tuned", symbol="🟦")
|
| 69 |
+
Unknown = ModelDetails(name="", symbol="?")
|
| 70 |
+
|
| 71 |
+
def to_str(self, separator=" "):
|
| 72 |
+
return f"{self.value.symbol}{separator}{self.value.name}"
|
| 73 |
+
|
| 74 |
+
@staticmethod
|
| 75 |
+
def from_str(type):
|
| 76 |
+
if "fine-tuned" in type or "🔶" in type:
|
| 77 |
+
return ModelType.FT
|
| 78 |
+
if "pretrained" in type or "🟢" in type:
|
| 79 |
+
return ModelType.PT
|
| 80 |
+
if "RL-tuned" in type or "🟦" in type:
|
| 81 |
+
return ModelType.RL
|
| 82 |
+
if "instruction-tuned" in type or "⭕" in type:
|
| 83 |
+
return ModelType.IFT
|
| 84 |
+
return ModelType.Unknown
|
| 85 |
|
| 86 |
+
class WeightType(Enum):
|
| 87 |
+
Adapter = ModelDetails("Adapter")
|
| 88 |
+
Original = ModelDetails("Original")
|
| 89 |
+
Delta = ModelDetails("Delta")
|
| 90 |
|
| 91 |
+
class Precision(Enum):
|
| 92 |
+
float16 = ModelDetails("float16")
|
| 93 |
+
bfloat16 = ModelDetails("bfloat16")
|
| 94 |
+
Unknown = ModelDetails("?")
|
| 95 |
|
| 96 |
+
def from_str(precision):
|
| 97 |
+
if precision in ["torch.float16", "float16"]:
|
| 98 |
+
return Precision.float16
|
| 99 |
+
if precision in ["torch.bfloat16", "bfloat16"]:
|
| 100 |
+
return Precision.bfloat16
|
| 101 |
+
return Precision.Unknown
|
| 102 |
|
| 103 |
+
# Column selection
|
| 104 |
+
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 107 |
+
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 108 |
|
| 109 |
+
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
|
|
|
|
|
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
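The new `AutoEvalColumn` class above is generated at import time with `make_dataclass`, so there is no literal class body to read. Below is a minimal, self-contained sketch of that pattern with two hypothetical columns (the real field list comes from `auto_eval_column_dict` and the `Tasks` enum in src/about.py); it declares `ColumnContent` as frozen so the instances used as defaults stay hashable on newer Python versions.

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Hypothetical column specs; the app appends one such entry per Task.
column_specs = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]

# Each 3-element item is (attribute name, annotation, default instance).
AutoEvalColumn = make_dataclass("AutoEvalColumn", column_specs, frozen=True)

# Column metadata is read straight off the class, not an instance:
print(AutoEvalColumn.model.name)    # -> "Model"
print(AutoEvalColumn.average.type)  # -> "number"

Downstream modules rely on exactly this kind of access, e.g. `AutoEvalColumn.model.name` is used as the dataframe column header in read_evals.py and populate.py.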
src/envs.py
CHANGED
@@ -1,27 +1,25 @@
 import os
 from huggingface_hub import HfApi
-from dotenv import load_dotenv
-
-#
-
-#
-
-OWNER = os.environ.get("OWNER", "codereview-bench")  # Change to your org
-SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
-ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
-ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
-
-#
-CACHE_PATH
-DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
-
-# Local
-
-# HF API instance
+
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# ----------------------------------
+
+REPO_ID = f"{OWNER}/leaderboard"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
+
+# If you setup a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+
+# Local caches
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+
 API = HfApi(token=TOKEN)
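The new envs.py only points at `{OWNER}/requests` and `{OWNER}/results`; those dataset repos have to exist before the Space can sync its queue and results. A possible one-off setup step, sketched here as an assumption about the workflow rather than code from this PR, uses the same `huggingface_hub` API:

# One-off setup sketch (assumed workflow): create the dataset repos that
# QUEUE_REPO and RESULTS_REPO point to. The token placeholder is hypothetical.
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # a write token for the OWNER org
for repo in ("demo-leaderboard-backend/requests", "demo-leaderboard-backend/results"):
    api.create_repo(repo_id=repo, repo_type="dataset", private=True, exist_ok=True)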
src/leaderboard/__init__.py
DELETED
@@ -1 +0,0 @@
-# Leaderboard processing module
src/leaderboard/processor.py
DELETED
@@ -1,271 +0,0 @@
-"""
-Process CodeReview Bench leaderboard data and submissions.
-"""
-
-import json
-import os
-import pandas as pd
-from datetime import datetime
-from typing import Dict, List, Tuple, Optional
-import numpy as np
-
-from src.display.utils import (
-    CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
-    MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
-)
-
-
-def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
-    """
-    Process a JSONL submission file for CodeReview Bench.
-
-    Args:
-        file_path: Path to the JSONL submission file
-
-    Returns:
-        Tuple of (entries_list, message)
-    """
-    try:
-        entries = []
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    entry = json.loads(line)
-
-                    # Validate required fields
-                    required_fields = ['model_name', 'programming_language', 'comment_language']
-                    missing_fields = [field for field in required_fields if field not in entry]
-                    if missing_fields:
-                        return [], f"Missing required fields {missing_fields} in line {line_num}"
-
-                    # Validate metrics exist
-                    has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
-                    has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
-
-                    if not has_multimetric and not has_exact_match:
-                        return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
-
-                    entries.append(entry)
-
-                except json.JSONDecodeError as e:
-                    return [], f"Invalid JSON in line {line_num}: {e}"
-
-        if not entries:
-            return [], "No valid entries found in submission file"
-
-        return entries, f"Successfully processed {len(entries)} entries"
-
-    except Exception as e:
-        return [], f"Error processing submission: {e}"
-
-
-def calculate_overall_score(entry: Dict) -> float:
-    """
-    Calculate overall score for a CodeReview Bench entry.
-
-    Args:
-        entry: Dictionary containing model evaluation results
-
-    Returns:
-        Overall score as float
-    """
-    # Calculate multimetric average
-    multimetric_scores = []
-    for metric in MULTIMETRIC_METRICS:
-        if metric in entry and isinstance(entry[metric], (int, float)):
-            multimetric_scores.append(entry[metric])
-
-    multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
-
-    # Calculate exact match average
-    exact_match_scores = []
-    for metric in EXACT_MATCH_METRICS:
-        if metric in entry and isinstance(entry[metric], (int, float)):
-            exact_match_scores.append(entry[metric])
-
-    exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
-
-    # Weighted combination (can be adjusted based on requirements)
-    overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
-
-    return overall_score
-
-
-def load_leaderboard_data(file_path: str) -> Dict:
-    """
-    Load the leaderboard data from a JSON file.
-    """
-    if not os.path.exists(file_path):
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
-
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-
-    # Ensure version field exists
-    if "version" not in data:
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        data["version"] = version
-
-    return data
-
-
-def save_leaderboard_data(data: Dict, file_path: str) -> None:
-    """
-    Save the leaderboard data to a JSON file.
-    """
-    # Ensure the directory exists
-    os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
-    # Update the last_updated timestamp
-    data["last_updated"] = datetime.now().isoformat()
-
-    # Ensure version is set
-    if "version" not in data:
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        data["version"] = version
-
-    with open(file_path, 'w') as f:
-        json.dump(data, f, indent=2)
-
-
-def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
-    """
-    Convert leaderboard data to a pandas DataFrame for display.
-    """
-    rows = []
-
-    for entry in leaderboard_data.get("entries", []):
-        model_name = entry.get("model_name", "Unknown Model")
-
-        # Extract basic metadata
-        row = {
-            "model_name": model_name,
-            "model_type": entry.get("model_type", "Unknown"),
-            "mode": entry.get("mode", "Strict"),
-            "submission_date": entry.get("submission_date", ""),
-            "version": entry.get("version", "v0"),
-            "review_model_type": entry.get("review_model_type", "custom").lower()
-        }
-
-        # Add additional metadata fields if present
-        for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
-            if key in entry:
-                row[key] = entry[key]
-
-        # Add multimetric scores
-        for metric in MULTIMETRIC_METRICS:
-            if metric in entry:
-                row[metric] = entry[metric]
-            else:
-                row[metric] = pd.NA
-
-        # Add exact match metrics
-        for metric in EXACT_MATCH_METRICS:
-            if metric in entry:
-                row[metric] = entry[metric]
-            else:
-                row[metric] = pd.NA
-
-        # Calculate aggregated metrics
-        multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
-        exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
-
-        if multimetric_scores:
-            row["multimetric_average"] = np.mean(multimetric_scores)
-        else:
-            row["multimetric_average"] = pd.NA
-
-        if exact_match_scores:
-            row["exact_match_average"] = np.mean(exact_match_scores)
-        else:
-            row["exact_match_average"] = pd.NA
-
-        # Calculate overall score
-        row["overall_score"] = calculate_overall_score(entry)
-
-        # Add language-specific metrics if available
-        for lang in COMMENT_LANGUAGES:
-            for metric in ["readability", "relevance", "overall_score"]:
-                lang_key = f"{lang}_{metric}"
-                if lang_key in entry:
-                    row[lang_key] = entry[lang_key]
-                else:
-                    row[lang_key] = pd.NA
-
-        # Add evaluation count
-        row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
-
-        rows.append(row)
-
-    # Create DataFrame and sort by overall score
-    df = pd.DataFrame(rows)
-
-    # Ensure all expected columns exist
-    for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
-        if metric not in df.columns:
-            df[metric] = pd.NA
-
-    # Sort by overall score (descending)
-    if not df.empty:
-        df = df.sort_values(by="overall_score", ascending=False, na_position='last')
-
-    # Ensure summary columns exist
-    summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
-    for col in summary_cols:
-        if col not in df.columns:
-            df[col] = pd.NA
-
-    return df
-
-
-def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
-    """
-    Add new entries to the leaderboard, replacing any with the same model name.
-    """
-    # Create a mapping of existing entries by model name and version
-    existing_entries = {
-        (entry["model_name"], entry.get("version", "v0")): i
-        for i, entry in enumerate(leaderboard_data.get("entries", []))
-    }
-
-    # Process each new entry
-    for new_entry in new_entries:
-        model_name = new_entry.get("model_name")
-        version = new_entry.get("version", "v0")
-
-        # Add calculated metrics
-        new_entry["overall_score"] = calculate_overall_score(new_entry)
-
-        # Calculate averages
-        multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
-        exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
-
-        if multimetric_scores:
-            new_entry["multimetric_average"] = np.mean(multimetric_scores)
-        if exact_match_scores:
-            new_entry["exact_match_average"] = np.mean(exact_match_scores)
-
-        if (model_name, version) in existing_entries:
-            # Replace existing entry
-            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
-        else:
-            # Add new entry
-            if "entries" not in leaderboard_data:
-                leaderboard_data["entries"] = []
-            leaderboard_data["entries"].append(new_entry)
-
-    # Update the last_updated timestamp
-    leaderboard_data["last_updated"] = datetime.now().isoformat()
-
-    return leaderboard_data
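For reference, the deleted `calculate_overall_score` combined the two metric families with a fixed 70/30 weighting. A small worked example with made-up scores:

# Worked example of the weighting used by calculate_overall_score above,
# with made-up scores purely for illustration.
import numpy as np

multimetric_scores = [0.9, 0.8, 0.7]   # readability, relevance, ...
exact_match_scores = [0.6, 0.5]        # exact-match style metrics

multimetric_avg = np.mean(multimetric_scores)  # 0.8
exact_match_avg = np.mean(exact_match_scores)  # 0.55
overall = multimetric_avg * 0.7 + exact_match_avg * 0.3
print(round(overall, 3))  # 0.725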
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,196 @@
+import glob
+import json
+import math
+import os
+from dataclasses import dataclass
+
+import dateutil
+import numpy as np
+
+from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.submission.check_validity import is_model_on_hub
+
+
+@dataclass
+class EvalResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
+    """
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    revision: str  # commit hash, "" if main
+    results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+    date: str = ""  # submission date of request file
+    still_on_hub: bool = False
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
+
+        # Get model and org
+        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = org_and_model.split("/", 1)
+
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
+
+        still_on_hub, _, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Tasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            precision=precision,
+            revision= config.get("model_sha", ""),
+            still_on_hub=still_on_hub,
+            architecture=architecture
+        )
+
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.license = request.get("license", "?")
+            self.likes = request.get("likes", 0)
+            self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
+        except Exception:
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+        }
+
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        return data_dict
+
+
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
+
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            if (
+                req_content["status"] in ["FINISHED"]
+                and req_content["precision"] == precision.split(".")[-1]
+            ):
+                request_file = tmp_request_file
+    return request_file
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
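`EvalResult.init_from_json_file` above only needs a `config` block and a `results` block keyed by benchmark name. A hypothetical minimal result file that the parser would accept is sketched below; the concrete benchmark and metric names depend on `Tasks` in src/about.py, so the ones used here are placeholders.

# Hypothetical minimal results file for one run; keys mirror what
# EvalResult.init_from_json_file reads (benchmark/metric names are examples).
import json

result = {
    "config": {
        "model_name": "my-org/my-model",   # becomes org/model and the Model column
        "model_dtype": "torch.float16",    # parsed by Precision.from_str
        "model_sha": "abc123",             # revision shown on the leaderboard
    },
    "results": {
        "example_benchmark": {"acc": 0.731},  # task.benchmark -> {task.metric: score}
    },
}

with open("results_2024-01-01T00-00-00.json", "w") as fp:
    json.dump(result, fp, indent=2)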
src/populate.py
CHANGED
@@ -1,188 +1,58 @@
-"""
-Populate the CodeReview Bench leaderboard from HuggingFace datasets.
-"""
-
 import json
 import os
-import pandas as pd
-import tempfile
-from typing import Dict, List, Optional
-from datetime import datetime
-import numpy as np
-
-from huggingface_hub import hf_hub_download, HfApi
-from datasets import load_dataset
-
-from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
-from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
-from src.leaderboard.processor import leaderboard_to_dataframe
-
-
-def get_latest_leaderboard(version="v0") -> Optional[Dict]:
-    """
-    Get the latest leaderboard data from HuggingFace dataset.
-    Fallback to local JSON file if HF download fails or is unavailable.
-    """
-    # First try to fetch from HuggingFace Hub
-    try:
-        leaderboard_path = hf_hub_download(
-            repo_id=RESULTS_DATASET_ID,
-            filename=f"leaderboards/leaderboard_{version}.json",
-            repo_type="dataset",
-            token=TOKEN
-        )
-        with open(leaderboard_path, 'r') as f:
-            return json.load(f)
-    except Exception as hf_err:
-        print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
-
-    # Fallback: attempt to load a local leaderboard_data.json located at the project root
-    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    local_path_candidates = [
-        os.path.join(project_root, "leaderboard_data.json"),  # legacy path in root
-        os.path.join(project_root, "data", "leaderboard.json"),  # path defined in envs.py
-    ]
-
-    for local_path in local_path_candidates:
-        if os.path.exists(local_path):
-            try:
-                with open(local_path, 'r') as f:
-                    return json.load(f)
-            except Exception as local_err:
-                print(f"Error loading local leaderboard file {local_path}: {local_err}")
-
-    # If nothing found, return None
-    return None
-
-
-def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
-    """
-    Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
-    """
-    try:
-        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
-        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
-        entry_path = hf_hub_download(
-            repo_id=RESULTS_DATASET_ID,
-            filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
-            repo_type="dataset",
-            token=TOKEN
-        )
-        with open(entry_path, 'r') as f:
-            return json.load(f)
-    except Exception as e:
-        print(f"Error downloading model entry: {e}")
-        return None
-
-
-def get_all_entries(version="v0") -> List[Dict]:
-    """
-    Get all entries from the HuggingFace dataset.
-    """
-    try:
-        api = HfApi(token=TOKEN)
-        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
-        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
-
-        all_entries = []
-        for entry_file in entry_files:
-            try:
-                entry_path = hf_hub_download(
-                    repo_id=RESULTS_DATASET_ID,
-                    filename=entry_file,
-                    repo_type="dataset",
-                    token=TOKEN
-                )
-                with open(entry_path, 'r') as f:
-                    entry_data = json.load(f)
-                    all_entries.append(entry_data)
-            except Exception as e:
-                print(f"Error loading entry {entry_file}: {e}")
-
-        return all_entries
-    except Exception as e:
-        print(f"Error getting all entries: {e}")
-        return []
-
-
-def get_leaderboard_df(version="v0") -> pd.DataFrame:
-    """
-    Get the leaderboard data as a DataFrame.
-    """
-    # Get latest leaderboard data
-    leaderboard_data = get_latest_leaderboard(version)
-
-    if not leaderboard_data:
-        # If no leaderboard exists, try to build it from entries
-        entries = get_all_entries(version)
-        if entries:
-            leaderboard_data = {
-                "entries": entries,
-                "last_updated": datetime.now().isoformat(),
-                "version": version
-            }
-        else:
-            # Return empty DataFrame if no data available
-            return pd.DataFrame(columns=DISPLAY_COLS)
-
-    # Convert to DataFrame
-    return leaderboard_to_dataframe(leaderboard_data)
-
-
-def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
-    """
-    Get the leaderboard data filtered by a specific programming language category.
-    """
-    # Get latest leaderboard data
-    leaderboard_data = get_latest_leaderboard(version)
-
-    if not leaderboard_data:
-        # If no leaderboard exists, try to build it from entries
-        entries = get_all_entries(version)
-        if entries:
-            leaderboard_data = {
-                "entries": entries,
-                "last_updated": datetime.now().isoformat(),
-                "version": version
-            }
-        else:
-            # Return empty DataFrame if no data available
-            return pd.DataFrame(columns=DISPLAY_COLS)
-
-    # Filter entries to only include those with data for the specified programming language
-    filtered_entries = []
-    for entry in leaderboard_data.get("entries", []):
-        # Check if entry has data for this programming language
-        programming_language = entry.get("programming_language", "").lower()
-        if programming_language == category.lower() or category.lower() == "other":
-            # For "other" category, include entries that don't match any specific language
-            if category.lower() == "other":
-                if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]:  # Exclude "Other" from check
-                    filtered_entries.append(entry)
-            else:
-                filtered_entries.append(entry)
-
-    # Create a new leaderboard data structure with the filtered entries
-    filtered_leaderboard = {
-        "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
-        "version": version
-    }
-
-    # Convert to DataFrame
-    return leaderboard_to_dataframe(filtered_leaderboard)
+
+import pandas as pd
+
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.leaderboard.read_evals import get_raw_eval_results
+
+
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+    """Creates the different dataframes for the evaluation queues requestes"""
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+    return df_finished[cols], df_running[cols], df_pending[cols]
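A sketch of how these two helpers are typically wired up from app.py in this kind of leaderboard template, using the paths from src/envs.py and the column lists from src/display/utils.py; the exact call sites in this Space's app.py may differ slightly.

# Typical call sites for the two helpers above (assumed wiring, not part of this diff).
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_df, running_df, pending_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)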
src/submission/__init__.py
DELETED
@@ -1 +0,0 @@
-# Submission handling module
src/submission/check_validity.py
ADDED
@@ -0,0 +1,99 @@
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
+from huggingface_hub import ModelCard
+from huggingface_hub.hf_api import ModelInfo
+from transformers import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+
+def check_model_card(repo_id: str) -> tuple[bool, str]:
+    """Checks if the model card and license exist and have been filled"""
+    try:
+        card = ModelCard.load(repo_id)
+    except huggingface_hub.utils.EntryNotFoundError:
+        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
+
+    # Enforce license metadata
+    if card.data.license is None:
+        if not ("license_name" in card.data and "license_link" in card.data):
+            return False, (
+                "License not found. Please add a license to your model card using the `license` metadata or a"
+                " `license_name`/`license_link` pair."
+            )
+
+    # Enforce card content
+    if len(card.text) < 200:
+        return False, "Please add a description to your model card, it is too short."
+
+    return True, ""
+
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+    try:
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            try:
+                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
+                return (
+                    False,
+                    f"uses a tokenizer which is not in a transformers release: {e}",
+                    None
+                )
+            except Exception as e:
+                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
+        return True, None, config
+
+    except ValueError:
+        return (
+            False,
+            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
+        )
+
+    except Exception as e:
+        return False, "was not found on hub!", None
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+def get_model_arch(model_info: ModelInfo):
+    """Gets the model architecture from the configuration"""
+    return model_info.config.get("architectures", "Unknown")
+
+def already_submitted_models(requested_models_dir: str) -> set[str]:
+    """Gather a list of already submitted models to avoid duplicates"""
+    depth = 1
+    file_names = []
+    users_to_submission_dates = defaultdict(list)
+
+    for root, _, files in os.walk(requested_models_dir):
+        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
+        if current_depth == depth:
+            for file in files:
+                if not file.endswith(".json"):
+                    continue
+                with open(os.path.join(root, file), "r") as f:
+                    info = json.load(f)
+                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+
+                    # Select organisation
+                    if info["model"].count("/") == 0 or "submitted_time" not in info:
+                        continue
+                    organisation, _ = info["model"].split("/")
+                    users_to_submission_dates[organisation].append(info["submitted_time"])
+
+    return set(file_names), users_to_submission_dates
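A quick sanity-check sketch for the validators above; it assumes network access and uses an arbitrary public model id purely as an example.

# Example-only smoke test of the two validators (model id is an arbitrary example).
from src.submission.check_validity import check_model_card, is_model_on_hub

ok, error, config = is_model_on_hub("bert-base-uncased", revision="main", test_tokenizer=True)
print(ok, error, getattr(config, "architectures", None))

card_ok, card_msg = check_model_card("bert-base-uncased")
print(card_ok, card_msg)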
src/submission/submit.py
CHANGED
|
@@ -1,184 +1,119 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Handle submissions to the CodeReview Bench leaderboard.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
import json
|
| 6 |
import os
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
from
|
| 10 |
-
|
| 11 |
-
from
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
"""
|
| 21 |
-
Validate a submission file.
|
| 22 |
-
"""
|
| 23 |
-
try:
|
| 24 |
-
entries, message = process_jsonl_submission(file_path)
|
| 25 |
-
if not entries:
|
| 26 |
-
return False, message
|
| 27 |
-
return True, "Submission is valid"
|
| 28 |
-
except Exception as e:
|
| 29 |
-
return False, f"Error validating submission: {e}"
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
|
| 33 |
-
"""
|
| 34 |
-
Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
|
| 35 |
-
"""
|
| 36 |
-
try:
|
| 37 |
-
# Create safe model name for file path
|
| 38 |
-
model_name_safe = model_name.replace("/", "_").replace(" ", "_")
|
| 39 |
-
mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
|
| 40 |
-
|
| 41 |
-
# Create entry path in entries folder
|
| 42 |
-
entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
|
| 43 |
-
|
| 44 |
-
# Save entry to temporary file
|
| 45 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
|
| 46 |
-
json.dump(entry, temp_file, indent=2)
|
| 47 |
-
temp_path = temp_file.name
|
| 48 |
-
|
| 49 |
-
# Upload file
|
| 50 |
-
api = HfApi(token=TOKEN)
|
| 51 |
-
api.upload_file(
|
| 52 |
-
path_or_fileobj=temp_path,
|
| 53 |
-
path_in_repo=entry_path,
|
| 54 |
-
repo_id=RESULTS_DATASET_ID,
|
| 55 |
-
repo_type="dataset",
|
| 56 |
-
commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
|
| 57 |
-
)
|
| 58 |
-
|
| 59 |
-
os.unlink(temp_path)
|
| 60 |
-
return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
|
| 61 |
-
except Exception as e:
|
| 62 |
-
return False, f"Error submitting entry to dataset: {e}"
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
|
| 66 |
-
"""
|
| 67 |
-
Submit updated leaderboard to the HuggingFace dataset.
|
| 68 |
-
"""
|
| 69 |
-
try:
|
| 70 |
-
# Create leaderboard data
|
| 71 |
-
leaderboard_data = {
|
| 72 |
-
"entries": entries,
|
| 73 |
-
"last_updated": datetime.now().isoformat(),
|
| 74 |
-
"version": version
|
| 75 |
-
}
|
| 76 |
-
|
| 77 |
-
# Save to temporary file
|
| 78 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
|
| 79 |
-
json.dump(leaderboard_data, temp_file, indent=2)
|
| 80 |
-
temp_path = temp_file.name
|
| 81 |
-
|
| 82 |
-
# Upload file
|
| 83 |
-
api = HfApi(token=TOKEN)
|
| 84 |
-
api.upload_file(
|
| 85 |
-
path_or_fileobj=temp_path,
|
| 86 |
-
path_in_repo=f"leaderboards/leaderboard_{version}.json",
|
| 87 |
-
repo_id=RESULTS_DATASET_ID,
|
| 88 |
-
repo_type="dataset",
|
| 89 |
-
commit_message=f"Update leaderboard for version {version}"
|
| 90 |
-
)
|
| 91 |
-
|
| 92 |
-
os.unlink(temp_path)
|
| 93 |
-
return True, "Leaderboard updated successfully"
|
| 94 |
-
except Exception as e:
|
| 95 |
-
return False, f"Error updating leaderboard: {e}"
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
|
| 99 |
-
"""
|
| 100 |
-
Process a submission to the CodeReview Bench leaderboard.
|
| 101 |
-
"""
|
| 102 |
try:
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
for entry_file in entry_files:
|
| 157 |
-
try:
|
| 158 |
-
entry_path = api.hf_hub_download(
|
| 159 |
-
repo_id=RESULTS_DATASET_ID,
|
| 160 |
-
filename=entry_file,
|
| 161 |
-
repo_type="dataset",
|
| 162 |
-
)
|
| 163 |
-
with open(entry_path, 'r') as f:
|
| 164 |
-
entry_data = json.load(f)
|
| 165 |
-
all_entries.append(entry_data)
|
| 166 |
-
except Exception as e:
|
| 167 |
-
print(f"Error loading entry {entry_file}: {e}")
|
| 168 |
-
|
| 169 |
-
# Update leaderboard with all entries
|
| 170 |
-
success, message = submit_leaderboard_to_hub(all_entries, version)
|
| 171 |
-
if not success:
|
| 172 |
-
return styled_error(message)
|
| 173 |
-
|
| 174 |
-
return styled_message("Submission successful! Model evaluated and leaderboard updated.")
|
| 175 |
-
|
| 176 |
-
except Exception as e:
|
| 177 |
-
return styled_error(f"Error processing submission: {e}")
|
| 178 |
-
finally:
|
| 179 |
-
# Clean up temporary files if they exist
|
| 180 |
-
try:
|
| 181 |
-
if os.path.exists(file_path):
|
| 182 |
-
os.remove(file_path)
|
| 183 |
-
except:
|
| 184 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+import json
+import os
+from datetime import datetime, timezone
+
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.submission.check_validity import (
+    already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+)
+
+REQUESTED_MODELS = None
+USERS_TO_SUBMISSION_DATES = None
+
+def add_new_eval(
+    model: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    weight_type: str,
+    model_type: str,
+):
+    global REQUESTED_MODELS
+    global USERS_TO_SUBMISSION_DATES
+    if not REQUESTED_MODELS:
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    precision = precision.split(" ")[0]
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    if model_type is None or model_type == "":
+        return styled_error("Please select a model type.")
+
+    # Does the model actually exist?
+    if revision == "":
+        revision = "main"
+
+    # Is the model on the hub?
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+    if not weight_type == "Adapter":
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not model_on_hub:
+            return styled_error(f'Model "{model}" {error}')
+
+    # Is the model info correctly filled?
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        return styled_error("Could not get your model information. Please fill it up properly.")
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+
+    # Were the model card and license filled?
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        return styled_error("Please select a license for your model")
+
+    modelcard_OK, error_msg = check_model_card(model)
+    if not modelcard_OK:
+        return styled_error(error_msg)
+
+    # Seems good, creating the eval
+    print("Adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "base_model": base_model,
+        "revision": revision,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": "PENDING",
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
+        "private": False,
+    }
+
+    # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
+
+    print("Creating eval file")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    print("Uploading eval file")
+    API.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
+    )
+
+    # Remove the local file
+    os.remove(out_path)
+
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+    )
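
To make the new flow concrete, the sketch below shows roughly what eval_entry looks like for a hypothetical submission; every value is illustrative, not taken from a real request. The file is written locally under EVAL_REQUESTS_PATH/<user>/ and then pushed to QUEUE_REPO; note that path_in_repo=out_path.split("eval-queue/")[1] assumes EVAL_REQUESTS_PATH contains an "eval-queue/" segment, as in the stock leaderboard template.

    # Hypothetical request for "example-org/my-model" (float16, original weights).
    # Written to: <EVAL_REQUESTS_PATH>/example-org/my-model_eval_request_False_float16_Original.json
    eval_entry = {
        "model": "example-org/my-model",
        "base_model": "",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "status": "PENDING",
        "submitted_time": "2025-01-01T00:00:00Z",
        "model_type": "pretrained",   # illustrative; the UI passes its own label
        "likes": 0,
        "params": 7.24,
        "license": "apache-2.0",
        "private": False,
    }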
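
How add_new_eval is invoked lives in app.py, which this section does not show. The wiring below is a minimal, hypothetical Gradio sketch of how a submission form could call it; the component names, choices, and layout are assumptions, not this Space's actual UI.

    import gradio as gr

    from src.submission.submit import add_new_eval

    # Hypothetical submission form; input order matches add_new_eval's signature.
    with gr.Blocks() as demo:
        model_name = gr.Textbox(label="Model name (org/model)")
        base_model = gr.Textbox(label="Base model (only for Delta/Adapter weights)")
        revision = gr.Textbox(label="Revision", value="main")
        precision = gr.Dropdown(["float16", "bfloat16", "8bit", "4bit"], label="Precision", value="float16")
        weight_type = gr.Dropdown(["Original", "Delta", "Adapter"], label="Weight type", value="Original")
        model_type = gr.Dropdown(["pretrained", "fine-tuned"], label="Model type")
        submit_button = gr.Button("Submit for evaluation")
        result = gr.Markdown()

        # add_new_eval returns styled markup, so a Markdown component can display it.
        submit_button.click(
            add_new_eval,
            inputs=[model_name, base_model, revision, precision, weight_type, model_type],
            outputs=result,
        )

    demo.launch()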