Terry Zhuo committed · Commit 7a7f67a · Parent(s): f614612

big update

Files changed:
- app.py +507 -282
- requirements.txt +21 -17
- src/{text_content.py → display/about.py} +10 -1
- src/{css_html.py → display/css_html_js.py} +72 -36
- src/display/formatting.py +37 -0
- src/display/utils.py +142 -0
- src/envs.py +39 -0
- src/populate.py +50 -0
- src/{utils.py → tools/plots.py} +1 -83
- src/voting/vote_system.py +150 -0
app.py
CHANGED
@@ -1,296 +1,521 @@

Removed (old file; lines that the rendered diff left blank are summarized in brackets):

-# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
-import json
 import os
 import gradio as gr
-import …
-from huggingface_hub import …
-from …
     AutoEvalColumn,
     fields,
-    make_clickable_names,
-    plot_elo_mle,
-    plot_solve_rate,
-    styled_error,
-    styled_message,
 )
-from …
[old lines 24–123: the remaining imports and the search/filter helper functions (search_table, filter_types, filter_direct_complete, select_columns) are not recoverable from the rendered diff]
-    if query == "chat template":
-        return df[~df["direct_complete"]][leaderboard_table.columns]
     else:
[old lines 128–134 not recoverable from the rendered diff]
-    df = make_clickable_names(df)
[old lines 138–141 not recoverable from the rendered diff]
-        """<div style="text-align: center;"><h1> 🌸<span style='color: #A74E95;'>Big</span><span style='color: #C867B5;'>Code</span><span style='color: #DD71C8;'>Bench</span> Leaderboard🌸</h1></div>\
-        <br>\
-        <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare performance of LLMs on <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
-        <p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.</p>
-        """,
-        elem_classes="markdown-text",
-    )
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.…
-            with gr.…
[old lines 153–186: the search bar and column-selection widgets are not recoverable from the rendered diff]
-            )
-            filter_types_columns = gr.Radio(
-                label="⏚ Filter model types",
-                choices=["all", "🟢 base", "🔶 instruction-tuned"], #, "EXT external-evaluation"],
-                value="all",
-                elem_id="filter-columns",
-            )
-            filter_prompting_columns = gr.Radio(
-                label="⏚ Filter prompting",
-                choices=["all", "chat template", "direct complete"],
-                value="all",
-                elem_id="filter-direct-complete",
-            )
-            leaderboard_df = gr.components.Dataframe(
-                value=df[
-                    [
-                        AutoEvalColumn.model_type_symbol.name,
-                        AutoEvalColumn.model.name,
-                    ]
-                    + shown_columns.value
-                ],
-                headers=[
-                    AutoEvalColumn.model_type_symbol.name,
-                    AutoEvalColumn.model.name,
-                ]
-                + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-            )
-
-            hidden_leaderboard_df = gr.components.Dataframe(
-                value=df,
-                headers=COLS,
-                datatype=["str" for _ in range(len(COLS))],
-                visible=False,
-            )
-            search_bar.submit(
-                search_table,
-                [hidden_leaderboard_df, leaderboard_df, search_bar],
-                leaderboard_df,
-            )
-            filter_types_columns.change(
-                filter_types,
-                [hidden_leaderboard_df, leaderboard_df, filter_types_columns],
-                leaderboard_df,
-            )
-            filter_prompting_columns.change(
-                filter_direct_complete,
-                [hidden_leaderboard_df, leaderboard_df, filter_prompting_columns],
-                leaderboard_df,
-            )
-            shown_columns.change(
-                select_columns,
-                [hidden_leaderboard_df, shown_columns],
-                leaderboard_df,
-            )
-            gr.Markdown(
-                """
-                **Notes:**
-                - _Complete_ vs _Instruct_:
-                  - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
-                  - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
-                - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-                - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
-                - `size` is the amount of activated model weight during inference.
-                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
-                - For more details check the 📝 About section.
-                """,
-                elem_classes="markdown-text",
-            )
-
-        with gr.TabItem("📊 Elo Rating", id=1):
-            with gr.Column():
-                with gr.Group():
-                    gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                    task_elo_map = gr.Plot()
-                    demo.load(plot_elo_mle, [gr.Dataframe(task_elo_mle_df, visible=False)], task_elo_map)
-                with gr.Group():
-                    gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                    model_elo_map = gr.Plot()
-                    demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)
-
-        with gr.TabItem("🧩 Solve Rate", id=2):
-            with gr.Column():
-                complete_map = gr.Plot()
-                demo.load(plot_solve_rate, [gr.Dataframe(complete_solve_rate, visible=False),
-                                            gr.Textbox("Complete", visible=False),
-                                            ], complete_map)
-                instruct_map = gr.Plot()
-                demo.load(plot_solve_rate, [gr.Dataframe(instruct_solve_rate, visible=False),
-                                            gr.Textbox("Instruct", visible=False),
-                                            ], instruct_map)
-
-        with gr.TabItem("📝 About", id=3):
-            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("Submit/Request Results 🚀", id=4):
-            gr.Markdown(SUBMISSION_TEXT_3)

[old lines 286–293 not recoverable from the rendered diff]
 )
-
Added (new file):

 import os
+import logging
+import time
+import schedule
+import datetime
 import gradio as gr
+from threading import Thread
+import datasets
+from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
+from apscheduler.schedulers.background import BackgroundScheduler
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+from src.display.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    # INTRODUCTION_TEXT,
+    TITLE,
+    ABOUT_TEXT,
+    SUBMISSION_TEXT_3,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
     AutoEvalColumn,
     fields,
+    EvalQueueColumn
 )
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    RESULT_REPO,
+    HARD_RESULT_REPO,
+    ELO_REPO,
+    HARD_ELO_REPO,
+    SOLVE_REPO,
+    HARD_SOLVE_REPO,
+    HF_TOKEN,
+    QUEUE_REPO,
+    REPO_ID,
+    VOTES_REPO,
+    VOTES_PATH,
+    HF_HOME,
+)
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.tools.plots import plot_elo_mle, plot_solve_rate
+# from src.voting.vote_system import VoteManager, run_scheduler
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = True  # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+NEW_DATA_ON_LEADERBOARD = True
+LEADERBOARD_DF = None
+HARD_LEADERBOARD_DF = None
+ELO_TASK_DF = None
+ELO_BENCH_DF = None
+HARD_ELO_TASK_DF = None
+HARD_ELO_BENCH_DF = None
+COMPLETE_SOLVE_DF = None
+INSTRUCT_SOLVE_DF = None
+HARD_COMPLETE_SOLVE_DF = None
+HARD_INSTRUCT_SOLVE_DF = None
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+
+    return wrapper
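Note: `time_diff_wrapper` is a plain timing decorator — it logs the wall-clock duration of every call to the wrapped function. A minimal, self-contained sketch of how it behaves (the `slow_task` function is hypothetical, purely for illustration):

import logging
import time

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def time_diff_wrapper(func):
    # Log how long each call to `func` takes.
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        logging.info(f"Time taken for {func.__name__}: {time.time() - start_time} seconds")
        return result
    return wrapper

@time_diff_wrapper
def slow_task():
    time.sleep(0.2)  # stand-in for an expensive call such as snapshot_download

slow_task()  # logs something like: Time taken for slow_task: 0.20... seconds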
+
+
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
+    attempt = 0
+    while attempt < max_attempts:
+        try:
+            logging.info(f"Downloading {repo_id} to {local_dir}")
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
+                tqdm_class=None,
+                etag_timeout=30,
+                max_workers=8,
+            )
+            logging.info("Download successful")
+            return
+        except Exception as e:
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
+            attempt += 1
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
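With the default max_attempts=3 and backoff_factor=1.5, the waits between retries grow as 1.5**0 = 1.0 s, 1.5**1 = 1.5 s, and 1.5**2 = 2.25 s before the final Exception is raised. A quick check of that schedule (standalone, not part of app.py):

backoff_factor, max_attempts = 1.5, 3
print([backoff_factor**attempt for attempt in range(max_attempts)])
# [1.0, 1.5, 2.25]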
+
+def get_latest_data_leaderboard(
+    leaderboard_initial_df = None,
+    hard_leaderboard_initial_df = None,
+    elo_task_df = None,
+    elo_bench_df = None,
+    hard_elo_task_df = None,
+    hard_elo_bench_df = None,
+    complete_solve_df = None,
+    instruct_solve_df = None,
+    hard_complete_solve_df = None,
+    hard_instruct_solve_df = None
+):
+    global NEW_DATA_ON_LEADERBOARD
+    global LEADERBOARD_DF
+    global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
+    global HARD_ELO_TASK_DF
+    global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
+    global HARD_COMPLETE_SOLVE_DF
+    global HARD_INSTRUCT_SOLVE_DF
+
+    if NEW_DATA_ON_LEADERBOARD:
+        print("Leaderboard updated at reload!")
+        leaderboard_dataset = datasets.load_dataset(
+            RESULT_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+        )
+        hard_leaderboard_dataset = datasets.load_dataset(
+            HARD_RESULT_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        hard_leaderboard_df = get_leaderboard_df(
+            leaderboard_dataset=hard_leaderboard_dataset,
+            cols=COLS,
+        )
+        HARD_LEADERBOARD_DF = hard_leaderboard_df
+
+        elo_task_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        elo_bench_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        ELO_TASK_DF = elo_task_df
+        ELO_BENCH_DF = elo_bench_df
+
+        hard_elo_task_df = datasets.load_dataset(
+            HARD_ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        hard_elo_bench_df = datasets.load_dataset(
+            HARD_ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        HARD_ELO_TASK_DF = hard_elo_task_df
+        HARD_ELO_BENCH_DF = hard_elo_bench_df
+
+        complete_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        instruct_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        COMPLETE_SOLVE_DF = complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+
+        hard_complete_solve_df = datasets.load_dataset(
+            HARD_SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        hard_instruct_solve_df = datasets.load_dataset(
+            HARD_SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
+
+        NEW_DATA_ON_LEADERBOARD = False

     else:
+        LEADERBOARD_DF = leaderboard_initial_df
+        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        ELO_TASK_DF = elo_task_df
+        ELO_BENCH_DF = elo_bench_df
+        HARD_ELO_TASK_DF = hard_elo_task_df
+        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        COMPLETE_SOLVE_DF = complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
+
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+
+
+def init_space():
+    """Initializes the application space, loading only necessary data."""
+
+    # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
+    global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
+    global HARD_ELO_TASK_DF
+    global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
+    global HARD_COMPLETE_SOLVE_DF
+    global HARD_INSTRUCT_SOLVE_DF
+
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+
+    # Evaluation queue DataFrame retrieval is independent of initialization detail level
+    # eval_queue_dfs = get_latest_data_queue()
+
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
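Both functions above follow a module-level cache pattern: a dirty flag (`NEW_DATA_ON_LEADERBOARD`) decides whether to reload everything from the Hub or simply hand back the DataFrames that Gradio already holds. A stripped-down, runnable sketch of the same idea (all names here are illustrative, not from app.py):

CACHE_DIRTY = True
CACHED_VALUE = None

def expensive_reload():
    return "fresh data"  # stand-in for datasets.load_dataset(...)

def get_latest(initial_value=None):
    """Reload only when the dirty flag is set; otherwise echo the caller's value back."""
    global CACHE_DIRTY, CACHED_VALUE
    if CACHE_DIRTY:
        CACHED_VALUE = expensive_reload()
        CACHE_DIRTY = False
    else:
        CACHED_VALUE = initial_value
    return CACHED_VALUE

print(get_latest())         # first call reloads
print(get_latest("stale"))  # flag is now False: the passed-in value is kept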
+
+# Initialize VoteManager
+# vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
+
+
+# Schedule the upload_votes method to run every 15 minutes
+# schedule.every(15).minutes.do(vote_manager.upload_votes)
+
+# Start the scheduler in a separate thread
+# scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
+# scheduler_thread.start()
+
+# Initialize the various DataFrames used throughout the application.
+# (The DO_FULL_INIT flag above is reserved for controlling initialization depth.)
+LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
+ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
+COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
+HARD_INSTRUCT_SOLVE_DF = init_space()
+
+
+# Data processing for plots now only on demand in the respective Gradio tab
+# def load_and_create_plots():
+#     plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
+#     return plot_df
+
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
+            ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
+            ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
+            ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )


+def init_others(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Gradio DataFrame is empty or None.")
+    return gr.Dataframe(dataframe, visible=False)
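As a usage sketch, `init_leaderboard` accepts any DataFrame whose columns carry the user-facing names defined in src/display/utils.py; the filters reference the `Model Type`, `Openness`, `Size Range`, and `MoE` columns, so those need to be present. A hypothetical two-row frame (assumes `gradio_leaderboard` is installed; all values are invented):

import pandas as pd

toy_df = pd.DataFrame({
    "T": ["🔶", "🟢"],
    "Model": ["model-a", "model-b"],  # would normally hold HTML links
    "Model Type": ["🔶 Chat Models (RLHF, DPO, IFT, ...)", "🟢 Base Models"],
    "Size Range": ["~7", "~35"],
    "Complete": [50.1, 40.2],
    "Instruct": [45.3, None],
    "Average": [47.7, None],
    "Elo Rating": [1050, 980],
    "Link": ["https://huggingface.co/org/model-a", "https://huggingface.co/org/model-b"],
    "#Act Params (B)": [6.7, 34.0],
    "#Params (B)": [6.7, 34.0],
    "MoE": ["Dense", "Dense"],
    "Lazy": [False, False],
    "Openness": ["Open", "Open"],
    "Direct Completion": [False, True],
})
leaderboard = init_leaderboard(toy_df)  # renders with search, column selection, and filters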
+
+main_block = gr.Blocks(css=custom_css)
+with main_block as demo:
+    with gr.Row(elem_id="header-row"):
+        gr.HTML(TITLE)
+
+    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tab("💎 Hard Set") as hard_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - _Hard_ vs _Full_:
+                      - <u>Hard</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                      - <u>Full</u>: The full set of 1140 BigCodeBench tasks.
+                    - _Complete_ vs _Instruct_:
+                      - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                      - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                    - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                    - `Average` is the average of `Complete` and `Instruct` when both are available.
+                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                    - `#Act Params (B)` is the number of activated model parameters during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on closed data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
+
+            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        hard_task_elo_map = gr.Plot()
+                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [hard_elo_task_gr],
+                                  hard_task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        hard_bench_elo_map = gr.Plot()
+                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                                  hard_bench_elo_map)
+
+            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+                with gr.Column():
+                    hard_complete_map = gr.Plot()
+                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_complete_map)
+                    hard_instruct_map = gr.Plot()
+                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_instruct_map)
+        with gr.Tab("🎯 Full Set") as full_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - _Complete_ vs _Instruct_:
+                      - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+                      - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+                    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                    - `size` is the amount of activated model weights during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on closed data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
                 )
+
+            with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        task_elo_map = gr.Plot()
+                        elo_task_gr = init_others(ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        bench_elo_map = gr.Plot()
+                        elo_bench_gr = init_others(ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+
+            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+                with gr.Column():
+                    complete_map = gr.Plot()
+                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                ], complete_map)
+                    instruct_map = gr.Plot()
+                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                ], instruct_map)
+
+        with gr.TabItem("📝 About", id=3):
+            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("Request 🚀", id=4):
+            gr.Markdown(SUBMISSION_TEXT_3)
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+    # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
+
+main_block.queue(default_concurrency_limit=40)
+
+
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )

+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=main_block)
+
+# Add webhooks
+@webhooks_server.add_webhook
+def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        global NEW_DATA_ON_LEADERBOARD
+        if NEW_DATA_ON_LEADERBOARD:
+            return
+        NEW_DATA_ON_LEADERBOARD = True
+
+        for repo in [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]:
+            datasets.load_dataset(
+                repo,
+                "default",
+                cache_dir=HF_HOME,
+                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+                verification_mode="no_checks"
+            )
+
+
+webhooks_server.launch()
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h as backup in case automatic updates are not working
+scheduler.start()
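The closing lines act as a safety net: even if the dataset webhook never fires, the Space restarts every three hours and re-initializes with fresh data. A self-contained sketch of the same APScheduler pattern (the job body is a stand-in for restart_space):

import time
from apscheduler.schedulers.background import BackgroundScheduler

def refresh():
    print("refreshing...")  # stand-in for restart_space()

scheduler = BackgroundScheduler()
scheduler.add_job(refresh, "interval", seconds=1)  # app.py uses hours=3
scheduler.start()
time.sleep(3)  # let the job fire a few times
scheduler.shutdown()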
requirements.txt
CHANGED
@@ -1,19 +1,23 @@
-APScheduler
-black
-click
-datasets
-gradio
-gradio_client
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
 huggingface-hub>=0.18.0
-matplotlib
-numpy
-pandas
-
-
-tqdm
-transformers
-tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
-accelerate
+matplotlib==3.8.4
+numpy==1.26.0
+pandas==2.2.2
+plotly==5.14.1
+python-dateutil==2.8.2
 sentencepiece
-
+tqdm==4.65.0
+transformers==4.41.1
+tokenizers>=0.15.0
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3  # CI !!!
+isort
+ruff
+gradio==4.31.0
+gradio[oauth]
+gradio_leaderboard==0.0.11
+requests==2.31.0
+requests-oauthlib==1.3.1
+schedule==1.2.2
src/{text_content.py → display/about.py}
RENAMED
@@ -1,3 +1,11 @@
+TITLE = """<div style="text-align: center;"><h1> 🌸<span style='color: #C867B5;'>BigCodeBench</span> Leaderboard🌸</h1></div>\
+<br>\
+<p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare the performance of LLMs on the <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
+<p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.
+<br>\
+For more details, please check our <a href="https://huggingface.co/blog/leaderboard-bigcodebench-hard">blog on the Hard Set</a>, <a href="https://huggingface.co/blog/leaderboard-bigcodebench">blog on the Full Set</a>, and <a href="https://arxiv.org/abs/2406.15877">paper</a>.</p>
+"""
+
 ABOUT_TEXT = """# Context
 We believe that there are three main expectations of a good execution-based programming benchmark:
 1. The benchmark should be easy to use and efficient in evaluating the fundamental capabilities of LLMs. Repo-level and agent-centric benchmarks (e.g., SWE-bench) are not suitable for this purpose.

@@ -135,5 +143,6 @@ CITATION_BUTTON_TEXT = r"""
 """

 SUBMISSION_TEXT_3="""
-We welcome the community to request for new models to be added to the leaderboard.
+## We welcome the community to request new models to be added to the leaderboard.
+## Please [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard or [start a discussion](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard/discussions/new) in the community 🤗
 """
src/{css_html.py → display/css_html_js.py}
RENAMED
@@ -1,13 +1,18 @@
-# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
 custom_css = """
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
 }

+/* Full width space */
+.gradio-container {
+    max-width: 95% !important;
 }

+/* Text style and margins */
 .markdown-text {
     font-size: 16px !important;
 }

@@ -29,51 +34,82 @@ custom_css = """
     transform: scale(1.3);
 }

-#leaderboard-table {
-    margin-top: 15px
-}
-
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-
 #search-bar-table-box > div:first-child {
     background: none;
     border: none;
 }
+
 #search-bar {
     padding: 0px;
 }

-#llm-benchmark-tab-table table th:last-child {
-    display: none;
+.tab-buttons button {
+    font-size: 20px;
 }

-/* [several removed rules are not recoverable from the rendered diff; surviving fragments show a commented-out block and a centered element with margin-left: auto; margin-right: auto; max-width: 600px;] */
+/* Filters style */
+#filter_type {
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
+}
+#filter_type label {
+    display: flex;
+}
+#filter_type label > span {
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap {
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner {
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
+}
+#filter-columns-type {
+    border: 0;
+    padding: 0.5;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5;
+}
+#box-filter > .form {
+    border: 0;
+}

+/* Header styles */
+#header-title {
+    text-align: left;
+    display: inline-block;
 }

+#header-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
 }

+#header-row .gradio-html {
+    flex-grow: 1;
 }
+
+#oauth-button {
+    height: auto;
+    min-width: max-content;
+    white-space: nowrap;
+    padding: 10px 20px;
+    border-radius: 4px;
+}
+"""
+
+get_window_url_params = """
+function(url_params) {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    return url_params;
+}
+"""
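`get_window_url_params` is a JavaScript snippet meant to be attached to a Gradio event so a Python callback can read the page's query string. It is not wired up anywhere in this commit; a hypothetical hookup under Gradio 4.x, where event listeners accept a `js=` argument, could look like:

import gradio as gr
from src.display.css_html_js import get_window_url_params

def show_params(params):
    return f"URL params: {params}"

with gr.Blocks() as demo:
    url_params = gr.JSON(visible=False)
    out = gr.Textbox(label="Query string")
    # The js function runs client-side and its return value feeds the inputs.
    demo.load(show_params, inputs=[url_params], outputs=[out], js=get_window_url_params)

demo.launch()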
src/display/formatting.py
ADDED
@@ -0,0 +1,37 @@
+from huggingface_hub import HfApi
+
+API = HfApi()
+
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(df, model_col, link_col):
+    df[model_col] = df.apply(
+        lambda row: model_hyperlink(row[link_col], row[model_col]), axis=1
+    )
+    df["Openness"] = df.apply(
+        lambda row: "Open" if "huggingface.co" in row[link_col] else "Closed", axis=1
+    )
+    return df
+
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
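A quick, self-contained usage sketch for the helpers above (the rows are hypothetical):

import pandas as pd
from src.display.formatting import make_clickable_model, styled_message

df = pd.DataFrame({
    "Model": ["starcoder2-15b", "closed-model"],
    "Link": ["https://huggingface.co/bigcode/starcoder2-15b", "https://example.com/closed-model"],
})
df = make_clickable_model(df, "Model", "Link")
print(df["Openness"].tolist())   # ['Open', 'Closed'] — keyed on whether the link points at huggingface.co
print(styled_message("Submission received"))  # green, centered HTML paragraph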
src/display/utils.py
ADDED
@@ -0,0 +1,142 @@
+from dataclasses import dataclass, make_dataclass
+from enum import Enum
+import json
+import logging
+from datetime import datetime
+import pandas as pd
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+# Convert ISO 8601 dates to datetime objects for comparison
+def parse_iso8601_datetime(date_str):
+    if date_str.endswith('Z'):
+        date_str = date_str[:-1] + '+00:00'
+    return datetime.fromisoformat(date_str)
+
+def parse_datetime(datetime_str):
+    formats = [
+        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
+        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
+        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
+    ]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(datetime_str, fmt)
+        except ValueError:
+            continue
+    # in rare cases set unix start time for files with incorrect time (legacy files)
+    logging.error(f"No valid date format found for: {datetime_str}")
+    return datetime(1970, 1, 1)
+
+
+def load_json_data(file_path):
+    """Safely load JSON data from a file."""
+    try:
+        with open(file_path, "r") as file:
+            return json.load(file)
+    except json.JSONDecodeError:
+        print(f"Error reading JSON from {file_path}")
+        return None  # Or raise an exception
+
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+column_map = {
+    "T": "T",
+    "model": "Model",
+    "type": "Model Type",
+    "size_range": "Size Range",
+    "complete": "Complete",
+    "instruct": "Instruct",
+    "average": "Average",
+    "elo_mle": "Elo Rating",
+    "link": "Link",
+    "act_param": "#Act Params (B)",
+    "size": "#Params (B)",
+    "moe": "MoE",
+    "lazy": "Lazy",
+    "openness": "Openness",
+    "direct_complete": "Direct Completion",
+}
+
+type_map = {
+    "🔶": "🔶 Chat Models (RLHF, DPO, IFT, ...)",
+    "🟢": "🟢 Base Models"
+}
+
+moe_map = {
+    True: "MoE",
+    False: "Dense"
+}
+# These classes are for user-facing column names,
+# to avoid having to change them all around the code
+# when a modification is needed
+@dataclass(frozen=True)
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+    hidden: bool = False
+    never_hidden: bool = False
+    dummy: bool = False
+
+
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["T", ColumnContent, ColumnContent(column_map["T"], "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent(column_map["model"], "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["type", ColumnContent, ColumnContent(column_map["type"], "str", False, True)])
+auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_map["size_range"], "str", False, True)])
+# Scores
+auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
+auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
+auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
+
+# Model information
+auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
+auto_eval_column_dict.append(["link", ColumnContent, ColumnContent(column_map["link"], "str", False, True)])
+auto_eval_column_dict.append(["size", ColumnContent, ColumnContent(column_map["size"], "number", False)])
+auto_eval_column_dict.append(["lazy", ColumnContent, ColumnContent(column_map["lazy"], "bool", False, True)])
+auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)])
+auto_eval_column_dict.append(["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)])
+auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
+
+# We use make_dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model_link = ColumnContent("link", "markdown", True)
+    model_name = ColumnContent("model", "str", True)
+
+@dataclass
+class ModelDetails:
+    name: str
+    symbol: str = ""  # emoji, only for the model type
+
+
+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+
+NUMERIC_INTERVALS = {
+    "?": pd.Interval(-1, 0, closed="right"),
+    "~1.5": pd.Interval(0, 2, closed="right"),
+    "~3": pd.Interval(2, 4, closed="right"),
+    "~7": pd.Interval(4, 9, closed="right"),
+    "~13": pd.Interval(9, 20, closed="right"),
+    "~35": pd.Interval(20, 45, closed="right"),
+    "~60": pd.Interval(45, 70, closed="right"),
+    "70+": pd.Interval(70, 10000, closed="right"),
+}
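A small sketch of how these pieces compose: `fields` enumerates the dynamically built `AutoEvalColumn`, `NUMERIC_INTERVALS` buckets a parameter count into a size range, and `parse_datetime` tolerates the dash-separated timestamps found in legacy files (the inputs are illustrative):

from src.display.utils import AutoEvalColumn, fields, NUMERIC_INTERVALS, parse_datetime

print([c.name for c in fields(AutoEvalColumn) if c.displayed_by_default])
# ['T', 'Model', 'Complete', 'Instruct', 'Average', 'Elo Rating', '#Act Params (B)']

size_b = 6.7  # a 6.7B-parameter model
print(next((k for k, v in NUMERIC_INTERVALS.items() if size_b in v), "?"))
# '~7'  (the (4, 9] interval)

print(parse_datetime("2024-06-18T12-30-00.000000"))
# 2024-06-18 12:30:00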
src/envs.py
ADDED
@@ -0,0 +1,39 @@
+import os
+from huggingface_hub import HfApi
+
+# clone / pull the lmeh eval data
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+REPO_ID = "bigcode/bigcodebench-leaderboard"
+QUEUE_REPO = "bigcode/bigcodebench-requests"
+RESULT_REPO = "bigcode/bigcodebench-results"
+HARD_RESULT_REPO = "bigcode/bigcodebench-hard-results"
+
+ELO_REPO = "bigcode/bigcodebench-elo"
+HARD_ELO_REPO = "bigcode/bigcodebench-hard-elo"
+SOLVE_REPO = "bigcode/bigcodebench-solve-rate"
+HARD_SOLVE_REPO = "bigcode/bigcodebench-hard-solve-rate"
+
+VOTES_REPO = "bigcode/bigcodebench-votes"
+
+HF_HOME = os.getenv("HF_HOME", ".")
+
+# Check HF_HOME write access
+print(f"Initial HF_HOME set to: {HF_HOME}")
+
+if not os.access(HF_HOME, os.W_OK):
+    print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
+    HF_HOME = "."
+    os.environ["HF_HOME"] = HF_HOME
+else:
+    print("Write access confirmed for HF_HOME")
+
+VOTES_PATH = os.path.join(HF_HOME, "model-votes")
+EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
+
+# Rate limit variables
+RATE_LIMIT_PERIOD = 7
+RATE_LIMIT_QUOTA = 5
+HAS_HIGHER_RATE_LIMIT = []
+
+API = HfApi(token=HF_TOKEN)
src/populate.py
ADDED
@@ -0,0 +1,50 @@
+import pathlib
+import pandas as pd
+from datasets import Dataset
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import load_json_data, column_map, type_map, moe_map, NUMERIC_INTERVALS
+
+
+def get_evaluation_queue_df(save_path, cols):
+    """Generate dataframes for pending, running, and finished evaluation entries."""
+    save_path = pathlib.Path(save_path)
+    all_evals = []
+
+    for path in save_path.rglob("*.json"):
+        data = load_json_data(path)
+        if data:
+            all_evals.append(data)
+    # Organize entries by status
+    status_map = {
+        "PENDING": ["PENDING", "RERUN"],
+        "RUNNING": ["RUNNING"],
+        "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
+    }
+    status_dfs = {status: [] for status in status_map}
+    for eval_data in all_evals:
+        for status, extra_statuses in status_map.items():
+            if eval_data["status"] in extra_statuses:
+                status_dfs[status].append(eval_data)
+
+    return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
+
+
+def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list):
+    """Retrieve and process leaderboard data."""
+    all_data_json = leaderboard_dataset.to_dict()
+    num_items = leaderboard_dataset.num_rows
+    all_data_json_list = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
+
+    df = pd.DataFrame.from_records(all_data_json_list)
+    # Map raw column names to the user-facing ones, and the MoE flag to "MoE"/"Dense"
+    df = df.rename(columns=column_map)
+    df[AutoEvalColumn.moe.name] = df[AutoEvalColumn.moe.name].map(moe_map)
+    df[AutoEvalColumn.T.name] = df[AutoEvalColumn.type.name]
+    df[AutoEvalColumn.type.name] = df[AutoEvalColumn.type.name].map(type_map)
+    df[AutoEvalColumn.average.name] = df.apply(lambda x: (x[AutoEvalColumn.complete.name] + x[AutoEvalColumn.instruct.name]) / 2 if not pd.isna(x[AutoEvalColumn.complete.name]) and not pd.isna(x[AutoEvalColumn.instruct.name]) else None, axis=1)
+    df[AutoEvalColumn.size_range.name] = df[AutoEvalColumn.size.name].apply(lambda x: next((k for k, v in NUMERIC_INTERVALS.items() if x in v), "?"))
+    df = make_clickable_model(df, AutoEvalColumn.model.name, AutoEvalColumn.link.name)
+    df = df.sort_values(by=[AutoEvalColumn.complete.name], ascending=False)
+    df = df[cols].round(decimals=2)
+    return df
|
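Taken together, a minimal sketch of how these two helpers would be called from app.py (the column lists and the results dataset id below are assumptions; only populate.py itself is part of this diff):

from datasets import load_dataset

from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.display.utils import COLS, EVAL_COLS  # assumed exports of the new utils module

# Leaderboard results live in a Hub dataset; the request queue lives on local disk
leaderboard_ds = load_dataset("bigcode/bigcodebench-results", split="train")  # hypothetical repo id
leaderboard_df = get_leaderboard_df(leaderboard_ds, COLS)
finished_df, running_df, pending_df = get_evaluation_queue_df("eval-queue", EVAL_COLS)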
src/{utils.py → tools/plots.py} RENAMED
@@ -1,45 +1,6 @@
-# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
-from dataclasses import dataclass
 import plotly.graph_objects as go
-from transformers import AutoConfig
 import plotly.express as px
 import numpy as np
-# These classes are for user facing column names, to avoid having to change them
-# all around the code when a modif is needed
-@dataclass
-class ColumnContent:
-    name: str
-    type: str
-    displayed_by_default: bool
-    hidden: bool = False
-
-
-def fields(raw_class):
-    return [
-        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
-    ]
-
-
-@dataclass(frozen=True)
-class AutoEvalColumn:  # Auto evals column
-    model_type_symbol = ColumnContent("type", "str", True)
-    model = ColumnContent("model", "markdown", True)
-    complete_score = ColumnContent("complete", "number", True)
-    instruct_score = ColumnContent("instruct", "number", True)
-    elo_mle = ColumnContent("elo_mle", "number", True)
-    dummy = ColumnContent("model", "str", True)
-    size = ColumnContent("size", "number", True)
-
-
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
-def make_clickable_names(df):
-    df["model"] = df.apply(
-        lambda row: model_hyperlink(row["link"], row["model"]), axis=1
-    )
-    return df
 
 
 def plot_elo_mle(df):
@@ -63,13 +24,6 @@ def plot_solve_rate(df, task, rows=30, cols=38):
     values = np.array(values)
 
     n = len(values)
-    if rows is None or cols is None:
-        cols = int(math.sqrt(n))
-        rows = cols if cols * cols >= n else cols + 1
-
-        while rows * cols < n:
-            cols += 1
-
     values = np.pad(values, (0, rows * cols - n), 'constant', constant_values=np.nan).reshape((rows, cols))
     keys = np.pad(keys, (0, rows * cols - n), 'constant', constant_values='').reshape((rows, cols))
 
@@ -102,40 +56,4 @@ def plot_solve_rate(df, task, rows=30, cols=38):
         # height=600,
     )
 
-    return fig
-
-
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
-
-
-def is_model_on_hub(model_name: str, revision: str) -> bool:
-    try:
-        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
-        return True, None
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-        )
-
-    except Exception as e:
-        print(f"Could not get the model config from the hub.: {e}")
-        return False, "was not found on hub!"
+    return fig
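With the auto-sizing branch removed, plot_solve_rate now requires the caller to pass rows and cols such that rows * cols >= n; the surviving lines just pad to the grid and reshape. A tiny standalone illustration of that padding (the values are made up):

import numpy as np

values = np.array([0.2, 0.5, 0.9, 0.1, 0.7])  # five solve rates
rows, cols = 2, 3                             # caller must guarantee rows * cols >= len(values)
grid = np.pad(values, (0, rows * cols - len(values)), 'constant', constant_values=np.nan).reshape((rows, cols))
# grid -> [[0.2, 0.5, 0.9],
#          [0.1, 0.7, nan]]   NaN cells render as empty squares in the heatmap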
src/voting/vote_system.py ADDED
@@ -0,0 +1,150 @@
+import json
+import logging
+import pathlib
+import pandas as pd
+import gradio as gr
+import schedule
+import time
+from datetime import datetime, timezone
+
+from src.envs import API
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class VoteManager:
+    def __init__(self, votes_path, eval_requests_path, repo_id):
+        self.votes_path = votes_path
+        self.eval_requests_path = eval_requests_path
+        self.repo_id = repo_id
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+        self.votes_to_upload = []
+
+    def init_vote_dataset(self):
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+
+    def read_vote_dataset(self):
+        result = []
+        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+        if votes_file.exists():
+            with open(votes_file, "r") as f:
+                for line in f:
+                    data = json.loads(line.strip())
+                    result.append(data)
+        return pd.DataFrame(result)
+
+    def make_check_set(self, vote_dataset: pd.DataFrame):
+        # One (model, revision, username) tuple per recorded vote, for O(1) duplicate checks
+        result = []
+        for row in vote_dataset.itertuples(index=False, name="vote"):
+            result.append((row.model, row.revision, row.username))
+        return set(result)
+
+    def get_model_revision(self, selected_model: str) -> str:
+        """Fetch the revision for the given model from the request files."""
+        for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
+            if user_folder.is_dir():
+                for file in user_folder.glob("*.json"):
+                    with open(file, "r") as f:
+                        data = json.load(f)
+                        if data.get("model") == selected_model:
+                            return data.get("revision", "main")
+        return "main"
+
+    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
+        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
+            return pending_models_df
+        self.vote_dataset = self.read_vote_dataset()
+        vote_counts = self.vote_dataset.groupby(["model", "revision"]).size().reset_index(name="vote_count")
+
+        pending_models_df_votes = pd.merge(
+            pending_models_df,
+            vote_counts,
+            left_on=["model_name", "revision"],
+            right_on=["model", "revision"],
+            how="left",
+        )
+        # Models without any votes yet get a count of zero
+        pending_models_df_votes["vote_count"] = pending_models_df_votes["vote_count"].fillna(0)
+        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
+        # Drop the redundant name columns left over from the merge
+        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
+        return pending_models_df_votes
+
+    # Called when a user votes for a model
+    def add_vote(
+        self,
+        selected_model: str,
+        pending_models_df: gr.Dataframe,
+        profile: gr.OAuthProfile | None,
+    ):
+        logger.debug(f"Type of list before usage: {type(list)}")
+        # A vote is (model_name, revision, user_id, timestamp)
+        if selected_model in ["str", ""]:
+            gr.Warning("No model selected")
+            return
+
+        if profile is None:
+            gr.Warning("Hub Login required")
+            return
+
+        vote_username = profile.username
+        model_revision = self.get_model_revision(selected_model)
+
+        # Immutable tuple used to check whether this user already voted for this model
+        check_tuple = (selected_model, model_revision, vote_username)
+        if check_tuple in self.vote_check_set:
+            gr.Warning("Already voted for this model")
+            return
+
+        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        vote_obj = {
+            "model": selected_model,
+            "revision": model_revision,
+            "username": vote_username,
+            "timestamp": current_time,
+        }
+
+        # Append the vote to the local JSONL file
+        try:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            with open(votes_file, "a") as f:
+                f.write(json.dumps(vote_obj) + "\n")
+            logger.info(f"Vote added locally: {vote_obj}")
+
+            self.votes_to_upload.append(vote_obj)
+        except Exception as e:
+            logger.error(f"Failed to write vote to file: {e}")
+            gr.Warning("Failed to record vote. Please try again")
+            return
+
+        self.vote_check_set.add(check_tuple)
+        gr.Info(f"Voted for {selected_model}")
+
+        return self.create_request_vote_df(pending_models_df)
+
+    def upload_votes(self):
+        if self.votes_to_upload:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            try:
+                with open(votes_file, "rb") as f:
+                    API.upload_file(
+                        path_or_fileobj=f,
+                        path_in_repo="votes_data.jsonl",
+                        repo_id=self.repo_id,
+                        repo_type="dataset",
+                        commit_message="Updating votes_data.jsonl with new votes",
+                    )
+                logger.info("Votes uploaded to votes repository")
+                self.votes_to_upload.clear()
+            except Exception as e:
+                logger.error(f"Failed to upload votes to repository: {e}")
+
+
+def run_scheduler(vote_manager):
+    while True:
+        schedule.run_pending()
+        time.sleep(1)