Update leaderboard.py
leaderboard.py (+112 -29)
CHANGED
@@ -8,6 +8,7 @@ import threading
 import arena_config
 import sys
 import math
+import plotly.graph_objects as go
 
 # Initialize Nextcloud client
 nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD)
@@ -56,7 +57,7 @@ def update_elo_ratings(winner, loser):
     loser_size = get_model_size(loser)
     max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
 
-    k_factor = 32 * (1 + (loser_size - winner_size) / max_size)
+    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
 
     elo_ratings[winner] += k_factor * (1 - expected_winner)
     elo_ratings[loser] += k_factor * (0 - expected_loser)
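Note: the new `min(64, ...)` keeps the K-factor bounded at 64 even for the largest size gap. A quick standalone check of the formula (illustrative sizes only; real values come from arena_config.APPROVED_MODELS):

    # Sketch of the capped K-factor with made-up sizes (max_size = 70).
    def k_factor(winner_size, loser_size, max_size=70):
        return min(64, 32 * (1 + (loser_size - winner_size) / max_size))

    print(k_factor(7, 7))    # equal sizes  -> 32.0 (baseline)
    print(k_factor(7, 70))   # 7 beats 70   -> 60.8 (upset is rewarded)
    print(k_factor(70, 7))   # 70 beats 7   -> 3.2 (expected win, small update)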
@@ -205,6 +206,27 @@ def get_leaderboard():
     leaderboard_html += "</table>"
     return leaderboard_html
 
+def calculate_elo_impact(model):
+    positive_impact = 0
+    negative_impact = 0
+    leaderboard = load_leaderboard()
+    initial_rating = 1000 + (get_model_size(model) * 100)
+
+    for opponent, results in leaderboard[model]['opponents'].items():
+        model_size = get_model_size(model)
+        opponent_size = get_model_size(opponent)
+        max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
+
+        size_difference = (opponent_size - model_size) / max_size
+
+        win_impact = 1 + max(0, size_difference)
+        loss_impact = 1 + max(0, -size_difference)
+
+        positive_impact += results['wins'] * win_impact
+        negative_impact += results['losses'] * loss_impact
+
+    return round(positive_impact), round(negative_impact), round(initial_rating)
+
 def get_elo_leaderboard():
     ensure_elo_ratings_initialized()
     leaderboard = load_leaderboard()
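For intuition: each win is weighted 1 + max(0, size_difference), so only wins over larger opponents earn extra credit, and each loss is weighted 1 + max(0, -size_difference), so only losses to smaller opponents cost extra. A worked example with made-up sizes (a size-7 model facing a size-70 opponent, max size 70):

    size_difference = (70 - 7) / 70            # 0.9, opponent is much larger
    win_impact = 1 + max(0, size_difference)   # 1.9 per win over the larger model
    loss_impact = 1 + max(0, -size_difference) # 1.0 per loss, no extra penalty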
@@ -217,9 +239,9 @@ def get_elo_leaderboard():
     <p style="font-size: 16px; margin-bottom: 20px;">
     This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
     Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
-
-    The "
-    The
+    The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
+    The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
+    The current ELO rating is calculated based on these impacts and the model's performance history.
     </p>
     """
 
@@ -249,41 +271,29 @@ def get_elo_leaderboard():
     <tr>
         <th class='rank-column'>Rank</th>
         <th>Model</th>
-        <th>ELO Rating</th>
-        <th>
-        <th>
+        <th>Current ELO Rating</th>
+        <th>Positive Impact</th>
+        <th>Negative Impact</th>
+        <th>Total Battles</th>
+        <th>Initial Rating</th>
+
     </tr>
     """
 
     for index, (model, rating) in enumerate(sorted_ratings, start=1):
+        total_battles = leaderboard[model]['wins'] + leaderboard[model]['losses']
         rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
-
-
-        points_scored = 0
-        points_lost = 0
-        if model in leaderboard:
-            for opponent, results in leaderboard[model]['opponents'].items():
-                opponent_rating = elo_ratings.get(opponent, 1000)
-                opponent_size = get_model_size(opponent)
-                max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
-
-                for _ in range(results['wins']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (opponent_size - model_size) / max_size)
-                    points_scored += k_factor * (1 - expected_score)
-
-                for _ in range(results['losses']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (model_size - opponent_size) / max_size)
-                    points_lost += k_factor * expected_score
+        positive_impact, negative_impact, initial_rating = calculate_elo_impact(model)
 
         leaderboard_html += f"""
         <tr>
             <td class='rank-column'>{rank_display}</td>
             <td>{get_human_readable_name(model)}</td>
-            <td>{round(rating)}</td>
-            <td>{
-            <td>{
+            <td><strong>{round(rating)}</strong></td>
+            <td>{positive_impact}</td>
+            <td>{negative_impact}</td>
+            <td>{total_battles}</td>
+            <td>{initial_rating}</td>
         </tr>
         """
@@ -307,3 +317,76 @@ def create_backup():
 def start_backup_thread():
     backup_thread = threading.Thread(target=create_backup, daemon=True)
     backup_thread.start()
+
+def get_leaderboard_chart():
+    battle_results = get_current_leaderboard()
+
+    # Calculate scores and sort results
+    for model, results in battle_results.items():
+        total_battles = results["wins"] + results["losses"]
+        if total_battles > 0:
+            win_rate = results["wins"] / total_battles
+            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
+        else:
+            results["score"] = 0
+
+    sorted_results = sorted(
+        battle_results.items(),
+        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
+        reverse=True
+    )
+
+    models = [get_human_readable_name(model) for model, _ in sorted_results]
+    wins = [results["wins"] for _, results in sorted_results]
+    losses = [results["losses"] for _, results in sorted_results]
+    scores = [results["score"] for _, results in sorted_results]
+
+    fig = go.Figure()
+
+    # Stacked Bar chart for Wins and Losses
+    fig.add_trace(go.Bar(
+        x=models,
+        y=wins,
+        name='Wins',
+        marker_color='#22577a'
+    ))
+    fig.add_trace(go.Bar(
+        x=models,
+        y=losses,
+        name='Losses',
+        marker_color='#38a3a5'
+    ))
+
+    # Line chart for Scores
+    fig.add_trace(go.Scatter(
+        x=models,
+        y=scores,
+        name='Score',
+        yaxis='y2',
+        line=dict(color='#ff7f0e', width=2)
+    ))
+
+    # Update layout for full-width, increased height, and secondary y-axis
+    fig.update_layout(
+        title='Model Performance',
+        xaxis_title='Models',
+        yaxis_title='Number of Battles',
+        yaxis2=dict(
+            title='Score',
+            overlaying='y',
+            side='right'
+        ),
+        barmode='stack',
+        height=800,
+        width=1450,
+        autosize=True,
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1
+        )
+    )
+
+    return fig
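The chart's score discounts thin records: win_rate is scaled by 1 - 1/(n + 1), i.e. n/(n + 1) for n battles, so a perfect 2-0 scores below a strong 10-1. A quick check on hypothetical records:

    # Score formula from get_leaderboard_chart, applied to made-up records.
    for wins, losses in [(2, 0), (10, 0), (8, 2), (80, 20)]:
        n = wins + losses
        print(wins, losses, round((wins / n) * (1 - 1 / (n + 1)), 3))
    # 2 0 0.667 | 10 0 0.909 | 8 2 0.727 | 80 20 0.792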
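To preview the figure outside the Space (a minimal sketch; assumes this module imports cleanly as leaderboard), plotly's standard export works:

    from leaderboard import get_leaderboard_chart

    fig = get_leaderboard_chart()
    fig.write_html("leaderboard_chart.html")  # or fig.show() in a notebook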