Spaces:
Running
Running
Create leaderboard.py
Browse files- leaderboard.py +114 -0
leaderboard.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict
|
| 2 |
+
from datetime import datetime, timezone
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
|
| 5 |
+
# Rating parameters shared by the Elo calculations in this module.
DEFAULT_ELO = 1200  # rating a model holds before its first counted vote
K_FACTOR = 32  # largest possible rating swing from a single vote
|
| 8 |
+
|
| 9 |
+
def get_leaderboard(
    model_data: Dict,
    voting_data: List,
    show_preliminary=True,
    *,
    k_factor: "float | None" = None,
    default_elo: "float | None" = None,
    min_votes: int = 500,
):
    """Build leaderboard rows by replaying every vote as an Elo match.

    Votes are applied in the order given. A vote is ignored when it lacks a
    truthy ``model_a``, ``model_b`` or ``winner`` field, or when either model
    is not a key of ``model_data``. A ``winner`` of ``"A"`` or ``"B"`` is a
    win for that side; any other truthy value is scored as a tie.

    Args:
        model_data: Mapping of model name to a metadata mapping that provides
            the ``"organization"`` and ``"license"`` keys.
        voting_data: Iterable of vote mappings with the keys ``"model_a"``,
            ``"model_b"`` and ``"winner"``.
        show_preliminary: When false, models with fewer than ``min_votes``
            counted votes are left out of the result.
        k_factor: Elo K-factor; defaults to the module-level ``K_FACTOR``.
        default_elo: Rating of a model before its first vote; defaults to the
            module-level ``DEFAULT_ELO``.
        min_votes: Vote threshold used when ``show_preliminary`` is false.

    Returns:
        A list of row dicts (``"Model"``, ``"ELO Score"``, ``"95% CI"``,
        ``"# Votes"``, ``"Organization"``, ``"License"``) ordered from the
        highest to the lowest rating.

    Raises:
        KeyError: If a model's metadata lacks ``"organization"`` or
            ``"license"``.
    """
    # Resolve the module-level defaults lazily so explicit overrides win.
    k = K_FACTOR if k_factor is None else k_factor
    start_elo = DEFAULT_ELO if default_elo is None else default_elo

    ratings = defaultdict(lambda: start_elo)
    matches = defaultdict(int)

    for vote in voting_data:
        try:
            model_a = vote.get("model_a")
            model_b = vote.get("model_b")
            winner = vote.get("winner")

            # Skip incomplete votes and votes about models that are not listed.
            if (
                not all([model_a, model_b, winner])
                or model_a not in model_data
                or model_b not in model_data
            ):
                continue

            elo_a = ratings[model_a]
            elo_b = ratings[model_b]

            # Expected scores from the logistic Elo curve.
            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
            expected_b = 1 - expected_a

            # Actual scores: win = 1, loss = 0, anything else = tie.
            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
            score_b = 1 - score_a

            delta_a = k * (score_a - expected_a)
            delta_b = k * (score_b - expected_b)
        except Exception as e:
            # Best effort: one malformed record must not abort the whole run.
            print(f"Error processing vote: {e}")
            continue

        # Commit only after the arithmetic succeeded, so a failing vote can
        # never leave the match counts and the ratings out of step.
        matches[model_a] += 1
        matches[model_b] += 1
        ratings[model_a] += delta_a
        ratings[model_b] += delta_b

    ranked = []
    for model in model_data:
        votes = matches[model]
        # Hide models below the vote threshold unless preliminary rows are wanted.
        if not show_preliminary and votes < min_votes:
            continue

        elo = ratings[model]
        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
        row = {
            "Model": model,
            "ELO Score": f"{int(elo)}",
            "95% CI": f"±{int(ci)}",
            "# Votes": votes,
            "Organization": model_data[model]["organization"],
            "License": model_data[model]["license"],
        }
        ranked.append((elo, row))

    # Order by the unrounded rating: the displayed score is truncated, so
    # sorting on it would misorder models whose ratings share an integer part.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    return [row for _, row in ranked]
|
| 78 |
+
|
| 79 |
+
def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
    """Render a short Markdown summary of the leaderboard.

    The summary reports ``len(model_data)``, ``len(voting_data)`` and the
    current UTC time truncated to the start of the hour.
    """
    # Only the hour is shown, so drop the finer-grained time components.
    current_hour = datetime.now(timezone.utc).replace(
        minute=0, second=0, microsecond=0
    )
    last_updated = current_hour.strftime("%B %d, %Y at %H:00 UTC")
    total_models = len(model_data)
    total_votes = len(voting_data)

    return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
|
| 94 |
+
|
| 95 |
+
def calculate_elo_change(
    rating_a: float,
    rating_b: float,
    winner: str,
    *,
    k_factor: "float | None" = None,
) -> tuple[float, float]:
    """Compute the Elo rating deltas for one match between A and B.

    Args:
        rating_a: Current rating of player A.
        rating_b: Current rating of player B.
        winner: ``"A"`` or ``"B"`` for a decisive result; any other value is
            scored as a tie.
        k_factor: Elo K-factor; defaults to the module-level ``K_FACTOR``.

    Returns:
        ``(change_a, change_b)``: the amounts to add to each player's rating.
    """
    # Resolve the module default lazily so an explicit override wins.
    k = K_FACTOR if k_factor is None else k_factor

    # Expected scores from the logistic Elo curve; they sum to 1.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a

    if winner == "A":
        score_a, score_b = 1, 0
    elif winner == "B":
        score_a, score_b = 0, 1
    else:  # Handle ties
        score_a, score_b = 0.5, 0.5

    change_a = k * (score_a - expected_a)
    change_b = k * (score_b - expected_b)

    return change_a, change_b
|
| 111 |
+
|
| 112 |
+
def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
    """Map each model name to its 1-based position in ``leaderboard``.

    Rows are read in list order; if a name appears more than once, the
    position of its last occurrence is kept.
    """
    return {
        row["Model"]: position
        for position, row in enumerate(leaderboard, start=1)
    }
|