Spaces:

AtlaAI
/

judge-arena

Running

App Files Files Community

kaikaidai commited on Nov 14, 2024

Commit

5267683

verified ·

1 Parent(s): ab62ff3

Create leaderboard.py

Browse files

Files changed (1) hide show

leaderboard.py +114 -0

leaderboard.py ADDED Viewed

	@@ -0,0 +1,114 @@

+from collections import defaultdict
+from datetime import datetime, timezone
+from typing import Dict, List
+# Constants
+DEFAULT_ELO = 1200  # Starting ELO for new models
+K_FACTOR = 32  # Standard chess K-factor
+def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
+    """Generate leaderboard data using votes from MongoDB."""
+    # Initialize dictionaries for tracking
+    ratings = defaultdict(lambda: DEFAULT_ELO)
+    matches = defaultdict(int)
+    # Process each vote
+    for vote in voting_data:
+        try:
+            model_a = vote.get("model_a")
+            model_b = vote.get("model_b")
+            winner = vote.get("winner")
+            # Skip if models aren't in current model_data
+            if (
+                not all([model_a, model_b, winner])
+                or model_a not in model_data
+                or model_b not in model_data
+            ):
+                continue
+            # Update match counts
+            matches[model_a] += 1
+            matches[model_b] += 1
+            # Calculate ELO changes
+            elo_a = ratings[model_a]
+            elo_b = ratings[model_b]
+            # Expected scores
+            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
+            expected_b = 1 - expected_a
+            # Actual scores
+            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
+            score_b = 1 - score_a
+            # Update ratings
+            ratings[model_a] += K_FACTOR * (score_a - expected_a)
+            ratings[model_b] += K_FACTOR * (score_b - expected_b)
+        except Exception as e:
+            print(f"Error processing vote: {e}")
+            continue
+    # Generate leaderboard data
+    leaderboard = []
+    for model in model_data.keys():
+        votes = matches[model]
+        # Skip models with < 500 votes if show_preliminary is False
+        if not show_preliminary and votes < 500:
+            continue
+        elo = ratings[model]
+        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
+        data = {
+            "Model": model,
+            "ELO Score": f"{int(elo)}",
+            "95% CI": f"±{int(ci)}",
+            "# Votes": votes,
+            "Organization": model_data[model]["organization"],
+            "License": model_data[model]["license"],
+        }
+        leaderboard.append(data)
+    # Sort leaderboard by ELO score in descending order
+    leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)
+    return leaderboard
+def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
+    """Get summary statistics for the leaderboard."""
+    now = datetime.now(timezone.utc)
+    total_votes = len(voting_data)
+    total_models = len(model_data)
+    last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
+        "%B %d, %Y at %H:00 UTC"
+    )
+    return f"""
+### Leaderboard Stats
+- **Total Models**: {total_models}
+- **Total Votes**: {total_votes}
+- **Last Updated**: {last_updated}
+"""
+def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
+    """Calculate ELO rating changes for both players."""
+    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
+    expected_b = 1 - expected_a
+    if winner == "A":
+        score_a, score_b = 1, 0
+    elif winner == "B":
+        score_a, score_b = 0, 1
+    else:  # Handle ties
+        score_a, score_b = 0.5, 0.5
+    change_a = K_FACTOR * (score_a - expected_a)
+    change_b = K_FACTOR * (score_b - expected_b)
+    return change_a, change_b
+def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
+    """Get current rankings of all models from leaderboard data."""
+    return {entry["Model"]: idx + 1 for idx, entry in enumerate(leaderboard)}