Update leaderboard.py
leaderboard.py (+112 -29)
CHANGED
@@ -8,6 +8,7 @@ import threading
 import arena_config
 import sys
 import math
+import plotly.graph_objects as go
 
 # Initialize Nextcloud client
 nc = Nextcloud(nextcloud_url=arena_config.NEXTCLOUD_URL, nc_auth_user=arena_config.NEXTCLOUD_USERNAME, nc_auth_pass=arena_config.NEXTCLOUD_PASSWORD)
@@ -56,7 +57,7 @@ def update_elo_ratings(winner, loser):
     loser_size = get_model_size(loser)
     max_size = max(get_model_size(model) for model, _ in arena_config.APPROVED_MODELS)
 
-    k_factor = 32 * (1 + (loser_size - winner_size) / max_size)
+    k_factor = min(64, 32 * (1 + (loser_size - winner_size) / max_size))
 
     elo_ratings[winner] += k_factor * (1 - expected_winner)
     elo_ratings[loser] += k_factor * (0 - expected_loser)
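Note: the new `min(64, ...)` keeps the K-factor bounded at 64 even for the largest size gap. A quick standalone check of the formula (illustrative sizes only; real values come from arena_config.APPROVED_MODELS):

    # Sketch of the capped K-factor with made-up sizes (max_size = 70).
    def k_factor(winner_size, loser_size, max_size=70):
        return min(64, 32 * (1 + (loser_size - winner_size) / max_size))

    print(k_factor(7, 7))    # equal sizes  -> 32.0 (baseline)
    print(k_factor(7, 70))   # 7 beats 70   -> 60.8 (upset is rewarded)
    print(k_factor(70, 7))   # 70 beats 7   -> 3.2 (expected win, small update)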
@@ -205,6 +206,27 @@ def get_leaderboard():
     leaderboard_html += "</table>"
     return leaderboard_html
 
+def calculate_elo_impact(model):
+    positive_impact = 0
+    negative_impact = 0
+    leaderboard = load_leaderboard()
+    initial_rating = 1000 + (get_model_size(model) * 100)
+
+    for opponent, results in leaderboard[model]['opponents'].items():
+        model_size = get_model_size(model)
+        opponent_size = get_model_size(opponent)
+        max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
+
+        size_difference = (opponent_size - model_size) / max_size
+
+        win_impact = 1 + max(0, size_difference)
+        loss_impact = 1 + max(0, -size_difference)
+
+        positive_impact += results['wins'] * win_impact
+        negative_impact += results['losses'] * loss_impact
+
+    return round(positive_impact), round(negative_impact), round(initial_rating)
+
 def get_elo_leaderboard():
     ensure_elo_ratings_initialized()
     leaderboard = load_leaderboard()
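For intuition: each win is weighted 1 + max(0, size_difference), so only wins over larger opponents earn extra credit, and each loss is weighted 1 + max(0, -size_difference), so only losses to smaller opponents cost extra. A worked example with made-up sizes (a size-7 model facing a size-70 opponent, max size 70):

    size_difference = (70 - 7) / 70            # 0.9, opponent is much larger
    win_impact = 1 + max(0, size_difference)   # 1.9 per win over the larger model
    loss_impact = 1 + max(0, -size_difference) # 1.0 per loss, no extra penalty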
@@ -217,9 +239,9 @@ def get_elo_leaderboard():
     <p style="font-size: 16px; margin-bottom: 20px;">
     This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
     Initial ratings range from {round(min_initial_rating)} to {round(max_initial_rating)} points, based on model size, with larger models starting at higher ratings.
-
-    The "
-    The
+    The "Positive Impact" score reflects the significance of wins, with higher scores for defeating larger models.
+    The "Negative Impact" score indicates the significance of losses, with higher scores for losing against smaller models.
+    The current ELO rating is calculated based on these impacts and the model's performance history.
     </p>
     """
 
@@ -249,41 +271,29 @@ def get_elo_leaderboard():
     <tr>
         <th class='rank-column'>Rank</th>
         <th>Model</th>
-        <th>ELO Rating</th>
-        <th>
-        <th>
+        <th>Current ELO Rating</th>
+        <th>Positive Impact</th>
+        <th>Negative Impact</th>
+        <th>Total Battles</th>
+        <th>Initial Rating</th>
+
     </tr>
     """
 
     for index, (model, rating) in enumerate(sorted_ratings, start=1):
+        total_battles = leaderboard[model]['wins'] + leaderboard[model]['losses']
         rank_display = {1: "🥇", 2: "🥈", 3: "🥉"}.get(index, f"{index}")
-
-
-        points_scored = 0
-        points_lost = 0
-        if model in leaderboard:
-            for opponent, results in leaderboard[model]['opponents'].items():
-                opponent_rating = elo_ratings.get(opponent, 1000)
-                opponent_size = get_model_size(opponent)
-                max_size = max(get_model_size(m) for m, _ in arena_config.APPROVED_MODELS)
-
-                for _ in range(results['wins']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (opponent_size - model_size) / max_size)
-                    points_scored += k_factor * (1 - expected_score)
-
-                for _ in range(results['losses']):
-                    expected_score = calculate_expected_score(rating, opponent_rating)
-                    k_factor = 32 * (1 + (model_size - opponent_size) / max_size)
-                    points_lost += k_factor * expected_score
+        positive_impact, negative_impact, initial_rating = calculate_elo_impact(model)
 
         leaderboard_html += f"""
         <tr>
             <td class='rank-column'>{rank_display}</td>
             <td>{get_human_readable_name(model)}</td>
-            <td>{round(rating)}</td>
-            <td>{
-            <td>{
+            <td><strong>{round(rating)}</strong></td>
+            <td>{positive_impact}</td>
+            <td>{negative_impact}</td>
+            <td>{total_battles}</td>
+            <td>{initial_rating}</td>
         </tr>
         """
@@ -307,3 +317,76 @@ def create_backup():
 def start_backup_thread():
     backup_thread = threading.Thread(target=create_backup, daemon=True)
     backup_thread.start()
+
+def get_leaderboard_chart():
+    battle_results = get_current_leaderboard()
+
+    # Calculate scores and sort results
+    for model, results in battle_results.items():
+        total_battles = results["wins"] + results["losses"]
+        if total_battles > 0:
+            win_rate = results["wins"] / total_battles
+            results["score"] = win_rate * (1 - 1 / (total_battles + 1))
+        else:
+            results["score"] = 0
+
+    sorted_results = sorted(
+        battle_results.items(),
+        key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]),
+        reverse=True
+    )
+
+    models = [get_human_readable_name(model) for model, _ in sorted_results]
+    wins = [results["wins"] for _, results in sorted_results]
+    losses = [results["losses"] for _, results in sorted_results]
+    scores = [results["score"] for _, results in sorted_results]
+
+    fig = go.Figure()
+
+    # Stacked Bar chart for Wins and Losses
+    fig.add_trace(go.Bar(
+        x=models,
+        y=wins,
+        name='Wins',
+        marker_color='#22577a'
+    ))
+    fig.add_trace(go.Bar(
+        x=models,
+        y=losses,
+        name='Losses',
+        marker_color='#38a3a5'
+    ))
+
+    # Line chart for Scores
+    fig.add_trace(go.Scatter(
+        x=models,
+        y=scores,
+        name='Score',
+        yaxis='y2',
+        line=dict(color='#ff7f0e', width=2)
+    ))
+
+    # Update layout for full-width, increased height, and secondary y-axis
+    fig.update_layout(
+        title='Model Performance',
+        xaxis_title='Models',
+        yaxis_title='Number of Battles',
+        yaxis2=dict(
+            title='Score',
+            overlaying='y',
+            side='right'
+        ),
+        barmode='stack',
+        height=800,
+        width=1450,
+        autosize=True,
+        legend=dict(
+            orientation='h',
+            yanchor='bottom',
+            y=1.02,
+            xanchor='right',
+            x=1
+        )
+    )
+
+    return fig
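The chart's score discounts thin records: win_rate is scaled by 1 - 1/(n + 1), i.e. n/(n + 1) for n battles, so a perfect 2-0 scores below a strong 10-1. A quick check on hypothetical records:

    # Score formula from get_leaderboard_chart, applied to made-up records.
    for wins, losses in [(2, 0), (10, 0), (8, 2), (80, 20)]:
        n = wins + losses
        print(wins, losses, round((wins / n) * (1 - 1 / (n + 1)), 3))
    # 2 0 0.667 | 10 0 0.909 | 8 2 0.727 | 80 20 0.792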
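To preview the figure outside the Space (a minimal sketch; assumes this module imports cleanly as leaderboard), plotly's standard export works:

    from leaderboard import get_leaderboard_chart

    fig = get_leaderboard_chart()
    fig.write_html("leaderboard_chart.html")  # or fig.show() in a notebook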