Spaces:
Sleeping
Sleeping
"""
Leaderboard module for the Dynamic Highscores system.

This module implements the unified leaderboard with tag-based filtering
for displaying all evaluated models.
"""
| import os | |
| import json | |
| import pandas as pd | |
| import gradio as gr | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    Fetches evaluation results through the database manager and turns them
    into a display table and Plotly figures.
    """

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance. Its ``get_leaderboard_df``
                method is called with ``tag`` and ``benchmark_id`` keyword
                arguments.
        """
        self.db_manager = db_manager
        # "All" is the no-filter sentinel; the remaining entries are real tags.
        self.model_tags = [
            "All", "Merge", "Agent", "Reasoning", "Coding",
            "General", "Specialized", "Instruction", "Chat",
        ]
        # Define color scheme for tags
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB",
        }

    @staticmethod
    def _empty_figure(xaxis_title, yaxis_title):
        """Build the "No data available" placeholder shared by all charts.

        Args:
            xaxis_title: Label for the x axis.
            yaxis_title: Label for the y axis.

        Returns:
            plotly.graph_objects.Figure: Figure without traces.
        """
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=xaxis_title,
            yaxis_title=yaxis_title,
        )
        return fig

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by. ``None``, an empty value or ``"All"``
                disables tag filtering.
            benchmark_id: Benchmark ID to filter by (None for all)

        Returns:
            pd.DataFrame: Leaderboard data, exactly as returned by
            ``db_manager.get_leaderboard_df``.
        """
        # Only forward the tag when it names a real filter; "All" is a UI
        # sentinel, not a stored tag.
        if tag and tag != "All":
            return self.db_manager.get_leaderboard_df(
                tag=tag, benchmark_id=benchmark_id
            )
        return self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id)

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Args:
            df: Leaderboard DataFrame with the columns ``model_name``,
                ``benchmark_name``, ``tag``, ``score`` and ``completed_at``.

        Returns:
            pd.DataFrame: Copy with the columns ``Model``, ``Benchmark``,
            ``Tag``, ``Score`` and ``Completed``, scores rounded to two
            decimals and rows sorted by score, highest first. An empty
            input yields an empty frame with those columns.
        """
        display_columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
        if df.empty:
            return pd.DataFrame(columns=display_columns)

        # Select and rename columns for display
        display_df = df[
            ['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']
        ].copy()
        display_df.columns = display_columns

        # Round score to 2 decimal places
        display_df['Score'] = display_df['Score'].round(2)

        # Sort by score (descending)
        return display_df.sort_values('Score', ascending=False)

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame with the columns ``model_name``,
                ``benchmark_name``, ``tag`` and ``score``.
            chart_type: ``"scatter"`` for a scatter plot; any other value
                produces a grouped bar chart.

        Returns:
            plotly.graph_objects.Figure: Performance chart, or a
            "No data available" placeholder when ``df`` is empty.
        """
        if df.empty:
            return self._empty_figure("Model", "Score")

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Arguments common to both chart types.
        shared_kwargs = dict(
            x="Model",
            y="Score",
            color="Tag",
            hover_data=["Model", "Benchmark", "Score"],
            color_discrete_map=self.tag_colors,
        )

        # Create chart based on type
        if chart_type == "scatter":
            fig = px.scatter(
                plot_df,
                symbol="Benchmark",
                size="Score",
                **shared_kwargs,
            )
        else:  # Default to bar chart
            fig = px.bar(plot_df, barmode="group", **shared_kwargs)

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12),
        )
        return fig

    def create_tag_distribution_chart(self, df):
        """Create a pie chart showing how leaderboard rows spread over tags.

        Every row of ``df`` counts once, so a model that appears in several
        rows contributes several times to its tag.

        Args:
            df: Leaderboard DataFrame with a ``tag`` column.

        Returns:
            plotly.graph_objects.Figure: Tag distribution chart, or a
            "No data available" placeholder when ``df`` is empty.
        """
        if df.empty:
            return self._empty_figure("Tag", "Count")

        # Count rows by tag
        tag_counts = df['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors,
        )

        # Customize layout
        fig.update_layout(font=dict(size=12))
        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a bar chart of the mean score per benchmark.

        Args:
            df: Leaderboard DataFrame with the columns ``benchmark_name``
                and ``score``.

        Returns:
            plotly.graph_objects.Figure: Benchmark comparison chart, or a
            "No data available" placeholder when ``df`` is empty.
        """
        if df.empty:
            return self._empty_figure("Benchmark", "Average Score")

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark',
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12),
        )
        return fig
# Leaderboard UI components
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds the filter controls, the table / chart / dashboard views and
    wires the handlers that refresh them.

    Args:
        leaderboard: Leaderboard instance
        db_manager: Database manager instance. ``get_benchmarks()`` is
            iterated and each item is indexed with ``"id"`` and ``"name"``.

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    # Gradio choice tuples are (displayed name, value): the first item is
    # shown in the dropdown, the second is what handlers receive. The
    # handlers below compare against the value "all".
    all_benchmarks_choice = ("All Benchmarks", "all")

    def view_visibility(view_name):
        """Return visibility updates for the table, chart and dashboard."""
        return (
            gr.update(visible=view_name == "Table"),
            gr.update(visible=view_name == "Chart"),
            gr.update(visible=view_name == "Dashboard"),
        )

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )
                benchmark_filter = gr.Dropdown(
                    choices=[all_benchmarks_choice],
                    value="all",
                    label="Filter by Benchmark"
                )
                refresh_button = gr.Button("Refresh Leaderboard")
            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )
                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")
            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")
                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Event handlers
        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            try:
                benchmarks = db_manager.get_benchmarks()
                # Format for dropdown: show the benchmark name, pass its id.
                choices = [all_benchmarks_choice]
                choices.extend((b["name"], str(b["id"])) for b in benchmarks)
                return gr.update(choices=choices)
            except Exception as e:
                # Best effort: fall back to the "all" entry so the UI stays usable.
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[all_benchmarks_choice])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Recompute the table and all charts for the current filters.

            Returns one value per entry of ``leaderboard_outputs``.
            """
            try:
                # Get leaderboard data
                if benchmark_id == "all":
                    benchmark_id = None
                df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

                # Format for display
                display_df = leaderboard.format_leaderboard_for_display(df)

                # Create charts
                perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                return (
                    display_df,
                    perf_chart,
                    perf_chart,  # Same chart for both views
                    tag_chart,
                    benchmark_chart,
                    # Update visibility based on view type
                    *view_visibility(view_type_val),
                )
            except Exception as e:
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed'])
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")
                return (
                    empty_df,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    # On failure always fall back to the (empty) table view.
                    *view_visibility("Table"),
                )

        # Outputs shared by every handler that recomputes the leaderboard.
        # leaderboard_table appears twice: once for its data, once for its
        # visibility update.
        leaderboard_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            leaderboard_table,
            chart_view,
            dashboard_view
        ]

        # Connect event handlers
        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=leaderboard_outputs
        )
        view_type.change(
            fn=view_visibility,
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )
        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=leaderboard_outputs
        )

    return leaderboard_ui