Spaces:

huggingface-KREW
/

Ko-AgentBench

Running on CPU Upgrade

App Files Files Community

Harheem Kim committed 12 days ago

Commit

a9d36f6

1 Parent(s): c4adc3b

update en version

Browse files

Files changed (1) hide show

tabs/leaderboard_v1_en.py +612 -465

tabs/leaderboard_v1_en.py CHANGED Viewed

@@ -224,36 +224,36 @@ def create_leaderboard_v2_tab():
     # Level metadata for the 7-stage task framework
     level_details = {
         "ALL": {
-            "title": "ALL · All Tasks",
-            "description": "Compare overall performance levels and stage-specific strengths of models through average SR across L1~L7 levels."
         },
         "L1": {
-            "title": "<span style='color: white;'>L1 · Single Tool Call</span>",
-            "description": "<span style='color: white;'>Evaluates single tool execution capability and basic command performance accuracy.</span>"
         },
         "L2": {
-            "title": "<span style='color: white;'>L2 · Tool Selection</span>",
-            "description": "<span style='color: white;'>Measures the ability to select appropriate tools and invoke them with proper parameters.</span>"
         },
         "L3": {
-            "title": "<span style='color: white;'>L3 · Sequential Tool Reasoning</span>",
-            "description": "<span style='color: white;'>Validates the process of solving problems through multi-step sequential reasoning.</span>"
         },
         "L4": {
-            "title": "<span style='color: white;'>L4 · Parallel Tool Reasoning</span>",
-            "description": "<span style='color: white;'>Evaluates the ability to integrate and summarize information from multiple sources in parallel.</span>"
         },
         "L5": {
-            "title": "<span style='color: white;'>L5 · Error Handling and Robustness</span>",
-            "description": "<span style='color: white;'>Confirms recognition and response strategies for unexpected errors or failure situations.</span>"
         },
         "L6": {
-            "title": "<span style='color: white;'>L6 · Efficient Tool Utilization</span>",
-            "description": "<span style='color: white;'>Examines operational efficiency in achieving goals with minimal calls and costs.</span>"
         },
         "L7": {
-            "title": "<span style='color: white;'>L7 · Long-Context Reasoning</span>",
-            "description": "<span style='color: white;'>Intensively analyzes the ability to maintain and appropriately utilize long-term conversation context.</span>"
         }
     }
     default_level = "ALL"
@@ -389,7 +389,7 @@ def create_leaderboard_v2_tab():
             if highlight_map.get(level):
                 header_classes.append("highlight-header")
             table_html += f"""
-                        <th class="{' '.join(header_classes)}" title="Average Success Rate {level}">
                             <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
                         </th>
             """
@@ -736,6 +736,8 @@ def create_leaderboard_v2_tab():
     # Header styles and navigation
     gr.HTML("""
     <style>
     /* Enhanced button styling with better gradio compatibility */
     .header-action-button {
         display: inline-block !important;
@@ -791,33 +793,41 @@ def create_leaderboard_v2_tab():
     }
     #hero-banner {
-        width: 100%;
-        margin: 0 0 20px 0;
-        border-radius: 0;
-        overflow: hidden;
-        box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25);
     }
     #hero-banner img {
         width: 100%;
         height: auto;
         display: block;
     }
     .hero-title {
-        font-size: 6rem;
         font-weight: 800;
         line-height: 1.1;
         background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
         -webkit-background-clip: text;
         -webkit-text-fill-color: transparent;
         margin-bottom: 1rem;
     }
     .hero-subtitle {
         color: var(--text-secondary);
-        font-size: 1.25rem;
-        font-family: 'Geist', sans-serif;
         margin-top: 0;
     }
@@ -876,6 +886,7 @@ def create_leaderboard_v2_tab():
         box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
         backdrop-filter: blur(12px);
         -webkit-backdrop-filter: blur(12px);
     }
     .dashboard-section.emphasized {
@@ -895,23 +906,25 @@ def create_leaderboard_v2_tab():
     }
     .section-title {
-        font-size: 2.2rem;
-        font-weight: 1000;
         color: var(--text-primary);
         margin-bottom: 12px;
         text-align: center !important;
     }
     .section-lead, .section-subtitle {
-        font-size: 1.1rem;
         color: var(--text-secondary);
-        max-width: 1500px;
         margin: 0 auto 24px auto;
         line-height: 1.7;
         text-align: center !important;
         word-break: keep-all;
         white-space: normal;
         display: block;
     }
     .phase-grid {
@@ -929,10 +942,11 @@ def create_leaderboard_v2_tab():
     }
     .phase-card h3 {
-        font-size: 1.5rem;
         color: var(--text-primary);
         margin-bottom: 20px;
         font-weight: 700;
     }
     .phase-chart {
@@ -960,20 +974,23 @@ def create_leaderboard_v2_tab():
     .phase-chart span {
         position: relative;
-        font-size: 1.5rem;
         font-weight: 700;
         color: white !important;
     }
-    /* 추가적인 구체적 선택자 */
     .phase-card .phase-chart span {
         color: #FFFFFF !important;
         text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
     }
     .phase-grid .phase-chart span {
         color: #FFFFFF !important;
         z-index: 10 !important;
     }
@@ -991,11 +1008,12 @@ def create_leaderboard_v2_tab():
         background: rgba(245, 246, 247, 0.05);
         border: 1px solid rgba(245, 246, 247, 0.08);
         color: var(--text-secondary);
-        font-size: 0.95rem;
     }
     .scenario-body {
-        max-width: 1200px;
         margin: 0 auto;
         text-align: center;
     }
@@ -1054,7 +1072,7 @@ def create_leaderboard_v2_tab():
     /* Responsive design */
     @media (max-width: 768px) {
         .hero-title {
-            font-size: 4rem;
         }
         .hero-action-button {
             width: 100% !important;
@@ -1078,7 +1096,7 @@ def create_leaderboard_v2_tab():
             gap: 8px;
         }
         .section-title {
-            font-size: 1.8rem;
         }
         .phase-chart {
             width: 100px;
@@ -1103,7 +1121,7 @@ def create_leaderboard_v2_tab():
     gr.HTML("""
     <div style="text-align: center; padding: 20px 0;">
         <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
-        <p class="hero-subtitle">Agent Benchmark Specialized for Korean Service Environment</p>
     </div>
     """)
@@ -1144,82 +1162,81 @@ def create_leaderboard_v2_tab():
     </div>
     """)
-    # Section 1: 단계별 태스크 설계
     gr.HTML("""
     <div class="dashboard-section">
         <div class="section-header">
-            <h2 class="section-title">7-Level Task Structure</h2>
         </div>
-        <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 1000px; line-height: 1.7; word-break: keep-all;">From simple tool calls to long-term context understanding and robustness handling,</p>
-            <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 1000px; line-height: 1.7; word-break: keep-all;">we analyzed agent capabilities in 3D across 7 levels.</p>
         <div class="phase-grid">
             <div class="phase-card">
-                <h3>Single-Turn</h3>
                 <div class="phase-chart" style="--progress:80%;">
                     <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
                 </div>
-                 <ul class="phase-list">
                     <li style="color: white;">L1: Single Tool Call</li>
                     <li style="color: white;">L2: Tool Selection</li>
                     <li style="color: white;">L3: Sequential Tool Reasoning</li>
                     <li style="color: white;">L4: Parallel Tool Reasoning</li>
-                    <li style="color: white;">L5: Error Handling and Robustness</li>
                 </ul>
             </div>
             <div class="phase-card">
-                <h3>Multi-Turn</h3>
                 <div class="phase-chart" style="--progress:20%;">
                     <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
                 </div>
                 <ul class="phase-list">
                     <li style="color: white;">L6: Efficient Tool Utilization</li>
-                    <li style="color: white;">L7: Long-Context Reasoning</li>
                 </ul>
             </div>
         </div>
     </div>
     """)
-    # Section 2: 핵심 시나리오 구성
     gr.HTML("""
     <div class="dashboard-section emphasized">
         <div class="section-header">
-            <h2 class="section-title">Real-life Scenario Design Using 18 APIs Optimized for Domestic Environment</h2>
         </div>
         <div class="scenario-body">
-            <p>Realistic, user-centered scenarios—such as “appointment booking” and “blog review search”—were designed</p>
-            <p>by integrating major domestic service APIs including Naver Maps and Kakao.</p>
         </div>
-        <div class="section-flow">⌄</div>
     </div>
     """)
-    # Section 3: 핵심 평가 기준
     gr.HTML("""
     <div class="dashboard-section">
         <div class="section-header">
-            <h2 class="section-title">Key Evaluation Metrics</h2>
         </div>
         <div class="criteria-grid">
             <div class="criteria-card">
                 <h3>Cache-based Iterative Evaluation</h3>
                 <ul>
-                    <li>Real API Response Caching</li>
-                    <li>Solves chronic issues of existing benchmarks such as 'external API instability and information attribute mismatch'</li>
                     <li>Ensures benchmark consistency and reliability</li>
                 </ul>
             </div>
             <div class="criteria-card">
-                <h3>Robustness Test</h3>
                 <ul>
-                    <li>Evaluates error recognition and response capability (strategy) for intentional error situations (product discontinuation)</li>
-                    <li>Selects models that operate stably in real-world environments</li>
                 </ul>
             </div>
             <div class="criteria-card">
-                <h3>Level-specific Evaluation Metrics</h3>
-                 <ul>
-                    <li>Evaluates problem-solving efficiency at each stage including tool selection, parameter configuration, and data processing flow</li>
                     <li>Quantitatively identifies model strengths and weaknesses</li>
                 </ul>
             </div>
@@ -1232,6 +1249,8 @@ def create_leaderboard_v2_tab():
     # Domain filter section with enhanced styling
     gr.HTML("""
     <style>
     /* Enhanced domain selector styling */
     .domain-selector-container {
         background: #ffd21e0d;
@@ -1278,18 +1297,10 @@ def create_leaderboard_v2_tab():
     .domain-performance-container .domain-subtitle {
         font-size: 1.05rem;
-        max-width: 1000px;
-        margin: 0 auto;
-    }
-    .domain-performance-container .domain-subtitle_ {
-        font-size: 1.07rem;
-        max-width: 1000px;
         margin: 0 auto;
-        color: #bdbdbd;
     }
     .leaderboard-intro .domain-title,
     .domain-performance-container > .domain-header .domain-title,
     .performance-card-container > .domain-header .domain-title {
@@ -1318,7 +1329,7 @@ def create_leaderboard_v2_tab():
     .performance-card-container .domain-subtitle {
         font-size: 1.05rem;
-        max-width: 1000px;
         margin: 0 auto;
     }
@@ -1342,10 +1353,11 @@ def create_leaderboard_v2_tab():
         -webkit-background-clip: text;
         background-clip: text;
         -webkit-text-fill-color: transparent;
-        text-shadow: 0 0 22px rgba(255, 210, 30, 0.65), 0 0 45px rgba(255, 210, 30, 0.4);
-        filter: drop-shadow(0 0 16px rgba(255, 210, 30, 0.35));
         letter-spacing: 0.02em;
-        animation: title-shimmer 5s ease-in-out infinite;
     }
     @keyframes title-shimmer {
@@ -1633,11 +1645,11 @@ def create_leaderboard_v2_tab():
     .model-dropdown select,
     .model-dropdown [role="combobox"] {
-        background: rgba(245, 246, 247, 0.06) !important;
-        border: 1px solid var(--border-subtle) !important;
         border-radius: 999px !important;
         padding: 12px 24px !important;
-        color: var(--text-primary) !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 1rem !important;
@@ -1665,10 +1677,10 @@ def create_leaderboard_v2_tab():
         gap: 8px !important;
         width: 100% !important;
         padding: 12px 24px !important;
-        background: rgba(245, 246, 247, 0.06) !important;
-        border: 1px solid var(--border-subtle) !important;
         border-radius: 999px !important;
-        color: var(--text-primary) !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 0.95rem !important;
@@ -1957,94 +1969,147 @@ def create_leaderboard_v2_tab():
         padding: 12px 20px !important;
         font-size: 0.95rem !important;
     }
     </style>
     """)
     level_options = list(level_details.keys())
-    with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
-        gr.HTML("""
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">🧠 Select Task Level</h2>
-            <p class="domain-subtitle" style="color: white;">Easily compare agent performance across ALL · L1~L7 stages of Ko-AgentBench.</p>
-        </div>
-        """)
-        domain_filter = gr.Radio(
-            choices=level_options,
-            value=default_level,
-            label="",
-            interactive=True,
-            container=False,
-            elem_classes=["domain-radio"]
-        )
-    # Filter controls with domain styling
-    with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
-        gr.HTML("""
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
-            <p class="domain-subtitle" style="color: white;">Select model type and sorting criteria to explore results in your preferred way.</p>
-        </div>
-        """)
-        with gr.Row(elem_classes=["filters-sorting-row"]):
-            with gr.Column(scale=1, elem_classes=["filter-group"]):
-                with gr.Row(elem_classes=["filter-group-row"]):
-                    gr.HTML("<span class='filter-group-label' style='color: white;'>Model Access</span>")
-                    model_type_filter = gr.Radio(
-                        choices=["All", "OSS", "API"],
-                        value="All",
-                        label="",
-                        elem_classes=["domain-radio"],
-                        container=False
-                    )
-            with gr.Column(scale=1, elem_classes=["filter-group"]):
-                with gr.Row(elem_classes=["filter-group-row"]):
-                    gr.HTML("<span class='filter-group-label' style='color: white;'>Sort Order</span>")
-                    sort_order = gr.Radio(
-                        choices=["Descending", "Ascending"],
-                        value="Descending",
-                        label="",
-                        elem_classes=["domain-radio"],
-                        container=False
-                    )
-    # Main leaderboard table with dynamic title
     leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
-    leaderboard_table = gr.HTML(initial_table)
-    gr.HTML("""
-        </div>
-    </div>""")
     # Radar Chart Section
     gr.HTML("""
     <div class="domain-selector-container domain-performance-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
-            <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
-            <p class="domain-subtitle_">#Execution Accuracy #Complex Reasoning #Robustness #Context & Efficiency #Overall Success #Validity</p>
-            <p class="domain-subtitle" style="color: white;">Analyze model performance capabilities and balance through 6 core competencies.</p>
         </div>
     """)
-    with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
-        gr.HTML("""
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
-            <p class="domain-subtitle" style="color: white;">Select models to compare in the radar chart.</p>
-        </div>
-        """)
-        model_selector = gr.Dropdown(
-            choices=initial_df['Model'].tolist()[:10],
-            value=initial_df['Model'].tolist()[:5],
-            multiselect=True,
-            label="",
-            info=None,
-            container=False,
-            # elem_classes=["model-dropdown"]
-        )
     # Radar chart plot - wrapped in centered container
     gr.HTML('<div class="chart-container radar-chart-container">')
@@ -2060,317 +2125,53 @@ def create_leaderboard_v2_tab():
     gr.HTML("</div>")
-    # Level metric breakdown section
-    gr.HTML("""
-    <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
-            <p class="domain-subtitle" style="color: white;">Compare model scores based on unique evaluation metrics for each L1–L7 level.</p>
-        </div>
-    """)
-    with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
-        gr.HTML("""
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
-            <p class="domain-subtitle" style="color: white;">Select L1–L7 levels and models to explore detailed SR-based metrics.</p>
-        </div>
-        """)
-        level_metric_selector = gr.Dropdown(
-            choices=level_ids,
-            value=level_ids[0] if level_ids else None,
-            multiselect=False,
-            label="",
-            info=None,
-            container=False,
-            elem_classes=["level-dropdown"]
-        )
-        level_model_selector = gr.Dropdown(
-            choices=initial_level_model_choices,
-            value=initial_level_model_values,
-            multiselect=True,
-            label="",
-            info=None,
-            container=False,
-            elem_classes=["model-dropdown", "level-model-dropdown"]
-        )
-    gr.HTML('<div class="chart-container level-metric-chart-container">')
-    level_metric_chart = gr.Plot(
-        label="",
-        value=initial_level_metric_chart,
-        elem_classes=["level-metric-plot", "plot-container"]
-    )
-    gr.HTML("""
-        </div>
-    </div>
-    """)
-    # Heatmap section
-    gr.HTML("""
-    <div class="domain-selector-container domain-performance-container heatmap-wrapper">
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
-            <p class="domain-subtitle" style="color: white;">Explore the comprehensive performance heatmap to see SR scores across L1–L7 levels for each model at a glance.</p>
-        </div>
-        <div class="chart-container heatmap-chart-container">
-    """)
-    heatmap_chart = gr.Plot(
-        label="",
-        value=initial_heatmap,
-        elem_classes=["heatmap-plot", "plot-container"]
-    )
-    gr.HTML("""
-        </div>
-    </div>
-    """)
-    # Update functions
-    def get_optimal_sort_order(sort_by_value):
-        """Return the optimal sort order for a given metric"""
-        # Metrics where higher is better (descending)
-        descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
-        # Metrics where lower is better (ascending)
-        ascending_metrics = []
-        if sort_by_value in descending_metrics:
-            return "Descending"
-        elif sort_by_value in ascending_metrics:
-            return "Ascending"
-        else:
-            return "Descending"  # Default fallback
-    def update_table(level_filter, model_type_filter, sort_order):
-        title_html = update_leaderboard_title(level_filter)
-        sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
-        table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
-        return title_html, table_html
-    def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
-        # Get filtered dataframe
         df = load_leaderboard_data()
-        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
-        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
-        # Update model selector choices based on filtered data
-        available_models_all = filtered_df['Model'].tolist()
-        available_models = available_models_all[:15]  # Top 15 from filtered results
-        # If selected models are not in available models, reset to top 5
-        if selected_models:
-            valid_selected = [m for m in selected_models if m in available_models]
-            if not valid_selected:
-                valid_selected = available_models[:5]
-        else:
-            valid_selected = available_models[:5]
-        # Create radar chart
-        chart = create_domain_radar_chart(filtered_df, valid_selected)
-        # Prepare heatmap order prioritizing selected models
-        heatmap_order = []
-        for model in valid_selected:
-            if model not in heatmap_order:
-                heatmap_order.append(model)
-        for model in available_models_all:
-            if model not in heatmap_order:
-                heatmap_order.append(model)
-        heatmap_order = heatmap_order[:12]
-        heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
-        # Level metric chart
-        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
-        available_level_models = available_models_all
-        if level_selected_models:
-            valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
-            if not valid_level_models:
-                valid_level_models = available_level_models[:5]
-        else:
-            valid_level_models = available_level_models[:5]
-        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
-        return (
-            gr.Dropdown(
-                choices=available_models,
-                value=valid_selected,
-                multiselect=True,
-                label="",
-                info=None,
-                container=False,
-                # elem_classes=["model-dropdown"]
-            ),
-            chart,
-            heatmap_fig,
-            gr.Dropdown(
-                choices=available_level_models,
-                value=valid_level_models,
-                multiselect=True,
-                label="",
-                info=None,
-                container=False,
-                elem_classes=["model-dropdown", "level-model-dropdown"]
-            ),
-            level_metric_fig,
-        )
-    def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
-        # Get filtered dataframe
-        df = load_leaderboard_data()
-        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
-        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
-        available_models_all = filtered_df['Model'].tolist()
-        if selected_models:
-            valid_selected = [m for m in selected_models if m in available_models_all]
-            if not valid_selected:
-                valid_selected = available_models_all[:5]
-        else:
-            valid_selected = available_models_all[:5]
-        heatmap_order = []
-        for model in valid_selected:
-            if model not in heatmap_order:
-                heatmap_order.append(model)
-        for model in available_models_all:
-            if model not in heatmap_order:
-                heatmap_order.append(model)
-        heatmap_order = heatmap_order[:12]
-        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
-        available_level_models = available_models_all
-        if level_selected_models:
-            valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
-            if not valid_level_models:
-                valid_level_models = available_level_models[:5]
-        else:
-            valid_level_models = available_level_models[:5]
-        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
-        return (
-            create_domain_radar_chart(filtered_df, valid_selected),
-            create_performance_heatmap(filtered_df, heatmap_order),
-            gr.Dropdown(
-                choices=available_level_models,
-                value=valid_level_models,
-                multiselect=True,
-                label="",
-                info=None,
-                container=False,
-                elem_classes=["model-dropdown", "level-model-dropdown"]
-            ),
-            level_metric_fig,
-        )
-    def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
-        df = load_leaderboard_data()
-        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
-        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
-        available_models = filtered_df['Model'].tolist()
-        if level_selected_models:
-            valid_level_models = [m for m in level_selected_models if m in available_models][:5]
-            if not valid_level_models:
-                valid_level_models = available_models[:5]
-        else:
-            valid_level_models = available_models[:5]
-        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
-        level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
-        return (
-            gr.Dropdown(
-                choices=available_models,
-                value=valid_level_models,
-                multiselect=True,
-                label="",
-                info=None,
-                container=False,
-                elem_classes=["model-dropdown", "level-model-dropdown"]
-            ),
-            level_chart,
-        )
-    # Update table when filters change
-    filter_inputs = [domain_filter, model_type_filter, sort_order]
-    for input_component in filter_inputs:
-        input_component.change(
-            fn=update_table,
-            inputs=filter_inputs,
-            outputs=[leaderboard_title, leaderboard_table]
-        )
-        # Also update radar chart when filters change
-        input_component.change(
-            fn=update_radar_chart,
-            inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
-            outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
-        )
-    # Update radar chart when model selection changes
-    model_selector.change(
-        fn=update_radar_only,
-        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
-        outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
-    )
-    level_metric_selector.change(
-        fn=update_level_metric_only,
-        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
-        outputs=[level_model_selector, level_metric_chart]
-    )
-    level_model_selector.change(
-        fn=update_level_metric_only,
-        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
-        outputs=[level_model_selector, level_metric_chart]
-    )
-    # Define generate_performance_card function before using it
-    def generate_performance_card(model_name):
-        """Generate HTML for the model performance card"""
-        if not model_name:
-            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
-                Please select a model to generate its performance card
-            </div>"""
-        # Get model data
-        df = load_leaderboard_data()
-        model_data = df[df['Model'] == model_name]
-        if model_data.empty:
-            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
-                Model not found in the database
-            </div>"""
-        row = model_data.iloc[0]
-        # Get overall rank based on overall success
-        df_with_success = df.copy()
-        df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
-        df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
-        df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
-        try:
-            rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
-        except:
-            rank = 'N/A'
-        # Format values
-        def format_value(val, decimals=3, prefix='', suffix=''):
-            if pd.isna(val) or val == '':
-                return 'N/A'
-            return f"{prefix}{float(val):.{decimals}f}{suffix}"
         def format_score(value):
             if pd.isna(value) or value == '':
                 return 'N/A'
             return f"{float(value):.3f}"
-        # Use the same order as the domain radar but keep '견고성' (Robustness) last
         radar_metrics = [
             ("Execution Accuracy", row.get('Execution Accuracy')),
             ("Context & Efficiency", row.get('Context & Efficiency')),
             ("Overall Success", row.get('Overall Success')),
-            ("Robustness", row.get('Robustness')),
-            ("Complex Reasoning", row.get('Complex Reasoning')),
             ("Validity", row.get('Call Validity')),
         ]
         radar_values = []
@@ -2426,7 +2227,7 @@ def create_leaderboard_v2_tab():
                     <div class="core-section">
                         <div class="core-metric-grid">
         """
-        ordered_labels = ["Execution Accuracy", "Context & Efficiency", "Overall Success", "Robustness", "Complex Reasoning", "Validity"]
         ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
         top_metrics = ordered_metrics[:3]
         bottom_metrics = ordered_metrics[3:]
@@ -2479,21 +2280,20 @@ def create_leaderboard_v2_tab():
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
             <p class="domain-subtitle" style="color: white;">
-                Check out the precision analysis card that visualizes the model's performance spectrum with <br> 6 key metrics and overall success rate (SR) by L1~L7 levels.
             </p>
             <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
-                 ※ Rank is calculated based on the average SR value across L1–L7 levels.
             </p>
         </div>
         <div class="performance-card-content">
     """)
     with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
         gr.HTML("""
-        <div class="domain-header">
-            <h2 class="domain-title" style="color: white;">🤖 Select Model</h2>
-            <p class="domain-subtitle" style="color: white;">Select models for the analysis card.</p>
-        </div>
         """)
         card_model_selector = gr.Dropdown(
             choices=initial_df['Model'].tolist(),
@@ -2523,6 +2323,279 @@ def create_leaderboard_v2_tab():
         </div>
     </div>
     """)
     # Add custom CSS for the performance card
     gr.HTML("""
@@ -2745,18 +2818,18 @@ def create_leaderboard_v2_tab():
     .level-dropdown select,
     .level-dropdown [role="combobox"],
     .level-dropdown button {
-        background: rgba(245, 246, 247, 0.06) !important;
-        border: 1px solid var(--border-subtle) !important;
         border-radius: 999px !important;
         padding: 12px 20px !important;
-        color: var(--text-primary) !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 0.95rem !important;
         text-align: center !important;
         min-height: 46px !important;
         transition: all 0.3s ease !important;
-        box-shadow: 0 10px 24px rgba(255, 210, 30, 0.15) !important;
     }
     .level-dropdown select:hover,
@@ -2773,6 +2846,14 @@ def create_leaderboard_v2_tab():
         margin: 12px auto 0 !important;
     }
     .radar-placeholder {
         display: flex;
         flex-direction: column;
@@ -2925,6 +3006,74 @@ def create_leaderboard_v2_tab():
         }
     }
     </style>
     """)
@@ -3052,15 +3201,13 @@ def create_leaderboard_v2_interface():
 def create_domain_radar_chart(df, selected_models=None, max_models=5):
     """Visualize six core capability metrics on a radar chart."""
     df = df.copy()
-    # Use the same metric order and Korean labels as the model performance card
-    # Match the model card order but place '견고성' (Robustness) last as requested
     metrics_info = [
         {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
-        {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
-        {"column": "Overall Success", "label": "Overall Success", "description": "L1~L7의 Average Success Rate"},
-        {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
         {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
-        {"column": "Call Validity", "label": "Validity", "description": "레벨별 EPR_CVR 평균"},
     ]
     required_columns = [m["column"] for m in metrics_info]
@@ -3217,7 +3364,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
         autosize=True,
         annotations=[
             dict(
-                text="Ko-Agent Leaderboard",
                 xref="paper", yref="paper",
                 x=0.98, y=0.02,
                 xanchor='right', yanchor='bottom',

     # Level metadata for the 7-stage task framework
     level_details = {
         "ALL": {
+            "title": "<span style='font-family: \"Gowun Dodum\", sans-serif !important;'>ALL · All Tasks</span>",
+            "description": "<span style='font-family: \"Nanum Gothic\", sans-serif !important;'>See average performance across all seven tasks and use it as a baseline for per-level comparison.</span>"
         },
         "L1": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L1 · Single Tool Call</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Evaluates single tool invocation capability and basic command execution accuracy.</span>"
         },
         "L2": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L2 · Tool Selection</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Measures the ability to choose the right tool and invoke it with appropriate parameters.</span>"
         },
         "L3": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L3 · Sequential Tool Reasoning</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Validates multi-step sequential reasoning for solving tasks.</span>"
         },
         "L4": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L4 · Parallel Tool Reasoning</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Evaluates the ability to integrate and summarize information from multiple sources in parallel.</span>"
         },
         "L5": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L5 · Error Handling & Robustness</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Checks awareness of unexpected failures and the strategies used to recover.</span>"
         },
         "L6": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L6 · Efficient Tool Utilization</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Examines operational efficiency in achieving goals with minimal calls and cost.</span>"
         },
         "L7": {
+            "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L7 · Long-Context Memory</span>",
+            "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Analyzes the ability to retain and leverage long conversational context.</span>"
         }
     }
     default_level = "ALL"
             if highlight_map.get(level):
                 header_classes.append("highlight-header")
             table_html += f"""
+                        <th class="{' '.join(header_classes)}" title="Average success rate for {level}">
                             <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
                         </th>
             """
     # Header styles and navigation
     gr.HTML("""
     <style>
+    @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
     /* Enhanced button styling with better gradio compatibility */
     .header-action-button {
         display: inline-block !important;
     }
     #hero-banner {
+        width: 100vw !important;
+        margin: 0 calc(-50vw + 50%) 20px calc(-50vw + 50%) !important;
+        border-radius: 0 !important;
+        overflow: hidden !important;
+        box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25) !important;
+        position: relative !important;
+        left: 50% !important;
+        right: 50% !important;
+        margin-left: -50vw !important;
+        margin-right: -50vw !important;
+        max-width: none !important;
     }
     #hero-banner img {
         width: 100%;
         height: auto;
         display: block;
+        object-fit: cover;
     }
     .hero-title {
+        font-size: 10rem;
         font-weight: 800;
         line-height: 1.1;
         background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
         -webkit-background-clip: text;
         -webkit-text-fill-color: transparent;
         margin-bottom: 1rem;
+        font-family: 'Do Hyeon', sans-serif !important;
     }
     .hero-subtitle {
         color: var(--text-secondary);
+        font-size: 3rem;
+        font-family: 'Do Hyeon', sans-serif !important;
         margin-top: 0;
     }
         box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
         backdrop-filter: blur(12px);
         -webkit-backdrop-filter: blur(12px);
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
     .dashboard-section.emphasized {
     }
     .section-title {
+        font-size: 3.75rem;
+        font-weight: 700;
         color: var(--text-primary);
         margin-bottom: 12px;
         text-align: center !important;
+        font-family: 'Gowun Dodum', sans-serif !important;
     }
     .section-lead, .section-subtitle {
+        font-size: 1.32rem !important;
         color: var(--text-secondary);
+        max-width: 720px;
         margin: 0 auto 24px auto;
         line-height: 1.7;
         text-align: center !important;
         word-break: keep-all;
         white-space: normal;
         display: block;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
     .phase-grid {
     }
     .phase-card h3 {
+        font-size: 1.44rem !important;
         color: var(--text-primary);
         margin-bottom: 20px;
         font-weight: 700;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
     .phase-chart {
     .phase-chart span {
         position: relative;
+        font-size: 1.2rem !important;
         font-weight: 700;
         color: white !important;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
+    /* Additional specific selectors */
     .phase-card .phase-chart span {
         color: #FFFFFF !important;
         text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
     .phase-grid .phase-chart span {
         color: #FFFFFF !important;
         z-index: 10 !important;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
         background: rgba(245, 246, 247, 0.05);
         border: 1px solid rgba(245, 246, 247, 0.08);
         color: var(--text-secondary);
+        font-size: 1.08rem !important;
+        font-family: 'Nanum Gothic', sans-serif !important;
     }
     .scenario-body {
+        max-width: 760px;
         margin: 0 auto;
         text-align: center;
     }
     /* Responsive design */
     @media (max-width: 768px) {
         .hero-title {
+            font-size: 10rem;
         }
         .hero-action-button {
             width: 100% !important;
             gap: 8px;
         }
         .section-title {
+            font-size: 2.7rem;
         }
         .phase-chart {
             width: 100px;
     gr.HTML("""
     <div style="text-align: center; padding: 20px 0;">
         <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
+        <p class="hero-subtitle">Agent benchmark optimized for real Korean usage.</p>
     </div>
     """)
     </div>
     """)
+    # Section 1: Task Design by Stage
     gr.HTML("""
     <div class="dashboard-section">
         <div class="section-header">
+            <h2 class="section-title" style="font-family: 'Gowun Dodum', sans-serif; font-size: 2.5rem;">7-Level Task Design</h2>
         </div>
+        <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">We analyzed agent capabilities across seven stages—from simple tool calls to long-context retention and robustness.</p>
         <div class="phase-grid">
             <div class="phase-card">
+                <h3>Single Turn</h3>
                 <div class="phase-chart" style="--progress:80%;">
                     <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
                 </div>
+                <ul class="phase-list">
                     <li style="color: white;">L1: Single Tool Call</li>
                     <li style="color: white;">L2: Tool Selection</li>
                     <li style="color: white;">L3: Sequential Tool Reasoning</li>
                     <li style="color: white;">L4: Parallel Tool Reasoning</li>
+                    <li style="color: white;">L5: Error Handling & Robustness</li>
                 </ul>
             </div>
             <div class="phase-card">
+                <h3>Multi Turn</h3>
                 <div class="phase-chart" style="--progress:20%;">
                     <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
                 </div>
                 <ul class="phase-list">
                     <li style="color: white;">L6: Efficient Tool Utilization</li>
+                    <li style="color: white;">L7: Long-Context Memory</li>
                 </ul>
             </div>
         </div>
     </div>
     """)
+    # Section 2: Core Scenario Design
     gr.HTML("""
     <div class="dashboard-section emphasized">
         <div class="section-header">
+            <h2 class="section-title" style="font-size: 2.0rem;">High-quality scenario design tailored to 18 Korea-specific APIs and real-world use cases.</h2>
         </div>
         <div class="scenario-body">
+            <p>We built realistic scenarios—such as appointment booking and blog review search—by integrating APIs widely used in Korea including Naver Maps, Kakao services, and local websites.</p>
         </div>
     </div>
+        <div class="section-flow">⌄</div>
     """)
+    # Section 3: Key Evaluation Criteria
     gr.HTML("""
     <div class="dashboard-section">
         <div class="section-header">
+            <h2 class="section-title" style="font-size: 2.0rem;">Key Evaluation Criteria</h2>
         </div>
         <div class="criteria-grid">
             <div class="criteria-card">
                 <h3>Cache-based Iterative Evaluation</h3>
                 <ul>
+                    <li>Improved handling of failed API responses</li>
+                    <li>Addresses chronic benchmark issues such as mismatched response attributes</li>
                     <li>Ensures benchmark consistency and reliability</li>
                 </ul>
             </div>
             <div class="criteria-card">
+                <h3>Robustness Testing</h3>
                 <ul>
+                    <li>Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)</li>
+                    <li>Surfaces models that remain stable in real-world deployments</li>
                 </ul>
             </div>
             <div class="criteria-card">
+                <h3>Level-specific Precision Metrics</h3>
+                <ul>
+                    <li>Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow</li>
                     <li>Quantitatively identifies model strengths and weaknesses</li>
                 </ul>
             </div>
     # Domain filter section with enhanced styling
     gr.HTML("""
     <style>
+    @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
     /* Enhanced domain selector styling */
     .domain-selector-container {
         background: #ffd21e0d;
     .domain-performance-container .domain-subtitle {
         font-size: 1.05rem;
+        max-width: 720px;
         margin: 0 auto;
     }
     .leaderboard-intro .domain-title,
     .domain-performance-container > .domain-header .domain-title,
     .performance-card-container > .domain-header .domain-title {
     .performance-card-container .domain-subtitle {
         font-size: 1.05rem;
+        max-width: 720px;
         margin: 0 auto;
     }
         -webkit-background-clip: text;
         background-clip: text;
         -webkit-text-fill-color: transparent;
+        text-shadow: 0 0 3px rgba(255, 210, 30, 0.08), 0 0 8px rgba(255, 210, 30, 0.05);
+        filter: drop-shadow(0 0 2px rgba(255, 210, 30, 0.06));
         letter-spacing: 0.02em;
+        animation: title-shimmer 1.25s ease-in-out infinite;
+        font-family: 'Gowun Dodum', sans-serif !important;
     }
     @keyframes title-shimmer {
     .model-dropdown select,
     .model-dropdown [role="combobox"] {
+        background: #000000 !important;
+        border: 1px solid #333333 !important;
         border-radius: 999px !important;
         padding: 12px 24px !important;
+        color: #ffffff !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 1rem !important;
         gap: 8px !important;
         width: 100% !important;
         padding: 12px 24px !important;
+        background: #000000 !important;
+        border: 1px solid #333333 !important;
         border-radius: 999px !important;
+        color: #ffffff !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 0.95rem !important;
         padding: 12px 20px !important;
         font-size: 0.95rem !important;
     }
+    /* Leaderboard controls row styling */
+    .leaderboard-controls-row {
+        margin: 20px 0 !important;
+        padding: 20px !important;
+        background: transparent !important;
+        border: none !important;
+        gap: 40px !important;
+    }
+    .leaderboard-controls-row .gr-column,
+    .leaderboard-controls-row .gr-row,
+    .leaderboard-controls-row .gr-box,
+    .leaderboard-controls-row .gradio-column,
+    .leaderboard-controls-row .gradio-row,
+    .leaderboard-controls-row .gradio-group {
+        background: transparent !important;
+        border: none !important;
+        box-shadow: none !important;
+        padding: 0 !important;
+    }
+    /* Remove all container backgrounds for leaderboard controls */
+    .leaderboard-controls-row * {
+        background-color: transparent !important;
+        background-image: none !important;
+        border: none !important;
+        box-shadow: none !important;
+    }
+    .leaderboard-controls-row .inline-radio,
+    .leaderboard-controls-row .domain-radio {
+        background: transparent !important;
+        border: none !important;
+        box-shadow: none !important;
+    }
+    /* Inline radio styling for integrated controls */
+    .inline-radio {
+        background: transparent !important;
+        border: none !important;
+        box-shadow: none !important;
+        padding: 0 !important;
+    }
+    .inline-radio .wrap {
+        display: flex !important;
+        gap: 8px !important;
+        flex-wrap: wrap !important;
+        justify-content: flex-start !important;
+        background: transparent !important;
+        border: none !important;
+        box-shadow: none !important;
+        padding: 0 !important;
+    }
+    .inline-radio label {
+        padding: 8px 16px !important;
+        background: rgba(245, 246, 247, 0.06) !important;
+        border: 1px solid var(--border-subtle) !important;
+        border-radius: 20px !important;
+        font-size: 0.85rem !important;
+        color: var(--text-primary) !important;
+        transition: all 0.2s ease !important;
+        cursor: pointer !important;
+    }
+    .inline-radio label:hover {
+        background: rgba(255, 210, 30, 0.12) !important;
+        border-color: var(--accent-primary) !important;
+    }
+    .inline-radio input[type="radio"]:checked + label,
+    .inline-radio label[aria-checked="true"] {
+        background: rgba(255, 210, 30, 0.2) !important;
+        border-color: var(--accent-primary) !important;
+        color: white !important;
+        font-weight: 600 !important;
+    }
     </style>
     """)
     level_options = list(level_details.keys())
+    # Main leaderboard table with dynamic title and integrated controls
     leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
+    # Integrated controls within leaderboard section - stacked vertically
+    gr.HTML("<p style='color: white; margin: 5px 0 5px 0; font-size: 1.2rem;'>Select Task Level</p>")
+    domain_filter = gr.Radio(
+        choices=level_options,
+        value=default_level,
+        label="",
+        interactive=True,
+        container=False,
+        elem_classes=["domain-radio", "inline-radio"]
+    )
+    gr.HTML("<p style='color: white; margin: 5px 0 0px 0; font-size: 1.2rem;'>🔍 Filters & Sorting</p>")
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML("<span style='color: white; font-size: 1.2rem; margin-bottom: 5px; display: block;'>Model Access</span>")
+            model_type_filter = gr.Radio(
+                choices=["All", "OSS", "API"],
+                value="All",
+                label="",
+                elem_classes=["domain-radio", "inline-radio"],
+                container=False
+            )
+        with gr.Column(scale=1):
+            gr.HTML("<span style='color: white; font-size: 1.2rem; margin-bottom: 5px; display: block;'>Sort Order</span>")
+            sort_order = gr.Radio(
+                choices=["Descending", "Ascending"],
+                value="Descending",
+                label="",
+                elem_classes=["domain-radio", "inline-radio"],
+                container=False
+            )
+    leaderboard_table = gr.HTML(initial_table)
     # Radar Chart Section
     gr.HTML("""
     <div class="domain-selector-container domain-performance-container">
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
+            <p class="domain-subtitle" style="color: white;">Track six essential axes: success, execution, reasoning, robustness, efficiency, and call validity.</p>
         </div>
     """)
+    gr.HTML("<p style='color: white; margin: 10px 0 0 0; font-size: 1.2rem; font-family: \"Nanum Gothic\", sans-serif;'>Select models to compare (up to 5).</p>")
+    # gr.HTML("<p style='color: #b0b0b0; margin: 0 0 10px 0; font-size: 0.9rem;'>You can select up to five models.</p>")
+    model_selector = gr.Dropdown(
+        choices=initial_df['Model'].tolist()[:10],
+        value=initial_df['Model'].tolist()[:5],
+        multiselect=True,
+        label="",
+        info=None,
+        container=False,
+    )
     # Radar chart plot - wrapped in centered container
     gr.HTML('<div class="chart-container radar-chart-container">')
     gr.HTML("</div>")
+    # Define generate_performance_card function before using it
+    def generate_performance_card(model_name):
+        """Generate HTML for the model performance card"""
+        if not model_name:
+            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
+                Please select a model to generate its performance card
+            </div>"""
+        # Get model data
         df = load_leaderboard_data()
+        model_data = df[df['Model'] == model_name]
+        if model_data.empty:
+            return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
+                Model not found in the database
+            </div>"""
+        row = model_data.iloc[0]
+        # Get overall rank based on overall success
+        df_with_success = df.copy()
+        df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
+        df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
+        df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
+        try:
+            rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
+        except:
+            rank = 'N/A'
+        # Format values
+        def format_value(val, decimals=3, prefix='', suffix=''):
+            """Format ``val`` with ``decimals`` fixed decimals, wrapped in ``prefix`` and ``suffix``.
+            Returns 'N/A' when ``val`` is missing per ``pd.isna`` or is an empty string.
+            """
+            if pd.isna(val) or val == '':
+                return 'N/A'
+            return f"{prefix}{float(val):.{decimals}f}{suffix}"
         def format_score(value):
             """Format a metric value as a three-decimal string.

             Returns 'N/A' when the value is missing (per ``pd.isna``), is an
             empty string, or cannot be converted to ``float``.
             """
             if pd.isna(value) or value == '':
                 return 'N/A'
             try:
                 return f"{float(value):.3f}"
             except (TypeError, ValueError):
                 # A non-numeric placeholder should render as 'N/A' rather
                 # than abort the whole performance card.
                 return 'N/A'
         radar_metrics = [
             ("Execution Accuracy", row.get('Execution Accuracy')),
+            ("Complex Reasoning", row.get('Complex Reasoning')),
+            ("Robustness", row.get('Robustness')),
             ("Context & Efficiency", row.get('Context & Efficiency')),
             ("Overall Success", row.get('Overall Success')),
             ("Validity", row.get('Call Validity')),
         ]
         radar_values = []
                     <div class="core-section">
                         <div class="core-metric-grid">
         """
+        ordered_labels = ["Execution Accuracy", "Complex Reasoning", "Robustness", "Context & Efficiency", "Overall Success", "Validity"]
         ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
         top_metrics = ordered_metrics[:3]
         bottom_metrics = ordered_metrics[3:]
         <div class="domain-header">
             <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
             <p class="domain-subtitle" style="color: white;">
+                Explore detailed performance cards that visualize six core metrics plus overall SR across L1–L7 levels.
             </p>
             <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
+                 ※ Ranks are determined by the average SR across L1–L7.
             </p>
         </div>
         <div class="performance-card-content">
     """)
     with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
         gr.HTML("""
+        <p class="domain-subtitle" style="color: white;">Choose a model to generate its analysis card.</p>
         """)
         card_model_selector = gr.Dropdown(
             choices=initial_df['Model'].tolist(),
         </div>
     </div>
     """)
+    # Level metric breakdown section
+    gr.HTML("""
+    <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
+        <div class="domain-header">
+            <h2 class="domain-title" style="color: white;">Level-specific Metrics</h2>
+            <p class="domain-subtitle" style="color: white;">Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.</p>
+        </div>
+    """)
+    gr.HTML("""
+    <p style="color: white; text-align: center; margin: 0 0 20px 0; font-size: 1.2rem; font-family: \'Nanum Gothic\', sans-serif;">Select a level and up to five models to explore detailed metrics.</p>
+    """)
+    with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
+        level_metric_selector = gr.Dropdown(
+            choices=level_ids,
+            value=level_ids[0] if level_ids else None,
+            multiselect=False,
+            label="",
+            info=None,
+            container=False,
+            elem_classes=["level-dropdown"]
+        )
+        level_model_selector = gr.Dropdown(
+            choices=initial_level_model_choices,
+            value=initial_level_model_values,
+            multiselect=True,
+            label="",
+            info=None,
+            container=False,
+            elem_classes=["model-dropdown", "level-model-dropdown"]
+        )
+    gr.HTML('<div class="chart-container level-metric-chart-container">')
+    level_metric_chart = gr.Plot(
+        label="",
+        value=initial_level_metric_chart,
+        elem_classes=["level-metric-plot", "plot-container"]
+    )
+    gr.HTML("""
+        </div>
+    </div>
+    """)
+    # # Heatmap section
+    # gr.HTML("""
+    # <div class="domain-selector-container domain-performance-container heatmap-wrapper">
+    #     <div class="domain-header">
+    #         <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
+    #         <p class="domain-subtitle" style="color: white;">See each model's L1–L7 SR scores at a glance.</p>
+    #     </div>
+    #     <div class="chart-container heatmap-chart-container">
+    # """)
+    # heatmap_chart = gr.Plot(
+    #     label="",
+    #     value=initial_heatmap,
+    #     elem_classes=["heatmap-plot", "plot-container"]
+    # )
+    # gr.HTML("""
+    #     </div>
+    # </div>
+    # """)
+    # Update functions
+    def get_optimal_sort_order(sort_by_value):
+        """Return the optimal sort order for a given metric"""
+        # Metrics where higher is better (descending)
+        descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
+        # Metrics where lower is better (ascending)
+        ascending_metrics = []
+        if sort_by_value in descending_metrics:
+            return "Descending"
+        elif sort_by_value in ascending_metrics:
+            return "Ascending"
+        else:
+            return "Descending"  # Default fallback
+    def update_table(level_filter, model_type_filter, sort_order):
+        title_html = update_leaderboard_title(level_filter)
+        sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
+        table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
+        return title_html, table_html
+    def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+        # Get filtered dataframe
+        df = load_leaderboard_data()
+        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+        # Update model selector choices based on filtered data
+        available_models_all = filtered_df['Model'].tolist()
+        available_models = available_models_all[:15]  # Top 15 from filtered results
+        # If selected models are not in available models, reset to top 5
+        if selected_models:
+            valid_selected = [m for m in selected_models if m in available_models]
+            # Check if more than 5 models are selected and show alert
+            if len(valid_selected) > 5:
+                gr.Warning("You can select up to 5 models.")
+                # Remove the last selected item (6th item) instead of keeping first 5
+                valid_selected = valid_selected[:-1]
+            if not valid_selected:
+                valid_selected = available_models[:5]
+        else:
+            valid_selected = available_models[:5]
+        # Create radar chart
+        chart = create_domain_radar_chart(filtered_df, valid_selected)
+        # Prepare heatmap order prioritizing selected models
+        # Level metric chart
+        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+        available_level_models = available_models_all
+        if level_selected_models:
+            valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+            if not valid_level_models:
+                valid_level_models = available_level_models[:5]
+        else:
+            valid_level_models = available_level_models[:5]
+        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+        return (
+            gr.Dropdown(
+                choices=available_models,
+                value=valid_selected,
+                multiselect=True,
+                label="",
+                info=None,
+                container=False,
+                # elem_classes=["model-dropdown"]
+            ),
+            chart,
+            gr.Dropdown(
+                choices=available_level_models,
+                value=valid_level_models,
+                multiselect=True,
+                label="",
+                info=None,
+                container=False,
+                elem_classes=["model-dropdown", "level-model-dropdown"]
+            ),
+            level_metric_fig,
+        )
+    def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+        # Get filtered dataframe
+        df = load_leaderboard_data()
+        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+        available_models_all = filtered_df['Model'].tolist()
+        if selected_models:
+            valid_selected = [m for m in selected_models if m in available_models_all]
+            # Check if more than 5 models are selected and show alert
+            if len(valid_selected) > 5:
+                # JavaScript alert for exceeding 5 models
+                gr.Warning("You can select up to 5 models.")
+                # Remove the last selected item (6th item) instead of keeping first 5
+                valid_selected = valid_selected[:-1]
+            if not valid_selected:
+                valid_selected = available_models_all[:5]
+        else:
+            valid_selected = available_models_all[:5]
+        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+        available_level_models = available_models_all
+        if level_selected_models:
+            valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
+            if not valid_level_models:
+                valid_level_models = available_level_models[:5]
+        else:
+            valid_level_models = available_level_models[:5]
+        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+        return (
+            gr.Dropdown(
+                choices=available_models_all[:15],
+                value=valid_selected,
+                multiselect=True,
+                label="",
+                info=None,
+                container=False,
+            ),
+            create_domain_radar_chart(filtered_df, valid_selected),
+            gr.Dropdown(
+                choices=available_level_models,
+                value=valid_level_models,
+                multiselect=True,
+                label="",
+                info=None,
+                container=False,
+                elem_classes=["model-dropdown", "level-model-dropdown"]
+            ),
+            level_metric_fig,
+        )
+    def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
+        df = load_leaderboard_data()
+        sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
+        filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
+        available_models = filtered_df['Model'].tolist()
+        if level_selected_models:
+            valid_level_models = [m for m in level_selected_models if m in available_models]
+            # Check if more than 5 models are selected and show alert
+            if len(valid_level_models) > 5:
+                gr.Warning("You can select up to 5 models.")
+                # Remove the last selected item (6th item) instead of keeping first 5
+                valid_level_models = valid_level_models[:-1]
+            if not valid_level_models:
+                valid_level_models = available_models[:5]
+        else:
+            valid_level_models = available_models[:5]
+        effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
+        level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
+        return (
+            gr.Dropdown(
+                choices=available_models,
+                value=valid_level_models,
+                multiselect=True,
+                label="",
+                info=None,
+                container=False,
+                elem_classes=["model-dropdown", "level-model-dropdown"]
+            ),
+            level_chart,
+        )
+    # Update table when filters change
+    filter_inputs = [domain_filter, model_type_filter, sort_order]
+    for input_component in filter_inputs:
+        input_component.change(
+            fn=update_table,
+            inputs=filter_inputs,
+            outputs=[leaderboard_title, leaderboard_table]
+        )
+        # Also update radar chart when filters change
+        input_component.change(
+            fn=update_radar_chart,
+            inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+            outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
+        )
+    # Update radar chart when model selection changes
+    model_selector.change(
+        fn=update_radar_only,
+        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+        outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
+    )
+    level_metric_selector.change(
+        fn=update_level_metric_only,
+        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+        outputs=[level_model_selector, level_metric_chart]
+    )
+    level_model_selector.change(
+        fn=update_level_metric_only,
+        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
+        outputs=[level_model_selector, level_metric_chart]
+    )
     # Add custom CSS for the performance card
     gr.HTML("""
     .level-dropdown select,
     .level-dropdown [role="combobox"],
     .level-dropdown button {
+        background: #000000 !important;
+        border: 1px solid #333333 !important;
         border-radius: 999px !important;
         padding: 12px 20px !important;
+        color: #ffffff !important;
         font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
         font-weight: 600 !important;
         font-size: 0.95rem !important;
         text-align: center !important;
         min-height: 46px !important;
         transition: all 0.3s ease !important;
+        box-shadow: 0 10px 24px rgba(0, 0, 0, 0.3) !important;
     }
     .level-dropdown select:hover,
         margin: 12px auto 0 !important;
     }
+    .level-model-dropdown select,
+    .level-model-dropdown [role="combobox"],
+    .level-model-dropdown button {
+        background: #000000 !important;
+        border: 1px solid #333333 !important;
+        color: #ffffff !important;
+    }
     .radar-placeholder {
         display: flex;
         flex-direction: column;
         }
     }
+    /* Force fonts - highest priority */
+    .dashboard-section,
+    .dashboard-section *,
+    .dashboard-section h2,
+    .dashboard-section h3,
+    .dashboard-section p,
+    .dashboard-section li,
+    .section-lead,
+    .section-subtitle,
+    .phase-card h3,
+    .phase-list li,
+    .scenario-body p,
+    .criteria-card h3,
+    .criteria-card ul,
+    .criteria-card li {
+        font-family: "Nanum Gothic", sans-serif !important;
+    }
+    /* Force section-title styling */
+    .section-title,
+    h2.section-title,
+    .dashboard-section .section-title,
+    .section-header .section-title {
+        font-family: "Gowun Dodum", sans-serif !important;
+    }
+    .domain-title,
+    h2.domain-title,
+    .domain-header .domain-title {
+        font-family: "Gowun Dodum", sans-serif !important;
+    }
+    /* Hero text: shared font and base size */
+    .hero-title,
+    .hero-subtitle,
+    h1.hero-title,
+    p.hero-subtitle {
+        font-family: "Do Hyeon", sans-serif !important;
+        font-size: 2rem !important;
+    }
+    /* Force hero-title sizing */
+    .hero-title,
+    h1.hero-title {
+        font-size: 4rem !important;
+    }
+    .phase-chart span,
+    .phase-card .phase-chart span,
+    .phase-grid .phase-chart span {
+        font-family: "Nanum Gothic", sans-serif !important;
+        font-size: 1.2rem !important;
+    }
+    .section-lead, .section-subtitle {
+        font-size: 1.32rem !important;
+        font-family: "Nanum Gothic", sans-serif !important;
+    }
+    .phase-card h3 {
+        font-size: 1.44rem !important;
+        font-family: "Nanum Gothic", sans-serif !important;
+    }
+    .phase-list li {
+        font-size: 1.08rem !important;
+        font-family: "Nanum Gothic", sans-serif !important;
+    }
     </style>
     """)
 def create_domain_radar_chart(df, selected_models=None, max_models=5):
     """Visualize six core capability metrics on a radar chart."""
     df = df.copy()
     metrics_info = [
+        {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
         {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
         {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
+        {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
+        {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
+        {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
     ]
     required_columns = [m["column"] for m in metrics_info]
         autosize=True,
         annotations=[
             dict(
+                text="Galileo Agent Leaderboard",
                 xref="paper", yref="paper",
                 x=0.98, y=0.02,
                 xanchor='right', yanchor='bottom',