Harheem Kim committed on
Commit
a9d36f6
·
1 Parent(s): c4adc3b

update en version

Files changed (1)
  1. tabs/leaderboard_v1_en.py +612 -465
tabs/leaderboard_v1_en.py CHANGED
@@ -224,36 +224,36 @@ def create_leaderboard_v2_tab():
224
  # Level metadata for the 7-stage task framework
225
  level_details = {
226
  "ALL": {
227
- "title": "ALL · All Tasks",
228
- "description": "Compare overall performance levels and stage-specific strengths of models through average SR across L1~L7 levels."
229
  },
230
  "L1": {
231
- "title": "<span style='color: white;'>L1 · Single Tool Call</span>",
232
- "description": "<span style='color: white;'>Evaluates single tool execution capability and basic command performance accuracy.</span>"
233
  },
234
  "L2": {
235
- "title": "<span style='color: white;'>L2 · Tool Selection</span>",
236
- "description": "<span style='color: white;'>Measures the ability to select appropriate tools and invoke them with proper parameters.</span>"
237
  },
238
  "L3": {
239
- "title": "<span style='color: white;'>L3 · Sequential Tool Reasoning</span>",
240
- "description": "<span style='color: white;'>Validates the process of solving problems through multi-step sequential reasoning.</span>"
241
  },
242
  "L4": {
243
- "title": "<span style='color: white;'>L4 · Parallel Tool Reasoning</span>",
244
- "description": "<span style='color: white;'>Evaluates the ability to integrate and summarize information from multiple sources in parallel.</span>"
245
  },
246
  "L5": {
247
- "title": "<span style='color: white;'>L5 · Error Handling and Robustness</span>",
248
- "description": "<span style='color: white;'>Confirms recognition and response strategies for unexpected errors or failure situations.</span>"
249
  },
250
  "L6": {
251
- "title": "<span style='color: white;'>L6 · Efficient Tool Utilization</span>",
252
- "description": "<span style='color: white;'>Examines operational efficiency in achieving goals with minimal calls and costs.</span>"
253
  },
254
  "L7": {
255
- "title": "<span style='color: white;'>L7 · Long-Context Reasoning</span>",
256
- "description": "<span style='color: white;'>Intensively analyzes the ability to maintain and appropriately utilize long-term conversation context.</span>"
257
  }
258
  }
259
  default_level = "ALL"
@@ -389,7 +389,7 @@ def create_leaderboard_v2_tab():
389
  if highlight_map.get(level):
390
  header_classes.append("highlight-header")
391
  table_html += f"""
392
- <th class="{' '.join(header_classes)}" title="Average Success Rate {level}">
393
  <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
394
  </th>
395
  """
@@ -736,6 +736,8 @@ def create_leaderboard_v2_tab():
736
  # Header styles and navigation
737
  gr.HTML("""
738
  <style>
739
  /* Enhanced button styling with better gradio compatibility */
740
  .header-action-button {
741
  display: inline-block !important;
@@ -791,33 +793,41 @@ def create_leaderboard_v2_tab():
791
  }
792
 
793
  #hero-banner {
794
- width: 100%;
795
- margin: 0 0 20px 0;
796
- border-radius: 0;
797
- overflow: hidden;
798
- box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25);
799
  }
800
 
801
  #hero-banner img {
802
  width: 100%;
803
  height: auto;
804
  display: block;
 
805
  }
806
 
807
  .hero-title {
808
- font-size: 6rem;
809
  font-weight: 800;
810
  line-height: 1.1;
811
  background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
812
  -webkit-background-clip: text;
813
  -webkit-text-fill-color: transparent;
814
  margin-bottom: 1rem;
 
815
  }
816
 
817
  .hero-subtitle {
818
  color: var(--text-secondary);
819
- font-size: 1.25rem;
820
- font-family: 'Geist', sans-serif;
821
  margin-top: 0;
822
  }
823
 
@@ -876,6 +886,7 @@ def create_leaderboard_v2_tab():
876
  box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
877
  backdrop-filter: blur(12px);
878
  -webkit-backdrop-filter: blur(12px);
 
879
  }
880
 
881
  .dashboard-section.emphasized {
@@ -895,23 +906,25 @@ def create_leaderboard_v2_tab():
895
  }
896
 
897
  .section-title {
898
- font-size: 2.2rem;
899
- font-weight: 1000;
900
  color: var(--text-primary);
901
  margin-bottom: 12px;
902
  text-align: center !important;
 
903
  }
904
 
905
  .section-lead, .section-subtitle {
906
- font-size: 1.1rem;
907
  color: var(--text-secondary);
908
- max-width: 1500px;
909
  margin: 0 auto 24px auto;
910
  line-height: 1.7;
911
  text-align: center !important;
912
  word-break: keep-all;
913
  white-space: normal;
914
  display: block;
 
915
  }
916
 
917
  .phase-grid {
@@ -929,10 +942,11 @@ def create_leaderboard_v2_tab():
929
  }
930
 
931
  .phase-card h3 {
932
- font-size: 1.5rem;
933
  color: var(--text-primary);
934
  margin-bottom: 20px;
935
  font-weight: 700;
 
936
  }
937
 
938
  .phase-chart {
@@ -960,20 +974,23 @@ def create_leaderboard_v2_tab():
960
 
961
  .phase-chart span {
962
  position: relative;
963
- font-size: 1.5rem;
964
  font-weight: 700;
965
  color: white !important;
 
966
  }
967
 
968
- /* Additional specific selectors */
969
  .phase-card .phase-chart span {
970
  color: #FFFFFF !important;
971
  text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
 
972
  }
973
 
974
  .phase-grid .phase-chart span {
975
  color: #FFFFFF !important;
976
  z-index: 10 !important;
 
977
  }
978
 
979
 
@@ -991,11 +1008,12 @@ def create_leaderboard_v2_tab():
991
  background: rgba(245, 246, 247, 0.05);
992
  border: 1px solid rgba(245, 246, 247, 0.08);
993
  color: var(--text-secondary);
994
- font-size: 0.95rem;
 
995
  }
996
 
997
  .scenario-body {
998
- max-width: 1200px;
999
  margin: 0 auto;
1000
  text-align: center;
1001
  }
@@ -1054,7 +1072,7 @@ def create_leaderboard_v2_tab():
1054
  /* Responsive design */
1055
  @media (max-width: 768px) {
1056
  .hero-title {
1057
- font-size: 4rem;
1058
  }
1059
  .hero-action-button {
1060
  width: 100% !important;
@@ -1078,7 +1096,7 @@ def create_leaderboard_v2_tab():
1078
  gap: 8px;
1079
  }
1080
  .section-title {
1081
- font-size: 1.8rem;
1082
  }
1083
  .phase-chart {
1084
  width: 100px;
@@ -1103,7 +1121,7 @@ def create_leaderboard_v2_tab():
1103
  gr.HTML("""
1104
  <div style="text-align: center; padding: 20px 0;">
1105
  <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
1106
- <p class="hero-subtitle">Agent Benchmark Specialized for Korean Service Environment</p>
1107
  </div>
1108
  """)
1109
 
@@ -1144,82 +1162,81 @@ def create_leaderboard_v2_tab():
1144
  </div>
1145
  """)
1146
 
1147
- # Section 1: Stage-by-stage task design
1148
  gr.HTML("""
1149
  <div class="dashboard-section">
1150
  <div class="section-header">
1151
- <h2 class="section-title">7-Level Task Structure</h2>
1152
  </div>
1153
- <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 1000px; line-height: 1.7; word-break: keep-all;">From simple tool calls to long-term context understanding and robustness handling,</p>
1154
- <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 1000px; line-height: 1.7; word-break: keep-all;">we analyzed agent capabilities in 3D across 7 levels.</p>
1155
  <div class="phase-grid">
1156
  <div class="phase-card">
1157
- <h3>Single-Turn</h3>
1158
  <div class="phase-chart" style="--progress:80%;">
1159
  <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
1160
  </div>
1161
- <ul class="phase-list">
1162
  <li style="color: white;">L1: Single Tool Call</li>
1163
  <li style="color: white;">L2: Tool Selection</li>
1164
  <li style="color: white;">L3: Sequential Tool Reasoning</li>
1165
  <li style="color: white;">L4: Parallel Tool Reasoning</li>
1166
- <li style="color: white;">L5: Error Handling and Robustness</li>
1167
  </ul>
1168
  </div>
1169
  <div class="phase-card">
1170
- <h3>Multi-Turn</h3>
1171
  <div class="phase-chart" style="--progress:20%;">
1172
  <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
1173
  </div>
1174
  <ul class="phase-list">
1175
  <li style="color: white;">L6: Efficient Tool Utilization</li>
1176
- <li style="color: white;">L7: Long-Context Reasoning</li>
1177
  </ul>
1178
  </div>
1179
  </div>
1180
  </div>
1181
  """)
1182
 
1183
- # Section 2: Core scenario composition
1184
  gr.HTML("""
1185
  <div class="dashboard-section emphasized">
1186
  <div class="section-header">
1187
- <h2 class="section-title">Real-life Scenario Design Using 18 APIs Optimized for Domestic Environment</h2>
1188
  </div>
1189
  <div class="scenario-body">
1190
- <p>Realistic, user-centered scenarios—such as appointment booking and blog review search”—were designed</p>
1191
- <p>by integrating major domestic service APIs including Naver Maps and Kakao.</p>
1192
  </div>
1193
- <div class="section-flow">⌄</div>
1194
  </div>
 
1195
  """)
1196
 
1197
- # Section 3: 핵심 평가 기준
1198
  gr.HTML("""
1199
  <div class="dashboard-section">
1200
  <div class="section-header">
1201
- <h2 class="section-title">Key Evaluation Metrics</h2>
1202
  </div>
1203
  <div class="criteria-grid">
1204
  <div class="criteria-card">
1205
  <h3>Cache-based Iterative Evaluation</h3>
1206
  <ul>
1207
- <li>Real API Response Caching</li>
1208
- <li>Solves chronic issues of existing benchmarks such as 'external API instability and information attribute mismatch'</li>
1209
  <li>Ensures benchmark consistency and reliability</li>
1210
  </ul>
1211
  </div>
1212
  <div class="criteria-card">
1213
- <h3>Robustness Test</h3>
1214
  <ul>
1215
- <li>Evaluates error recognition and response capability (strategy) for intentional error situations (product discontinuation)</li>
1216
- <li>Selects models that operate stably in real-world environments</li>
1217
  </ul>
1218
  </div>
1219
  <div class="criteria-card">
1220
- <h3>Level-specific Evaluation Metrics</h3>
1221
- <ul>
1222
- <li>Evaluates problem-solving efficiency at each stage including tool selection, parameter configuration, and data processing flow</li>
1223
  <li>Quantitatively identifies model strengths and weaknesses</li>
1224
  </ul>
1225
  </div>
@@ -1232,6 +1249,8 @@ def create_leaderboard_v2_tab():
1232
  # Domain filter section with enhanced styling
1233
  gr.HTML("""
1234
  <style>
1235
  /* Enhanced domain selector styling */
1236
  .domain-selector-container {
1237
  background: #ffd21e0d;
@@ -1278,18 +1297,10 @@ def create_leaderboard_v2_tab():
1278
 
1279
  .domain-performance-container .domain-subtitle {
1280
  font-size: 1.05rem;
1281
- max-width: 1000px;
1282
- margin: 0 auto;
1283
- }
1284
-
1285
- .domain-performance-container .domain-subtitle_ {
1286
- font-size: 1.07rem;
1287
- max-width: 1000px;
1288
  margin: 0 auto;
1289
- color: #bdbdbd;
1290
  }
1291
 
1292
-
1293
  .leaderboard-intro .domain-title,
1294
  .domain-performance-container > .domain-header .domain-title,
1295
  .performance-card-container > .domain-header .domain-title {
@@ -1318,7 +1329,7 @@ def create_leaderboard_v2_tab():
1318
 
1319
  .performance-card-container .domain-subtitle {
1320
  font-size: 1.05rem;
1321
- max-width: 1000px;
1322
  margin: 0 auto;
1323
  }
1324
 
@@ -1342,10 +1353,11 @@ def create_leaderboard_v2_tab():
1342
  -webkit-background-clip: text;
1343
  background-clip: text;
1344
  -webkit-text-fill-color: transparent;
1345
- text-shadow: 0 0 22px rgba(255, 210, 30, 0.65), 0 0 45px rgba(255, 210, 30, 0.4);
1346
- filter: drop-shadow(0 0 16px rgba(255, 210, 30, 0.35));
1347
  letter-spacing: 0.02em;
1348
- animation: title-shimmer 5s ease-in-out infinite;
 
1349
  }
1350
 
1351
  @keyframes title-shimmer {
@@ -1633,11 +1645,11 @@ def create_leaderboard_v2_tab():
1633
 
1634
  .model-dropdown select,
1635
  .model-dropdown [role="combobox"] {
1636
- background: rgba(245, 246, 247, 0.06) !important;
1637
- border: 1px solid var(--border-subtle) !important;
1638
  border-radius: 999px !important;
1639
  padding: 12px 24px !important;
1640
- color: var(--text-primary) !important;
1641
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
1642
  font-weight: 600 !important;
1643
  font-size: 1rem !important;
@@ -1665,10 +1677,10 @@ def create_leaderboard_v2_tab():
1665
  gap: 8px !important;
1666
  width: 100% !important;
1667
  padding: 12px 24px !important;
1668
- background: rgba(245, 246, 247, 0.06) !important;
1669
- border: 1px solid var(--border-subtle) !important;
1670
  border-radius: 999px !important;
1671
- color: var(--text-primary) !important;
1672
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
1673
  font-weight: 600 !important;
1674
  font-size: 0.95rem !important;
@@ -1957,94 +1969,147 @@ def create_leaderboard_v2_tab():
1957
  padding: 12px 20px !important;
1958
  font-size: 0.95rem !important;
1959
  }
 
1960
  </style>
1961
 
1962
  """)
1963
 
1964
  level_options = list(level_details.keys())
1965
 
1966
- with gr.Column(elem_classes=["domain-selector-container"], elem_id="task-level-selector"):
1967
- gr.HTML("""
1968
- <div class="domain-header">
1969
- <h2 class="domain-title" style="color: white;">🧠 Select Task Level</h2>
1970
- <p class="domain-subtitle" style="color: white;">Easily compare agent performance across ALL · L1~L7 stages of Ko-AgentBench.</p>
1971
- </div>
1972
- """)
1973
- domain_filter = gr.Radio(
1974
- choices=level_options,
1975
- value=default_level,
1976
- label="",
1977
- interactive=True,
1978
- container=False,
1979
- elem_classes=["domain-radio"]
1980
- )
1981
-
1982
- # Filter controls with domain styling
1983
- with gr.Column(elem_classes=["domain-selector-container", "filters-sorting-container"], elem_id="filters-sorting-container"):
1984
- gr.HTML("""
1985
- <div class="domain-header">
1986
- <h2 class="domain-title" style="color: white;">🔍 Filters & Sorting</h2>
1987
- <p class="domain-subtitle" style="color: white;">Select model type and sorting criteria to explore results in your preferred way.</p>
1988
- </div>
1989
- """)
1990
- with gr.Row(elem_classes=["filters-sorting-row"]):
1991
- with gr.Column(scale=1, elem_classes=["filter-group"]):
1992
- with gr.Row(elem_classes=["filter-group-row"]):
1993
- gr.HTML("<span class='filter-group-label' style='color: white;'>Model Access</span>")
1994
- model_type_filter = gr.Radio(
1995
- choices=["All", "OSS", "API"],
1996
- value="All",
1997
- label="",
1998
- elem_classes=["domain-radio"],
1999
- container=False
2000
- )
2001
- with gr.Column(scale=1, elem_classes=["filter-group"]):
2002
- with gr.Row(elem_classes=["filter-group-row"]):
2003
- gr.HTML("<span class='filter-group-label' style='color: white;'>Sort Order</span>")
2004
- sort_order = gr.Radio(
2005
- choices=["Descending", "Ascending"],
2006
- value="Descending",
2007
- label="",
2008
- elem_classes=["domain-radio"],
2009
- container=False
2010
- )
2011
-
2012
- # Main leaderboard table with dynamic title
2013
  leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
2014
 
2015
- leaderboard_table = gr.HTML(initial_table)
2016
 
2017
- gr.HTML("""
2018
- </div>
2019
- </div>""")
2020
 
2021
  # Radar Chart Section
2022
  gr.HTML("""
2023
  <div class="domain-selector-container domain-performance-container">
2024
  <div class="domain-header">
2025
  <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
2026
- <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
2027
- <p class="domain-subtitle_">#Execution Accuracy #Complex Reasoning #Robustness #Context & Efficiency #Overall Success #Validity</p>
2028
- <p class="domain-subtitle" style="color: white;">Analyze model performance capabilities and balance through 6 core competencies.</p>
2029
  </div>
2030
  """)
2031
-
2032
- with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="radar-model-selector"):
2033
- gr.HTML("""
2034
- <div class="domain-header">
2035
- <h2 class="domain-title" style="color: white;">🎯 Select Models for Comparison</h2>
2036
- <p class="domain-subtitle" style="color: white;">Select models to compare in the radar chart.</p>
2037
- </div>
2038
- """)
2039
- model_selector = gr.Dropdown(
2040
- choices=initial_df['Model'].tolist()[:10],
2041
- value=initial_df['Model'].tolist()[:5],
2042
- multiselect=True,
2043
- label="",
2044
- info=None,
2045
- container=False,
2046
- # elem_classes=["model-dropdown"]
2047
- )
2048
 
2049
  # Radar chart plot - wrapped in centered container
2050
  gr.HTML('<div class="chart-container radar-chart-container">')
@@ -2060,317 +2125,53 @@ def create_leaderboard_v2_tab():
2060
 
2061
  gr.HTML("</div>")
2062
 
2063
- # Level metric breakdown section
2064
- gr.HTML("""
2065
- <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
2066
- <div class="domain-header">
2067
- <h2 class="domain-title" style="color: white;">Level-Specific Metric Spotlight</h2>
2068
- <p class="domain-subtitle" style="color: white;">Compare model scores based on unique evaluation metrics for each L1–L7 level.</p>
2069
- </div>
2070
- """)
2071
-
2072
- with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
2073
- gr.HTML("""
2074
- <div class="domain-header">
2075
- <h2 class="domain-title" style="color: white;">🧭 Select Task Level and Models</h2>
2076
- <p class="domain-subtitle" style="color: white;">Select L1–L7 levels and models to explore detailed SR-based metrics.</p>
2077
- </div>
2078
- """)
2079
- level_metric_selector = gr.Dropdown(
2080
- choices=level_ids,
2081
- value=level_ids[0] if level_ids else None,
2082
- multiselect=False,
2083
- label="",
2084
- info=None,
2085
- container=False,
2086
- elem_classes=["level-dropdown"]
2087
- )
2088
- level_model_selector = gr.Dropdown(
2089
- choices=initial_level_model_choices,
2090
- value=initial_level_model_values,
2091
- multiselect=True,
2092
- label="",
2093
- info=None,
2094
- container=False,
2095
- elem_classes=["model-dropdown", "level-model-dropdown"]
2096
- )
2097
-
2098
- gr.HTML('<div class="chart-container level-metric-chart-container">')
2099
- level_metric_chart = gr.Plot(
2100
- label="",
2101
- value=initial_level_metric_chart,
2102
- elem_classes=["level-metric-plot", "plot-container"]
2103
- )
2104
- gr.HTML("""
2105
- </div>
2106
- </div>
2107
- """)
2108
-
2109
- # Heatmap section
2110
- gr.HTML("""
2111
- <div class="domain-selector-container domain-performance-container heatmap-wrapper">
2112
- <div class="domain-header">
2113
- <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
2114
- <p class="domain-subtitle" style="color: white;">Explore the comprehensive performance heatmap to see SR scores across L1–L7 levels for each model at a glance.</p>
2115
- </div>
2116
- <div class="chart-container heatmap-chart-container">
2117
- """)
2118
- heatmap_chart = gr.Plot(
2119
- label="",
2120
- value=initial_heatmap,
2121
- elem_classes=["heatmap-plot", "plot-container"]
2122
- )
2123
- gr.HTML("""
2124
- </div>
2125
- </div>
2126
- """)
2127
-
2128
- # Update functions
2129
- def get_optimal_sort_order(sort_by_value):
2130
- """Return the optimal sort order for a given metric"""
2131
- # Metrics where higher is better (descending)
2132
- descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
2133
-
2134
- # Metrics where lower is better (ascending)
2135
- ascending_metrics = []
2136
 
2137
- if sort_by_value in descending_metrics:
2138
- return "Descending"
2139
- elif sort_by_value in ascending_metrics:
2140
- return "Ascending"
2141
- else:
2142
- return "Descending" # Default fallback
2143
-
2144
- def update_table(level_filter, model_type_filter, sort_order):
2145
- title_html = update_leaderboard_title(level_filter)
2146
- sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
2147
- table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
2148
- return title_html, table_html
2149
-
2150
- def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2151
- # Get filtered dataframe
2152
  df = load_leaderboard_data()
2153
- sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2154
- filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2155
-
2156
- # Update model selector choices based on filtered data
2157
- available_models_all = filtered_df['Model'].tolist()
2158
- available_models = available_models_all[:15] # Top 15 from filtered results
2159
 
2160
- # If selected models are not in available models, reset to top 5
2161
- if selected_models:
2162
- valid_selected = [m for m in selected_models if m in available_models]
2163
- if not valid_selected:
2164
- valid_selected = available_models[:5]
2165
- else:
2166
- valid_selected = available_models[:5]
2167
 
2168
- # Create radar chart
2169
- chart = create_domain_radar_chart(filtered_df, valid_selected)
2170
 
2171
- # Prepare heatmap order prioritizing selected models
2172
- heatmap_order = []
2173
- for model in valid_selected:
2174
- if model not in heatmap_order:
2175
- heatmap_order.append(model)
2176
- for model in available_models_all:
2177
- if model not in heatmap_order:
2178
- heatmap_order.append(model)
2179
- heatmap_order = heatmap_order[:12]
2180
- heatmap_fig = create_performance_heatmap(filtered_df, heatmap_order)
2181
 
2182
- # Level metric chart
2183
- effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2184
- available_level_models = available_models_all
2185
- if level_selected_models:
2186
- valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
2187
- if not valid_level_models:
2188
- valid_level_models = available_level_models[:5]
2189
- else:
2190
- valid_level_models = available_level_models[:5]
2191
- level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2192
-
2193
- return (
2194
- gr.Dropdown(
2195
- choices=available_models,
2196
- value=valid_selected,
2197
- multiselect=True,
2198
- label="",
2199
- info=None,
2200
- container=False,
2201
- # elem_classes=["model-dropdown"]
2202
- ),
2203
- chart,
2204
- heatmap_fig,
2205
- gr.Dropdown(
2206
- choices=available_level_models,
2207
- value=valid_level_models,
2208
- multiselect=True,
2209
- label="",
2210
- info=None,
2211
- container=False,
2212
- elem_classes=["model-dropdown", "level-model-dropdown"]
2213
- ),
2214
- level_metric_fig,
2215
- )
2216
-
2217
- def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2218
- # Get filtered dataframe
2219
- df = load_leaderboard_data()
2220
- sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2221
- filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2222
-
2223
- available_models_all = filtered_df['Model'].tolist()
2224
- if selected_models:
2225
- valid_selected = [m for m in selected_models if m in available_models_all]
2226
- if not valid_selected:
2227
- valid_selected = available_models_all[:5]
2228
- else:
2229
- valid_selected = available_models_all[:5]
2230
-
2231
- heatmap_order = []
2232
- for model in valid_selected:
2233
- if model not in heatmap_order:
2234
- heatmap_order.append(model)
2235
- for model in available_models_all:
2236
- if model not in heatmap_order:
2237
- heatmap_order.append(model)
2238
- heatmap_order = heatmap_order[:12]
2239
-
2240
- effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2241
- available_level_models = available_models_all
2242
- if level_selected_models:
2243
- valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
2244
- if not valid_level_models:
2245
- valid_level_models = available_level_models[:5]
2246
- else:
2247
- valid_level_models = available_level_models[:5]
2248
- level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2249
-
2250
- return (
2251
- create_domain_radar_chart(filtered_df, valid_selected),
2252
- create_performance_heatmap(filtered_df, heatmap_order),
2253
- gr.Dropdown(
2254
- choices=available_level_models,
2255
- value=valid_level_models,
2256
- multiselect=True,
2257
- label="",
2258
- info=None,
2259
- container=False,
2260
- elem_classes=["model-dropdown", "level-model-dropdown"]
2261
- ),
2262
- level_metric_fig,
2263
- )
2264
-
2265
- def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2266
- df = load_leaderboard_data()
2267
- sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2268
- filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2269
- available_models = filtered_df['Model'].tolist()
2270
- if level_selected_models:
2271
- valid_level_models = [m for m in level_selected_models if m in available_models][:5]
2272
- if not valid_level_models:
2273
- valid_level_models = available_models[:5]
2274
- else:
2275
- valid_level_models = available_models[:5]
2276
- effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2277
- level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2278
- return (
2279
- gr.Dropdown(
2280
- choices=available_models,
2281
- value=valid_level_models,
2282
- multiselect=True,
2283
- label="",
2284
- info=None,
2285
- container=False,
2286
- elem_classes=["model-dropdown", "level-model-dropdown"]
2287
- ),
2288
- level_chart,
2289
- )
2290
-
2291
- # Update table when filters change
2292
- filter_inputs = [domain_filter, model_type_filter, sort_order]
2293
-
2294
- for input_component in filter_inputs:
2295
- input_component.change(
2296
- fn=update_table,
2297
- inputs=filter_inputs,
2298
- outputs=[leaderboard_title, leaderboard_table]
2299
- )
2300
-
2301
- # Also update radar chart when filters change
2302
- input_component.change(
2303
- fn=update_radar_chart,
2304
- inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2305
- outputs=[model_selector, radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
2306
- )
2307
-
2308
- # Update radar chart when model selection changes
2309
- model_selector.change(
2310
- fn=update_radar_only,
2311
- inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2312
- outputs=[radar_chart, heatmap_chart, level_model_selector, level_metric_chart]
2313
- )
2314
-
2315
- level_metric_selector.change(
2316
- fn=update_level_metric_only,
2317
- inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2318
- outputs=[level_model_selector, level_metric_chart]
2319
- )
2320
-
2321
- level_model_selector.change(
2322
- fn=update_level_metric_only,
2323
- inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2324
- outputs=[level_model_selector, level_metric_chart]
2325
- )
2326
-
2327
- # Define generate_performance_card function before using it
2328
- def generate_performance_card(model_name):
2329
- """Generate HTML for the model performance card"""
2330
- if not model_name:
2331
- return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
2332
- Please select a model to generate its performance card
2333
- </div>"""
2334
-
2335
- # Get model data
2336
- df = load_leaderboard_data()
2337
- model_data = df[df['Model'] == model_name]
2338
-
2339
- if model_data.empty:
2340
- return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
2341
- Model not found in the database
2342
- </div>"""
2343
-
2344
- row = model_data.iloc[0]
2345
-
2346
- # Get overall rank based on overall success
2347
- df_with_success = df.copy()
2348
- df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
2349
- df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
2350
- df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
2351
- try:
2352
- rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
2353
- except:
2354
- rank = 'N/A'
2355
-
2356
- # Format values
2357
- def format_value(val, decimals=3, prefix='', suffix=''):
2358
- if pd.isna(val) or val == '':
2359
- return 'N/A'
2360
- return f"{prefix}{float(val):.{decimals}f}{suffix}"
2361
 
2362
  def format_score(value):
2363
  if pd.isna(value) or value == '':
2364
  return 'N/A'
2365
  return f"{float(value):.3f}"
2366
 
2367
- # Use the same order as the domain radar but keep 'Robustness' last
2368
  radar_metrics = [
2369
  ("Execution Accuracy", row.get('Execution Accuracy')),
2370
  ("Context & Efficiency", row.get('Context & Efficiency')),
2371
  ("Overall Success", row.get('Overall Success')),
2372
- ("Robustness", row.get('Robustness')),
2373
- ("Complex Reasoning", row.get('Complex Reasoning')),
2374
  ("Validity", row.get('Call Validity')),
2375
  ]
2376
  radar_values = []
@@ -2426,7 +2227,7 @@ def create_leaderboard_v2_tab():
2426
  <div class="core-section">
2427
  <div class="core-metric-grid">
2428
  """
2429
- ordered_labels = ["Execution Accuracy", "Context & Efficiency", "Overall Success", "Robustness", "Complex Reasoning", "Validity"]
2430
  ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
2431
  top_metrics = ordered_metrics[:3]
2432
  bottom_metrics = ordered_metrics[3:]
@@ -2479,21 +2280,20 @@ def create_leaderboard_v2_tab():
2479
  <div class="domain-header">
2480
  <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
2481
  <p class="domain-subtitle" style="color: white;">
2482
- Check out the precision analysis card that visualizes the model's performance spectrum with <br> 6 key metrics and overall success rate (SR) by L1~L7 levels.
2483
  </p>
2484
  <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
2485
- Rank is calculated based on the average SR value across L1–L7 levels.
2486
  </p>
2487
  </div>
 
2488
  <div class="performance-card-content">
2489
  """)
2490
-
2491
  with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
2492
  gr.HTML("""
2493
- <div class="domain-header">
2494
- <h2 class="domain-title" style="color: white;">🤖 Select Model</h2>
2495
- <p class="domain-subtitle" style="color: white;">Select models for the analysis card.</p>
2496
- </div>
2497
  """)
2498
  card_model_selector = gr.Dropdown(
2499
  choices=initial_df['Model'].tolist(),
@@ -2523,6 +2323,279 @@ def create_leaderboard_v2_tab():
2523
  </div>
2524
  </div>
2525
  """)
2526
 
2527
  # Add custom CSS for the performance card
2528
  gr.HTML("""
@@ -2745,18 +2818,18 @@ def create_leaderboard_v2_tab():
2745
  .level-dropdown select,
2746
  .level-dropdown [role="combobox"],
2747
  .level-dropdown button {
2748
- background: rgba(245, 246, 247, 0.06) !important;
2749
- border: 1px solid var(--border-subtle) !important;
2750
  border-radius: 999px !important;
2751
  padding: 12px 20px !important;
2752
- color: var(--text-primary) !important;
2753
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
2754
  font-weight: 600 !important;
2755
  font-size: 0.95rem !important;
2756
  text-align: center !important;
2757
  min-height: 46px !important;
2758
  transition: all 0.3s ease !important;
2759
- box-shadow: 0 10px 24px rgba(255, 210, 30, 0.15) !important;
2760
  }
2761
 
2762
  .level-dropdown select:hover,
@@ -2773,6 +2846,14 @@ def create_leaderboard_v2_tab():
2773
  margin: 12px auto 0 !important;
2774
  }
2775
 
2776
  .radar-placeholder {
2777
  display: flex;
2778
  flex-direction: column;
@@ -2925,6 +3006,74 @@ def create_leaderboard_v2_tab():
2925
  }
2926
  }
2927
2928
  </style>
2929
 
2930
  """)
@@ -3052,15 +3201,13 @@ def create_leaderboard_v2_interface():
3052
  def create_domain_radar_chart(df, selected_models=None, max_models=5):
3053
  """Visualize six core capability metrics on a radar chart."""
3054
  df = df.copy()
3055
- # Use the same metric order and Korean labels as the model performance card
3056
- # Match the model card order but place 'Robustness' last as requested
3057
  metrics_info = [
 
3058
  {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
3059
- {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
3060
- {"column": "Overall Success", "label": "Overall Success", "description": "L1~L7의 Average Success Rate"},
3061
- {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
3062
  {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
3063
- {"column": "Call Validity", "label": "Validity", "description": "레벨별 EPR_CVR 평균"},
3064
  ]
3065
 
3066
  required_columns = [m["column"] for m in metrics_info]
@@ -3217,7 +3364,7 @@ def create_domain_radar_chart(df, selected_models=None, max_models=5):
3217
  autosize=True,
3218
  annotations=[
3219
  dict(
3220
- text="Ko-Agent Leaderboard",
3221
  xref="paper", yref="paper",
3222
  x=0.98, y=0.02,
3223
  xanchor='right', yanchor='bottom',
 
224
  # Level metadata for the 7-stage task framework
225
  level_details = {
226
  "ALL": {
227
+ "title": "<span style='font-family: \"Gowun Dodum\", sans-serif !important;'>ALL · All Tasks</span>",
228
+ "description": "<span style='font-family: \"Nanum Gothic\", sans-serif !important;'>See average performance across all seven tasks and use it as a baseline for per-level comparison.</span>"
229
  },
230
  "L1": {
231
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L1 · Single Tool Call</span>",
232
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Evaluates single tool invocation capability and basic command execution accuracy.</span>"
233
  },
234
  "L2": {
235
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L2 · Tool Selection</span>",
236
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Measures the ability to choose the right tool and invoke it with appropriate parameters.</span>"
237
  },
238
  "L3": {
239
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L3 · Sequential Tool Reasoning</span>",
240
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Validates multi-step sequential reasoning for solving tasks.</span>"
241
  },
242
  "L4": {
243
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L4 · Parallel Tool Reasoning</span>",
244
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Evaluates the ability to integrate and summarize information from multiple sources in parallel.</span>"
245
  },
246
  "L5": {
247
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L5 · Error Handling & Robustness</span>",
248
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Checks awareness of unexpected failures and the strategies used to recover.</span>"
249
  },
250
  "L6": {
251
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L6 · Efficient Tool Utilization</span>",
252
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Examines operational efficiency in achieving goals with minimal calls and cost.</span>"
253
  },
254
  "L7": {
255
+ "title": "<span style='color: white; font-family: \"Gowun Dodum\", sans-serif !important;'>L7 · Long-Context Memory</span>",
256
+ "description": "<span style='color: white; font-family: \"Nanum Gothic\", sans-serif !important;'>Analyzes the ability to retain and leverage long conversational context.</span>"
257
  }
258
  }
259
  default_level = "ALL"
 
389
  if highlight_map.get(level):
390
  header_classes.append("highlight-header")
391
  table_html += f"""
392
+ <th class="{' '.join(header_classes)}" title="Average success rate for {level}">
393
  <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
394
  </th>
395
  """
 
736
  # Header styles and navigation
737
  gr.HTML("""
738
  <style>
739
+ @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
740
+
741
  /* Enhanced button styling with better gradio compatibility */
742
  .header-action-button {
743
  display: inline-block !important;
 
793
  }
794
 
795
  #hero-banner {
796
+ width: 100vw !important;
797
+ margin: 0 calc(-50vw + 50%) 20px calc(-50vw + 50%) !important;
798
+ border-radius: 0 !important;
799
+ overflow: hidden !important;
800
+ box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25) !important;
801
+ position: relative !important;
802
+ left: 50% !important;
803
+ right: 50% !important;
804
+ margin-left: -50vw !important;
805
+ margin-right: -50vw !important;
806
+ max-width: none !important;
807
  }
808
 
809
  #hero-banner img {
810
  width: 100%;
811
  height: auto;
812
  display: block;
813
+ object-fit: cover;
814
  }
815
 
816
  .hero-title {
817
+ font-size: 10rem;
818
  font-weight: 800;
819
  line-height: 1.1;
820
  background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%);
821
  -webkit-background-clip: text;
822
  -webkit-text-fill-color: transparent;
823
  margin-bottom: 1rem;
824
+ font-family: 'Do Hyeon', sans-serif !important;
825
  }
826
 
827
  .hero-subtitle {
828
  color: var(--text-secondary);
829
+ font-size: 3rem;
830
+ font-family: 'Do Hyeon', sans-serif !important;
831
  margin-top: 0;
832
  }
833
 
 
886
  box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25);
887
  backdrop-filter: blur(12px);
888
  -webkit-backdrop-filter: blur(12px);
889
+ font-family: 'Nanum Gothic', sans-serif !important;
890
  }
891
 
892
  .dashboard-section.emphasized {
 
906
  }
907
 
908
  .section-title {
909
+ font-size: 3.75rem;
910
+ font-weight: 700;
911
  color: var(--text-primary);
912
  margin-bottom: 12px;
913
  text-align: center !important;
914
+ font-family: 'Gowun Dodum', sans-serif !important;
915
  }
916
 
917
  .section-lead, .section-subtitle {
918
+ font-size: 1.32rem !important;
919
  color: var(--text-secondary);
920
+ max-width: 720px;
921
  margin: 0 auto 24px auto;
922
  line-height: 1.7;
923
  text-align: center !important;
924
  word-break: keep-all;
925
  white-space: normal;
926
  display: block;
927
+ font-family: 'Nanum Gothic', sans-serif !important;
928
  }
929
 
930
  .phase-grid {
 
942
  }
943
 
944
  .phase-card h3 {
945
+ font-size: 1.44rem !important;
946
  color: var(--text-primary);
947
  margin-bottom: 20px;
948
  font-weight: 700;
949
+ font-family: 'Nanum Gothic', sans-serif !important;
950
  }
951
 
952
  .phase-chart {
 
974
 
975
  .phase-chart span {
976
  position: relative;
977
+ font-size: 1.2rem !important;
978
  font-weight: 700;
979
  color: white !important;
980
+ font-family: 'Nanum Gothic', sans-serif !important;
981
  }
982
 
983
+ /* Additional specific selectors */
984
  .phase-card .phase-chart span {
985
  color: #FFFFFF !important;
986
  text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important;
987
+ font-family: 'Nanum Gothic', sans-serif !important;
988
  }
989
 
990
  .phase-grid .phase-chart span {
991
  color: #FFFFFF !important;
992
  z-index: 10 !important;
993
+ font-family: 'Nanum Gothic', sans-serif !important;
994
  }
995
 
996
 
 
1008
  background: rgba(245, 246, 247, 0.05);
1009
  border: 1px solid rgba(245, 246, 247, 0.08);
1010
  color: var(--text-secondary);
1011
+ font-size: 1.08rem !important;
1012
+ font-family: 'Nanum Gothic', sans-serif !important;
1013
  }
1014
 
1015
  .scenario-body {
1016
+ max-width: 760px;
1017
  margin: 0 auto;
1018
  text-align: center;
1019
  }
 
1072
  /* Responsive design */
1073
  @media (max-width: 768px) {
1074
  .hero-title {
1075
+ font-size: 10rem;
1076
  }
1077
  .hero-action-button {
1078
  width: 100% !important;
 
1096
  gap: 8px;
1097
  }
1098
  .section-title {
1099
+ font-size: 2.7rem;
1100
  }
1101
  .phase-chart {
1102
  width: 100px;
 
1121
  gr.HTML("""
1122
  <div style="text-align: center; padding: 20px 0;">
1123
  <h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
1124
+ <p class="hero-subtitle">Agent benchmark optimized for real Korean usage.</p>
1125
  </div>
1126
  """)
1127
 
 
1162
  </div>
1163
  """)
1164
 
1165
+ # Section 1: Task Design by Stage
1166
  gr.HTML("""
1167
  <div class="dashboard-section">
1168
  <div class="section-header">
1169
+ <h2 class="section-title" style="font-family: 'Gowun Dodum', sans-serif; font-size: 2.5rem;">7-Level Task Design</h2>
1170
  </div>
1171
+ <p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">We analyzed agent capabilities across seven stages—from simple tool calls to long-context retention and robustness.</p>
 
1172
  <div class="phase-grid">
1173
  <div class="phase-card">
1174
+ <h3>Single Turn</h3>
1175
  <div class="phase-chart" style="--progress:80%;">
1176
  <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
1177
  </div>
1178
+ <ul class="phase-list">
1179
  <li style="color: white;">L1: Single Tool Call</li>
1180
  <li style="color: white;">L2: Tool Selection</li>
1181
  <li style="color: white;">L3: Sequential Tool Reasoning</li>
1182
  <li style="color: white;">L4: Parallel Tool Reasoning</li>
1183
+ <li style="color: white;">L5: Error Handling & Robustness</li>
1184
  </ul>
1185
  </div>
1186
  <div class="phase-card">
1187
+ <h3>Multi Turn</h3>
1188
  <div class="phase-chart" style="--progress:20%;">
1189
  <span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
1190
  </div>
1191
  <ul class="phase-list">
1192
  <li style="color: white;">L6: Efficient Tool Utilization</li>
1193
+ <li style="color: white;">L7: Long-Context Memory</li>
1194
  </ul>
1195
  </div>
1196
  </div>
1197
  </div>
1198
  """)
1199
 
1200
+ # Section 2: Core Scenario Design
1201
  gr.HTML("""
1202
  <div class="dashboard-section emphasized">
1203
  <div class="section-header">
1204
+ <h2 class="section-title" style="font-size: 2.0rem;">High-quality scenario design tailored to 18 Korea-specific APIs and real-world use cases.</h2>
1205
  </div>
1206
  <div class="scenario-body">
1207
+ <p>We built realistic scenarios—such as appointment booking and blog review search—by integrating APIs widely used in Korea including Naver Maps, Kakao services, and local websites.</p>
 
1208
  </div>
1209
+
1210
  </div>
1211
+ <div class="section-flow">⌄</div>
1212
  """)
1213
 
1214
+ # Section 3: Key Evaluation Criteria
1215
  gr.HTML("""
1216
  <div class="dashboard-section">
1217
  <div class="section-header">
1218
+ <h2 class="section-title" style="font-size: 2.0rem;">Key Evaluation Criteria</h2>
1219
  </div>
1220
  <div class="criteria-grid">
1221
  <div class="criteria-card">
1222
  <h3>Cache-based Iterative Evaluation</h3>
1223
  <ul>
1224
+ <li>Improved handling of failed API responses</li>
1225
+ <li>Addresses chronic benchmark issues such as mismatched response attributes</li>
1226
  <li>Ensures benchmark consistency and reliability</li>
1227
  </ul>
1228
  </div>
1229
  <div class="criteria-card">
1230
+ <h3>Robustness Testing</h3>
1231
  <ul>
1232
+ <li>Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)</li>
1233
+ <li>Surfaces models that remain stable in real-world deployments</li>
1234
  </ul>
1235
  </div>
1236
  <div class="criteria-card">
1237
+ <h3>Level-specific Precision Metrics</h3>
1238
+ <ul>
1239
+ <li>Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow</li>
1240
  <li>Quantitatively identifies model strengths and weaknesses</li>
1241
  </ul>
1242
  </div>
 
1249
  # Domain filter section with enhanced styling
1250
  gr.HTML("""
1251
  <style>
1252
+ @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap');
1253
+
1254
  /* Enhanced domain selector styling */
1255
  .domain-selector-container {
1256
  background: #ffd21e0d;
 
1297
 
1298
  .domain-performance-container .domain-subtitle {
1299
  font-size: 1.05rem;
1300
+ max-width: 720px;
1301
  margin: 0 auto;
 
1302
  }
1303
 
 
1304
  .leaderboard-intro .domain-title,
1305
  .domain-performance-container > .domain-header .domain-title,
1306
  .performance-card-container > .domain-header .domain-title {
 
1329
 
1330
  .performance-card-container .domain-subtitle {
1331
  font-size: 1.05rem;
1332
+ max-width: 720px;
1333
  margin: 0 auto;
1334
  }
1335
 
 
1353
  -webkit-background-clip: text;
1354
  background-clip: text;
1355
  -webkit-text-fill-color: transparent;
1356
+ text-shadow: 0 0 3px rgba(255, 210, 30, 0.08), 0 0 8px rgba(255, 210, 30, 0.05);
1357
+ filter: drop-shadow(0 0 2px rgba(255, 210, 30, 0.06));
1358
  letter-spacing: 0.02em;
1359
+ animation: title-shimmer 1.25s ease-in-out infinite;
1360
+ font-family: 'Gowun Dodum', sans-serif !important;
1361
  }
1362
 
1363
  @keyframes title-shimmer {
 
1645
 
1646
  .model-dropdown select,
1647
  .model-dropdown [role="combobox"] {
1648
+ background: #000000 !important;
1649
+ border: 1px solid #333333 !important;
1650
  border-radius: 999px !important;
1651
  padding: 12px 24px !important;
1652
+ color: #ffffff !important;
1653
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
1654
  font-weight: 600 !important;
1655
  font-size: 1rem !important;
 
1677
  gap: 8px !important;
1678
  width: 100% !important;
1679
  padding: 12px 24px !important;
1680
+ background: #000000 !important;
1681
+ border: 1px solid #333333 !important;
1682
  border-radius: 999px !important;
1683
+ color: #ffffff !important;
1684
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
1685
  font-weight: 600 !important;
1686
  font-size: 0.95rem !important;
 
1969
  padding: 12px 20px !important;
1970
  font-size: 0.95rem !important;
1971
  }
1972
+
1973
+ /* Leaderboard controls row styling */
1974
+ .leaderboard-controls-row {
1975
+ margin: 20px 0 !important;
1976
+ padding: 20px !important;
1977
+ background: transparent !important;
1978
+ border: none !important;
1979
+ gap: 40px !important;
1980
+ }
1981
+
1982
+ .leaderboard-controls-row .gr-column,
1983
+ .leaderboard-controls-row .gr-row,
1984
+ .leaderboard-controls-row .gr-box,
1985
+ .leaderboard-controls-row .gradio-column,
1986
+ .leaderboard-controls-row .gradio-row,
1987
+ .leaderboard-controls-row .gradio-group {
1988
+ background: transparent !important;
1989
+ border: none !important;
1990
+ box-shadow: none !important;
1991
+ padding: 0 !important;
1992
+ }
1993
+
1994
+ /* Remove all container backgrounds for leaderboard controls */
1995
+ .leaderboard-controls-row * {
1996
+ background-color: transparent !important;
1997
+ background-image: none !important;
1998
+ border: none !important;
1999
+ box-shadow: none !important;
2000
+ }
2001
+
2002
+ .leaderboard-controls-row .inline-radio,
2003
+ .leaderboard-controls-row .domain-radio {
2004
+ background: transparent !important;
2005
+ border: none !important;
2006
+ box-shadow: none !important;
2007
+ }
2008
+
2009
+ /* Inline radio styling for integrated controls */
2010
+ .inline-radio {
2011
+ background: transparent !important;
2012
+ border: none !important;
2013
+ box-shadow: none !important;
2014
+ padding: 0 !important;
2015
+ }
2016
+
2017
+ .inline-radio .wrap {
2018
+ display: flex !important;
2019
+ gap: 8px !important;
2020
+ flex-wrap: wrap !important;
2021
+ justify-content: flex-start !important;
2022
+ background: transparent !important;
2023
+ border: none !important;
2024
+ box-shadow: none !important;
2025
+ padding: 0 !important;
2026
+ }
2027
+
2028
+ .inline-radio label {
2029
+ padding: 8px 16px !important;
2030
+ background: rgba(245, 246, 247, 0.06) !important;
2031
+ border: 1px solid var(--border-subtle) !important;
2032
+ border-radius: 20px !important;
2033
+ font-size: 0.85rem !important;
2034
+ color: var(--text-primary) !important;
2035
+ transition: all 0.2s ease !important;
2036
+ cursor: pointer !important;
2037
+ }
2038
+
2039
+ .inline-radio label:hover {
2040
+ background: rgba(255, 210, 30, 0.12) !important;
2041
+ border-color: var(--accent-primary) !important;
2042
+ }
2043
+
2044
+ .inline-radio input[type="radio"]:checked + label,
2045
+ .inline-radio label[aria-checked="true"] {
2046
+ background: rgba(255, 210, 30, 0.2) !important;
2047
+ border-color: var(--accent-primary) !important;
2048
+ color: white !important;
2049
+ font-weight: 600 !important;
2050
+ }
2051
  </style>
2052
 
2053
  """)
2054
 
2055
  level_options = list(level_details.keys())
2056
 
2057
+ # Main leaderboard table with dynamic title and integrated controls
2058
  leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
2059
 
2060
+ # Integrated controls within leaderboard section - stacked vertically
2061
+ gr.HTML("<p style='color: white; margin: 5px 0 5px 0; font-size: 1.2rem;'>Select Task Level</p>")
2062
+ domain_filter = gr.Radio(
2063
+ choices=level_options,
2064
+ value=default_level,
2065
+ label="",
2066
+ interactive=True,
2067
+ container=False,
2068
+ elem_classes=["domain-radio", "inline-radio"]
2069
+ )
2070
+
2071
+ gr.HTML("<p style='color: white; margin: 5px 0 0px 0; font-size: 1.2rem;'>🔍 Filters & Sorting</p>")
2072
+ with gr.Row():
2073
+ with gr.Column(scale=1):
2074
+ gr.HTML("<span style='color: white; font-size: 1.2rem; margin-bottom: 5px; display: block;'>Model Access</span>")
2075
+ model_type_filter = gr.Radio(
2076
+ choices=["All", "OSS", "API"],
2077
+ value="All",
2078
+ label="",
2079
+ elem_classes=["domain-radio", "inline-radio"],
2080
+ container=False
2081
+ )
2082
+ with gr.Column(scale=1):
2083
+ gr.HTML("<span style='color: white; font-size: 1.2rem; margin-bottom: 5px; display: block;'>Sort Order</span>")
2084
+ sort_order = gr.Radio(
2085
+ choices=["Descending", "Ascending"],
2086
+ value="Descending",
2087
+ label="",
2088
+ elem_classes=["domain-radio", "inline-radio"],
2089
+ container=False
2090
+ )
2091
 
2092
+ leaderboard_table = gr.HTML(initial_table)
2093
 
2094
  # Radar Chart Section
2095
  gr.HTML("""
2096
  <div class="domain-selector-container domain-performance-container">
2097
  <div class="domain-header">
2098
  <h2 class="domain-title" style="color: white;">Core Capability Radar</h2>
2099
+ <p class="domain-subtitle" style="color: white;">Track six essential axes: success, execution, reasoning, robustness, efficiency, and call validity.</p>
2100
  </div>
2101
  """)
2102
+
2103
+ gr.HTML("<p style='color: white; margin: 10px 0 0 0; font-size: 1.2rem; font-family: \"Nanum Gothic\", sans-serif;'>Select models to compare (up to 5).</p>")
2104
+ # gr.HTML("<p style='color: #b0b0b0; margin: 0 0 10px 0; font-size: 0.9rem;'>You can select up to five models.</p>")
2105
+ model_selector = gr.Dropdown(
2106
+ choices=initial_df['Model'].tolist()[:10],
2107
+ value=initial_df['Model'].tolist()[:5],
2108
+ multiselect=True,
2109
+ label="",
2110
+ info=None,
2111
+ container=False,
2112
+ )
2113
 
2114
  # Radar chart plot - wrapped in centered container
2115
  gr.HTML('<div class="chart-container radar-chart-container">')
 
2125
 
2126
  gr.HTML("</div>")
2127
 
2128
+
2129
+ # Define generate_performance_card function before using it
2130
+ def generate_performance_card(model_name):
2131
+ """Generate HTML for the model performance card"""
2132
+ if not model_name:
2133
+ return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
2134
+ Please select a model to generate its performance card
2135
+ </div>"""
2136
 
2137
+ # Get model data
2138
  df = load_leaderboard_data()
2139
+ model_data = df[df['Model'] == model_name]
2140
 
2141
+ if model_data.empty:
2142
+ return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
2143
+ Model not found in the database
2144
+ </div>"""
2145
 
2146
+ row = model_data.iloc[0]
 
2147
 
2148
+ # Get overall rank based on overall success
2149
+ df_with_success = df.copy()
2150
+ df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
2151
+ df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
2152
+ df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
2153
+ try:
2154
+ rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
2155
+ except:
2156
+ rank = 'N/A'
 
2157
 
2158
+ # Format values
2159
+ def format_value(val, decimals=3, prefix='', suffix=''):
2160
+ if pd.isna(val) or val == '':
2161
+ return 'N/A'
2162
+ return f"{prefix}{float(val):.{decimals}f}{suffix}"
2163
 
2164
  def format_score(value):
2165
  if pd.isna(value) or value == '':
2166
  return 'N/A'
2167
  return f"{float(value):.3f}"
2168
 
 
2169
  radar_metrics = [
2170
  ("Execution Accuracy", row.get('Execution Accuracy')),
2171
+ ("Complex Reasoning", row.get('Complex Reasoning')),
2172
+ ("Robustness", row.get('Robustness')),
2173
  ("Context & Efficiency", row.get('Context & Efficiency')),
2174
  ("Overall Success", row.get('Overall Success')),
2175
  ("Validity", row.get('Call Validity')),
2176
  ]
2177
  radar_values = []
 
2227
  <div class="core-section">
2228
  <div class="core-metric-grid">
2229
  """
2230
+ ordered_labels = ["Execution Accuracy", "Complex Reasoning", "Robustness", "Context & Efficiency", "Overall Success", "Validity"]
2231
  ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
2232
  top_metrics = ordered_metrics[:3]
2233
  bottom_metrics = ordered_metrics[3:]
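The key function above orders the (label, value) pairs by their position in ordered_labels and pushes any unlisted label to the end. A small self-contained example of the same pattern, with made-up values:

ordered_labels = ["Execution Accuracy", "Complex Reasoning", "Robustness"]
radar_metrics = [("Robustness", 0.71), ("Unknown Metric", 0.50), ("Execution Accuracy", 0.88)]
ordered = sorted(radar_metrics,
                 key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels))
# ordered == [("Execution Accuracy", 0.88), ("Robustness", 0.71), ("Unknown Metric", 0.50)]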
 
2280
  <div class="domain-header">
2281
  <h2 class="domain-title" style="color: white;">Model Performance Card</h2>
2282
  <p class="domain-subtitle" style="color: white;">
2283
+ Explore detailed performance cards that visualize six core metrics plus overall SR across L1–L7 levels.
2284
  </p>
2285
  <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
2286
+ Ranks are determined by the average SR across L1–L7.
2287
  </p>
2288
  </div>
2289
+
2290
  <div class="performance-card-content">
2291
  """)
2292
+
2293
  with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
2294
  gr.HTML("""
2295
+ <p class="domain-subtitle" style="color: white;">Choose a model to generate its analysis card.</p>
2296
+
2297
  """)
2298
  card_model_selector = gr.Dropdown(
2299
  choices=initial_df['Model'].tolist(),
 
2323
  </div>
2324
  </div>
2325
  """)
2326
+
2327
+
2328
+ # Level metric breakdown section
2329
+ gr.HTML("""
2330
+ <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
2331
+ <div class="domain-header">
2332
+ <h2 class="domain-title" style="color: white;">Level-specific Metrics</h2>
2333
+ <p class="domain-subtitle" style="color: white;">Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.</p>
2334
+ </div>
2335
+ """)
2336
+
2337
+ gr.HTML("""
2338
+ <p style="color: white; text-align: center; margin: 0 0 20px 0; font-size: 1.2rem; font-family: \'Nanum Gothic\', sans-serif;">Select a level and up to five models to explore detailed metrics.</p>
2339
+ """)
2340
+
2341
+ with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
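+ # Level picker plus a multi-select of models; defaults come from the first level and the precomputed initial model list.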
2342
+ level_metric_selector = gr.Dropdown(
2343
+ choices=level_ids,
2344
+ value=level_ids[0] if level_ids else None,
2345
+ multiselect=False,
2346
+ label="",
2347
+ info=None,
2348
+ container=False,
2349
+ elem_classes=["level-dropdown"]
2350
+ )
2351
+ level_model_selector = gr.Dropdown(
2352
+ choices=initial_level_model_choices,
2353
+ value=initial_level_model_values,
2354
+ multiselect=True,
2355
+ label="",
2356
+ info=None,
2357
+ container=False,
2358
+ elem_classes=["model-dropdown", "level-model-dropdown"]
2359
+ )
2360
+
2361
+ gr.HTML('<div class="chart-container level-metric-chart-container">')
2362
+ level_metric_chart = gr.Plot(
2363
+ label="",
2364
+ value=initial_level_metric_chart,
2365
+ elem_classes=["level-metric-plot", "plot-container"]
2366
+ )
2367
+ gr.HTML("""
2368
+ </div>
2369
+ </div>
2370
+ """)
2371
+
2372
+ # # Heatmap section
2373
+ # gr.HTML("""
2374
+ # <div class="domain-selector-container domain-performance-container heatmap-wrapper">
2375
+ # <div class="domain-header">
2376
+ # <h2 class="domain-title" style="color: white;">Comprehensive Performance Heatmap</h2>
2377
+ # <p class="domain-subtitle" style="color: white;">See each model's L1–L7 SR scores at a glance.</p>
2378
+ # </div>
2379
+ # <div class="chart-container heatmap-chart-container">
2380
+ # """)
2381
+ # heatmap_chart = gr.Plot(
2382
+ # label="",
2383
+ # value=initial_heatmap,
2384
+ # elem_classes=["heatmap-plot", "plot-container"]
2385
+ # )
2386
+ # gr.HTML("""
2387
+ # </div>
2388
+ # </div>
2389
+ # """)
2390
+
2391
+ # Update functions
2392
+ def get_optimal_sort_order(sort_by_value):
2393
+ """Return the optimal sort order for a given metric"""
2394
+ # Metrics where higher is better (descending)
2395
+ descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
2396
+
2397
+ # Metrics where lower is better (ascending)
2398
+ ascending_metrics = []
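+ # Currently empty: none of the tracked metrics ranks better with a lower score.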
2399
+
2400
+ if sort_by_value in descending_metrics:
2401
+ return "Descending"
2402
+ elif sort_by_value in ascending_metrics:
2403
+ return "Ascending"
2404
+ else:
2405
+ return "Descending" # Default fallback
2406
+
2407
+
2408
+
2409
+ def update_table(level_filter, model_type_filter, sort_order):
2410
+ title_html = update_leaderboard_title(level_filter)
2411
+ sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
2412
+ table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
2413
+ return title_html, table_html
2414
+
2415
+ def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2416
+ # Get filtered dataframe
2417
+ df = load_leaderboard_data()
2418
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2419
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2420
+
2421
+ # Update model selector choices based on filtered data
2422
+ available_models_all = filtered_df['Model'].tolist()
2423
+ available_models = available_models_all[:15] # Top 15 from filtered results
2424
+
2425
+ # If selected models are not in available models, reset to top 5
2426
+ if selected_models:
2427
+ valid_selected = [m for m in selected_models if m in available_models]
2428
+ # Check if more than 5 models are selected and show alert
2429
+ if len(valid_selected) > 5:
2430
+ gr.Warning("You can select up to 5 models.")
2431
+ # Remove the last selected item (6th item) instead of keeping first 5
2432
+ valid_selected = valid_selected[:-1]
2433
+ if not valid_selected:
2434
+ valid_selected = available_models[:5]
2435
+ else:
2436
+ valid_selected = available_models[:5]
2437
+
2438
+ # Create radar chart
2439
+ chart = create_domain_radar_chart(filtered_df, valid_selected)
2440
+
2441
+ # Prepare heatmap order prioritizing selected models
2442
+
2443
+
2444
+ # Level metric chart
2445
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2446
+ available_level_models = available_models_all
2447
+ if level_selected_models:
2448
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
2449
+ if not valid_level_models:
2450
+ valid_level_models = available_level_models[:5]
2451
+ else:
2452
+ valid_level_models = available_level_models[:5]
2453
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2454
+
2455
+ return (
2456
+ gr.Dropdown(
2457
+ choices=available_models,
2458
+ value=valid_selected,
2459
+ multiselect=True,
2460
+ label="",
2461
+ info=None,
2462
+ container=False,
2463
+ # elem_classes=["model-dropdown"]
2464
+ ),
2465
+ chart,
2466
+ gr.Dropdown(
2467
+ choices=available_level_models,
2468
+ value=valid_level_models,
2469
+ multiselect=True,
2470
+ label="",
2471
+ info=None,
2472
+ container=False,
2473
+ elem_classes=["model-dropdown", "level-model-dropdown"]
2474
+ ),
2475
+ level_metric_fig,
2476
+ )
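+ # The tuple order must match the outputs list wired to this handler (model selector, radar chart, level model selector, level chart).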
2477
+
2478
+ def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2479
+ # Get filtered dataframe
2480
+ df = load_leaderboard_data()
2481
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2482
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2483
+
2484
+ available_models_all = filtered_df['Model'].tolist()
2485
+ if selected_models:
2486
+ valid_selected = [m for m in selected_models if m in available_models_all]
2487
+ # Check if more than 5 models are selected and show alert
2488
+ if len(valid_selected) > 5:
2489
+ # Show a Gradio warning when more than 5 models are selected
2490
+ gr.Warning("You can select up to 5 models.")
2491
+ # Remove the last selected item (6th item) instead of keeping first 5
2492
+ valid_selected = valid_selected[:-1]
2493
+ if not valid_selected:
2494
+ valid_selected = available_models_all[:5]
2495
+ else:
2496
+ valid_selected = available_models_all[:5]
2497
+
2498
+
2499
+
2500
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2501
+ available_level_models = available_models_all
2502
+ if level_selected_models:
2503
+ valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
2504
+ if not valid_level_models:
2505
+ valid_level_models = available_level_models[:5]
2506
+ else:
2507
+ valid_level_models = available_level_models[:5]
2508
+ level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2509
+
2510
+ return (
2511
+ gr.Dropdown(
2512
+ choices=available_models_all[:15],
2513
+ value=valid_selected,
2514
+ multiselect=True,
2515
+ label="",
2516
+ info=None,
2517
+ container=False,
2518
+ ),
2519
+ create_domain_radar_chart(filtered_df, valid_selected),
2520
+ gr.Dropdown(
2521
+ choices=available_level_models,
2522
+ value=valid_level_models,
2523
+ multiselect=True,
2524
+ label="",
2525
+ info=None,
2526
+ container=False,
2527
+ elem_classes=["model-dropdown", "level-model-dropdown"]
2528
+ ),
2529
+ level_metric_fig,
2530
+ )
2531
+
2532
+ def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
2533
+ df = load_leaderboard_data()
2534
+ sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
2535
+ filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
2536
+ available_models = filtered_df['Model'].tolist()
2537
+ if level_selected_models:
2538
+ valid_level_models = [m for m in level_selected_models if m in available_models]
2539
+ # Check if more than 5 models are selected and show alert
2540
+ if len(valid_level_models) > 5:
2541
+ gr.Warning("You can select up to 5 models.")
2542
+ # Remove the last selected item (6th item) instead of keeping first 5
2543
+ valid_level_models = valid_level_models[:-1]
2544
+ if not valid_level_models:
2545
+ valid_level_models = available_models[:5]
2546
+ else:
2547
+ valid_level_models = available_models[:5]
2548
+ effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
2549
+ level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
2550
+ return (
2551
+ gr.Dropdown(
2552
+ choices=available_models,
2553
+ value=valid_level_models,
2554
+ multiselect=True,
2555
+ label="",
2556
+ info=None,
2557
+ container=False,
2558
+ elem_classes=["model-dropdown", "level-model-dropdown"]
2559
+ ),
2560
+ level_chart,
2561
+ )
2562
+
2563
+ # Update table when filters change
2564
+ filter_inputs = [domain_filter, model_type_filter, sort_order]
2565
+
2566
+ for input_component in filter_inputs:
2567
+ input_component.change(
2568
+ fn=update_table,
2569
+ inputs=filter_inputs,
2570
+ outputs=[leaderboard_title, leaderboard_table]
2571
+ )
2572
+
2573
+ # Also update radar chart when filters change
2574
+ input_component.change(
2575
+ fn=update_radar_chart,
2576
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2577
+ outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
2578
+ )
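+ # Each filter change therefore fires two handlers: one refreshes the table, the other rebuilds the charts and their model selectors.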
2579
+
2580
+ # Update radar chart when model selection changes
2581
+ model_selector.change(
2582
+ fn=update_radar_only,
2583
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2584
+ outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
2585
+ )
2586
+
2587
+ level_metric_selector.change(
2588
+ fn=update_level_metric_only,
2589
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2590
+ outputs=[level_model_selector, level_metric_chart]
2591
+ )
2592
+
2593
+ level_model_selector.change(
2594
+ fn=update_level_metric_only,
2595
+ inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
2596
+ outputs=[level_model_selector, level_metric_chart]
2597
+ )
2598
+
2599
 
2600
  # Add custom CSS for the performance card
2601
  gr.HTML("""
 
2818
  .level-dropdown select,
2819
  .level-dropdown [role="combobox"],
2820
  .level-dropdown button {
2821
+ background: #000000 !important;
2822
+ border: 1px solid #333333 !important;
2823
  border-radius: 999px !important;
2824
  padding: 12px 20px !important;
2825
+ color: #ffffff !important;
2826
  font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
2827
  font-weight: 600 !important;
2828
  font-size: 0.95rem !important;
2829
  text-align: center !important;
2830
  min-height: 46px !important;
2831
  transition: all 0.3s ease !important;
2832
+ box-shadow: 0 10px 24px rgba(0, 0, 0, 0.3) !important;
2833
  }
2834
 
2835
  .level-dropdown select:hover,
 
2846
  margin: 12px auto 0 !important;
2847
  }
2848
 
2849
+ .level-model-dropdown select,
2850
+ .level-model-dropdown [role="combobox"],
2851
+ .level-model-dropdown button {
2852
+ background: #000000 !important;
2853
+ border: 1px solid #333333 !important;
2854
+ color: #ffffff !important;
2855
+ }
2856
+
2857
  .radar-placeholder {
2858
  display: flex;
2859
  flex-direction: column;
 
3006
  }
3007
  }
3008
 
3009
+ /* Force fonts - highest priority */
3010
+ .dashboard-section,
3011
+ .dashboard-section *,
3012
+ .dashboard-section h2,
3013
+ .dashboard-section h3,
3014
+ .dashboard-section p,
3015
+ .dashboard-section li,
3016
+ .section-lead,
3017
+ .section-subtitle,
3018
+ .phase-card h3,
3019
+ .phase-list li,
3020
+ .scenario-body p,
3021
+ .criteria-card h3,
3022
+ .criteria-card ul,
3023
+ .criteria-card li {
3024
+ font-family: "Nanum Gothic", sans-serif !important;
3025
+ }
3026
+
3027
+ /* Force section-title styling */
3028
+ .section-title,
3029
+ h2.section-title,
3030
+ .dashboard-section .section-title,
3031
+ .section-header .section-title {
3032
+ font-family: "Gowun Dodum", sans-serif !important;
3033
+ }
3034
+
3035
+ .domain-title,
3036
+ h2.domain-title,
3037
+ .domain-header .domain-title {
3038
+ font-family: "Gowun Dodum", sans-serif !important;
3039
+ }
3040
+
3041
+ .hero-title,
3042
+ .hero-subtitle,
3043
+ h1.hero-title,
3044
+ p.hero-subtitle {
3045
+ font-family: "Do Hyeon", sans-serif !important;
3046
+ font-size: 2rem !important;
3047
+ }
3048
+
3049
+ /* Force hero-title sizing */
3050
+ .hero-title,
3051
+ h1.hero-title {
3052
+ font-size: 4rem !important;
3053
+ }
3054
+
3055
+ .phase-chart span,
3056
+ .phase-card .phase-chart span,
3057
+ .phase-grid .phase-chart span {
3058
+ font-family: "Nanum Gothic", sans-serif !important;
3059
+ font-size: 1.2rem !important;
3060
+ }
3061
+
3062
+ .section-lead, .section-subtitle {
3063
+ font-size: 1.32rem !important;
3064
+ font-family: "Nanum Gothic", sans-serif !important;
3065
+ }
3066
+
3067
+ .phase-card h3 {
3068
+ font-size: 1.44rem !important;
3069
+ font-family: "Nanum Gothic", sans-serif !important;
3070
+ }
3071
+
3072
+ .phase-list li {
3073
+ font-size: 1.08rem !important;
3074
+ font-family: "Nanum Gothic", sans-serif !important;
3075
+ }
3076
+
3077
  </style>
3078
 
3079
  """)
 
3201
  def create_domain_radar_chart(df, selected_models=None, max_models=5):
3202
  """Visualize six core capability metrics on a radar chart."""
3203
  df = df.copy()
3204
  metrics_info = [
3205
+ {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
3206
  {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
3207
  {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
3208
+ {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
3209
+ {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
3210
+ {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
3211
  ]
3212
 
3213
  required_columns = [m["column"] for m in metrics_info]
 
3364
  autosize=True,
3365
  annotations=[
3366
  dict(
3367
+ text="Galileo Agent Leaderboard",
3368
  xref="paper", yref="paper",
3369
  x=0.98, y=0.02,
3370
  xanchor='right', yanchor='bottom',