import base64
import html
import os
import re
from pathlib import Path

import gradio as gr


def encode_image_to_base64(image_path) -> str:
    """Return the image at *image_path* as a ``data:`` URI for embedding in HTML.

    Args:
        image_path: Location of the image file (``str`` or ``os.PathLike``).

    Returns:
        A ``data:<mime>;base64,<payload>`` string, or ``""`` when the file
        cannot be read (missing, a directory, permission denied, ...).
    """
    path = Path(image_path)
    try:
        # EAFP: reading directly avoids the exists()-then-open() race and also
        # covers directories, where an existence check passes but open() fails.
        raw = path.read_bytes()
    except OSError:
        return ""

    mime_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # Unrecognised extensions fall back to PNG.
    mime_type = mime_types.get(path.suffix.lower(), "image/png")
    encoded = base64.b64encode(raw).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"


def generate_table_html() -> str:
    """Build the leaderboard ``<tr>`` rows from the embedded results table.

    Rows are ordered by ``overall_score``, highest first. The three rate
    metrics are rendered as percentages with one decimal; all other scores
    are rendered with three decimals.

    Returns:
        The ``<tr>...</tr>`` rows joined with newlines (no surrounding
        ``<tbody>``/``<table>`` element).
    """
    models = [
        {"name": "gpt-5", "overall_score": 0.749, "valid_tool_name_rate": 100.0, "schema_compliance": 99.3, "execution_success": 99.1, "task_fulfillment": 0.677, "information_grounding": 0.828, "tool_appropriateness": 0.767, "parameter_accuracy": 0.749, "dependency_awareness": 0.649, "parallelism_efficiency": 0.339},
        {"name": "o3", "overall_score": 0.715, "valid_tool_name_rate": 99.3, "schema_compliance": 99.9, "execution_success": 97.1, "task_fulfillment": 0.641, "information_grounding": 0.706, "tool_appropriateness": 0.724, "parameter_accuracy": 0.726, "dependency_awareness": 0.592, "parallelism_efficiency": 0.359},
        {"name": "gpt-oss-120b", "overall_score": 0.692, "valid_tool_name_rate": 97.7, "schema_compliance": 98.8, "execution_success": 94.0, "task_fulfillment": 0.636, "information_grounding": 0.705, "tool_appropriateness": 0.691, "parameter_accuracy": 0.661, "dependency_awareness": 0.576, "parallelism_efficiency": 0.329},
        {"name": "gemini-2.5-pro", "overall_score": 0.690, "valid_tool_name_rate": 99.4, "schema_compliance": 99.6, "execution_success": 96.9, "task_fulfillment": 0.562, "information_grounding": 0.725, "tool_appropriateness": 0.717, "parameter_accuracy": 0.670, "dependency_awareness": 0.541, "parallelism_efficiency": 0.329},
        {"name": "claude-sonnet-4", "overall_score": 0.681, "valid_tool_name_rate": 100.0, "schema_compliance": 99.8, "execution_success": 98.8, "task_fulfillment": 0.554, "information_grounding": 0.676, "tool_appropriateness": 0.689, "parameter_accuracy": 0.671, "dependency_awareness": 0.541, "parallelism_efficiency": 0.328},
        {"name": "qwen3-235b-a22b-2507", "overall_score": 0.678, "valid_tool_name_rate": 99.1, "schema_compliance": 99.3, "execution_success": 94.8, "task_fulfillment": 0.549, "information_grounding": 0.625, "tool_appropriateness": 0.688, "parameter_accuracy": 0.712, "dependency_awareness": 0.542, "parallelism_efficiency": 0.355},
        {"name": "glm-4.5", "overall_score": 0.668, "valid_tool_name_rate": 99.7, "schema_compliance": 99.7, "execution_success": 97.4, "task_fulfillment": 0.525, "information_grounding": 0.682, "tool_appropriateness": 0.680, "parameter_accuracy": 0.661, "dependency_awareness": 0.523, "parallelism_efficiency": 0.297},
        {"name": "gpt-oss-20b", "overall_score": 0.654, "valid_tool_name_rate": 98.8, "schema_compliance": 99.1, "execution_success": 93.6, "task_fulfillment": 0.547, "information_grounding": 0.623, "tool_appropriateness": 0.661, "parameter_accuracy": 0.638, "dependency_awareness": 0.509, "parallelism_efficiency": 0.309},
        {"name": "kimi-k2", "overall_score": 0.629, "valid_tool_name_rate": 98.8, "schema_compliance": 98.1, "execution_success": 94.5, "task_fulfillment": 0.502, "information_grounding": 0.577, "tool_appropriateness": 0.631, "parameter_accuracy": 0.623, "dependency_awareness": 0.448, "parallelism_efficiency": 0.307},
        {"name": "qwen3-30b-a3b-instruct-2507", "overall_score": 0.627, "valid_tool_name_rate": 99.2, "schema_compliance": 95.4, "execution_success": 94.4, "task_fulfillment": 0.459, "information_grounding": 0.536, "tool_appropriateness": 0.658, "parameter_accuracy": 0.646, "dependency_awareness": 0.471, "parallelism_efficiency": 0.318},
        {"name": "gemini-2.5-flash-lite", "overall_score": 0.598, "valid_tool_name_rate": 98.7, "schema_compliance": 98.8, "execution_success": 91.1, "task_fulfillment": 0.446, "information_grounding": 0.569, "tool_appropriateness": 0.629, "parameter_accuracy": 0.564, "dependency_awareness": 0.423, "parallelism_efficiency": 0.262},
        {"name": "gpt-4o", "overall_score": 0.595, "valid_tool_name_rate": 96.7, "schema_compliance": 87.6, "execution_success": 85.3, "task_fulfillment": 0.477, "information_grounding": 0.519, "tool_appropriateness": 0.588, "parameter_accuracy": 0.551, "dependency_awareness": 0.423, "parallelism_efficiency": 0.253},
        {"name": "gemma-3-27b-it", "overall_score": 0.582, "valid_tool_name_rate": 98.4, "schema_compliance": 81.6, "execution_success": 85.5, "task_fulfillment": 0.396, "information_grounding": 0.495, "tool_appropriateness": 0.588, "parameter_accuracy": 0.530, "dependency_awareness": 0.408, "parallelism_efficiency": 0.251},
        {"name": "llama-3-3-70b-instruct", "overall_score": 0.558, "valid_tool_name_rate": 99.5, "schema_compliance": 93.1, "execution_success": 91.5, "task_fulfillment": 0.366, "information_grounding": 0.476, "tool_appropriateness": 0.554, "parameter_accuracy": 0.486, "dependency_awareness": 0.359, "parallelism_efficiency": 0.244},
        {"name": "gpt-4o-mini", "overall_score": 0.557, "valid_tool_name_rate": 95.5, "schema_compliance": 86.5, "execution_success": 84.0, "task_fulfillment": 0.426, "information_grounding": 0.453, "tool_appropriateness": 0.556, "parameter_accuracy": 0.499, "dependency_awareness": 0.359, "parallelism_efficiency": 0.230},
        {"name": "mistral-small-2503", "overall_score": 0.530, "valid_tool_name_rate": 92.0, "schema_compliance": 95.6, "execution_success": 87.2, "task_fulfillment": 0.344, "information_grounding": 0.438, "tool_appropriateness": 0.528, "parameter_accuracy": 0.462, "dependency_awareness": 0.345, "parallelism_efficiency": 0.220},
        {"name": "llama-3-1-70b-instruct", "overall_score": 0.510, "valid_tool_name_rate": 99.2, "schema_compliance": 90.5, "execution_success": 92.5, "task_fulfillment": 0.314, "information_grounding": 0.432, "tool_appropriateness": 0.523, "parameter_accuracy": 0.433, "dependency_awareness": 0.303, "parallelism_efficiency": 0.190},
        {"name": "nova-micro-v1", "overall_score": 0.508, "valid_tool_name_rate": 96.0, "schema_compliance": 93.1, "execution_success": 87.8, "task_fulfillment": 0.339, "information_grounding": 0.419, "tool_appropriateness": 0.504, "parameter_accuracy": 0.428, "dependency_awareness": 0.315, "parallelism_efficiency": 0.212},
        {"name": "llama-3-2-90b-vision-instruct", "overall_score": 0.495, "valid_tool_name_rate": 99.6, "schema_compliance": 85.0, "execution_success": 90.9, "task_fulfillment": 0.293, "information_grounding": 0.444, "tool_appropriateness": 0.515, "parameter_accuracy": 0.427, "dependency_awareness": 0.267, "parallelism_efficiency": 0.173},
        {"name": "llama-3-1-8b-instruct", "overall_score": 0.428, "valid_tool_name_rate": 96.1, "schema_compliance": 89.4, "execution_success": 90.9, "task_fulfillment": 0.261, "information_grounding": 0.295, "tool_appropriateness": 0.352, "parameter_accuracy": 0.310, "dependency_awareness": 0.221, "parallelism_efficiency": 0.141},
    ]

    # Column order after the name and overall score; keeping the keys in
    # tuples keeps the cell markup in one place per format.
    percent_metrics = (
        "valid_tool_name_rate",
        "schema_compliance",
        "execution_success",
    )
    score_metrics = (
        "task_fulfillment",
        "information_grounding",
        "tool_appropriateness",
        "parameter_accuracy",
        "dependency_awareness",
        "parallelism_efficiency",
    )

    rows = []
    # sorted() is stable, so models with equal scores keep their listed order.
    for model in sorted(models, key=lambda m: m["overall_score"], reverse=True):
        cells = [
            # Escape the name so a future entry containing &, < or > cannot
            # break the markup.
            f'<td class="model-col"><span class="model-name">{html.escape(model["name"])}</span></td>',
            f'<td class="score-col"><span class="score">{model["overall_score"]:.3f}</span></td>',
        ]
        cells.extend(
            f'<td class="metric-col">{model[key]:.1f}%</td>' for key in percent_metrics
        )
        cells.extend(
            f'<td class="metric-col">{model[key]:.3f}</td>' for key in score_metrics
        )
        rows.append("<tr>\n" + "\n".join(cells) + "\n</tr>")

    return "\n".join(rows)


# Gradio-side CSS overrides applied on top of the leaderboard's own stylesheet.
_GRADIO_CSS = """
.gradio-container { padding: 0 !important; }
#leaderboard-container {
    width: 100% !important;
    max-width: none !important;
    margin: 0 !important;
    padding: 0 !important;
}
#leaderboard-container * {
    box-sizing: border-box;
}
/* Force all buttons to have same blue color and remove underlines */
#leaderboard-container .paper-link {
    color: white !important;
    background-color: #4285F4 !important;
    text-decoration: none !important;
}
#leaderboard-container .paper-link:hover {
    color: white !important;
    background-color: #3367D6 !important;
    text-decoration: none !important;
}
#leaderboard-container .paper-link:focus,
#leaderboard-container .paper-link:visited,
#leaderboard-container .paper-link:active {
    color: white !important;
    background-color: #4285F4 !important;
    text-decoration: none !important;
}
/* Fix font issues for authors */
#leaderboard-container .paper-authors {
    font-family: Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}
/* Remove any underlines from links globally */
#leaderboard-container a {
    text-decoration: none !important;
}
"""

# Matches the <tbody id="tableBody"> placeholder when it holds only whitespace
# and/or HTML comments, regardless of how index.html happens to be indented.
_TABLE_BODY_PLACEHOLDER = re.compile(
    r'<tbody id="tableBody">\s*(?:<!--(?:(?!-->).)*-->\s*)*</tbody>',
    re.DOTALL,
)


def _build_leaderboard_html() -> str:
    """Return index.html with images, table rows and stylesheet inlined."""
    with open('index.html', 'r', encoding='utf-8') as f:
        html_content = f.read()

    with open('style.css', 'r', encoding='utf-8') as f:
        css_content = f.read()

    # Embed the two images as data URIs in place of their file references.
    diagram_b64 = encode_image_to_base64('mcp-bench.png')
    ranking_b64 = encode_image_to_base64('ranking.png')
    html_content = html_content.replace(
        'src="mcp-bench.png"',
        f'src="{diagram_b64}"',
    ).replace(
        'src="ranking.png"',
        f'src="{ranking_b64}"',
    )

    # Fill the table body placeholder with the generated rows. A callable
    # replacement is used so backslashes in the rows are never interpreted
    # as regex escapes.
    table_html = generate_table_html()
    html_content = _TABLE_BODY_PLACEHOLDER.sub(
        lambda _match: f'<tbody id="tableBody">{table_html}</tbody>',
        html_content,
    )

    # Inline the stylesheet last so its text is not touched by the
    # replacements above.
    return html_content.replace(
        '<link rel="stylesheet" href="style.css">',
        f'<style>{css_content}</style>',
    )


def create_gradio_app():
    """
    Gradio app to serve the static HTML leaderboard with embedded images.

    This is required for Hugging Face Spaces deployment.

    ``index.html`` and ``style.css`` are read from the current working
    directory, as are ``mcp-bench.png`` and ``ranking.png``.

    Returns:
        The ``gr.Blocks`` app containing a single HTML component with
        ``elem_id="leaderboard-container"``.

    Raises:
        OSError: If ``index.html`` or ``style.css`` cannot be opened.
    """
    combined_html = _build_leaderboard_html()

    with gr.Blocks(
        title="MCP-Bench Leaderboard",
        theme=gr.themes.Soft(),
        css=_GRADIO_CSS,
    ) as demo:
        gr.HTML(
            combined_html,
            elem_id="leaderboard-container",
        )

    return demo


# Script entry point: build the app and start the Gradio server.
if __name__ == "__main__":
    demo = create_gradio_app()
    demo.launch()