# mcp-bench / app.py
# Page metadata: ztwang · "Upload 2 files" · commit 54dfcdf (verified) · 11.7 kB
import base64
import html
import os
import re
from pathlib import Path

import gradio as gr
# Maps a lower-cased file suffix to the MIME type used in the data URI.
_MIME_TYPES_BY_SUFFIX = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
}


def encode_image_to_base64(image_path):
    """Return the image at *image_path* as a ``data:`` URI for inline HTML.

    Args:
        image_path: Path (``str`` or ``os.PathLike``) of the image file.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``. The MIME type
        is chosen from the file suffix (case-insensitive) and falls back to
        ``image/png`` for unknown suffixes. An empty string is returned when
        the path does not refer to a regular file (missing, or a directory).
    """
    path = Path(image_path)

    # A bare existence check would also accept directories, which then fail
    # inside open(); only regular files can be embedded.
    if not path.is_file():
        return ""

    try:
        payload = path.read_bytes()
    except FileNotFoundError:
        # The file disappeared between the check and the read.
        return ""

    encoded = base64.b64encode(payload).decode("ascii")
    mime_type = _MIME_TYPES_BY_SUFFIX.get(path.suffix.lower(), 'image/png')
    return f"data:{mime_type};base64,{encoded}"
# Built-in leaderboard results, used when the caller supplies no data.
_DEFAULT_MODELS = (
    {"name": "gpt-5", "overall_score": 0.749, "valid_tool_name_rate": 100.0, "schema_compliance": 99.3, "execution_success": 99.1, "task_fulfillment": 0.677, "information_grounding": 0.828, "tool_appropriateness": 0.767, "parameter_accuracy": 0.749, "dependency_awareness": 0.649, "parallelism_efficiency": 0.339},
    {"name": "o3", "overall_score": 0.715, "valid_tool_name_rate": 99.3, "schema_compliance": 99.9, "execution_success": 97.1, "task_fulfillment": 0.641, "information_grounding": 0.706, "tool_appropriateness": 0.724, "parameter_accuracy": 0.726, "dependency_awareness": 0.592, "parallelism_efficiency": 0.359},
    {"name": "gpt-oss-120b", "overall_score": 0.692, "valid_tool_name_rate": 97.7, "schema_compliance": 98.8, "execution_success": 94.0, "task_fulfillment": 0.636, "information_grounding": 0.705, "tool_appropriateness": 0.691, "parameter_accuracy": 0.661, "dependency_awareness": 0.576, "parallelism_efficiency": 0.329},
    {"name": "gemini-2.5-pro", "overall_score": 0.690, "valid_tool_name_rate": 99.4, "schema_compliance": 99.6, "execution_success": 96.9, "task_fulfillment": 0.562, "information_grounding": 0.725, "tool_appropriateness": 0.717, "parameter_accuracy": 0.670, "dependency_awareness": 0.541, "parallelism_efficiency": 0.329},
    {"name": "claude-sonnet-4", "overall_score": 0.681, "valid_tool_name_rate": 100.0, "schema_compliance": 99.8, "execution_success": 98.8, "task_fulfillment": 0.554, "information_grounding": 0.676, "tool_appropriateness": 0.689, "parameter_accuracy": 0.671, "dependency_awareness": 0.541, "parallelism_efficiency": 0.328},
    {"name": "qwen3-235b-a22b-2507", "overall_score": 0.678, "valid_tool_name_rate": 99.1, "schema_compliance": 99.3, "execution_success": 94.8, "task_fulfillment": 0.549, "information_grounding": 0.625, "tool_appropriateness": 0.688, "parameter_accuracy": 0.712, "dependency_awareness": 0.542, "parallelism_efficiency": 0.355},
    {"name": "glm-4.5", "overall_score": 0.668, "valid_tool_name_rate": 99.7, "schema_compliance": 99.7, "execution_success": 97.4, "task_fulfillment": 0.525, "information_grounding": 0.682, "tool_appropriateness": 0.680, "parameter_accuracy": 0.661, "dependency_awareness": 0.523, "parallelism_efficiency": 0.297},
    {"name": "gpt-oss-20b", "overall_score": 0.654, "valid_tool_name_rate": 98.8, "schema_compliance": 99.1, "execution_success": 93.6, "task_fulfillment": 0.547, "information_grounding": 0.623, "tool_appropriateness": 0.661, "parameter_accuracy": 0.638, "dependency_awareness": 0.509, "parallelism_efficiency": 0.309},
    {"name": "kimi-k2", "overall_score": 0.629, "valid_tool_name_rate": 98.8, "schema_compliance": 98.1, "execution_success": 94.5, "task_fulfillment": 0.502, "information_grounding": 0.577, "tool_appropriateness": 0.631, "parameter_accuracy": 0.623, "dependency_awareness": 0.448, "parallelism_efficiency": 0.307},
    {"name": "qwen3-30b-a3b-instruct-2507", "overall_score": 0.627, "valid_tool_name_rate": 99.2, "schema_compliance": 95.4, "execution_success": 94.4, "task_fulfillment": 0.459, "information_grounding": 0.536, "tool_appropriateness": 0.658, "parameter_accuracy": 0.646, "dependency_awareness": 0.471, "parallelism_efficiency": 0.318},
    {"name": "gemini-2.5-flash-lite", "overall_score": 0.598, "valid_tool_name_rate": 98.7, "schema_compliance": 98.8, "execution_success": 91.1, "task_fulfillment": 0.446, "information_grounding": 0.569, "tool_appropriateness": 0.629, "parameter_accuracy": 0.564, "dependency_awareness": 0.423, "parallelism_efficiency": 0.262},
    {"name": "gpt-4o", "overall_score": 0.595, "valid_tool_name_rate": 96.7, "schema_compliance": 87.6, "execution_success": 85.3, "task_fulfillment": 0.477, "information_grounding": 0.519, "tool_appropriateness": 0.588, "parameter_accuracy": 0.551, "dependency_awareness": 0.423, "parallelism_efficiency": 0.253},
    {"name": "gemma-3-27b-it", "overall_score": 0.582, "valid_tool_name_rate": 98.4, "schema_compliance": 81.6, "execution_success": 85.5, "task_fulfillment": 0.396, "information_grounding": 0.495, "tool_appropriateness": 0.588, "parameter_accuracy": 0.530, "dependency_awareness": 0.408, "parallelism_efficiency": 0.251},
    {"name": "llama-3-3-70b-instruct", "overall_score": 0.558, "valid_tool_name_rate": 99.5, "schema_compliance": 93.1, "execution_success": 91.5, "task_fulfillment": 0.366, "information_grounding": 0.476, "tool_appropriateness": 0.554, "parameter_accuracy": 0.486, "dependency_awareness": 0.359, "parallelism_efficiency": 0.244},
    {"name": "gpt-4o-mini", "overall_score": 0.557, "valid_tool_name_rate": 95.5, "schema_compliance": 86.5, "execution_success": 84.0, "task_fulfillment": 0.426, "information_grounding": 0.453, "tool_appropriateness": 0.556, "parameter_accuracy": 0.499, "dependency_awareness": 0.359, "parallelism_efficiency": 0.230},
    {"name": "mistral-small-2503", "overall_score": 0.530, "valid_tool_name_rate": 92.0, "schema_compliance": 95.6, "execution_success": 87.2, "task_fulfillment": 0.344, "information_grounding": 0.438, "tool_appropriateness": 0.528, "parameter_accuracy": 0.462, "dependency_awareness": 0.345, "parallelism_efficiency": 0.220},
    {"name": "llama-3-1-70b-instruct", "overall_score": 0.510, "valid_tool_name_rate": 99.2, "schema_compliance": 90.5, "execution_success": 92.5, "task_fulfillment": 0.314, "information_grounding": 0.432, "tool_appropriateness": 0.523, "parameter_accuracy": 0.433, "dependency_awareness": 0.303, "parallelism_efficiency": 0.190},
    {"name": "nova-micro-v1", "overall_score": 0.508, "valid_tool_name_rate": 96.0, "schema_compliance": 93.1, "execution_success": 87.8, "task_fulfillment": 0.339, "information_grounding": 0.419, "tool_appropriateness": 0.504, "parameter_accuracy": 0.428, "dependency_awareness": 0.315, "parallelism_efficiency": 0.212},
    {"name": "llama-3-2-90b-vision-instruct", "overall_score": 0.495, "valid_tool_name_rate": 99.6, "schema_compliance": 85.0, "execution_success": 90.9, "task_fulfillment": 0.293, "information_grounding": 0.444, "tool_appropriateness": 0.515, "parameter_accuracy": 0.427, "dependency_awareness": 0.267, "parallelism_efficiency": 0.173},
    {"name": "llama-3-1-8b-instruct", "overall_score": 0.428, "valid_tool_name_rate": 96.1, "schema_compliance": 89.4, "execution_success": 90.9, "task_fulfillment": 0.261, "information_grounding": 0.295, "tool_appropriateness": 0.352, "parameter_accuracy": 0.310, "dependency_awareness": 0.221, "parallelism_efficiency": 0.141},
)

# Column order after the name and overall score. The first group is rendered
# as a percentage with one decimal, the second as a three-decimal score.
_PERCENT_METRICS = (
    "valid_tool_name_rate",
    "schema_compliance",
    "execution_success",
)
_SCORE_METRICS = (
    "task_fulfillment",
    "information_grounding",
    "tool_appropriateness",
    "parameter_accuracy",
    "dependency_awareness",
    "parallelism_efficiency",
)


def _render_model_row(model):
    """Render one model record as a ``<tr>`` element (lines joined by newlines)."""
    # Escape the name so a record containing markup cannot break the table.
    name = html.escape(str(model["name"]))
    cells = [
        f'<td class="model-col"><span class="model-name">{name}</span></td>',
        f'<td class="score-col"><span class="score">{model["overall_score"]:.3f}</span></td>',
    ]
    cells.extend(
        f'<td class="metric-col">{model[key]:.1f}%</td>' for key in _PERCENT_METRICS
    )
    cells.extend(
        f'<td class="metric-col">{model[key]:.3f}</td>' for key in _SCORE_METRICS
    )
    return "\n".join(["<tr>", *cells, "</tr>"])


def generate_table_html(models=None):
    """Generate the leaderboard ``<tr>`` rows as a single HTML string.

    Args:
        models: Optional iterable of dicts, each holding ``name``,
            ``overall_score`` and the nine metric keys listed in
            ``_PERCENT_METRICS`` and ``_SCORE_METRICS``. When omitted, the
            built-in results in ``_DEFAULT_MODELS`` are used.

    Returns:
        The rows joined by newlines, ordered by ``overall_score`` descending
        (ties keep their input order). An empty input yields ``""``.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    if models is None:
        models = _DEFAULT_MODELS

    # sorted() builds a new list, so the caller's sequence is left untouched.
    ranked = sorted(models, key=lambda entry: entry['overall_score'], reverse=True)
    return '\n'.join(_render_model_row(entry) for entry in ranked)
# Matches the empty table-body placeholder in index.html. Whitespace around
# the comment is matched loosely so re-indenting the HTML file does not
# silently disable the row injection.
_TABLE_BODY_PLACEHOLDER = re.compile(
    r'<tbody id="tableBody">\s*'
    r'<!-- Table rows will be generated by JavaScript -->\s*'
    r'</tbody>'
)

# Extra CSS handed to gr.Blocks, scoped to the leaderboard container.
_LEADERBOARD_CSS = """
.gradio-container { padding: 0 !important; }
#leaderboard-container {
width: 100% !important;
max-width: none !important;
margin: 0 !important;
padding: 0 !important;
}
#leaderboard-container * {
box-sizing: border-box;
}
/* Force all buttons to have same blue color and remove underlines */
#leaderboard-container .paper-link {
color: white !important;
background-color: #4285F4 !important;
text-decoration: none !important;
}
#leaderboard-container .paper-link:hover {
color: white !important;
background-color: #3367D6 !important;
text-decoration: none !important;
}
#leaderboard-container .paper-link:focus,
#leaderboard-container .paper-link:visited,
#leaderboard-container .paper-link:active {
color: white !important;
background-color: #4285F4 !important;
text-decoration: none !important;
}
/* Fix font issues for authors */
#leaderboard-container .paper-authors {
font-family: Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}
/* Remove any underlines from links globally */
#leaderboard-container a {
text-decoration: none !important;
}
"""


def create_gradio_app():
    """Build the Gradio app that serves the static leaderboard page.

    Reads ``index.html`` and ``style.css`` from the current working
    directory, then produces one self-contained HTML document by:

    * replacing the ``src`` of ``mcp-bench.png`` and ``ranking.png`` with
      base64 ``data:`` URIs (an empty ``src`` if an image is missing),
    * filling the ``tableBody`` placeholder with pre-rendered rows,
    * inlining the stylesheet in place of the ``<link>`` tag.

    Returns:
        The ``gr.Blocks`` instance wrapping the combined HTML.

    Raises:
        OSError: If ``index.html`` or ``style.css`` cannot be read.
    """
    html_content = Path('index.html').read_text(encoding='utf-8')
    css_content = Path('style.css').read_text(encoding='utf-8')

    # Embed the images so the page does not depend on separate static files.
    for image_name in ('mcp-bench.png', 'ranking.png'):
        data_uri = encode_image_to_base64(image_name)
        html_content = html_content.replace(
            f'src="{image_name}"',
            f'src="{data_uri}"',
        )

    # Rows are generated here rather than by client-side JavaScript.
    table_html = generate_table_html()
    filled_body = f'<tbody id="tableBody">{table_html}</tbody>'

    # A function replacement keeps re.sub from interpreting backslashes or
    # group references that might appear in the generated markup.
    combined_html = _TABLE_BODY_PLACEHOLDER.sub(
        lambda _match: filled_body,
        html_content,
    ).replace(
        '<link rel="stylesheet" href="style.css">',
        f'<style>{css_content}</style>',
    )

    # The HTML keeps its own small script for citation copy and date update.
    with gr.Blocks(
        title="MCP-Bench Leaderboard",
        theme=gr.themes.Soft(),
        css=_LEADERBOARD_CSS,
    ) as demo:
        gr.HTML(
            combined_html,
            elem_id="leaderboard-container",
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the Blocks app, then start Gradio's server.
    demo = create_gradio_app()
    demo.launch()