File size: 8,034 Bytes
7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 7b6b43e 4966301 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Encoding declaration stays first so the parser sees it before any other content. -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>MCP Benchmark Leaderboard</title>
  <meta name="description" content="Leaderboard and paper summary for MCP-Bench, a benchmark for tool-using LLM agents on complex real-world tasks via MCP servers.">
  <!-- Warm up connections to the Google Fonts origins; font files are fetched with CORS, hence crossorigin on the second hint. -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <!-- Stylesheets, in the original load order: local styles, Inter font, Font Awesome icons. -->
  <link rel="stylesheet" href="style.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
</head>
<body>
<div class="container">
<!-- Paper information: title, author list, affiliations, and quick links (repository, paper, in-page leaderboard anchor). -->
<header class="paper-header">
  <h1 class="paper-title">MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers</h1>
  <div class="paper-authors">
    <p>Zhenting Wang, Qi Chang, Hemani Patel, Shashank Biju, Cheng-En Wu, Quan Liu, Aolin Ding, Alireza Rezazadeh, Ankit Shah, Yujia Bao, Eugene Siow</p>
    <p class="affiliation">Accenture, UC Berkeley</p>
  </div>
  <!-- Each link already carries visible text, so the icons are decorative and hidden from assistive technology. -->
  <div class="paper-links">
    <a class="paper-link" href="https://github.com/Accenture/mcp-bench">
      <i class="fab fa-github" aria-hidden="true"></i> GitHub
    </a>
    <a class="paper-link" href="https://arxiv.org/abs/2508.20453">
      <i class="fas fa-file-pdf" aria-hidden="true"></i> Paper
    </a>
    <a class="paper-link" href="#leaderboard">
      <i class="fas fa-trophy" aria-hidden="true"></i> Leaderboard
    </a>
  </div>
</header>
<!-- Architecture diagram, followed by a one-paragraph description of the benchmark. -->
<section class="diagram-section">
  <img class="diagram-image" src="mcp-bench.png" alt="MCP-Bench Architecture Diagram">
  <p class="diagram-caption">
    MCP-Bench is a comprehensive evaluation framework designed to assess Large Language Models' (LLMs) capabilities in tool-use scenarios through the Model Context Protocol (MCP). This benchmark provides an end-to-end pipeline for evaluating how effectively different LLMs can discover, select, and utilize tools to solve real-world tasks.
  </p>
</section>
<!-- Static ranking image shown above the interactive results table. -->
<section class="chart-section">
  <h2 class="section-title">Performance Ranking</h2>
  <img class="ranking-chart" src="ranking.png" alt="MCP Benchmark Ranking Chart">
</section>
<!-- Leaderboard Header -->
<section class="leaderboard-section" id="leaderboard">
<h2 class="section-title">Detailed Results</h2>
<!-- Search and sort controls. Element ids are unchanged; they are presumably looked up by script.js (confirm there). -->
<div class="controls">
  <div class="search-container">
    <i class="fas fa-search" aria-hidden="true"></i>
    <!-- A placeholder is not a label, so aria-label supplies the accessible name. -->
    <input class="search-input" id="searchInput" type="text" placeholder="Search models..." aria-label="Search models">
  </div>
  <div class="filter-container">
    <label for="sortSelect">Sort by:</label>
    <!--
      NOTE(review): apart from overall_score, these option values (valid_tool_schema, compliance,
      task_success, ...) do not match the data-column keys on the table headers in this file
      (valid_tool_name_rate, schema_compliance, execution_success, ...). Confirm against script.js
      which set of keys the data actually uses before changing either side.
    -->
    <select class="sort-select" id="sortSelect">
      <option value="overall_score">Overall Score</option>
      <option value="valid_tool_schema">Valid Tool Schema</option>
      <option value="compliance">Compliance</option>
      <option value="task_success">Task Success</option>
      <option value="schema_understanding">Schema Understanding</option>
      <option value="task_completion">Task Completion</option>
      <option value="tool_usage">Tool Usage</option>
      <option value="planning_effectiveness">Planning Effectiveness</option>
    </select>
    <!-- Icon-only button: explicit type, plus aria-label because title alone is not a reliable accessible name. -->
    <button class="sort-btn" id="sortOrder" type="button" title="Toggle sort order" aria-label="Toggle sort order">
      <i class="fas fa-sort-amount-down" aria-hidden="true"></i>
    </button>
  </div>
</div>
<!--
  Results table. Every header cell is a column header (scope="col") and carries a data-column key;
  the sort icons are decorative and hidden from assistive technology. The body is empty in the markup.
-->
<div class="table-container">
  <table class="leaderboard-table" id="leaderboardTable">
    <thead>
      <tr>
        <th class="model-col sortable" data-column="name" scope="col">
          <strong>Model</strong>
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="score-col sortable" data-column="overall_score" scope="col">
          <strong>Overall Score</strong>
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="valid_tool_name_rate" scope="col">
          Valid Tool<br>Name Rate
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="schema_compliance" scope="col">
          Schema<br>Compliance
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="execution_success" scope="col">
          Execution<br>Success
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="task_fulfillment" scope="col">
          Task<br>Fulfillment
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="information_grounding" scope="col">
          Information<br>Grounding
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="tool_appropriateness" scope="col">
          Tool<br>Appropriateness
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="parameter_accuracy" scope="col">
          Parameter<br>Accuracy
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="dependency_awareness" scope="col">
          Dependency<br>Awareness
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
        <th class="metric-col sortable" data-column="parallelism_efficiency" scope="col">
          Parallelism<br>and Efficiency
          <i class="fas fa-sort sort-icon" aria-hidden="true"></i>
        </th>
      </tr>
    </thead>
    <tbody id="tableBody">
      <!-- Table rows will be generated by JavaScript -->
    </tbody>
  </table>
</div>
<!-- Loading indicator. role="status" exposes it as a polite live region; the spinner icon is decorative. -->
<div class="loading" id="loading" role="status">
  <i class="fas fa-spinner fa-spin" aria-hidden="true"></i>
  Loading leaderboard data...
</div>
</section>
<!--
  Citation section: BibTeX entry plus a copy button.
  The entry year and key use 2025 to agree with the arXiv identifier 2508.20453 (a 2025-08 id).
  Whitespace inside <pre> is rendered verbatim, so the entry lines are intentionally not indented.
-->
<section class="citation-section">
  <h2 class="section-title">Citation</h2>
  <div class="citation-box">
    <pre class="citation-text">@article{wang2025mcpbench,
title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},
author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},
journal={arXiv preprint arXiv:2508.20453},
year={2025}
}</pre>
    <!-- copyCitation() is not defined in this file; presumably provided by script.js. The inline handler is kept so the existing wiring keeps working. -->
    <button class="copy-citation-btn" type="button" onclick="copyCitation()">
      <i class="fas fa-copy" aria-hidden="true"></i> Copy Citation
    </button>
  </div>
</section>
<!-- Page footer: an update stamp (the span is empty in the markup) and the data attribution line. -->
<footer class="footer">
  <p>
    Last updated:
    <span id="lastUpdated"></span>
  </p>
  <p>
    Data source: MCP-Bench Results (ArXiv: 2508.20453)
  </p>
</footer>
</div>
<!-- Page script, loaded after all markup above. Presumably fills #tableBody and #lastUpdated and defines copyCitation() (confirm in script.js). -->
<script src="script.js"></script>
</body>
</html>