<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MCP Benchmark Leaderboard</title>
    <link rel="stylesheet" href="style.css">
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
</head>
<body>
    <div class="container">
        <!-- Paper Information -->
        <header class="paper-header">
            <h1 class="paper-title">MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers</h1>
            <div class="paper-authors">
                <p>Zhenting Wang, Qi Chang, Hemani Patel, Shashank Biju, Cheng-En Wu, Quan Liu, Aolin Ding, Alireza Rezazadeh, Ankit Shah, Yujia Bao, Eugene Siow</p>
                <p class="affiliation">Accenture, UC Berkeley</p>
            </div>
            <div class="paper-links">
                <a href="https://github.com/Accenture/mcp-bench" class="paper-link">
                    <i class="fab fa-github"></i> GitHub
                </a>
                <a href="https://arxiv.org/abs/2508.20453" class="paper-link">
                    <i class="fas fa-file-pdf"></i> Paper
                </a>
                <a href="#leaderboard" class="paper-link">
                    <i class="fas fa-trophy"></i> Leaderboard
                </a>
            </div>
        </header>

        <!-- MCP Diagram -->
        <section class="diagram-section">
            <img src="mcp-bench.png" alt="MCP-Bench Architecture Diagram" class="diagram-image">
            <p class="diagram-caption">
                MCP-Bench is a comprehensive evaluation framework for assessing the tool-use capabilities of Large Language Models (LLMs) through the Model Context Protocol (MCP). The benchmark provides an end-to-end pipeline for evaluating how effectively different LLMs discover, select, and use tools to solve real-world tasks.
            </p>
        </section>

        <!-- Ranking Chart -->
        <section class="chart-section">
            <h2 class="section-title">Performance Ranking</h2>
            <img src="ranking.png" alt="MCP Benchmark Ranking Chart" class="ranking-chart">
        </section>

        <!-- Leaderboard Header -->
        <section class="leaderboard-section" id="leaderboard">
            <h2 class="section-title">Detailed Results</h2>

        <div class="controls">
            <div class="search-container">
                <i class="fas fa-search"></i>
                <input type="text" id="searchInput" placeholder="Search models..." class="search-input">
            </div>
            
            <div class="filter-container">
                <label for="sortSelect">Sort by:</label>
                <select id="sortSelect" class="sort-select">
                    <option value="overall_score">Overall Score</option>
                    <option value="valid_tool_schema">Valid Tool Schema</option>
                    <option value="compliance">Compliance</option>
                    <option value="task_success">Task Success</option>
                    <option value="schema_understanding">Schema Understanding</option>
                    <option value="task_completion">Task Completion</option>
                    <option value="tool_usage">Tool Usage</option>
                    <option value="planning_effectiveness">Planning Effectiveness</option>
                </select>
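                <!--
                    Assumption: script.js maps each option value above to the corresponding
                    table column key when sorting; the exact mapping is defined in script.js
                    and is not verified here.
                -->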
                
                <button id="sortOrder" class="sort-btn" title="Toggle sort order">
                    <i class="fas fa-sort-amount-down"></i>
                </button>
            </div>
        </div>

        <div class="table-container">
            <table class="leaderboard-table" id="leaderboardTable">
                <thead>
                    <tr>
                        <th class="model-col sortable" data-column="name">
                            <strong>Model</strong>
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="score-col sortable" data-column="overall_score">
                            <strong>Overall Score</strong>
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="valid_tool_name_rate">
                            Valid Tool<br>Name Rate
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="schema_compliance">
                            Schema<br>Compliance
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="execution_success">
                            Execution<br>Success
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="task_fulfillment">
                            Task<br>Fulfillment
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="information_grounding">
                            Information<br>Grounding
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="tool_appropriateness">
                            Tool<br>Appropriateness
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="parameter_accuracy">
                            Parameter<br>Accuracy
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="dependency_awareness">
                            Dependency<br>Awareness
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                        <th class="metric-col sortable" data-column="parallelism_efficiency">
                            Parallelism<br>and Efficiency
                            <i class="fas fa-sort sort-icon"></i>
                        </th>
                    </tr>
                </thead>
                <tbody id="tableBody">
                    <!-- Table rows will be generated by JavaScript -->
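                    <!--
                        Illustrative sketch of a generated row (hypothetical model name and
                        placeholder values, shown only to document the expected cell layout;
                        the real markup comes from script.js):

                        <tr>
                            <td class="model-col">Example Model</td>
                            <td class="score-col">0.00</td>
                            <td class="metric-col">0.00</td>
                            ... one metric-col cell for each remaining metric column ...
                        </tr>
                    -->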
                </tbody>
            </table>
        </div>

        <div class="loading" id="loading">
            <i class="fas fa-spinner fa-spin"></i>
            Loading leaderboard data...
        </div>

        </section>

        <!-- Citation Section -->
        <section class="citation-section">
            <h2 class="section-title">Citation</h2>
            <div class="citation-box">
                <pre class="citation-text">@article{wang2024mcpbench,
  title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},
  author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},
  journal={arXiv preprint arXiv:2508.20453},
  year={2025}
}</pre>
                <button class="copy-citation-btn" onclick="copyCitation()">
                    <i class="fas fa-copy"></i> Copy Citation
                </button>
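                <!--
                    copyCitation() is expected to be defined in script.js. A minimal sketch,
                    assuming the Clipboard API is available (an illustration, not the actual
                    implementation):

                    function copyCitation() {
                        const citation = document.querySelector('.citation-text').innerText;
                        navigator.clipboard.writeText(citation);
                    }
                -->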
            </div>
        </section>

        <footer class="footer">
            <p>Last updated: <span id="lastUpdated"></span></p>
            <p>Data source: MCP-Bench Results (arXiv:2508.20453)</p>
        </footer>
    </div>
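    <!--
        script.js is assumed to handle: loading the benchmark results, rendering rows into
        #tableBody, hiding the #loading indicator, wiring the search box and sort controls,
        filling #lastUpdated, and defining copyCitation(). These responsibilities are
        inferred from the markup above rather than taken from script.js itself.
    -->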

    <script src="script.js"></script>
</body>
</html>