File size: 7,173 Bytes
7b6b43e
 
 
 
 
4966301
 
 
 
7b6b43e
4966301
 
 
 
 
 
 
 
 
 
0b3ed89
4966301
 
0b3ed89
4966301
 
 
 
 
 
7b6b43e
 
4966301
 
 
 
 
 
7b6b43e
 
4966301
a05136a
4966301
 
a05136a
4966301
 
 
 
 
3e04edb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966301
 
 
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
a399453
4966301
 
 
 
 
 
 
 
 
 
3e04edb
 
 
 
4966301
7b6b43e
 
 
4966301
 
 
 
 
 
 
0b3ed89
4966301
 
 
 
7b6b43e
 
 
4966301
 
 
 
7b6b43e
 
3e04edb
7b6b43e
4966301
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>MCP Benchmark Leaderboard</title>
    <meta name="description" content="Leaderboard for MCP-Bench, a benchmark of tool-using LLM agents on complex real-world tasks via MCP servers.">
    <!-- Open connections to the Google Fonts origins early: the stylesheet comes from
         fonts.googleapis.com and the font files from fonts.gstatic.com (CORS fetch, hence crossorigin). -->
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <!-- Stylesheet order is kept as-is so the cascade between style.css and Font Awesome does not change. -->
    <link rel="stylesheet" href="style.css">
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
    <!-- NOTE(review): third-party CDN stylesheet without Subresource Integrity; consider adding
         integrity/crossorigin attributes using the hash published for this exact version. -->
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
</head>
<body>
    <div class="container">
        <!-- Paper information: title, author list, affiliations and quick links.
             Icons are decorative (each link has visible text), so they are hidden from assistive tech. -->
        <header class="paper-header">
            <h1 class="paper-title">MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers</h1>
            <div class="paper-authors">
                <p>Zhenting Wang, Qi Chang, Hemani Patel, Shashank Biju, Cheng-En Wu, Quan Liu, Aolin Ding, Alireza Rezazadeh, Ankit Shah, Yujia Bao, Eugene Siow</p>
                <p class="affiliation">Accenture, UC Berkeley</p>
            </div>
            <div class="paper-links">
                <a href="https://github.com/Accenture/mcp-bench" class="paper-link" target="_blank" rel="noopener noreferrer">
                    <i class="fab fa-github" aria-hidden="true"></i> GitHub
                </a>
                <a href="https://arxiv.org/abs/2508.20453" class="paper-link" target="_blank" rel="noopener noreferrer">
                    <i class="fas fa-file-pdf" aria-hidden="true"></i> Paper
                </a>
                <!-- In-page jump to the section with id="leaderboard" -->
                <a href="#leaderboard" class="paper-link">
                    <i class="fas fa-trophy" aria-hidden="true"></i> Leaderboard
                </a>
            </div>
        </header>

        <!-- Architecture diagram followed by a one-paragraph description of the benchmark -->
        <section class="diagram-section">
            <img class="diagram-image" src="mcp-bench.png" alt="MCP-Bench Architecture Diagram">
            <p class="diagram-caption">
                MCP-Bench is a comprehensive evaluation framework designed to assess Large Language Models' (LLMs) capabilities in tool-use scenarios through the Model Context Protocol (MCP). This benchmark provides an end-to-end pipeline for evaluating how effectively different LLMs can discover, select, and utilize tools to solve real-world tasks.
            </p>
        </section>

        <!-- Ranking Chart -->
        <!-- <section class="chart-section">
            <h2 class="section-title">Performance Ranking</h2>
            <img src="ranking.png" alt="MCP Benchmark Ranking Chart" class="ranking-chart">
        </section> -->

        <!-- Leaderboard Header -->
        <section class="leaderboard-section" id="leaderboard">
            <h2 class="section-title">Detailed Results</h2>

        <!-- Search and sort controls. Element ids (searchInput, sortSelect, sortOrder) and option
             values are unchanged; presumably script.js binds to them - confirm against script.js.
             The search field gets an explicit accessible name because a placeholder is not a label,
             and the icon-only sort button gets type="button" plus an aria-label. -->
        <div class="controls">
            <div class="search-container">
                <i class="fas fa-search" aria-hidden="true"></i>
                <input type="text" id="searchInput" placeholder="Search models..." class="search-input" aria-label="Search models">
            </div>
            
            <div class="filter-container">
                <label for="sortSelect">Sort by:</label>
                <select id="sortSelect" class="sort-select">
                    <option value="overall_score">Overall Score</option>
                    <option value="valid_tool_name_rate">Valid Tool Name Rate</option>
                    <option value="schema_compliance">Schema Compliance</option>
                    <option value="execution_success">Execution Success</option>
                    <option value="task_fulfillment">Task Fulfillment</option>
                    <option value="information_grounding">Information Grounding</option>
                    <option value="tool_appropriateness">Tool Appropriateness</option>
                    <option value="parameter_accuracy">Parameter Accuracy</option>
                    <option value="dependency_awareness">Dependency Awareness</option>
                    <option value="parallelism_efficiency">Parallelism and Efficiency</option>
                </select>
                
                <button type="button" id="sortOrder" class="sort-btn" title="Toggle sort order" aria-label="Toggle sort order">
                    <i class="fas fa-sort-amount-down" aria-hidden="true"></i>
                </button>
            </div>
        </div>

        <!-- Leaderboard table. Every header cell carries scope="col" so assistive technology can
             associate each data cell with its column. The body starts empty and is filled at runtime. -->
        <div class="table-container">
            <table class="leaderboard-table" id="leaderboardTable">
                <thead>
                    <tr>
                        <th class="model-col" scope="col">
                            <strong>Model</strong>
                        </th>
                        <th class="score-col" scope="col">
                            <strong>Overall Score</strong>
                        </th>
                        <th class="metric-col" scope="col">
                            Valid Tool<br>Name Rate
                        </th>
                        <th class="metric-col" scope="col">
                            Schema<br>Compliance
                        </th>
                        <th class="metric-col" scope="col">
                            Execution<br>Success
                        </th>
                        <th class="metric-col" scope="col">
                            Task<br>Fulfillment
                        </th>
                        <th class="metric-col" scope="col">
                            Information<br>Grounding
                        </th>
                        <th class="metric-col" scope="col">
                            Tool<br>Appropriateness
                        </th>
                        <th class="metric-col" scope="col">
                            Parameter<br>Accuracy
                        </th>
                        <th class="metric-col" scope="col">
                            Dependency<br>Awareness
                        </th>
                        <th class="metric-col" scope="col">
                            Parallelism<br>and Efficiency
                        </th>
                    </tr>
                </thead>
                <tbody id="tableBody">
                    <!-- Table rows will be generated by JavaScript -->
                </tbody>
            </table>
        </div>

        <!-- Loading message for the leaderboard. role="status" exposes it as a polite live region;
             the spinner icon is decorative. Presumably script.js hides #loading once data is rendered - confirm. -->
        <div class="loading" id="loading" role="status">
            <i class="fas fa-spinner fa-spin" aria-hidden="true"></i>
            Loading leaderboard data...
        </div>

        </section>

        <!-- Citation section: BibTeX entry plus a copy button.
             The entry key uses 2025 to agree with year={2025} and the arXiv identifier (2508.x).
             The inline onclick is kept because copyCitation() is defined outside this file's markup
             (presumably in script.js) and no other binding is visible here. -->
        <section class="citation-section">
            <h2 class="section-title">Citation</h2>
            <div class="citation-box">
                <pre class="citation-text">@article{wang2025mcpbench,
  title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},
  author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},
  journal={arXiv preprint arXiv:2508.20453},
  year={2025}
}</pre>
                <button type="button" class="copy-citation-btn" onclick="copyCitation()">
                    <i class="fas fa-copy" aria-hidden="true"></i> Copy Citation
                </button>
            </div>
        </section>

        <!-- Page footer. The #lastUpdated span is empty in the markup; presumably script.js fills it - confirm. -->
        <footer class="footer">
            <p>Last updated: <span id="lastUpdated"></span></p>
            <p>Data source: MCP-Bench Results (arXiv: 2508.20453)</p>
        </footer>
    </div>

    <!-- Page behaviour is loaded from script.js (not visible here). Presumably it fills #tableBody
         and #lastUpdated, wires the search/sort controls, and defines copyCitation() used by the
         citation button - confirm against script.js. Placed at the end of body so the markup above
         is parsed before the script runs. -->
    <script src="script.js"></script>
</body>
</html>