{
  "evaluation_info": {
    "title": "Comprehensive MCP Capability Evaluation - 14 Models",
    "date": "2025-10-05",
    "total_models": 14,
    "framework": "llama-cpp-python 0.3.16",
    "test_scenarios": 3,
    "max_score_per_test": 10
  },
  "test_scenarios": [
    {
      "id": 1,
      "description": "Get the current game state",
      "expected_tool": "get_game_state",
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    },
    {
      "id": 2,
      "description": "Move infantry unit to coordinates 150,200",
      "expected_tool": "move_units",
      "expected_coordinates": [150, 200],
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    },
    {
      "id": 3,
      "description": "Attack enemy tank at location 300,150",
      "expected_tool": "attack_unit",
      "expected_coordinates": [300, 150],
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    }
  ],
  "results": [
    {
      "rank": 1,
      "model_name": "Qwen2.5-Coder-1.5B-Q4",
      "filename": "qwen2.5-coder-1.5b-instruct-q4_0.gguf",
      "avg_score": 9.7,
      "avg_time": 4.12,
      "size_mb": 1017,
      "efficiency": 2.34,
      "status": "champion",
      "notes": "Exceptional MCP performance. Wraps JSON in markdown code blocks. 10/10 on tests 2 and 3, 9/10 on test 1.",
      "detailed_scores": [9, 10, 10]
    },
    {
      "rank": 2,
      "model_name": "Qwen2.5-Coder-0.5B",
      "filename": "qwen2.5-coder-0.5b-instruct-q4_0.gguf",
      "avg_score": 4.3,
      "avg_time": 2.08,
      "size_mb": 409,
      "efficiency": 2.08,
      "status": "previous_champion",
      "notes": "Best budget option. Good balance of size and performance."
    },
    {
      "rank": 3,
      "model_name": "Qwen3-0.6B",
      "filename": "Qwen3-0.6B-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 3.98,
      "size_mb": 610,
      "efficiency": 0.92,
      "status": "functional"
    },
    {
      "rank": 4,
      "model_name": "Gemma-3-270M",
      "filename": "gemma-3-270m-it-qat-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 2.29,
      "size_mb": 428,
      "efficiency": 1.60,
      "status": "functional",
      "notes": "Ultra-lightweight champion. Excellent efficiency for its tiny size."
    },
    {
      "rank": 5,
      "model_name": "MCPR-L-3B-Exa-Q8",
      "filename": "mcprl-3b-exa.Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 17.42,
      "size_mb": 3133,
      "efficiency": 0.21,
      "status": "functional",
      "notes": "MCP-specialized but slow. Large size, poor efficiency."
    },
    {
      "rank": 6,
      "model_name": "Gemma-3n-E2B-it-Q8",
      "filename": "google_gemma-3n-E2B-it-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 14.80,
      "size_mb": 4566,
      "efficiency": 0.25,
      "status": "functional",
| "notes": "Largest model tested. Poor efficiency despite high quantization." | |
    },
    {
      "rank": 7,
      "model_name": "Qwen3-1.7B",
      "filename": "Qwen3-1.7B-Q4_0.gguf",
      "avg_score": 3.7,
      "avg_time": 6.24,
      "size_mb": 1008,
      "efficiency": 0.59,
      "status": "functional"
    },
    {
      "rank": 8,
      "model_name": "Qwen2.5-0.5B",
      "filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
      "avg_score": 2.7,
      "avg_time": 1.17,
      "size_mb": 409,
      "efficiency": 2.28,
      "status": "functional",
      "notes": "Fast but limited MCP capability. General-purpose model."
    },
    {
      "rank": 9,
      "model_name": "Gemma-3n-E2B-it-IQ2",
      "filename": "gemma-3n-E2B-it-UD-IQ2_XXS.gguf",
      "avg_score": 2.3,
      "avg_time": 14.11,
      "size_mb": 1958,
      "efficiency": 0.17,
      "status": "functional",
      "notes": "Heavy quantization impacts quality."
    },
    {
      "rank": 10,
      "model_name": "Llama-Breeze2-3B-Q2",
      "filename": "Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf",
      "avg_score": 1.3,
      "avg_time": 11.39,
      "size_mb": 1424,
      "efficiency": 0.12,
      "status": "functional",
      "notes": "Poor performance. Q2 quantization too aggressive."
    },
    {
      "rank": 11,
      "model_name": "Yi-Coder-1.5B-Q4",
      "filename": "Yi-Coder-1.5B.Q4_0.gguf",
      "avg_score": 0.0,
      "avg_time": 11.64,
      "size_mb": 826,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Prompt format incompatibility. Returns system prompt instead of generating responses."
    },
    {
      "rank": 12,
      "model_name": "MCP-Instruct-v1-Q4",
      "filename": "mcp-instruct-v1.Q4_K_M.gguf",
      "avg_score": 0.0,
      "avg_time": 0.0,
      "size_mb": 697,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Technical error: llama_decode returned -1"
    },
    {
      "rank": 13,
      "model_name": "MCPR-L-3B-Exa-Q2",
      "filename": "mcprl-3b-exa.Q2_K.gguf",
      "avg_score": 0.0,
      "avg_time": 10.63,
      "size_mb": 1216,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Produces gibberish output. Q2 quantization too aggressive for this architecture."
    },
    {
      "rank": 14,
      "model_name": "MCP-Instruct-v1-Q8",
      "filename": "mcp-instruct-v1.Q8_0.gguf",
      "avg_score": 0.0,
      "avg_time": 0.0,
      "size_mb": 1465,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Technical error: llama_decode returned -1. Same issue as Q4 version."
    }
  ],
  "key_insights": {
    "champion": {
      "model": "Qwen2.5-Coder-1.5B-Q4",
      "score": 9.7,
      "reason": "Code-specialized models excel at structured JSON generation. Near-perfect MCP capability."
    },
    "scaling_effect": {
      "observation": "Increasing parameters from 0.5B to 1.5B more than doubled MCP score (4.3 → 9.7)",
      "conclusion": "Parameter scaling works exceptionally well for code-specialized models"
    },
    "mcp_specialized_disappointment": {
      "observation": "MCP-Instruct models completely failed. MCPR-L models scored only 3.7/10 at best.",
      "conclusion": "MCP specialization alone is insufficient. Code training provides a better foundation."
    },
    "quantization_impact": {
      "observation": "Q2 quantization caused failures or poor performance. Q4 and Q8 worked well.",
      "conclusion": "Avoid Q2 quantization for MCP tasks. Q4 offers the best size/quality tradeoff."
    },
    "size_efficiency": {
| "observation": "Gemma-3-270M (428MB) matched 3133MB model performance", | |
| "conclusion": "Larger models don't guarantee better MCP performance" | |
| } | |
| }, | |
| "recommendations": { | |
| "primary": { | |
| "model": "Qwen2.5-Coder-1.5B-Q4", | |
| "use_case": "Production deployments requiring high-quality MCP", | |
| "requirement": "JSON extraction logic to handle markdown code blocks" | |
| }, | |
| "budget": { | |
| "model": "Qwen2.5-Coder-0.5B", | |
| "use_case": "Resource-constrained environments", | |
| "advantage": "2x smaller, 2x faster, still 4.3/10 performance" | |
| }, | |
| "ultra_lightweight": { | |
| "model": "Gemma-3-270M", | |
| "use_case": "Edge devices, embedded systems", | |
| "advantage": "Only 428MB, decent 3.7/10 performance" | |
| }, | |
| "avoid": [ | |
| { | |
| "model": "MCP-Instruct-v1 (all versions)", | |
| "reason": "Technical incompatibility with llama.cpp" | |
| }, | |
| { | |
| "model": "Yi-Coder-1.5B", | |
| "reason": "Prompt format incompatibility" | |
| }, | |
| { | |
| "model": "Any Q2 quantization", | |
| "reason": "Too aggressive, causes failures or gibberish" | |
| } | |
| ] | |
| } | |
| } | |
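
Appendix: a minimal Python sketch of how the rubric above could be applied, assuming a harness built on llama-cpp-python. The fence-stripping step reflects the champion's noted requirement ("JSON extraction logic to handle markdown code blocks"); the "tool" and "coordinates" response keys, the function names, and the regex are illustrative assumptions, not the harness actually used. The "efficiency" column in the results is consistent with avg_score / avg_time (points per second), e.g. 3.67 / 3.98 ≈ 0.92 for Qwen3-0.6B.

import json
import re

# Rubric weights from "test_scenarios[*].scoring" (max 10 points per test).
WEIGHTS = {"correct_tool": 4, "valid_json": 3, "proper_terminology": 2, "coordinates": 1}

def extract_json(response):
    """Pull a JSON object out of a reply that may wrap it in ```json fences.

    Needed for Qwen2.5-Coder-1.5B-Q4, which wraps its tool calls in markdown
    code blocks (see the champion's notes). The non-greedy regex only handles
    a single flat object; a sketch, not production code.
    """
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", response, re.DOTALL)
    candidate = match.group(1) if match else response
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None

def score_response(parsed, expected_tool, expected_coords=None):
    """Score one parsed reply against the rubric; field names are assumed."""
    score = 0
    if parsed is not None:
        score += WEIGHTS["valid_json"]
        if parsed.get("tool") == expected_tool:
            score += WEIGHTS["correct_tool"]
        if expected_coords is not None and parsed.get("coordinates") == expected_coords:
            score += WEIGHTS["coordinates"]
        # "proper_terminology" would need a domain-specific check; omitted here.
    return score

def efficiency(avg_score, avg_time):
    """Points per second, matching the 'efficiency' column (0.0 for failed runs)."""
    return round(avg_score / avg_time, 2) if avg_time > 0 else 0.0

Usage for test scenario 2 would look like: score_response(extract_json(reply), "move_units", [150, 200]).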