{
"evaluation_info": {
"title": "Comprehensive MCP Capability Evaluation - 14 Models",
"date": "2025-10-05",
"total_models": 14,
"framework": "llama-cpp-python 0.3.16",
"test_scenarios": 3,
"max_score_per_test": 10
},
"test_scenarios": [
{
"id": 1,
"description": "Get the current game state",
"expected_tool": "get_game_state",
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
},
{
"id": 2,
"description": "Move infantry unit to coordinates 150,200",
"expected_tool": "move_units",
"expected_coordinates": [150, 200],
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
},
{
"id": 3,
"description": "Attack enemy tank at location 300,150",
"expected_tool": "attack_unit",
"expected_coordinates": [300, 150],
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
}
],
"results": [
{
"rank": 1,
"model_name": "Qwen2.5-Coder-1.5B-Q4",
"filename": "qwen2.5-coder-1.5b-instruct-q4_0.gguf",
"avg_score": 9.7,
"avg_time": 4.12,
"size_mb": 1017,
"efficiency": 2.34,
"status": "champion",
"notes": "Exceptional MCP performance. Wraps JSON in markdown code blocks. 10/10 on tests 2 and 3, 9/10 on test 1.",
"detailed_scores": [9, 10, 10]
},
{
"rank": 2,
"model_name": "Qwen2.5-Coder-0.5B",
"filename": "qwen2.5-coder-0.5b-instruct-q4_0.gguf",
"avg_score": 4.3,
"avg_time": 2.08,
"size_mb": 409,
"efficiency": 2.08,
"status": "previous_champion",
"notes": "Best budget option. Good balance of size and performance."
},
{
"rank": 3,
"model_name": "Qwen3-0.6B",
"filename": "Qwen3-0.6B-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 3.98,
"size_mb": 610,
"efficiency": 0.92,
"status": "functional"
},
{
"rank": 4,
"model_name": "Gemma-3-270M",
"filename": "gemma-3-270m-it-qat-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 2.29,
"size_mb": 428,
"efficiency": 1.60,
"status": "functional",
"notes": "Ultra-lightweight champion. Excellent efficiency for its tiny size."
},
{
"rank": 5,
"model_name": "MCPR-L-3B-Exa-Q8",
"filename": "mcprl-3b-exa.Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 17.42,
"size_mb": 3133,
"efficiency": 0.21,
"status": "functional",
"notes": "MCP-specialized but slow. Large size, poor efficiency."
},
{
"rank": 6,
"model_name": "Gemma-3n-E2B-it-Q8",
"filename": "google_gemma-3n-E2B-it-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 14.80,
"size_mb": 4566,
"efficiency": 0.25,
"status": "functional",
"notes": "Largest model tested. Poor efficiency despite high quantization."
},
{
"rank": 7,
"model_name": "Qwen3-1.7B",
"filename": "Qwen3-1.7B-Q4_0.gguf",
"avg_score": 3.7,
"avg_time": 6.24,
"size_mb": 1008,
"efficiency": 0.59,
"status": "functional"
},
{
"rank": 8,
"model_name": "Qwen2.5-0.5B",
"filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
"avg_score": 2.7,
"avg_time": 1.17,
"size_mb": 409,
"efficiency": 2.28,
"status": "functional",
"notes": "Fast but limited MCP capability. General-purpose model."
},
{
"rank": 9,
"model_name": "Gemma-3n-E2B-it-IQ2",
"filename": "gemma-3n-E2B-it-UD-IQ2_XXS.gguf",
"avg_score": 2.3,
"avg_time": 14.11,
"size_mb": 1958,
"efficiency": 0.17,
"status": "functional",
"notes": "Heavy quantization impacts quality."
},
{
"rank": 10,
"model_name": "Llama-Breeze2-3B-Q2",
"filename": "Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf",
"avg_score": 1.3,
"avg_time": 11.39,
"size_mb": 1424,
"efficiency": 0.12,
"status": "functional",
"notes": "Poor performance. Q2 quantization too aggressive."
},
{
"rank": 11,
"model_name": "Yi-Coder-1.5B-Q4",
"filename": "Yi-Coder-1.5B.Q4_0.gguf",
"avg_score": 0.0,
"avg_time": 11.64,
"size_mb": 826,
"efficiency": 0.0,
"status": "failed",
"notes": "Prompt format incompatibility. Returns system prompt instead of generating responses."
},
{
"rank": 12,
"model_name": "MCP-Instruct-v1-Q4",
"filename": "mcp-instruct-v1.Q4_K_M.gguf",
"avg_score": 0.0,
"avg_time": 0.0,
"size_mb": 697,
"efficiency": 0.0,
"status": "failed",
"notes": "Technical error: llama_decode returned -1"
},
{
"rank": 13,
"model_name": "MCPR-L-3B-Exa-Q2",
"filename": "mcprl-3b-exa.Q2_K.gguf",
"avg_score": 0.0,
"avg_time": 10.63,
"size_mb": 1216,
"efficiency": 0.0,
"status": "failed",
"notes": "Produces gibberish output. Q2 quantization too aggressive for this architecture."
},
{
"rank": 14,
"model_name": "MCP-Instruct-v1-Q8",
"filename": "mcp-instruct-v1.Q8_0.gguf",
"avg_score": 0.0,
"avg_time": 0.0,
"size_mb": 1465,
"efficiency": 0.0,
"status": "failed",
"notes": "Technical error: llama_decode returned -1. Same issue as Q4 version."
}
],
"key_insights": {
"champion": {
"model": "Qwen2.5-Coder-1.5B-Q4",
"score": 9.7,
"reason": "Code-specialized models excel at structured JSON generation. Near-perfect MCP capability."
},
"scaling_effect": {
"observation": "Increasing parameters from 0.5B to 1.5B more than doubled MCP score (4.3 → 9.7)",
"conclusion": "Parameter scaling works exceptionally well for code-specialized models"
},
"mcp_specialized_disappointment": {
"observation": "MCP-Instruct models completely failed. MCPR-L models scored only 3.7/10 at best.",
"conclusion": "MCP specialization alone is insufficient. Code training provides better foundation."
},
"quantization_impact": {
"observation": "Q2 quantization caused failures or poor performance. Q4 and Q8 worked well.",
"conclusion": "Avoid Q2 quantization for MCP tasks. Q4 offers best size/quality tradeoff."
},
"size_efficiency": {
"observation": "Gemma-3-270M (428MB) matched 3133MB model performance",
"conclusion": "Larger models don't guarantee better MCP performance"
}
},
"recommendations": {
"primary": {
"model": "Qwen2.5-Coder-1.5B-Q4",
"use_case": "Production deployments requiring high-quality MCP",
"requirement": "JSON extraction logic to handle markdown code blocks"
},
"budget": {
"model": "Qwen2.5-Coder-0.5B",
"use_case": "Resource-constrained environments",
"advantage": "2x smaller, 2x faster, still 4.3/10 performance"
},
"ultra_lightweight": {
"model": "Gemma-3-270M",
"use_case": "Edge devices, embedded systems",
"advantage": "Only 428MB, decent 3.7/10 performance"
},
"avoid": [
{
"model": "MCP-Instruct-v1 (all versions)",
"reason": "Technical incompatibility with llama.cpp"
},
{
"model": "Yi-Coder-1.5B",
"reason": "Prompt format incompatibility"
},
{
"model": "Any Q2 quantization",
"reason": "Too aggressive, causes failures or gibberish"
}
]
}
}