Spaces:

Luigi
/

rts-commander

Sleeping

File size: 7,720 Bytes

d28c36c

{
  "evaluation_info": {
    "title": "Comprehensive MCP Capability Evaluation - 14 Models",
    "date": "2025-10-05",
    "total_models": 14,
    "framework": "llama-cpp-python 0.3.16",
    "test_scenarios": 3,
    "max_score_per_test": 10
  },
  "test_scenarios": [
    {
      "id": 1,
      "description": "Get the current game state",
      "expected_tool": "get_game_state",
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    },
    {
      "id": 2,
      "description": "Move infantry unit to coordinates 150,200",
      "expected_tool": "move_units",
      "expected_coordinates": [150, 200],
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    },
    {
      "id": 3,
      "description": "Attack enemy tank at location 300,150",
      "expected_tool": "attack_unit",
      "expected_coordinates": [300, 150],
      "scoring": {
        "correct_tool": 4,
        "valid_json": 3,
        "proper_terminology": 2,
        "coordinates": 1
      }
    }
  ],
  "results": [
    {
      "rank": 1,
      "model_name": "Qwen2.5-Coder-1.5B-Q4",
      "filename": "qwen2.5-coder-1.5b-instruct-q4_0.gguf",
      "avg_score": 9.7,
      "avg_time": 4.12,
      "size_mb": 1017,
      "efficiency": 2.34,
      "status": "champion",
      "notes": "Exceptional MCP performance. Wraps JSON in markdown code blocks. 10/10 on tests 2 and 3, 9/10 on test 1.",
      "detailed_scores": [9, 10, 10]
    },
    {
      "rank": 2,
      "model_name": "Qwen2.5-Coder-0.5B",
      "filename": "qwen2.5-coder-0.5b-instruct-q4_0.gguf",
      "avg_score": 4.3,
      "avg_time": 2.08,
      "size_mb": 409,
      "efficiency": 2.08,
      "status": "previous_champion",
      "notes": "Best budget option. Good balance of size and performance."
    },
    {
      "rank": 3,
      "model_name": "Qwen3-0.6B",
      "filename": "Qwen3-0.6B-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 3.98,
      "size_mb": 610,
      "efficiency": 0.92,
      "status": "functional"
    },
    {
      "rank": 4,
      "model_name": "Gemma-3-270M",
      "filename": "gemma-3-270m-it-qat-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 2.29,
      "size_mb": 428,
      "efficiency": 1.60,
      "status": "functional",
      "notes": "Ultra-lightweight champion. Excellent efficiency for its tiny size."
    },
    {
      "rank": 5,
      "model_name": "MCPR-L-3B-Exa-Q8",
      "filename": "mcprl-3b-exa.Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 17.42,
      "size_mb": 3133,
      "efficiency": 0.21,
      "status": "functional",
      "notes": "MCP-specialized but slow. Large size, poor efficiency."
    },
    {
      "rank": 6,
      "model_name": "Gemma-3n-E2B-it-Q8",
      "filename": "google_gemma-3n-E2B-it-Q8_0.gguf",
      "avg_score": 3.7,
      "avg_time": 14.80,
      "size_mb": 4566,
      "efficiency": 0.25,
      "status": "functional",
      "notes": "Largest model tested. Poor efficiency despite high quantization."
    },
    {
      "rank": 7,
      "model_name": "Qwen3-1.7B",
      "filename": "Qwen3-1.7B-Q4_0.gguf",
      "avg_score": 3.7,
      "avg_time": 6.24,
      "size_mb": 1008,
      "efficiency": 0.59,
      "status": "functional"
    },
    {
      "rank": 8,
      "model_name": "Qwen2.5-0.5B",
      "filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
      "avg_score": 2.7,
      "avg_time": 1.17,
      "size_mb": 409,
      "efficiency": 2.28,
      "status": "functional",
      "notes": "Fast but limited MCP capability. General-purpose model."
    },
    {
      "rank": 9,
      "model_name": "Gemma-3n-E2B-it-IQ2",
      "filename": "gemma-3n-E2B-it-UD-IQ2_XXS.gguf",
      "avg_score": 2.3,
      "avg_time": 14.11,
      "size_mb": 1958,
      "efficiency": 0.17,
      "status": "functional",
      "notes": "Heavy quantization impacts quality."
    },
    {
      "rank": 10,
      "model_name": "Llama-Breeze2-3B-Q2",
      "filename": "Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf",
      "avg_score": 1.3,
      "avg_time": 11.39,
      "size_mb": 1424,
      "efficiency": 0.12,
      "status": "functional",
      "notes": "Poor performance. Q2 quantization too aggressive."
    },
    {
      "rank": 11,
      "model_name": "Yi-Coder-1.5B-Q4",
      "filename": "Yi-Coder-1.5B.Q4_0.gguf",
      "avg_score": 0.0,
      "avg_time": 11.64,
      "size_mb": 826,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Prompt format incompatibility. Returns system prompt instead of generating responses."
    },
    {
      "rank": 12,
      "model_name": "MCP-Instruct-v1-Q4",
      "filename": "mcp-instruct-v1.Q4_K_M.gguf",
      "avg_score": 0.0,
      "avg_time": 0.0,
      "size_mb": 697,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Technical error: llama_decode returned -1"
    },
    {
      "rank": 13,
      "model_name": "MCPR-L-3B-Exa-Q2",
      "filename": "mcprl-3b-exa.Q2_K.gguf",
      "avg_score": 0.0,
      "avg_time": 10.63,
      "size_mb": 1216,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Produces gibberish output. Q2 quantization too aggressive for this architecture."
    },
    {
      "rank": 14,
      "model_name": "MCP-Instruct-v1-Q8",
      "filename": "mcp-instruct-v1.Q8_0.gguf",
      "avg_score": 0.0,
      "avg_time": 0.0,
      "size_mb": 1465,
      "efficiency": 0.0,
      "status": "failed",
      "notes": "Technical error: llama_decode returned -1. Same issue as Q4 version."
    }
  ],
  "key_insights": {
    "champion": {
      "model": "Qwen2.5-Coder-1.5B-Q4",
      "score": 9.7,
      "reason": "Code-specialized models excel at structured JSON generation. Near-perfect MCP capability."
    },
    "scaling_effect": {
      "observation": "Increasing parameters from 0.5B to 1.5B more than doubled MCP score (4.3 → 9.7)",
      "conclusion": "Parameter scaling works exceptionally well for code-specialized models"
    },
    "mcp_specialized_disappointment": {
      "observation": "MCP-Instruct models completely failed. MCPR-L models scored only 3.7/10 at best.",
      "conclusion": "MCP specialization alone is insufficient. Code training provides better foundation."
    },
    "quantization_impact": {
      "observation": "Q2 quantization caused failures or poor performance. Q4 and Q8 worked well.",
      "conclusion": "Avoid Q2 quantization for MCP tasks. Q4 offers best size/quality tradeoff."
    },
    "size_efficiency": {
      "observation": "Gemma-3-270M (428MB) matched 3133MB model performance",
      "conclusion": "Larger models don't guarantee better MCP performance"
    }
  },
  "recommendations": {
    "primary": {
      "model": "Qwen2.5-Coder-1.5B-Q4",
      "use_case": "Production deployments requiring high-quality MCP",
      "requirement": "JSON extraction logic to handle markdown code blocks"
    },
    "budget": {
      "model": "Qwen2.5-Coder-0.5B",
      "use_case": "Resource-constrained environments",
      "advantage": "2x smaller, 2x faster, still 4.3/10 performance"
    },
    "ultra_lightweight": {
      "model": "Gemma-3-270M",
      "use_case": "Edge devices, embedded systems",
      "advantage": "Only 428MB, decent 3.7/10 performance"
    },
    "avoid": [
      {
        "model": "MCP-Instruct-v1 (all versions)",
        "reason": "Technical incompatibility with llama.cpp"
      },
      {
        "model": "Yi-Coder-1.5B",
        "reason": "Prompt format incompatibility"
      },
      {
        "model": "Any Q2 quantization",
        "reason": "Too aggressive, causes failures or gibberish"
      }
    ]
  }
}