{
"evaluation_info": {
"title": "Comprehensive MCP Capability Evaluation - 14 Models",
"date": "2025-10-05",
"total_models": 14,
"framework": "llama-cpp-python 0.3.16",
"test_scenarios": 3,
"max_score_per_test": 10
},
"test_scenarios": [
{
"id": 1,
"description": "Get the current game state",
"expected_tool": "get_game_state",
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
},
{
"id": 2,
"description": "Move infantry unit to coordinates 150,200",
"expected_tool": "move_units",
"expected_coordinates": [150, 200],
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
},
{
"id": 3,
"description": "Attack enemy tank at location 300,150",
"expected_tool": "attack_unit",
"expected_coordinates": [300, 150],
"scoring": {
"correct_tool": 4,
"valid_json": 3,
"proper_terminology": 2,
"coordinates": 1
}
}
],
"results": [
{
"rank": 1,
"model_name": "Qwen2.5-Coder-1.5B-Q4",
"filename": "qwen2.5-coder-1.5b-instruct-q4_0.gguf",
"avg_score": 9.7,
"avg_time": 4.12,
"size_mb": 1017,
"efficiency": 2.34,
"status": "champion",
"notes": "Exceptional MCP performance. Wraps JSON in markdown code blocks. 10/10 on tests 2 and 3, 9/10 on test 1.",
"detailed_scores": [9, 10, 10]
},
{
"rank": 2,
"model_name": "Qwen2.5-Coder-0.5B",
"filename": "qwen2.5-coder-0.5b-instruct-q4_0.gguf",
"avg_score": 4.3,
"avg_time": 2.08,
"size_mb": 409,
"efficiency": 2.08,
"status": "previous_champion",
"notes": "Best budget option. Good balance of size and performance."
},
{
"rank": 3,
"model_name": "Qwen3-0.6B",
"filename": "Qwen3-0.6B-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 3.98,
"size_mb": 610,
"efficiency": 0.92,
"status": "functional"
},
{
"rank": 4,
"model_name": "Gemma-3-270M",
"filename": "gemma-3-270m-it-qat-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 2.29,
"size_mb": 428,
"efficiency": 1.60,
"status": "functional",
"notes": "Ultra-lightweight champion. Excellent efficiency for its tiny size."
},
{
"rank": 5,
"model_name": "MCPR-L-3B-Exa-Q8",
"filename": "mcprl-3b-exa.Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 17.42,
"size_mb": 3133,
"efficiency": 0.21,
"status": "functional",
"notes": "MCP-specialized but slow. Large size, poor efficiency."
},
{
"rank": 6,
"model_name": "Gemma-3n-E2B-it-Q8",
"filename": "google_gemma-3n-E2B-it-Q8_0.gguf",
"avg_score": 3.7,
"avg_time": 14.80,
"size_mb": 4566,
"efficiency": 0.25,
"status": "functional",
"notes": "Largest model tested. Poor efficiency despite high quantization."
},
{
"rank": 7,
"model_name": "Qwen3-1.7B",
"filename": "Qwen3-1.7B-Q4_0.gguf",
"avg_score": 3.7,
"avg_time": 6.24,
"size_mb": 1008,
"efficiency": 0.59,
"status": "functional"
},
{
"rank": 8,
"model_name": "Qwen2.5-0.5B",
"filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
"avg_score": 2.7,
"avg_time": 1.17,
"size_mb": 409,
"efficiency": 2.28,
"status": "functional",
"notes": "Fast but limited MCP capability. General-purpose model."
},
{
"rank": 9,
"model_name": "Gemma-3n-E2B-it-IQ2",
"filename": "gemma-3n-E2B-it-UD-IQ2_XXS.gguf",
"avg_score": 2.3,
"avg_time": 14.11,
"size_mb": 1958,
"efficiency": 0.17,
"status": "functional",
"notes": "Heavy quantization impacts quality."
},
{
"rank": 10,
"model_name": "Llama-Breeze2-3B-Q2",
"filename": "Llama-Breeze2-3B-Instruct-Text.Q2_K.gguf",
"avg_score": 1.3,
"avg_time": 11.39,
"size_mb": 1424,
"efficiency": 0.12,
"status": "functional",
"notes": "Poor performance. Q2 quantization too aggressive."
},
{
"rank": 11,
"model_name": "Yi-Coder-1.5B-Q4",
"filename": "Yi-Coder-1.5B.Q4_0.gguf",
"avg_score": 0.0,
"avg_time": 11.64,
"size_mb": 826,
"efficiency": 0.0,
"status": "failed",
"notes": "Prompt format incompatibility. Returns system prompt instead of generating responses."
},
{
"rank": 12,
"model_name": "MCP-Instruct-v1-Q4",
"filename": "mcp-instruct-v1.Q4_K_M.gguf",
"avg_score": 0.0,
"avg_time": 0.0,
"size_mb": 697,
"efficiency": 0.0,
"status": "failed",
"notes": "Technical error: llama_decode returned -1"
},
{
"rank": 13,
"model_name": "MCPR-L-3B-Exa-Q2",
"filename": "mcprl-3b-exa.Q2_K.gguf",
"avg_score": 0.0,
"avg_time": 10.63,
"size_mb": 1216,
"efficiency": 0.0,
"status": "failed",
"notes": "Produces gibberish output. Q2 quantization too aggressive for this architecture."
},
{
"rank": 14,
"model_name": "MCP-Instruct-v1-Q8",
"filename": "mcp-instruct-v1.Q8_0.gguf",
"avg_score": 0.0,
"avg_time": 0.0,
"size_mb": 1465,
"efficiency": 0.0,
"status": "failed",
"notes": "Technical error: llama_decode returned -1. Same issue as Q4 version."
}
],
"key_insights": {
"champion": {
"model": "Qwen2.5-Coder-1.5B-Q4",
"score": 9.7,
"reason": "Code-specialized models excel at structured JSON generation. Near-perfect MCP capability."
},
"scaling_effect": {
"observation": "Increasing parameters from 0.5B to 1.5B more than doubled MCP score (4.3 → 9.7)",
"conclusion": "Parameter scaling works exceptionally well for code-specialized models"
},
"mcp_specialized_disappointment": {
"observation": "MCP-Instruct models completely failed. MCPR-L models scored only 3.7/10 at best.",
"conclusion": "MCP specialization alone is insufficient. Code training provides better foundation."
},
"quantization_impact": {
"observation": "Q2 quantization caused failures or poor performance. Q4 and Q8 worked well.",
"conclusion": "Avoid Q2 quantization for MCP tasks. Q4 offers best size/quality tradeoff."
},
"size_efficiency": {
"observation": "Gemma-3-270M (428MB) matched 3133MB model performance",
"conclusion": "Larger models don't guarantee better MCP performance"
}
},
"recommendations": {
"primary": {
"model": "Qwen2.5-Coder-1.5B-Q4",
"use_case": "Production deployments requiring high-quality MCP",
"requirement": "JSON extraction logic to handle markdown code blocks"
},
"budget": {
"model": "Qwen2.5-Coder-0.5B",
"use_case": "Resource-constrained environments",
"advantage": "2x smaller, 2x faster, still 4.3/10 performance"
},
"ultra_lightweight": {
"model": "Gemma-3-270M",
"use_case": "Edge devices, embedded systems",
"advantage": "Only 428MB, decent 3.7/10 performance"
},
"avoid": [
{
"model": "MCP-Instruct-v1 (all versions)",
"reason": "Technical incompatibility with llama.cpp"
},
{
"model": "Yi-Coder-1.5B",
"reason": "Prompt format incompatibility"
},
{
"model": "Any Q2 quantization",
"reason": "Too aggressive, causes failures or gibberish"
}
]
}
}