rts-commander / docs /reports /final_model_comparison.json
Luigi's picture
Organize project structure: move test scripts to tests/scripts and documentation to docs/reports
d28c36c
{
"all_results": [
{
"name": "Qwen2.5-0.5B",
"avg_score": 4.0,
"avg_time": 3.52385942141215,
"efficiency": 1.1351190617011169,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 3.418940305709839,
"response": ".\n\nOutils: get_game_state(), move_units(unit_ids, ..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.8486745357513428,
"response": "La commande \"move_units\" est utilisée pour déplace..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 3.3039634227752686,
"response": ", je vais faire une tâche de base. Je vais essayer..."
}
]
},
{
"name": "Qwen3-0.6B",
"avg_score": 6.0,
"avg_time": 6.404076337814331,
"efficiency": 0.936903260283084,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 6.516923427581787,
"response": "Exemple: {\"tool\": \"get_game_state\", \"args\": {\"unit..."
},
{
"test": "Action avec paramètres",
"score": 7,
"time": 6.65591287612915,
"response": "Réponse: {\"tool\": \"move_units\", \"args\": {\"unit_ids..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 6.039392709732056,
"response": ", but not too much. The user is asking for a respo..."
}
]
},
{
"name": "Gemma-3-1B",
"avg_score": 4.0,
"avg_time": 6.960511525472005,
"efficiency": 0.5746704082540475,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 7.20223069190979,
"response": "```json\n{{\"tool\": \"get_game_state\", \"args\": {\"map\"..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 6.998988628387451,
"response": "```python\nimport json\n\ndef move_units(unit_ids, ta..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 6.680315256118774,
"response": ".\n\nA. 100\nB. 200\nC. 300\nD. 400\nE. 500\nF. 600\nG. 70..."
}
]
},
{
"name": "Gemma-3-270M",
"avg_score": 4.666666666666667,
"avg_time": 3.6990818977355957,
"efficiency": 1.2615743029434903,
"tests": [
{
"test": "Commande simple",
"score": 5,
"time": 3.697866201400757,
"response": "```\n**Explication:**\n\n* `get_game_state()` : Récup..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.690243721008301,
"response": "```\n\n**Explication:**\n\n* `move_units` est un outil..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 3.7091357707977295,
"response": ".\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\n..."
}
]
}
],
"successful_models": [
{
"name": "Qwen2.5-0.5B",
"avg_score": 4.0,
"avg_time": 3.52385942141215,
"efficiency": 1.1351190617011169,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 3.418940305709839,
"response": ".\n\nOutils: get_game_state(), move_units(unit_ids, ..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.8486745357513428,
"response": "La commande \"move_units\" est utilisée pour déplace..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 3.3039634227752686,
"response": ", je vais faire une tâche de base. Je vais essayer..."
}
]
},
{
"name": "Qwen3-0.6B",
"avg_score": 6.0,
"avg_time": 6.404076337814331,
"efficiency": 0.936903260283084,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 6.516923427581787,
"response": "Exemple: {\"tool\": \"get_game_state\", \"args\": {\"unit..."
},
{
"test": "Action avec paramètres",
"score": 7,
"time": 6.65591287612915,
"response": "Réponse: {\"tool\": \"move_units\", \"args\": {\"unit_ids..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 6.039392709732056,
"response": ", but not too much. The user is asking for a respo..."
}
]
},
{
"name": "Gemma-3-1B",
"avg_score": 4.0,
"avg_time": 6.960511525472005,
"efficiency": 0.5746704082540475,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 7.20223069190979,
"response": "```json\n{{\"tool\": \"get_game_state\", \"args\": {\"map\"..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 6.998988628387451,
"response": "```python\nimport json\n\ndef move_units(unit_ids, ta..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 6.680315256118774,
"response": ".\n\nA. 100\nB. 200\nC. 300\nD. 400\nE. 500\nF. 600\nG. 70..."
}
]
},
{
"name": "Gemma-3-270M",
"avg_score": 4.666666666666667,
"avg_time": 3.6990818977355957,
"efficiency": 1.2615743029434903,
"tests": [
{
"test": "Commande simple",
"score": 5,
"time": 3.697866201400757,
"response": "```\n**Explication:**\n\n* `get_game_state()` : Récup..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.690243721008301,
"response": "```\n\n**Explication:**\n\n* `move_units` est un outil..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 3.7091357707977295,
"response": ".\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\n..."
}
]
}
],
"ranking_by_score": [
{
"name": "Qwen3-0.6B",
"avg_score": 6.0,
"avg_time": 6.404076337814331,
"efficiency": 0.936903260283084,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 6.516923427581787,
"response": "Exemple: {\"tool\": \"get_game_state\", \"args\": {\"unit..."
},
{
"test": "Action avec paramètres",
"score": 7,
"time": 6.65591287612915,
"response": "Réponse: {\"tool\": \"move_units\", \"args\": {\"unit_ids..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 6.039392709732056,
"response": ", but not too much. The user is asking for a respo..."
}
]
},
{
"name": "Gemma-3-270M",
"avg_score": 4.666666666666667,
"avg_time": 3.6990818977355957,
"efficiency": 1.2615743029434903,
"tests": [
{
"test": "Commande simple",
"score": 5,
"time": 3.697866201400757,
"response": "```\n**Explication:**\n\n* `get_game_state()` : Récup..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.690243721008301,
"response": "```\n\n**Explication:**\n\n* `move_units` est un outil..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 3.7091357707977295,
"response": ".\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\n..."
}
]
},
{
"name": "Qwen2.5-0.5B",
"avg_score": 4.0,
"avg_time": 3.52385942141215,
"efficiency": 1.1351190617011169,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 3.418940305709839,
"response": ".\n\nOutils: get_game_state(), move_units(unit_ids, ..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.8486745357513428,
"response": "La commande \"move_units\" est utilisée pour déplace..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 3.3039634227752686,
"response": ", je vais faire une tâche de base. Je vais essayer..."
}
]
},
{
"name": "Gemma-3-1B",
"avg_score": 4.0,
"avg_time": 6.960511525472005,
"efficiency": 0.5746704082540475,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 7.20223069190979,
"response": "```json\n{{\"tool\": \"get_game_state\", \"args\": {\"map\"..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 6.998988628387451,
"response": "```python\nimport json\n\ndef move_units(unit_ids, ta..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 6.680315256118774,
"response": ".\n\nA. 100\nB. 200\nC. 300\nD. 400\nE. 500\nF. 600\nG. 70..."
}
]
}
],
"ranking_by_efficiency": [
{
"name": "Gemma-3-270M",
"avg_score": 4.666666666666667,
"avg_time": 3.6990818977355957,
"efficiency": 1.2615743029434903,
"tests": [
{
"test": "Commande simple",
"score": 5,
"time": 3.697866201400757,
"response": "```\n**Explication:**\n\n* `get_game_state()` : Récup..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.690243721008301,
"response": "```\n\n**Explication:**\n\n* `move_units` est un outil..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 3.7091357707977295,
"response": ".\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\n..."
}
]
},
{
"name": "Qwen2.5-0.5B",
"avg_score": 4.0,
"avg_time": 3.52385942141215,
"efficiency": 1.1351190617011169,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 3.418940305709839,
"response": ".\n\nOutils: get_game_state(), move_units(unit_ids, ..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.8486745357513428,
"response": "La commande \"move_units\" est utilisée pour déplace..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 3.3039634227752686,
"response": ", je vais faire une tâche de base. Je vais essayer..."
}
]
},
{
"name": "Qwen3-0.6B",
"avg_score": 6.0,
"avg_time": 6.404076337814331,
"efficiency": 0.936903260283084,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 6.516923427581787,
"response": "Exemple: {\"tool\": \"get_game_state\", \"args\": {\"unit..."
},
{
"test": "Action avec paramètres",
"score": 7,
"time": 6.65591287612915,
"response": "Réponse: {\"tool\": \"move_units\", \"args\": {\"unit_ids..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 6.039392709732056,
"response": ", but not too much. The user is asking for a respo..."
}
]
},
{
"name": "Gemma-3-1B",
"avg_score": 4.0,
"avg_time": 6.960511525472005,
"efficiency": 0.5746704082540475,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 7.20223069190979,
"response": "```json\n{{\"tool\": \"get_game_state\", \"args\": {\"map\"..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 6.998988628387451,
"response": "```python\nimport json\n\ndef move_units(unit_ids, ta..."
},
{
"test": "Vitesse de réponse",
"score": 0,
"time": 6.680315256118774,
"response": ".\n\nA. 100\nB. 200\nC. 300\nD. 400\nE. 500\nF. 600\nG. 70..."
}
]
}
],
"best_overall": {
"name": "Qwen3-0.6B",
"avg_score": 6.0,
"avg_time": 6.404076337814331,
"efficiency": 0.936903260283084,
"tests": [
{
"test": "Commande simple",
"score": 7,
"time": 6.516923427581787,
"response": "Exemple: {\"tool\": \"get_game_state\", \"args\": {\"unit..."
},
{
"test": "Action avec paramètres",
"score": 7,
"time": 6.65591287612915,
"response": "Réponse: {\"tool\": \"move_units\", \"args\": {\"unit_ids..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 6.039392709732056,
"response": ", but not too much. The user is asking for a respo..."
}
]
},
"most_efficient": {
"name": "Gemma-3-270M",
"avg_score": 4.666666666666667,
"avg_time": 3.6990818977355957,
"efficiency": 1.2615743029434903,
"tests": [
{
"test": "Commande simple",
"score": 5,
"time": 3.697866201400757,
"response": "```\n**Explication:**\n\n* `get_game_state()` : Récup..."
},
{
"test": "Action avec paramètres",
"score": 5,
"time": 3.690243721008301,
"response": "```\n\n**Explication:**\n\n* `move_units` est un outil..."
},
{
"test": "Vitesse de réponse",
"score": 4,
"time": 3.7091357707977295,
"response": ".\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\nOK.\n..."
}
]
}
}