Spaces:
Running
Running
update leaderboards
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -54,6 +54,17 @@
|
|
| 54 |
"Total Puzzles": 1000,
|
| 55 |
"Reason Lens": "1549.74"
|
| 56 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
{
|
| 58 |
"Model": "gpt-4-turbo-2024-04-09",
|
| 59 |
"Mode": "greedy",
|
|
@@ -109,6 +120,17 @@
|
|
| 109 |
"Total Puzzles": 1000,
|
| 110 |
"Reason Lens": "1165.90"
|
| 111 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
{
|
| 113 |
"Model": "deepseek-chat",
|
| 114 |
"Mode": "greedy",
|
|
@@ -142,6 +164,17 @@
|
|
| 142 |
"Total Puzzles": 1000,
|
| 143 |
"Reason Lens": "1324.55"
|
| 144 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
{
|
| 146 |
"Model": "gpt-4o-mini-2024-07-18",
|
| 147 |
"Mode": "greedy",
|
|
@@ -307,6 +340,17 @@
|
|
| 307 |
"Total Puzzles": 1000,
|
| 308 |
"Reason Lens": "1078.29"
|
| 309 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
{
|
| 311 |
"Model": "gemma-2-9b-it@nvidia",
|
| 312 |
"Mode": "greedy",
|
|
@@ -439,6 +483,17 @@
|
|
| 439 |
"Total Puzzles": 1000,
|
| 440 |
"Reason Lens": "1473.23"
|
| 441 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
{
|
| 443 |
"Model": "Yi-1.5-9B-Chat",
|
| 444 |
"Mode": "greedy",
|
|
|
|
| 54 |
"Total Puzzles": 1000,
|
| 55 |
"Reason Lens": "1549.74"
|
| 56 |
},
|
| 57 |
+
{
|
| 58 |
+
"Model": "Mistral-Large-2",
|
| 59 |
+
"Mode": "greedy",
|
| 60 |
+
"Puzzle Acc": "29.00",
|
| 61 |
+
"Cell Acc": "47.64",
|
| 62 |
+
"No answer": "1.70",
|
| 63 |
+
"Easy Puzzle Acc": "80.36",
|
| 64 |
+
"Hard Puzzle Acc": "9.03",
|
| 65 |
+
"Total Puzzles": 1000,
|
| 66 |
+
"Reason Lens": "1592.39"
|
| 67 |
+
},
|
| 68 |
{
|
| 69 |
"Model": "gpt-4-turbo-2024-04-09",
|
| 70 |
"Mode": "greedy",
|
|
|
|
| 120 |
"Total Puzzles": 1000,
|
| 121 |
"Reason Lens": "1165.90"
|
| 122 |
},
|
| 123 |
+
{
|
| 124 |
+
"Model": "Meta-Llama-3.1-70B-Instruct",
|
| 125 |
+
"Mode": "greedy",
|
| 126 |
+
"Puzzle Acc": "24.90",
|
| 127 |
+
"Cell Acc": "27.98",
|
| 128 |
+
"No answer": "43.00",
|
| 129 |
+
"Easy Puzzle Acc": "73.57",
|
| 130 |
+
"Hard Puzzle Acc": "5.97",
|
| 131 |
+
"Total Puzzles": 1000,
|
| 132 |
+
"Reason Lens": "1483.68"
|
| 133 |
+
},
|
| 134 |
{
|
| 135 |
"Model": "deepseek-chat",
|
| 136 |
"Mode": "greedy",
|
|
|
|
| 164 |
"Total Puzzles": 1000,
|
| 165 |
"Reason Lens": "1324.55"
|
| 166 |
},
|
| 167 |
+
{
|
| 168 |
+
"Model": "DeepSeek-Coder-V2-0724",
|
| 169 |
+
"Mode": "greedy",
|
| 170 |
+
"Puzzle Acc": "20.50",
|
| 171 |
+
"Cell Acc": "42.35",
|
| 172 |
+
"No answer": "3.40",
|
| 173 |
+
"Easy Puzzle Acc": "61.79",
|
| 174 |
+
"Hard Puzzle Acc": "4.44",
|
| 175 |
+
"Total Puzzles": 1000,
|
| 176 |
+
"Reason Lens": "1230.63"
|
| 177 |
+
},
|
| 178 |
{
|
| 179 |
"Model": "gpt-4o-mini-2024-07-18",
|
| 180 |
"Mode": "greedy",
|
|
|
|
| 340 |
"Total Puzzles": 1000,
|
| 341 |
"Reason Lens": "1078.29"
|
| 342 |
},
|
| 343 |
+
{
|
| 344 |
+
"Model": "Meta-Llama-3.1-8B-Instruct",
|
| 345 |
+
"Mode": "greedy",
|
| 346 |
+
"Puzzle Acc": "12.80",
|
| 347 |
+
"Cell Acc": "13.68",
|
| 348 |
+
"No answer": "61.50",
|
| 349 |
+
"Easy Puzzle Acc": "43.57",
|
| 350 |
+
"Hard Puzzle Acc": "0.83",
|
| 351 |
+
"Total Puzzles": 1000,
|
| 352 |
+
"Reason Lens": "1043.90"
|
| 353 |
+
},
|
| 354 |
{
|
| 355 |
"Model": "gemma-2-9b-it@nvidia",
|
| 356 |
"Mode": "greedy",
|
|
|
|
| 483 |
"Total Puzzles": 1000,
|
| 484 |
"Reason Lens": "1473.23"
|
| 485 |
},
|
| 486 |
+
{
|
| 487 |
+
"Model": "gemma-2-2b-it",
|
| 488 |
+
"Mode": "greedy",
|
| 489 |
+
"Puzzle Acc": "4.20",
|
| 490 |
+
"Cell Acc": "9.97",
|
| 491 |
+
"No answer": "57.20",
|
| 492 |
+
"Easy Puzzle Acc": "14.29",
|
| 493 |
+
"Hard Puzzle Acc": "0.28",
|
| 494 |
+
"Total Puzzles": 1000,
|
| 495 |
+
"Reason Lens": "1032.89"
|
| 496 |
+
},
|
| 497 |
{
|
| 498 |
"Model": "Yi-1.5-9B-Chat",
|
| 499 |
"Mode": "greedy",
|
model_info.json
CHANGED
|
@@ -68,5 +68,10 @@
|
|
| 68 |
"Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
|
| 69 |
"Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
|
| 70 |
"Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
|
| 71 |
-
"Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
}
|
|
|
|
| 68 |
"Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
|
| 69 |
"Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
|
| 70 |
"Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
|
| 71 |
+
"Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"},
|
| 72 |
+
"Mistral-Large-2": {"pretty_name": "Mistral-Large 2", "hf_model_id": "mistralai/Mistral-Large-Instruct-2407"},
|
| 73 |
+
"Meta-Llama-3.1-8B-Instruct": {"pretty_name": "Llama-3.1-8B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct"},
|
| 74 |
+
"Meta-Llama-3.1-70B-Instruct": {"pretty_name": "Llama-3.1-70B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
|
| 75 |
+
"gemma-2-2b-it" : {"pretty_name": "Gemma-2-2B-it", "hf_model_id": "google/gemma-2-2b-it"},
|
| 76 |
+
"DeepSeek-Coder-V2-0724": {"pretty_name": "DeepSeek-Coder-V2-0724", "hf_model_id": "deepseek-ai/DeepSeek-Coder-V2-Instruct"}
|
| 77 |
}
|