Spaces:
Running
Running
add new models
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
|
@@ -21,6 +21,28 @@
|
|
| 21 |
"Total Puzzles": 1000,
|
| 22 |
"Reason Lens": "1153.83"
|
| 23 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
{
|
| 25 |
"Model": "gpt-4o-2024-05-13",
|
| 26 |
"Mode": "sampling",
|
|
@@ -230,6 +252,17 @@
|
|
| 230 |
"Total Puzzles": 1000,
|
| 231 |
"Reason Lens": "809.95"
|
| 232 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
{
|
| 234 |
"Model": "gemma-2-27b-it@nvidia",
|
| 235 |
"Mode": "greedy",
|
|
@@ -252,6 +285,17 @@
|
|
| 252 |
"Total Puzzles": 1000,
|
| 253 |
"Reason Lens": "1015.06"
|
| 254 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
{
|
| 256 |
"Model": "reka-core-20240501",
|
| 257 |
"Mode": "greedy",
|
|
@@ -263,17 +307,6 @@
|
|
| 263 |
"Total Puzzles": 1000,
|
| 264 |
"Reason Lens": "1078.29"
|
| 265 |
},
|
| 266 |
-
{
|
| 267 |
-
"Model": "gemma-2-9b-it",
|
| 268 |
-
"Mode": "greedy",
|
| 269 |
-
"Puzzle Acc": "12.90",
|
| 270 |
-
"Cell Acc": "37.07",
|
| 271 |
-
"No answer": "0.50",
|
| 272 |
-
"Easy Puzzle Acc": "42.14",
|
| 273 |
-
"Hard Puzzle Acc": "1.53",
|
| 274 |
-
"Total Puzzles": 1000,
|
| 275 |
-
"Reason Lens": "859.14"
|
| 276 |
-
},
|
| 277 |
{
|
| 278 |
"Model": "gemma-2-9b-it@nvidia",
|
| 279 |
"Mode": "greedy",
|
|
@@ -296,6 +329,28 @@
|
|
| 296 |
"Total Puzzles": 1000,
|
| 297 |
"Reason Lens": "1216.40"
|
| 298 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
{
|
| 300 |
"Model": "Yi-1.5-34B-Chat",
|
| 301 |
"Mode": "greedy",
|
|
@@ -329,6 +384,17 @@
|
|
| 329 |
"Total Puzzles": 1000,
|
| 330 |
"Reason Lens": "820.66"
|
| 331 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
{
|
| 333 |
"Model": "reka-flash-20240226",
|
| 334 |
"Mode": "greedy",
|
|
@@ -351,6 +417,17 @@
|
|
| 351 |
"Total Puzzles": 1000,
|
| 352 |
"Reason Lens": "1148.16"
|
| 353 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
{
|
| 355 |
"Model": "Qwen2-7B-Instruct",
|
| 356 |
"Mode": "greedy",
|
|
|
|
| 21 |
"Total Puzzles": 1000,
|
| 22 |
"Reason Lens": "1153.83"
|
| 23 |
},
|
| 24 |
+
{
|
| 25 |
+
"Model": "Llama-3.1-405B-Instruct-Turbo",
|
| 26 |
+
"Mode": "greedy",
|
| 27 |
+
"Puzzle Acc": "32.60",
|
| 28 |
+
"Cell Acc": "45.80",
|
| 29 |
+
"No answer": "12.50",
|
| 30 |
+
"Easy Puzzle Acc": "87.14",
|
| 31 |
+
"Hard Puzzle Acc": "11.39",
|
| 32 |
+
"Total Puzzles": 1000,
|
| 33 |
+
"Reason Lens": "314.66"
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"Model": "Llama-3.1-405B-Instruct-Turbo",
|
| 37 |
+
"Mode": "sampling",
|
| 38 |
+
"Puzzle Acc": "32.60",
|
| 39 |
+
"Cell Acc": "47.04",
|
| 40 |
+
"No answer": "10.80",
|
| 41 |
+
"Easy Puzzle Acc": "86.07",
|
| 42 |
+
"Hard Puzzle Acc": "11.81",
|
| 43 |
+
"Total Puzzles": 1000,
|
| 44 |
+
"Reason Lens": "439.96"
|
| 45 |
+
},
|
| 46 |
{
|
| 47 |
"Model": "gpt-4o-2024-05-13",
|
| 48 |
"Mode": "sampling",
|
|
|
|
| 252 |
"Total Puzzles": 1000,
|
| 253 |
"Reason Lens": "809.95"
|
| 254 |
},
|
| 255 |
+
{
|
| 256 |
+
"Model": "Athene-70B",
|
| 257 |
+
"Mode": "greedy",
|
| 258 |
+
"Puzzle Acc": "16.70",
|
| 259 |
+
"Cell Acc": "32.98",
|
| 260 |
+
"No answer": "21.10",
|
| 261 |
+
"Easy Puzzle Acc": "52.50",
|
| 262 |
+
"Hard Puzzle Acc": "2.78",
|
| 263 |
+
"Total Puzzles": 1000,
|
| 264 |
+
"Reason Lens": "391.19"
|
| 265 |
+
},
|
| 266 |
{
|
| 267 |
"Model": "gemma-2-27b-it@nvidia",
|
| 268 |
"Mode": "greedy",
|
|
|
|
| 285 |
"Total Puzzles": 1000,
|
| 286 |
"Reason Lens": "1015.06"
|
| 287 |
},
|
| 288 |
+
{
|
| 289 |
+
"Model": "command-r-plus",
|
| 290 |
+
"Mode": "greedy",
|
| 291 |
+
"Puzzle Acc": "13.90",
|
| 292 |
+
"Cell Acc": "39.01",
|
| 293 |
+
"No answer": "0.20",
|
| 294 |
+
"Easy Puzzle Acc": "44.64",
|
| 295 |
+
"Hard Puzzle Acc": "1.94",
|
| 296 |
+
"Total Puzzles": 1000,
|
| 297 |
+
"Reason Lens": "810.53"
|
| 298 |
+
},
|
| 299 |
{
|
| 300 |
"Model": "reka-core-20240501",
|
| 301 |
"Mode": "greedy",
|
|
|
|
| 307 |
"Total Puzzles": 1000,
|
| 308 |
"Reason Lens": "1078.29"
|
| 309 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
{
|
| 311 |
"Model": "gemma-2-9b-it@nvidia",
|
| 312 |
"Mode": "greedy",
|
|
|
|
| 329 |
"Total Puzzles": 1000,
|
| 330 |
"Reason Lens": "1216.40"
|
| 331 |
},
|
| 332 |
+
{
|
| 333 |
+
"Model": "Mistral-Nemo-Instruct-2407",
|
| 334 |
+
"Mode": "greedy",
|
| 335 |
+
"Puzzle Acc": "11.80",
|
| 336 |
+
"Cell Acc": "34.93",
|
| 337 |
+
"No answer": "1.60",
|
| 338 |
+
"Easy Puzzle Acc": "38.93",
|
| 339 |
+
"Hard Puzzle Acc": "1.25",
|
| 340 |
+
"Total Puzzles": 1000,
|
| 341 |
+
"Reason Lens": "925.88"
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"Model": "Phi-3-mini-4k-instruct",
|
| 345 |
+
"Mode": "greedy",
|
| 346 |
+
"Puzzle Acc": "11.60",
|
| 347 |
+
"Cell Acc": "13.50",
|
| 348 |
+
"No answer": "59.00",
|
| 349 |
+
"Easy Puzzle Acc": "38.21",
|
| 350 |
+
"Hard Puzzle Acc": "1.25",
|
| 351 |
+
"Total Puzzles": 1000,
|
| 352 |
+
"Reason Lens": "790.29"
|
| 353 |
+
},
|
| 354 |
{
|
| 355 |
"Model": "Yi-1.5-34B-Chat",
|
| 356 |
"Mode": "greedy",
|
|
|
|
| 384 |
"Total Puzzles": 1000,
|
| 385 |
"Reason Lens": "820.66"
|
| 386 |
},
|
| 387 |
+
{
|
| 388 |
+
"Model": "command-r",
|
| 389 |
+
"Mode": "greedy",
|
| 390 |
+
"Puzzle Acc": "9.90",
|
| 391 |
+
"Cell Acc": "32.66",
|
| 392 |
+
"No answer": "1.50",
|
| 393 |
+
"Easy Puzzle Acc": "32.14",
|
| 394 |
+
"Hard Puzzle Acc": "1.25",
|
| 395 |
+
"Total Puzzles": 1000,
|
| 396 |
+
"Reason Lens": "1005.17"
|
| 397 |
+
},
|
| 398 |
{
|
| 399 |
"Model": "reka-flash-20240226",
|
| 400 |
"Mode": "greedy",
|
|
|
|
| 417 |
"Total Puzzles": 1000,
|
| 418 |
"Reason Lens": "1148.16"
|
| 419 |
},
|
| 420 |
+
{
|
| 421 |
+
"Model": "Mixtral-8x7B-Instruct-v0.1",
|
| 422 |
+
"Mode": "greedy",
|
| 423 |
+
"Puzzle Acc": "8.70",
|
| 424 |
+
"Cell Acc": "26.47",
|
| 425 |
+
"No answer": "20.30",
|
| 426 |
+
"Easy Puzzle Acc": "28.93",
|
| 427 |
+
"Hard Puzzle Acc": "0.83",
|
| 428 |
+
"Total Puzzles": 1000,
|
| 429 |
+
"Reason Lens": "1177.21"
|
| 430 |
+
},
|
| 431 |
{
|
| 432 |
"Model": "Qwen2-7B-Instruct",
|
| 433 |
"Mode": "greedy",
|
model_info.json
CHANGED
|
@@ -64,5 +64,9 @@
|
|
| 64 |
"SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
|
| 65 |
"nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
|
| 66 |
"Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
|
| 67 |
-
"mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1", "hf_model_id": "mistralai/mathstral-7B-v0.1"}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
}
|
|
|
|
| 64 |
"SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
|
| 65 |
"nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
|
| 66 |
"Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
|
| 67 |
+
"mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1 🚨", "hf_model_id": "mistralai/mathstral-7B-v0.1"},
|
| 68 |
+
"Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
|
| 69 |
+
"Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
|
| 70 |
+
"Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
|
| 71 |
+
"Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"}
|
| 72 |
}
|