Upload 7 files
- Acrostic.json +130 -0
- Crossword.json +130 -0
- Cryptogram.json +130 -0
- Drop_Quote.json +130 -0
- Logic_Puzzle.json +130 -0
- Sudoku.json +130 -0
- index.html +305 -19
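
All six result files share one flat schema, which index.html renders directly: a JSON array of records whose fields Model, CR, S-Acc, EM, PM-0.5, and Tokens are all stored as strings. The sketch below is not part of this commit (the script name and its placement in the Space root are assumptions); it shows one way to check a file against that schema before uploading.

// validate_results.js — sketch only: checks that each uploaded results file is an
// array of records carrying the six string-valued fields the leaderboard page expects.
// Assumes Node.js, run from the Space root.
const fs = require("fs");

const FILES = ["Acrostic.json", "Crossword.json", "Cryptogram.json",
               "Drop_Quote.json", "Logic_Puzzle.json", "Sudoku.json"];
const KEYS = ["Model", "CR", "S-Acc", "EM", "PM-0.5", "Tokens"];

for (const file of FILES) {
  const rows = JSON.parse(fs.readFileSync(file, "utf8"));
  if (!Array.isArray(rows)) throw new Error(`${file}: expected a JSON array`);
  for (const [i, row] of rows.entries()) {
    for (const key of KEYS) {
      if (typeof row[key] !== "string") {
        throw new Error(`${file}, row ${i}: missing or non-string field "${key}"`);
      }
    }
  }
  console.log(`${file}: ${rows.length} rows OK`);
}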
Acrostic.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "43.0",
+        "S-Acc": "5.5",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "3712"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "84.0",
+        "S-Acc": "35.8",
+        "EM": "0.0",
+        "PM-0.5": "21.0",
+        "Tokens": "3565"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "97.0",
+        "S-Acc": "40.8",
+        "EM": "0.0",
+        "PM-0.5": "28.0",
+        "Tokens": "3584"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "75.0",
+        "S-Acc": "7.9",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "4599"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "67.0",
+        "S-Acc": "5.5",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "4170"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "98.0",
+        "S-Acc": "39.4",
+        "EM": "0.0",
+        "PM-0.5": "20.0",
+        "Tokens": "4279"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "42.0",
+        "S-Acc": "3.6",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "4159"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "100.0",
+        "S-Acc": "31.8",
+        "EM": "0.0",
+        "PM-0.5": "2.0",
+        "Tokens": "4073"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "100.0",
+        "S-Acc": "39.3",
+        "EM": "0.0",
+        "PM-0.5": "18.0",
+        "Tokens": "4110"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "97.0",
+        "S-Acc": "31.6",
+        "EM": "0.0",
+        "PM-0.5": "6.0",
+        "Tokens": "4964"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "62.2",
+        "EM": "0.0",
+        "PM-0.5": "83.0",
+        "Tokens": "10076"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "98.0",
+        "S-Acc": "48.0",
+        "EM": "0.0",
+        "PM-0.5": "48.0",
+        "Tokens": "4019"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "92.0",
+        "S-Acc": "40.7",
+        "EM": "0.0",
+        "PM-0.5": "27.0",
+        "Tokens": "4256"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "100.0",
+        "S-Acc": "56.0",
+        "EM": "0.0",
+        "PM-0.5": "67.0",
+        "Tokens": "3229"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "97.0",
+        "S-Acc": "34.7",
+        "EM": "0.0",
+        "PM-0.5": "12.0",
+        "Tokens": "10951"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "100.0",
+        "S-Acc": "67.2",
+        "EM": "0.0",
+        "PM-0.5": "90.0",
+        "Tokens": "14847"
+    }
+]
Crossword.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "61.3",
+        "S-Acc": "23.3",
+        "EM": "0.0",
+        "PM-0.5": "14.0",
+        "Tokens": "2887"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "77.3",
+        "S-Acc": "46.8",
+        "EM": "0.0",
+        "PM-0.5": "62.0",
+        "Tokens": "3071"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "85.3",
+        "S-Acc": "47.6",
+        "EM": "0.0",
+        "PM-0.5": "65.3",
+        "Tokens": "2612"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "94.0",
+        "S-Acc": "23.0",
+        "EM": "0.0",
+        "PM-0.5": "6.7",
+        "Tokens": "3655"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "98.7",
+        "S-Acc": "48.3",
+        "EM": "0.0",
+        "PM-0.5": "54.0",
+        "Tokens": "3134"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "99.3",
+        "S-Acc": "62.8",
+        "EM": "2.0",
+        "PM-0.5": "86.0",
+        "Tokens": "3237"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "98.7",
+        "S-Acc": "21.1",
+        "EM": "0.0",
+        "PM-0.5": "3.3",
+        "Tokens": "2441"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "100.0",
+        "S-Acc": "34.6",
+        "EM": "0.0",
+        "PM-0.5": "20.0",
+        "Tokens": "2560"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "100.0",
+        "S-Acc": "44.1",
+        "EM": "0.0",
+        "PM-0.5": "36.7",
+        "Tokens": "2734"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "80.0",
+        "S-Acc": "30.2",
+        "EM": "0.0",
+        "PM-0.5": "18.0",
+        "Tokens": "4816"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "75.3",
+        "EM": "16.7",
+        "PM-0.5": "94.0",
+        "Tokens": "9809"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "98.7",
+        "S-Acc": "61.6",
+        "EM": "0.0",
+        "PM-0.5": "83.3",
+        "Tokens": "2555"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "94.7",
+        "S-Acc": "57.7",
+        "EM": "1.3",
+        "PM-0.5": "79.3",
+        "Tokens": "2648"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "100.0",
+        "S-Acc": "63.0",
+        "EM": "1.3",
+        "PM-0.5": "86.7",
+        "Tokens": "1726"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "95.3",
+        "S-Acc": "45.5",
+        "EM": "1.3",
+        "PM-0.5": "54.0",
+        "Tokens": "7840"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "98.0",
+        "S-Acc": "77.7",
+        "EM": "24.7",
+        "PM-0.5": "89.3",
+        "Tokens": "10098"
+    }
+]
Cryptogram.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "43.0",
+        "S-Acc": "2.3",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "2067"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "62.0",
+        "S-Acc": "6.9",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "1297"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "99.0",
+        "S-Acc": "14.3",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "1137"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "99.0",
+        "S-Acc": "4.3",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "1095"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "95.0",
+        "S-Acc": "7.0",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "1233"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "96.0",
+        "S-Acc": "13.7",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "1204"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "81.0",
+        "S-Acc": "3.5",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "1181"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "89.0",
+        "S-Acc": "9.8",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "1303"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "85.0",
+        "S-Acc": "11.8",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "1726"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "47.0",
+        "S-Acc": "3.6",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "6491"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "26.0",
+        "EM": "4.0",
+        "PM-0.5": "21.0",
+        "Tokens": "10344"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "47.0",
+        "S-Acc": "8.5",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "1585"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "68.0",
+        "S-Acc": "11.2",
+        "EM": "0.0",
+        "PM-0.5": "2.0",
+        "Tokens": "4167"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "100.0",
+        "S-Acc": "20.7",
+        "EM": "0.0",
+        "PM-0.5": "5.0",
+        "Tokens": "739"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "100.0",
+        "S-Acc": "22.7",
+        "EM": "1.0",
+        "PM-0.5": "13.0",
+        "Tokens": "11208"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "92.0",
+        "S-Acc": "34.8",
+        "EM": "13.0",
+        "PM-0.5": "29.0",
+        "Tokens": "12567"
+    }
+]
Drop_Quote.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "44.0",
+        "S-Acc": "11.2",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "2122"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "82.0",
+        "S-Acc": "27.7",
+        "EM": "0.0",
+        "PM-0.5": "12.0",
+        "Tokens": "1498"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "99.0",
+        "S-Acc": "29.0",
+        "EM": "0.0",
+        "PM-0.5": "13.0",
+        "Tokens": "918"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "66.0",
+        "S-Acc": "6.6",
+        "EM": "0.0",
+        "PM-0.5": "1.0",
+        "Tokens": "2336"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "97.0",
+        "S-Acc": "26.9",
+        "EM": "0.0",
+        "PM-0.5": "6.0",
+        "Tokens": "1614"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "98.0",
+        "S-Acc": "24.7",
+        "EM": "0.0",
+        "PM-0.5": "9.0",
+        "Tokens": "1565"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "98.0",
+        "S-Acc": "21.9",
+        "EM": "0.0",
+        "PM-0.5": "4.0",
+        "Tokens": "1851"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "95.0",
+        "S-Acc": "28.4",
+        "EM": "0.0",
+        "PM-0.5": "14.0",
+        "Tokens": "1197"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "94.0",
+        "S-Acc": "30.9",
+        "EM": "0.0",
+        "PM-0.5": "13.0",
+        "Tokens": "1756"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "33.0",
+        "S-Acc": "7.5",
+        "EM": "0.0",
+        "PM-0.5": "8.0",
+        "Tokens": "6078"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "47.3",
+        "EM": "7.0",
+        "PM-0.5": "42.0",
+        "Tokens": "11422"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "92.0",
+        "S-Acc": "34.2",
+        "EM": "0.0",
+        "PM-0.5": "17.0",
+        "Tokens": "2717"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "96.0",
+        "S-Acc": "34.4",
+        "EM": "0.0",
+        "PM-0.5": "23.0",
+        "Tokens": "3385"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "99.0",
+        "S-Acc": "31.1",
+        "EM": "0.0",
+        "PM-0.5": "14.0",
+        "Tokens": "1165"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "96.0",
+        "S-Acc": "34.3",
+        "EM": "2.0",
+        "PM-0.5": "21.0",
+        "Tokens": "13255"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "97.0",
+        "S-Acc": "38.8",
+        "EM": "13.0",
+        "PM-0.5": "38.0",
+        "Tokens": "13595"
+    }
+]
Logic_Puzzle.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "57.0",
+        "S-Acc": "16.0",
+        "EM": "0.0",
+        "PM-0.5": "8.0",
+        "Tokens": "1292"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "56.0",
+        "S-Acc": "22.8",
+        "EM": "2.0",
+        "PM-0.5": "18.0",
+        "Tokens": "1164"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "80.5",
+        "S-Acc": "32.2",
+        "EM": "1.0",
+        "PM-0.5": "25.0",
+        "Tokens": "1738"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "97.0",
+        "S-Acc": "19.1",
+        "EM": "0.0",
+        "PM-0.5": "4.5",
+        "Tokens": "1617"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "99.5",
+        "S-Acc": "30.7",
+        "EM": "0.5",
+        "PM-0.5": "12.5",
+        "Tokens": "1514"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "100.0",
+        "S-Acc": "38.3",
+        "EM": "3.0",
+        "PM-0.5": "30.5",
+        "Tokens": "1636"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "96.5",
+        "S-Acc": "25.8",
+        "EM": "0.0",
+        "PM-0.5": "8.5",
+        "Tokens": "1395"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "93.0",
+        "S-Acc": "32.2",
+        "EM": "0.0",
+        "PM-0.5": "22.5",
+        "Tokens": "1207"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "93.5",
+        "S-Acc": "34.0",
+        "EM": "0.0",
+        "PM-0.5": "23.0",
+        "Tokens": "1809"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "78.5",
+        "S-Acc": "46.3",
+        "EM": "19.5",
+        "PM-0.5": "48.0",
+        "Tokens": "9523"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "69.4",
+        "EM": "42.5",
+        "PM-0.5": "68.0",
+        "Tokens": "9204"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "58.0",
+        "S-Acc": "24.2",
+        "EM": "2.0",
+        "PM-0.5": "20.0",
+        "Tokens": "2103"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "99.0",
+        "S-Acc": "45.9",
+        "EM": "8.0",
+        "PM-0.5": "37.5",
+        "Tokens": "4037"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "100.0",
+        "S-Acc": "39.3",
+        "EM": "3.5",
+        "PM-0.5": "29.5",
+        "Tokens": "953"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "99.0",
+        "S-Acc": "57.2",
+        "EM": "23.5",
+        "PM-0.5": "53.5",
+        "Tokens": "10242"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "99.0",
+        "S-Acc": "68.8",
+        "EM": "41.0",
+        "PM-0.5": "68.5",
+        "Tokens": "9449"
+    }
+]
Sudoku.json ADDED
@@ -0,0 +1,130 @@
+[
+    {
+        "Model": "llama-3.1-8b",
+        "CR": "7.5",
+        "S-Acc": "1.2",
+        "EM": "0.0",
+        "PM-0.5": "0.0",
+        "Tokens": "2782"
+    },
+    {
+        "Model": "llama-3.1-70b",
+        "CR": "69.5",
+        "S-Acc": "24.2",
+        "EM": "1.0",
+        "PM-0.5": "17.5",
+        "Tokens": "1939"
+    },
+    {
+        "Model": "llama-3.3-70b",
+        "CR": "93.5",
+        "S-Acc": "34.8",
+        "EM": "7.0",
+        "PM-0.5": "22.5",
+        "Tokens": "1061"
+    },
+    {
+        "Model": "mistral-7b",
+        "CR": "84.0",
+        "S-Acc": "11.9",
+        "EM": "0.0",
+        "PM-0.5": "1.5",
+        "Tokens": "3108"
+    },
+    {
+        "Model": "mistral-small-22b",
+        "CR": "89.0",
+        "S-Acc": "20.5",
+        "EM": "0.5",
+        "PM-0.5": "7.5",
+        "Tokens": "1968"
+    },
+    {
+        "Model": "mistral-large-123b",
+        "CR": "85.5",
+        "S-Acc": "39.5",
+        "EM": "10.0",
+        "PM-0.5": "33.5",
+        "Tokens": "1955"
+    },
+    {
+        "Model": "qwen-2.5-7b",
+        "CR": "94.5",
+        "S-Acc": "30.2",
+        "EM": "1.5",
+        "PM-0.5": "15.0",
+        "Tokens": "1486"
+    },
+    {
+        "Model": "qwen-2.5-32b",
+        "CR": "100.0",
+        "S-Acc": "42.8",
+        "EM": "3.5",
+        "PM-0.5": "30.5",
+        "Tokens": "1201"
+    },
+    {
+        "Model": "qwen-2.5-72b",
+        "CR": "97.5",
+        "S-Acc": "43.0",
+        "EM": "5.5",
+        "PM-0.5": "34.0",
+        "Tokens": "2013"
+    },
+    {
+        "Model": "qwq-32b",
+        "CR": "54.5",
+        "S-Acc": "40.1",
+        "EM": "31.5",
+        "PM-0.5": "35.5",
+        "Tokens": "8381"
+    },
+    {
+        "Model": "deepseek-R1",
+        "CR": "100.0",
+        "S-Acc": "70.3",
+        "EM": "50.0",
+        "PM-0.5": "64.0",
+        "Tokens": "8276"
+    },
+    {
+        "Model": "gemini-2.0-exp",
+        "CR": "93.0",
+        "S-Acc": "45.3",
+        "EM": "12.5",
+        "PM-0.5": "37.5",
+        "Tokens": "2842"
+    },
+    {
+        "Model": "gemini-2.0-thinking",
+        "CR": "79.5",
+        "S-Acc": "46.5",
+        "EM": "16.5",
+        "PM-0.5": "41.0",
+        "Tokens": "3852"
+    },
+    {
+        "Model": "openai-gpt-4o",
+        "CR": "100.0",
+        "S-Acc": "52.2",
+        "EM": "14.5",
+        "PM-0.5": "48.0",
+        "Tokens": "1103"
+    },
+    {
+        "Model": "openai-o1-mini",
+        "CR": "99.0",
+        "S-Acc": "53.4",
+        "EM": "27.0",
+        "PM-0.5": "43.0",
+        "Tokens": "3960"
+    },
+    {
+        "Model": "openai-o1-preview",
+        "CR": "91.5",
+        "S-Acc": "65.1",
+        "EM": "50.0",
+        "PM-0.5": "55.5",
+        "Tokens": "8061"
+    }
+]
index.html CHANGED
@@ -1,19 +1,305 @@
-<!
-<html>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>title</title>
+    <style>
+        :root {
+            --primary-color: #2c3e50; /* Dark blue-gray for sophistication */
+            --secondary-color: #34495e; /* Complementary darker shade */
+            --background-color: #f5f5f5; /* Light gray for a clean look */
+            --text-color: #2c3e50; /* Dark text for readability */
+            --accent-color: #e74c3c; /* Elegant accent for highlights */
+            --header-text-color: #ffffff; /* White for table headers */
+        }
+        body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            line-height: 1.6;
+            color: var(--text-color);
+            max-width: 1200px;
+            margin: 0 auto;
+            padding: 20px;
+            background-color: var(--background-color);
+        }
+        h1 {
+            color: var(--primary-color);
+            text-align: center;
+            font-size: 2rem;
+            margin-bottom: 30px;
+        }
+        .tab {
+            display: flex;
+            justify-content: center;
+            margin-bottom: 20px;
+        }
+        .tab button {
+            background-color: var(--primary-color);
+            color: var(--header-text-color);
+            border: none;
+            outline: none;
+            cursor: pointer;
+            padding: 12px 18px;
+            font-size: 14px;
+            border-radius: 5px;
+            margin: 0 5px;
+            transition: background-color 0.3s ease, transform 0.2s ease;
+        }
+        .tab button:hover {
+            background-color: var(--secondary-color);
+            transform: scale(1.05);
+        }
+        .tab button.active {
+            background-color: var(--accent-color);
+        }
+        #taskDescription {
+            text-align: left;
+            font-size: 1rem;
+            margin-bottom: 20px;
+            color: var(--text-color);
+        }
+        .tabcontent {
+            display: none;
+            padding: 20px;
+            background-color: white;
+            border-radius: 5px;
+            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+        }
+        table {
+            border-collapse: collapse;
+            width: 100%;
+            margin-bottom: 20px;
+            background-color: white;
+            border-radius: 5px;
+            overflow: hidden;
+            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+        }
+        th, td {
+            text-align: left;
+            padding: 12px;
+            font-size: 14px;
+            color: var(--text-color);
+            border-bottom: 1px solid #ddd;
+        }
+        th {
+            background-color: var(--primary-color);
+            color: var(--header-text-color);
+            font-size: 15px;
+            font-weight: bold;
+        }
+        tr:nth-child(odd) {
+            background-color: #fdfdfd; /* Light background for odd rows */
+        }
+        tr:nth-child(even) {
+            background-color: #f7f9fc; /* Slightly darker background for even rows */
+        }
+        /* Table row hover style */
+        tr:hover {
+            background-color: var(--secondary-color);
+        }
+        /* Text color for hovered cells */
+        tr:hover th, tr:hover td {
+            color: var(--header-text-color); /* Keep the text white */
+        }
+        /* Link color on hover */
+        tr:hover a {
+            color: inherit; /* Inherit the color from the parent element */
+        }
+        a {
+            color: var(--accent-color);
+            text-decoration: none;
+            font-weight: bold;
+        }
+        a:hover {
+            text-decoration: underline;
+        }
+        th a {
+            color: inherit;
+            display: block;
+            text-align: left;
+            font-size: inherit;
+            padding: 0;
+        }
+        th:hover, th a:hover {
+            background-color: var(--accent-color);
+        }
+    </style>
+
+    <script type="text/javascript" async
+        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
+    </script>
+    <script type="text/javascript">
+        window.onload = function() {
+            MathJax.Hub.Config({
+                tex2jax: {
+                    inlineMath: [['$', '$'], ['\\(', '\\)']],
+                    displayMath: [['$$', '$$'], ['\\[', '\\]']]
+                },
+                TeX: {
+                    extensions: ['AMSmath.js', 'AMSsymbols.js']
+                }
+            });
+
+            MathJax.Hub.Queue(["Typeset", MathJax.Hub]);
+        };
+    </script>
+</head>
+<body>
+    <h1>\( LR{}^{2} \)Bench: Evaluating Long-chain Reflective Reasoning Capabilities of Large Language Models via Constraint Satisfaction Problems</h1>
+    <p>
+        description
+    </p>
+    <hr />
+
+    <div class="tab">
+        <button class="tablinks" onclick="openTab(event, 'Acrostic')" id="defaultOpen">Acrostic</button>
+        <button class="tablinks" onclick="openTab(event, 'Crossword')">Crossword</button>
+        <button class="tablinks" onclick="openTab(event, 'Cryptogram')">Cryptogram</button>
+        <button class="tablinks" onclick="openTab(event, 'Logic Puzzle')">Logic Puzzle</button>
+        <button class="tablinks" onclick="openTab(event, 'Sudoku')">Sudoku</button>
+        <button class="tablinks" onclick="openTab(event, 'Drop Quote')">Drop Quote</button>
+    </div>
+    <div id="taskDescription"></div>
+
+    <div id="Acrostic" class="tabcontent"></div>
+    <div id="Drop Quote" class="tabcontent"></div>
+    <div id="Crossword" class="tabcontent"></div>
+    <div id="Logic Puzzle" class="tabcontent"></div>
+    <div id="Cryptogram" class="tabcontent"></div>
+    <div id="Sudoku" class="tabcontent"></div>
+
+    <script>
+        const descriptions = {
+            'Acrostic': 'The Acrostic task involves word clues like Crossword, but its objective is to form a hidden quotation or sentence from the answers to the clues. This requires that the answer words not only satisfy the corresponding clues but also effectively integrate to construct the ultimate hidden message. We collected 50 easy and 50 hard Acrostic samples from <a href="https://www.printable-puzzles.com/printable-acrostic-puzzles.php" target="_blank">Printable Puzzles</a> with timestamps ranging from September 2024 to December 2024.',
+            'Crossword': 'The Crossword task requires inferring correct words from given clues and filling them into a grid. A key challenge lies in satisfying the constraint of shared letter intersections between horizontal and vertical words. We collected 150 Crossword samples published in 2024 from <a href="https://www.latimes.com" target="_blank">Los Angeles Times</a> and <a href="https://www.vulture.com" target="_blank">Vulture</a> in three sizes: $5\\times5$, $10\\times10$, and $15\\times15$, with 50 samples of each size.',
+            'Logic Puzzle': 'The Logic Puzzle task constitutes a problem that necessitates logical reasoning to deduce relationships between a set of entities based on the given constraints and clues. The objective is to systematically analyze the given information, employing techniques such as hypothesis formation, elimination, and deductive inference, to determine a unique solution that satisfies all given constraints. We collected 50 puzzles for each of the four sizes ($4\\times4$, $4\\times5$, $4\\times6$, and $4\\times7$) from <a href="https://www.printable-puzzles.com/printable-logic-puzzles.php" target="_blank">Printable Puzzles</a>, with timestamps ranging from September 2024 to December 2024.',
+            'Cryptogram': 'The Cryptogram task involves the decryption of an encrypted quotation or sentence, where each letter of an original text is substituted with another, resulting in an apparently nonsensical text. Decryption requires identifying patterns, common letter frequencies, and word structures to deduce the letter-to-letter correspondences, ultimately reconstructing the original content. We collected 50 easy and 50 hard samples from <a href="https://www.printable-puzzles.com/printable-cryptograms.php" target="_blank">Printable Puzzles</a> with timestamps ranging from September 2024 to December 2024.',
+            'Sudoku': 'The Sudoku task consists of filling an \\( n^2 \\times n^2 \\) grid with digits from \\( 1 \\) to \\( n^2 \\), subject to the constraint that each row, column, and \\( n \\times n \\) subgrid contains all digits from \\( 1 \\) to \\( n^2 \\) without repetition. Success in Sudoku relies on logical deduction and careful consideration of the existing digits to determine valid placements for the remaining numbers. From <a href="https://1sudoku.com" target="_blank">1sudoku</a>, we collected 200 Sudoku samples in total: 50 easy and 50 hard samples for both \\( 4\\times4 \\) and \\( 9\\times9 \\) sizes.',
+            'Drop Quote': 'The Drop Quote task comprises a grid of multiple rows and columns, with each column providing a set of candidate letters. The task requires determining the correct row for the letters in each column, effectively "dropping" each one into its target place to reveal the hidden quotation. We created 50 easy samples by manually compiling common quotations, and collected 50 hard samples from <a href="https://www.printable-puzzles.com/printable-drop-quotes.php" target="_blank">Printable Puzzles</a>, with timestamps ranging from September 2024 to December 2024.'
+        };
+        function openTab(evt, tabName) {
+            var i, tabcontent, tablinks;
+            tabcontent = document.getElementsByClassName("tabcontent");
+            for (i = 0; i < tabcontent.length; i++) {
+                tabcontent[i].style.display = "none";
+            }
+            tablinks = document.getElementsByClassName("tablinks");
+            for (i = 0; i < tablinks.length; i++) {
+                tablinks[i].className = tablinks[i].className.replace(" active", "");
+            }
+            document.getElementById(tabName).style.display = "block";
+            evt.currentTarget.className += " active";
+
+            const descriptionElement = document.getElementById("taskDescription");
+            descriptionElement.innerHTML = descriptions[tabName] || "Select a task to see its description.";
+            MathJax.Hub.Queue(["Typeset", MathJax.Hub, "taskDescription"]);
+        }
+
+        document.getElementById("defaultOpen").click();
+
+        const urls = {
+            'Acrostic': 'Acrostic.json',
+            'Drop Quote': 'Drop_Quote.json',
+            'Crossword': 'Crossword.json',
+            'Sudoku': 'Sudoku.json',
+            'Logic Puzzle': 'Logic_Puzzle.json',
+            'Cryptogram': 'Cryptogram.json',
+        };
+
+        function createTable(data, tableId) {
+            let table = `<table id="${tableId}">`;
+            table += '<thead><tr>';
+            for (let key in data[0]) {
+                table += `<th><a href="javascript:void(0);" onclick="sortTable('${tableId}', ${Object.keys(data[0]).indexOf(key)})">${key}</a></th>`;
+            }
+            table += '</tr></thead><tbody>';
+            data.forEach(row => {
+                table += '<tr>';
+                for (let key in row) {
+                    table += `<td>${row[key]}</td>`;
+                }
+                table += '</tr>';
+            });
+            table += '</tbody></table>';
+            return table;
+        }
+
+        function sortTable(tableId, n) {
+            var table, rows, switching, i, x, y, shouldSwitch, dir, switchcount = 0;
+            table = document.getElementById(tableId);
+            switching = true;
+            dir = "asc";
+            while (switching) {
+                switching = false;
+                rows = table.rows;
+                for (i = 1; i < (rows.length - 1); i++) {
+                    shouldSwitch = false;
+                    x = rows[i].getElementsByTagName("TD")[n];
+                    y = rows[i + 1].getElementsByTagName("TD")[n];
+                    if (dir == "asc") {
+                        if (isNaN(x.innerHTML)) {
+                            if (x.innerHTML.toLowerCase() > y.innerHTML.toLowerCase()) {
+                                shouldSwitch = true;
+                                break;
+                            }
+                        } else {
+                            if (Number(x.innerHTML) > Number(y.innerHTML)) {
+                                shouldSwitch = true;
+                                break;
+                            }
+                        }
+                    } else if (dir == "desc") {
+                        if (isNaN(x.innerHTML)) {
+                            if (x.innerHTML.toLowerCase() < y.innerHTML.toLowerCase()) {
+                                shouldSwitch = true;
+                                break;
+                            }
+                        } else {
+                            if (Number(x.innerHTML) < Number(y.innerHTML)) {
+                                shouldSwitch = true;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if (shouldSwitch) {
+                    rows[i].parentNode.insertBefore(rows[i + 1], rows[i]);
+                    switching = true;
+                    switchcount++;
+                } else {
+                    if (switchcount == 0 && dir == "asc") {
+                        dir = "desc";
+                        switching = true;
+                    }
+                }
+            }
+        }
+
+        Object.keys(urls).forEach(key => {
+            fetch(urls[key])
+                .then(response => response.json())
+                .then(data => {
+                    let content = '';
+                    if (Array.isArray(data)) {
+                        content = createTable(data, `table-${key}`);
+                    } else if (typeof data === 'object') {
+                        let tableCounter = 0;
+                        for (let dataKey in data) {
+                            content += `<h2>${dataKey}</h2>`;
+                            content += createTable(data[dataKey], `table-${key}-${tableCounter}`);
+                            tableCounter++;
+                        }
+                    }
+                    document.getElementById(key).innerHTML = content;
+                    MathJax.Hub.Queue(["Typeset", MathJax.Hub]);
+                })
+                .catch(error => {
+                    console.error('Error:', error);
+                    document.getElementById(key).innerHTML = `<p>Error loading data: ${error.message}</p>`;
+                });
+        });
+    </script>
+</body>
+</html>
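
One practical note on previewing this page: the tables are loaded with fetch(), so opening index.html straight from disk (file://) will usually be blocked; the page has to be served over HTTP, which the Space runtime does once deployed. For local testing, a minimal static server along these lines would work; this is a sketch only and not part of the commit, assuming Node.js is available, with the file name and port chosen arbitrarily.

// serve.js — tiny static server for local preview of the leaderboard.
// Run `node serve.js` from the Space root and open http://localhost:8000/.
const http = require("http");
const fs = require("fs");
const path = require("path");

const TYPES = { ".html": "text/html", ".json": "application/json", ".js": "text/javascript" };

http.createServer((req, res) => {
  // Map "/" to index.html and strip any query string from the requested path.
  const name = req.url === "/" ? "index.html" : decodeURIComponent(req.url.split("?")[0].slice(1));
  fs.readFile(path.join(__dirname, name), (err, data) => {
    if (err) {
      res.writeHead(404, { "Content-Type": "text/plain" });
      res.end("Not found");
      return;
    }
    res.writeHead(200, { "Content-Type": TYPES[path.extname(name)] || "application/octet-stream" });
    res.end(data);
  });
}).listen(8000, () => console.log("Serving on http://localhost:8000/"));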