HardcoreLogic / hardcorelogic.summary.json
JunsWan's picture
Upload 2 files
e738640 verified
[
{
"model": "Qwen3-8B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 12.37,
"increased complexity": 15.08,
"uncommon elements": 10.58,
"unsolvable puzzle": 69.54,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-30B-A3B-Thinking-2507",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 37.33,
"increased complexity": "",
"uncommon elements": "" ,
"unsolvable puzzle": 86.09,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-32B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 20.97,
"increased complexity": 25.38 ,
"uncommon elements": 16.93 ,
"unsolvable puzzle": 65.48,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-Next-80B-A3B-Thinking",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 36.35,
"increased complexity": 41.97,
"uncommon elements": 32.13 ,
"unsolvable puzzle": 83.11,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Qwen3-235B-A22B-Thinking-2507",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 43.33,
"increased complexity": 46.93,
"uncommon elements": 40.94 ,
"unsolvable puzzle": 84.41,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "MiniMax-M1-40k",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 6.44,
"increased complexity": 5.27,
"uncommon elements": 6.88 ,
"unsolvable puzzle": 51.39,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-R1-0528-Qwen3-8B",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 13.83,
"increased complexity": "",
"uncommon elements": "" ,
"unsolvable puzzle": 95.19,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-V3.1",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 41.43,
"increased complexity": 44.61,
"uncommon elements": 39.09 ,
"unsolvable puzzle": 88.76,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "DeepSeek-R1-0528",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 41.37,
"increased complexity": 45.87,
"uncommon elements": 37.28 ,
"unsolvable puzzle": 93.50,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "GLM-4.5",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 21.67,
"increased complexity": 24.17,
"uncommon elements": 21.49,
"unsolvable puzzle": 93.26,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Kimi-K2-Instruct",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 15.18,
"increased complexity": 17.33,
"uncommon elements": 14.71,
"unsolvable puzzle": 87.46,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "Seed-OSS-36B-Instruct",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 38.96,
"increased complexity": 41.01,
"uncommon elements": 38.79 ,
"unsolvable puzzle": 85.76,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "gpt-oss-120b",
"mode": "sampling (Temp=0.6)",
"open-source": true,
"total accuracy": 51.97,
"increased complexity": 54.08,
"uncommon elements": 51.11,
"unsolvable puzzle": 93.35,
"temperature": 0.6,
"n_sampling": 4,
"n": 50
},
{
"model": "gpt-5",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 69.10,
"increased complexity": 69.89,
"uncommon elements": 67.88,
"unsolvable puzzle": 97.78,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gpt-5-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 54.49,
"increased complexity": 55.76,
"uncommon elements": 52.13 ,
"unsolvable puzzle": 98.52,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "o4-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 50.13,
"increased complexity": 55.11,
"uncommon elements": 47.13 ,
"unsolvable puzzle": 95.00,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "grok-4",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 59.55,
"increased complexity": 58.26 ,
"uncommon elements": 59.62 ,
"unsolvable puzzle": 97.59,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gemini-2.5-pro",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 40.58,
"increased complexity": 43.80,
"uncommon elements": 39.38 ,
"unsolvable puzzle": 91.48,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "grok-3-mini",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 42.56,
"increased complexity": 48.48,
"uncommon elements": 39.5,
"unsolvable puzzle": 94.63,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "claude-sonnet-4-thinking",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 30.51,
"increased complexity": 34.67,
"uncommon elements": 28.25 ,
"unsolvable puzzle": 57.96,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
},
{
"model": "gemini-2.5-flash",
"mode": "sampling (Temp=0.6)",
"open-source": false,
"total accuracy": 19.49,
"increased complexity": 25.11,
"uncommon elements": 16.00,
"unsolvable puzzle": 57.78,
"temperature": 0.6,
"n_sampling": 4,
"n": 5
}
]