Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	| [ | |
| { | |
| "model": "Qwen3-8B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 12.37, | |
| "increased complexity": 15.08, | |
| "uncommon elements": 10.58, | |
| "unsolvable puzzle": 69.54, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-30B-A3B-Thinking-2507", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 37.33, | |
| "increased complexity": "", | |
| "uncommon elements": "" , | |
| "unsolvable puzzle": 86.09, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-32B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 20.97, | |
| "increased complexity": 25.38 , | |
| "uncommon elements": 16.93 , | |
| "unsolvable puzzle": 65.48, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-Next-80B-A3B-Thinking", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 36.35, | |
| "increased complexity": 41.97, | |
| "uncommon elements": 32.13 , | |
| "unsolvable puzzle": 83.11, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Qwen3-235B-A22B-Thinking-2507", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 43.33, | |
| "increased complexity": 46.93, | |
| "uncommon elements": 40.94 , | |
| "unsolvable puzzle": 84.41, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "MiniMax-M1-40k", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 6.44, | |
| "increased complexity": 5.27, | |
| "uncommon elements": 6.88 , | |
| "unsolvable puzzle": 51.39, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-R1-0528-Qwen3-8B", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 13.83, | |
| "increased complexity": "", | |
| "uncommon elements": "" , | |
| "unsolvable puzzle": 95.19, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-V3.1", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 41.43, | |
| "increased complexity": 44.61, | |
| "uncommon elements": 39.09 , | |
| "unsolvable puzzle": 88.76, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "DeepSeek-R1-0528", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 41.37, | |
| "increased complexity": 45.87, | |
| "uncommon elements": 37.28 , | |
| "unsolvable puzzle": 93.50, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "GLM-4.5", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 21.67, | |
| "increased complexity": 24.17, | |
| "uncommon elements": 21.49, | |
| "unsolvable puzzle": 93.26, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Kimi-K2-Instruct", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 15.18, | |
| "increased complexity": 17.33, | |
| "uncommon elements": 14.71, | |
| "unsolvable puzzle": 87.46, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "Seed-OSS-36B-Instruct", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 38.96, | |
| "increased complexity": 41.01, | |
| "uncommon elements": 38.79 , | |
| "unsolvable puzzle": 85.76, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "gpt-oss-120b", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": true, | |
| "total accuracy": 51.97, | |
| "increased complexity": 54.08, | |
| "uncommon elements": 51.11, | |
| "unsolvable puzzle": 93.35, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 50 | |
| }, | |
| { | |
| "model": "gpt-5", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 69.10, | |
| "increased complexity": 69.89, | |
| "uncommon elements": 67.88, | |
| "unsolvable puzzle": 97.78, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gpt-5-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 54.49, | |
| "increased complexity": 55.76, | |
| "uncommon elements": 52.13 , | |
| "unsolvable puzzle": 98.52, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "o4-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 50.13, | |
| "increased complexity": 55.11, | |
| "uncommon elements": 47.13 , | |
| "unsolvable puzzle": 95.00, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "grok-4", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 59.55, | |
| "increased complexity": 58.26 , | |
| "uncommon elements": 59.62 , | |
| "unsolvable puzzle": 97.59, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gemini-2.5-pro", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 40.58, | |
| "increased complexity": 43.80, | |
| "uncommon elements": 39.38 , | |
| "unsolvable puzzle": 91.48, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "grok-3-mini", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 42.56, | |
| "increased complexity": 48.48, | |
| "uncommon elements": 39.5, | |
| "unsolvable puzzle": 94.63, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "claude-sonnet-4-thinking", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 30.51, | |
| "increased complexity": 34.67, | |
| "uncommon elements": 28.25 , | |
| "unsolvable puzzle": 57.96, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| }, | |
| { | |
| "model": "gemini-2.5-flash", | |
| "mode": "sampling (Temp=0.6)", | |
| "open-source": false, | |
| "total accuracy": 19.49, | |
| "increased complexity": 25.11, | |
| "uncommon elements": 16.00, | |
| "unsolvable puzzle": 57.78, | |
| "temperature": 0.6, | |
| "n_sampling": 4, | |
| "n": 5 | |
| } | |
| ] | 
