Commit
·
713e157
1
Parent(s):
b29bfe7
- src/lib/benchmarks/ index.ts +2 -0
- src/lib/benchmarks/deepseek.ts +111 -0
src/lib/benchmarks/ index.ts
CHANGED
|
@@ -3,10 +3,12 @@ import { xaiBenchmarks } from "./xai";
|
|
| 3 |
import { googleBenchmarks } from "./google";
|
| 4 |
import { anthropicBenchmarks } from "./anthropic";
|
| 5 |
import { openaiBenchmarks } from "./openai";
|
|
|
|
| 6 |
|
| 7 |
/** Combined benchmark list: the xai, google, anthropic and openai arrays are spread in, in that order. */
export const benchmarkData: Benchmark[] = [
|
| 8 |
...xaiBenchmarks,
|
| 9 |
...googleBenchmarks,
|
| 10 |
...anthropicBenchmarks,
|
| 11 |
...openaiBenchmarks,
|
|
|
|
| 12 |
];
|
|
|
|
| 3 |
import { googleBenchmarks } from "./google";
|
| 4 |
import { anthropicBenchmarks } from "./anthropic";
|
| 5 |
import { openaiBenchmarks } from "./openai";
|
| 6 |
+
import { deepseekBenchmarks } from "./deepseek";
|
| 7 |
|
| 8 |
/**
 * Combined benchmark list: the xai, google, anthropic, openai and deepseek
 * arrays are spread in, in that order, so the DeepSeek entries come last.
 */
export const benchmarkData: Benchmark[] = [
|
| 9 |
...xaiBenchmarks,
|
| 10 |
...googleBenchmarks,
|
| 11 |
...anthropicBenchmarks,
|
| 12 |
...openaiBenchmarks,
|
| 13 |
+
...deepseekBenchmarks
|
| 14 |
];
|
src/lib/benchmarks/deepseek.ts
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Benchmark } from "./types";
|
| 2 |
+
|
| 3 |
+
/**
 * Benchmark entries for four DeepSeek models: DeepSeek-R1-0528,
 * DeepSeek-V3-0324, DeepSeek-V3 and DeepSeek-R1.
 *
 * Each entry carries the model name, provider, input/output price, a map of
 * benchmark scores and the source URL the entry points to.
 *
 * NOTE(review): every price below is marked as a placeholder and the price
 * unit is not visible in this file — confirm both against the Benchmark type
 * and official pricing before relying on them.
 */
export const deepseekBenchmarks: Benchmark[] = [
|
| 4 |
+
{
|
| 5 |
+
model: "DeepSeek-R1-0528",
|
| 6 |
+
provider: "DeepSeek",
|
| 7 |
+
inputPrice: 0.55, // Placeholder value; replace once official pricing is confirmed
|
| 8 |
+
outputPrice: 2.19,
|
| 9 |
+
benchmark: {
|
| 10 |
+
aime_24: 91.4,
|
| 11 |
+
aime_2025: 87.5,
|
| 12 |
+
gpqa_diamond: 81.0,
|
| 13 |
+
gpqa: 81.0, // Same value as gpqa_diamond, kept for compatibility; remove if only gpqa_diamond should be used
|
| 14 |
+
mmlu_pro: 85.0,
|
| 15 |
+
mmlu: 93.4, // MMLU-Redux score stored under "mmlu" (assumption — TODO confirm the two are comparable)
|
| 16 |
+
simpleqa: 27.8,
|
| 17 |
+
lcb: 73.3, // LiveCodeBench
|
| 18 |
+
aider_polyglot: 71.6,
|
| 19 |
+
swe_bench_verified: 57.6,
|
| 20 |
+
// Optional or less frequently reported benchmarks:
|
| 21 |
+
humanitys_last_exam: 17.7,
|
| 22 |
+
// Metrics not present in BenchmarkMetric; left commented out so the object still type-checks:
|
| 23 |
+
// codeforces_div1: 1930,
|
| 24 |
+
// frames: 83.0,
|
| 25 |
+
// tau_bench_airline: 53.5,
|
| 26 |
+
// tau_bench_retail: 63.9,
|
| 27 |
+
// bfcl_v3_multiturn: 37.0,
|
| 28 |
+
// cnmo_2024: 86.9,
|
| 29 |
+
// hmmt_2025: 79.4,
|
| 30 |
+
},
|
| 31 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
|
| 32 |
+
},
|
| 33 |
+
|
| 34 |
+
{
|
| 35 |
+
model: "DeepSeek-V3-0324",
|
| 36 |
+
provider: "DeepSeek",
|
| 37 |
+
inputPrice: 0.27, // Placeholder value; adjust once official pricing is confirmed
|
| 38 |
+
outputPrice: 1.10,
|
| 39 |
+
benchmark: {
|
| 40 |
+
mmlu: 87.1, // Carried over from the original DeepSeek-V3 entry
|
| 41 |
+
mmlu_pro: 81.2, // Updated for V3-0324
|
| 42 |
+
gpqa: 68.4, // Updated for V3-0324
|
| 43 |
+
gpqa_diamond: 59.1, // Carried over from V3 — NOTE(review): gpqa above is marked as updated but this is not; confirm which GPQA variant 68.4 measures
|
| 44 |
+
aime_24: 59.4, // Updated for V3-0324
|
| 45 |
+
lcb: 49.2, // LiveCodeBench, updated for V3-0324
|
| 46 |
+
simpleqa: 24.9, // Carried over from V3
|
| 47 |
+
aider_polyglot: 49.6, // Carried over from V3
|
| 48 |
+
swe_bench_verified: 42.0 // Carried over from V3
|
| 49 |
+
},
|
| 50 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
model: "DeepSeek-V3",
|
| 54 |
+
provider: "DeepSeek",
|
| 55 |
+
inputPrice: 0.27, // Placeholder value; update once real pricing is known
|
| 56 |
+
outputPrice: 1.10,
|
| 57 |
+
benchmark: {
|
| 58 |
+
mmlu: 87.1,
|
| 59 |
+
mmlu_pro: 64.4,
|
| 60 |
+
// mmlu_redux: 86.2, // Commented out: not in BenchmarkMetric. NOTE(review): the DeepSeek-R1-0528 entry stores its MMLU-Redux score under "mmlu" instead — confirm which handling is intended
|
| 61 |
+
gpqa_diamond: 59.1,
|
| 62 |
+
simpleqa: 24.9,
|
| 63 |
+
aime_24: 39.2,
|
| 64 |
+
lcb: 37.6, // LiveCodeBench (Pass@1)
|
| 65 |
+
aider_polyglot: 49.6,
|
| 66 |
+
swe_bench_verified: 42.0,
|
| 67 |
+
|
| 68 |
+
// Metrics that are optional or not yet part of the schema, kept commented out for reference:
|
| 69 |
+
// humanitys_last_exam: undefined,
|
| 70 |
+
// codeforces: 51.6,
|
| 71 |
+
// drop: 89.0,
|
| 72 |
+
// gsm8k: 89.3,
|
| 73 |
+
// math_em: 61.6,
|
| 74 |
+
// mgsm: 79.8,
|
| 75 |
+
// cmath: 90.7,
|
| 76 |
+
// cruxeval_i: 67.3,
|
| 77 |
+
// cruxeval_o: 69.8,
|
| 78 |
+
// triviaqa: 82.9,
|
| 79 |
+
// naturalquestions: 40.0,
|
| 80 |
+
// agieval: 79.6,
|
| 81 |
+
// hellaSwag: 88.9,
|
| 82 |
+
// piqa: 84.7,
|
| 83 |
+
// winogrande: 84.9,
|
| 84 |
+
},
|
| 85 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
model: "DeepSeek-R1",
|
| 89 |
+
provider: "DeepSeek",
|
| 90 |
+
inputPrice: 0.60, // Placeholder value; update once actual pricing is confirmed
|
| 91 |
+
outputPrice: 1.20,
|
| 92 |
+
benchmark: {
|
| 93 |
+
mmlu: 90.8,
|
| 94 |
+
mmlu_pro: 84.0,
|
| 95 |
+
gpqa_diamond: 71.5,
|
| 96 |
+
simpleqa: 30.1,
|
| 97 |
+
lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
|
| 98 |
+
swe_bench_verified: 49.2,
|
| 99 |
+
aider_polyglot: 53.3,
|
| 100 |
+
aime_24: 79.8,
|
| 101 |
+
// aime_2025: undefined, // no score provided
|
| 102 |
+
// gpqa: undefined, // use gpqa_diamond instead
|
| 103 |
+
// egoschema: undefined,
|
| 104 |
+
// mmmu: undefined,
|
| 105 |
+
// loft: undefined,
|
| 106 |
+
// humanitys_last_exam: undefined, // optional metric
|
| 107 |
+
},
|
| 108 |
+
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
|
| 109 |
+
},
|
| 110 |
+
];
|
| 111 |
+
|