Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Harheem Kim
commited on
Commit
·
56f8de2
1
Parent(s):
86584d5
update csv file
Browse files- .DS_Store +0 -0
- combined_evaluation_summary.csv +13 -12
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
combined_evaluation_summary.csv
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
Model,Vendor,Model Type,L1_Total_Tasks,L2_Total_Tasks,L3_Total_Tasks,L4_Total_Tasks,L5_Total_Tasks,L6_Total_Tasks,L7_Total_Tasks,L1_Evaluated_Tasks,L2_Evaluated_Tasks,L3_Evaluated_Tasks,L4_Evaluated_Tasks,L5_Evaluated_Tasks,L6_Evaluated_Tasks,L7_Evaluated_Tasks,L1_Avg_Exec_Time,L2_Avg_Exec_Time,L3_Avg_Exec_Time,L4_Avg_Exec_Time,L5_Avg_Exec_Time,L6_Avg_Exec_Time,L7_Avg_Exec_Time,L1_Avg_Tokens,L2_Avg_Tokens,L3_Avg_Tokens,L4_Avg_Tokens,L5_Avg_Tokens,L6_Avg_Tokens,L7_Avg_Tokens,L1_Avg_TPS,L2_Avg_TPS,L3_Avg_TPS,L4_Avg_TPS,L5_Avg_TPS,L6_Avg_TPS,L7_Avg_TPS,L1_Avg_TTFT,L2_Avg_TTFT,L3_Avg_TTFT,L4_Avg_TTFT,L5_Avg_TTFT,L6_Avg_TTFT,L7_Avg_TTFT,L1_RRR,L2_RRR,L3_RRR,L4_RRR,L5_RRR,L6_RRR,L7_RRR,L1_SR,L2_SR,L3_SR,L4_SR,L5_SR,L6_SR,L7_SR,L1_EPR_CVR,L2_EPR_CVR,L3_EPR_CVR,L4_EPR_CVR,L5_EPR_CVR,L6_EPR_CVR,L7_EPR_CVR,L1_pass@k,L2_pass@k,L3_pass@k,L4_pass@k,L5_pass@k,L6_pass@k,L7_pass@k,L1_TooAcc,L1_ArgAcc,L1_CallEM,L1_RespOK,L2_SelectAcc,L3_FSM,L3_PSM,L3_ΔSteps_norm,L3_ProvAcc,L4_Coverage,L4_SourceEPR,L5_AdaptiveRoutingScore,L5_FallbackSR,L6_ReuseRage,L6_RedundantCallRate,L6_EffScore,L7_ContextRetention,L7_RefRecall
|
| 2 |
-
gemini_gemini-2.5-flash,Google,API,11,30,10,10,20
|
| 3 |
-
kakaocorp_kanana-1.5-8b-instruct-2505,Kakao,OSS,11,30,10,10,20
|
| 4 |
-
skt_A.X-4.0-Light,SKT,OSS,11,30,10,10,20
|
| 5 |
-
anthropic_claude-sonnet-4-20250514,Anthropic,API,11,30,10,10,20
|
| 6 |
-
Qwen_qwen3-8B,Alibaba,OSS,11,30,10,10,20
|
| 7 |
-
bedrock_openai.gpt-oss-20b-1:0,OpenAI,OSS,11,30,10,10,20
|
| 8 |
-
azure_gpt-4.1,OpenAI,API,11,30,10,10,20
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
| 1 |
Model,Vendor,Model Type,L1_Total_Tasks,L2_Total_Tasks,L3_Total_Tasks,L4_Total_Tasks,L5_Total_Tasks,L6_Total_Tasks,L7_Total_Tasks,L1_Evaluated_Tasks,L2_Evaluated_Tasks,L3_Evaluated_Tasks,L4_Evaluated_Tasks,L5_Evaluated_Tasks,L6_Evaluated_Tasks,L7_Evaluated_Tasks,L1_Avg_Exec_Time,L2_Avg_Exec_Time,L3_Avg_Exec_Time,L4_Avg_Exec_Time,L5_Avg_Exec_Time,L6_Avg_Exec_Time,L7_Avg_Exec_Time,L1_Avg_Tokens,L2_Avg_Tokens,L3_Avg_Tokens,L4_Avg_Tokens,L5_Avg_Tokens,L6_Avg_Tokens,L7_Avg_Tokens,L1_Avg_TPS,L2_Avg_TPS,L3_Avg_TPS,L4_Avg_TPS,L5_Avg_TPS,L6_Avg_TPS,L7_Avg_TPS,L1_Avg_TTFT,L2_Avg_TTFT,L3_Avg_TTFT,L4_Avg_TTFT,L5_Avg_TTFT,L6_Avg_TTFT,L7_Avg_TTFT,L1_RRR,L2_RRR,L3_RRR,L4_RRR,L5_RRR,L6_RRR,L7_RRR,L1_SR,L2_SR,L3_SR,L4_SR,L5_SR,L6_SR,L7_SR,L1_EPR_CVR,L2_EPR_CVR,L3_EPR_CVR,L4_EPR_CVR,L5_EPR_CVR,L6_EPR_CVR,L7_EPR_CVR,L1_pass@k,L2_pass@k,L3_pass@k,L4_pass@k,L5_pass@k,L6_pass@k,L7_pass@k,L1_TooAcc,L1_ArgAcc,L1_CallEM,L1_RespOK,L2_SelectAcc,L3_FSM,L3_PSM,L3_ΔSteps_norm,L3_ProvAcc,L4_Coverage,L4_SourceEPR,L5_AdaptiveRoutingScore,L5_FallbackSR,L6_ReuseRage,L6_RedundantCallRate,L6_EffScore,L7_ContextRetention,L7_RefRecall
|
| 2 |
+
gemini_gemini-2.5-flash,Google,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,3.5,6.08,4.71,11.43,6.09,18.97,16.46,4274.73,5772.2,6796.7,18579.7,4064.9,48748.93,39339.8,1219.64,949.77,1442.09,1626.13,667.17,2570.11,2390.18,1.7953,2.701,3.2178,2.9207,2.2833,2.5433,2.1474,0.9091,0.8,0.8,1.0,0.8,0.8667,0.9,0.75,0.6583,0.275,0.675,0.325,0.5,0.475,0.8182,0.8,0.2,0.7,0.0667,0.7333,0.7,0.9091,0.7889,0.8,1.0,0.8,0.8667,0.9,0.8182,0.5909,0.2727,0.9091,0.8,0.2,0.2,0.2,0.0,0.45,0.45,0.0667,0.15,0.5333,0.9333,0.2556,0.85,0.775
|
| 3 |
+
kakaocorp_kanana-1.5-8b-instruct-2505,Kakao,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.53,17.22,14.51,23.78,9.44,52.98,47.39,4556.36,6107.6,5723.4,7188.3,5665.9,28502.33,28738.1,823.46,354.62,394.38,302.24,599.94,538.01,606.41,1.5236,6.7827,5.9015,7.4927,1.4163,7.764,5.1605,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8409,0.925,0.55,0.55,0.45,0.7167,0.4,1.0,1.0,1.0,0.9,0.225,1.0,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6364,0.2727,1.0,1.0,0.0,0.5333,0.0,0.0,0.2667,0.2667,0.225,0.45,0.4,1.0,0.6,0.825,0.75
|
| 4 |
+
skt_A.X-4.0-Light,SKT,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.15,17.37,21.51,9.06,9.23,38.97,33.94,4286.73,7456.1,13579.8,2284.9,6500.85,27744.0,25032.0,833.07,429.13,631.27,252.27,704.42,711.88,737.55,1.3615,5.8379,6.0725,6.2881,1.3627,5.3648,3.902,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5455,0.7417,0.525,0.35,0.2875,0.55,0.45,1.0,1.0,1.0,0.3,0.2583,0.8667,0.9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8182,0.4545,1.0,1.0,0.2,0.7833,0.65,0.1,0.05,0.05,0.25,0.55,0.4,1.0,0.4667,0.8,0.775
|
| 5 |
+
anthropic_claude-sonnet-4-20250514,Anthropic,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,7.36,14.86,24.85,41.55,21.45,55.38,41.59,4618.09,8203.9,38611.3,84754.4,16068.2,66733.8,62190.0,627.77,552.18,1554.03,2039.93,749.12,1205.02,1495.15,3.4212,6.0131,6.3685,8.5277,3.487,7.754,3.0748,1.0,0.9667,1.0,1.0,1.0,1.0,1.0,0.9545,0.8917,0.975,0.625,0.5625,0.7333,0.575,1.0,0.9667,1.0,1.0,0.2867,1.0,1.0,1.0,0.9778,1.0,0.9667,1.0,1.0,1.0,1.0,0.6591,0.2727,1.0,0.9667,0.6,1.0,0.5,0.15,0.7167,0.7,0.2708,0.8,0.6,1.0,0.5778,0.825,0.7
|
| 6 |
+
Qwen_qwen3-8B,Alibaba,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,24.54,33.11,38.89,61.09,46.28,102.03,92.19,5798.0,7600.07,8380.0,14758.8,9789.4,45946.13,55163.2,236.28,229.53,215.5,241.58,211.54,450.34,598.37,11.0876,13.3456,23.3045,16.4015,8.5784,16.7883,11.2336,1.0,1.0,0.9,0.9,1.0,1.0,1.0,0.5909,0.8083,0.175,0.35,0.45,0.7833,0.525,1.0,1.0,0.4,0.9,0.2258,1.0,0.95,1.0,1.0,0.9,0.8,0.9667,1.0,1.0,1.0,0.7955,0.4545,1.0,1.0,0.2,0.3,0.2,0.1,0.4667,0.4667,0.2333,0.55,0.2,1.0,0.5667,0.85,0.775
|
| 7 |
+
bedrock_openai.gpt-oss-20b-1:0,OpenAI,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,9.79,6.79,12.89,11.76,6.88,19.4,14.43,8388.64,9117.3,13330.0,17044.9,4361.05,16117.67,23303.1,857.25,1342.5,1033.74,1449.89,633.81,830.86,1614.97,2.5504,2.7401,2.8613,2.8396,1.6012,3.0323,1.7099,1.0,1.0,0.8,1.0,1.0,1.0,1.0,0.4545,0.6917,0.35,0.2,0.2875,0.3333,0.25,1.0,0.9667,0.8,0.5,0.1875,0.6667,0.8,0.9394,0.9778,0.7667,0.9333,0.9833,1.0,1.0,1.0,0.6591,0.3636,0.8182,0.9667,0.2,0.7167,0.35,0.0,0.25,0.25,0.1375,0.3,0.2667,1.0,0.2667,0.55,0.625
|
| 8 |
+
azure_gpt-4.1,OpenAI,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,2.28,4.7,11.23,12.05,5.84,17.71,16.33,2890.55,4419.13,23985.5,12406.0,6131.5,31127.0,38953.1,1270.46,940.69,2136.47,1029.53,1049.07,1757.14,2385.33,0.9657,2.0463,2.2538,3.5501,1.0222,2.6721,1.0284,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9318,0.9167,0.925,0.7,0.4875,0.7333,0.6,1.0,1.0,0.9909,1.0,0.2208,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7273,0.3636,1.0,1.0,0.6,1.0,0.5,0.15,0.5333,0.5333,0.3,0.65,0.6,1.0,0.5889,0.875,0.8
|
| 9 |
+
gemini_gemini-2.5-pro,Google,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,13.39,14.08,19.53,26.55,18.96,50.02,45.28,9261.45,5576.73,8347.7,16395.9,4082.25,55127.07,47808.8,691.67,396.07,427.46,617.49,215.31,1102.01,1055.74,5.934,7.4011,9.95,9.7731,5.4692,8.5743,7.6825,0.9091,0.8,0.8,1.0,0.75,0.8667,0.9,0.8636,0.5833,0.15,0.35,0.35,0.5167,0.4,0.9091,0.7667,0.4,0.5,0.15,0.8667,0.8,0.9091,0.8,0.8,1.0,0.7833,0.8667,0.9,0.9091,0.6136,0.2727,0.9091,0.7667,0.0,0.2167,0.2,0.0,0.3,0.3,0.15,0.3,0.7333,1.0,0.2444,0.8,0.675
|
| 10 |
+
Qwen_Qwen3-4B-Instruct-2507,Alibaba,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,6.66,22.89,14.8,51.19,11.71,86.63,60.09,5273.09,6447.9,9087.8,17502.5,5363.85,36058.4,37068.1,791.39,281.66,613.83,341.91,458.02,416.23,616.84,2.093,9.1244,4.4172,13.7638,1.8319,14.8681,8.245,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6364,0.6583,0.15,0.375,0.3,0.6167,0.425,1.0,1.0,1.0,0.9,0.15,1.0,1.0,1.0,1.0,1.0,0.9333,1.0,1.0,1.0,1.0,0.75,0.3636,1.0,1.0,0.2,0.6333,0.7,0.0,0.5167,0.5167,0.15,0.3,0.1333,1.0,0.4,0.875,0.8
|
| 11 |
+
bedrock_openai.gpt-oss-120b-1:0,OpenAI,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,11.39,16.7,32.25,33.41,31.7,48.71,36.46,17596.27,16833.23,57622.4,18021.2,18797.0,89591.27,48772.9,1545.5,1007.71,1786.5,539.38,593.04,1839.34,1337.89,3.2513,4.661,7.2904,6.5539,4.096,7.2657,6.3429,1.0,0.9333,0.9,0.8,1.0,1.0,1.0,0.75,0.8333,0.625,0.375,0.3375,0.8167,0.5,0.8489,0.9333,0.795,0.4,0.1689,1.0,0.91,0.9697,0.9667,0.9333,0.8,0.95,1.0,1.0,1.0,0.5909,0.2727,1.0,0.9333,0.1,0.85,0.05,0.0,0.2667,0.2667,0.2667,0.675,0.4,0.4333,0.3106,0.75,0.725
|
| 12 |
+
K-intelligence_Midm-2.0-Base-Instruct,KT,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,5.39,3.9,3.06,3.75,8.13,28.66,16.08,4185.82,2514.93,3418.3,2388.8,3084.5,22909.13,14079.1,775.89,644.46,1117.59,636.3,379.51,799.33,875.38,1.4775,1.8563,1.8855,1.6781,1.0824,1.6794,1.1356,1.0,1.0,1.0,1.0,0.95,1.0,1.0,0.5909,0.5167,0.25,0.325,0.275,0.4833,0.35,0.9091,0.5667,0.2,0.3,0.0667,0.9333,0.6,1.0,1.0,1.0,0.8667,0.9833,1.0,1.0,0.9091,0.6364,0.2727,1.0,0.5667,0.0,0.1,0.0,0.0,0.0,0.0,0.0667,0.15,0.0,0.9333,0.3,0.55,0.5
|
| 13 |
+
azure_gpt-5,OpenAI,API,11,30,10,10,20,15,10,11,30,10,10,20,15,10,18.46,38.65,52.28,66.86,62.16,83.24,43.19,16561.27,23481.23,20064.7,35899.4,15424.8,93426.4,42342.0,897.26,607.5,383.81,536.9,248.16,1122.33,980.46,5.4862,8.2415,11.4348,12.643,7.5599,12.2077,6.7278,1.0,0.9667,0.9,0.8,0.8,1.0,1.0,0.6818,0.2167,0.25,0.15,0.4125,0.2,0.025,1.0,0.9667,0.6,0.6,0.1858,1.0,1.0,1.0,0.9556,0.9,0.9,0.85,1.0,1.0,1.0,0.7045,0.2727,0.8182,0.9667,0.1,0.5,0.3,0.0,0.5,0.5,0.21,0.7,0.2,0.8667,0.0417,0.525,0.45
|
| 14 |
+
bedrock_qwen.qwen3-32b-v1:0,Alibaba,OSS,11,30,10,10,20,15,10,11,30,10,10,20,15,10,3.18,6.76,7.89,15.88,5.64,36.25,26.07,6960.0,6083.97,11256.3,12043.6,7232.95,33715.87,35491.2,2186.21,899.52,1426.58,758.27,1281.95,930.05,1361.56,1.2647,2.8337,2.7304,5.4506,1.1114,5.9054,2.061,1.0,1.0,1.0,0.9,1.0,1.0,1.0,0.7045,0.775,0.475,0.475,0.3375,0.5667,0.525,1.0,1.0,1.0,0.9,0.1917,1.0,1.0,1.0,1.0,1.0,0.9,1.0,1.0,1.0,1.0,0.6364,0.2727,1.0,1.0,0.6,0.85,0.75,0.05,0.3833,0.3833,0.1917,0.4,0.2,0.6,0.5,0.675,0.775
|