Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
f435f54
1
Parent(s):
cfb5604
new models
Browse files - external_models_results.json +269 -3
external_models_results.json
CHANGED
|
@@ -266,8 +266,8 @@
|
|
| 266 |
"result_metrics_npm": 0.7286932366792048
|
| 267 |
},
|
| 268 |
{
|
| 269 |
-
"model": "sabia-3",
|
| 270 |
-
"name": "Sabiá-3",
|
| 271 |
"link": "https://www.maritaca.ai/",
|
| 272 |
"date": "2024-08-20",
|
| 273 |
"status": "full",
|
|
@@ -423,7 +423,7 @@
|
|
| 423 |
},
|
| 424 |
{
|
| 425 |
"model": "gemini-2.5-pro-exp-03-25",
|
| 426 |
-
"name": "Gemini 2.5 Pro Experimental (0325)",
|
| 427 |
"link": "https://aistudio.google.com",
|
| 428 |
"date": "2025-04-03",
|
| 429 |
"status": "full",
|
|
@@ -669,5 +669,271 @@
|
|
| 669 |
},
|
| 670 |
"result_metrics_average": 0.7870599821710969,
|
| 671 |
"result_metrics_npm": 0.6795192293708728
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
}
|
| 673 |
]
|
|
|
|
| 266 |
"result_metrics_npm": 0.7286932366792048
|
| 267 |
},
|
| 268 |
{
|
| 269 |
+
"model": "sabia-3-2024-07-15",
|
| 270 |
+
"name": "Sabiá-3 (2024-07-15)",
|
| 271 |
"link": "https://www.maritaca.ai/",
|
| 272 |
"date": "2024-08-20",
|
| 273 |
"status": "full",
|
|
|
|
| 423 |
},
|
| 424 |
{
|
| 425 |
"model": "gemini-2.5-pro-exp-03-25",
|
| 426 |
+
"name": "Gemini 2.5 Pro Experimental [reasoning] (0325)",
|
| 427 |
"link": "https://aistudio.google.com",
|
| 428 |
"date": "2025-04-03",
|
| 429 |
"status": "full",
|
|
|
|
| 669 |
},
|
| 670 |
"result_metrics_average": 0.7870599821710969,
|
| 671 |
"result_metrics_npm": 0.6795192293708728
|
| 672 |
+
},
|
| 673 |
+
{
|
| 674 |
+
"model": "deepseek-v3_1",
|
| 675 |
+
"name": "deepseek-ai/DeepSeek-V3.1 (API)",
|
| 676 |
+
"link": "https://huggingface.co/deepseek-ai/DeepSeek-V3.1",
|
| 677 |
+
"date": "2025-09-01",
|
| 678 |
+
"status": "full",
|
| 679 |
+
"main_language": "English",
|
| 680 |
+
"model_type": "chat",
|
| 681 |
+
"params": 685.0,
|
| 682 |
+
"result_metrics": {
|
| 683 |
+
"enem_challenge": 0.8887333799860042,
|
| 684 |
+
"bluex": 0.8178025034770514,
|
| 685 |
+
"oab_exams": 0.7038724373576309,
|
| 686 |
+
"assin2_sts": 0.8082104938836681,
|
| 687 |
+
"assin2_rte": 0.949346100935343,
|
| 688 |
+
"faquad_nli": 0.8406862745098038,
|
| 689 |
+
"hatebr_offensive": 0.9211711711711712,
|
| 690 |
+
"portuguese_hate_speech": 0.7423067698027224,
|
| 691 |
+
"tweetsentbr": 0.7584190029617157
|
| 692 |
+
},
|
| 693 |
+
"result_metrics_average": 0.8256164593427902,
|
| 694 |
+
"result_metrics_npm": 0.7370296776379883
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"model": "kimi-k2",
|
| 698 |
+
"name": "moonshotai/Kimi-K2-Instruct (API)",
|
| 699 |
+
"link": "https://huggingface.co/moonshotai/Kimi-K2-Instruct",
|
| 700 |
+
"date": "2025-09-01",
|
| 701 |
+
"status": "full",
|
| 702 |
+
"main_language": "English",
|
| 703 |
+
"model_type": "chat",
|
| 704 |
+
"params": 1000.0,
|
| 705 |
+
"result_metrics": {
|
| 706 |
+
"enem_challenge": 0.8789363191042687,
|
| 707 |
+
"bluex": 0.827538247566064,
|
| 708 |
+
"oab_exams": 0.6970387243735763,
|
| 709 |
+
"assin2_sts": 0.7760142475181766,
|
| 710 |
+
"assin2_rte": 0.9436236879837872,
|
| 711 |
+
"faquad_nli": 0.8531466083708024,
|
| 712 |
+
"hatebr_offensive": 0.8941562198649953,
|
| 713 |
+
"portuguese_hate_speech": 0.7535500455551216,
|
| 714 |
+
"tweetsentbr": 0.7428370464802363
|
| 715 |
+
},
|
| 716 |
+
"result_metrics_average": 0.8185379052018921,
|
| 717 |
+
"result_metrics_npm": 0.7275664672121565
|
| 718 |
+
},
|
| 719 |
+
{
|
| 720 |
+
"model": "sabia-3-1-2025-05-08",
|
| 721 |
+
"name": "Sabiá-3.1 (2025-05-08)",
|
| 722 |
+
"link": "https://www.maritaca.ai/",
|
| 723 |
+
"date": "2025-09-01",
|
| 724 |
+
"status": "full",
|
| 725 |
+
"main_language": "Portuguese",
|
| 726 |
+
"model_type": "proprietary",
|
| 727 |
+
"result_metrics": {
|
| 728 |
+
"enem_challenge": 0.8894331700489853,
|
| 729 |
+
"bluex": 0.8178025034770514,
|
| 730 |
+
"oab_exams": 0.9202733485193622,
|
| 731 |
+
"assin2_sts": 0.8340482244079774,
|
| 732 |
+
"assin2_rte": 0.9423587830430271,
|
| 733 |
+
"faquad_nli": 0.7585644282172838,
|
| 734 |
+
"hatebr_offensive": 0.8308611905928697,
|
| 735 |
+
"portuguese_hate_speech": 0.7543648446960096,
|
| 736 |
+
"tweetsentbr": 0.7398273232644036
|
| 737 |
+
},
|
| 738 |
+
"result_metrics_average": 0.8319482018074411,
|
| 739 |
+
"result_metrics_npm": 0.7331597943893793
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"model": "sabia-3-2024-12-11",
|
| 743 |
+
"name": "Sabiá-3 (2024-12-11)",
|
| 744 |
+
"link": "https://www.maritaca.ai/",
|
| 745 |
+
"date": "2025-09-01",
|
| 746 |
+
"status": "full",
|
| 747 |
+
"main_language": "Portuguese",
|
| 748 |
+
"model_type": "proprietary",
|
| 749 |
+
"result_metrics": {
|
| 750 |
+
"enem_challenge": 0.8691392582225332,
|
| 751 |
+
"bluex": 0.7872044506258693,
|
| 752 |
+
"oab_exams": 0.8009111617312072,
|
| 753 |
+
"assin2_sts": 0.7850131735268517,
|
| 754 |
+
"assin2_rte": 0.9390382723900459,
|
| 755 |
+
"faquad_nli": 0.7968815254182839,
|
| 756 |
+
"hatebr_offensive": 0.8608047226969084,
|
| 757 |
+
"portuguese_hate_speech": 0.7474723628059027,
|
| 758 |
+
"tweetsentbr": 0.7360466511491278
|
| 759 |
+
},
|
| 760 |
+
"result_metrics_average": 0.8136123976185256,
|
| 761 |
+
"result_metrics_npm": 0.7144701465854594
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"model": "sabiazinho-3",
|
| 765 |
+
"name": "Sabiázinho-3 (2025-02-06)",
|
| 766 |
+
"link": "https://www.maritaca.ai/",
|
| 767 |
+
"date": "2025-09-01",
|
| 768 |
+
"status": "full",
|
| 769 |
+
"main_language": "Portuguese",
|
| 770 |
+
"model_type": "proprietary",
|
| 771 |
+
"result_metrics": {
|
| 772 |
+
"enem_challenge": 0.8439468159552135,
|
| 773 |
+
"bluex": 0.7343532684283728,
|
| 774 |
+
"oab_exams": 0.8159453302961276,
|
| 775 |
+
"assin2_sts": 0.8091208202474276,
|
| 776 |
+
"assin2_rte": 0.9370511249219384,
|
| 777 |
+
"faquad_nli": 0.7715445403113343,
|
| 778 |
+
"hatebr_offensive": 0.8604320820258526,
|
| 779 |
+
"portuguese_hate_speech": 0.7129508077161507,
|
| 780 |
+
"tweetsentbr": 0.6798994954276046
|
| 781 |
+
},
|
| 782 |
+
"result_metrics_average": 0.7961382539255579,
|
| 783 |
+
"result_metrics_npm": 0.685954609257193
|
| 784 |
+
},
|
| 785 |
+
{
|
| 786 |
+
"model": "grok-3-mini",
|
| 787 |
+
"name": "Grok 3 Mini [reasoning] (API)",
|
| 788 |
+
"link": "https://x.ai/",
|
| 789 |
+
"date": "2025-09-01",
|
| 790 |
+
"status": "full",
|
| 791 |
+
"main_language": "English",
|
| 792 |
+
"model_type": "chat",
|
| 793 |
+
"result_metrics": {
|
| 794 |
+
"enem_challenge": 0.9412176347095871,
|
| 795 |
+
"bluex": 0.8984700973574409,
|
| 796 |
+
"oab_exams": 0.7075170842824602,
|
| 797 |
+
"assin2_sts": 0.7846153023166811,
|
| 798 |
+
"assin2_rte": 0.9369863526592658,
|
| 799 |
+
"faquad_nli": 0.8974457100080231,
|
| 800 |
+
"hatebr_offensive": 0.9264201247592199,
|
| 801 |
+
"portuguese_hate_speech": 0.6868265194640906,
|
| 802 |
+
"tweetsentbr": 0.7496188889954271
|
| 803 |
+
},
|
| 804 |
+
"result_metrics_average": 0.836568634950244,
|
| 805 |
+
"result_metrics_npm": 0.7505284631974409
|
| 806 |
+
},
|
| 807 |
+
{
|
| 808 |
+
"model": "gpt-5-nano-2025-08-07",
|
| 809 |
+
"name": "GPT 5 Nano [reasoning] (2025-08-07)",
|
| 810 |
+
"link": "https://www.openai.com/",
|
| 811 |
+
"date": "2025-09-01",
|
| 812 |
+
"status": "full",
|
| 813 |
+
"main_language": "English",
|
| 814 |
+
"model_type": "proprietary",
|
| 815 |
+
"result_metrics": {
|
| 816 |
+
"enem_challenge": 0.9013296011196641,
|
| 817 |
+
"bluex": 0.8525730180806675,
|
| 818 |
+
"oab_exams": 0.5913439635535308,
|
| 819 |
+
"assin2_sts": 0.7157982790377855,
|
| 820 |
+
"assin2_rte": 0.9493397775671237,
|
| 821 |
+
"faquad_nli": 0.802473455931782,
|
| 822 |
+
"hatebr_offensive": 0.9169693400085076,
|
| 823 |
+
"portuguese_hate_speech": 0.7166590126291619,
|
| 824 |
+
"tweetsentbr": 0.7385573150818597
|
| 825 |
+
},
|
| 826 |
+
"result_metrics_average": 0.7983381958900091,
|
| 827 |
+
"result_metrics_npm": 0.699331432280926
|
| 828 |
+
},
|
| 829 |
+
{
|
| 830 |
+
"model": "gpt-5-mini-2025-08-07",
|
| 831 |
+
"name": "GPT 5 Mini [reasoning] (2025-08-07)",
|
| 832 |
+
"link": "https://www.openai.com/",
|
| 833 |
+
"date": "2025-09-01",
|
| 834 |
+
"status": "full",
|
| 835 |
+
"main_language": "English",
|
| 836 |
+
"model_type": "proprietary",
|
| 837 |
+
"result_metrics": {
|
| 838 |
+
"enem_challenge": 0.9566130160951715,
|
| 839 |
+
"bluex": 0.913769123783032,
|
| 840 |
+
"oab_exams": 0.7184510250569476,
|
| 841 |
+
"assin2_sts": 0.8151992531421179,
|
| 842 |
+
"assin2_rte": 0.9486789502727531,
|
| 843 |
+
"faquad_nli": 0.7959895379250218,
|
| 844 |
+
"hatebr_offensive": 0.9306148454596409,
|
| 845 |
+
"portuguese_hate_speech": 0.7476857189919288,
|
| 846 |
+
"tweetsentbr": 0.7208063363431595
|
| 847 |
+
},
|
| 848 |
+
"result_metrics_average": 0.8386453118966414,
|
| 849 |
+
"result_metrics_npm": 0.7509015993727701
|
| 850 |
+
},
|
| 851 |
+
{
|
| 852 |
+
"model": "gpt-5_reasoning_minimal-2025-08-07",
|
| 853 |
+
"name": "GPT 5 [reasoning: minimal] (2025-08-07)",
|
| 854 |
+
"link": "https://www.openai.com/",
|
| 855 |
+
"date": "2025-09-01",
|
| 856 |
+
"status": "full",
|
| 857 |
+
"main_language": "English",
|
| 858 |
+
"model_type": "proprietary",
|
| 859 |
+
"result_metrics": {
|
| 860 |
+
"enem_challenge": 0.8432470258922323,
|
| 861 |
+
"bluex": 0.7885952712100139,
|
| 862 |
+
"oab_exams": 0.8104783599088838,
|
| 863 |
+
"assin2_sts": 0.7497712012355019,
|
| 864 |
+
"assin2_rte": 0.9497544911228829,
|
| 865 |
+
"faquad_nli": 0.9049032312001003,
|
| 866 |
+
"hatebr_offensive": 0.9233018502276624,
|
| 867 |
+
"portuguese_hate_speech": 0.7502183789864052,
|
| 868 |
+
"tweetsentbr": 0.7877925879277
|
| 869 |
+
},
|
| 870 |
+
"result_metrics_average": 0.8342291553012646,
|
| 871 |
+
"result_metrics_npm": 0.7560493865775754
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"model": "gemini-2_5_flash_lite",
|
| 875 |
+
"name": "Gemini 2.5 Flash Lite",
|
| 876 |
+
"link": "https://aistudio.google.com",
|
| 877 |
+
"date": "2025-09-01",
|
| 878 |
+
"status": "full",
|
| 879 |
+
"main_language": "English",
|
| 880 |
+
"model_type": "proprietary",
|
| 881 |
+
"result_metrics": {
|
| 882 |
+
"enem_challenge": 0.8257522743177047,
|
| 883 |
+
"bluex": 0.7329624478442281,
|
| 884 |
+
"oab_exams": 0.6783599088838269,
|
| 885 |
+
"assin2_sts": 0.8399704980607736,
|
| 886 |
+
"assin2_rte": 0.9095975398498664,
|
| 887 |
+
"faquad_nli": 0.8289944389172974,
|
| 888 |
+
"hatebr_offensive": 0.8733247194142535,
|
| 889 |
+
"portuguese_hate_speech": 0.7511757826108595,
|
| 890 |
+
"tweetsentbr": 0.7696375203962748
|
| 891 |
+
},
|
| 892 |
+
"result_metrics_average": 0.8010861255883428,
|
| 893 |
+
"result_metrics_npm": 0.6977608761930978
|
| 894 |
+
},
|
| 895 |
+
{
|
| 896 |
+
"model": "gemini-2_5_flash_lite_reasoning_low",
|
| 897 |
+
"name": "Gemini 2.5 Flash Lite [reasoning: low]",
|
| 898 |
+
"link": "https://aistudio.google.com",
|
| 899 |
+
"date": "2025-09-01",
|
| 900 |
+
"status": "full",
|
| 901 |
+
"main_language": "English",
|
| 902 |
+
"model_type": "proprietary",
|
| 903 |
+
"result_metrics": {
|
| 904 |
+
"enem_challenge": 0.9013296011196641,
|
| 905 |
+
"bluex": 0.8400556328233658,
|
| 906 |
+
"oab_exams": 0.6943052391799545,
|
| 907 |
+
"assin2_sts": 0.755562697236674,
|
| 908 |
+
"assin2_rte": 0.9464858475885941,
|
| 909 |
+
"faquad_nli": 0.8703946691365647,
|
| 910 |
+
"hatebr_offensive": 0.9080576836597871,
|
| 911 |
+
"portuguese_hate_speech": 0.7416269940699909,
|
| 912 |
+
"tweetsentbr": 0.7520493635069894
|
| 913 |
+
},
|
| 914 |
+
"result_metrics_average": 0.8233186364801761,
|
| 915 |
+
"result_metrics_npm": 0.7360224650390731
|
| 916 |
+
},
|
| 917 |
+
{
|
| 918 |
+
"model": "gemini-2_5_flash",
|
| 919 |
+
"name": "Gemini 2.5 Flash",
|
| 920 |
+
"link": "https://aistudio.google.com",
|
| 921 |
+
"date": "2025-09-01",
|
| 922 |
+
"status": "full",
|
| 923 |
+
"main_language": "English",
|
| 924 |
+
"model_type": "proprietary",
|
| 925 |
+
"result_metrics": {
|
| 926 |
+
"enem_challenge": 0.9097270818754374,
|
| 927 |
+
"bluex": 0.8650904033379694,
|
| 928 |
+
"oab_exams": 0.8355353075170843,
|
| 929 |
+
"assin2_sts": 0.8714666962450285,
|
| 930 |
+
"assin2_rte": 0.9386350099968783,
|
| 931 |
+
"faquad_nli": 0.8578569197125898,
|
| 932 |
+
"hatebr_offensive": 0.8933375064862327,
|
| 933 |
+
"portuguese_hate_speech": 0.7502527990365506,
|
| 934 |
+
"tweetsentbr": 0.7801286503914011
|
| 935 |
+
},
|
| 936 |
+
"result_metrics_average": 0.8557811527332413,
|
| 937 |
+
"result_metrics_npm": 0.7734849178213028
|
| 938 |
}
|
| 939 |
]
|