Spaces:
Running
Running
commit
Browse files- model_results.json +62 -1
model_results.json
CHANGED
|
@@ -813,7 +813,68 @@
|
|
| 813 |
{
|
| 814 |
"model_name": "gemini-2.5-flash",
|
| 815 |
"results": {
|
| 816 |
-
"mmlu_results": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
"unified_exam_results": [
|
| 818 |
{
|
| 819 |
"category": "Average",
|
|
|
|
| 813 |
{
|
| 814 |
"model_name": "gemini-2.5-flash",
|
| 815 |
"results": {
|
| 816 |
+
"mmlu_results": [
|
| 817 |
+
{
|
| 818 |
+
"category": "Average",
|
| 819 |
+
"score": 0.7519
|
| 820 |
+
},
|
| 821 |
+
{
|
| 822 |
+
"category": "Biology",
|
| 823 |
+
"score": 0.8333
|
| 824 |
+
},
|
| 825 |
+
{
|
| 826 |
+
"category": "Business",
|
| 827 |
+
"score": 0.8939
|
| 828 |
+
},
|
| 829 |
+
{
|
| 830 |
+
"category": "Chemistry",
|
| 831 |
+
"score": 0.7579
|
| 832 |
+
},
|
| 833 |
+
{
|
| 834 |
+
"category": "Computer Science",
|
| 835 |
+
"score": 0.8529
|
| 836 |
+
},
|
| 837 |
+
{
|
| 838 |
+
"category": "Economics",
|
| 839 |
+
"score": 0.831
|
| 840 |
+
},
|
| 841 |
+
{
|
| 842 |
+
"category": "Engineering",
|
| 843 |
+
"score": 0.5875
|
| 844 |
+
},
|
| 845 |
+
{
|
| 846 |
+
"category": "Health",
|
| 847 |
+
"score": 0.7941
|
| 848 |
+
},
|
| 849 |
+
{
|
| 850 |
+
"category": "History",
|
| 851 |
+
"score": 0.5862
|
| 852 |
+
},
|
| 853 |
+
{
|
| 854 |
+
"category": "Law",
|
| 855 |
+
"score": 0.6742
|
| 856 |
+
},
|
| 857 |
+
{
|
| 858 |
+
"category": "Math",
|
| 859 |
+
"score": 0.7168
|
| 860 |
+
},
|
| 861 |
+
{
|
| 862 |
+
"category": "Other",
|
| 863 |
+
"score": 0.7273
|
| 864 |
+
},
|
| 865 |
+
{
|
| 866 |
+
"category": "Philosophy",
|
| 867 |
+
"score": 0.7857
|
| 868 |
+
},
|
| 869 |
+
{
|
| 870 |
+
"category": "Physics",
|
| 871 |
+
"score": 0.7248
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"category": "Psychology",
|
| 875 |
+
"score": 0.7612
|
| 876 |
+
}
|
| 877 |
+
],
|
| 878 |
"unified_exam_results": [
|
| 879 |
{
|
| 880 |
"category": "Average",
|