| {"created_at": "2025-09-25T15:36:22.940200", "global_step": 2000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091195}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.29506074487153955, "acc_stderr,none": 0.004551379838156102, "acc_norm,none": 0.3224457279426409, "acc_norm_stderr,none": 0.004664572784985592}, "mmlu": {"acc,none": 0.22988178322176328, "acc_stderr,none": 0.003545428280397627, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24187035069075452, "acc_stderr,none": 0.006240064070915057, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2857142857142857, "acc_stderr,none": 0.040406101782088394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.03019028245350194}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.02904133351059804}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2396694214876033, "acc_stderr,none": 0.03896878985070417}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052191}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23798882681564246, "acc_stderr,none": 0.014242630070574885}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.022122439772480764}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.022779719088733393}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2457627118644068, "acc_stderr,none": 0.01099615663514269}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23978113936272932, "acc_stderr,none": 0.007641243559455741, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23018867924528302, "acc_stderr,none": 0.02590789712240817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2138728323699422, "acc_stderr,none": 0.03126511206173043}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653694}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3183856502242152, "acc_stderr,none": 0.03126580522513713}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.03760178006026621}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2905982905982906, "acc_stderr,none": 0.029745048572674054}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24010217113665389, "acc_stderr,none": 0.015274685213734195}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.023152722439402307}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.23049645390070922, "acc_stderr,none": 0.025123739226872405}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.18382352941176472, "acc_stderr,none": 0.023529242185193106}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.28313253012048195, "acc_stderr,none": 0.03507295431370519}, "mmlu_social_sciences": {"acc,none": 0.216769580760481, "acc_stderr,none": 0.007426082910195879, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281335}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.17676767676767677, "acc_stderr,none": 0.027178752639044915}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.20207253886010362, "acc_stderr,none": 0.02897908979429673}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2076923076923077, "acc_stderr,none": 0.02056753956724679}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.1908256880733945, "acc_stderr,none": 0.016847676400091105}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2595419847328244, "acc_stderr,none": 0.03844876139785271}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24836601307189543, "acc_stderr,none": 0.017479487001364764}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.1836734693877551, "acc_stderr,none": 0.024789071332007657}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916714}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.26, "acc_stderr,none": 0.0440844002276808}, "mmlu_stem": {"acc,none": 0.21503330161750714, "acc_stderr,none": 0.00730630388783806, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932268}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.035478541985608264}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.18421052631578946, "acc_stderr,none": 0.031546980450822305}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.03716177437566017}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.26382978723404255, "acc_stderr,none": 0.02880998985410297}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20899470899470898, "acc_stderr,none": 0.02094048156533485}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.1774193548387097, "acc_stderr,none": 0.02173254068932927}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.16748768472906403, "acc_stderr,none": 0.02627308604753542}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2111111111111111, "acc_stderr,none": 0.024882116857655113}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1986754966887417, "acc_stderr,none": 0.032578473844367774}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.1527777777777778, "acc_stderr,none": 0.02453632602613422}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.30357142857142855, "acc_stderr,none": 0.04364226155841044}, "sciq": {"alias": "sciq", "acc,none": 0.7, "acc_stderr,none": 0.014498627873361428, "acc_norm,none": 0.618, "acc_norm_stderr,none": 0.015372453034968522}} | |
| {"created_at": "2025-09-25T16:38:44.863816", "global_step": 4000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19737919737919737, "acc_stderr,none": 0.011395305685091197}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3323043218482374, "acc_stderr,none": 0.004700767741735564, "acc_norm,none": 0.3911571400119498, "acc_norm_stderr,none": 0.004870121051762737}, "mmlu": {"acc,none": 0.24284289987181312, "acc_stderr,none": 0.003618042819823074, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2512221041445271, "acc_stderr,none": 0.006315737400173529, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.18253968253968253, "acc_stderr,none": 0.03455071019102148}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24848484848484848, "acc_stderr,none": 0.03374402644139405}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.03114557065948678}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069432}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.04065578140908705}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252627}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.315028901734104, "acc_stderr,none": 0.025009313790069713}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24692737430167597, "acc_stderr,none": 0.014422292204808857}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.1864951768488746, "acc_stderr,none": 0.02212243977248076}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.0242885336377261}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25097783572359844, "acc_stderr,none": 0.011073730299187236}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3216374269005848, "acc_stderr,none": 0.03582529442573122}, "mmlu_other": {"acc,none": 0.23302220791760542, "acc_stderr,none": 0.007588362496194095, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.21509433962264152, "acc_stderr,none": 0.025288394502891363}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483099}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.23766816143497757, "acc_stderr,none": 0.028568079464714288}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.039891398595317706}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.028120966503914418}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2515964240102171, "acc_stderr,none": 0.015517322365529627}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.024848018263875192}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2198581560283688, "acc_stderr,none": 0.024706141070705477}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19117647058823528, "acc_stderr,none": 0.02388688192244034}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.25301204819277107, "acc_stderr,none": 0.033844291552331346}, "mmlu_social_sciences": {"acc,none": 0.24829379265518361, "acc_stderr,none": 0.007793374823155598, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.04185774424022056}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.21717171717171718, "acc_stderr,none": 0.029376616484945633}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.25210084033613445, "acc_stderr,none": 0.028205545033277723}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.23486238532110093, "acc_stderr,none": 0.018175110510343585}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.24427480916030533, "acc_stderr,none": 0.037683359597287434}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.21818181818181817, "acc_stderr,none": 0.03955932861795833}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27755102040816326, "acc_stderr,none": 0.02866685779027465}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.263681592039801, "acc_stderr,none": 0.031157150869355558}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_stem": {"acc,none": 0.23469711385981604, "acc_stderr,none": 0.007552830779446376, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768077}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.04092563958237655}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.23829787234042554, "acc_stderr,none": 0.027851252973889774}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2689655172413793, "acc_stderr,none": 0.03695183311650232}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.20634920634920634, "acc_stderr,none": 0.02084229093011467}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.22258064516129034, "acc_stderr,none": 0.023664216671642518}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.23645320197044334, "acc_stderr,none": 0.029896114291733545}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.28, "acc_stderr,none": 0.045126085985421276}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.025644108639267603}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18055555555555555, "acc_stderr,none": 0.02623287897149166}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.03952301967702511}, "sciq": {"alias": "sciq", "acc,none": 0.794, "acc_stderr,none": 0.01279561361278655, "acc_norm,none": 0.7, "acc_norm_stderr,none": 0.014498627873361427}} | |
| {"created_at": "2025-09-25T18:19:49.363134", "global_step": 6000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.011552714477876657}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.355008962358096, "acc_stderr,none": 0.004775380866948015, "acc_norm,none": 0.43308105954989046, "acc_norm_stderr,none": 0.0049448895454979516}, "mmlu": {"acc,none": 0.23899729383278734, "acc_stderr,none": 0.003597103128467749, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2556854410201913, "acc_stderr,none": 0.006363385444330416, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.03718489006818115}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2909090909090909, "acc_stderr,none": 0.03546563019624336}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.030778554678693268}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.27848101265822783, "acc_stderr,none": 0.02917868230484255}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635464}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.043733130409147614}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.27607361963190186, "acc_stderr,none": 0.0351238528370505}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2254335260115607, "acc_stderr,none": 0.022497230190967554}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25921787709497207, "acc_stderr,none": 0.014655780837497712}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451163}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.25554106910039115, "acc_stderr,none": 0.01113985783359852}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.27485380116959063, "acc_stderr,none": 0.03424042924691584}, "mmlu_other": {"acc,none": 0.23656260057933698, "acc_stderr,none": 0.007616657708750489, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22264150943396227, "acc_stderr,none": 0.025604233470899105}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818317}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.29596412556053814, "acc_stderr,none": 0.030636591348699803}, "mmlu_management": {"alias": " - management", "acc,none": 0.20388349514563106, "acc_stderr,none": 0.03989139859531773}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.027421007295392926}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24648786717752236, "acc_stderr,none": 0.015411308769686927}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.19281045751633988, "acc_stderr,none": 0.022589318888176748}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2872340425531915, "acc_stderr,none": 0.026992199173064356}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.1801470588235294, "acc_stderr,none": 0.023345163616544855}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.23366915827104323, "acc_stderr,none": 0.0076275517157326144, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2894736842105263, "acc_stderr,none": 0.04266339443159394}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.02962022787479049}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.27461139896373055, "acc_stderr,none": 0.032210245080411544}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.18461538461538463, "acc_stderr,none": 0.01967163241310029}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.226890756302521, "acc_stderr,none": 0.027205371538279472}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232756}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25163398692810457, "acc_stderr,none": 0.01755581809132227}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.24545454545454545, "acc_stderr,none": 0.04122066502878285}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2163265306122449, "acc_stderr,none": 0.026358916334904062}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.25870646766169153, "acc_stderr,none": 0.030965903123573012}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.04461960433384741}, "mmlu_stem": {"acc,none": 0.22169362511893434, "acc_stderr,none": 0.007396965387292062, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.17, "acc_stderr,none": 0.03775251680686371}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.22962962962962963, "acc_stderr,none": 0.03633384414073465}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.2236842105263158, "acc_stderr,none": 0.03391160934343602}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2708333333333333, "acc_stderr,none": 0.037161774375660164}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036846}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.19607843137254902, "acc_stderr,none": 0.03950581861179963}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.02924188386962881}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.03375672449560554}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.02210112878741543}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2032258064516129, "acc_stderr,none": 0.02289168798455495}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489614}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.023684075585322647}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.19205298013245034, "acc_stderr,none": 0.03216298420593614}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.18518518518518517, "acc_stderr,none": 0.026491914727355154}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.038946411200447915}, "sciq": {"alias": "sciq", "acc,none": 0.814, "acc_stderr,none": 0.012310790208412805, "acc_norm,none": 0.728, "acc_norm_stderr,none": 0.014078856992462618}} | |
| {"created_at": "2025-09-25T20:01:12.057071", "global_step": 8000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20556920556920558, "acc_stderr,none": 0.011569834551534293}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.36666002788289187, "acc_stderr,none": 0.004809077205343491, "acc_norm,none": 0.4554869547898825, "acc_norm_stderr,none": 0.00496996845825617}, "mmlu": {"acc,none": 0.24027916251246262, "acc_stderr,none": 0.0036044826944976378, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2607863974495218, "acc_stderr,none": 0.006404796451568812, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.037184890068181146}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2545454545454545, "acc_stderr,none": 0.03401506715249039}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.22058823529411764, "acc_stderr,none": 0.02910225438967408}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.24050632911392406, "acc_stderr,none": 0.02782078198114968}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.256198347107438, "acc_stderr,none": 0.03984979653302872}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.039578354719809784}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2630057803468208, "acc_stderr,none": 0.023703099525258172}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.26927374301675977, "acc_stderr,none": 0.014835616582882611}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26366559485530544, "acc_stderr,none": 0.02502553850053234}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.27469135802469136, "acc_stderr,none": 0.024836057868294677}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.26010430247718386, "acc_stderr,none": 0.01120438288782384}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.29239766081871343, "acc_stderr,none": 0.03488647713457923}, "mmlu_other": {"acc,none": 0.2281943997425169, "acc_stderr,none": 0.007527404376239629, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.22641509433962265, "acc_stderr,none": 0.025757559893106727}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.20809248554913296, "acc_stderr,none": 0.03095289021774988}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.16, "acc_stderr,none": 0.03684529491774709}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.3004484304932735, "acc_stderr,none": 0.030769352008229136}, "mmlu_management": {"alias": " - management", "acc,none": 0.21359223300970873, "acc_stderr,none": 0.04058042015646035}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.02742100729539292}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.227330779054917, "acc_stderr,none": 0.014987270640946017}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.21568627450980393, "acc_stderr,none": 0.02355083135199509}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.26595744680851063, "acc_stderr,none": 0.026358065698880585}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.19852941176470587, "acc_stderr,none": 0.024231013370541107}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629921}, "mmlu_social_sciences": {"acc,none": 0.2265193370165746, "acc_stderr,none": 0.0075457535700145335, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.2719298245614035, "acc_stderr,none": 0.041857744240220575}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.18686868686868688, "acc_stderr,none": 0.027772533334218964}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.21243523316062177, "acc_stderr,none": 0.02951928261681726}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.19487179487179487, "acc_stderr,none": 0.020083167595181393}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.017765978652327572}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22900763358778625, "acc_stderr,none": 0.036853466317118506}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320657}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.22727272727272727, "acc_stderr,none": 0.040139645540727735}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866764}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23383084577114427, "acc_stderr,none": 0.02992941540834839}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_stem": {"acc,none": 0.23501427212178877, "acc_stderr,none": 0.007555760085743377, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368466}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174023}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03459777606810538}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2916666666666667, "acc_stderr,none": 0.03800968060554858}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.16, "acc_stderr,none": 0.0368452949177471}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808778}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.24680851063829787, "acc_stderr,none": 0.028185441301234102}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.23448275862068965, "acc_stderr,none": 0.035306258743465914}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.02167921966369314}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2064516129032258, "acc_stderr,none": 0.023025899617188716}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.20689655172413793, "acc_stderr,none": 0.02850137816789395}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.21481481481481482, "acc_stderr,none": 0.025040443877000686}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2251655629139073, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.029711275860005344}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.22321428571428573, "acc_stderr,none": 0.03952301967702511}, "sciq": {"alias": "sciq", "acc,none": 0.804, "acc_stderr,none": 0.012559527926707363, "acc_norm,none": 0.723, "acc_norm_stderr,none": 0.014158794845306263}} | |
| {"created_at": "2025-09-25T21:42:20.568925", "global_step": 10000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19983619983619982, "acc_stderr,none": 0.01144844799672838}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.3808006373232424, "acc_stderr,none": 0.004845912857338656, "acc_norm,none": 0.476000796654053, "acc_norm_stderr,none": 0.0049840302505072915}, "mmlu": {"acc,none": 0.24213075060532688, "acc_stderr,none": 0.0036118764405864024, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2603613177470776, "acc_stderr,none": 0.006393597563108829, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.19047619047619047, "acc_stderr,none": 0.035122074123020534}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2787878787878788, "acc_stderr,none": 0.03501438706296781}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.2696078431372549, "acc_stderr,none": 0.031145570659486782}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2742616033755274, "acc_stderr,none": 0.029041333510598042}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.32231404958677684, "acc_stderr,none": 0.042664163633521685}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.2777777777777778, "acc_stderr,none": 0.04330043749650742}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3312883435582822, "acc_stderr,none": 0.03697983910025588}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2543352601156069, "acc_stderr,none": 0.023445826276545546}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.23910614525139665, "acc_stderr,none": 0.014265554192331152}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.22508038585209003, "acc_stderr,none": 0.02372008851617903}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.28703703703703703, "acc_stderr,none": 0.025171041915309684}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2561929595827901, "acc_stderr,none": 0.01114917315311058}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.3157894736842105, "acc_stderr,none": 0.03565079670708311}, "mmlu_other": {"acc,none": 0.22594142259414227, "acc_stderr,none": 0.007492973414729824, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.28, "acc_stderr,none": 0.04512608598542127}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.19622641509433963, "acc_stderr,none": 0.024442388131100817}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.03295304696818318}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.2062780269058296, "acc_stderr,none": 0.027157150479563824}, "mmlu_management": {"alias": " - management", "acc,none": 0.17475728155339806, "acc_stderr,none": 0.037601780060266224}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2564102564102564, "acc_stderr,none": 0.02860595370200426}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2413793103448276, "acc_stderr,none": 0.015302380123542094}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.20261437908496732, "acc_stderr,none": 0.023015446877985662}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.2765957446808511, "acc_stderr,none": 0.026684564340460997}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.16176470588235295, "acc_stderr,none": 0.022368672562886754}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.23236919077023074, "acc_stderr,none": 0.007611909086572729, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.22807017543859648, "acc_stderr,none": 0.03947152782669415}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.1919191919191919, "acc_stderr,none": 0.028057791672989017}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.24870466321243523, "acc_stderr,none": 0.031195840877700286}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.22268907563025211, "acc_stderr,none": 0.027025433498882385}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.22018348623853212, "acc_stderr,none": 0.01776597865232756}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.2366412213740458, "acc_stderr,none": 0.03727673575596917}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.25980392156862747, "acc_stderr,none": 0.017740899509177795}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.17272727272727273, "acc_stderr,none": 0.03620691833929219}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.22857142857142856, "acc_stderr,none": 0.02688214492230774}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409217}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252605}, "mmlu_stem": {"acc,none": 0.2404059625753251, "acc_stderr,none": 0.007616832499821597, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.0391545063041425}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.23026315789473684, "acc_stderr,none": 0.03426059424403165}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080341}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.2, "acc_stderr,none": 0.040201512610368445}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.04389869956808778}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720683}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.22127659574468084, "acc_stderr,none": 0.027136349602424063}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.18620689655172415, "acc_stderr,none": 0.032439461590046174}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2328042328042328, "acc_stderr,none": 0.021765961672154523}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.23548387096774193, "acc_stderr,none": 0.02413763242933771}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.029678333141444455}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.33, "acc_stderr,none": 0.04725815626252604}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.026202766534652148}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.2582781456953642, "acc_stderr,none": 0.035737053147634576}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2361111111111111, "acc_stderr,none": 0.028963702570791037}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.21428571428571427, "acc_stderr,none": 0.03894641120044792}, "sciq": {"alias": "sciq", "acc,none": 0.816, "acc_stderr,none": 0.012259457340938598, "acc_norm,none": 0.739, "acc_norm_stderr,none": 0.01389503767796513}} | |
| {"created_at": "2025-09-25T23:23:41.529691", "global_step": 12000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19492219492219492, "acc_stderr,none": 0.011341478090883527}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.39075881298546106, "acc_stderr,none": 0.004869232758103323, "acc_norm,none": 0.4992033459470225, "acc_norm_stderr,none": 0.004989775077835652}, "mmlu": {"acc,none": 0.24341261928500213, "acc_stderr,none": 0.0036161878941598355, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.2561105207226355, "acc_stderr,none": 0.006357137928320468, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.23809523809523808, "acc_stderr,none": 0.03809523809523809}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.21212121212121213, "acc_stderr,none": 0.031922715695482995}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3235294117647059, "acc_stderr,none": 0.03283472056108566}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.23628691983122363, "acc_stderr,none": 0.027652153144159256}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2892561983471074, "acc_stderr,none": 0.04139112727635463}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.26851851851851855, "acc_stderr,none": 0.04284467968052192}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.3006134969325153, "acc_stderr,none": 0.03602511318806771}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2871508379888268, "acc_stderr,none": 0.015131608849963745}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2604501607717042, "acc_stderr,none": 0.02492672322484554}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.023468429832451145}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2333767926988266, "acc_stderr,none": 0.010803108481179088}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.03528211258245231}, "mmlu_other": {"acc,none": 0.24364338590280013, "acc_stderr,none": 0.007684210666836717, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.26, "acc_stderr,none": 0.044084400227680794}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.19622641509433963, "acc_stderr,none": 0.024442388131100824}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2023121387283237, "acc_stderr,none": 0.030631145539198823}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.19282511210762332, "acc_stderr,none": 0.026478240960489365}, "mmlu_management": {"alias": " - management", "acc,none": 0.22330097087378642, "acc_stderr,none": 0.04123553189891431}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.3034188034188034, "acc_stderr,none": 0.03011821010694265}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.24904214559386972, "acc_stderr,none": 0.015464676163395976}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.024404394928087866}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.028121636040639882}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.02503584522771129}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.23493975903614459, "acc_stderr,none": 0.03300533186128922}, "mmlu_social_sciences": {"acc,none": 0.22879428014299644, "acc_stderr,none": 0.0075670957139605515, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518754}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03173071239071724}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.32124352331606215, "acc_stderr,none": 0.033699508685490674}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2153846153846154, "acc_stderr,none": 0.020843034557462878}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.21008403361344538, "acc_stderr,none": 0.026461398717471874}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.21834862385321102, "acc_stderr,none": 0.017712600528722724}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.19083969465648856, "acc_stderr,none": 0.034465133507525975}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2369281045751634, "acc_stderr,none": 0.01720166216978979}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.2, "acc_stderr,none": 0.03831305140884601}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.21224489795918366, "acc_stderr,none": 0.026176967197866764}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.23880597014925373, "acc_stderr,none": 0.030147775935409224}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653695}, "mmlu_stem": {"acc,none": 0.23850301300348875, "acc_stderr,none": 0.007582852910456969, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.23703703703703705, "acc_stderr,none": 0.03673731683969506}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.20394736842105263, "acc_stderr,none": 0.0327900040631005}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.22916666666666666, "acc_stderr,none": 0.03514697467862388}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816507}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.20588235294117646, "acc_stderr,none": 0.04023382273617746}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.22, "acc_stderr,none": 0.041633319989322695}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.31063829787234043, "acc_stderr,none": 0.03025123757921317}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.03455930201924812}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.23015873015873015, "acc_stderr,none": 0.02167921966369314}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.25483870967741934, "acc_stderr,none": 0.024790118459332208}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.22167487684729065, "acc_stderr,none": 0.029225575892489607}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.026067159222275798}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.1456953642384106, "acc_stderr,none": 0.028806043935008668}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2222222222222222, "acc_stderr,none": 0.028353212866863448}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.33035714285714285, "acc_stderr,none": 0.04464285714285712}, "sciq": {"alias": "sciq", "acc,none": 0.837, "acc_stderr,none": 0.01168621271274684, "acc_norm,none": 0.761, "acc_norm_stderr,none": 0.013493000446937593}} | |
| {"created_at": "2025-09-26T01:05:00.955313", "global_step": 14000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.19082719082719082, "acc_stderr,none": 0.011250215810979054}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4047998406691894, "acc_stderr,none": 0.004898501014225845, "acc_norm,none": 0.5144393547102171, "acc_norm_stderr,none": 0.004987700288855851}, "mmlu": {"acc,none": 0.24782794473721692, "acc_stderr,none": 0.003638309163228525, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.23783209351753454, "acc_stderr,none": 0.006204584510444356, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.1746031746031746, "acc_stderr,none": 0.03395490020856111}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069425}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.2066115702479339, "acc_stderr,none": 0.03695980128098824}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.03957835471980979}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2392638036809816, "acc_stderr,none": 0.033519538795212696}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.22832369942196531, "acc_stderr,none": 0.02259870380432162}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.24804469273743016, "acc_stderr,none": 0.014444157808261453}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2347266881028939, "acc_stderr,none": 0.024071805887677048}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25617283950617287, "acc_stderr,none": 0.024288533637726095}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2242503259452412, "acc_stderr,none": 0.010652615824906172}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.30409356725146197, "acc_stderr,none": 0.0352821125824523}, "mmlu_other": {"acc,none": 0.2491149018345671, "acc_stderr,none": 0.007732156620131129, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.33, "acc_stderr,none": 0.047258156262526045}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756191}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.03242414757483098}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.38, "acc_stderr,none": 0.048783173121456316}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.18385650224215247, "acc_stderr,none": 0.025998379092356517}, "mmlu_management": {"alias": " - management", "acc,none": 0.2815533980582524, "acc_stderr,none": 0.044532548363264673}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.2, "acc_stderr,none": 0.04020151261036845}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2515964240102171, "acc_stderr,none": 0.015517322365529627}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.024288619466046123}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32978723404255317, "acc_stderr,none": 0.028045946942042394}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.21691176470588236, "acc_stderr,none": 0.02503584522771125}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.1927710843373494, "acc_stderr,none": 0.030709824050565267}, "mmlu_social_sciences": {"acc,none": 0.25056873578160543, "acc_stderr,none": 0.007802739285310109, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21052631578947367, "acc_stderr,none": 0.038351539543994194}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.29292929292929293, "acc_stderr,none": 0.032424979581788166}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104283}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.23333333333333334, "acc_stderr,none": 0.021444547301560476}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.24789915966386555, "acc_stderr,none": 0.028047967224176892}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.24220183486238533, "acc_stderr,none": 0.01836817630659862}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.183206106870229, "acc_stderr,none": 0.03392770926494733}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2630718954248366, "acc_stderr,none": 0.017812676542320653}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724138}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.27346938775510204, "acc_stderr,none": 0.028535560337128448}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21393034825870647, "acc_stderr,none": 0.028996909693328934}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816505}, "mmlu_stem": {"acc,none": 0.2588011417697431, "acc_stderr,none": 0.00779707211192818, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.03738520676119668}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.24305555555555555, "acc_stderr,none": 0.03586879280080343}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.23, "acc_stderr,none": 0.042295258468165065}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.042801058373643966}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3446808510638298, "acc_stderr,none": 0.03106898596312215}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.18620689655172415, "acc_stderr,none": 0.032439461590046174}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.24338624338624337, "acc_stderr,none": 0.02210112878741543}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.2645161290322581, "acc_stderr,none": 0.025091892378859275}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2315270935960591, "acc_stderr,none": 0.02967833314144447}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25555555555555554, "acc_stderr,none": 0.02659393910184407}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.23841059602649006, "acc_stderr,none": 0.03479185572599661}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2824074074074074, "acc_stderr,none": 0.030701372111510927}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.24107142857142858, "acc_stderr,none": 0.04059867246952689}, "sciq": {"alias": "sciq", "acc,none": 0.844, "acc_stderr,none": 0.011480235006122367, "acc_norm,none": 0.788, "acc_norm_stderr,none": 0.012931481864938057}} | |
| {"created_at": "2025-09-26T02:46:01.070270", "global_step": 16000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.21375921375921375, "acc_stderr,none": 0.011737086112127208}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41206930890260907, "acc_stderr,none": 0.004912015369160078, "acc_norm,none": 0.5241983668591914, "acc_norm_stderr,none": 0.004983934343250463}, "mmlu": {"acc,none": 0.2695484973650477, "acc_stderr,none": 0.0037329026126065173, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24867162592986186, "acc_stderr,none": 0.006296024046437298, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30952380952380953, "acc_stderr,none": 0.04134913018303316}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2727272727272727, "acc_stderr,none": 0.03477691162163659}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.27941176470588236, "acc_stderr,none": 0.031493281045079556}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.2109704641350211, "acc_stderr,none": 0.02655837250266192}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.15702479338842976, "acc_stderr,none": 0.0332124484254713}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.21296296296296297, "acc_stderr,none": 0.039578354719809784}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.31901840490797545, "acc_stderr,none": 0.03661997551073836}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24566473988439305, "acc_stderr,none": 0.023176298203992012}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2558659217877095, "acc_stderr,none": 0.014593620923210749}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2861736334405145, "acc_stderr,none": 0.025670259242188943}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.21604938271604937, "acc_stderr,none": 0.022899162918445796}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.24185136897001303, "acc_stderr,none": 0.010936550813827071}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.27486321210170583, "acc_stderr,none": 0.007950593071343421, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909283}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.2830188679245283, "acc_stderr,none": 0.027724236492700904}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.31213872832369943, "acc_stderr,none": 0.035331333893236574}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.16143497757847533, "acc_stderr,none": 0.02469395789912846}, "mmlu_management": {"alias": " - management", "acc,none": 0.3883495145631068, "acc_stderr,none": 0.0482572933735639}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.23931623931623933, "acc_stderr,none": 0.027951826808924333}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.22, "acc_stderr,none": 0.04163331998932269}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2388250319284802, "acc_stderr,none": 0.015246803197398687}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.2908496732026144, "acc_stderr,none": 0.026004800363952113}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3262411347517731, "acc_stderr,none": 0.027968453043563168}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.39338235294117646, "acc_stderr,none": 0.029674288281311172}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.2289156626506024, "acc_stderr,none": 0.03270745277352477}, "mmlu_social_sciences": {"acc,none": 0.28631784205394867, "acc_stderr,none": 0.008134260696471418, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.30808080808080807, "acc_stderr,none": 0.032894773300986176}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.34196891191709844, "acc_stderr,none": 0.03423465100104284}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.31025641025641026, "acc_stderr,none": 0.023454674889404288}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.029597329730978103}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.3174311926605505, "acc_stderr,none": 0.0199571521984605}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.22137404580152673, "acc_stderr,none": 0.03641297081313729}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.24019607843137256, "acc_stderr,none": 0.017282760695167425}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.3, "acc_stderr,none": 0.04389311454644287}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.3306122448979592, "acc_stderr,none": 0.0301164262965406}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.24875621890547264, "acc_stderr,none": 0.030567675938916707}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_stem": {"acc,none": 0.2790992705359975, "acc_stderr,none": 0.007982635034781286, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.25, "acc_stderr,none": 0.04351941398892446}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.24444444444444444, "acc_stderr,none": 0.037125378336148665}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3026315789473684, "acc_stderr,none": 0.037385206761196686}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.25, "acc_stderr,none": 0.03621034121889507}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.37, "acc_stderr,none": 0.048523658709391}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768079}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.2549019607843137, "acc_stderr,none": 0.04336432707993178}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.23, "acc_stderr,none": 0.04229525846816506}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.34893617021276596, "acc_stderr,none": 0.031158522131357787}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.296551724137931, "acc_stderr,none": 0.03806142687309994}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.26455026455026454, "acc_stderr,none": 0.022717467897708614}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3161290322580645, "acc_stderr,none": 0.02645087448904277}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.270935960591133, "acc_stderr,none": 0.031270907132976984}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.27037037037037037, "acc_stderr,none": 0.02708037281514566}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.271523178807947, "acc_stderr,none": 0.03631329803969653}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2824074074074074, "acc_stderr,none": 0.030701372111510927}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.16964285714285715, "acc_stderr,none": 0.03562367850095391}, "sciq": {"alias": "sciq", "acc,none": 0.845, "acc_stderr,none": 0.011450157470799468, "acc_norm,none": 0.777, "acc_norm_stderr,none": 0.013169830843425672}} | |
| {"created_at": "2025-09-26T04:27:10.358606", "global_step": 18000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20065520065520065, "acc_stderr,none": 0.011466011466011559}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.41874128659629556, "acc_stderr,none": 0.004923445627861521, "acc_norm,none": 0.5354511053574985, "acc_norm_stderr,none": 0.004977223485342013}, "mmlu": {"acc,none": 0.2566585956416465, "acc_stderr,none": 0.003681812775095193, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24335812964930925, "acc_stderr,none": 0.006252847497914703, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.24603174603174602, "acc_stderr,none": 0.03852273364924318}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.2606060606060606, "acc_stderr,none": 0.034277431758165236}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3137254901960784, "acc_stderr,none": 0.03256685484460388}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069405}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.21487603305785125, "acc_stderr,none": 0.037494924487096966}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.23148148148148148, "acc_stderr,none": 0.04077494709252626}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.24539877300613497, "acc_stderr,none": 0.03380939813943354}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.24855491329479767, "acc_stderr,none": 0.023267528432100174}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25027932960893856, "acc_stderr,none": 0.014487500852850414}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.2765273311897106, "acc_stderr,none": 0.0254038329781796}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.25308641975308643, "acc_stderr,none": 0.024191808600713002}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.21707953063885269, "acc_stderr,none": 0.010529243841561366}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.2982456140350877, "acc_stderr,none": 0.03508771929824563}, "mmlu_other": {"acc,none": 0.2594142259414226, "acc_stderr,none": 0.00784066413021631, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.27547169811320754, "acc_stderr,none": 0.027495663683724074}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2832369942196532, "acc_stderr,none": 0.03435568056047875}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.16591928251121077, "acc_stderr,none": 0.024967553196547157}, "mmlu_management": {"alias": " - management", "acc,none": 0.2912621359223301, "acc_stderr,none": 0.04498676320572924}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.25213675213675213, "acc_stderr,none": 0.02844796547623102}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.18, "acc_stderr,none": 0.03861229196653697}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.2656449553001277, "acc_stderr,none": 0.015794302487888715}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.238562091503268, "acc_stderr,none": 0.02440439492808787}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.3333333333333333, "acc_stderr,none": 0.02812163604063989}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.2757352941176471, "acc_stderr,none": 0.027146271936625166}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.20481927710843373, "acc_stderr,none": 0.03141784291663925}, "mmlu_social_sciences": {"acc,none": 0.2567435814104647, "acc_stderr,none": 0.007873104608006323, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.23684210526315788, "acc_stderr,none": 0.03999423879281336}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.31088082901554404, "acc_stderr,none": 0.03340361906276586}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2641025641025641, "acc_stderr,none": 0.022352193737453282}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.27310924369747897, "acc_stderr,none": 0.02894200404099817}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26605504587155965, "acc_stderr,none": 0.018946022322225583}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.1984732824427481, "acc_stderr,none": 0.034981493854624714}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2647058823529412, "acc_stderr,none": 0.017848089574913226}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.18181818181818182, "acc_stderr,none": 0.036942843353378}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2693877551020408, "acc_stderr,none": 0.02840125202902294}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.208955223880597, "acc_stderr,none": 0.028748298931728655}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_stem": {"acc,none": 0.27370758008246115, "acc_stderr,none": 0.007942493120574306, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.03785714465066654}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.34210526315789475, "acc_stderr,none": 0.03860731599316091}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2569444444444444, "acc_stderr,none": 0.03653946969442099}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.042923469599092816}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364396}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.0446196043338474}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2936170212765957, "acc_stderr,none": 0.02977164271249123}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2206896551724138, "acc_stderr,none": 0.034559302019248124}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2751322751322751, "acc_stderr,none": 0.023000086859068646}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.3064516129032258, "acc_stderr,none": 0.026226485652553883}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2660098522167488, "acc_stderr,none": 0.031089826002937523}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206845}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.026719240783712156}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.25462962962962965, "acc_stderr,none": 0.029711275860005344}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.17857142857142858, "acc_stderr,none": 0.036352091215778065}, "sciq": {"alias": "sciq", "acc,none": 0.841, "acc_stderr,none": 0.011569479368271298, "acc_norm,none": 0.786, "acc_norm_stderr,none": 0.01297583802196877}} | |
| {"created_at": "2025-09-26T06:09:10.083706", "global_step": 20000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20393120393120392, "acc_stderr,none": 0.011535521334313655}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.42362079267078273, "acc_stderr,none": 0.004931219148182243, "acc_norm,none": 0.5401314479187412, "acc_norm_stderr,none": 0.004973683026202174}, "mmlu": {"acc,none": 0.2551630821820253, "acc_stderr,none": 0.00367278401732387, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24229543039319873, "acc_stderr,none": 0.006243979804066912, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.3088235294117647, "acc_stderr,none": 0.03242661719827218}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22362869198312235, "acc_stderr,none": 0.02712329820522997}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.1652892561983471, "acc_stderr,none": 0.03390780612972776}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.22085889570552147, "acc_stderr,none": 0.032591773927421776}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.2398843930635838, "acc_stderr,none": 0.022989592543123567}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.25921787709497207, "acc_stderr,none": 0.014655780837497712}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.26688102893890675, "acc_stderr,none": 0.025122637608816657}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.24074074074074073, "acc_stderr,none": 0.02378858355165854}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.2255541069100391, "acc_stderr,none": 0.010674556313461989}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.23976608187134502, "acc_stderr,none": 0.032744852119469564}, "mmlu_other": {"acc,none": 0.2565175410363695, "acc_stderr,none": 0.007797540146409975, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.22, "acc_stderr,none": 0.0416333199893227}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.026199808807561897}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.30057803468208094, "acc_stderr,none": 0.03496101481191179}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.35, "acc_stderr,none": 0.0479372485441102}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.15695067264573992, "acc_stderr,none": 0.024413587174907426}, "mmlu_management": {"alias": " - management", "acc,none": 0.3592233009708738, "acc_stderr,none": 0.04750458399041692}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.2264957264957265, "acc_stderr,none": 0.027421007295392923}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.17, "acc_stderr,none": 0.0377525168068637}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.25925925925925924, "acc_stderr,none": 0.01567100600933958}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.23202614379084968, "acc_stderr,none": 0.024170840879340995}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.31560283687943264, "acc_stderr,none": 0.027724989449509317}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.3125, "acc_stderr,none": 0.02815637344037142}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.21084337349397592, "acc_stderr,none": 0.03175554786629919}, "mmlu_social_sciences": {"acc,none": 0.2528436789080273, "acc_stderr,none": 0.007821296122783391, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.23232323232323232, "acc_stderr,none": 0.030088629490217483}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.30569948186528495, "acc_stderr,none": 0.03324837939758159}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.28205128205128205, "acc_stderr,none": 0.022815813098896597}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.02921354941437215}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.27339449541284405, "acc_stderr,none": 0.01910929984609829}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.16030534351145037, "acc_stderr,none": 0.032178294207446306}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2565359477124183, "acc_stderr,none": 0.01766784161237899}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.19090909090909092, "acc_stderr,none": 0.03764425585984926}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2612244897959184, "acc_stderr,none": 0.028123429335142797}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21890547263681592, "acc_stderr,none": 0.029239174636647}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.15, "acc_stderr,none": 0.0358870281282637}, "mmlu_stem": {"acc,none": 0.2752933713923248, "acc_stderr,none": 0.007965125637343171, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2518518518518518, "acc_stderr,none": 0.03749850709174021}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.037610708698674805}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.24, "acc_stderr,none": 0.04292346959909282}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.23529411764705882, "acc_stderr,none": 0.04220773659171453}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.2978723404255319, "acc_stderr,none": 0.029896145682095455}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2619047619047619, "acc_stderr,none": 0.022644212615525214}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.29354838709677417, "acc_stderr,none": 0.02590608702131929}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.2512315270935961, "acc_stderr,none": 0.030516530732694436}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.28888888888888886, "acc_stderr,none": 0.027634907264178544}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.304635761589404, "acc_stderr,none": 0.037579499229433426}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.2962962962962963, "acc_stderr,none": 0.03114144782353604}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.19642857142857142, "acc_stderr,none": 0.037709700493470194}, "sciq": {"alias": "sciq", "acc,none": 0.849, "acc_stderr,none": 0.011328165223341681, "acc_norm,none": 0.787, "acc_norm_stderr,none": 0.012953717566737235}} | |
| {"created_at": "2025-09-26T07:49:54.860782", "global_step": 22000, "commonsense_qa": {"alias": "commonsense_qa", "acc,none": 0.20475020475020475, "acc_stderr,none": 0.01155271447787667}, "hellaswag": {"alias": "hellaswag", "acc,none": 0.4244174467237602, "acc_stderr,none": 0.004932441479665535, "acc_norm,none": 0.5433180641306513, "acc_norm_stderr,none": 0.004971019942726587}, "mmlu": {"acc,none": 0.25409485828229594, "acc_stderr,none": 0.003669167340308539, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.24187035069075452, "acc_stderr,none": 0.006240769654939953, "alias": " - humanities"}, "mmlu_formal_logic": {"alias": " - formal_logic", "acc,none": 0.30158730158730157, "acc_stderr,none": 0.04104947269903394}, "mmlu_high_school_european_history": {"alias": " - high_school_european_history", "acc,none": 0.24242424242424243, "acc_stderr,none": 0.03346409881055953}, "mmlu_high_school_us_history": {"alias": " - high_school_us_history", "acc,none": 0.29901960784313725, "acc_stderr,none": 0.03213325717373617}, "mmlu_high_school_world_history": {"alias": " - high_school_world_history", "acc,none": 0.22784810126582278, "acc_stderr,none": 0.027303484599069415}, "mmlu_international_law": {"alias": " - international_law", "acc,none": 0.17355371900826447, "acc_stderr,none": 0.03457272836917669}, "mmlu_jurisprudence": {"alias": " - jurisprudence", "acc,none": 0.25, "acc_stderr,none": 0.04186091791394607}, "mmlu_logical_fallacies": {"alias": " - logical_fallacies", "acc,none": 0.2147239263803681, "acc_stderr,none": 0.03226219377286774}, "mmlu_moral_disputes": {"alias": " - moral_disputes", "acc,none": 0.23699421965317918, "acc_stderr,none": 0.02289408248992599}, "mmlu_moral_scenarios": {"alias": " - moral_scenarios", "acc,none": 0.2569832402234637, "acc_stderr,none": 0.014614465821966356}, "mmlu_philosophy": {"alias": " - philosophy", "acc,none": 0.27009646302250806, "acc_stderr,none": 0.02521804037341063}, "mmlu_prehistory": {"alias": " - prehistory", "acc,none": 0.23765432098765432, "acc_stderr,none": 0.02368359183700855}, "mmlu_professional_law": {"alias": " - professional_law", "acc,none": 0.22359843546284225, "acc_stderr,none": 0.010641589542841385}, "mmlu_world_religions": {"alias": " - world_religions", "acc,none": 0.26900584795321636, "acc_stderr,none": 0.03401052620104089}, "mmlu_other": {"acc,none": 0.25523012552301255, "acc_stderr,none": 0.007789293824152757, "alias": " - other"}, "mmlu_business_ethics": {"alias": " - business_ethics", "acc,none": 0.21, "acc_stderr,none": 0.040936018074033256}, "mmlu_clinical_knowledge": {"alias": " - clinical_knowledge", "acc,none": 0.23773584905660378, "acc_stderr,none": 0.02619980880756189}, "mmlu_college_medicine": {"alias": " - college_medicine", "acc,none": 0.2774566473988439, "acc_stderr,none": 0.03414014007044036}, "mmlu_global_facts": {"alias": " - global_facts", "acc,none": 0.34, "acc_stderr,none": 0.04760952285695236}, "mmlu_human_aging": {"alias": " - human_aging", "acc,none": 0.15695067264573992, "acc_stderr,none": 0.024413587174907412}, "mmlu_management": {"alias": " - management", "acc,none": 0.33980582524271846, "acc_stderr,none": 0.04689765937278135}, "mmlu_marketing": {"alias": " - marketing", "acc,none": 0.24358974358974358, "acc_stderr,none": 0.028120966503914394}, "mmlu_medical_genetics": {"alias": " - medical_genetics", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_miscellaneous": {"alias": " - miscellaneous", "acc,none": 0.26309067688378035, "acc_stderr,none": 0.015745497169049043}, "mmlu_nutrition": {"alias": " - nutrition", "acc,none": 0.22875816993464052, "acc_stderr,none": 0.024051029739912255}, "mmlu_professional_accounting": {"alias": " - professional_accounting", "acc,none": 0.32978723404255317, "acc_stderr,none": 0.028045946942042398}, "mmlu_professional_medicine": {"alias": " - professional_medicine", "acc,none": 0.29411764705882354, "acc_stderr,none": 0.027678468642144714}, "mmlu_virology": {"alias": " - virology", "acc,none": 0.19879518072289157, "acc_stderr,none": 0.03106939026078942}, "mmlu_social_sciences": {"acc,none": 0.25024374390640236, "acc_stderr,none": 0.007804231708081098, "alias": " - social sciences"}, "mmlu_econometrics": {"alias": " - econometrics", "acc,none": 0.21929824561403508, "acc_stderr,none": 0.03892431106518753}, "mmlu_high_school_geography": {"alias": " - high_school_geography", "acc,none": 0.2676767676767677, "acc_stderr,none": 0.03154449888270285}, "mmlu_high_school_government_and_politics": {"alias": " - high_school_government_and_politics", "acc,none": 0.26424870466321243, "acc_stderr,none": 0.031821550509166484}, "mmlu_high_school_macroeconomics": {"alias": " - high_school_macroeconomics", "acc,none": 0.2641025641025641, "acc_stderr,none": 0.02235219373745328}, "mmlu_high_school_microeconomics": {"alias": " - high_school_microeconomics", "acc,none": 0.2815126050420168, "acc_stderr,none": 0.029213549414372153}, "mmlu_high_school_psychology": {"alias": " - high_school_psychology", "acc,none": 0.26422018348623855, "acc_stderr,none": 0.01890416417151019}, "mmlu_human_sexuality": {"alias": " - human_sexuality", "acc,none": 0.15267175572519084, "acc_stderr,none": 0.031545216720054704}, "mmlu_professional_psychology": {"alias": " - professional_psychology", "acc,none": 0.2581699346405229, "acc_stderr,none": 0.017704531653250075}, "mmlu_public_relations": {"alias": " - public_relations", "acc,none": 0.20909090909090908, "acc_stderr,none": 0.03895091015724138}, "mmlu_security_studies": {"alias": " - security_studies", "acc,none": 0.2653061224489796, "acc_stderr,none": 0.02826388994378461}, "mmlu_sociology": {"alias": " - sociology", "acc,none": 0.21393034825870647, "acc_stderr,none": 0.028996909693328916}, "mmlu_us_foreign_policy": {"alias": " - us_foreign_policy", "acc,none": 0.18, "acc_stderr,none": 0.038612291966536955}, "mmlu_stem": {"acc,none": 0.27497621313035203, "acc_stderr,none": 0.007961443731286945, "alias": " - stem"}, "mmlu_abstract_algebra": {"alias": " - abstract_algebra", "acc,none": 0.29, "acc_stderr,none": 0.045604802157206824}, "mmlu_anatomy": {"alias": " - anatomy", "acc,none": 0.2740740740740741, "acc_stderr,none": 0.03853254836552003}, "mmlu_astronomy": {"alias": " - astronomy", "acc,none": 0.3092105263157895, "acc_stderr,none": 0.03761070869867481}, "mmlu_college_biology": {"alias": " - college_biology", "acc,none": 0.2638888888888889, "acc_stderr,none": 0.03685651095897532}, "mmlu_college_chemistry": {"alias": " - college_chemistry", "acc,none": 0.3, "acc_stderr,none": 0.046056618647183814}, "mmlu_college_computer_science": {"alias": " - college_computer_science", "acc,none": 0.26, "acc_stderr,none": 0.04408440022768078}, "mmlu_college_mathematics": {"alias": " - college_mathematics", "acc,none": 0.29, "acc_stderr,none": 0.04560480215720684}, "mmlu_college_physics": {"alias": " - college_physics", "acc,none": 0.24509803921568626, "acc_stderr,none": 0.04280105837364396}, "mmlu_computer_security": {"alias": " - computer_security", "acc,none": 0.27, "acc_stderr,none": 0.044619604333847394}, "mmlu_conceptual_physics": {"alias": " - conceptual_physics", "acc,none": 0.3021276595744681, "acc_stderr,none": 0.03001755447188055}, "mmlu_electrical_engineering": {"alias": " - electrical_engineering", "acc,none": 0.2620689655172414, "acc_stderr,none": 0.036646663372252565}, "mmlu_elementary_mathematics": {"alias": " - elementary_mathematics", "acc,none": 0.2671957671957672, "acc_stderr,none": 0.02278967314577657}, "mmlu_high_school_biology": {"alias": " - high_school_biology", "acc,none": 0.27419354838709675, "acc_stderr,none": 0.025378139970885193}, "mmlu_high_school_chemistry": {"alias": " - high_school_chemistry", "acc,none": 0.24630541871921183, "acc_stderr,none": 0.03031509928561773}, "mmlu_high_school_computer_science": {"alias": " - high_school_computer_science", "acc,none": 0.31, "acc_stderr,none": 0.04648231987117316}, "mmlu_high_school_mathematics": {"alias": " - high_school_mathematics", "acc,none": 0.2814814814814815, "acc_stderr,none": 0.027420019350945273}, "mmlu_high_school_physics": {"alias": " - high_school_physics", "acc,none": 0.31788079470198677, "acc_stderr,none": 0.038020397601079024}, "mmlu_high_school_statistics": {"alias": " - high_school_statistics", "acc,none": 0.27314814814814814, "acc_stderr,none": 0.030388051301678116}, "mmlu_machine_learning": {"alias": " - machine_learning", "acc,none": 0.17857142857142858, "acc_stderr,none": 0.036352091215778065}, "sciq": {"alias": "sciq", "acc,none": 0.847, "acc_stderr,none": 0.011389500459665546, "acc_norm,none": 0.787, "acc_norm_stderr,none": 0.012953717566737235}} | |
