{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 6666,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0144014401440144,
"grad_norm": 10.98869514465332,
"learning_rate": 4.347826086956522e-07,
"loss": 0.6974,
"step": 32
},
{
"epoch": 0.0288028802880288,
"grad_norm": 10.853715896606445,
"learning_rate": 9.145427286356823e-07,
"loss": 0.6333,
"step": 64
},
{
"epoch": 0.043204320432043204,
"grad_norm": 13.330265045166016,
"learning_rate": 1.3943028485757123e-06,
"loss": 0.5058,
"step": 96
},
{
"epoch": 0.0576057605760576,
"grad_norm": 11.676138877868652,
"learning_rate": 1.8740629685157422e-06,
"loss": 0.5039,
"step": 128
},
{
"epoch": 0.07200720072007201,
"grad_norm": 11.024947166442871,
"learning_rate": 2.3388305847076464e-06,
"loss": 0.5272,
"step": 160
},
{
"epoch": 0.08640864086408641,
"grad_norm": 11.500349998474121,
"learning_rate": 2.8185907046476763e-06,
"loss": 0.5414,
"step": 192
},
{
"epoch": 0.10081008100810081,
"grad_norm": 14.144675254821777,
"learning_rate": 3.2983508245877066e-06,
"loss": 0.5167,
"step": 224
},
{
"epoch": 0.1152115211521152,
"grad_norm": 15.339383125305176,
"learning_rate": 3.763118440779611e-06,
"loss": 0.4782,
"step": 256
},
{
"epoch": 0.12961296129612962,
"grad_norm": 14.547229766845703,
"learning_rate": 4.242878560719641e-06,
"loss": 0.5071,
"step": 288
},
{
"epoch": 0.14401440144014402,
"grad_norm": 16.514537811279297,
"learning_rate": 4.722638680659671e-06,
"loss": 0.5238,
"step": 320
},
{
"epoch": 0.15841584158415842,
"grad_norm": 13.728464126586914,
"learning_rate": 5.2023988005997004e-06,
"loss": 0.6176,
"step": 352
},
{
"epoch": 0.17281728172817282,
"grad_norm": 9.859703063964844,
"learning_rate": 5.682158920539731e-06,
"loss": 0.5022,
"step": 384
},
{
"epoch": 0.18721872187218722,
"grad_norm": 16.529651641845703,
"learning_rate": 6.16191904047976e-06,
"loss": 0.5606,
"step": 416
},
{
"epoch": 0.20162016201620162,
"grad_norm": 12.64522647857666,
"learning_rate": 6.6416791604197905e-06,
"loss": 0.5115,
"step": 448
},
{
"epoch": 0.21602160216021601,
"grad_norm": 9.362441062927246,
"learning_rate": 7.121439280359821e-06,
"loss": 0.5371,
"step": 480
},
{
"epoch": 0.2304230423042304,
"grad_norm": 12.86221694946289,
"learning_rate": 7.60119940029985e-06,
"loss": 0.5343,
"step": 512
},
{
"epoch": 0.2448244824482448,
"grad_norm": 12.512419700622559,
"learning_rate": 8.065967016491755e-06,
"loss": 0.6191,
"step": 544
},
{
"epoch": 0.25922592259225924,
"grad_norm": 14.35505199432373,
"learning_rate": 8.53073463268366e-06,
"loss": 0.5668,
"step": 576
},
{
"epoch": 0.27362736273627364,
"grad_norm": 14.951635360717773,
"learning_rate": 9.010494752623688e-06,
"loss": 0.6024,
"step": 608
},
{
"epoch": 0.28802880288028804,
"grad_norm": 11.853578567504883,
"learning_rate": 9.490254872563718e-06,
"loss": 0.6092,
"step": 640
},
{
"epoch": 0.30243024302430244,
"grad_norm": 9.165077209472656,
"learning_rate": 9.970014992503749e-06,
"loss": 0.5587,
"step": 672
},
{
"epoch": 0.31683168316831684,
"grad_norm": 12.137311935424805,
"learning_rate": 9.999382956748588e-06,
"loss": 0.5444,
"step": 704
},
{
"epoch": 0.33123312331233123,
"grad_norm": 15.218328475952148,
"learning_rate": 9.997364717027728e-06,
"loss": 0.5909,
"step": 736
},
{
"epoch": 0.34563456345634563,
"grad_norm": 13.624066352844238,
"learning_rate": 9.993943105133823e-06,
"loss": 0.6135,
"step": 768
},
{
"epoch": 0.36003600360036003,
"grad_norm": 12.48714542388916,
"learning_rate": 9.989119081932283e-06,
"loss": 0.5868,
"step": 800
},
{
"epoch": 0.37443744374437443,
"grad_norm": 20.65450668334961,
"learning_rate": 9.9828940021171e-06,
"loss": 0.6172,
"step": 832
},
{
"epoch": 0.38883888388838883,
"grad_norm": 15.066054344177246,
"learning_rate": 9.975269613830395e-06,
"loss": 0.5682,
"step": 864
},
{
"epoch": 0.40324032403240323,
"grad_norm": 10.448845863342285,
"learning_rate": 9.966248058171527e-06,
"loss": 0.6659,
"step": 896
},
{
"epoch": 0.41764176417641763,
"grad_norm": 6.550926208496094,
"learning_rate": 9.955831868595796e-06,
"loss": 0.6053,
"step": 928
},
{
"epoch": 0.43204320432043203,
"grad_norm": 18.164133071899414,
"learning_rate": 9.94402397020302e-06,
"loss": 0.5109,
"step": 960
},
{
"epoch": 0.4464446444644464,
"grad_norm": 26.886669158935547,
"learning_rate": 9.930827678916084e-06,
"loss": 0.6006,
"step": 992
},
{
"epoch": 0.4608460846084608,
"grad_norm": 9.457767486572266,
"learning_rate": 9.916246700549754e-06,
"loss": 0.5794,
"step": 1024
},
{
"epoch": 0.4752475247524752,
"grad_norm": 15.5580472946167,
"learning_rate": 9.900285129770016e-06,
"loss": 0.5606,
"step": 1056
},
{
"epoch": 0.4896489648964896,
"grad_norm": 9.84897232055664,
"learning_rate": 9.882947448944177e-06,
"loss": 0.6579,
"step": 1088
},
{
"epoch": 0.504050405040504,
"grad_norm": 10.959239959716797,
"learning_rate": 9.864238526882147e-06,
"loss": 0.5455,
"step": 1120
},
{
"epoch": 0.5184518451845185,
"grad_norm": 15.946135520935059,
"learning_rate": 9.844163617469138e-06,
"loss": 0.6002,
"step": 1152
},
{
"epoch": 0.5328532853285328,
"grad_norm": 12.9840726852417,
"learning_rate": 9.822728358190274e-06,
"loss": 0.6541,
"step": 1184
},
{
"epoch": 0.5472547254725473,
"grad_norm": 11.688780784606934,
"learning_rate": 9.799938768547452e-06,
"loss": 0.6294,
"step": 1216
},
{
"epoch": 0.5616561656165616,
"grad_norm": 13.91913890838623,
"learning_rate": 9.77580124836893e-06,
"loss": 0.5856,
"step": 1248
},
{
"epoch": 0.5760576057605761,
"grad_norm": 13.728696823120117,
"learning_rate": 9.750322576012119e-06,
"loss": 0.622,
"step": 1280
},
{
"epoch": 0.5904590459045904,
"grad_norm": 8.887660026550293,
"learning_rate": 9.723509906460054e-06,
"loss": 0.6663,
"step": 1312
},
{
"epoch": 0.6048604860486049,
"grad_norm": 12.27409839630127,
"learning_rate": 9.69537076931213e-06,
"loss": 0.5707,
"step": 1344
},
{
"epoch": 0.6192619261926192,
"grad_norm": 23.663471221923828,
"learning_rate": 9.665913066669608e-06,
"loss": 0.6018,
"step": 1376
},
{
"epoch": 0.6336633663366337,
"grad_norm": 14.472834587097168,
"learning_rate": 9.635145070916541e-06,
"loss": 0.5736,
"step": 1408
},
{
"epoch": 0.648064806480648,
"grad_norm": 13.856266975402832,
"learning_rate": 9.603075422396685e-06,
"loss": 0.6117,
"step": 1440
},
{
"epoch": 0.6624662466246625,
"grad_norm": 11.307412147521973,
"learning_rate": 9.569713126987122e-06,
"loss": 0.6123,
"step": 1472
},
{
"epoch": 0.6768676867686768,
"grad_norm": 12.995174407958984,
"learning_rate": 9.535067553569175e-06,
"loss": 0.5784,
"step": 1504
},
{
"epoch": 0.6912691269126913,
"grad_norm": 21.58072853088379,
"learning_rate": 9.499148431397448e-06,
"loss": 0.6081,
"step": 1536
},
{
"epoch": 0.7056705670567057,
"grad_norm": 16.922595977783203,
"learning_rate": 9.461965847367611e-06,
"loss": 0.6303,
"step": 1568
},
{
"epoch": 0.7200720072007201,
"grad_norm": 17.760387420654297,
"learning_rate": 9.423530243183783e-06,
"loss": 0.5761,
"step": 1600
},
{
"epoch": 0.7344734473447345,
"grad_norm": 11.714215278625488,
"learning_rate": 9.385111038077417e-06,
"loss": 0.5852,
"step": 1632
},
{
"epoch": 0.7488748874887489,
"grad_norm": 12.36257266998291,
"learning_rate": 9.344240422449167e-06,
"loss": 0.5965,
"step": 1664
},
{
"epoch": 0.7632763276327633,
"grad_norm": 14.300825119018555,
"learning_rate": 9.302149846608464e-06,
"loss": 0.5316,
"step": 1696
},
{
"epoch": 0.7776777677767777,
"grad_norm": 16.075668334960938,
"learning_rate": 9.25885113053368e-06,
"loss": 0.6228,
"step": 1728
},
{
"epoch": 0.7920792079207921,
"grad_norm": 11.793745040893555,
"learning_rate": 9.214356433476091e-06,
"loss": 0.5644,
"step": 1760
},
{
"epoch": 0.8064806480648065,
"grad_norm": 12.965546607971191,
"learning_rate": 9.168678250545255e-06,
"loss": 0.5646,
"step": 1792
},
{
"epoch": 0.8208820882088209,
"grad_norm": 9.327117919921875,
"learning_rate": 9.121829409200145e-06,
"loss": 0.5961,
"step": 1824
},
{
"epoch": 0.8352835283528353,
"grad_norm": 9.750507354736328,
"learning_rate": 9.073823065646882e-06,
"loss": 0.6158,
"step": 1856
},
{
"epoch": 0.8496849684968497,
"grad_norm": 17.131214141845703,
"learning_rate": 9.024672701144184e-06,
"loss": 0.5422,
"step": 1888
},
{
"epoch": 0.8640864086408641,
"grad_norm": 9.129823684692383,
"learning_rate": 8.97439211821753e-06,
"loss": 0.5846,
"step": 1920
},
{
"epoch": 0.8784878487848785,
"grad_norm": 9.996623039245605,
"learning_rate": 8.922995436783104e-06,
"loss": 0.5824,
"step": 1952
},
{
"epoch": 0.8928892889288929,
"grad_norm": 12.214370727539062,
"learning_rate": 8.870497090182593e-06,
"loss": 0.5921,
"step": 1984
},
{
"epoch": 0.9072907290729073,
"grad_norm": 12.267783164978027,
"learning_rate": 8.816911821129992e-06,
"loss": 0.5496,
"step": 2016
},
{
"epoch": 0.9216921692169217,
"grad_norm": 11.49367618560791,
"learning_rate": 8.762254677571517e-06,
"loss": 0.6221,
"step": 2048
},
{
"epoch": 0.9360936093609361,
"grad_norm": 11.387689590454102,
"learning_rate": 8.706541008459798e-06,
"loss": 0.5685,
"step": 2080
},
{
"epoch": 0.9504950495049505,
"grad_norm": 12.342247009277344,
"learning_rate": 8.649786459443555e-06,
"loss": 0.5922,
"step": 2112
},
{
"epoch": 0.9648964896489649,
"grad_norm": 17.48115348815918,
"learning_rate": 8.59200696847395e-06,
"loss": 0.6309,
"step": 2144
},
{
"epoch": 0.9792979297929792,
"grad_norm": 13.321036338806152,
"learning_rate": 8.533218761328843e-06,
"loss": 0.5513,
"step": 2176
},
{
"epoch": 0.9936993699369937,
"grad_norm": 13.628349304199219,
"learning_rate": 8.473438347056239e-06,
"loss": 0.5934,
"step": 2208
},
{
"epoch": 1.008100810081008,
"grad_norm": 6.996334075927734,
"learning_rate": 8.412682513338176e-06,
"loss": 0.4909,
"step": 2240
},
{
"epoch": 1.0225022502250225,
"grad_norm": 10.135323524475098,
"learning_rate": 8.35096832177636e-06,
"loss": 0.3238,
"step": 2272
},
{
"epoch": 1.036903690369037,
"grad_norm": 7.344214916229248,
"learning_rate": 8.28831310310089e-06,
"loss": 0.297,
"step": 2304
},
{
"epoch": 1.0513051305130514,
"grad_norm": 8.131421089172363,
"learning_rate": 8.224734452303397e-06,
"loss": 0.3335,
"step": 2336
},
{
"epoch": 1.0657065706570656,
"grad_norm": 8.626580238342285,
"learning_rate": 8.160250223695987e-06,
"loss": 0.3357,
"step": 2368
},
{
"epoch": 1.08010801080108,
"grad_norm": 20.845579147338867,
"learning_rate": 8.094878525897325e-06,
"loss": 0.3235,
"step": 2400
},
{
"epoch": 1.0945094509450946,
"grad_norm": 13.4508695602417,
"learning_rate": 8.028637716747355e-06,
"loss": 0.3331,
"step": 2432
},
{
"epoch": 1.108910891089109,
"grad_norm": 11.288851737976074,
"learning_rate": 7.961546398151988e-06,
"loss": 0.3425,
"step": 2464
},
{
"epoch": 1.1233123312331232,
"grad_norm": 12.70616626739502,
"learning_rate": 7.893623410859282e-06,
"loss": 0.343,
"step": 2496
},
{
"epoch": 1.1377137713771377,
"grad_norm": 14.446159362792969,
"learning_rate": 7.824887829168522e-06,
"loss": 0.3426,
"step": 2528
},
{
"epoch": 1.1521152115211521,
"grad_norm": 9.85571575164795,
"learning_rate": 7.755358955573747e-06,
"loss": 0.3229,
"step": 2560
},
{
"epoch": 1.1665166516651666,
"grad_norm": 16.067195892333984,
"learning_rate": 7.685056315343165e-06,
"loss": 0.3246,
"step": 2592
},
{
"epoch": 1.1809180918091808,
"grad_norm": 20.33487319946289,
"learning_rate": 7.613999651036016e-06,
"loss": 0.374,
"step": 2624
},
{
"epoch": 1.1953195319531953,
"grad_norm": 12.800278663635254,
"learning_rate": 7.542208916958433e-06,
"loss": 0.3008,
"step": 2656
},
{
"epoch": 1.2097209720972097,
"grad_norm": 10.589587211608887,
"learning_rate": 7.469704273559807e-06,
"loss": 0.3738,
"step": 2688
},
{
"epoch": 1.2241224122412242,
"grad_norm": 9.341863632202148,
"learning_rate": 7.396506081771295e-06,
"loss": 0.3035,
"step": 2720
},
{
"epoch": 1.2385238523852384,
"grad_norm": 8.244071960449219,
"learning_rate": 7.322634897288008e-06,
"loss": 0.3748,
"step": 2752
},
{
"epoch": 1.2529252925292529,
"grad_norm": 6.00961971282959,
"learning_rate": 7.248111464796508e-06,
"loss": 0.2727,
"step": 2784
},
{
"epoch": 1.2673267326732673,
"grad_norm": 9.179043769836426,
"learning_rate": 7.172956712149234e-06,
"loss": 0.3004,
"step": 2816
},
{
"epoch": 1.2817281728172818,
"grad_norm": 9.150934219360352,
"learning_rate": 7.0971917444875015e-06,
"loss": 0.3635,
"step": 2848
},
{
"epoch": 1.296129612961296,
"grad_norm": 9.036107063293457,
"learning_rate": 7.020837838314691e-06,
"loss": 0.2943,
"step": 2880
},
{
"epoch": 1.3105310531053105,
"grad_norm": 9.950891494750977,
"learning_rate": 6.94391643552134e-06,
"loss": 0.3727,
"step": 2912
},
{
"epoch": 1.324932493249325,
"grad_norm": 15.246482849121094,
"learning_rate": 6.866449137363768e-06,
"loss": 0.2992,
"step": 2944
},
{
"epoch": 1.3393339333933394,
"grad_norm": 10.054420471191406,
"learning_rate": 6.788457698397973e-06,
"loss": 0.3322,
"step": 2976
},
{
"epoch": 1.3537353735373538,
"grad_norm": 10.79068374633789,
"learning_rate": 6.709964020370445e-06,
"loss": 0.3245,
"step": 3008
},
{
"epoch": 1.368136813681368,
"grad_norm": 10.584324836730957,
"learning_rate": 6.630990146067687e-06,
"loss": 0.3418,
"step": 3040
},
{
"epoch": 1.3825382538253825,
"grad_norm": 16.110841751098633,
"learning_rate": 6.554047211421132e-06,
"loss": 0.3437,
"step": 3072
},
{
"epoch": 1.396939693969397,
"grad_norm": 11.55324649810791,
"learning_rate": 6.4741928832357855e-06,
"loss": 0.3011,
"step": 3104
},
{
"epoch": 1.4113411341134112,
"grad_norm": 15.144320487976074,
"learning_rate": 6.393924568602145e-06,
"loss": 0.337,
"step": 3136
},
{
"epoch": 1.4257425742574257,
"grad_norm": 10.759925842285156,
"learning_rate": 6.313264808664494e-06,
"loss": 0.3149,
"step": 3168
},
{
"epoch": 1.4401440144014401,
"grad_norm": 14.752134323120117,
"learning_rate": 6.232236254493746e-06,
"loss": 0.3622,
"step": 3200
},
{
"epoch": 1.4545454545454546,
"grad_norm": 11.484366416931152,
"learning_rate": 6.150861660726515e-06,
"loss": 0.3112,
"step": 3232
},
{
"epoch": 1.468946894689469,
"grad_norm": 13.652670860290527,
"learning_rate": 6.069163879175092e-06,
"loss": 0.3403,
"step": 3264
},
{
"epoch": 1.4833483348334833,
"grad_norm": 9.8715181350708,
"learning_rate": 5.9871658524101565e-06,
"loss": 0.3205,
"step": 3296
},
{
"epoch": 1.4977497749774977,
"grad_norm": 11.28494644165039,
"learning_rate": 5.9048906073179824e-06,
"loss": 0.304,
"step": 3328
},
{
"epoch": 1.5121512151215122,
"grad_norm": 13.887341499328613,
"learning_rate": 5.822361248633973e-06,
"loss": 0.3027,
"step": 3360
},
{
"epoch": 1.5265526552655264,
"grad_norm": 7.218605041503906,
"learning_rate": 5.7396009524543274e-06,
"loss": 0.3081,
"step": 3392
},
{
"epoch": 1.5409540954095409,
"grad_norm": 7.5199503898620605,
"learning_rate": 5.656632959727683e-06,
"loss": 0.2977,
"step": 3424
},
{
"epoch": 1.5553555355535553,
"grad_norm": 9.384359359741211,
"learning_rate": 5.57348056972852e-06,
"loss": 0.3032,
"step": 3456
},
{
"epoch": 1.5697569756975698,
"grad_norm": 14.385334014892578,
"learning_rate": 5.492772883806706e-06,
"loss": 0.3244,
"step": 3488
},
{
"epoch": 1.5841584158415842,
"grad_norm": 3.620002269744873,
"learning_rate": 5.40932574467665e-06,
"loss": 0.2992,
"step": 3520
},
{
"epoch": 1.5985598559855987,
"grad_norm": 9.094705581665039,
"learning_rate": 5.325763657690609e-06,
"loss": 0.2942,
"step": 3552
},
{
"epoch": 1.612961296129613,
"grad_norm": 10.146318435668945,
"learning_rate": 5.242110088958073e-06,
"loss": 0.3395,
"step": 3584
},
{
"epoch": 1.6273627362736274,
"grad_norm": 13.460920333862305,
"learning_rate": 5.158388530278656e-06,
"loss": 0.3097,
"step": 3616
},
{
"epoch": 1.6417641764176416,
"grad_norm": 11.407035827636719,
"learning_rate": 5.074622492545074e-06,
"loss": 0.3159,
"step": 3648
},
{
"epoch": 1.656165616561656,
"grad_norm": 10.230562210083008,
"learning_rate": 4.9908354991407666e-06,
"loss": 0.327,
"step": 3680
},
{
"epoch": 1.6705670567056705,
"grad_norm": 11.073262214660645,
"learning_rate": 4.9070510793339835e-06,
"loss": 0.2981,
"step": 3712
},
{
"epoch": 1.684968496849685,
"grad_norm": 12.47003173828125,
"learning_rate": 4.823292761670264e-06,
"loss": 0.3065,
"step": 3744
},
{
"epoch": 1.6993699369936994,
"grad_norm": 25.917009353637695,
"learning_rate": 4.74219897937266e-06,
"loss": 0.2992,
"step": 3776
},
{
"epoch": 1.7137713771377139,
"grad_norm": 11.52535629272461,
"learning_rate": 4.658560774737667e-06,
"loss": 0.3474,
"step": 3808
},
{
"epoch": 1.7281728172817283,
"grad_norm": 9.606013298034668,
"learning_rate": 4.5750184539003665e-06,
"loss": 0.3436,
"step": 3840
},
{
"epoch": 1.7425742574257426,
"grad_norm": 15.367290496826172,
"learning_rate": 4.4915954774194676e-06,
"loss": 0.2932,
"step": 3872
},
{
"epoch": 1.756975697569757,
"grad_norm": 7.993281364440918,
"learning_rate": 4.408315272339104e-06,
"loss": 0.3203,
"step": 3904
},
{
"epoch": 1.7713771377137713,
"grad_norm": 8.027710914611816,
"learning_rate": 4.325201225609999e-06,
"loss": 0.3139,
"step": 3936
},
{
"epoch": 1.7857785778577857,
"grad_norm": 10.957657814025879,
"learning_rate": 4.242276677521877e-06,
"loss": 0.3453,
"step": 3968
},
{
"epoch": 1.8001800180018002,
"grad_norm": 10.544370651245117,
"learning_rate": 4.159564915148997e-06,
"loss": 0.2853,
"step": 4000
},
{
"epoch": 1.8145814581458146,
"grad_norm": 12.427223205566406,
"learning_rate": 4.077089165810611e-06,
"loss": 0.3355,
"step": 4032
},
{
"epoch": 1.828982898289829,
"grad_norm": 18.81423568725586,
"learning_rate": 3.994872590548211e-06,
"loss": 0.328,
"step": 4064
},
{
"epoch": 1.8433843384338435,
"grad_norm": 9.081976890563965,
"learning_rate": 3.9129382776213945e-06,
"loss": 0.293,
"step": 4096
},
{
"epoch": 1.8577857785778578,
"grad_norm": 8.164251327514648,
"learning_rate": 3.831309236024159e-06,
"loss": 0.2782,
"step": 4128
},
{
"epoch": 1.8721872187218722,
"grad_norm": 9.661165237426758,
"learning_rate": 3.7500083890234606e-06,
"loss": 0.3296,
"step": 4160
},
{
"epoch": 1.8865886588658864,
"grad_norm": 14.657001495361328,
"learning_rate": 3.66905856772185e-06,
"loss": 0.2631,
"step": 4192
},
{
"epoch": 1.900990099009901,
"grad_norm": 16.178007125854492,
"learning_rate": 3.5884825046459805e-06,
"loss": 0.2765,
"step": 4224
},
{
"epoch": 1.9153915391539154,
"grad_norm": 7.737805366516113,
"learning_rate": 3.508302827362805e-06,
"loss": 0.317,
"step": 4256
},
{
"epoch": 1.9297929792979298,
"grad_norm": 9.958755493164062,
"learning_rate": 3.4285420521252533e-06,
"loss": 0.2975,
"step": 4288
},
{
"epoch": 1.9441944194419443,
"grad_norm": 13.483292579650879,
"learning_rate": 3.3492225775491582e-06,
"loss": 0.2776,
"step": 4320
},
{
"epoch": 1.9585958595859587,
"grad_norm": 11.092999458312988,
"learning_rate": 3.270366678323219e-06,
"loss": 0.3453,
"step": 4352
},
{
"epoch": 1.972997299729973,
"grad_norm": 11.395092964172363,
"learning_rate": 3.1919964989537755e-06,
"loss": 0.2678,
"step": 4384
},
{
"epoch": 1.9873987398739874,
"grad_norm": 10.83203411102295,
"learning_rate": 3.1141340475461316e-06,
"loss": 0.3074,
"step": 4416
},
{
"epoch": 2.0018001800180016,
"grad_norm": 6.892433166503906,
"learning_rate": 3.03680118962418e-06,
"loss": 0.3064,
"step": 4448
},
{
"epoch": 2.016201620162016,
"grad_norm": 6.377696514129639,
"learning_rate": 2.9600196419900795e-06,
"loss": 0.1179,
"step": 4480
},
{
"epoch": 2.0306030603060305,
"grad_norm": 8.865036964416504,
"learning_rate": 2.883810966625684e-06,
"loss": 0.1444,
"step": 4512
},
{
"epoch": 2.045004500450045,
"grad_norm": 7.50280237197876,
"learning_rate": 2.8081965646374582e-06,
"loss": 0.1138,
"step": 4544
},
{
"epoch": 2.0594059405940595,
"grad_norm": 9.762062072753906,
"learning_rate": 2.7331976702465647e-06,
"loss": 0.1251,
"step": 4576
},
{
"epoch": 2.073807380738074,
"grad_norm": 10.026853561401367,
"learning_rate": 2.658835344825821e-06,
"loss": 0.1239,
"step": 4608
},
{
"epoch": 2.0882088208820884,
"grad_norm": 10.256850242614746,
"learning_rate": 2.5851304709851855e-06,
"loss": 0.1131,
"step": 4640
},
{
"epoch": 2.102610261026103,
"grad_norm": 11.005268096923828,
"learning_rate": 2.5121037467074596e-06,
"loss": 0.1224,
"step": 4672
},
{
"epoch": 2.117011701170117,
"grad_norm": 11.805505752563477,
"learning_rate": 2.4397756795358287e-06,
"loss": 0.1101,
"step": 4704
},
{
"epoch": 2.1314131413141313,
"grad_norm": 7.897261142730713,
"learning_rate": 2.3703932820444233e-06,
"loss": 0.1238,
"step": 4736
},
{
"epoch": 2.1458145814581457,
"grad_norm": 10.117232322692871,
"learning_rate": 2.2994998626884623e-06,
"loss": 0.1072,
"step": 4768
},
{
"epoch": 2.16021602160216,
"grad_norm": 9.720394134521484,
"learning_rate": 2.2293648043808946e-06,
"loss": 0.1377,
"step": 4800
},
{
"epoch": 2.1746174617461747,
"grad_norm": 9.53496265411377,
"learning_rate": 2.1600078026201977e-06,
"loss": 0.114,
"step": 4832
},
{
"epoch": 2.189018901890189,
"grad_norm": 9.775762557983398,
"learning_rate": 2.091448334409112e-06,
"loss": 0.1053,
"step": 4864
},
{
"epoch": 2.2034203420342036,
"grad_norm": 11.364226341247559,
"learning_rate": 2.0237056527850555e-06,
"loss": 0.0966,
"step": 4896
},
{
"epoch": 2.217821782178218,
"grad_norm": 17.391111373901367,
"learning_rate": 1.95679878141344e-06,
"loss": 0.1016,
"step": 4928
},
{
"epoch": 2.232223222322232,
"grad_norm": 10.15132999420166,
"learning_rate": 1.8907465092453986e-06,
"loss": 0.1184,
"step": 4960
},
{
"epoch": 2.2466246624662465,
"grad_norm": 12.572965621948242,
"learning_rate": 1.8255673852414274e-06,
"loss": 0.1015,
"step": 4992
},
{
"epoch": 2.261026102610261,
"grad_norm": 5.2437968254089355,
"learning_rate": 1.7612797131624243e-06,
"loss": 0.0993,
"step": 5024
},
{
"epoch": 2.2754275427542754,
"grad_norm": 8.918633460998535,
"learning_rate": 1.6979015464295785e-06,
"loss": 0.1079,
"step": 5056
},
{
"epoch": 2.28982898289829,
"grad_norm": 9.719175338745117,
"learning_rate": 1.6354506830545625e-06,
"loss": 0.1016,
"step": 5088
},
{
"epoch": 2.3042304230423043,
"grad_norm": 24.36786460876465,
"learning_rate": 1.5739446606414522e-06,
"loss": 0.1342,
"step": 5120
},
{
"epoch": 2.3186318631863188,
"grad_norm": 7.500924587249756,
"learning_rate": 1.5134007514617827e-06,
"loss": 0.0955,
"step": 5152
},
{
"epoch": 2.333033303330333,
"grad_norm": 10.143550872802734,
"learning_rate": 1.4538359576040923e-06,
"loss": 0.1451,
"step": 5184
},
{
"epoch": 2.3474347434743477,
"grad_norm": 15.87746524810791,
"learning_rate": 1.395267006199363e-06,
"loss": 0.1329,
"step": 5216
},
{
"epoch": 2.3618361836183617,
"grad_norm": 12.571290016174316,
"learning_rate": 1.33771034472367e-06,
"loss": 0.0978,
"step": 5248
},
{
"epoch": 2.376237623762376,
"grad_norm": 11.44887924194336,
"learning_rate": 1.2811821363793497e-06,
"loss": 0.1037,
"step": 5280
},
{
"epoch": 2.3906390639063906,
"grad_norm": 11.863057136535645,
"learning_rate": 1.2256982555560243e-06,
"loss": 0.116,
"step": 5312
},
{
"epoch": 2.405040504050405,
"grad_norm": 5.990699768066406,
"learning_rate": 1.171274283372703e-06,
"loss": 0.1037,
"step": 5344
},
{
"epoch": 2.4194419441944195,
"grad_norm": 10.163458824157715,
"learning_rate": 1.1179255033022624e-06,
"loss": 0.1387,
"step": 5376
},
{
"epoch": 2.433843384338434,
"grad_norm": 11.073272705078125,
"learning_rate": 1.0672833301104142e-06,
"loss": 0.12,
"step": 5408
},
{
"epoch": 2.4482448244824484,
"grad_norm": 10.601714134216309,
"learning_rate": 1.016094827126849e-06,
"loss": 0.0957,
"step": 5440
},
{
"epoch": 2.4626462646264624,
"grad_norm": 7.653828144073486,
"learning_rate": 9.660250941303178e-07,
"loss": 0.113,
"step": 5472
},
{
"epoch": 2.477047704770477,
"grad_norm": 10.332706451416016,
"learning_rate": 9.170881918256042e-07,
"loss": 0.1001,
"step": 5504
},
{
"epoch": 2.4914491449144913,
"grad_norm": 7.20733642578125,
"learning_rate": 8.692978627932148e-07,
"loss": 0.0998,
"step": 5536
},
{
"epoch": 2.5058505850585058,
"grad_norm": 4.12467622756958,
"learning_rate": 8.226675276301416e-07,
"loss": 0.0855,
"step": 5568
},
{
"epoch": 2.5202520252025202,
"grad_norm": 9.155281066894531,
"learning_rate": 7.772102811810689e-07,
"loss": 0.122,
"step": 5600
},
{
"epoch": 2.5346534653465347,
"grad_norm": 5.481142997741699,
"learning_rate": 7.329388888610384e-07,
"loss": 0.0978,
"step": 5632
},
{
"epoch": 2.549054905490549,
"grad_norm": 7.971097946166992,
"learning_rate": 6.898657830706367e-07,
"loss": 0.1157,
"step": 5664
},
{
"epoch": 2.5634563456345636,
"grad_norm": 11.405919075012207,
"learning_rate": 6.492928309381779e-07,
"loss": 0.0979,
"step": 5696
},
{
"epoch": 2.577857785778578,
"grad_norm": 13.61896800994873,
"learning_rate": 6.08613879617217e-07,
"loss": 0.1041,
"step": 5728
},
{
"epoch": 2.592259225922592,
"grad_norm": 29.495868682861328,
"learning_rate": 5.691681280788214e-07,
"loss": 0.0992,
"step": 5760
},
{
"epoch": 2.6066606660666065,
"grad_norm": 11.616131782531738,
"learning_rate": 5.309666535753417e-07,
"loss": 0.1051,
"step": 5792
},
{
"epoch": 2.621062106210621,
"grad_norm": 8.599799156188965,
"learning_rate": 4.940201839382114e-07,
"loss": 0.1091,
"step": 5824
},
{
"epoch": 2.6354635463546354,
"grad_norm": 12.70768928527832,
"learning_rate": 4.5833909456532764e-07,
"loss": 0.1111,
"step": 5856
},
{
"epoch": 2.64986498649865,
"grad_norm": 11.202815055847168,
"learning_rate": 4.2393340550740844e-07,
"loss": 0.1046,
"step": 5888
},
{
"epoch": 2.6642664266426643,
"grad_norm": 11.233574867248535,
"learning_rate": 3.908127786541427e-07,
"loss": 0.1022,
"step": 5920
},
{
"epoch": 2.678667866786679,
"grad_norm": 9.141092300415039,
"learning_rate": 3.589865150209071e-07,
"loss": 0.0984,
"step": 5952
},
{
"epoch": 2.693069306930693,
"grad_norm": 15.323395729064941,
"learning_rate": 3.2846355213683456e-07,
"loss": 0.1279,
"step": 5984
},
{
"epoch": 2.7074707470747077,
"grad_norm": 10.427877426147461,
"learning_rate": 2.9925246153496067e-07,
"loss": 0.1169,
"step": 6016
},
{
"epoch": 2.7218721872187217,
"grad_norm": 12.140647888183594,
"learning_rate": 2.713614463451364e-07,
"loss": 0.1267,
"step": 6048
},
{
"epoch": 2.736273627362736,
"grad_norm": 13.221187591552734,
"learning_rate": 2.4479833899041183e-07,
"loss": 0.1198,
"step": 6080
},
{
"epoch": 2.7506750675067506,
"grad_norm": 5.667657852172852,
"learning_rate": 2.19570598987513e-07,
"loss": 0.1312,
"step": 6112
},
{
"epoch": 2.765076507650765,
"grad_norm": 0.735275149345398,
"learning_rate": 1.9568531085204067e-07,
"loss": 0.0699,
"step": 6144
},
{
"epoch": 2.7794779477947795,
"grad_norm": 4.510382652282715,
"learning_rate": 1.731491821089848e-07,
"loss": 0.0965,
"step": 6176
},
{
"epoch": 2.793879387938794,
"grad_norm": 13.006752967834473,
"learning_rate": 1.5196854140909545e-07,
"loss": 0.1026,
"step": 6208
},
{
"epoch": 2.8082808280828084,
"grad_norm": 10.59677791595459,
"learning_rate": 1.321493367516574e-07,
"loss": 0.1093,
"step": 6240
},
{
"epoch": 2.8226822682268224,
"grad_norm": 8.108485221862793,
"learning_rate": 1.136971338141596e-07,
"loss": 0.1191,
"step": 6272
},
{
"epoch": 2.8370837083708373,
"grad_norm": 11.026812553405762,
"learning_rate": 9.661711438932686e-08,
"loss": 0.1008,
"step": 6304
},
{
"epoch": 2.8514851485148514,
"grad_norm": 7.0519890785217285,
"learning_rate": 8.09140749299564e-08,
"loss": 0.1386,
"step": 6336
},
{
"epoch": 2.865886588658866,
"grad_norm": 27.898834228515625,
"learning_rate": 6.659242520196562e-08,
"loss": 0.103,
"step": 6368
},
{
"epoch": 2.8802880288028803,
"grad_norm": 7.882510185241699,
"learning_rate": 5.365618704603392e-08,
"loss": 0.0812,
"step": 6400
},
{
"epoch": 2.8946894689468947,
"grad_norm": 15.030416488647461,
"learning_rate": 4.2108993248173855e-08,
"loss": 0.102,
"step": 6432
},
{
"epoch": 2.909090909090909,
"grad_norm": 11.23343276977539,
"learning_rate": 3.195408651956944e-08,
"loss": 0.0976,
"step": 6464
},
{
"epoch": 2.9234923492349236,
"grad_norm": 6.837776184082031,
"learning_rate": 2.3194318585945673e-08,
"loss": 0.0938,
"step": 6496
},
{
"epoch": 2.937893789378938,
"grad_norm": 9.184088706970215,
"learning_rate": 1.583214938674138e-08,
"loss": 0.1087,
"step": 6528
},
{
"epoch": 2.952295229522952,
"grad_norm": 7.366724967956543,
"learning_rate": 1.0034770609533285e-08,
"loss": 0.1139,
"step": 6560
},
{
"epoch": 2.9666966696669665,
"grad_norm": 10.501226425170898,
"learning_rate": 5.429794877803151e-09,
"loss": 0.0928,
"step": 6592
},
{
"epoch": 2.981098109810981,
"grad_norm": 7.6273908615112305,
"learning_rate": 2.2274065574556804e-09,
"loss": 0.1124,
"step": 6624
},
{
"epoch": 2.9954995499549955,
"grad_norm": 5.487658977508545,
"learning_rate": 4.2850495100610344e-10,
"loss": 0.1158,
"step": 6656
},
{
"epoch": 3.0,
"step": 6666,
"total_flos": 9491636163499776.0,
"train_loss": 0.3371730904088448,
"train_runtime": 4274.2917,
"train_samples_per_second": 1.56,
"train_steps_per_second": 1.56
}
],
"logging_steps": 32,
"max_steps": 6666,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9491636163499776.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}