llama3.1-8b-classification-gpt4o-100k / trainer_state.json

Model save

984f772 verified about 1 year ago

106 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 10.0,
	"eval_steps": 500,
	"global_step": 2960,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.0033783783783783786,
	"grad_norm": 9.802501678466797,
	"learning_rate": 6.756756756756758e-07,
	"loss": 2.6921,
	"step": 1
	},
	{
	"epoch": 0.016891891891891893,
	"grad_norm": 10.374316215515137,
	"learning_rate": 3.3783783783783788e-06,
	"loss": 2.7409,
	"step": 5
	},
	{
	"epoch": 0.033783783783783786,
	"grad_norm": 9.445246696472168,
	"learning_rate": 6.7567567567567575e-06,
	"loss": 2.6534,
	"step": 10
	},
	{
	"epoch": 0.05067567567567568,
	"grad_norm": 3.71943736076355,
	"learning_rate": 1.0135135135135136e-05,
	"loss": 2.4339,
	"step": 15
	},
	{
	"epoch": 0.06756756756756757,
	"grad_norm": 1.7139111757278442,
	"learning_rate": 1.3513513513513515e-05,
	"loss": 2.2659,
	"step": 20
	},
	{
	"epoch": 0.08445945945945946,
	"grad_norm": 0.7590915560722351,
	"learning_rate": 1.6891891891891892e-05,
	"loss": 2.1065,
	"step": 25
	},
	{
	"epoch": 0.10135135135135136,
	"grad_norm": 0.6881681680679321,
	"learning_rate": 2.0270270270270273e-05,
	"loss": 1.9905,
	"step": 30
	},
	{
	"epoch": 0.11824324324324324,
	"grad_norm": 0.6322100162506104,
	"learning_rate": 2.364864864864865e-05,
	"loss": 1.8675,
	"step": 35
	},
	{
	"epoch": 0.13513513513513514,
	"grad_norm": 0.6217833757400513,
	"learning_rate": 2.702702702702703e-05,
	"loss": 1.7354,
	"step": 40
	},
	{
	"epoch": 0.15202702702702703,
	"grad_norm": 0.4574459493160248,
	"learning_rate": 3.0405405405405407e-05,
	"loss": 1.6276,
	"step": 45
	},
	{
	"epoch": 0.16891891891891891,
	"grad_norm": 0.3643452525138855,
	"learning_rate": 3.3783783783783784e-05,
	"loss": 1.5622,
	"step": 50
	},
	{
	"epoch": 0.1858108108108108,
	"grad_norm": 0.6475837230682373,
	"learning_rate": 3.7162162162162165e-05,
	"loss": 1.5175,
	"step": 55
	},
	{
	"epoch": 0.20270270270270271,
	"grad_norm": 0.29574820399284363,
	"learning_rate": 4.0540540540540545e-05,
	"loss": 1.4953,
	"step": 60
	},
	{
	"epoch": 0.2195945945945946,
	"grad_norm": 0.31240248680114746,
	"learning_rate": 4.391891891891892e-05,
	"loss": 1.4493,
	"step": 65
	},
	{
	"epoch": 0.23648648648648649,
	"grad_norm": 0.2868952751159668,
	"learning_rate": 4.72972972972973e-05,
	"loss": 1.4327,
	"step": 70
	},
	{
	"epoch": 0.2533783783783784,
	"grad_norm": 0.3093927800655365,
	"learning_rate": 5.067567567567568e-05,
	"loss": 1.4149,
	"step": 75
	},
	{
	"epoch": 0.2702702702702703,
	"grad_norm": 0.37402284145355225,
	"learning_rate": 5.405405405405406e-05,
	"loss": 1.3976,
	"step": 80
	},
	{
	"epoch": 0.28716216216216217,
	"grad_norm": 0.34647682309150696,
	"learning_rate": 5.7432432432432434e-05,
	"loss": 1.3977,
	"step": 85
	},
	{
	"epoch": 0.30405405405405406,
	"grad_norm": 0.30035659670829773,
	"learning_rate": 6.0810810810810814e-05,
	"loss": 1.3589,
	"step": 90
	},
	{
	"epoch": 0.32094594594594594,
	"grad_norm": 0.33794718980789185,
	"learning_rate": 6.41891891891892e-05,
	"loss": 1.3818,
	"step": 95
	},
	{
	"epoch": 0.33783783783783783,
	"grad_norm": 0.40184497833251953,
	"learning_rate": 6.756756756756757e-05,
	"loss": 1.3577,
	"step": 100
	},
	{
	"epoch": 0.3547297297297297,
	"grad_norm": 0.32776907086372375,
	"learning_rate": 7.094594594594594e-05,
	"loss": 1.3408,
	"step": 105
	},
	{
	"epoch": 0.3716216216216216,
	"grad_norm": 0.32861512899398804,
	"learning_rate": 7.432432432432433e-05,
	"loss": 1.3036,
	"step": 110
	},
	{
	"epoch": 0.3885135135135135,
	"grad_norm": 0.3542137145996094,
	"learning_rate": 7.77027027027027e-05,
	"loss": 1.3261,
	"step": 115
	},
	{
	"epoch": 0.40540540540540543,
	"grad_norm": 0.3485589921474457,
	"learning_rate": 8.108108108108109e-05,
	"loss": 1.3107,
	"step": 120
	},
	{
	"epoch": 0.4222972972972973,
	"grad_norm": 0.3495419919490814,
	"learning_rate": 8.445945945945946e-05,
	"loss": 1.2784,
	"step": 125
	},
	{
	"epoch": 0.4391891891891892,
	"grad_norm": 0.3283160626888275,
	"learning_rate": 8.783783783783784e-05,
	"loss": 1.2816,
	"step": 130
	},
	{
	"epoch": 0.4560810810810811,
	"grad_norm": 0.331221342086792,
	"learning_rate": 9.121621621621623e-05,
	"loss": 1.2697,
	"step": 135
	},
	{
	"epoch": 0.47297297297297297,
	"grad_norm": 0.38272470235824585,
	"learning_rate": 9.45945945945946e-05,
	"loss": 1.2806,
	"step": 140
	},
	{
	"epoch": 0.48986486486486486,
	"grad_norm": 0.3326016962528229,
	"learning_rate": 9.797297297297297e-05,
	"loss": 1.2729,
	"step": 145
	},
	{
	"epoch": 0.5067567567567568,
	"grad_norm": 0.31695079803466797,
	"learning_rate": 0.00010135135135135136,
	"loss": 1.2657,
	"step": 150
	},
	{
	"epoch": 0.5236486486486487,
	"grad_norm": 0.40642571449279785,
	"learning_rate": 0.00010472972972972975,
	"loss": 1.2454,
	"step": 155
	},
	{
	"epoch": 0.5405405405405406,
	"grad_norm": 0.3561699688434601,
	"learning_rate": 0.00010810810810810812,
	"loss": 1.2414,
	"step": 160
	},
	{
	"epoch": 0.5574324324324325,
	"grad_norm": 0.30583736300468445,
	"learning_rate": 0.0001114864864864865,
	"loss": 1.2473,
	"step": 165
	},
	{
	"epoch": 0.5743243243243243,
	"grad_norm": 0.3610832393169403,
	"learning_rate": 0.00011486486486486487,
	"loss": 1.2487,
	"step": 170
	},
	{
	"epoch": 0.5912162162162162,
	"grad_norm": 0.33005717396736145,
	"learning_rate": 0.00011824324324324326,
	"loss": 1.2512,
	"step": 175
	},
	{
	"epoch": 0.6081081081081081,
	"grad_norm": 0.3080041706562042,
	"learning_rate": 0.00012162162162162163,
	"loss": 1.2544,
	"step": 180
	},
	{
	"epoch": 0.625,
	"grad_norm": 0.3453957736492157,
	"learning_rate": 0.000125,
	"loss": 1.2329,
	"step": 185
	},
	{
	"epoch": 0.6418918918918919,
	"grad_norm": 0.4040939211845398,
	"learning_rate": 0.0001283783783783784,
	"loss": 1.2356,
	"step": 190
	},
	{
	"epoch": 0.6587837837837838,
	"grad_norm": 0.39047908782958984,
	"learning_rate": 0.00013175675675675675,
	"loss": 1.2215,
	"step": 195
	},
	{
	"epoch": 0.6756756756756757,
	"grad_norm": 0.27441543340682983,
	"learning_rate": 0.00013513513513513514,
	"loss": 1.2374,
	"step": 200
	},
	{
	"epoch": 0.6925675675675675,
	"grad_norm": 0.26817697286605835,
	"learning_rate": 0.00013851351351351352,
	"loss": 1.2446,
	"step": 205
	},
	{
	"epoch": 0.7094594594594594,
	"grad_norm": 0.4692605435848236,
	"learning_rate": 0.00014189189189189188,
	"loss": 1.2369,
	"step": 210
	},
	{
	"epoch": 0.7263513513513513,
	"grad_norm": 0.47006717324256897,
	"learning_rate": 0.00014527027027027027,
	"loss": 1.2289,
	"step": 215
	},
	{
	"epoch": 0.7432432432432432,
	"grad_norm": 0.26643019914627075,
	"learning_rate": 0.00014864864864864866,
	"loss": 1.2272,
	"step": 220
	},
	{
	"epoch": 0.7601351351351351,
	"grad_norm": 0.27256107330322266,
	"learning_rate": 0.00015202702702702702,
	"loss": 1.2301,
	"step": 225
	},
	{
	"epoch": 0.777027027027027,
	"grad_norm": 0.2612285912036896,
	"learning_rate": 0.0001554054054054054,
	"loss": 1.2303,
	"step": 230
	},
	{
	"epoch": 0.793918918918919,
	"grad_norm": 0.2759920656681061,
	"learning_rate": 0.0001587837837837838,
	"loss": 1.2177,
	"step": 235
	},
	{
	"epoch": 0.8108108108108109,
	"grad_norm": 0.29133257269859314,
	"learning_rate": 0.00016216216216216218,
	"loss": 1.2174,
	"step": 240
	},
	{
	"epoch": 0.8277027027027027,
	"grad_norm": 0.3231314420700073,
	"learning_rate": 0.00016554054054054057,
	"loss": 1.2036,
	"step": 245
	},
	{
	"epoch": 0.8445945945945946,
	"grad_norm": 0.27160102128982544,
	"learning_rate": 0.00016891891891891893,
	"loss": 1.2302,
	"step": 250
	},
	{
	"epoch": 0.8614864864864865,
	"grad_norm": 0.29660171270370483,
	"learning_rate": 0.00017229729729729732,
	"loss": 1.2033,
	"step": 255
	},
	{
	"epoch": 0.8783783783783784,
	"grad_norm": 0.2654610276222229,
	"learning_rate": 0.00017567567567567568,
	"loss": 1.2012,
	"step": 260
	},
	{
	"epoch": 0.8952702702702703,
	"grad_norm": 0.28142857551574707,
	"learning_rate": 0.00017905405405405406,
	"loss": 1.2052,
	"step": 265
	},
	{
	"epoch": 0.9121621621621622,
	"grad_norm": 0.24720372259616852,
	"learning_rate": 0.00018243243243243245,
	"loss": 1.192,
	"step": 270
	},
	{
	"epoch": 0.9290540540540541,
	"grad_norm": 0.2735718786716461,
	"learning_rate": 0.0001858108108108108,
	"loss": 1.213,
	"step": 275
	},
	{
	"epoch": 0.9459459459459459,
	"grad_norm": 0.30433931946754456,
	"learning_rate": 0.0001891891891891892,
	"loss": 1.2059,
	"step": 280
	},
	{
	"epoch": 0.9628378378378378,
	"grad_norm": 0.3330329358577728,
	"learning_rate": 0.00019256756756756758,
	"loss": 1.206,
	"step": 285
	},
	{
	"epoch": 0.9797297297297297,
	"grad_norm": 0.27602413296699524,
	"learning_rate": 0.00019594594594594594,
	"loss": 1.2043,
	"step": 290
	},
	{
	"epoch": 0.9966216216216216,
	"grad_norm": 0.23838359117507935,
	"learning_rate": 0.00019932432432432433,
	"loss": 1.2062,
	"step": 295
	},
	{
	"epoch": 1.0,
	"eval_loss": 1.6780701875686646,
	"eval_runtime": 0.3945,
	"eval_samples_per_second": 5.07,
	"eval_steps_per_second": 2.535,
	"step": 296
	},
	{
	"epoch": 1.0135135135135136,
	"grad_norm": 0.3066512644290924,
	"learning_rate": 0.00019999888744757143,
	"loss": 1.1826,
	"step": 300
	},
	{
	"epoch": 1.0304054054054055,
	"grad_norm": 0.42127561569213867,
	"learning_rate": 0.0001999943677457578,
	"loss": 1.1683,
	"step": 305
	},
	{
	"epoch": 1.0472972972972974,
	"grad_norm": 0.28215768933296204,
	"learning_rate": 0.000199986371517049,
	"loss": 1.1752,
	"step": 310
	},
	{
	"epoch": 1.0641891891891893,
	"grad_norm": 0.35595354437828064,
	"learning_rate": 0.0001999748990394517,
	"loss": 1.1515,
	"step": 315
	},
	{
	"epoch": 1.0810810810810811,
	"grad_norm": 0.23858019709587097,
	"learning_rate": 0.0001999599507118322,
	"loss": 1.1604,
	"step": 320
	},
	{
	"epoch": 1.097972972972973,
	"grad_norm": 0.2836330831050873,
	"learning_rate": 0.0001999415270539023,
	"loss": 1.1714,
	"step": 325
	},
	{
	"epoch": 1.114864864864865,
	"grad_norm": 0.28962427377700806,
	"learning_rate": 0.00019991962870620153,
	"loss": 1.1693,
	"step": 330
	},
	{
	"epoch": 1.1317567567567568,
	"grad_norm": 0.2537465989589691,
	"learning_rate": 0.00019989425643007476,
	"loss": 1.1537,
	"step": 335
	},
	{
	"epoch": 1.1486486486486487,
	"grad_norm": 0.23751677572727203,
	"learning_rate": 0.00019986541110764565,
	"loss": 1.1664,
	"step": 340
	},
	{
	"epoch": 1.1655405405405406,
	"grad_norm": 0.3039610981941223,
	"learning_rate": 0.0001998330937417861,
	"loss": 1.1607,
	"step": 345
	},
	{
	"epoch": 1.1824324324324325,
	"grad_norm": 0.22566653788089752,
	"learning_rate": 0.00019979730545608126,
	"loss": 1.1532,
	"step": 350
	},
	{
	"epoch": 1.1993243243243243,
	"grad_norm": 0.27842891216278076,
	"learning_rate": 0.00019975804749479062,
	"loss": 1.1589,
	"step": 355
	},
	{
	"epoch": 1.2162162162162162,
	"grad_norm": 0.2455698400735855,
	"learning_rate": 0.00019971532122280464,
	"loss": 1.1608,
	"step": 360
	},
	{
	"epoch": 1.2331081081081081,
	"grad_norm": 0.23679549992084503,
	"learning_rate": 0.00019966912812559732,
	"loss": 1.1691,
	"step": 365
	},
	{
	"epoch": 1.25,
	"grad_norm": 0.22320061922073364,
	"learning_rate": 0.00019961946980917456,
	"loss": 1.1551,
	"step": 370
	},
	{
	"epoch": 1.2668918918918919,
	"grad_norm": 0.2794288992881775,
	"learning_rate": 0.00019956634800001832,
	"loss": 1.1667,
	"step": 375
	},
	{
	"epoch": 1.2837837837837838,
	"grad_norm": 0.2269154042005539,
	"learning_rate": 0.0001995097645450266,
	"loss": 1.1589,
	"step": 380
	},
	{
	"epoch": 1.3006756756756757,
	"grad_norm": 0.22751463949680328,
	"learning_rate": 0.00019944972141144928,
	"loss": 1.1522,
	"step": 385
	},
	{
	"epoch": 1.3175675675675675,
	"grad_norm": 0.2368728667497635,
	"learning_rate": 0.00019938622068681953,
	"loss": 1.1487,
	"step": 390
	},
	{
	"epoch": 1.3344594594594594,
	"grad_norm": 0.2409171611070633,
	"learning_rate": 0.00019931926457888156,
	"loss": 1.1575,
	"step": 395
	},
	{
	"epoch": 1.3513513513513513,
	"grad_norm": 0.24245265126228333,
	"learning_rate": 0.0001992488554155135,
	"loss": 1.1443,
	"step": 400
	},
	{
	"epoch": 1.3682432432432432,
	"grad_norm": 0.21953873336315155,
	"learning_rate": 0.0001991749956446468,
	"loss": 1.1578,
	"step": 405
	},
	{
	"epoch": 1.385135135135135,
	"grad_norm": 0.21402998268604279,
	"learning_rate": 0.00019909768783418086,
	"loss": 1.1655,
	"step": 410
	},
	{
	"epoch": 1.402027027027027,
	"grad_norm": 0.22115997970104218,
	"learning_rate": 0.00019901693467189386,
	"loss": 1.1515,
	"step": 415
	},
	{
	"epoch": 1.4189189189189189,
	"grad_norm": 0.2362441122531891,
	"learning_rate": 0.00019893273896534936,
	"loss": 1.1579,
	"step": 420
	},
	{
	"epoch": 1.4358108108108107,
	"grad_norm": 0.2779642641544342,
	"learning_rate": 0.0001988451036417986,
	"loss": 1.1518,
	"step": 425
	},
	{
	"epoch": 1.4527027027027026,
	"grad_norm": 0.22553110122680664,
	"learning_rate": 0.00019875403174807882,
	"loss": 1.1722,
	"step": 430
	},
	{
	"epoch": 1.4695945945945945,
	"grad_norm": 0.22423289716243744,
	"learning_rate": 0.0001986595264505072,
	"loss": 1.1628,
	"step": 435
	},
	{
	"epoch": 1.4864864864864864,
	"grad_norm": 0.23659999668598175,
	"learning_rate": 0.00019856159103477086,
	"loss": 1.1442,
	"step": 440
	},
	{
	"epoch": 1.5033783783783785,
	"grad_norm": 0.23966625332832336,
	"learning_rate": 0.00019846022890581267,
	"loss": 1.1486,
	"step": 445
	},
	{
	"epoch": 1.5202702702702702,
	"grad_norm": 0.2399033010005951,
	"learning_rate": 0.0001983554435877128,
	"loss": 1.144,
	"step": 450
	},
	{
	"epoch": 1.5371621621621623,
	"grad_norm": 0.2575773000717163,
	"learning_rate": 0.0001982472387235662,
	"loss": 1.1693,
	"step": 455
	},
	{
	"epoch": 1.554054054054054,
	"grad_norm": 0.23619942367076874,
	"learning_rate": 0.00019813561807535598,
	"loss": 1.1494,
	"step": 460
	},
	{
	"epoch": 1.570945945945946,
	"grad_norm": 0.24643085896968842,
	"learning_rate": 0.0001980205855238225,
	"loss": 1.1543,
	"step": 465
	},
	{
	"epoch": 1.5878378378378377,
	"grad_norm": 0.2060076743364334,
	"learning_rate": 0.00019790214506832868,
	"loss": 1.1597,
	"step": 470
	},
	{
	"epoch": 1.6047297297297298,
	"grad_norm": 0.20906926691532135,
	"learning_rate": 0.00019778030082672068,
	"loss": 1.1393,
	"step": 475
	},
	{
	"epoch": 1.6216216216216215,
	"grad_norm": 0.21041174232959747,
	"learning_rate": 0.00019765505703518496,
	"loss": 1.1519,
	"step": 480
	},
	{
	"epoch": 1.6385135135135136,
	"grad_norm": 0.21494755148887634,
	"learning_rate": 0.00019752641804810084,
	"loss": 1.1497,
	"step": 485
	},
	{
	"epoch": 1.6554054054054053,
	"grad_norm": 0.21202711760997772,
	"learning_rate": 0.0001973943883378892,
	"loss": 1.1579,
	"step": 490
	},
	{
	"epoch": 1.6722972972972974,
	"grad_norm": 0.20677632093429565,
	"learning_rate": 0.00019725897249485704,
	"loss": 1.1473,
	"step": 495
	},
	{
	"epoch": 1.689189189189189,
	"grad_norm": 0.2177901715040207,
	"learning_rate": 0.00019712017522703764,
	"loss": 1.154,
	"step": 500
	},
	{
	"epoch": 1.7060810810810811,
	"grad_norm": 0.212003692984581,
	"learning_rate": 0.0001969780013600272,
	"loss": 1.1608,
	"step": 505
	},
	{
	"epoch": 1.722972972972973,
	"grad_norm": 0.21401935815811157,
	"learning_rate": 0.00019683245583681675,
	"loss": 1.1619,
	"step": 510
	},
	{
	"epoch": 1.739864864864865,
	"grad_norm": 0.22224700450897217,
	"learning_rate": 0.00019668354371762066,
	"loss": 1.1565,
	"step": 515
	},
	{
	"epoch": 1.7567567567567568,
	"grad_norm": 0.2198743373155594,
	"learning_rate": 0.00019653127017970034,
	"loss": 1.148,
	"step": 520
	},
	{
	"epoch": 1.7736486486486487,
	"grad_norm": 0.21117670834064484,
	"learning_rate": 0.0001963756405171845,
	"loss": 1.1567,
	"step": 525
	},
	{
	"epoch": 1.7905405405405406,
	"grad_norm": 0.23106643557548523,
	"learning_rate": 0.00019621666014088494,
	"loss": 1.1417,
	"step": 530
	},
	{
	"epoch": 1.8074324324324325,
	"grad_norm": 0.20598255097866058,
	"learning_rate": 0.00019605433457810855,
	"loss": 1.1491,
	"step": 535
	},
	{
	"epoch": 1.8243243243243243,
	"grad_norm": 0.2185199111700058,
	"learning_rate": 0.00019588866947246498,
	"loss": 1.1474,
	"step": 540
	},
	{
	"epoch": 1.8412162162162162,
	"grad_norm": 0.21996720135211945,
	"learning_rate": 0.00019571967058367064,
	"loss": 1.1574,
	"step": 545
	},
	{
	"epoch": 1.8581081081081081,
	"grad_norm": 0.205213725566864,
	"learning_rate": 0.00019554734378734824,
	"loss": 1.1596,
	"step": 550
	},
	{
	"epoch": 1.875,
	"grad_norm": 0.19933567941188812,
	"learning_rate": 0.0001953716950748227,
	"loss": 1.1466,
	"step": 555
	},
	{
	"epoch": 1.8918918918918919,
	"grad_norm": 0.19704587757587433,
	"learning_rate": 0.00019519273055291266,
	"loss": 1.1399,
	"step": 560
	},
	{
	"epoch": 1.9087837837837838,
	"grad_norm": 0.20990757644176483,
	"learning_rate": 0.00019501045644371832,
	"loss": 1.1363,
	"step": 565
	},
	{
	"epoch": 1.9256756756756757,
	"grad_norm": 0.2083408534526825,
	"learning_rate": 0.000194824879084405,
	"loss": 1.1446,
	"step": 570
	},
	{
	"epoch": 1.9425675675675675,
	"grad_norm": 0.2556820213794708,
	"learning_rate": 0.00019463600492698296,
	"loss": 1.1372,
	"step": 575
	},
	{
	"epoch": 1.9594594594594594,
	"grad_norm": 0.20939995348453522,
	"learning_rate": 0.00019444384053808288,
	"loss": 1.1421,
	"step": 580
	},
	{
	"epoch": 1.9763513513513513,
	"grad_norm": 0.2339630275964737,
	"learning_rate": 0.00019424839259872778,
	"loss": 1.1421,
	"step": 585
	},
	{
	"epoch": 1.9932432432432432,
	"grad_norm": 0.3135931193828583,
	"learning_rate": 0.00019404966790410047,
	"loss": 1.1339,
	"step": 590
	},
	{
	"epoch": 2.0,
	"eval_loss": 1.6897428035736084,
	"eval_runtime": 0.3945,
	"eval_samples_per_second": 5.07,
	"eval_steps_per_second": 2.535,
	"step": 592
	},
	{
	"epoch": 2.010135135135135,
	"grad_norm": 0.2158200591802597,
	"learning_rate": 0.0001938476733633076,
	"loss": 1.0977,
	"step": 595
	},
	{
	"epoch": 2.027027027027027,
	"grad_norm": 0.22781264781951904,
	"learning_rate": 0.00019364241599913924,
	"loss": 1.0711,
	"step": 600
	},
	{
	"epoch": 2.043918918918919,
	"grad_norm": 0.24521173536777496,
	"learning_rate": 0.0001934339029478248,
	"loss": 1.0767,
	"step": 605
	},
	{
	"epoch": 2.060810810810811,
	"grad_norm": 0.21851304173469543,
	"learning_rate": 0.00019322214145878487,
	"loss": 1.0549,
	"step": 610
	},
	{
	"epoch": 2.0777027027027026,
	"grad_norm": 0.21393460035324097,
	"learning_rate": 0.00019300713889437926,
	"loss": 1.068,
	"step": 615
	},
	{
	"epoch": 2.0945945945945947,
	"grad_norm": 0.23508517444133759,
	"learning_rate": 0.00019278890272965096,
	"loss": 1.0776,
	"step": 620
	},
	{
	"epoch": 2.1114864864864864,
	"grad_norm": 0.2709183990955353,
	"learning_rate": 0.00019256744055206622,
	"loss": 1.0867,
	"step": 625
	},
	{
	"epoch": 2.1283783783783785,
	"grad_norm": 0.22891944646835327,
	"learning_rate": 0.000192342760061251,
	"loss": 1.0719,
	"step": 630
	},
	{
	"epoch": 2.14527027027027,
	"grad_norm": 0.24709245562553406,
	"learning_rate": 0.0001921148690687228,
	"loss": 1.0687,
	"step": 635
	},
	{
	"epoch": 2.1621621621621623,
	"grad_norm": 0.2254343330860138,
	"learning_rate": 0.00019188377549761963,
	"loss": 1.0687,
	"step": 640
	},
	{
	"epoch": 2.179054054054054,
	"grad_norm": 0.22168201208114624,
	"learning_rate": 0.00019164948738242409,
	"loss": 1.0765,
	"step": 645
	},
	{
	"epoch": 2.195945945945946,
	"grad_norm": 0.23680733144283295,
	"learning_rate": 0.00019141201286868435,
	"loss": 1.0741,
	"step": 650
	},
	{
	"epoch": 2.2128378378378377,
	"grad_norm": 0.23159544169902802,
	"learning_rate": 0.00019117136021273075,
	"loss": 1.0795,
	"step": 655
	},
	{
	"epoch": 2.22972972972973,
	"grad_norm": 0.23217150568962097,
	"learning_rate": 0.00019092753778138886,
	"loss": 1.0947,
	"step": 660
	},
	{
	"epoch": 2.2466216216216215,
	"grad_norm": 0.22594888508319855,
	"learning_rate": 0.0001906805540516885,
	"loss": 1.059,
	"step": 665
	},
	{
	"epoch": 2.2635135135135136,
	"grad_norm": 0.23356075584888458,
	"learning_rate": 0.00019043041761056907,
	"loss": 1.084,
	"step": 670
	},
	{
	"epoch": 2.2804054054054053,
	"grad_norm": 0.21952542662620544,
	"learning_rate": 0.0001901771371545811,
	"loss": 1.0807,
	"step": 675
	},
	{
	"epoch": 2.2972972972972974,
	"grad_norm": 0.21846647560596466,
	"learning_rate": 0.00018992072148958368,
	"loss": 1.0878,
	"step": 680
	},
	{
	"epoch": 2.314189189189189,
	"grad_norm": 0.23093639314174652,
	"learning_rate": 0.00018966117953043852,
	"loss": 1.074,
	"step": 685
	},
	{
	"epoch": 2.331081081081081,
	"grad_norm": 0.224954292178154,
	"learning_rate": 0.00018939852030069981,
	"loss": 1.0784,
	"step": 690
	},
	{
	"epoch": 2.347972972972973,
	"grad_norm": 0.2606515884399414,
	"learning_rate": 0.00018913275293230069,
	"loss": 1.0757,
	"step": 695
	},
	{
	"epoch": 2.364864864864865,
	"grad_norm": 0.2542010247707367,
	"learning_rate": 0.0001888638866652356,
	"loss": 1.0705,
	"step": 700
	},
	{
	"epoch": 2.3817567567567566,
	"grad_norm": 0.2348444014787674,
	"learning_rate": 0.00018859193084723913,
	"loss": 1.0857,
	"step": 705
	},
	{
	"epoch": 2.3986486486486487,
	"grad_norm": 0.2732667922973633,
	"learning_rate": 0.00018831689493346095,
	"loss": 1.073,
	"step": 710
	},
	{
	"epoch": 2.4155405405405403,
	"grad_norm": 0.24476487934589386,
	"learning_rate": 0.00018803878848613716,
	"loss": 1.0862,
	"step": 715
	},
	{
	"epoch": 2.4324324324324325,
	"grad_norm": 0.25073671340942383,
	"learning_rate": 0.00018775762117425777,
	"loss": 1.0699,
	"step": 720
	},
	{
	"epoch": 2.4493243243243246,
	"grad_norm": 0.23084624111652374,
	"learning_rate": 0.0001874734027732306,
	"loss": 1.0827,
	"step": 725
	},
	{
	"epoch": 2.4662162162162162,
	"grad_norm": 0.2258080244064331,
	"learning_rate": 0.00018718614316454133,
	"loss": 1.088,
	"step": 730
	},
	{
	"epoch": 2.483108108108108,
	"grad_norm": 0.23056402802467346,
	"learning_rate": 0.00018689585233541003,
	"loss": 1.0698,
	"step": 735
	},
	{
	"epoch": 2.5,
	"grad_norm": 0.22269397974014282,
	"learning_rate": 0.00018660254037844388,
	"loss": 1.0666,
	"step": 740
	},
	{
	"epoch": 2.516891891891892,
	"grad_norm": 0.21295320987701416,
	"learning_rate": 0.0001863062174912863,
	"loss": 1.0781,
	"step": 745
	},
	{
	"epoch": 2.5337837837837838,
	"grad_norm": 0.21225321292877197,
	"learning_rate": 0.00018600689397626246,
	"loss": 1.0724,
	"step": 750
	},
	{
	"epoch": 2.5506756756756754,
	"grad_norm": 0.22661367058753967,
	"learning_rate": 0.00018570458024002093,
	"loss": 1.0792,
	"step": 755
	},
	{
	"epoch": 2.5675675675675675,
	"grad_norm": 0.22279423475265503,
	"learning_rate": 0.0001853992867931721,
	"loss": 1.082,
	"step": 760
	},
	{
	"epoch": 2.5844594594594597,
	"grad_norm": 0.22243249416351318,
	"learning_rate": 0.0001850910242499225,
	"loss": 1.0662,
	"step": 765
	},
	{
	"epoch": 2.6013513513513513,
	"grad_norm": 0.22147369384765625,
	"learning_rate": 0.00018477980332770607,
	"loss": 1.0718,
	"step": 770
	},
	{
	"epoch": 2.618243243243243,
	"grad_norm": 0.2354060411453247,
	"learning_rate": 0.00018446563484681127,
	"loss": 1.09,
	"step": 775
	},
	{
	"epoch": 2.635135135135135,
	"grad_norm": 0.24088838696479797,
	"learning_rate": 0.00018414852973000503,
	"loss": 1.0897,
	"step": 780
	},
	{
	"epoch": 2.652027027027027,
	"grad_norm": 0.2794990539550781,
	"learning_rate": 0.00018382849900215294,
	"loss": 1.0804,
	"step": 785
	},
	{
	"epoch": 2.668918918918919,
	"grad_norm": 0.25418001413345337,
	"learning_rate": 0.00018350555378983608,
	"loss": 1.0729,
	"step": 790
	},
	{
	"epoch": 2.685810810810811,
	"grad_norm": 0.2769224941730499,
	"learning_rate": 0.0001831797053209639,
	"loss": 1.0812,
	"step": 795
	},
	{
	"epoch": 2.7027027027027026,
	"grad_norm": 0.2639266550540924,
	"learning_rate": 0.00018285096492438424,
	"loss": 1.0841,
	"step": 800
	},
	{
	"epoch": 2.7195945945945947,
	"grad_norm": 0.21467705070972443,
	"learning_rate": 0.000182519344029489,
	"loss": 1.0852,
	"step": 805
	},
	{
	"epoch": 2.7364864864864864,
	"grad_norm": 0.22124196588993073,
	"learning_rate": 0.00018218485416581726,
	"loss": 1.0849,
	"step": 810
	},
	{
	"epoch": 2.7533783783783785,
	"grad_norm": 0.21145068109035492,
	"learning_rate": 0.00018184750696265408,
	"loss": 1.0706,
	"step": 815
	},
	{
	"epoch": 2.77027027027027,
	"grad_norm": 0.22575508058071136,
	"learning_rate": 0.00018150731414862622,
	"loss": 1.0737,
	"step": 820
	},
	{
	"epoch": 2.7871621621621623,
	"grad_norm": 0.22897441685199738,
	"learning_rate": 0.00018116428755129459,
	"loss": 1.076,
	"step": 825
	},
	{
	"epoch": 2.804054054054054,
	"grad_norm": 0.224187970161438,
	"learning_rate": 0.00018081843909674276,
	"loss": 1.075,
	"step": 830
	},
	{
	"epoch": 2.820945945945946,
	"grad_norm": 0.22817817330360413,
	"learning_rate": 0.00018046978080916252,
	"loss": 1.0802,
	"step": 835
	},
	{
	"epoch": 2.8378378378378377,
	"grad_norm": 0.23358392715454102,
	"learning_rate": 0.00018011832481043576,
	"loss": 1.073,
	"step": 840
	},
	{
	"epoch": 2.85472972972973,
	"grad_norm": 0.2256878912448883,
	"learning_rate": 0.00017976408331971298,
	"loss": 1.0712,
	"step": 845
	},
	{
	"epoch": 2.8716216216216215,
	"grad_norm": 0.2276696115732193,
	"learning_rate": 0.0001794070686529886,
	"loss": 1.0888,
	"step": 850
	},
	{
	"epoch": 2.8885135135135136,
	"grad_norm": 0.2123207151889801,
	"learning_rate": 0.00017904729322267256,
	"loss": 1.0856,
	"step": 855
	},
	{
	"epoch": 2.9054054054054053,
	"grad_norm": 0.2253648340702057,
	"learning_rate": 0.000178684769537159,
	"loss": 1.0769,
	"step": 860
	},
	{
	"epoch": 2.9222972972972974,
	"grad_norm": 0.23328694701194763,
	"learning_rate": 0.00017831951020039126,
	"loss": 1.0805,
	"step": 865
	},
	{
	"epoch": 2.939189189189189,
	"grad_norm": 0.2189178615808487,
	"learning_rate": 0.0001779515279114236,
	"loss": 1.083,
	"step": 870
	},
	{
	"epoch": 2.956081081081081,
	"grad_norm": 0.21634751558303833,
	"learning_rate": 0.0001775808354639799,
	"loss": 1.0777,
	"step": 875
	},
	{
	"epoch": 2.972972972972973,
	"grad_norm": 0.22920973598957062,
	"learning_rate": 0.00017720744574600863,
	"loss": 1.0622,
	"step": 880
	},
	{
	"epoch": 2.989864864864865,
	"grad_norm": 0.23738548159599304,
	"learning_rate": 0.00017683137173923495,
	"loss": 1.0779,
	"step": 885
	},
	{
	"epoch": 3.0,
	"eval_loss": 1.7535914182662964,
	"eval_runtime": 0.3941,
	"eval_samples_per_second": 5.075,
	"eval_steps_per_second": 2.537,
	"step": 888
	},
	{
	"epoch": 3.0067567567567566,
	"grad_norm": 0.22737225890159607,
	"learning_rate": 0.00017645262651870926,
	"loss": 1.0427,
	"step": 890
	},
	{
	"epoch": 3.0236486486486487,
	"grad_norm": 0.2775098383426666,
	"learning_rate": 0.00017607122325235267,
	"loss": 0.9853,
	"step": 895
	},
	{
	"epoch": 3.0405405405405403,
	"grad_norm": 0.2837352752685547,
	"learning_rate": 0.0001756871752004992,
	"loss": 0.9753,
	"step": 900
	},
	{
	"epoch": 3.0574324324324325,
	"grad_norm": 0.25329145789146423,
	"learning_rate": 0.00017530049571543464,
	"loss": 0.9845,
	"step": 905
	},
	{
	"epoch": 3.074324324324324,
	"grad_norm": 0.2581470310688019,
	"learning_rate": 0.0001749111982409325,
	"loss": 0.974,
	"step": 910
	},
	{
	"epoch": 3.0912162162162162,
	"grad_norm": 0.2744286358356476,
	"learning_rate": 0.00017451929631178648,
	"loss": 0.9777,
	"step": 915
	},
	{
	"epoch": 3.108108108108108,
	"grad_norm": 0.2783578038215637,
	"learning_rate": 0.00017412480355334005,
	"loss": 0.9883,
	"step": 920
	},
	{
	"epoch": 3.125,
	"grad_norm": 0.27584517002105713,
	"learning_rate": 0.0001737277336810124,
	"loss": 0.98,
	"step": 925
	},
	{
	"epoch": 3.141891891891892,
	"grad_norm": 0.26467305421829224,
	"learning_rate": 0.00017332810049982208,
	"loss": 0.9956,
	"step": 930
	},
	{
	"epoch": 3.1587837837837838,
	"grad_norm": 0.25240039825439453,
	"learning_rate": 0.00017292591790390665,
	"loss": 0.9933,
	"step": 935
	},
	{
	"epoch": 3.175675675675676,
	"grad_norm": 0.24769380688667297,
	"learning_rate": 0.00017252119987603973,
	"loss": 0.9742,
	"step": 940
	},
	{
	"epoch": 3.1925675675675675,
	"grad_norm": 0.27298596501350403,
	"learning_rate": 0.00017211396048714498,
	"loss": 0.9866,
	"step": 945
	},
	{
	"epoch": 3.2094594594594597,
	"grad_norm": 0.2657850682735443,
	"learning_rate": 0.00017170421389580667,
	"loss": 0.99,
	"step": 950
	},
	{
	"epoch": 3.2263513513513513,
	"grad_norm": 0.23783531785011292,
	"learning_rate": 0.00017129197434777763,
	"loss": 0.9891,
	"step": 955
	},
	{
	"epoch": 3.2432432432432434,
	"grad_norm": 0.24934813380241394,
	"learning_rate": 0.00017087725617548385,
	"loss": 0.9986,
	"step": 960
	},
	{
	"epoch": 3.260135135135135,
	"grad_norm": 0.265461802482605,
	"learning_rate": 0.0001704600737975262,
	"loss": 0.977,
	"step": 965
	},
	{
	"epoch": 3.277027027027027,
	"grad_norm": 0.26984909176826477,
	"learning_rate": 0.00017004044171817925,
	"loss": 1.0041,
	"step": 970
	},
	{
	"epoch": 3.293918918918919,
	"grad_norm": 0.26064538955688477,
	"learning_rate": 0.00016961837452688676,
	"loss": 1.0007,
	"step": 975
	},
	{
	"epoch": 3.310810810810811,
	"grad_norm": 0.253579705953598,
	"learning_rate": 0.00016919388689775464,
	"loss": 1.0069,
	"step": 980
	},
	{
	"epoch": 3.3277027027027026,
	"grad_norm": 0.26410114765167236,
	"learning_rate": 0.00016876699358904068,
	"loss": 1.004,
	"step": 985
	},
	{
	"epoch": 3.3445945945945947,
	"grad_norm": 0.2758503556251526,
	"learning_rate": 0.00016833770944264153,
	"loss": 1.0048,
	"step": 990
	},
	{
	"epoch": 3.3614864864864864,
	"grad_norm": 0.2595711648464203,
	"learning_rate": 0.00016790604938357663,
	"loss": 0.9929,
	"step": 995
	},
	{
	"epoch": 3.3783783783783785,
	"grad_norm": 0.26039746403694153,
	"learning_rate": 0.00016747202841946928,
	"loss": 1.0006,
	"step": 1000
	},
	{
	"epoch": 3.39527027027027,
	"grad_norm": 0.25514382123947144,
	"learning_rate": 0.0001670356616400249,
	"loss": 1.012,
	"step": 1005
	},
	{
	"epoch": 3.4121621621621623,
	"grad_norm": 0.26591041684150696,
	"learning_rate": 0.00016659696421650645,
	"loss": 1.0039,
	"step": 1010
	},
	{
	"epoch": 3.429054054054054,
	"grad_norm": 0.26443612575531006,
	"learning_rate": 0.00016615595140120686,
	"loss": 0.9982,
	"step": 1015
	},
	{
	"epoch": 3.445945945945946,
	"grad_norm": 0.2647687792778015,
	"learning_rate": 0.00016571263852691888,
	"loss": 1.0028,
	"step": 1020
	},
	{
	"epoch": 3.4628378378378377,
	"grad_norm": 0.2620026767253876,
	"learning_rate": 0.0001652670410064019,
	"loss": 0.9951,
	"step": 1025
	},
	{
	"epoch": 3.47972972972973,
	"grad_norm": 0.2619130313396454,
	"learning_rate": 0.00016481917433184607,
	"loss": 0.9882,
	"step": 1030
	},
	{
	"epoch": 3.4966216216216215,
	"grad_norm": 0.24988499283790588,
	"learning_rate": 0.0001643690540743339,
	"loss": 0.9958,
	"step": 1035
	},
	{
	"epoch": 3.5135135135135136,
	"grad_norm": 0.2864786982536316,
	"learning_rate": 0.0001639166958832985,
	"loss": 1.0017,
	"step": 1040
	},
	{
	"epoch": 3.5304054054054053,
	"grad_norm": 0.2665320038795471,
	"learning_rate": 0.00016346211548597995,
	"loss": 0.9994,
	"step": 1045
	},
	{
	"epoch": 3.5472972972972974,
	"grad_norm": 0.2629227936267853,
	"learning_rate": 0.00016300532868687806,
	"loss": 1.007,
	"step": 1050
	},
	{
	"epoch": 3.564189189189189,
	"grad_norm": 0.25602978467941284,
	"learning_rate": 0.00016254635136720328,
	"loss": 1.0057,
	"step": 1055
	},
	{
	"epoch": 3.581081081081081,
	"grad_norm": 0.2551196813583374,
	"learning_rate": 0.0001620851994843244,
	"loss": 0.9972,
	"step": 1060
	},
	{
	"epoch": 3.597972972972973,
	"grad_norm": 0.27250906825065613,
	"learning_rate": 0.00016162188907121354,
	"loss": 1.0075,
	"step": 1065
	},
	{
	"epoch": 3.614864864864865,
	"grad_norm": 0.2675882577896118,
	"learning_rate": 0.00016115643623588915,
	"loss": 1.0103,
	"step": 1070
	},
	{
	"epoch": 3.631756756756757,
	"grad_norm": 0.2731866240501404,
	"learning_rate": 0.00016068885716085567,
	"loss": 1.0016,
	"step": 1075
	},
	{
	"epoch": 3.6486486486486487,
	"grad_norm": 0.249202698469162,
	"learning_rate": 0.00016021916810254097,
	"loss": 1.0086,
	"step": 1080
	},
	{
	"epoch": 3.6655405405405403,
	"grad_norm": 0.2600172460079193,
	"learning_rate": 0.00015974738539073125,
	"loss": 1.0032,
	"step": 1085
	},
	{
	"epoch": 3.6824324324324325,
	"grad_norm": 0.2564319372177124,
	"learning_rate": 0.00015927352542800317,
	"loss": 1.0087,
	"step": 1090
	},
	{
	"epoch": 3.6993243243243246,
	"grad_norm": 0.25873422622680664,
	"learning_rate": 0.00015879760468915372,
	"loss": 1.0006,
	"step": 1095
	},
	{
	"epoch": 3.7162162162162162,
	"grad_norm": 0.2660174071788788,
	"learning_rate": 0.00015831963972062733,
	"loss": 0.988,
	"step": 1100
	},
	{
	"epoch": 3.733108108108108,
	"grad_norm": 0.26095345616340637,
	"learning_rate": 0.0001578396471399406,
	"loss": 1.0109,
	"step": 1105
	},
	{
	"epoch": 3.75,
	"grad_norm": 0.2525663673877716,
	"learning_rate": 0.0001573576436351046,
	"loss": 1.001,
	"step": 1110
	},
	{
	"epoch": 3.766891891891892,
	"grad_norm": 0.2541150152683258,
	"learning_rate": 0.0001568736459640447,
	"loss": 0.9995,
	"step": 1115
	},
	{
	"epoch": 3.7837837837837838,
	"grad_norm": 0.2548198997974396,
	"learning_rate": 0.0001563876709540178,
	"loss": 1.007,
	"step": 1120
	},
	{
	"epoch": 3.8006756756756754,
	"grad_norm": 0.26351451873779297,
	"learning_rate": 0.00015589973550102747,
	"loss": 1.0056,
	"step": 1125
	},
	{
	"epoch": 3.8175675675675675,
	"grad_norm": 0.2661518454551697,
	"learning_rate": 0.00015540985656923645,
	"loss": 1.0159,
	"step": 1130
	},
	{
	"epoch": 3.8344594594594597,
	"grad_norm": 0.2599773406982422,
	"learning_rate": 0.00015491805119037684,
	"loss": 1.0102,
	"step": 1135
	},
	{
	"epoch": 3.8513513513513513,
	"grad_norm": 0.2605207562446594,
	"learning_rate": 0.0001544243364631579,
	"loss": 1.009,
	"step": 1140
	},
	{
	"epoch": 3.868243243243243,
	"grad_norm": 0.2640506625175476,
	"learning_rate": 0.00015392872955267175,
	"loss": 1.0125,
	"step": 1145
	},
	{
	"epoch": 3.885135135135135,
	"grad_norm": 0.29407069087028503,
	"learning_rate": 0.00015343124768979637,
	"loss": 1.0107,
	"step": 1150
	},
	{
	"epoch": 3.902027027027027,
	"grad_norm": 0.2638514041900635,
	"learning_rate": 0.00015293190817059667,
	"loss": 1.0046,
	"step": 1155
	},
	{
	"epoch": 3.918918918918919,
	"grad_norm": 0.26569753885269165,
	"learning_rate": 0.00015243072835572318,
	"loss": 0.9985,
	"step": 1160
	},
	{
	"epoch": 3.935810810810811,
	"grad_norm": 0.24786274135112762,
	"learning_rate": 0.0001519277256698083,
	"loss": 1.0086,
	"step": 1165
	},
	{
	"epoch": 3.9527027027027026,
	"grad_norm": 0.27254632115364075,
	"learning_rate": 0.0001514229176008607,
	"loss": 1.0048,
	"step": 1170
	},
	{
	"epoch": 3.9695945945945947,
	"grad_norm": 0.26518264412879944,
	"learning_rate": 0.0001509163216996572,
	"loss": 1.0014,
	"step": 1175
	},
	{
	"epoch": 3.9864864864864864,
	"grad_norm": 0.24938583374023438,
	"learning_rate": 0.00015040795557913245,
	"loss": 1.0043,
	"step": 1180
	},
	{
	"epoch": 4.0,
	"eval_loss": 1.8225109577178955,
	"eval_runtime": 0.3942,
	"eval_samples_per_second": 5.073,
	"eval_steps_per_second": 2.537,
	"step": 1184
	},
	{
	"epoch": 4.003378378378378,
	"grad_norm": 0.41594985127449036,
	"learning_rate": 0.00014989783691376696,
	"loss": 0.9886,
	"step": 1185
	},
	{
	"epoch": 4.02027027027027,
	"grad_norm": 0.332119345664978,
	"learning_rate": 0.00014938598343897214,
	"loss": 0.8971,
	"step": 1190
	},
	{
	"epoch": 4.037162162162162,
	"grad_norm": 0.2723919749259949,
	"learning_rate": 0.000148872412950474,
	"loss": 0.9054,
	"step": 1195
	},
	{
	"epoch": 4.054054054054054,
	"grad_norm": 0.3006138801574707,
	"learning_rate": 0.00014835714330369446,
	"loss": 0.8955,
	"step": 1200
	},
	{
	"epoch": 4.070945945945946,
	"grad_norm": 0.3039803206920624,
	"learning_rate": 0.00014784019241313026,
	"loss": 0.8937,
	"step": 1205
	},
	{
	"epoch": 4.087837837837838,
	"grad_norm": 0.2896163761615753,
	"learning_rate": 0.00014732157825173044,
	"loss": 0.8998,
	"step": 1210
	},
	{
	"epoch": 4.10472972972973,
	"grad_norm": 0.2962886095046997,
	"learning_rate": 0.00014680131885027141,
	"loss": 0.9087,
	"step": 1215
	},
	{
	"epoch": 4.121621621621622,
	"grad_norm": 0.2953561246395111,
	"learning_rate": 0.0001462794322967299,
	"loss": 0.9078,
	"step": 1220
	},
	{
	"epoch": 4.138513513513513,
	"grad_norm": 0.2991558015346527,
	"learning_rate": 0.00014575593673565426,
	"loss": 0.9004,
	"step": 1225
	},
	{
	"epoch": 4.155405405405405,
	"grad_norm": 0.32434654235839844,
	"learning_rate": 0.00014523085036753354,
	"loss": 0.8972,
	"step": 1230
	},
	{
	"epoch": 4.172297297297297,
	"grad_norm": 0.29733654856681824,
	"learning_rate": 0.00014470419144816483,
	"loss": 0.905,
	"step": 1235
	},
	{
	"epoch": 4.1891891891891895,
	"grad_norm": 0.2878667116165161,
	"learning_rate": 0.00014417597828801832,
	"loss": 0.9037,
	"step": 1240
	},
	{
	"epoch": 4.206081081081081,
	"grad_norm": 0.3089180886745453,
	"learning_rate": 0.00014364622925160098,
	"loss": 0.9004,
	"step": 1245
	},
	{
	"epoch": 4.222972972972973,
	"grad_norm": 0.29691433906555176,
	"learning_rate": 0.00014311496275681783,
	"loss": 0.9105,
	"step": 1250
	},
	{
	"epoch": 4.239864864864865,
	"grad_norm": 0.31907522678375244,
	"learning_rate": 0.0001425821972743318,
	"loss": 0.9051,
	"step": 1255
	},
	{
	"epoch": 4.256756756756757,
	"grad_norm": 0.3177861273288727,
	"learning_rate": 0.00014204795132692144,
	"loss": 0.9059,
	"step": 1260
	},
	{
	"epoch": 4.273648648648648,
	"grad_norm": 0.3413095474243164,
	"learning_rate": 0.00014151224348883692,
	"loss": 0.9068,
	"step": 1265
	},
	{
	"epoch": 4.29054054054054,
	"grad_norm": 0.31278854608535767,
	"learning_rate": 0.00014097509238515432,
	"loss": 0.9178,
	"step": 1270
	},
	{
	"epoch": 4.3074324324324325,
	"grad_norm": 0.3215930461883545,
	"learning_rate": 0.00014043651669112808,
	"loss": 0.9048,
	"step": 1275
	},
	{
	"epoch": 4.324324324324325,
	"grad_norm": 0.32147011160850525,
	"learning_rate": 0.00013989653513154165,
	"loss": 0.9182,
	"step": 1280
	},
	{
	"epoch": 4.341216216216216,
	"grad_norm": 0.30455154180526733,
	"learning_rate": 0.0001393551664800566,
	"loss": 0.9159,
	"step": 1285
	},
	{
	"epoch": 4.358108108108108,
	"grad_norm": 0.310214638710022,
	"learning_rate": 0.00013881242955855974,
	"loss": 0.9157,
	"step": 1290
	},
	{
	"epoch": 4.375,
	"grad_norm": 0.3040444254875183,
	"learning_rate": 0.000138268343236509,
	"loss": 0.9136,
	"step": 1295
	},
	{
	"epoch": 4.391891891891892,
	"grad_norm": 0.32138949632644653,
	"learning_rate": 0.000137722926430277,
	"loss": 0.9198,
	"step": 1300
	},
	{
	"epoch": 4.408783783783784,
	"grad_norm": 0.3029273748397827,
	"learning_rate": 0.00013717619810249378,
	"loss": 0.9207,
	"step": 1305
	},
	{
	"epoch": 4.425675675675675,
	"grad_norm": 0.3084327280521393,
	"learning_rate": 0.00013662817726138728,
	"loss": 0.9128,
	"step": 1310
	},
	{
	"epoch": 4.4425675675675675,
	"grad_norm": 0.2980863153934479,
	"learning_rate": 0.00013607888296012259,
	"loss": 0.919,
	"step": 1315
	},
	{
	"epoch": 4.45945945945946,
	"grad_norm": 0.3012111186981201,
	"learning_rate": 0.00013552833429613938,
	"loss": 0.913,
	"step": 1320
	},
	{
	"epoch": 4.476351351351352,
	"grad_norm": 0.3067188262939453,
	"learning_rate": 0.0001349765504104881,
	"loss": 0.9098,
	"step": 1325
	},
	{
	"epoch": 4.493243243243243,
	"grad_norm": 0.30859634280204773,
	"learning_rate": 0.0001344235504871645,
	"loss": 0.9103,
	"step": 1330
	},
	{
	"epoch": 4.510135135135135,
	"grad_norm": 0.309527724981308,
	"learning_rate": 0.00013386935375244246,
	"loss": 0.9118,
	"step": 1335
	},
	{
	"epoch": 4.527027027027027,
	"grad_norm": 0.29956597089767456,
	"learning_rate": 0.00013331397947420576,
	"loss": 0.9248,
	"step": 1340
	},
	{
	"epoch": 4.543918918918919,
	"grad_norm": 0.30333107709884644,
	"learning_rate": 0.00013275744696127805,
	"loss": 0.9235,
	"step": 1345
	},
	{
	"epoch": 4.5608108108108105,
	"grad_norm": 0.3010920584201813,
	"learning_rate": 0.00013219977556275163,
	"loss": 0.9204,
	"step": 1350
	},
	{
	"epoch": 4.577702702702703,
	"grad_norm": 0.30947473645210266,
	"learning_rate": 0.00013164098466731468,
	"loss": 0.9244,
	"step": 1355
	},
	{
	"epoch": 4.594594594594595,
	"grad_norm": 0.30661630630493164,
	"learning_rate": 0.00013108109370257712,
	"loss": 0.9177,
	"step": 1360
	},
	{
	"epoch": 4.611486486486487,
	"grad_norm": 0.2866823971271515,
	"learning_rate": 0.00013052012213439536,
	"loss": 0.9107,
	"step": 1365
	},
	{
	"epoch": 4.628378378378378,
	"grad_norm": 0.3211285471916199,
	"learning_rate": 0.0001299580894661953,
	"loss": 0.9242,
	"step": 1370
	},
	{
	"epoch": 4.64527027027027,
	"grad_norm": 0.3097619414329529,
	"learning_rate": 0.00012939501523829444,
	"loss": 0.91,
	"step": 1375
	},
	{
	"epoch": 4.662162162162162,
	"grad_norm": 0.30498236417770386,
	"learning_rate": 0.0001288309190272222,
	"loss": 0.9176,
	"step": 1380
	},
	{
	"epoch": 4.679054054054054,
	"grad_norm": 0.31782612204551697,
	"learning_rate": 0.00012826582044503978,
	"loss": 0.91,
	"step": 1385
	},
	{
	"epoch": 4.695945945945946,
	"grad_norm": 0.32527872920036316,
	"learning_rate": 0.00012769973913865794,
	"loss": 0.9119,
	"step": 1390
	},
	{
	"epoch": 4.712837837837838,
	"grad_norm": 0.2965739369392395,
	"learning_rate": 0.000127132694789154,
	"loss": 0.9333,
	"step": 1395
	},
	{
	"epoch": 4.72972972972973,
	"grad_norm": 0.31443119049072266,
	"learning_rate": 0.00012656470711108764,
	"loss": 0.9184,
	"step": 1400
	},
	{
	"epoch": 4.746621621621622,
	"grad_norm": 0.30386343598365784,
	"learning_rate": 0.00012599579585181552,
	"loss": 0.912,
	"step": 1405
	},
	{
	"epoch": 4.763513513513513,
	"grad_norm": 0.2971736788749695,
	"learning_rate": 0.00012542598079080456,
	"loss": 0.9115,
	"step": 1410
	},
	{
	"epoch": 4.780405405405405,
	"grad_norm": 0.29560431838035583,
	"learning_rate": 0.00012485528173894448,
	"loss": 0.9176,
	"step": 1415
	},
	{
	"epoch": 4.797297297297297,
	"grad_norm": 0.30718737840652466,
	"learning_rate": 0.0001242837185378587,
	"loss": 0.9184,
	"step": 1420
	},
	{
	"epoch": 4.8141891891891895,
	"grad_norm": 0.29568740725517273,
	"learning_rate": 0.00012371131105921504,
	"loss": 0.9214,
	"step": 1425
	},
	{
	"epoch": 4.831081081081081,
	"grad_norm": 0.32252946496009827,
	"learning_rate": 0.00012313807920403419,
	"loss": 0.9252,
	"step": 1430
	},
	{
	"epoch": 4.847972972972973,
	"grad_norm": 0.31315141916275024,
	"learning_rate": 0.00012256404290199825,
	"loss": 0.9308,
	"step": 1435
	},
	{
	"epoch": 4.864864864864865,
	"grad_norm": 0.3065871000289917,
	"learning_rate": 0.00012198922211075778,
	"loss": 0.9186,
	"step": 1440
	},
	{
	"epoch": 4.881756756756757,
	"grad_norm": 0.31804540753364563,
	"learning_rate": 0.00012141363681523776,
	"loss": 0.9275,
	"step": 1445
	},
	{
	"epoch": 4.898648648648649,
	"grad_norm": 0.313486784696579,
	"learning_rate": 0.00012083730702694291,
	"loss": 0.9315,
	"step": 1450
	},
	{
	"epoch": 4.91554054054054,
	"grad_norm": 0.31312400102615356,
	"learning_rate": 0.00012026025278326187,
	"loss": 0.934,
	"step": 1455
	},
	{
	"epoch": 4.9324324324324325,
	"grad_norm": 0.321845680475235,
	"learning_rate": 0.00011968249414677055,
	"loss": 0.9266,
	"step": 1460
	},
	{
	"epoch": 4.949324324324325,
	"grad_norm": 0.29238423705101013,
	"learning_rate": 0.00011910405120453476,
	"loss": 0.9203,
	"step": 1465
	},
	{
	"epoch": 4.966216216216216,
	"grad_norm": 0.30449482798576355,
	"learning_rate": 0.00011852494406741165,
	"loss": 0.9254,
	"step": 1470
	},
	{
	"epoch": 4.983108108108108,
	"grad_norm": 0.3126208186149597,
	"learning_rate": 0.00011794519286935055,
	"loss": 0.9181,
	"step": 1475
	},
	{
	"epoch": 5.0,
	"grad_norm": 0.29170361161231995,
	"learning_rate": 0.00011736481776669306,
	"loss": 0.9288,
	"step": 1480
	},
	{
	"epoch": 5.0,
	"eval_loss": 2.0044448375701904,
	"eval_runtime": 0.3932,
	"eval_samples_per_second": 5.087,
	"eval_steps_per_second": 2.543,
	"step": 1480
	},
	{
	"epoch": 5.016891891891892,
	"grad_norm": 0.46076056361198425,
	"learning_rate": 0.0001167838389374722,
	"loss": 0.8221,
	"step": 1485
	},
	{
	"epoch": 5.033783783783784,
	"grad_norm": 0.32739222049713135,
	"learning_rate": 0.00011620227658071087,
	"loss": 0.8178,
	"step": 1490
	},
	{
	"epoch": 5.050675675675675,
	"grad_norm": 0.38803204894065857,
	"learning_rate": 0.00011562015091571963,
	"loss": 0.8143,
	"step": 1495
	},
	{
	"epoch": 5.0675675675675675,
	"grad_norm": 0.32274121046066284,
	"learning_rate": 0.00011503748218139369,
	"loss": 0.821,
	"step": 1500
	},
	{
	"epoch": 5.08445945945946,
	"grad_norm": 0.3647359013557434,
	"learning_rate": 0.00011445429063550926,
	"loss": 0.8265,
	"step": 1505
	},
	{
	"epoch": 5.101351351351352,
	"grad_norm": 0.36681613326072693,
	"learning_rate": 0.00011387059655401932,
	"loss": 0.8248,
	"step": 1510
	},
	{
	"epoch": 5.118243243243243,
	"grad_norm": 0.35085347294807434,
	"learning_rate": 0.00011328642023034857,
	"loss": 0.823,
	"step": 1515
	},
	{
	"epoch": 5.135135135135135,
	"grad_norm": 0.3212147653102875,
	"learning_rate": 0.00011270178197468789,
	"loss": 0.8265,
	"step": 1520
	},
	{
	"epoch": 5.152027027027027,
	"grad_norm": 0.35389629006385803,
	"learning_rate": 0.00011211670211328833,
	"loss": 0.8252,
	"step": 1525
	},
	{
	"epoch": 5.168918918918919,
	"grad_norm": 0.350277841091156,
	"learning_rate": 0.00011153120098775434,
	"loss": 0.8193,
	"step": 1530
	},
	{
	"epoch": 5.1858108108108105,
	"grad_norm": 0.35216981172561646,
	"learning_rate": 0.00011094529895433652,
	"loss": 0.8291,
	"step": 1535
	},
	{
	"epoch": 5.202702702702703,
	"grad_norm": 0.33077818155288696,
	"learning_rate": 0.00011035901638322392,
	"loss": 0.8145,
	"step": 1540
	},
	{
	"epoch": 5.219594594594595,
	"grad_norm": 0.34553956985473633,
	"learning_rate": 0.0001097723736578359,
	"loss": 0.8297,
	"step": 1545
	},
	{
	"epoch": 5.236486486486487,
	"grad_norm": 0.349026083946228,
	"learning_rate": 0.00010918539117411333,
	"loss": 0.8363,
	"step": 1550
	},
	{
	"epoch": 5.253378378378378,
	"grad_norm": 0.34249648451805115,
	"learning_rate": 0.00010859808933980948,
	"loss": 0.8228,
	"step": 1555
	},
	{
	"epoch": 5.27027027027027,
	"grad_norm": 0.3591874837875366,
	"learning_rate": 0.00010801048857378071,
	"loss": 0.8272,
	"step": 1560
	},
	{
	"epoch": 5.287162162162162,
	"grad_norm": 0.3266925513744354,
	"learning_rate": 0.00010742260930527625,
	"loss": 0.8264,
	"step": 1565
	},
	{
	"epoch": 5.304054054054054,
	"grad_norm": 0.3557049632072449,
	"learning_rate": 0.00010683447197322817,
	"loss": 0.8327,
	"step": 1570
	},
	{
	"epoch": 5.320945945945946,
	"grad_norm": 0.34309855103492737,
	"learning_rate": 0.00010624609702554069,
	"loss": 0.8362,
	"step": 1575
	},
	{
	"epoch": 5.337837837837838,
	"grad_norm": 0.33597272634506226,
	"learning_rate": 0.00010565750491837925,
	"loss": 0.8274,
	"step": 1580
	},
	{
	"epoch": 5.35472972972973,
	"grad_norm": 0.33070334792137146,
	"learning_rate": 0.0001050687161154593,
	"loss": 0.8309,
	"step": 1585
	},
	{
	"epoch": 5.371621621621622,
	"grad_norm": 0.34598931670188904,
	"learning_rate": 0.00010447975108733492,
	"loss": 0.846,
	"step": 1590
	},
	{
	"epoch": 5.388513513513513,
	"grad_norm": 0.3528457283973694,
	"learning_rate": 0.00010389063031068698,
	"loss": 0.8199,
	"step": 1595
	},
	{
	"epoch": 5.405405405405405,
	"grad_norm": 0.3506796956062317,
	"learning_rate": 0.00010330137426761135,
	"loss": 0.8377,
	"step": 1600
	},
	{
	"epoch": 5.422297297297297,
	"grad_norm": 0.35415780544281006,
	"learning_rate": 0.00010271200344490674,
	"loss": 0.8357,
	"step": 1605
	},
	{
	"epoch": 5.4391891891891895,
	"grad_norm": 0.33977410197257996,
	"learning_rate": 0.00010212253833336237,
	"loss": 0.8273,
	"step": 1610
	},
	{
	"epoch": 5.456081081081081,
	"grad_norm": 0.3760969638824463,
	"learning_rate": 0.00010153299942704566,
	"loss": 0.8404,
	"step": 1615
	},
	{
	"epoch": 5.472972972972973,
	"grad_norm": 0.3504043519496918,
	"learning_rate": 0.00010094340722258969,
	"loss": 0.8368,
	"step": 1620
	},
	{
	"epoch": 5.489864864864865,
	"grad_norm": 0.3397385776042938,
	"learning_rate": 0.00010035378221848053,
	"loss": 0.8327,
	"step": 1625
	},
	{
	"epoch": 5.506756756756757,
	"grad_norm": 0.33861246705055237,
	"learning_rate": 9.976414491434463e-05,
	"loss": 0.8419,
	"step": 1630
	},
	{
	"epoch": 5.523648648648649,
	"grad_norm": 0.3566323220729828,
	"learning_rate": 9.917451581023607e-05,
	"loss": 0.8366,
	"step": 1635
	},
	{
	"epoch": 5.54054054054054,
	"grad_norm": 0.3398774266242981,
	"learning_rate": 9.858491540592382e-05,
	"loss": 0.8306,
	"step": 1640
	},
	{
	"epoch": 5.5574324324324325,
	"grad_norm": 0.3483969271183014,
	"learning_rate": 9.799536420017906e-05,
	"loss": 0.8333,
	"step": 1645
	},
	{
	"epoch": 5.574324324324325,
	"grad_norm": 0.34190595149993896,
	"learning_rate": 9.740588269006246e-05,
	"loss": 0.838,
	"step": 1650
	},
	{
	"epoch": 5.591216216216216,
	"grad_norm": 0.35382217168807983,
	"learning_rate": 9.681649137021158e-05,
	"loss": 0.8489,
	"step": 1655
	},
	{
	"epoch": 5.608108108108108,
	"grad_norm": 0.3321906328201294,
	"learning_rate": 9.622721073212832e-05,
	"loss": 0.8364,
	"step": 1660
	},
	{
	"epoch": 5.625,
	"grad_norm": 0.34170404076576233,
	"learning_rate": 9.563806126346642e-05,
	"loss": 0.841,
	"step": 1665
	},
	{
	"epoch": 5.641891891891892,
	"grad_norm": 0.34292900562286377,
	"learning_rate": 9.504906344731932e-05,
	"loss": 0.8366,
	"step": 1670
	},
	{
	"epoch": 5.658783783783784,
	"grad_norm": 0.35314562916755676,
	"learning_rate": 9.446023776150787e-05,
	"loss": 0.838,
	"step": 1675
	},
	{
	"epoch": 5.675675675675675,
	"grad_norm": 0.3411477506160736,
	"learning_rate": 9.38716046778684e-05,
	"loss": 0.8441,
	"step": 1680
	},
	{
	"epoch": 5.6925675675675675,
	"grad_norm": 0.3432328701019287,
	"learning_rate": 9.328318466154102e-05,
	"loss": 0.8459,
	"step": 1685
	},
	{
	"epoch": 5.70945945945946,
	"grad_norm": 0.33872732520103455,
	"learning_rate": 9.269499817025814e-05,
	"loss": 0.8388,
	"step": 1690
	},
	{
	"epoch": 5.726351351351351,
	"grad_norm": 0.34312689304351807,
	"learning_rate": 9.210706565363305e-05,
	"loss": 0.8332,
	"step": 1695
	},
	{
	"epoch": 5.743243243243243,
	"grad_norm": 0.3369201123714447,
	"learning_rate": 9.151940755244912e-05,
	"loss": 0.831,
	"step": 1700
	},
	{
	"epoch": 5.760135135135135,
	"grad_norm": 0.34367725253105164,
	"learning_rate": 9.093204429794898e-05,
	"loss": 0.8303,
	"step": 1705
	},
	{
	"epoch": 5.777027027027027,
	"grad_norm": 0.3678775727748871,
	"learning_rate": 9.034499631112437e-05,
	"loss": 0.8413,
	"step": 1710
	},
	{
	"epoch": 5.793918918918919,
	"grad_norm": 0.34643349051475525,
	"learning_rate": 8.975828400200592e-05,
	"loss": 0.845,
	"step": 1715
	},
	{
	"epoch": 5.8108108108108105,
	"grad_norm": 0.35629916191101074,
	"learning_rate": 8.917192776895382e-05,
	"loss": 0.836,
	"step": 1720
	},
	{
	"epoch": 5.827702702702703,
	"grad_norm": 0.3395968973636627,
	"learning_rate": 8.858594799794835e-05,
	"loss": 0.8384,
	"step": 1725
	},
	{
	"epoch": 5.844594594594595,
	"grad_norm": 0.3399130403995514,
	"learning_rate": 8.800036506188129e-05,
	"loss": 0.841,
	"step": 1730
	},
	{
	"epoch": 5.861486486486487,
	"grad_norm": 0.3563048541545868,
	"learning_rate": 8.741519931984766e-05,
	"loss": 0.8388,
	"step": 1735
	},
	{
	"epoch": 5.878378378378378,
	"grad_norm": 0.34680601954460144,
	"learning_rate": 8.683047111643763e-05,
	"loss": 0.8368,
	"step": 1740
	},
	{
	"epoch": 5.89527027027027,
	"grad_norm": 0.3650359511375427,
	"learning_rate": 8.624620078102951e-05,
	"loss": 0.8447,
	"step": 1745
	},
	{
	"epoch": 5.912162162162162,
	"grad_norm": 0.34037554264068604,
	"learning_rate": 8.566240862708274e-05,
	"loss": 0.8355,
	"step": 1750
	},
	{
	"epoch": 5.929054054054054,
	"grad_norm": 0.35734692215919495,
	"learning_rate": 8.507911495143173e-05,
	"loss": 0.8425,
	"step": 1755
	},
	{
	"epoch": 5.945945945945946,
	"grad_norm": 0.3381343483924866,
	"learning_rate": 8.449634003358022e-05,
	"loss": 0.8418,
	"step": 1760
	},
	{
	"epoch": 5.962837837837838,
	"grad_norm": 0.3489098846912384,
	"learning_rate": 8.39141041349961e-05,
	"loss": 0.847,
	"step": 1765
	},
	{
	"epoch": 5.97972972972973,
	"grad_norm": 0.361604243516922,
	"learning_rate": 8.33324274984071e-05,
	"loss": 0.8428,
	"step": 1770
	},
	{
	"epoch": 5.996621621621622,
	"grad_norm": 0.34529900550842285,
	"learning_rate": 8.275133034709699e-05,
	"loss": 0.8437,
	"step": 1775
	},
	{
	"epoch": 6.0,
	"eval_loss": 2.170966863632202,
	"eval_runtime": 0.3935,
	"eval_samples_per_second": 5.083,
	"eval_steps_per_second": 2.541,
	"step": 1776
	},
	{
	"epoch": 6.013513513513513,
	"grad_norm": 0.3619636595249176,
	"learning_rate": 8.217083288420241e-05,
	"loss": 0.7823,
	"step": 1780
	},
	{
	"epoch": 6.030405405405405,
	"grad_norm": 0.33571234345436096,
	"learning_rate": 8.159095529201049e-05,
	"loss": 0.7663,
	"step": 1785
	},
	{
	"epoch": 6.047297297297297,
	"grad_norm": 0.3377952575683594,
	"learning_rate": 8.101171773125716e-05,
	"loss": 0.764,
	"step": 1790
	},
	{
	"epoch": 6.0641891891891895,
	"grad_norm": 0.3851635754108429,
	"learning_rate": 8.043314034042631e-05,
	"loss": 0.7543,
	"step": 1795
	},
	{
	"epoch": 6.081081081081081,
	"grad_norm": 0.3411933481693268,
	"learning_rate": 7.985524323504948e-05,
	"loss": 0.7569,
	"step": 1800
	},
	{
	"epoch": 6.097972972972973,
	"grad_norm": 0.3682069480419159,
	"learning_rate": 7.927804650700659e-05,
	"loss": 0.7546,
	"step": 1805
	},
	{
	"epoch": 6.114864864864865,
	"grad_norm": 0.35545244812965393,
	"learning_rate": 7.870157022382735e-05,
	"loss": 0.7615,
	"step": 1810
	},
	{
	"epoch": 6.131756756756757,
	"grad_norm": 0.39011305570602417,
	"learning_rate": 7.812583442799368e-05,
	"loss": 0.7611,
	"step": 1815
	},
	{
	"epoch": 6.148648648648648,
	"grad_norm": 0.33269399404525757,
	"learning_rate": 7.755085913624274e-05,
	"loss": 0.7599,
	"step": 1820
	},
	{
	"epoch": 6.16554054054054,
	"grad_norm": 0.3615286946296692,
	"learning_rate": 7.697666433887108e-05,
	"loss": 0.7501,
	"step": 1825
	},
	{
	"epoch": 6.1824324324324325,
	"grad_norm": 0.3396786153316498,
	"learning_rate": 7.640326999903967e-05,
	"loss": 0.757,
	"step": 1830
	},
	{
	"epoch": 6.199324324324325,
	"grad_norm": 0.38157907128334045,
	"learning_rate": 7.583069605207975e-05,
	"loss": 0.7506,
	"step": 1835
	},
	{
	"epoch": 6.216216216216216,
	"grad_norm": 0.3560575842857361,
	"learning_rate": 7.525896240479976e-05,
	"loss": 0.754,
	"step": 1840
	},
	{
	"epoch": 6.233108108108108,
	"grad_norm": 0.3762560784816742,
	"learning_rate": 7.468808893479327e-05,
	"loss": 0.7614,
	"step": 1845
	},
	{
	"epoch": 6.25,
	"grad_norm": 0.36987847089767456,
	"learning_rate": 7.411809548974792e-05,
	"loss": 0.7637,
	"step": 1850
	},
	{
	"epoch": 6.266891891891892,
	"grad_norm": 0.406857967376709,
	"learning_rate": 7.354900188675525e-05,
	"loss": 0.761,
	"step": 1855
	},
	{
	"epoch": 6.283783783783784,
	"grad_norm": 0.3850703835487366,
	"learning_rate": 7.29808279116218e-05,
	"loss": 0.7656,
	"step": 1860
	},
	{
	"epoch": 6.300675675675675,
	"grad_norm": 0.34307488799095154,
	"learning_rate": 7.24135933181812e-05,
	"loss": 0.7501,
	"step": 1865
	},
	{
	"epoch": 6.3175675675675675,
	"grad_norm": 0.3922889232635498,
	"learning_rate": 7.184731782760746e-05,
	"loss": 0.7584,
	"step": 1870
	},
	{
	"epoch": 6.33445945945946,
	"grad_norm": 0.36379769444465637,
	"learning_rate": 7.128202112772912e-05,
	"loss": 0.7626,
	"step": 1875
	},
	{
	"epoch": 6.351351351351352,
	"grad_norm": 0.3796177804470062,
	"learning_rate": 7.071772287234497e-05,
	"loss": 0.7739,
	"step": 1880
	},
	{
	"epoch": 6.368243243243243,
	"grad_norm": 0.3752601146697998,
	"learning_rate": 7.015444268054059e-05,
	"loss": 0.7658,
	"step": 1885
	},
	{
	"epoch": 6.385135135135135,
	"grad_norm": 0.3463265597820282,
	"learning_rate": 6.959220013600641e-05,
	"loss": 0.7584,
	"step": 1890
	},
	{
	"epoch": 6.402027027027027,
	"grad_norm": 0.3532774746417999,
	"learning_rate": 6.903101478635662e-05,
	"loss": 0.7715,
	"step": 1895
	},
	{
	"epoch": 6.418918918918919,
	"grad_norm": 0.3608658015727997,
	"learning_rate": 6.847090614244977e-05,
	"loss": 0.7682,
	"step": 1900
	},
	{
	"epoch": 6.4358108108108105,
	"grad_norm": 0.39848268032073975,
	"learning_rate": 6.791189367771025e-05,
	"loss": 0.7658,
	"step": 1905
	},
	{
	"epoch": 6.452702702702703,
	"grad_norm": 0.3448575437068939,
	"learning_rate": 6.735399682745145e-05,
	"loss": 0.7736,
	"step": 1910
	},
	{
	"epoch": 6.469594594594595,
	"grad_norm": 0.3646429181098938,
	"learning_rate": 6.679723498819986e-05,
	"loss": 0.7657,
	"step": 1915
	},
	{
	"epoch": 6.486486486486487,
	"grad_norm": 0.3576849699020386,
	"learning_rate": 6.624162751702076e-05,
	"loss": 0.7741,
	"step": 1920
	},
	{
	"epoch": 6.503378378378378,
	"grad_norm": 0.3550150990486145,
	"learning_rate": 6.568719373084538e-05,
	"loss": 0.7636,
	"step": 1925
	},
	{
	"epoch": 6.52027027027027,
	"grad_norm": 0.3779493570327759,
	"learning_rate": 6.513395290579901e-05,
	"loss": 0.7641,
	"step": 1930
	},
	{
	"epoch": 6.537162162162162,
	"grad_norm": 0.36017805337905884,
	"learning_rate": 6.458192427653112e-05,
	"loss": 0.7676,
	"step": 1935
	},
	{
	"epoch": 6.554054054054054,
	"grad_norm": 0.38434022665023804,
	"learning_rate": 6.403112703554643e-05,
	"loss": 0.7701,
	"step": 1940
	},
	{
	"epoch": 6.570945945945946,
	"grad_norm": 0.358761191368103,
	"learning_rate": 6.348158033253773e-05,
	"loss": 0.7539,
	"step": 1945
	},
	{
	"epoch": 6.587837837837838,
	"grad_norm": 0.37006473541259766,
	"learning_rate": 6.293330327372005e-05,
	"loss": 0.7767,
	"step": 1950
	},
	{
	"epoch": 6.60472972972973,
	"grad_norm": 0.3721785247325897,
	"learning_rate": 6.238631492116644e-05,
	"loss": 0.7715,
	"step": 1955
	},
	{
	"epoch": 6.621621621621622,
	"grad_norm": 0.3626702129840851,
	"learning_rate": 6.184063429214515e-05,
	"loss": 0.766,
	"step": 1960
	},
	{
	"epoch": 6.638513513513513,
	"grad_norm": 0.37497058510780334,
	"learning_rate": 6.129628035845861e-05,
	"loss": 0.7658,
	"step": 1965
	},
	{
	"epoch": 6.655405405405405,
	"grad_norm": 0.36465275287628174,
	"learning_rate": 6.0753272045783625e-05,
	"loss": 0.7666,
	"step": 1970
	},
	{
	"epoch": 6.672297297297297,
	"grad_norm": 0.3648873567581177,
	"learning_rate": 6.021162823301358e-05,
	"loss": 0.7661,
	"step": 1975
	},
	{
	"epoch": 6.6891891891891895,
	"grad_norm": 0.3486686646938324,
	"learning_rate": 5.967136775160187e-05,
	"loss": 0.7638,
	"step": 1980
	},
	{
	"epoch": 6.706081081081081,
	"grad_norm": 0.36590924859046936,
	"learning_rate": 5.913250938490744e-05,
	"loss": 0.7753,
	"step": 1985
	},
	{
	"epoch": 6.722972972972973,
	"grad_norm": 0.36060139536857605,
	"learning_rate": 5.859507186754146e-05,
	"loss": 0.778,
	"step": 1990
	},
	{
	"epoch": 6.739864864864865,
	"grad_norm": 0.4011731743812561,
	"learning_rate": 5.80590738847162e-05,
	"loss": 0.7653,
	"step": 1995
	},
	{
	"epoch": 6.756756756756757,
	"grad_norm": 0.38411641120910645,
	"learning_rate": 5.752453407159522e-05,
	"loss": 0.76,
	"step": 2000
	},
	{
	"epoch": 6.773648648648649,
	"grad_norm": 0.37505170702934265,
	"learning_rate": 5.699147101264566e-05,
	"loss": 0.7709,
	"step": 2005
	},
	{
	"epoch": 6.79054054054054,
	"grad_norm": 0.3904276192188263,
	"learning_rate": 5.645990324099197e-05,
	"loss": 0.7659,
	"step": 2010
	},
	{
	"epoch": 6.8074324324324325,
	"grad_norm": 0.3751082420349121,
	"learning_rate": 5.5929849237771556e-05,
	"loss": 0.7564,
	"step": 2015
	},
	{
	"epoch": 6.824324324324325,
	"grad_norm": 0.3594505488872528,
	"learning_rate": 5.540132743149242e-05,
	"loss": 0.7723,
	"step": 2020
	},
	{
	"epoch": 6.841216216216216,
	"grad_norm": 0.3686336874961853,
	"learning_rate": 5.487435619739214e-05,
	"loss": 0.7645,
	"step": 2025
	},
	{
	"epoch": 6.858108108108108,
	"grad_norm": 0.37959080934524536,
	"learning_rate": 5.434895385679937e-05,
	"loss": 0.761,
	"step": 2030
	},
	{
	"epoch": 6.875,
	"grad_norm": 0.38148415088653564,
	"learning_rate": 5.382513867649663e-05,
	"loss": 0.766,
	"step": 2035
	},
	{
	"epoch": 6.891891891891892,
	"grad_norm": 0.37155023217201233,
	"learning_rate": 5.33029288680852e-05,
	"loss": 0.7753,
	"step": 2040
	},
	{
	"epoch": 6.908783783783784,
	"grad_norm": 0.3691665828227997,
	"learning_rate": 5.2782342587352154e-05,
	"loss": 0.7641,
	"step": 2045
	},
	{
	"epoch": 6.925675675675675,
	"grad_norm": 0.4007939398288727,
	"learning_rate": 5.226339793363898e-05,
	"loss": 0.7717,
	"step": 2050
	},
	{
	"epoch": 6.9425675675675675,
	"grad_norm": 0.36151981353759766,
	"learning_rate": 5.174611294921224e-05,
	"loss": 0.7832,
	"step": 2055
	},
	{
	"epoch": 6.95945945945946,
	"grad_norm": 0.38270819187164307,
	"learning_rate": 5.123050561863657e-05,
	"loss": 0.7619,
	"step": 2060
	},
	{
	"epoch": 6.976351351351351,
	"grad_norm": 0.35164088010787964,
	"learning_rate": 5.071659386814907e-05,
	"loss": 0.7725,
	"step": 2065
	},
	{
	"epoch": 6.993243243243243,
	"grad_norm": 0.3853191137313843,
	"learning_rate": 5.020439556503629e-05,
	"loss": 0.7654,
	"step": 2070
	},
	{
	"epoch": 7.0,
	"eval_loss": 2.40800142288208,
	"eval_runtime": 0.394,
	"eval_samples_per_second": 5.076,
	"eval_steps_per_second": 2.538,
	"step": 2072
	},
	{
	"epoch": 7.010135135135135,
	"grad_norm": 0.3015079200267792,
	"learning_rate": 4.969392851701305e-05,
	"loss": 0.7406,
	"step": 2075
	},
	{
	"epoch": 7.027027027027027,
	"grad_norm": 0.47633570432662964,
	"learning_rate": 4.918521047160308e-05,
	"loss": 0.7101,
	"step": 2080
	},
	{
	"epoch": 7.043918918918919,
	"grad_norm": 0.31147924065589905,
	"learning_rate": 4.8678259115522215e-05,
	"loss": 0.7144,
	"step": 2085
	},
	{
	"epoch": 7.0608108108108105,
	"grad_norm": 0.3377055823802948,
	"learning_rate": 4.817309207406346e-05,
	"loss": 0.7091,
	"step": 2090
	},
	{
	"epoch": 7.077702702702703,
	"grad_norm": 0.3804275393486023,
	"learning_rate": 4.7669726910484e-05,
	"loss": 0.7083,
	"step": 2095
	},
	{
	"epoch": 7.094594594594595,
	"grad_norm": 0.3246239721775055,
	"learning_rate": 4.716818112539485e-05,
	"loss": 0.7076,
	"step": 2100
	},
	{
	"epoch": 7.111486486486487,
	"grad_norm": 0.3758985996246338,
	"learning_rate": 4.666847215615226e-05,
	"loss": 0.7112,
	"step": 2105
	},
	{
	"epoch": 7.128378378378378,
	"grad_norm": 0.3744657337665558,
	"learning_rate": 4.617061737625139e-05,
	"loss": 0.714,
	"step": 2110
	},
	{
	"epoch": 7.14527027027027,
	"grad_norm": 0.35453036427497864,
	"learning_rate": 4.567463409472255e-05,
	"loss": 0.7144,
	"step": 2115
	},
	{
	"epoch": 7.162162162162162,
	"grad_norm": 0.36035045981407166,
	"learning_rate": 4.518053955552903e-05,
	"loss": 0.7153,
	"step": 2120
	},
	{
	"epoch": 7.179054054054054,
	"grad_norm": 0.362409383058548,
	"learning_rate": 4.468835093696796e-05,
	"loss": 0.7179,
	"step": 2125
	},
	{
	"epoch": 7.195945945945946,
	"grad_norm": 0.4178987145423889,
	"learning_rate": 4.419808535107287e-05,
	"loss": 0.7109,
	"step": 2130
	},
	{
	"epoch": 7.212837837837838,
	"grad_norm": 0.36226364970207214,
	"learning_rate": 4.370975984301866e-05,
	"loss": 0.7112,
	"step": 2135
	},
	{
	"epoch": 7.22972972972973,
	"grad_norm": 0.34748539328575134,
	"learning_rate": 4.322339139052921e-05,
	"loss": 0.7115,
	"step": 2140
	},
	{
	"epoch": 7.246621621621622,
	"grad_norm": 0.3634675443172455,
	"learning_rate": 4.273899690328702e-05,
	"loss": 0.7043,
	"step": 2145
	},
	{
	"epoch": 7.263513513513513,
	"grad_norm": 0.3675166070461273,
	"learning_rate": 4.2256593222345185e-05,
	"loss": 0.7124,
	"step": 2150
	},
	{
	"epoch": 7.280405405405405,
	"grad_norm": 0.33852246403694153,
	"learning_rate": 4.177619711954211e-05,
	"loss": 0.7122,
	"step": 2155
	},
	{
	"epoch": 7.297297297297297,
	"grad_norm": 0.34997648000717163,
	"learning_rate": 4.129782529691815e-05,
	"loss": 0.7161,
	"step": 2160
	},
	{
	"epoch": 7.3141891891891895,
	"grad_norm": 0.3947296738624573,
	"learning_rate": 4.082149438613514e-05,
	"loss": 0.715,
	"step": 2165
	},
	{
	"epoch": 7.331081081081081,
	"grad_norm": 0.3766041696071625,
	"learning_rate": 4.034722094789809e-05,
	"loss": 0.7104,
	"step": 2170
	},
	{
	"epoch": 7.347972972972973,
	"grad_norm": 0.39250659942626953,
	"learning_rate": 3.987502147137928e-05,
	"loss": 0.7157,
	"step": 2175
	},
	{
	"epoch": 7.364864864864865,
	"grad_norm": 0.356827050447464,
	"learning_rate": 3.9404912373645185e-05,
	"loss": 0.7104,
	"step": 2180
	},
	{
	"epoch": 7.381756756756757,
	"grad_norm": 0.3731355369091034,
	"learning_rate": 3.893690999908562e-05,
	"loss": 0.7167,
	"step": 2185
	},
	{
	"epoch": 7.398648648648648,
	"grad_norm": 0.3654830753803253,
	"learning_rate": 3.8471030618845375e-05,
	"loss": 0.7151,
	"step": 2190
	},
	{
	"epoch": 7.41554054054054,
	"grad_norm": 0.3466781675815582,
	"learning_rate": 3.800729043025871e-05,
	"loss": 0.7208,
	"step": 2195
	},
	{
	"epoch": 7.4324324324324325,
	"grad_norm": 0.37476223707199097,
	"learning_rate": 3.7545705556286126e-05,
	"loss": 0.7083,
	"step": 2200
	},
	{
	"epoch": 7.449324324324325,
	"grad_norm": 0.361871600151062,
	"learning_rate": 3.708629204495371e-05,
	"loss": 0.7195,
	"step": 2205
	},
	{
	"epoch": 7.466216216216216,
	"grad_norm": 0.3652123510837555,
	"learning_rate": 3.662906586879542e-05,
	"loss": 0.7132,
	"step": 2210
	},
	{
	"epoch": 7.483108108108108,
	"grad_norm": 0.36584657430648804,
	"learning_rate": 3.61740429242975e-05,
	"loss": 0.71,
	"step": 2215
	},
	{
	"epoch": 7.5,
	"grad_norm": 0.34037116169929504,
	"learning_rate": 3.5721239031346066e-05,
	"loss": 0.7175,
	"step": 2220
	},
	{
	"epoch": 7.516891891891892,
	"grad_norm": 0.34989210963249207,
	"learning_rate": 3.5270669932676926e-05,
	"loss": 0.7236,
	"step": 2225
	},
	{
	"epoch": 7.533783783783784,
	"grad_norm": 0.35882651805877686,
	"learning_rate": 3.48223512933282e-05,
	"loss": 0.7159,
	"step": 2230
	},
	{
	"epoch": 7.550675675675675,
	"grad_norm": 0.32638296484947205,
	"learning_rate": 3.437629870009591e-05,
	"loss": 0.7221,
	"step": 2235
	},
	{
	"epoch": 7.5675675675675675,
	"grad_norm": 0.37272724509239197,
	"learning_rate": 3.393252766099187e-05,
	"loss": 0.7132,
	"step": 2240
	},
	{
	"epoch": 7.58445945945946,
	"grad_norm": 0.3713020086288452,
	"learning_rate": 3.349105360470456e-05,
	"loss": 0.7246,
	"step": 2245
	},
	{
	"epoch": 7.601351351351351,
	"grad_norm": 0.35202324390411377,
	"learning_rate": 3.305189188006281e-05,
	"loss": 0.7289,
	"step": 2250
	},
	{
	"epoch": 7.618243243243243,
	"grad_norm": 0.3543793559074402,
	"learning_rate": 3.2615057755502e-05,
	"loss": 0.7129,
	"step": 2255
	},
	{
	"epoch": 7.635135135135135,
	"grad_norm": 0.3830936849117279,
	"learning_rate": 3.218056641853337e-05,
	"loss": 0.7287,
	"step": 2260
	},
	{
	"epoch": 7.652027027027027,
	"grad_norm": 0.36788904666900635,
	"learning_rate": 3.174843297521596e-05,
	"loss": 0.7107,
	"step": 2265
	},
	{
	"epoch": 7.668918918918919,
	"grad_norm": 0.34784045815467834,
	"learning_rate": 3.1318672449631284e-05,
	"loss": 0.7129,
	"step": 2270
	},
	{
	"epoch": 7.6858108108108105,
	"grad_norm": 0.3825985789299011,
	"learning_rate": 3.089129978336118e-05,
	"loss": 0.7048,
	"step": 2275
	},
	{
	"epoch": 7.702702702702703,
	"grad_norm": 0.4050070643424988,
	"learning_rate": 3.0466329834968233e-05,
	"loss": 0.7165,
	"step": 2280
	},
	{
	"epoch": 7.719594594594595,
	"grad_norm": 0.3602808117866516,
	"learning_rate": 3.0043777379479098e-05,
	"loss": 0.7163,
	"step": 2285
	},
	{
	"epoch": 7.736486486486487,
	"grad_norm": 0.35466307401657104,
	"learning_rate": 2.9623657107870996e-05,
	"loss": 0.7149,
	"step": 2290
	},
	{
	"epoch": 7.753378378378378,
	"grad_norm": 0.3452269732952118,
	"learning_rate": 2.9205983626560874e-05,
	"loss": 0.7196,
	"step": 2295
	},
	{
	"epoch": 7.77027027027027,
	"grad_norm": 0.3634475767612457,
	"learning_rate": 2.879077145689746e-05,
	"loss": 0.7153,
	"step": 2300
	},
	{
	"epoch": 7.787162162162162,
	"grad_norm": 0.3627691864967346,
	"learning_rate": 2.8378035034656625e-05,
	"loss": 0.7112,
	"step": 2305
	},
	{
	"epoch": 7.804054054054054,
	"grad_norm": 0.3404904901981354,
	"learning_rate": 2.7967788709539233e-05,
	"loss": 0.7159,
	"step": 2310
	},
	{
	"epoch": 7.820945945945946,
	"grad_norm": 0.38526642322540283,
	"learning_rate": 2.7560046744672495e-05,
	"loss": 0.7218,
	"step": 2315
	},
	{
	"epoch": 7.837837837837838,
	"grad_norm": 0.354755699634552,
	"learning_rate": 2.7154823316113932e-05,
	"loss": 0.7123,
	"step": 2320
	},
	{
	"epoch": 7.85472972972973,
	"grad_norm": 0.3782195746898651,
	"learning_rate": 2.6752132512358475e-05,
	"loss": 0.7091,
	"step": 2325
	},
	{
	"epoch": 7.871621621621622,
	"grad_norm": 0.39233171939849854,
	"learning_rate": 2.6351988333848788e-05,
	"loss": 0.7208,
	"step": 2330
	},
	{
	"epoch": 7.888513513513513,
	"grad_norm": 0.4432124197483063,
	"learning_rate": 2.5954404692488433e-05,
	"loss": 0.7032,
	"step": 2335
	},
	{
	"epoch": 7.905405405405405,
	"grad_norm": 0.3653867542743683,
	"learning_rate": 2.5559395411158115e-05,
	"loss": 0.7246,
	"step": 2340
	},
	{
	"epoch": 7.922297297297297,
	"grad_norm": 0.37708407640457153,
	"learning_rate": 2.5166974223235296e-05,
	"loss": 0.7135,
	"step": 2345
	},
	{
	"epoch": 7.9391891891891895,
	"grad_norm": 0.3550487160682678,
	"learning_rate": 2.4777154772116496e-05,
	"loss": 0.7105,
	"step": 2350
	},
	{
	"epoch": 7.956081081081081,
	"grad_norm": 0.35054445266723633,
	"learning_rate": 2.438995061074314e-05,
	"loss": 0.7179,
	"step": 2355
	},
	{
	"epoch": 7.972972972972973,
	"grad_norm": 0.35555845499038696,
	"learning_rate": 2.4005375201130274e-05,
	"loss": 0.7076,
	"step": 2360
	},
	{
	"epoch": 7.989864864864865,
	"grad_norm": 0.38198524713516235,
	"learning_rate": 2.362344191389846e-05,
	"loss": 0.7117,
	"step": 2365
	},
	{
	"epoch": 8.0,
	"eval_loss": 2.655390977859497,
	"eval_runtime": 0.3941,
	"eval_samples_per_second": 5.074,
	"eval_steps_per_second": 2.537,
	"step": 2368
	},
	{
	"epoch": 8.006756756756756,
	"grad_norm": 0.2672366499900818,
	"learning_rate": 2.324416402780907e-05,
	"loss": 0.7016,
	"step": 2370
	},
	{
	"epoch": 8.02364864864865,
	"grad_norm": 0.3170325756072998,
	"learning_rate": 2.2867554729302542e-05,
	"loss": 0.6812,
	"step": 2375
	},
	{
	"epoch": 8.04054054054054,
	"grad_norm": 0.3713083863258362,
	"learning_rate": 2.249362711203985e-05,
	"loss": 0.6825,
	"step": 2380
	},
	{
	"epoch": 8.057432432432432,
	"grad_norm": 0.3441585898399353,
	"learning_rate": 2.2122394176447416e-05,
	"loss": 0.6786,
	"step": 2385
	},
	{
	"epoch": 8.074324324324325,
	"grad_norm": 0.29649627208709717,
	"learning_rate": 2.1753868829265046e-05,
	"loss": 0.671,
	"step": 2390
	},
	{
	"epoch": 8.091216216216216,
	"grad_norm": 0.31710395216941833,
	"learning_rate": 2.1388063883097152e-05,
	"loss": 0.6788,
	"step": 2395
	},
	{
	"epoch": 8.108108108108109,
	"grad_norm": 0.3464438319206238,
	"learning_rate": 2.102499205596743e-05,
	"loss": 0.6843,
	"step": 2400
	},
	{
	"epoch": 8.125,
	"grad_norm": 0.3463502824306488,
	"learning_rate": 2.0664665970876496e-05,
	"loss": 0.6896,
	"step": 2405
	},
	{
	"epoch": 8.141891891891891,
	"grad_norm": 0.32347431778907776,
	"learning_rate": 2.0307098155363236e-05,
	"loss": 0.6949,
	"step": 2410
	},
	{
	"epoch": 8.158783783783784,
	"grad_norm": 0.30408981442451477,
	"learning_rate": 1.9952301041069122e-05,
	"loss": 0.6808,
	"step": 2415
	},
	{
	"epoch": 8.175675675675675,
	"grad_norm": 0.3631693124771118,
	"learning_rate": 1.9600286963305957e-05,
	"loss": 0.6882,
	"step": 2420
	},
	{
	"epoch": 8.192567567567568,
	"grad_norm": 0.31960511207580566,
	"learning_rate": 1.9251068160627173e-05,
	"loss": 0.6849,
	"step": 2425
	},
	{
	"epoch": 8.20945945945946,
	"grad_norm": 0.3153926134109497,
	"learning_rate": 1.8904656774402208e-05,
	"loss": 0.6768,
	"step": 2430
	},
	{
	"epoch": 8.22635135135135,
	"grad_norm": 0.3084424138069153,
	"learning_rate": 1.8561064848394382e-05,
	"loss": 0.6744,
	"step": 2435
	},
	{
	"epoch": 8.243243243243244,
	"grad_norm": 0.3217174708843231,
	"learning_rate": 1.8220304328342252e-05,
	"loss": 0.6882,
	"step": 2440
	},
	{
	"epoch": 8.260135135135135,
	"grad_norm": 0.3653244972229004,
	"learning_rate": 1.7882387061544182e-05,
	"loss": 0.6812,
	"step": 2445
	},
	{
	"epoch": 8.277027027027026,
	"grad_norm": 0.32076555490493774,
	"learning_rate": 1.754732479644655e-05,
	"loss": 0.6835,
	"step": 2450
	},
	{
	"epoch": 8.29391891891892,
	"grad_norm": 0.35145509243011475,
	"learning_rate": 1.721512918223527e-05,
	"loss": 0.6885,
	"step": 2455
	},
	{
	"epoch": 8.31081081081081,
	"grad_norm": 0.3196760416030884,
	"learning_rate": 1.688581176843066e-05,
	"loss": 0.6814,
	"step": 2460
	},
	{
	"epoch": 8.327702702702704,
	"grad_norm": 0.34739652276039124,
	"learning_rate": 1.6559384004486055e-05,
	"loss": 0.6856,
	"step": 2465
	},
	{
	"epoch": 8.344594594594595,
	"grad_norm": 0.3565291166305542,
	"learning_rate": 1.6235857239389696e-05,
	"loss": 0.6849,
	"step": 2470
	},
	{
	"epoch": 8.361486486486486,
	"grad_norm": 0.3656858205795288,
	"learning_rate": 1.5915242721270074e-05,
	"loss": 0.681,
	"step": 2475
	},
	{
	"epoch": 8.378378378378379,
	"grad_norm": 0.32651442289352417,
	"learning_rate": 1.5597551597004966e-05,
	"loss": 0.683,
	"step": 2480
	},
	{
	"epoch": 8.39527027027027,
	"grad_norm": 0.3386393189430237,
	"learning_rate": 1.5282794911833887e-05,
	"loss": 0.6823,
	"step": 2485
	},
	{
	"epoch": 8.412162162162161,
	"grad_norm": 0.31998586654663086,
	"learning_rate": 1.4970983608973942e-05,
	"loss": 0.6788,
	"step": 2490
	},
	{
	"epoch": 8.429054054054054,
	"grad_norm": 0.34341830015182495,
	"learning_rate": 1.4662128529239572e-05,
	"loss": 0.6944,
	"step": 2495
	},
	{
	"epoch": 8.445945945945946,
	"grad_norm": 0.32450416684150696,
	"learning_rate": 1.4356240410665433e-05,
	"loss": 0.6946,
	"step": 2500
	},
	{
	"epoch": 8.462837837837839,
	"grad_norm": 0.3322451710700989,
	"learning_rate": 1.4053329888133238e-05,
	"loss": 0.683,
	"step": 2505
	},
	{
	"epoch": 8.47972972972973,
	"grad_norm": 0.3628733456134796,
	"learning_rate": 1.3753407493001968e-05,
	"loss": 0.6824,
	"step": 2510
	},
	{
	"epoch": 8.496621621621621,
	"grad_norm": 0.3203790783882141,
	"learning_rate": 1.3456483652741591e-05,
	"loss": 0.6843,
	"step": 2515
	},
	{
	"epoch": 8.513513513513514,
	"grad_norm": 0.3382638096809387,
	"learning_rate": 1.3162568690570743e-05,
	"loss": 0.6882,
	"step": 2520
	},
	{
	"epoch": 8.530405405405405,
	"grad_norm": 0.34006133675575256,
	"learning_rate": 1.287167282509767e-05,
	"loss": 0.6781,
	"step": 2525
	},
	{
	"epoch": 8.547297297297296,
	"grad_norm": 0.33302438259124756,
	"learning_rate": 1.2583806169964961e-05,
	"loss": 0.6818,
	"step": 2530
	},
	{
	"epoch": 8.56418918918919,
	"grad_norm": 0.35714635252952576,
	"learning_rate": 1.2298978733498035e-05,
	"loss": 0.6903,
	"step": 2535
	},
	{
	"epoch": 8.58108108108108,
	"grad_norm": 0.34445202350616455,
	"learning_rate": 1.2017200418357078e-05,
	"loss": 0.6884,
	"step": 2540
	},
	{
	"epoch": 8.597972972972974,
	"grad_norm": 0.35791710019111633,
	"learning_rate": 1.1738481021192704e-05,
	"loss": 0.6805,
	"step": 2545
	},
	{
	"epoch": 8.614864864864865,
	"grad_norm": 0.4606862962245941,
	"learning_rate": 1.14628302323056e-05,
	"loss": 0.6833,
	"step": 2550
	},
	{
	"epoch": 8.631756756756756,
	"grad_norm": 0.3396778702735901,
	"learning_rate": 1.1190257635309275e-05,
	"loss": 0.6788,
	"step": 2555
	},
	{
	"epoch": 8.64864864864865,
	"grad_norm": 0.3137703537940979,
	"learning_rate": 1.0920772706797167e-05,
	"loss": 0.6778,
	"step": 2560
	},
	{
	"epoch": 8.66554054054054,
	"grad_norm": 0.3266281187534332,
	"learning_rate": 1.0654384816012953e-05,
	"loss": 0.6928,
	"step": 2565
	},
	{
	"epoch": 8.682432432432432,
	"grad_norm": 0.33806994557380676,
	"learning_rate": 1.0391103224524956e-05,
	"loss": 0.694,
	"step": 2570
	},
	{
	"epoch": 8.699324324324325,
	"grad_norm": 0.3242711126804352,
	"learning_rate": 1.013093708590408e-05,
	"loss": 0.6769,
	"step": 2575
	},
	{
	"epoch": 8.716216216216216,
	"grad_norm": 0.3551606833934784,
	"learning_rate": 9.873895445405523e-06,
	"loss": 0.6824,
	"step": 2580
	},
	{
	"epoch": 8.733108108108109,
	"grad_norm": 0.34394511580467224,
	"learning_rate": 9.619987239654405e-06,
	"loss": 0.681,
	"step": 2585
	},
	{
	"epoch": 8.75,
	"grad_norm": 0.35514023900032043,
	"learning_rate": 9.369221296335006e-06,
	"loss": 0.6908,
	"step": 2590
	},
	{
	"epoch": 8.766891891891891,
	"grad_norm": 0.31281572580337524,
	"learning_rate": 9.121606333883792e-06,
	"loss": 0.6881,
	"step": 2595
	},
	{
	"epoch": 8.783783783783784,
	"grad_norm": 0.3141974210739136,
	"learning_rate": 8.87715096118642e-06,
	"loss": 0.6797,
	"step": 2600
	},
	{
	"epoch": 8.800675675675675,
	"grad_norm": 0.3446739912033081,
	"learning_rate": 8.635863677278378e-06,
	"loss": 0.6862,
	"step": 2605
	},
	{
	"epoch": 8.817567567567568,
	"grad_norm": 0.3194230794906616,
	"learning_rate": 8.397752871049436e-06,
	"loss": 0.6764,
	"step": 2610
	},
	{
	"epoch": 8.83445945945946,
	"grad_norm": 0.3229275047779083,
	"learning_rate": 8.162826820952097e-06,
	"loss": 0.6868,
	"step": 2615
	},
	{
	"epoch": 8.85135135135135,
	"grad_norm": 0.3260205388069153,
	"learning_rate": 7.931093694713687e-06,
	"loss": 0.6917,
	"step": 2620
	},
	{
	"epoch": 8.868243243243244,
	"grad_norm": 0.3324912190437317,
	"learning_rate": 7.702561549052445e-06,
	"loss": 0.6748,
	"step": 2625
	},
	{
	"epoch": 8.885135135135135,
	"grad_norm": 0.3662506937980652,
	"learning_rate": 7.477238329397418e-06,
	"loss": 0.6918,
	"step": 2630
	},
	{
	"epoch": 8.902027027027026,
	"grad_norm": 0.3210934102535248,
	"learning_rate": 7.255131869612108e-06,
	"loss": 0.694,
	"step": 2635
	},
	{
	"epoch": 8.91891891891892,
	"grad_norm": 0.35362377762794495,
	"learning_rate": 7.03624989172228e-06,
	"loss": 0.678,
	"step": 2640
	},
	{
	"epoch": 8.93581081081081,
	"grad_norm": 0.3684268295764923,
	"learning_rate": 6.820600005647382e-06,
	"loss": 0.6913,
	"step": 2645
	},
	{
	"epoch": 8.952702702702704,
	"grad_norm": 0.3233438730239868,
	"learning_rate": 6.608189708935964e-06,
	"loss": 0.6818,
	"step": 2650
	},
	{
	"epoch": 8.969594594594595,
	"grad_norm": 0.3141653537750244,
	"learning_rate": 6.3990263865050695e-06,
	"loss": 0.6843,
	"step": 2655
	},
	{
	"epoch": 8.986486486486486,
	"grad_norm": 0.32477766275405884,
	"learning_rate": 6.1931173103834115e-06,
	"loss": 0.6916,
	"step": 2660
	},
	{
	"epoch": 9.0,
	"eval_loss": 2.91719913482666,
	"eval_runtime": 0.3931,
	"eval_samples_per_second": 5.088,
	"eval_steps_per_second": 2.544,
	"step": 2664
	},
	{
	"epoch": 9.003378378378379,
	"grad_norm": 0.26982101798057556,
	"learning_rate": 5.9904696394586405e-06,
	"loss": 0.6797,
	"step": 2665
	},
	{
	"epoch": 9.02027027027027,
	"grad_norm": 0.25923973321914673,
	"learning_rate": 5.791090419228351e-06,
	"loss": 0.6622,
	"step": 2670
	},
	{
	"epoch": 9.037162162162161,
	"grad_norm": 0.2826623022556305,
	"learning_rate": 5.594986581555173e-06,
	"loss": 0.6712,
	"step": 2675
	},
	{
	"epoch": 9.054054054054054,
	"grad_norm": 0.3254013657569885,
	"learning_rate": 5.402164944425758e-06,
	"loss": 0.6644,
	"step": 2680
	},
	{
	"epoch": 9.070945945945946,
	"grad_norm": 0.3010416626930237,
	"learning_rate": 5.212632211713797e-06,
	"loss": 0.6741,
	"step": 2685
	},
	{
	"epoch": 9.087837837837839,
	"grad_norm": 0.36080434918403625,
	"learning_rate": 5.026394972946813e-06,
	"loss": 0.6675,
	"step": 2690
	},
	{
	"epoch": 9.10472972972973,
	"grad_norm": 0.2993578314781189,
	"learning_rate": 4.843459703077202e-06,
	"loss": 0.6798,
	"step": 2695
	},
	{
	"epoch": 9.121621621621621,
	"grad_norm": 0.3179502785205841,
	"learning_rate": 4.66383276225707e-06,
	"loss": 0.6769,
	"step": 2700
	},
	{
	"epoch": 9.138513513513514,
	"grad_norm": 0.29940682649612427,
	"learning_rate": 4.487520395617029e-06,
	"loss": 0.6624,
	"step": 2705
	},
	{
	"epoch": 9.155405405405405,
	"grad_norm": 0.340571790933609,
	"learning_rate": 4.314528733049206e-06,
	"loss": 0.6626,
	"step": 2710
	},
	{
	"epoch": 9.172297297297296,
	"grad_norm": 0.3252014219760895,
	"learning_rate": 4.144863788993991e-06,
	"loss": 0.6798,
	"step": 2715
	},
	{
	"epoch": 9.18918918918919,
	"grad_norm": 0.3229629397392273,
	"learning_rate": 3.9785314622310495e-06,
	"loss": 0.675,
	"step": 2720
	},
	{
	"epoch": 9.20608108108108,
	"grad_norm": 0.2807313799858093,
	"learning_rate": 3.815537535674174e-06,
	"loss": 0.6765,
	"step": 2725
	},
	{
	"epoch": 9.222972972972974,
	"grad_norm": 0.2970227599143982,
	"learning_rate": 3.655887676170222e-06,
	"loss": 0.6678,
	"step": 2730
	},
	{
	"epoch": 9.239864864864865,
	"grad_norm": 0.3028378486633301,
	"learning_rate": 3.4995874343021094e-06,
	"loss": 0.6728,
	"step": 2735
	},
	{
	"epoch": 9.256756756756756,
	"grad_norm": 0.31875497102737427,
	"learning_rate": 3.3466422441958634e-06,
	"loss": 0.6761,
	"step": 2740
	},
	{
	"epoch": 9.27364864864865,
	"grad_norm": 0.319791316986084,
	"learning_rate": 3.1970574233316397e-06,
	"loss": 0.6623,
	"step": 2745
	},
	{
	"epoch": 9.29054054054054,
	"grad_norm": 0.2787954807281494,
	"learning_rate": 3.050838172358883e-06,
	"loss": 0.6679,
	"step": 2750
	},
	{
	"epoch": 9.307432432432432,
	"grad_norm": 0.3424752950668335,
	"learning_rate": 2.9079895749154927e-06,
	"loss": 0.6626,
	"step": 2755
	},
	{
	"epoch": 9.324324324324325,
	"grad_norm": 0.302796334028244,
	"learning_rate": 2.7685165974510986e-06,
	"loss": 0.6721,
	"step": 2760
	},
	{
	"epoch": 9.341216216216216,
	"grad_norm": 0.29907652735710144,
	"learning_rate": 2.6324240890544193e-06,
	"loss": 0.6696,
	"step": 2765
	},
	{
	"epoch": 9.358108108108109,
	"grad_norm": 0.31798794865608215,
	"learning_rate": 2.499716781284556e-06,
	"loss": 0.6705,
	"step": 2770
	},
	{
	"epoch": 9.375,
	"grad_norm": 0.3437163829803467,
	"learning_rate": 2.3703992880066638e-06,
	"loss": 0.6726,
	"step": 2775
	},
	{
	"epoch": 9.391891891891891,
	"grad_norm": 0.2871228754520416,
	"learning_rate": 2.2444761052313856e-06,
	"loss": 0.6715,
	"step": 2780
	},
	{
	"epoch": 9.408783783783784,
	"grad_norm": 0.2895369827747345,
	"learning_rate": 2.1219516109586056e-06,
	"loss": 0.6812,
	"step": 2785
	},
	{
	"epoch": 9.425675675675675,
	"grad_norm": 0.29224950075149536,
	"learning_rate": 2.002830065025263e-06,
	"loss": 0.6788,
	"step": 2790
	},
	{
	"epoch": 9.442567567567568,
	"grad_norm": 0.32183554768562317,
	"learning_rate": 1.8871156089572018e-06,
	"loss": 0.6731,
	"step": 2795
	},
	{
	"epoch": 9.45945945945946,
	"grad_norm": 0.30122798681259155,
	"learning_rate": 1.7748122658251876e-06,
	"loss": 0.6724,
	"step": 2800
	},
	{
	"epoch": 9.47635135135135,
	"grad_norm": 0.3141665458679199,
	"learning_rate": 1.665923940105074e-06,
	"loss": 0.6725,
	"step": 2805
	},
	{
	"epoch": 9.493243243243244,
	"grad_norm": 0.3088280260562897,
	"learning_rate": 1.56045441754199e-06,
	"loss": 0.6748,
	"step": 2810
	},
	{
	"epoch": 9.510135135135135,
	"grad_norm": 0.2941664457321167,
	"learning_rate": 1.4584073650187878e-06,
	"loss": 0.6656,
	"step": 2815
	},
	{
	"epoch": 9.527027027027026,
	"grad_norm": 0.3141193687915802,
	"learning_rate": 1.3597863304285475e-06,
	"loss": 0.6732,
	"step": 2820
	},
	{
	"epoch": 9.54391891891892,
	"grad_norm": 0.29874399304389954,
	"learning_rate": 1.2645947425511395e-06,
	"loss": 0.6749,
	"step": 2825
	},
	{
	"epoch": 9.56081081081081,
	"grad_norm": 0.2963665723800659,
	"learning_rate": 1.1728359109341446e-06,
	"loss": 0.6737,
	"step": 2830
	},
	{
	"epoch": 9.577702702702704,
	"grad_norm": 0.2843508720397949,
	"learning_rate": 1.0845130257777114e-06,
	"loss": 0.6758,
	"step": 2835
	},
	{
	"epoch": 9.594594594594595,
	"grad_norm": 0.3049289882183075,
	"learning_rate": 9.996291578236228e-07,
	"loss": 0.6771,
	"step": 2840
	},
	{
	"epoch": 9.611486486486486,
	"grad_norm": 0.2939779460430145,
	"learning_rate": 9.18187258248604e-07,
	"loss": 0.6655,
	"step": 2845
	},
	{
	"epoch": 9.628378378378379,
	"grad_norm": 0.2909776270389557,
	"learning_rate": 8.401901585616823e-07,
	"loss": 0.6745,
	"step": 2850
	},
	{
	"epoch": 9.64527027027027,
	"grad_norm": 0.3179049491882324,
	"learning_rate": 7.656405705057435e-07,
	"loss": 0.6621,
	"step": 2855
	},
	{
	"epoch": 9.662162162162161,
	"grad_norm": 0.31932151317596436,
	"learning_rate": 6.945410859632295e-07,
	"loss": 0.6615,
	"step": 2860
	},
	{
	"epoch": 9.679054054054054,
	"grad_norm": 0.33364373445510864,
	"learning_rate": 6.268941768660886e-07,
	"loss": 0.6752,
	"step": 2865
	},
	{
	"epoch": 9.695945945945946,
	"grad_norm": 0.3177048861980438,
	"learning_rate": 5.627021951097545e-07,
	"loss": 0.6793,
	"step": 2870
	},
	{
	"epoch": 9.712837837837839,
	"grad_norm": 0.3155570328235626,
	"learning_rate": 5.019673724714458e-07,
	"loss": 0.6557,
	"step": 2875
	},
	{
	"epoch": 9.72972972972973,
	"grad_norm": 0.2894674241542816,
	"learning_rate": 4.44691820532539e-07,
	"loss": 0.6679,
	"step": 2880
	},
	{
	"epoch": 9.746621621621621,
	"grad_norm": 0.2958497405052185,
	"learning_rate": 3.908775306051604e-07,
	"loss": 0.667,
	"step": 2885
	},
	{
	"epoch": 9.763513513513514,
	"grad_norm": 0.30099859833717346,
	"learning_rate": 3.405263736629416e-07,
	"loss": 0.6828,
	"step": 2890
	},
	{
	"epoch": 9.780405405405405,
	"grad_norm": 0.28636157512664795,
	"learning_rate": 2.9364010027599364e-07,
	"loss": 0.6692,
	"step": 2895
	},
	{
	"epoch": 9.797297297297296,
	"grad_norm": 0.3025320768356323,
	"learning_rate": 2.5022034055003364e-07,
	"loss": 0.6735,
	"step": 2900
	},
	{
	"epoch": 9.81418918918919,
	"grad_norm": 0.3249851167201996,
	"learning_rate": 2.1026860406970772e-07,
	"loss": 0.6678,
	"step": 2905
	},
	{
	"epoch": 9.83108108108108,
	"grad_norm": 0.29301130771636963,
	"learning_rate": 1.7378627984612207e-07,
	"loss": 0.6773,
	"step": 2910
	},
	{
	"epoch": 9.847972972972974,
	"grad_norm": 0.3139761686325073,
	"learning_rate": 1.4077463626852582e-07,
	"loss": 0.6585,
	"step": 2915
	},
	{
	"epoch": 9.864864864864865,
	"grad_norm": 0.2988782227039337,
	"learning_rate": 1.1123482106021322e-07,
	"loss": 0.6832,
	"step": 2920
	},
	{
	"epoch": 9.881756756756756,
	"grad_norm": 0.34064996242523193,
	"learning_rate": 8.516786123867748e-08,
	"loss": 0.6655,
	"step": 2925
	},
	{
	"epoch": 9.89864864864865,
	"grad_norm": 0.31197425723075867,
	"learning_rate": 6.25746630798063e-08,
	"loss": 0.666,
	"step": 2930
	},
	{
	"epoch": 9.91554054054054,
	"grad_norm": 0.3064212203025818,
	"learning_rate": 4.3456012086462436e-08,
	"loss": 0.6671,
	"step": 2935
	},
	{
	"epoch": 9.932432432432432,
	"grad_norm": 0.3351791501045227,
	"learning_rate": 2.7812572961127824e-08,
	"loss": 0.6746,
	"step": 2940
	},
	{
	"epoch": 9.949324324324325,
	"grad_norm": 0.2936666011810303,
	"learning_rate": 1.564488958279986e-08,
	"loss": 0.6647,
	"step": 2945
	},
	{
	"epoch": 9.966216216216216,
	"grad_norm": 0.29400986433029175,
	"learning_rate": 6.953384988095391e-09,
	"loss": 0.6718,
	"step": 2950
	},
	{
	"epoch": 9.983108108108109,
	"grad_norm": 0.32395681738853455,
	"learning_rate": 1.7383613565291612e-09,
	"loss": 0.6809,
	"step": 2955
	},
	{
	"epoch": 10.0,
	"grad_norm": 0.2938132882118225,
	"learning_rate": 0.0,
	"loss": 0.6652,
	"step": 2960
	},
	{
	"epoch": 10.0,
	"eval_loss": 3.0329792499542236,
	"eval_runtime": 0.4325,
	"eval_samples_per_second": 4.624,
	"eval_steps_per_second": 2.312,
	"step": 2960
	},
	{
	"epoch": 10.0,
	"step": 2960,
	"total_flos": 4.416382035459834e+18,
	"train_loss": 0.922980490487975,
	"train_runtime": 12382.7598,
	"train_samples_per_second": 7.645,
	"train_steps_per_second": 0.239
	}
	],
	"logging_steps": 5,
	"max_steps": 2960,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 10,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 4.416382035459834e+18,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}