Rubywong123 commited on
Commit
5a61961
·
verified ·
1 Parent(s): a30c8f0

Upload folder using huggingface_hub

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9975369458128078,
3
- "total_flos": 6.535464838821315e+17,
4
- "train_loss": 0.07531778989014802,
5
- "train_runtime": 2373.3356,
6
  "train_samples": 6493,
7
- "train_samples_per_second": 2.736,
8
- "train_steps_per_second": 0.114
9
  }
 
1
  {
2
  "epoch": 0.9975369458128078,
3
+ "total_flos": 8.643970128528015e+17,
4
+ "train_loss": 0.07066047384783074,
5
+ "train_runtime": 3995.5402,
6
  "train_samples": 6493,
7
+ "train_samples_per_second": 1.625,
8
+ "train_steps_per_second": 0.068
9
  }
checkpoint-270/global_step270/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dedf5e24008a057b2113b076f681b2b454849a4db318431fbc74408f618ddb9e
3
  size 22846855742
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a17f241c3739c198d81e3141056df94bb779dd757a21f9f89af53204d04cfc5e
3
  size 22846855742
checkpoint-270/global_step270/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:610b5442c088cebe956ee3ba7fe3113906412f1c00cb0bd3050b7879901bca29
3
  size 22846855742
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ee0a96c67870843152695939ea59ac5025f4e3d225b51b3f1a26e8b1f1ac462
3
  size 22846855742
checkpoint-270/global_step270/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a93c3071d7d2b031f079aae3b4ed8e34c9ef37920b961a969b305ac47ca9221
3
  size 22846855742
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21b65f0ec86bbf10a81af977328abd2d86aa4a03f299d85c7db6aa55c0fd14df
3
  size 22846855742
checkpoint-270/global_step270/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:061547ecac15f2efe4743d93d693cff0ff43eaa95f91113037d068839b125635
3
  size 22846855742
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20131606e9a33434e3b92633c89bc129e2e60eba86f8236193a952b7a58de305
3
  size 22846855742
checkpoint-270/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2dabcb8b0e4eba4f88cc4d30176973016812cd5099a7a909b235afb1eae6519
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2577dc0dbdd5aeedf99725bf6e4df1233bacd820fee699f17a2c80a4b53ce29f
3
  size 4877660776
checkpoint-270/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7abf6cb7f1a7b0f888af0c3be62bdee65d6181f15f6781262a73b4a90c14ee07
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d903975881e7178cedcf8c3b6dc3994cae9143f64293260912c927341aa533a
3
  size 4932751008
checkpoint-270/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9698bdbdee3a3d9c6e2667f88e1ce9e9f5566668788f3d2c351b0d8353303bed
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a405c24d1e443985effa911704afeb8b8895263c4ca3dd7454eeb3de4cef8b3
3
  size 4330865200
checkpoint-270/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:945b3a362040493fde5d41f5e830d4cd91f1b52673fcc546d1fecdee39777261
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47537180f9b5498e5481a81d4e10154e932f26f4c2237edbf8d69850e580ed4c
3
  size 1089994880
checkpoint-270/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83396048d512ec1f3178af0d7c1f79a226bba041822614b0e26a4fd2d4b55bf7
3
- size 11421995
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb27d51a5fa5caa8502d091726ff7f63ada64f766ff94afe49fde7d3faba216f
3
+ size 11421996
checkpoint-270/trainer_state.json CHANGED
@@ -10,385 +10,385 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003694581280788177,
13
- "grad_norm": 1.7654963384889737,
14
  "learning_rate": 3.7037037037037036e-07,
15
- "loss": 0.7143,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.01847290640394089,
20
- "grad_norm": 1.5199857516271311,
21
  "learning_rate": 1.8518518518518519e-06,
22
- "loss": 0.6647,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03694581280788178,
27
- "grad_norm": 0.49041355078257287,
28
  "learning_rate": 3.7037037037037037e-06,
29
- "loss": 0.5847,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05541871921182266,
34
- "grad_norm": 0.27640312040532583,
35
  "learning_rate": 5.555555555555557e-06,
36
- "loss": 0.3425,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07389162561576355,
41
- "grad_norm": 0.18417484630910924,
42
  "learning_rate": 7.4074074074074075e-06,
43
- "loss": 0.1877,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09236453201970443,
48
- "grad_norm": 0.10832669450973324,
49
  "learning_rate": 9.25925925925926e-06,
50
- "loss": 0.1471,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.11083743842364532,
55
- "grad_norm": 0.08199058472718684,
56
  "learning_rate": 9.996239762521152e-06,
57
- "loss": 0.1209,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.12931034482758622,
62
- "grad_norm": 0.06662942983937706,
63
  "learning_rate": 9.973281012033009e-06,
64
- "loss": 0.1046,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.1477832512315271,
69
- "grad_norm": 0.06811085294724732,
70
  "learning_rate": 9.929548316723983e-06,
71
- "loss": 0.0958,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.16625615763546797,
76
- "grad_norm": 0.055261792291083314,
77
  "learning_rate": 9.86522435289912e-06,
78
- "loss": 0.0804,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.18472906403940886,
83
- "grad_norm": 0.05912952493839196,
84
  "learning_rate": 9.7805778088694e-06,
85
- "loss": 0.072,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.20320197044334976,
90
- "grad_norm": 0.05664454056150425,
91
  "learning_rate": 9.67596226261095e-06,
92
- "loss": 0.0657,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.22167487684729065,
97
- "grad_norm": 0.05397727582384683,
98
  "learning_rate": 9.551814704830734e-06,
99
- "loss": 0.0626,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.24014778325123154,
104
- "grad_norm": 0.054354437756367974,
105
  "learning_rate": 9.40865371360804e-06,
106
- "loss": 0.0567,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.25862068965517243,
111
- "grad_norm": 0.052656359957651395,
112
  "learning_rate": 9.247077288236488e-06,
113
- "loss": 0.0506,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2770935960591133,
118
- "grad_norm": 0.06099527619651085,
119
  "learning_rate": 9.067760351314838e-06,
120
- "loss": 0.0447,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.2955665024630542,
125
- "grad_norm": 0.05601267986585614,
126
  "learning_rate": 8.871451929520662e-06,
127
- "loss": 0.046,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.31403940886699505,
132
- "grad_norm": 0.05031077501489638,
133
  "learning_rate": 8.658972024843063e-06,
134
- "loss": 0.0434,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.33251231527093594,
139
- "grad_norm": 0.05165558400160997,
140
  "learning_rate": 8.43120818934367e-06,
141
- "loss": 0.0388,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.35098522167487683,
146
- "grad_norm": 0.05542969825706018,
147
  "learning_rate": 8.18911181775353e-06,
148
- "loss": 0.0474,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3694581280788177,
153
- "grad_norm": 0.054028908120351174,
154
  "learning_rate": 7.93369417339209e-06,
155
- "loss": 0.0426,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3879310344827586,
160
- "grad_norm": 0.043910381575552423,
161
  "learning_rate": 7.666022164008458e-06,
162
- "loss": 0.0402,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4064039408866995,
167
- "grad_norm": 0.04420065737962207,
168
  "learning_rate": 7.387213885189746e-06,
169
- "loss": 0.0441,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4248768472906404,
174
- "grad_norm": 0.042881775226418783,
175
  "learning_rate": 7.098433949952146e-06,
176
- "loss": 0.0422,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4433497536945813,
181
- "grad_norm": 0.04261745777882021,
182
  "learning_rate": 6.800888624023552e-06,
183
- "loss": 0.0414,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4618226600985222,
188
- "grad_norm": 0.03486953918711132,
189
  "learning_rate": 6.495820787138209e-06,
190
- "loss": 0.0345,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4802955665024631,
195
- "grad_norm": 0.04758579863549666,
196
  "learning_rate": 6.184504741390596e-06,
197
- "loss": 0.0393,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4987684729064039,
202
- "grad_norm": 0.04528381812617609,
203
  "learning_rate": 5.8682408883346535e-06,
204
- "loss": 0.0355,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5172413793103449,
209
- "grad_norm": 0.05087543134069284,
210
  "learning_rate": 5.548350297062659e-06,
211
- "loss": 0.0376,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5357142857142857,
216
- "grad_norm": 0.03521666590779419,
217
  "learning_rate": 5.2261691859535325e-06,
218
- "loss": 0.0316,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5541871921182266,
223
- "grad_norm": 0.0517150609216818,
224
  "learning_rate": 4.903043341140879e-06,
225
- "loss": 0.032,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5726600985221675,
230
- "grad_norm": 0.03346412011949477,
231
  "learning_rate": 4.580322495015466e-06,
232
- "loss": 0.0303,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.5911330049261084,
237
- "grad_norm": 0.040115617332954906,
238
  "learning_rate": 4.259354688243758e-06,
239
- "loss": 0.0382,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6096059113300493,
244
- "grad_norm": 0.04277501887386235,
245
  "learning_rate": 3.941480638852948e-06,
246
- "loss": 0.0291,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6280788177339901,
251
- "grad_norm": 0.03898041246450539,
252
  "learning_rate": 3.6280281419034934e-06,
253
- "loss": 0.0317,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.646551724137931,
258
- "grad_norm": 0.04371627160570444,
259
  "learning_rate": 3.3203065231422904e-06,
260
- "loss": 0.0301,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6650246305418719,
265
- "grad_norm": 0.03238864584138372,
266
  "learning_rate": 3.019601169804216e-06,
267
- "loss": 0.0354,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6834975369458128,
272
- "grad_norm": 0.041127251144739585,
273
  "learning_rate": 2.7271681614074973e-06,
274
- "loss": 0.0294,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7019704433497537,
279
- "grad_norm": 0.045180481360547094,
280
  "learning_rate": 2.4442290229706344e-06,
281
- "loss": 0.0358,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.7204433497536946,
286
- "grad_norm": 0.045021953447442344,
287
  "learning_rate": 2.171965622567308e-06,
288
- "loss": 0.0306,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7389162561576355,
293
- "grad_norm": 0.050026098917487306,
294
  "learning_rate": 1.9115152345327154e-06,
295
- "loss": 0.0418,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7573891625615764,
300
- "grad_norm": 0.03656415909500236,
301
  "learning_rate": 1.6639657889429017e-06,
302
- "loss": 0.0286,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.7758620689655172,
307
- "grad_norm": 0.045197818476724314,
308
  "learning_rate": 1.4303513272105057e-06,
309
- "loss": 0.0317,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7943349753694581,
314
- "grad_norm": 0.041762867738677684,
315
  "learning_rate": 1.2116476827794104e-06,
316
- "loss": 0.0355,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.812807881773399,
321
- "grad_norm": 0.03975638695681742,
322
  "learning_rate": 1.008768404960535e-06,
323
- "loss": 0.034,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8312807881773399,
328
- "grad_norm": 0.03688322160939588,
329
  "learning_rate": 8.225609429353187e-07,
330
- "loss": 0.0306,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8497536945812808,
335
- "grad_norm": 0.045910201896259065,
336
  "learning_rate": 6.53803105866761e-07,
337
- "loss": 0.032,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8682266009852216,
342
- "grad_norm": 0.03466115718136847,
343
  "learning_rate": 5.031998139045352e-07,
344
- "loss": 0.03,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.8866995073891626,
349
- "grad_norm": 0.04041208746429919,
350
  "learning_rate": 3.7138015365554834e-07,
351
- "loss": 0.033,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9051724137931034,
356
- "grad_norm": 0.030110072192059505,
357
  "learning_rate": 2.5889475041961767e-07,
358
- "loss": 0.0316,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9236453201970444,
363
- "grad_norm": 0.04208800780561471,
364
  "learning_rate": 1.6621346816668993e-07,
365
- "loss": 0.0317,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9421182266009852,
370
- "grad_norm": 0.040891559299692404,
371
  "learning_rate": 9.372344686307655e-08,
372
- "loss": 0.0365,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.9605911330049262,
377
- "grad_norm": 0.03589408474423174,
378
  "learning_rate": 4.172748534499449e-08,
379
- "loss": 0.0288,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.979064039408867,
384
- "grad_norm": 0.03745965637860769,
385
  "learning_rate": 1.044277649433989e-08,
386
- "loss": 0.0307,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.9975369458128078,
391
- "grad_norm": 0.031721423195282906,
392
  "learning_rate": 0.0,
393
  "loss": 0.033,
394
  "step": 270
@@ -411,7 +411,7 @@
411
  "attributes": {}
412
  }
413
  },
414
- "total_flos": 6.535464838821315e+17,
415
  "train_batch_size": 1,
416
  "trial_name": null,
417
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003694581280788177,
13
+ "grad_norm": 1.3164679266754151,
14
  "learning_rate": 3.7037037037037036e-07,
15
+ "loss": 0.5867,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.01847290640394089,
20
+ "grad_norm": 0.8941813303905811,
21
  "learning_rate": 1.8518518518518519e-06,
22
+ "loss": 0.5684,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03694581280788178,
27
+ "grad_norm": 0.4848149582760561,
28
  "learning_rate": 3.7037037037037037e-06,
29
+ "loss": 0.5272,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05541871921182266,
34
+ "grad_norm": 0.23489374203812283,
35
  "learning_rate": 5.555555555555557e-06,
36
+ "loss": 0.3213,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07389162561576355,
41
+ "grad_norm": 0.17590513475537736,
42
  "learning_rate": 7.4074074074074075e-06,
43
+ "loss": 0.1719,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09236453201970443,
48
+ "grad_norm": 0.09924337039651394,
49
  "learning_rate": 9.25925925925926e-06,
50
+ "loss": 0.1392,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.11083743842364532,
55
+ "grad_norm": 0.06829427058420333,
56
  "learning_rate": 9.996239762521152e-06,
57
+ "loss": 0.1168,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.12931034482758622,
62
+ "grad_norm": 0.06324690561682543,
63
  "learning_rate": 9.973281012033009e-06,
64
+ "loss": 0.1009,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.1477832512315271,
69
+ "grad_norm": 0.049072943668117305,
70
  "learning_rate": 9.929548316723983e-06,
71
+ "loss": 0.0886,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.16625615763546797,
76
+ "grad_norm": 0.04472692512923622,
77
  "learning_rate": 9.86522435289912e-06,
78
+ "loss": 0.0772,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.18472906403940886,
83
+ "grad_norm": 0.04320806158867543,
84
  "learning_rate": 9.7805778088694e-06,
85
+ "loss": 0.0677,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.20320197044334976,
90
+ "grad_norm": 0.04347088345515666,
91
  "learning_rate": 9.67596226261095e-06,
92
+ "loss": 0.0652,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.22167487684729065,
97
+ "grad_norm": 0.04636829853932609,
98
  "learning_rate": 9.551814704830734e-06,
99
+ "loss": 0.0618,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.24014778325123154,
104
+ "grad_norm": 0.04911651390500423,
105
  "learning_rate": 9.40865371360804e-06,
106
+ "loss": 0.0564,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.25862068965517243,
111
+ "grad_norm": 0.046124586761473525,
112
  "learning_rate": 9.247077288236488e-06,
113
+ "loss": 0.0496,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2770935960591133,
118
+ "grad_norm": 0.04282690739591252,
119
  "learning_rate": 9.067760351314838e-06,
120
+ "loss": 0.0443,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.2955665024630542,
125
+ "grad_norm": 0.0501495787415551,
126
  "learning_rate": 8.871451929520662e-06,
127
+ "loss": 0.0472,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.31403940886699505,
132
+ "grad_norm": 0.048856614296215864,
133
  "learning_rate": 8.658972024843063e-06,
134
+ "loss": 0.045,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.33251231527093594,
139
+ "grad_norm": 0.052135344528722635,
140
  "learning_rate": 8.43120818934367e-06,
141
+ "loss": 0.0407,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.35098522167487683,
146
+ "grad_norm": 0.04496874450828456,
147
  "learning_rate": 8.18911181775353e-06,
148
+ "loss": 0.0443,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3694581280788177,
153
+ "grad_norm": 0.046961464584805046,
154
  "learning_rate": 7.93369417339209e-06,
155
+ "loss": 0.043,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3879310344827586,
160
+ "grad_norm": 0.03774079876533218,
161
  "learning_rate": 7.666022164008458e-06,
162
+ "loss": 0.039,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4064039408866995,
167
+ "grad_norm": 0.039388091387549375,
168
  "learning_rate": 7.387213885189746e-06,
169
+ "loss": 0.043,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4248768472906404,
174
+ "grad_norm": 0.04010524552891231,
175
  "learning_rate": 7.098433949952146e-06,
176
+ "loss": 0.0418,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4433497536945813,
181
+ "grad_norm": 0.03364245597783716,
182
  "learning_rate": 6.800888624023552e-06,
183
+ "loss": 0.0396,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4618226600985222,
188
+ "grad_norm": 0.03214650791918716,
189
  "learning_rate": 6.495820787138209e-06,
190
+ "loss": 0.0343,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4802955665024631,
195
+ "grad_norm": 0.04248682387196562,
196
  "learning_rate": 6.184504741390596e-06,
197
+ "loss": 0.0385,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4987684729064039,
202
+ "grad_norm": 0.041745690695414894,
203
  "learning_rate": 5.8682408883346535e-06,
204
+ "loss": 0.0354,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5172413793103449,
209
+ "grad_norm": 0.043998119847084,
210
  "learning_rate": 5.548350297062659e-06,
211
+ "loss": 0.0363,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5357142857142857,
216
+ "grad_norm": 0.03523911995673237,
217
  "learning_rate": 5.2261691859535325e-06,
218
+ "loss": 0.0311,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5541871921182266,
223
+ "grad_norm": 0.051683301339415226,
224
  "learning_rate": 4.903043341140879e-06,
225
+ "loss": 0.0322,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5726600985221675,
230
+ "grad_norm": 0.029645536386539162,
231
  "learning_rate": 4.580322495015466e-06,
232
+ "loss": 0.0297,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.5911330049261084,
237
+ "grad_norm": 0.038478089898929216,
238
  "learning_rate": 4.259354688243758e-06,
239
+ "loss": 0.0373,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6096059113300493,
244
+ "grad_norm": 0.04016397060959619,
245
  "learning_rate": 3.941480638852948e-06,
246
+ "loss": 0.0293,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6280788177339901,
251
+ "grad_norm": 0.03295300026030036,
252
  "learning_rate": 3.6280281419034934e-06,
253
+ "loss": 0.0306,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.646551724137931,
258
+ "grad_norm": 0.04371251122688708,
259
  "learning_rate": 3.3203065231422904e-06,
260
+ "loss": 0.0292,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6650246305418719,
265
+ "grad_norm": 0.02878432244430226,
266
  "learning_rate": 3.019601169804216e-06,
267
+ "loss": 0.0348,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6834975369458128,
272
+ "grad_norm": 0.03582599925982462,
273
  "learning_rate": 2.7271681614074973e-06,
274
+ "loss": 0.0292,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7019704433497537,
279
+ "grad_norm": 0.04498392884678493,
280
  "learning_rate": 2.4442290229706344e-06,
281
+ "loss": 0.0355,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.7204433497536946,
286
+ "grad_norm": 0.03888587884987569,
287
  "learning_rate": 2.171965622567308e-06,
288
+ "loss": 0.0311,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7389162561576355,
293
+ "grad_norm": 0.04684421314626146,
294
  "learning_rate": 1.9115152345327154e-06,
295
+ "loss": 0.0391,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7573891625615764,
300
+ "grad_norm": 0.031274018167510506,
301
  "learning_rate": 1.6639657889429017e-06,
302
+ "loss": 0.0275,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.7758620689655172,
307
+ "grad_norm": 0.03793766745215515,
308
  "learning_rate": 1.4303513272105057e-06,
309
+ "loss": 0.0312,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7943349753694581,
314
+ "grad_norm": 0.03084526133895099,
315
  "learning_rate": 1.2116476827794104e-06,
316
+ "loss": 0.0334,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.812807881773399,
321
+ "grad_norm": 0.035909978135080484,
322
  "learning_rate": 1.008768404960535e-06,
323
+ "loss": 0.0329,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8312807881773399,
328
+ "grad_norm": 0.03337500963867465,
329
  "learning_rate": 8.225609429353187e-07,
330
+ "loss": 0.0299,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8497536945812808,
335
+ "grad_norm": 0.04194808091582502,
336
  "learning_rate": 6.53803105866761e-07,
337
+ "loss": 0.0312,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8682266009852216,
342
+ "grad_norm": 0.030185202275612954,
343
  "learning_rate": 5.031998139045352e-07,
344
+ "loss": 0.0291,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.8866995073891626,
349
+ "grad_norm": 0.03678202209697851,
350
  "learning_rate": 3.7138015365554834e-07,
351
+ "loss": 0.0329,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9051724137931034,
356
+ "grad_norm": 0.026908956915127933,
357
  "learning_rate": 2.5889475041961767e-07,
358
+ "loss": 0.0318,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9236453201970444,
363
+ "grad_norm": 0.04036220391547687,
364
  "learning_rate": 1.6621346816668993e-07,
365
+ "loss": 0.0304,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9421182266009852,
370
+ "grad_norm": 0.037246305929994866,
371
  "learning_rate": 9.372344686307655e-08,
372
+ "loss": 0.036,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.9605911330049262,
377
+ "grad_norm": 0.031641745281581694,
378
  "learning_rate": 4.172748534499449e-08,
379
+ "loss": 0.0296,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.979064039408867,
384
+ "grad_norm": 0.03135655037281048,
385
  "learning_rate": 1.044277649433989e-08,
386
+ "loss": 0.0294,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.9975369458128078,
391
+ "grad_norm": 0.03097346411772941,
392
  "learning_rate": 0.0,
393
  "loss": 0.033,
394
  "step": 270
 
411
  "attributes": {}
412
  }
413
  },
414
+ "total_flos": 8.643970128528015e+17,
415
  "train_batch_size": 1,
416
  "trial_name": null,
417
  "trial_params": null
checkpoint-270/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ff4e5552e9ba58a4e2366ab6ffcb7a8cbfe2b6095f19a538d8ef47b18b91bde
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68d8b75a1101f455c3bf040fa2f84956565ddbcbbaa9e7e94c618c261996c857
3
  size 7352
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2dabcb8b0e4eba4f88cc4d30176973016812cd5099a7a909b235afb1eae6519
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2577dc0dbdd5aeedf99725bf6e4df1233bacd820fee699f17a2c80a4b53ce29f
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7abf6cb7f1a7b0f888af0c3be62bdee65d6181f15f6781262a73b4a90c14ee07
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d903975881e7178cedcf8c3b6dc3994cae9143f64293260912c927341aa533a
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9698bdbdee3a3d9c6e2667f88e1ce9e9f5566668788f3d2c351b0d8353303bed
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a405c24d1e443985effa911704afeb8b8895263c4ca3dd7454eeb3de4cef8b3
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:945b3a362040493fde5d41f5e830d4cd91f1b52673fcc546d1fecdee39777261
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47537180f9b5498e5481a81d4e10154e932f26f4c2237edbf8d69850e580ed4c
3
  size 1089994880
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83396048d512ec1f3178af0d7c1f79a226bba041822614b0e26a4fd2d4b55bf7
3
- size 11421995
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb27d51a5fa5caa8502d091726ff7f63ada64f766ff94afe49fde7d3faba216f
3
+ size 11421996
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9975369458128078,
3
- "total_flos": 6.535464838821315e+17,
4
- "train_loss": 0.07531778989014802,
5
- "train_runtime": 2373.3356,
6
  "train_samples": 6493,
7
- "train_samples_per_second": 2.736,
8
- "train_steps_per_second": 0.114
9
  }
 
1
  {
2
  "epoch": 0.9975369458128078,
3
+ "total_flos": 8.643970128528015e+17,
4
+ "train_loss": 0.07066047384783074,
5
+ "train_runtime": 3995.5402,
6
  "train_samples": 6493,
7
+ "train_samples_per_second": 1.625,
8
+ "train_steps_per_second": 0.068
9
  }
trainer_state.json CHANGED
@@ -10,385 +10,385 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003694581280788177,
13
- "grad_norm": 1.7654963384889737,
14
  "learning_rate": 3.7037037037037036e-07,
15
- "loss": 0.7143,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.01847290640394089,
20
- "grad_norm": 1.5199857516271311,
21
  "learning_rate": 1.8518518518518519e-06,
22
- "loss": 0.6647,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03694581280788178,
27
- "grad_norm": 0.49041355078257287,
28
  "learning_rate": 3.7037037037037037e-06,
29
- "loss": 0.5847,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05541871921182266,
34
- "grad_norm": 0.27640312040532583,
35
  "learning_rate": 5.555555555555557e-06,
36
- "loss": 0.3425,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07389162561576355,
41
- "grad_norm": 0.18417484630910924,
42
  "learning_rate": 7.4074074074074075e-06,
43
- "loss": 0.1877,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09236453201970443,
48
- "grad_norm": 0.10832669450973324,
49
  "learning_rate": 9.25925925925926e-06,
50
- "loss": 0.1471,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.11083743842364532,
55
- "grad_norm": 0.08199058472718684,
56
  "learning_rate": 9.996239762521152e-06,
57
- "loss": 0.1209,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.12931034482758622,
62
- "grad_norm": 0.06662942983937706,
63
  "learning_rate": 9.973281012033009e-06,
64
- "loss": 0.1046,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.1477832512315271,
69
- "grad_norm": 0.06811085294724732,
70
  "learning_rate": 9.929548316723983e-06,
71
- "loss": 0.0958,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.16625615763546797,
76
- "grad_norm": 0.055261792291083314,
77
  "learning_rate": 9.86522435289912e-06,
78
- "loss": 0.0804,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.18472906403940886,
83
- "grad_norm": 0.05912952493839196,
84
  "learning_rate": 9.7805778088694e-06,
85
- "loss": 0.072,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.20320197044334976,
90
- "grad_norm": 0.05664454056150425,
91
  "learning_rate": 9.67596226261095e-06,
92
- "loss": 0.0657,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.22167487684729065,
97
- "grad_norm": 0.05397727582384683,
98
  "learning_rate": 9.551814704830734e-06,
99
- "loss": 0.0626,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.24014778325123154,
104
- "grad_norm": 0.054354437756367974,
105
  "learning_rate": 9.40865371360804e-06,
106
- "loss": 0.0567,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.25862068965517243,
111
- "grad_norm": 0.052656359957651395,
112
  "learning_rate": 9.247077288236488e-06,
113
- "loss": 0.0506,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2770935960591133,
118
- "grad_norm": 0.06099527619651085,
119
  "learning_rate": 9.067760351314838e-06,
120
- "loss": 0.0447,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.2955665024630542,
125
- "grad_norm": 0.05601267986585614,
126
  "learning_rate": 8.871451929520662e-06,
127
- "loss": 0.046,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.31403940886699505,
132
- "grad_norm": 0.05031077501489638,
133
  "learning_rate": 8.658972024843063e-06,
134
- "loss": 0.0434,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.33251231527093594,
139
- "grad_norm": 0.05165558400160997,
140
  "learning_rate": 8.43120818934367e-06,
141
- "loss": 0.0388,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.35098522167487683,
146
- "grad_norm": 0.05542969825706018,
147
  "learning_rate": 8.18911181775353e-06,
148
- "loss": 0.0474,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3694581280788177,
153
- "grad_norm": 0.054028908120351174,
154
  "learning_rate": 7.93369417339209e-06,
155
- "loss": 0.0426,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3879310344827586,
160
- "grad_norm": 0.043910381575552423,
161
  "learning_rate": 7.666022164008458e-06,
162
- "loss": 0.0402,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4064039408866995,
167
- "grad_norm": 0.04420065737962207,
168
  "learning_rate": 7.387213885189746e-06,
169
- "loss": 0.0441,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4248768472906404,
174
- "grad_norm": 0.042881775226418783,
175
  "learning_rate": 7.098433949952146e-06,
176
- "loss": 0.0422,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4433497536945813,
181
- "grad_norm": 0.04261745777882021,
182
  "learning_rate": 6.800888624023552e-06,
183
- "loss": 0.0414,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4618226600985222,
188
- "grad_norm": 0.03486953918711132,
189
  "learning_rate": 6.495820787138209e-06,
190
- "loss": 0.0345,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4802955665024631,
195
- "grad_norm": 0.04758579863549666,
196
  "learning_rate": 6.184504741390596e-06,
197
- "loss": 0.0393,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4987684729064039,
202
- "grad_norm": 0.04528381812617609,
203
  "learning_rate": 5.8682408883346535e-06,
204
- "loss": 0.0355,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5172413793103449,
209
- "grad_norm": 0.05087543134069284,
210
  "learning_rate": 5.548350297062659e-06,
211
- "loss": 0.0376,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5357142857142857,
216
- "grad_norm": 0.03521666590779419,
217
  "learning_rate": 5.2261691859535325e-06,
218
- "loss": 0.0316,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5541871921182266,
223
- "grad_norm": 0.0517150609216818,
224
  "learning_rate": 4.903043341140879e-06,
225
- "loss": 0.032,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5726600985221675,
230
- "grad_norm": 0.03346412011949477,
231
  "learning_rate": 4.580322495015466e-06,
232
- "loss": 0.0303,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.5911330049261084,
237
- "grad_norm": 0.040115617332954906,
238
  "learning_rate": 4.259354688243758e-06,
239
- "loss": 0.0382,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6096059113300493,
244
- "grad_norm": 0.04277501887386235,
245
  "learning_rate": 3.941480638852948e-06,
246
- "loss": 0.0291,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6280788177339901,
251
- "grad_norm": 0.03898041246450539,
252
  "learning_rate": 3.6280281419034934e-06,
253
- "loss": 0.0317,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.646551724137931,
258
- "grad_norm": 0.04371627160570444,
259
  "learning_rate": 3.3203065231422904e-06,
260
- "loss": 0.0301,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6650246305418719,
265
- "grad_norm": 0.03238864584138372,
266
  "learning_rate": 3.019601169804216e-06,
267
- "loss": 0.0354,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6834975369458128,
272
- "grad_norm": 0.041127251144739585,
273
  "learning_rate": 2.7271681614074973e-06,
274
- "loss": 0.0294,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7019704433497537,
279
- "grad_norm": 0.045180481360547094,
280
  "learning_rate": 2.4442290229706344e-06,
281
- "loss": 0.0358,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.7204433497536946,
286
- "grad_norm": 0.045021953447442344,
287
  "learning_rate": 2.171965622567308e-06,
288
- "loss": 0.0306,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7389162561576355,
293
- "grad_norm": 0.050026098917487306,
294
  "learning_rate": 1.9115152345327154e-06,
295
- "loss": 0.0418,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7573891625615764,
300
- "grad_norm": 0.03656415909500236,
301
  "learning_rate": 1.6639657889429017e-06,
302
- "loss": 0.0286,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.7758620689655172,
307
- "grad_norm": 0.045197818476724314,
308
  "learning_rate": 1.4303513272105057e-06,
309
- "loss": 0.0317,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7943349753694581,
314
- "grad_norm": 0.041762867738677684,
315
  "learning_rate": 1.2116476827794104e-06,
316
- "loss": 0.0355,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.812807881773399,
321
- "grad_norm": 0.03975638695681742,
322
  "learning_rate": 1.008768404960535e-06,
323
- "loss": 0.034,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8312807881773399,
328
- "grad_norm": 0.03688322160939588,
329
  "learning_rate": 8.225609429353187e-07,
330
- "loss": 0.0306,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8497536945812808,
335
- "grad_norm": 0.045910201896259065,
336
  "learning_rate": 6.53803105866761e-07,
337
- "loss": 0.032,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8682266009852216,
342
- "grad_norm": 0.03466115718136847,
343
  "learning_rate": 5.031998139045352e-07,
344
- "loss": 0.03,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.8866995073891626,
349
- "grad_norm": 0.04041208746429919,
350
  "learning_rate": 3.7138015365554834e-07,
351
- "loss": 0.033,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9051724137931034,
356
- "grad_norm": 0.030110072192059505,
357
  "learning_rate": 2.5889475041961767e-07,
358
- "loss": 0.0316,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9236453201970444,
363
- "grad_norm": 0.04208800780561471,
364
  "learning_rate": 1.6621346816668993e-07,
365
- "loss": 0.0317,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9421182266009852,
370
- "grad_norm": 0.040891559299692404,
371
  "learning_rate": 9.372344686307655e-08,
372
- "loss": 0.0365,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.9605911330049262,
377
- "grad_norm": 0.03589408474423174,
378
  "learning_rate": 4.172748534499449e-08,
379
- "loss": 0.0288,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.979064039408867,
384
- "grad_norm": 0.03745965637860769,
385
  "learning_rate": 1.044277649433989e-08,
386
- "loss": 0.0307,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.9975369458128078,
391
- "grad_norm": 0.031721423195282906,
392
  "learning_rate": 0.0,
393
  "loss": 0.033,
394
  "step": 270
@@ -396,11 +396,11 @@
396
  {
397
  "epoch": 0.9975369458128078,
398
  "step": 270,
399
- "total_flos": 6.535464838821315e+17,
400
- "train_loss": 0.07531778989014802,
401
- "train_runtime": 2373.3356,
402
- "train_samples_per_second": 2.736,
403
- "train_steps_per_second": 0.114
404
  }
405
  ],
406
  "logging_steps": 5,
@@ -420,7 +420,7 @@
420
  "attributes": {}
421
  }
422
  },
423
- "total_flos": 6.535464838821315e+17,
424
  "train_batch_size": 1,
425
  "trial_name": null,
426
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003694581280788177,
13
+ "grad_norm": 1.3164679266754151,
14
  "learning_rate": 3.7037037037037036e-07,
15
+ "loss": 0.5867,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.01847290640394089,
20
+ "grad_norm": 0.8941813303905811,
21
  "learning_rate": 1.8518518518518519e-06,
22
+ "loss": 0.5684,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03694581280788178,
27
+ "grad_norm": 0.4848149582760561,
28
  "learning_rate": 3.7037037037037037e-06,
29
+ "loss": 0.5272,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05541871921182266,
34
+ "grad_norm": 0.23489374203812283,
35
  "learning_rate": 5.555555555555557e-06,
36
+ "loss": 0.3213,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.07389162561576355,
41
+ "grad_norm": 0.17590513475537736,
42
  "learning_rate": 7.4074074074074075e-06,
43
+ "loss": 0.1719,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.09236453201970443,
48
+ "grad_norm": 0.09924337039651394,
49
  "learning_rate": 9.25925925925926e-06,
50
+ "loss": 0.1392,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.11083743842364532,
55
+ "grad_norm": 0.06829427058420333,
56
  "learning_rate": 9.996239762521152e-06,
57
+ "loss": 0.1168,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.12931034482758622,
62
+ "grad_norm": 0.06324690561682543,
63
  "learning_rate": 9.973281012033009e-06,
64
+ "loss": 0.1009,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.1477832512315271,
69
+ "grad_norm": 0.049072943668117305,
70
  "learning_rate": 9.929548316723983e-06,
71
+ "loss": 0.0886,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.16625615763546797,
76
+ "grad_norm": 0.04472692512923622,
77
  "learning_rate": 9.86522435289912e-06,
78
+ "loss": 0.0772,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.18472906403940886,
83
+ "grad_norm": 0.04320806158867543,
84
  "learning_rate": 9.7805778088694e-06,
85
+ "loss": 0.0677,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.20320197044334976,
90
+ "grad_norm": 0.04347088345515666,
91
  "learning_rate": 9.67596226261095e-06,
92
+ "loss": 0.0652,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.22167487684729065,
97
+ "grad_norm": 0.04636829853932609,
98
  "learning_rate": 9.551814704830734e-06,
99
+ "loss": 0.0618,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.24014778325123154,
104
+ "grad_norm": 0.04911651390500423,
105
  "learning_rate": 9.40865371360804e-06,
106
+ "loss": 0.0564,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.25862068965517243,
111
+ "grad_norm": 0.046124586761473525,
112
  "learning_rate": 9.247077288236488e-06,
113
+ "loss": 0.0496,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.2770935960591133,
118
+ "grad_norm": 0.04282690739591252,
119
  "learning_rate": 9.067760351314838e-06,
120
+ "loss": 0.0443,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.2955665024630542,
125
+ "grad_norm": 0.0501495787415551,
126
  "learning_rate": 8.871451929520662e-06,
127
+ "loss": 0.0472,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.31403940886699505,
132
+ "grad_norm": 0.048856614296215864,
133
  "learning_rate": 8.658972024843063e-06,
134
+ "loss": 0.045,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.33251231527093594,
139
+ "grad_norm": 0.052135344528722635,
140
  "learning_rate": 8.43120818934367e-06,
141
+ "loss": 0.0407,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.35098522167487683,
146
+ "grad_norm": 0.04496874450828456,
147
  "learning_rate": 8.18911181775353e-06,
148
+ "loss": 0.0443,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3694581280788177,
153
+ "grad_norm": 0.046961464584805046,
154
  "learning_rate": 7.93369417339209e-06,
155
+ "loss": 0.043,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3879310344827586,
160
+ "grad_norm": 0.03774079876533218,
161
  "learning_rate": 7.666022164008458e-06,
162
+ "loss": 0.039,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.4064039408866995,
167
+ "grad_norm": 0.039388091387549375,
168
  "learning_rate": 7.387213885189746e-06,
169
+ "loss": 0.043,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.4248768472906404,
174
+ "grad_norm": 0.04010524552891231,
175
  "learning_rate": 7.098433949952146e-06,
176
+ "loss": 0.0418,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.4433497536945813,
181
+ "grad_norm": 0.03364245597783716,
182
  "learning_rate": 6.800888624023552e-06,
183
+ "loss": 0.0396,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.4618226600985222,
188
+ "grad_norm": 0.03214650791918716,
189
  "learning_rate": 6.495820787138209e-06,
190
+ "loss": 0.0343,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4802955665024631,
195
+ "grad_norm": 0.04248682387196562,
196
  "learning_rate": 6.184504741390596e-06,
197
+ "loss": 0.0385,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.4987684729064039,
202
+ "grad_norm": 0.041745690695414894,
203
  "learning_rate": 5.8682408883346535e-06,
204
+ "loss": 0.0354,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.5172413793103449,
209
+ "grad_norm": 0.043998119847084,
210
  "learning_rate": 5.548350297062659e-06,
211
+ "loss": 0.0363,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.5357142857142857,
216
+ "grad_norm": 0.03523911995673237,
217
  "learning_rate": 5.2261691859535325e-06,
218
+ "loss": 0.0311,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.5541871921182266,
223
+ "grad_norm": 0.051683301339415226,
224
  "learning_rate": 4.903043341140879e-06,
225
+ "loss": 0.0322,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.5726600985221675,
230
+ "grad_norm": 0.029645536386539162,
231
  "learning_rate": 4.580322495015466e-06,
232
+ "loss": 0.0297,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.5911330049261084,
237
+ "grad_norm": 0.038478089898929216,
238
  "learning_rate": 4.259354688243758e-06,
239
+ "loss": 0.0373,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.6096059113300493,
244
+ "grad_norm": 0.04016397060959619,
245
  "learning_rate": 3.941480638852948e-06,
246
+ "loss": 0.0293,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.6280788177339901,
251
+ "grad_norm": 0.03295300026030036,
252
  "learning_rate": 3.6280281419034934e-06,
253
+ "loss": 0.0306,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.646551724137931,
258
+ "grad_norm": 0.04371251122688708,
259
  "learning_rate": 3.3203065231422904e-06,
260
+ "loss": 0.0292,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.6650246305418719,
265
+ "grad_norm": 0.02878432244430226,
266
  "learning_rate": 3.019601169804216e-06,
267
+ "loss": 0.0348,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.6834975369458128,
272
+ "grad_norm": 0.03582599925982462,
273
  "learning_rate": 2.7271681614074973e-06,
274
+ "loss": 0.0292,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.7019704433497537,
279
+ "grad_norm": 0.04498392884678493,
280
  "learning_rate": 2.4442290229706344e-06,
281
+ "loss": 0.0355,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.7204433497536946,
286
+ "grad_norm": 0.03888587884987569,
287
  "learning_rate": 2.171965622567308e-06,
288
+ "loss": 0.0311,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.7389162561576355,
293
+ "grad_norm": 0.04684421314626146,
294
  "learning_rate": 1.9115152345327154e-06,
295
+ "loss": 0.0391,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.7573891625615764,
300
+ "grad_norm": 0.031274018167510506,
301
  "learning_rate": 1.6639657889429017e-06,
302
+ "loss": 0.0275,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.7758620689655172,
307
+ "grad_norm": 0.03793766745215515,
308
  "learning_rate": 1.4303513272105057e-06,
309
+ "loss": 0.0312,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.7943349753694581,
314
+ "grad_norm": 0.03084526133895099,
315
  "learning_rate": 1.2116476827794104e-06,
316
+ "loss": 0.0334,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.812807881773399,
321
+ "grad_norm": 0.035909978135080484,
322
  "learning_rate": 1.008768404960535e-06,
323
+ "loss": 0.0329,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.8312807881773399,
328
+ "grad_norm": 0.03337500963867465,
329
  "learning_rate": 8.225609429353187e-07,
330
+ "loss": 0.0299,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.8497536945812808,
335
+ "grad_norm": 0.04194808091582502,
336
  "learning_rate": 6.53803105866761e-07,
337
+ "loss": 0.0312,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.8682266009852216,
342
+ "grad_norm": 0.030185202275612954,
343
  "learning_rate": 5.031998139045352e-07,
344
+ "loss": 0.0291,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.8866995073891626,
349
+ "grad_norm": 0.03678202209697851,
350
  "learning_rate": 3.7138015365554834e-07,
351
+ "loss": 0.0329,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.9051724137931034,
356
+ "grad_norm": 0.026908956915127933,
357
  "learning_rate": 2.5889475041961767e-07,
358
+ "loss": 0.0318,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.9236453201970444,
363
+ "grad_norm": 0.04036220391547687,
364
  "learning_rate": 1.6621346816668993e-07,
365
+ "loss": 0.0304,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.9421182266009852,
370
+ "grad_norm": 0.037246305929994866,
371
  "learning_rate": 9.372344686307655e-08,
372
+ "loss": 0.036,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.9605911330049262,
377
+ "grad_norm": 0.031641745281581694,
378
  "learning_rate": 4.172748534499449e-08,
379
+ "loss": 0.0296,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.979064039408867,
384
+ "grad_norm": 0.03135655037281048,
385
  "learning_rate": 1.044277649433989e-08,
386
+ "loss": 0.0294,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.9975369458128078,
391
+ "grad_norm": 0.03097346411772941,
392
  "learning_rate": 0.0,
393
  "loss": 0.033,
394
  "step": 270
 
396
  {
397
  "epoch": 0.9975369458128078,
398
  "step": 270,
399
+ "total_flos": 8.643970128528015e+17,
400
+ "train_loss": 0.07066047384783074,
401
+ "train_runtime": 3995.5402,
402
+ "train_samples_per_second": 1.625,
403
+ "train_steps_per_second": 0.068
404
  }
405
  ],
406
  "logging_steps": 5,
 
420
  "attributes": {}
421
  }
422
  },
423
+ "total_flos": 8.643970128528015e+17,
424
  "train_batch_size": 1,
425
  "trial_name": null,
426
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ff4e5552e9ba58a4e2366ab6ffcb7a8cbfe2b6095f19a538d8ef47b18b91bde
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68d8b75a1101f455c3bf040fa2f84956565ddbcbbaa9e7e94c618c261996c857
3
  size 7352