ajsal27 commited on
Commit
b6420ab
1 Parent(s): 261dc7f

Model save

Browse files
Files changed (1) hide show
  1. trainer_state.json +1561 -0
trainer_state.json ADDED
@@ -0,0 +1,1561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 4233,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01771793054571226,
13
+ "grad_norm": 0.24673239886760712,
14
+ "learning_rate": 0.0001,
15
+ "loss": 3.8313,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.03543586109142452,
20
+ "grad_norm": 0.23432838916778564,
21
+ "learning_rate": 0.0001,
22
+ "loss": 3.6164,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.05315379163713678,
27
+ "grad_norm": 0.4262351393699646,
28
+ "learning_rate": 0.0001,
29
+ "loss": 3.4113,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.07087172218284904,
34
+ "grad_norm": 0.329630047082901,
35
+ "learning_rate": 0.0001,
36
+ "loss": 3.1874,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.07087172218284904,
41
+ "eval_loss": 3.0117650032043457,
42
+ "eval_runtime": 31.2497,
43
+ "eval_samples_per_second": 9.536,
44
+ "eval_steps_per_second": 2.4,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.0885896527285613,
49
+ "grad_norm": 0.39532458782196045,
50
+ "learning_rate": 0.0001,
51
+ "loss": 3.0928,
52
+ "step": 125
53
+ },
54
+ {
55
+ "epoch": 0.10630758327427356,
56
+ "grad_norm": 0.38724836707115173,
57
+ "learning_rate": 0.0001,
58
+ "loss": 2.8781,
59
+ "step": 150
60
+ },
61
+ {
62
+ "epoch": 0.12402551381998582,
63
+ "grad_norm": 0.2721017897129059,
64
+ "learning_rate": 0.0001,
65
+ "loss": 2.9166,
66
+ "step": 175
67
+ },
68
+ {
69
+ "epoch": 0.14174344436569808,
70
+ "grad_norm": 0.40747421979904175,
71
+ "learning_rate": 0.0001,
72
+ "loss": 2.8756,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 0.14174344436569808,
77
+ "eval_loss": 2.822810411453247,
78
+ "eval_runtime": 32.4779,
79
+ "eval_samples_per_second": 9.175,
80
+ "eval_steps_per_second": 2.309,
81
+ "step": 200
82
+ },
83
+ {
84
+ "epoch": 0.15946137491141035,
85
+ "grad_norm": 0.4922279119491577,
86
+ "learning_rate": 0.0001,
87
+ "loss": 2.9015,
88
+ "step": 225
89
+ },
90
+ {
91
+ "epoch": 0.1771793054571226,
92
+ "grad_norm": 0.3577287197113037,
93
+ "learning_rate": 0.0001,
94
+ "loss": 2.8639,
95
+ "step": 250
96
+ },
97
+ {
98
+ "epoch": 0.19489723600283487,
99
+ "grad_norm": 0.26928216218948364,
100
+ "learning_rate": 0.0001,
101
+ "loss": 2.6776,
102
+ "step": 275
103
+ },
104
+ {
105
+ "epoch": 0.21261516654854712,
106
+ "grad_norm": 0.3623720109462738,
107
+ "learning_rate": 0.0001,
108
+ "loss": 2.7134,
109
+ "step": 300
110
+ },
111
+ {
112
+ "epoch": 0.21261516654854712,
113
+ "eval_loss": 2.735769271850586,
114
+ "eval_runtime": 32.6152,
115
+ "eval_samples_per_second": 9.137,
116
+ "eval_steps_per_second": 2.3,
117
+ "step": 300
118
+ },
119
+ {
120
+ "epoch": 0.2303330970942594,
121
+ "grad_norm": 0.48846569657325745,
122
+ "learning_rate": 0.0001,
123
+ "loss": 2.7118,
124
+ "step": 325
125
+ },
126
+ {
127
+ "epoch": 0.24805102763997164,
128
+ "grad_norm": 0.555242657661438,
129
+ "learning_rate": 0.0001,
130
+ "loss": 2.7072,
131
+ "step": 350
132
+ },
133
+ {
134
+ "epoch": 0.2657689581856839,
135
+ "grad_norm": 0.3421138823032379,
136
+ "learning_rate": 0.0001,
137
+ "loss": 2.7114,
138
+ "step": 375
139
+ },
140
+ {
141
+ "epoch": 0.28348688873139616,
142
+ "grad_norm": 0.3518970012664795,
143
+ "learning_rate": 0.0001,
144
+ "loss": 2.6948,
145
+ "step": 400
146
+ },
147
+ {
148
+ "epoch": 0.28348688873139616,
149
+ "eval_loss": 2.6832730770111084,
150
+ "eval_runtime": 32.6428,
151
+ "eval_samples_per_second": 9.129,
152
+ "eval_steps_per_second": 2.298,
153
+ "step": 400
154
+ },
155
+ {
156
+ "epoch": 0.30120481927710846,
157
+ "grad_norm": 0.7523891925811768,
158
+ "learning_rate": 0.0001,
159
+ "loss": 2.7691,
160
+ "step": 425
161
+ },
162
+ {
163
+ "epoch": 0.3189227498228207,
164
+ "grad_norm": 0.36019715666770935,
165
+ "learning_rate": 0.0001,
166
+ "loss": 2.7116,
167
+ "step": 450
168
+ },
169
+ {
170
+ "epoch": 0.33664068036853295,
171
+ "grad_norm": 0.47855111956596375,
172
+ "learning_rate": 0.0001,
173
+ "loss": 2.5935,
174
+ "step": 475
175
+ },
176
+ {
177
+ "epoch": 0.3543586109142452,
178
+ "grad_norm": 0.3621235489845276,
179
+ "learning_rate": 0.0001,
180
+ "loss": 2.6386,
181
+ "step": 500
182
+ },
183
+ {
184
+ "epoch": 0.3543586109142452,
185
+ "eval_loss": 2.6440742015838623,
186
+ "eval_runtime": 32.6151,
187
+ "eval_samples_per_second": 9.137,
188
+ "eval_steps_per_second": 2.3,
189
+ "step": 500
190
+ },
191
+ {
192
+ "epoch": 0.3720765414599575,
193
+ "grad_norm": 0.6827765703201294,
194
+ "learning_rate": 0.0001,
195
+ "loss": 2.6515,
196
+ "step": 525
197
+ },
198
+ {
199
+ "epoch": 0.38979447200566975,
200
+ "grad_norm": 0.5584849119186401,
201
+ "learning_rate": 0.0001,
202
+ "loss": 2.7017,
203
+ "step": 550
204
+ },
205
+ {
206
+ "epoch": 0.407512402551382,
207
+ "grad_norm": 0.42108777165412903,
208
+ "learning_rate": 0.0001,
209
+ "loss": 2.6605,
210
+ "step": 575
211
+ },
212
+ {
213
+ "epoch": 0.42523033309709424,
214
+ "grad_norm": 0.4811398684978485,
215
+ "learning_rate": 0.0001,
216
+ "loss": 2.6525,
217
+ "step": 600
218
+ },
219
+ {
220
+ "epoch": 0.42523033309709424,
221
+ "eval_loss": 2.6149747371673584,
222
+ "eval_runtime": 32.6256,
223
+ "eval_samples_per_second": 9.134,
224
+ "eval_steps_per_second": 2.299,
225
+ "step": 600
226
+ },
227
+ {
228
+ "epoch": 0.44294826364280654,
229
+ "grad_norm": 0.3859870731830597,
230
+ "learning_rate": 0.0001,
231
+ "loss": 2.6805,
232
+ "step": 625
233
+ },
234
+ {
235
+ "epoch": 0.4606661941885188,
236
+ "grad_norm": 0.5074867010116577,
237
+ "learning_rate": 0.0001,
238
+ "loss": 2.66,
239
+ "step": 650
240
+ },
241
+ {
242
+ "epoch": 0.47838412473423103,
243
+ "grad_norm": 0.4459812343120575,
244
+ "learning_rate": 0.0001,
245
+ "loss": 2.645,
246
+ "step": 675
247
+ },
248
+ {
249
+ "epoch": 0.4961020552799433,
250
+ "grad_norm": 0.3960532248020172,
251
+ "learning_rate": 0.0001,
252
+ "loss": 2.6242,
253
+ "step": 700
254
+ },
255
+ {
256
+ "epoch": 0.4961020552799433,
257
+ "eval_loss": 2.5855672359466553,
258
+ "eval_runtime": 32.5667,
259
+ "eval_samples_per_second": 9.15,
260
+ "eval_steps_per_second": 2.303,
261
+ "step": 700
262
+ },
263
+ {
264
+ "epoch": 0.5138199858256556,
265
+ "grad_norm": 0.28771647810935974,
266
+ "learning_rate": 0.0001,
267
+ "loss": 2.6985,
268
+ "step": 725
269
+ },
270
+ {
271
+ "epoch": 0.5315379163713678,
272
+ "grad_norm": 0.3489636182785034,
273
+ "learning_rate": 0.0001,
274
+ "loss": 2.7126,
275
+ "step": 750
276
+ },
277
+ {
278
+ "epoch": 0.5492558469170801,
279
+ "grad_norm": 0.5809263586997986,
280
+ "learning_rate": 0.0001,
281
+ "loss": 2.558,
282
+ "step": 775
283
+ },
284
+ {
285
+ "epoch": 0.5669737774627923,
286
+ "grad_norm": 0.5202405452728271,
287
+ "learning_rate": 0.0001,
288
+ "loss": 2.6444,
289
+ "step": 800
290
+ },
291
+ {
292
+ "epoch": 0.5669737774627923,
293
+ "eval_loss": 2.5701286792755127,
294
+ "eval_runtime": 32.5138,
295
+ "eval_samples_per_second": 9.165,
296
+ "eval_steps_per_second": 2.307,
297
+ "step": 800
298
+ },
299
+ {
300
+ "epoch": 0.5846917080085046,
301
+ "grad_norm": 0.4100205600261688,
302
+ "learning_rate": 0.0001,
303
+ "loss": 2.5452,
304
+ "step": 825
305
+ },
306
+ {
307
+ "epoch": 0.6024096385542169,
308
+ "grad_norm": 0.494243323802948,
309
+ "learning_rate": 0.0001,
310
+ "loss": 2.566,
311
+ "step": 850
312
+ },
313
+ {
314
+ "epoch": 0.6201275690999292,
315
+ "grad_norm": 0.41979244351387024,
316
+ "learning_rate": 0.0001,
317
+ "loss": 2.6271,
318
+ "step": 875
319
+ },
320
+ {
321
+ "epoch": 0.6378454996456414,
322
+ "grad_norm": 0.4423984885215759,
323
+ "learning_rate": 0.0001,
324
+ "loss": 2.6007,
325
+ "step": 900
326
+ },
327
+ {
328
+ "epoch": 0.6378454996456414,
329
+ "eval_loss": 2.554037094116211,
330
+ "eval_runtime": 32.562,
331
+ "eval_samples_per_second": 9.152,
332
+ "eval_steps_per_second": 2.303,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 0.6555634301913537,
337
+ "grad_norm": 0.49124300479888916,
338
+ "learning_rate": 0.0001,
339
+ "loss": 2.6006,
340
+ "step": 925
341
+ },
342
+ {
343
+ "epoch": 0.6732813607370659,
344
+ "grad_norm": 0.4179531931877136,
345
+ "learning_rate": 0.0001,
346
+ "loss": 2.634,
347
+ "step": 950
348
+ },
349
+ {
350
+ "epoch": 0.6909992912827781,
351
+ "grad_norm": 0.35822227597236633,
352
+ "learning_rate": 0.0001,
353
+ "loss": 2.5601,
354
+ "step": 975
355
+ },
356
+ {
357
+ "epoch": 0.7087172218284904,
358
+ "grad_norm": 0.44015607237815857,
359
+ "learning_rate": 0.0001,
360
+ "loss": 2.462,
361
+ "step": 1000
362
+ },
363
+ {
364
+ "epoch": 0.7087172218284904,
365
+ "eval_loss": 2.5418050289154053,
366
+ "eval_runtime": 32.5949,
367
+ "eval_samples_per_second": 9.143,
368
+ "eval_steps_per_second": 2.301,
369
+ "step": 1000
370
+ },
371
+ {
372
+ "epoch": 0.7264351523742026,
373
+ "grad_norm": 0.3899957835674286,
374
+ "learning_rate": 0.0001,
375
+ "loss": 2.538,
376
+ "step": 1025
377
+ },
378
+ {
379
+ "epoch": 0.744153082919915,
380
+ "grad_norm": 0.5775083303451538,
381
+ "learning_rate": 0.0001,
382
+ "loss": 2.5553,
383
+ "step": 1050
384
+ },
385
+ {
386
+ "epoch": 0.7618710134656272,
387
+ "grad_norm": 0.6485064625740051,
388
+ "learning_rate": 0.0001,
389
+ "loss": 2.5878,
390
+ "step": 1075
391
+ },
392
+ {
393
+ "epoch": 0.7795889440113395,
394
+ "grad_norm": 0.5756754875183105,
395
+ "learning_rate": 0.0001,
396
+ "loss": 2.5641,
397
+ "step": 1100
398
+ },
399
+ {
400
+ "epoch": 0.7795889440113395,
401
+ "eval_loss": 2.5314512252807617,
402
+ "eval_runtime": 32.673,
403
+ "eval_samples_per_second": 9.121,
404
+ "eval_steps_per_second": 2.295,
405
+ "step": 1100
406
+ },
407
+ {
408
+ "epoch": 0.7973068745570517,
409
+ "grad_norm": 0.46359655261039734,
410
+ "learning_rate": 0.0001,
411
+ "loss": 2.5062,
412
+ "step": 1125
413
+ },
414
+ {
415
+ "epoch": 0.815024805102764,
416
+ "grad_norm": 0.6569529175758362,
417
+ "learning_rate": 0.0001,
418
+ "loss": 2.5921,
419
+ "step": 1150
420
+ },
421
+ {
422
+ "epoch": 0.8327427356484762,
423
+ "grad_norm": 0.5323280096054077,
424
+ "learning_rate": 0.0001,
425
+ "loss": 2.4839,
426
+ "step": 1175
427
+ },
428
+ {
429
+ "epoch": 0.8504606661941885,
430
+ "grad_norm": 0.46269166469573975,
431
+ "learning_rate": 0.0001,
432
+ "loss": 2.4672,
433
+ "step": 1200
434
+ },
435
+ {
436
+ "epoch": 0.8504606661941885,
437
+ "eval_loss": 2.5238418579101562,
438
+ "eval_runtime": 32.6451,
439
+ "eval_samples_per_second": 9.128,
440
+ "eval_steps_per_second": 2.297,
441
+ "step": 1200
442
+ },
443
+ {
444
+ "epoch": 0.8681785967399008,
445
+ "grad_norm": 0.4144786596298218,
446
+ "learning_rate": 0.0001,
447
+ "loss": 2.5174,
448
+ "step": 1225
449
+ },
450
+ {
451
+ "epoch": 0.8858965272856131,
452
+ "grad_norm": 0.6366602182388306,
453
+ "learning_rate": 0.0001,
454
+ "loss": 2.512,
455
+ "step": 1250
456
+ },
457
+ {
458
+ "epoch": 0.9036144578313253,
459
+ "grad_norm": 0.490589439868927,
460
+ "learning_rate": 0.0001,
461
+ "loss": 2.5243,
462
+ "step": 1275
463
+ },
464
+ {
465
+ "epoch": 0.9213323883770376,
466
+ "grad_norm": 0.5880224108695984,
467
+ "learning_rate": 0.0001,
468
+ "loss": 2.5017,
469
+ "step": 1300
470
+ },
471
+ {
472
+ "epoch": 0.9213323883770376,
473
+ "eval_loss": 2.51455020904541,
474
+ "eval_runtime": 32.6422,
475
+ "eval_samples_per_second": 9.129,
476
+ "eval_steps_per_second": 2.298,
477
+ "step": 1300
478
+ },
479
+ {
480
+ "epoch": 0.9390503189227498,
481
+ "grad_norm": 0.5042569637298584,
482
+ "learning_rate": 0.0001,
483
+ "loss": 2.5238,
484
+ "step": 1325
485
+ },
486
+ {
487
+ "epoch": 0.9567682494684621,
488
+ "grad_norm": 0.3827306032180786,
489
+ "learning_rate": 0.0001,
490
+ "loss": 2.5785,
491
+ "step": 1350
492
+ },
493
+ {
494
+ "epoch": 0.9744861800141743,
495
+ "grad_norm": 0.4069231450557709,
496
+ "learning_rate": 0.0001,
497
+ "loss": 2.5312,
498
+ "step": 1375
499
+ },
500
+ {
501
+ "epoch": 0.9922041105598866,
502
+ "grad_norm": 0.5515998005867004,
503
+ "learning_rate": 0.0001,
504
+ "loss": 2.6389,
505
+ "step": 1400
506
+ },
507
+ {
508
+ "epoch": 0.9922041105598866,
509
+ "eval_loss": 2.5082926750183105,
510
+ "eval_runtime": 32.6146,
511
+ "eval_samples_per_second": 9.137,
512
+ "eval_steps_per_second": 2.3,
513
+ "step": 1400
514
+ },
515
+ {
516
+ "epoch": 1.009922041105599,
517
+ "grad_norm": 0.34569019079208374,
518
+ "learning_rate": 0.0001,
519
+ "loss": 2.4996,
520
+ "step": 1425
521
+ },
522
+ {
523
+ "epoch": 1.0276399716513112,
524
+ "grad_norm": 0.4051329493522644,
525
+ "learning_rate": 0.0001,
526
+ "loss": 2.4935,
527
+ "step": 1450
528
+ },
529
+ {
530
+ "epoch": 1.0453579021970234,
531
+ "grad_norm": 0.5617458820343018,
532
+ "learning_rate": 0.0001,
533
+ "loss": 2.5528,
534
+ "step": 1475
535
+ },
536
+ {
537
+ "epoch": 1.0630758327427356,
538
+ "grad_norm": 0.7300685048103333,
539
+ "learning_rate": 0.0001,
540
+ "loss": 2.4869,
541
+ "step": 1500
542
+ },
543
+ {
544
+ "epoch": 1.0630758327427356,
545
+ "eval_loss": 2.502122640609741,
546
+ "eval_runtime": 32.6521,
547
+ "eval_samples_per_second": 9.127,
548
+ "eval_steps_per_second": 2.297,
549
+ "step": 1500
550
+ },
551
+ {
552
+ "epoch": 1.080793763288448,
553
+ "grad_norm": 0.3424386978149414,
554
+ "learning_rate": 0.0001,
555
+ "loss": 2.5036,
556
+ "step": 1525
557
+ },
558
+ {
559
+ "epoch": 1.0985116938341601,
560
+ "grad_norm": 0.48358920216560364,
561
+ "learning_rate": 0.0001,
562
+ "loss": 2.4546,
563
+ "step": 1550
564
+ },
565
+ {
566
+ "epoch": 1.1162296243798724,
567
+ "grad_norm": 0.620297372341156,
568
+ "learning_rate": 0.0001,
569
+ "loss": 2.4505,
570
+ "step": 1575
571
+ },
572
+ {
573
+ "epoch": 1.1339475549255846,
574
+ "grad_norm": 0.41402992606163025,
575
+ "learning_rate": 0.0001,
576
+ "loss": 2.5302,
577
+ "step": 1600
578
+ },
579
+ {
580
+ "epoch": 1.1339475549255846,
581
+ "eval_loss": 2.4941623210906982,
582
+ "eval_runtime": 32.6717,
583
+ "eval_samples_per_second": 9.121,
584
+ "eval_steps_per_second": 2.296,
585
+ "step": 1600
586
+ },
587
+ {
588
+ "epoch": 1.1516654854712969,
589
+ "grad_norm": 0.6208804249763489,
590
+ "learning_rate": 0.0001,
591
+ "loss": 2.5035,
592
+ "step": 1625
593
+ },
594
+ {
595
+ "epoch": 1.1693834160170091,
596
+ "grad_norm": 0.3958500623703003,
597
+ "learning_rate": 0.0001,
598
+ "loss": 2.6296,
599
+ "step": 1650
600
+ },
601
+ {
602
+ "epoch": 1.1871013465627214,
603
+ "grad_norm": 0.5418444275856018,
604
+ "learning_rate": 0.0001,
605
+ "loss": 2.5194,
606
+ "step": 1675
607
+ },
608
+ {
609
+ "epoch": 1.2048192771084336,
610
+ "grad_norm": 0.7059912085533142,
611
+ "learning_rate": 0.0001,
612
+ "loss": 2.497,
613
+ "step": 1700
614
+ },
615
+ {
616
+ "epoch": 1.2048192771084336,
617
+ "eval_loss": 2.4886298179626465,
618
+ "eval_runtime": 32.6428,
619
+ "eval_samples_per_second": 9.129,
620
+ "eval_steps_per_second": 2.298,
621
+ "step": 1700
622
+ },
623
+ {
624
+ "epoch": 1.222537207654146,
625
+ "grad_norm": 0.5447824001312256,
626
+ "learning_rate": 0.0001,
627
+ "loss": 2.5098,
628
+ "step": 1725
629
+ },
630
+ {
631
+ "epoch": 1.2402551381998583,
632
+ "grad_norm": 0.508121132850647,
633
+ "learning_rate": 0.0001,
634
+ "loss": 2.4463,
635
+ "step": 1750
636
+ },
637
+ {
638
+ "epoch": 1.2579730687455706,
639
+ "grad_norm": 0.5141641497612,
640
+ "learning_rate": 0.0001,
641
+ "loss": 2.4407,
642
+ "step": 1775
643
+ },
644
+ {
645
+ "epoch": 1.2756909992912828,
646
+ "grad_norm": 0.7895596623420715,
647
+ "learning_rate": 0.0001,
648
+ "loss": 2.4965,
649
+ "step": 1800
650
+ },
651
+ {
652
+ "epoch": 1.2756909992912828,
653
+ "eval_loss": 2.4845895767211914,
654
+ "eval_runtime": 32.6353,
655
+ "eval_samples_per_second": 9.131,
656
+ "eval_steps_per_second": 2.298,
657
+ "step": 1800
658
+ },
659
+ {
660
+ "epoch": 1.293408929836995,
661
+ "grad_norm": 0.33818376064300537,
662
+ "learning_rate": 0.0001,
663
+ "loss": 2.3678,
664
+ "step": 1825
665
+ },
666
+ {
667
+ "epoch": 1.3111268603827073,
668
+ "grad_norm": 0.36670777201652527,
669
+ "learning_rate": 0.0001,
670
+ "loss": 2.4522,
671
+ "step": 1850
672
+ },
673
+ {
674
+ "epoch": 1.3288447909284196,
675
+ "grad_norm": 0.46683812141418457,
676
+ "learning_rate": 0.0001,
677
+ "loss": 2.5355,
678
+ "step": 1875
679
+ },
680
+ {
681
+ "epoch": 1.3465627214741318,
682
+ "grad_norm": 0.45993196964263916,
683
+ "learning_rate": 0.0001,
684
+ "loss": 2.5535,
685
+ "step": 1900
686
+ },
687
+ {
688
+ "epoch": 1.3465627214741318,
689
+ "eval_loss": 2.4782519340515137,
690
+ "eval_runtime": 32.6522,
691
+ "eval_samples_per_second": 9.126,
692
+ "eval_steps_per_second": 2.297,
693
+ "step": 1900
694
+ },
695
+ {
696
+ "epoch": 1.364280652019844,
697
+ "grad_norm": 0.4003180265426636,
698
+ "learning_rate": 0.0001,
699
+ "loss": 2.4353,
700
+ "step": 1925
701
+ },
702
+ {
703
+ "epoch": 1.3819985825655563,
704
+ "grad_norm": 0.4715386629104614,
705
+ "learning_rate": 0.0001,
706
+ "loss": 2.4086,
707
+ "step": 1950
708
+ },
709
+ {
710
+ "epoch": 1.3997165131112685,
711
+ "grad_norm": 0.4463669955730438,
712
+ "learning_rate": 0.0001,
713
+ "loss": 2.5299,
714
+ "step": 1975
715
+ },
716
+ {
717
+ "epoch": 1.4174344436569808,
718
+ "grad_norm": 0.6194645762443542,
719
+ "learning_rate": 0.0001,
720
+ "loss": 2.5747,
721
+ "step": 2000
722
+ },
723
+ {
724
+ "epoch": 1.4174344436569808,
725
+ "eval_loss": 2.4732108116149902,
726
+ "eval_runtime": 32.5875,
727
+ "eval_samples_per_second": 9.145,
728
+ "eval_steps_per_second": 2.301,
729
+ "step": 2000
730
+ },
731
+ {
732
+ "epoch": 1.4351523742026933,
733
+ "grad_norm": 0.4712235927581787,
734
+ "learning_rate": 0.0001,
735
+ "loss": 2.433,
736
+ "step": 2025
737
+ },
738
+ {
739
+ "epoch": 1.4528703047484055,
740
+ "grad_norm": 0.5619576573371887,
741
+ "learning_rate": 0.0001,
742
+ "loss": 2.4883,
743
+ "step": 2050
744
+ },
745
+ {
746
+ "epoch": 1.4705882352941178,
747
+ "grad_norm": 0.44078579545021057,
748
+ "learning_rate": 0.0001,
749
+ "loss": 2.5575,
750
+ "step": 2075
751
+ },
752
+ {
753
+ "epoch": 1.48830616583983,
754
+ "grad_norm": 0.44347578287124634,
755
+ "learning_rate": 0.0001,
756
+ "loss": 2.4534,
757
+ "step": 2100
758
+ },
759
+ {
760
+ "epoch": 1.48830616583983,
761
+ "eval_loss": 2.467924118041992,
762
+ "eval_runtime": 32.6469,
763
+ "eval_samples_per_second": 9.128,
764
+ "eval_steps_per_second": 2.297,
765
+ "step": 2100
766
+ },
767
+ {
768
+ "epoch": 1.5060240963855422,
769
+ "grad_norm": 0.4806898832321167,
770
+ "learning_rate": 0.0001,
771
+ "loss": 2.4605,
772
+ "step": 2125
773
+ },
774
+ {
775
+ "epoch": 1.5237420269312545,
776
+ "grad_norm": 0.4710817337036133,
777
+ "learning_rate": 0.0001,
778
+ "loss": 2.5124,
779
+ "step": 2150
780
+ },
781
+ {
782
+ "epoch": 1.5414599574769667,
783
+ "grad_norm": 0.4697023630142212,
784
+ "learning_rate": 0.0001,
785
+ "loss": 2.5251,
786
+ "step": 2175
787
+ },
788
+ {
789
+ "epoch": 1.559177888022679,
790
+ "grad_norm": 0.4275937080383301,
791
+ "learning_rate": 0.0001,
792
+ "loss": 2.4909,
793
+ "step": 2200
794
+ },
795
+ {
796
+ "epoch": 1.559177888022679,
797
+ "eval_loss": 2.465703248977661,
798
+ "eval_runtime": 32.66,
799
+ "eval_samples_per_second": 9.124,
800
+ "eval_steps_per_second": 2.296,
801
+ "step": 2200
802
+ },
803
+ {
804
+ "epoch": 1.5768958185683912,
805
+ "grad_norm": 0.6271502375602722,
806
+ "learning_rate": 0.0001,
807
+ "loss": 2.4563,
808
+ "step": 2225
809
+ },
810
+ {
811
+ "epoch": 1.5946137491141035,
812
+ "grad_norm": 0.5484246015548706,
813
+ "learning_rate": 0.0001,
814
+ "loss": 2.4781,
815
+ "step": 2250
816
+ },
817
+ {
818
+ "epoch": 1.6123316796598157,
819
+ "grad_norm": 0.7340563535690308,
820
+ "learning_rate": 0.0001,
821
+ "loss": 2.5204,
822
+ "step": 2275
823
+ },
824
+ {
825
+ "epoch": 1.630049610205528,
826
+ "grad_norm": 0.43257761001586914,
827
+ "learning_rate": 0.0001,
828
+ "loss": 2.5192,
829
+ "step": 2300
830
+ },
831
+ {
832
+ "epoch": 1.630049610205528,
833
+ "eval_loss": 2.461730718612671,
834
+ "eval_runtime": 32.5818,
835
+ "eval_samples_per_second": 9.146,
836
+ "eval_steps_per_second": 2.302,
837
+ "step": 2300
838
+ },
839
+ {
840
+ "epoch": 1.6477675407512402,
841
+ "grad_norm": 0.7394423484802246,
842
+ "learning_rate": 0.0001,
843
+ "loss": 2.5862,
844
+ "step": 2325
845
+ },
846
+ {
847
+ "epoch": 1.6654854712969525,
848
+ "grad_norm": 0.48102429509162903,
849
+ "learning_rate": 0.0001,
850
+ "loss": 2.519,
851
+ "step": 2350
852
+ },
853
+ {
854
+ "epoch": 1.6832034018426647,
855
+ "grad_norm": 0.5994846820831299,
856
+ "learning_rate": 0.0001,
857
+ "loss": 2.4566,
858
+ "step": 2375
859
+ },
860
+ {
861
+ "epoch": 1.700921332388377,
862
+ "grad_norm": 0.4805436134338379,
863
+ "learning_rate": 0.0001,
864
+ "loss": 2.4271,
865
+ "step": 2400
866
+ },
867
+ {
868
+ "epoch": 1.700921332388377,
869
+ "eval_loss": 2.457273006439209,
870
+ "eval_runtime": 32.6806,
871
+ "eval_samples_per_second": 9.119,
872
+ "eval_steps_per_second": 2.295,
873
+ "step": 2400
874
+ },
875
+ {
876
+ "epoch": 1.7186392629340892,
877
+ "grad_norm": 0.6208567023277283,
878
+ "learning_rate": 0.0001,
879
+ "loss": 2.3581,
880
+ "step": 2425
881
+ },
882
+ {
883
+ "epoch": 1.7363571934798014,
884
+ "grad_norm": 0.44081413745880127,
885
+ "learning_rate": 0.0001,
886
+ "loss": 2.4295,
887
+ "step": 2450
888
+ },
889
+ {
890
+ "epoch": 1.7540751240255137,
891
+ "grad_norm": 0.4629543721675873,
892
+ "learning_rate": 0.0001,
893
+ "loss": 2.4569,
894
+ "step": 2475
895
+ },
896
+ {
897
+ "epoch": 1.771793054571226,
898
+ "grad_norm": 0.518991231918335,
899
+ "learning_rate": 0.0001,
900
+ "loss": 2.4855,
901
+ "step": 2500
902
+ },
903
+ {
904
+ "epoch": 1.771793054571226,
905
+ "eval_loss": 2.454190731048584,
906
+ "eval_runtime": 32.6137,
907
+ "eval_samples_per_second": 9.137,
908
+ "eval_steps_per_second": 2.3,
909
+ "step": 2500
910
+ },
911
+ {
912
+ "epoch": 1.7895109851169382,
913
+ "grad_norm": 0.6166653037071228,
914
+ "learning_rate": 0.0001,
915
+ "loss": 2.4761,
916
+ "step": 2525
917
+ },
918
+ {
919
+ "epoch": 1.8072289156626506,
920
+ "grad_norm": 0.5490785241127014,
921
+ "learning_rate": 0.0001,
922
+ "loss": 2.5232,
923
+ "step": 2550
924
+ },
925
+ {
926
+ "epoch": 1.824946846208363,
927
+ "grad_norm": 0.6279402375221252,
928
+ "learning_rate": 0.0001,
929
+ "loss": 2.4425,
930
+ "step": 2575
931
+ },
932
+ {
933
+ "epoch": 1.8426647767540751,
934
+ "grad_norm": 0.606396496295929,
935
+ "learning_rate": 0.0001,
936
+ "loss": 2.4599,
937
+ "step": 2600
938
+ },
939
+ {
940
+ "epoch": 1.8426647767540751,
941
+ "eval_loss": 2.4530327320098877,
942
+ "eval_runtime": 32.6381,
943
+ "eval_samples_per_second": 9.13,
944
+ "eval_steps_per_second": 2.298,
945
+ "step": 2600
946
+ },
947
+ {
948
+ "epoch": 1.8603827072997874,
949
+ "grad_norm": 0.5355327129364014,
950
+ "learning_rate": 0.0001,
951
+ "loss": 2.5615,
952
+ "step": 2625
953
+ },
954
+ {
955
+ "epoch": 1.8781006378454996,
956
+ "grad_norm": 0.3971356451511383,
957
+ "learning_rate": 0.0001,
958
+ "loss": 2.445,
959
+ "step": 2650
960
+ },
961
+ {
962
+ "epoch": 1.8958185683912119,
963
+ "grad_norm": 0.48701226711273193,
964
+ "learning_rate": 0.0001,
965
+ "loss": 2.5405,
966
+ "step": 2675
967
+ },
968
+ {
969
+ "epoch": 1.9135364989369241,
970
+ "grad_norm": 0.5117021203041077,
971
+ "learning_rate": 0.0001,
972
+ "loss": 2.4482,
973
+ "step": 2700
974
+ },
975
+ {
976
+ "epoch": 1.9135364989369241,
977
+ "eval_loss": 2.444391965866089,
978
+ "eval_runtime": 32.6264,
979
+ "eval_samples_per_second": 9.134,
980
+ "eval_steps_per_second": 2.299,
981
+ "step": 2700
982
+ },
983
+ {
984
+ "epoch": 1.9312544294826366,
985
+ "grad_norm": 0.40752479434013367,
986
+ "learning_rate": 0.0001,
987
+ "loss": 2.421,
988
+ "step": 2725
989
+ },
990
+ {
991
+ "epoch": 1.9489723600283488,
992
+ "grad_norm": 0.4466327428817749,
993
+ "learning_rate": 0.0001,
994
+ "loss": 2.4286,
995
+ "step": 2750
996
+ },
997
+ {
998
+ "epoch": 1.966690290574061,
999
+ "grad_norm": 0.35452115535736084,
1000
+ "learning_rate": 0.0001,
1001
+ "loss": 2.4265,
1002
+ "step": 2775
1003
+ },
1004
+ {
1005
+ "epoch": 1.9844082211197733,
1006
+ "grad_norm": 0.7978025674819946,
1007
+ "learning_rate": 0.0001,
1008
+ "loss": 2.493,
1009
+ "step": 2800
1010
+ },
1011
+ {
1012
+ "epoch": 1.9844082211197733,
1013
+ "eval_loss": 2.444624423980713,
1014
+ "eval_runtime": 32.6679,
1015
+ "eval_samples_per_second": 9.122,
1016
+ "eval_steps_per_second": 2.296,
1017
+ "step": 2800
1018
+ },
1019
+ {
1020
+ "epoch": 2.0021261516654856,
1021
+ "grad_norm": 0.46374988555908203,
1022
+ "learning_rate": 0.0001,
1023
+ "loss": 2.5357,
1024
+ "step": 2825
1025
+ },
1026
+ {
1027
+ "epoch": 2.019844082211198,
1028
+ "grad_norm": 0.4631684422492981,
1029
+ "learning_rate": 0.0001,
1030
+ "loss": 2.4015,
1031
+ "step": 2850
1032
+ },
1033
+ {
1034
+ "epoch": 2.03756201275691,
1035
+ "grad_norm": 0.4475260376930237,
1036
+ "learning_rate": 0.0001,
1037
+ "loss": 2.3658,
1038
+ "step": 2875
1039
+ },
1040
+ {
1041
+ "epoch": 2.0552799433026223,
1042
+ "grad_norm": 0.47790655493736267,
1043
+ "learning_rate": 0.0001,
1044
+ "loss": 2.3527,
1045
+ "step": 2900
1046
+ },
1047
+ {
1048
+ "epoch": 2.0552799433026223,
1049
+ "eval_loss": 2.441364049911499,
1050
+ "eval_runtime": 32.6599,
1051
+ "eval_samples_per_second": 9.124,
1052
+ "eval_steps_per_second": 2.296,
1053
+ "step": 2900
1054
+ },
1055
+ {
1056
+ "epoch": 2.0729978738483346,
1057
+ "grad_norm": 0.5602151155471802,
1058
+ "learning_rate": 0.0001,
1059
+ "loss": 2.4763,
1060
+ "step": 2925
1061
+ },
1062
+ {
1063
+ "epoch": 2.090715804394047,
1064
+ "grad_norm": 0.37178730964660645,
1065
+ "learning_rate": 0.0001,
1066
+ "loss": 2.4431,
1067
+ "step": 2950
1068
+ },
1069
+ {
1070
+ "epoch": 2.108433734939759,
1071
+ "grad_norm": 0.47269827127456665,
1072
+ "learning_rate": 0.0001,
1073
+ "loss": 2.5528,
1074
+ "step": 2975
1075
+ },
1076
+ {
1077
+ "epoch": 2.1261516654854713,
1078
+ "grad_norm": 0.5636725425720215,
1079
+ "learning_rate": 0.0001,
1080
+ "loss": 2.5243,
1081
+ "step": 3000
1082
+ },
1083
+ {
1084
+ "epoch": 2.1261516654854713,
1085
+ "eval_loss": 2.4375791549682617,
1086
+ "eval_runtime": 32.6693,
1087
+ "eval_samples_per_second": 9.122,
1088
+ "eval_steps_per_second": 2.296,
1089
+ "step": 3000
1090
+ },
1091
+ {
1092
+ "epoch": 2.1438695960311835,
1093
+ "grad_norm": 0.5602971315383911,
1094
+ "learning_rate": 0.0001,
1095
+ "loss": 2.4726,
1096
+ "step": 3025
1097
+ },
1098
+ {
1099
+ "epoch": 2.161587526576896,
1100
+ "grad_norm": 0.7102957367897034,
1101
+ "learning_rate": 0.0001,
1102
+ "loss": 2.4513,
1103
+ "step": 3050
1104
+ },
1105
+ {
1106
+ "epoch": 2.179305457122608,
1107
+ "grad_norm": 0.5028663277626038,
1108
+ "learning_rate": 0.0001,
1109
+ "loss": 2.4038,
1110
+ "step": 3075
1111
+ },
1112
+ {
1113
+ "epoch": 2.1970233876683203,
1114
+ "grad_norm": 0.5358246564865112,
1115
+ "learning_rate": 0.0001,
1116
+ "loss": 2.4644,
1117
+ "step": 3100
1118
+ },
1119
+ {
1120
+ "epoch": 2.1970233876683203,
1121
+ "eval_loss": 2.433030605316162,
1122
+ "eval_runtime": 32.6486,
1123
+ "eval_samples_per_second": 9.128,
1124
+ "eval_steps_per_second": 2.297,
1125
+ "step": 3100
1126
+ },
1127
+ {
1128
+ "epoch": 2.2147413182140325,
1129
+ "grad_norm": 0.5380859971046448,
1130
+ "learning_rate": 0.0001,
1131
+ "loss": 2.4342,
1132
+ "step": 3125
1133
+ },
1134
+ {
1135
+ "epoch": 2.2324592487597448,
1136
+ "grad_norm": 0.8703417181968689,
1137
+ "learning_rate": 0.0001,
1138
+ "loss": 2.4462,
1139
+ "step": 3150
1140
+ },
1141
+ {
1142
+ "epoch": 2.250177179305457,
1143
+ "grad_norm": 0.44465309381484985,
1144
+ "learning_rate": 0.0001,
1145
+ "loss": 2.4428,
1146
+ "step": 3175
1147
+ },
1148
+ {
1149
+ "epoch": 2.2678951098511693,
1150
+ "grad_norm": 0.4541110396385193,
1151
+ "learning_rate": 0.0001,
1152
+ "loss": 2.386,
1153
+ "step": 3200
1154
+ },
1155
+ {
1156
+ "epoch": 2.2678951098511693,
1157
+ "eval_loss": 2.4308454990386963,
1158
+ "eval_runtime": 32.6678,
1159
+ "eval_samples_per_second": 9.122,
1160
+ "eval_steps_per_second": 2.296,
1161
+ "step": 3200
1162
+ },
1163
+ {
1164
+ "epoch": 2.2856130403968815,
1165
+ "grad_norm": 0.6527560949325562,
1166
+ "learning_rate": 0.0001,
1167
+ "loss": 2.3964,
1168
+ "step": 3225
1169
+ },
1170
+ {
1171
+ "epoch": 2.3033309709425938,
1172
+ "grad_norm": 0.5541362762451172,
1173
+ "learning_rate": 0.0001,
1174
+ "loss": 2.4817,
1175
+ "step": 3250
1176
+ },
1177
+ {
1178
+ "epoch": 2.321048901488306,
1179
+ "grad_norm": 0.5997689366340637,
1180
+ "learning_rate": 0.0001,
1181
+ "loss": 2.496,
1182
+ "step": 3275
1183
+ },
1184
+ {
1185
+ "epoch": 2.3387668320340183,
1186
+ "grad_norm": 0.5446316003799438,
1187
+ "learning_rate": 0.0001,
1188
+ "loss": 2.3762,
1189
+ "step": 3300
1190
+ },
1191
+ {
1192
+ "epoch": 2.3387668320340183,
1193
+ "eval_loss": 2.428109645843506,
1194
+ "eval_runtime": 32.6878,
1195
+ "eval_samples_per_second": 9.117,
1196
+ "eval_steps_per_second": 2.294,
1197
+ "step": 3300
1198
+ },
1199
+ {
1200
+ "epoch": 2.3564847625797305,
1201
+ "grad_norm": 0.3934761881828308,
1202
+ "learning_rate": 0.0001,
1203
+ "loss": 2.5195,
1204
+ "step": 3325
1205
+ },
1206
+ {
1207
+ "epoch": 2.3742026931254427,
1208
+ "grad_norm": 0.47348254919052124,
1209
+ "learning_rate": 0.0001,
1210
+ "loss": 2.5055,
1211
+ "step": 3350
1212
+ },
1213
+ {
1214
+ "epoch": 2.3919206236711554,
1215
+ "grad_norm": 0.5292345881462097,
1216
+ "learning_rate": 0.0001,
1217
+ "loss": 2.3635,
1218
+ "step": 3375
1219
+ },
1220
+ {
1221
+ "epoch": 2.4096385542168672,
1222
+ "grad_norm": 0.6056796312332153,
1223
+ "learning_rate": 0.0001,
1224
+ "loss": 2.3827,
1225
+ "step": 3400
1226
+ },
1227
+ {
1228
+ "epoch": 2.4096385542168672,
1229
+ "eval_loss": 2.4244818687438965,
1230
+ "eval_runtime": 32.5864,
1231
+ "eval_samples_per_second": 9.145,
1232
+ "eval_steps_per_second": 2.302,
1233
+ "step": 3400
1234
+ },
1235
+ {
1236
+ "epoch": 2.42735648476258,
1237
+ "grad_norm": 0.2907162010669708,
1238
+ "learning_rate": 0.0001,
1239
+ "loss": 2.3354,
1240
+ "step": 3425
1241
+ },
1242
+ {
1243
+ "epoch": 2.445074415308292,
1244
+ "grad_norm": 0.43741077184677124,
1245
+ "learning_rate": 0.0001,
1246
+ "loss": 2.4199,
1247
+ "step": 3450
1248
+ },
1249
+ {
1250
+ "epoch": 2.4627923458540044,
1251
+ "grad_norm": 0.36141782999038696,
1252
+ "learning_rate": 0.0001,
1253
+ "loss": 2.4165,
1254
+ "step": 3475
1255
+ },
1256
+ {
1257
+ "epoch": 2.4805102763997167,
1258
+ "grad_norm": 0.5461854338645935,
1259
+ "learning_rate": 0.0001,
1260
+ "loss": 2.3487,
1261
+ "step": 3500
1262
+ },
1263
+ {
1264
+ "epoch": 2.4805102763997167,
1265
+ "eval_loss": 2.4221482276916504,
1266
+ "eval_runtime": 32.6681,
1267
+ "eval_samples_per_second": 9.122,
1268
+ "eval_steps_per_second": 2.296,
1269
+ "step": 3500
1270
+ },
1271
+ {
1272
+ "epoch": 2.498228206945429,
1273
+ "grad_norm": 0.61762934923172,
1274
+ "learning_rate": 0.0001,
1275
+ "loss": 2.4136,
1276
+ "step": 3525
1277
+ },
1278
+ {
1279
+ "epoch": 2.515946137491141,
1280
+ "grad_norm": 0.41114169359207153,
1281
+ "learning_rate": 0.0001,
1282
+ "loss": 2.3587,
1283
+ "step": 3550
1284
+ },
1285
+ {
1286
+ "epoch": 2.5336640680368534,
1287
+ "grad_norm": 0.5726279020309448,
1288
+ "learning_rate": 0.0001,
1289
+ "loss": 2.5009,
1290
+ "step": 3575
1291
+ },
1292
+ {
1293
+ "epoch": 2.5513819985825656,
1294
+ "grad_norm": 0.4807787239551544,
1295
+ "learning_rate": 0.0001,
1296
+ "loss": 2.4737,
1297
+ "step": 3600
1298
+ },
1299
+ {
1300
+ "epoch": 2.5513819985825656,
1301
+ "eval_loss": 2.4191701412200928,
1302
+ "eval_runtime": 32.6743,
1303
+ "eval_samples_per_second": 9.12,
1304
+ "eval_steps_per_second": 2.295,
1305
+ "step": 3600
1306
+ },
1307
+ {
1308
+ "epoch": 2.569099929128278,
1309
+ "grad_norm": 0.5931722521781921,
1310
+ "learning_rate": 0.0001,
1311
+ "loss": 2.4178,
1312
+ "step": 3625
1313
+ },
1314
+ {
1315
+ "epoch": 2.58681785967399,
1316
+ "grad_norm": 0.4658395051956177,
1317
+ "learning_rate": 0.0001,
1318
+ "loss": 2.5162,
1319
+ "step": 3650
1320
+ },
1321
+ {
1322
+ "epoch": 2.6045357902197024,
1323
+ "grad_norm": 0.5829235315322876,
1324
+ "learning_rate": 0.0001,
1325
+ "loss": 2.3402,
1326
+ "step": 3675
1327
+ },
1328
+ {
1329
+ "epoch": 2.6222537207654146,
1330
+ "grad_norm": 0.6382436156272888,
1331
+ "learning_rate": 0.0001,
1332
+ "loss": 2.4907,
1333
+ "step": 3700
1334
+ },
1335
+ {
1336
+ "epoch": 2.6222537207654146,
1337
+ "eval_loss": 2.417147397994995,
1338
+ "eval_runtime": 32.6259,
1339
+ "eval_samples_per_second": 9.134,
1340
+ "eval_steps_per_second": 2.299,
1341
+ "step": 3700
1342
+ },
1343
+ {
1344
+ "epoch": 2.639971651311127,
1345
+ "grad_norm": 0.578823983669281,
1346
+ "learning_rate": 0.0001,
1347
+ "loss": 2.4281,
1348
+ "step": 3725
1349
+ },
1350
+ {
1351
+ "epoch": 2.657689581856839,
1352
+ "grad_norm": 0.5311617255210876,
1353
+ "learning_rate": 0.0001,
1354
+ "loss": 2.4315,
1355
+ "step": 3750
1356
+ },
1357
+ {
1358
+ "epoch": 2.6754075124025514,
1359
+ "grad_norm": 0.4713846743106842,
1360
+ "learning_rate": 0.0001,
1361
+ "loss": 2.4268,
1362
+ "step": 3775
1363
+ },
1364
+ {
1365
+ "epoch": 2.6931254429482636,
1366
+ "grad_norm": 0.7472273111343384,
1367
+ "learning_rate": 0.0001,
1368
+ "loss": 2.3967,
1369
+ "step": 3800
1370
+ },
1371
+ {
1372
+ "epoch": 2.6931254429482636,
1373
+ "eval_loss": 2.415893077850342,
1374
+ "eval_runtime": 32.6303,
1375
+ "eval_samples_per_second": 9.133,
1376
+ "eval_steps_per_second": 2.298,
1377
+ "step": 3800
1378
+ },
1379
+ {
1380
+ "epoch": 2.710843373493976,
1381
+ "grad_norm": 0.5875506401062012,
1382
+ "learning_rate": 0.0001,
1383
+ "loss": 2.3541,
1384
+ "step": 3825
1385
+ },
1386
+ {
1387
+ "epoch": 2.728561304039688,
1388
+ "grad_norm": 0.38152602314949036,
1389
+ "learning_rate": 0.0001,
1390
+ "loss": 2.4623,
1391
+ "step": 3850
1392
+ },
1393
+ {
1394
+ "epoch": 2.7462792345854004,
1395
+ "grad_norm": 0.35034802556037903,
1396
+ "learning_rate": 0.0001,
1397
+ "loss": 2.4392,
1398
+ "step": 3875
1399
+ },
1400
+ {
1401
+ "epoch": 2.7639971651311126,
1402
+ "grad_norm": 0.3683781027793884,
1403
+ "learning_rate": 0.0001,
1404
+ "loss": 2.4772,
1405
+ "step": 3900
1406
+ },
1407
+ {
1408
+ "epoch": 2.7639971651311126,
1409
+ "eval_loss": 2.414635181427002,
1410
+ "eval_runtime": 32.6546,
1411
+ "eval_samples_per_second": 9.126,
1412
+ "eval_steps_per_second": 2.297,
1413
+ "step": 3900
1414
+ },
1415
+ {
1416
+ "epoch": 2.781715095676825,
1417
+ "grad_norm": 0.632203221321106,
1418
+ "learning_rate": 0.0001,
1419
+ "loss": 2.48,
1420
+ "step": 3925
1421
+ },
1422
+ {
1423
+ "epoch": 2.799433026222537,
1424
+ "grad_norm": 0.4688514173030853,
1425
+ "learning_rate": 0.0001,
1426
+ "loss": 2.4058,
1427
+ "step": 3950
1428
+ },
1429
+ {
1430
+ "epoch": 2.8171509567682493,
1431
+ "grad_norm": 0.3703823685646057,
1432
+ "learning_rate": 0.0001,
1433
+ "loss": 2.3802,
1434
+ "step": 3975
1435
+ },
1436
+ {
1437
+ "epoch": 2.8348688873139616,
1438
+ "grad_norm": 0.4395906329154968,
1439
+ "learning_rate": 0.0001,
1440
+ "loss": 2.4114,
1441
+ "step": 4000
1442
+ },
1443
+ {
1444
+ "epoch": 2.8348688873139616,
1445
+ "eval_loss": 2.4105727672576904,
1446
+ "eval_runtime": 32.6241,
1447
+ "eval_samples_per_second": 9.134,
1448
+ "eval_steps_per_second": 2.299,
1449
+ "step": 4000
1450
+ },
1451
+ {
1452
+ "epoch": 2.852586817859674,
1453
+ "grad_norm": 0.3975163996219635,
1454
+ "learning_rate": 0.0001,
1455
+ "loss": 2.4189,
1456
+ "step": 4025
1457
+ },
1458
+ {
1459
+ "epoch": 2.8703047484053865,
1460
+ "grad_norm": 0.37457334995269775,
1461
+ "learning_rate": 0.0001,
1462
+ "loss": 2.3946,
1463
+ "step": 4050
1464
+ },
1465
+ {
1466
+ "epoch": 2.8880226789510983,
1467
+ "grad_norm": 0.3786819279193878,
1468
+ "learning_rate": 0.0001,
1469
+ "loss": 2.4683,
1470
+ "step": 4075
1471
+ },
1472
+ {
1473
+ "epoch": 2.905740609496811,
1474
+ "grad_norm": 0.633921205997467,
1475
+ "learning_rate": 0.0001,
1476
+ "loss": 2.4017,
1477
+ "step": 4100
1478
+ },
1479
+ {
1480
+ "epoch": 2.905740609496811,
1481
+ "eval_loss": 2.406451463699341,
1482
+ "eval_runtime": 32.6727,
1483
+ "eval_samples_per_second": 9.121,
1484
+ "eval_steps_per_second": 2.295,
1485
+ "step": 4100
1486
+ },
1487
+ {
1488
+ "epoch": 2.923458540042523,
1489
+ "grad_norm": 0.9005404710769653,
1490
+ "learning_rate": 0.0001,
1491
+ "loss": 2.4099,
1492
+ "step": 4125
1493
+ },
1494
+ {
1495
+ "epoch": 2.9411764705882355,
1496
+ "grad_norm": 0.5802463293075562,
1497
+ "learning_rate": 0.0001,
1498
+ "loss": 2.4068,
1499
+ "step": 4150
1500
+ },
1501
+ {
1502
+ "epoch": 2.9588944011339473,
1503
+ "grad_norm": 0.3155713975429535,
1504
+ "learning_rate": 0.0001,
1505
+ "loss": 2.3682,
1506
+ "step": 4175
1507
+ },
1508
+ {
1509
+ "epoch": 2.97661233167966,
1510
+ "grad_norm": 0.4876560568809509,
1511
+ "learning_rate": 0.0001,
1512
+ "loss": 2.3477,
1513
+ "step": 4200
1514
+ },
1515
+ {
1516
+ "epoch": 2.97661233167966,
1517
+ "eval_loss": 2.405850648880005,
1518
+ "eval_runtime": 32.6084,
1519
+ "eval_samples_per_second": 9.139,
1520
+ "eval_steps_per_second": 2.3,
1521
+ "step": 4200
1522
+ },
1523
+ {
1524
+ "epoch": 2.9943302622253722,
1525
+ "grad_norm": 0.49624720215797424,
1526
+ "learning_rate": 0.0001,
1527
+ "loss": 2.445,
1528
+ "step": 4225
1529
+ },
1530
+ {
1531
+ "epoch": 3.0,
1532
+ "step": 4233,
1533
+ "total_flos": 9152314711474176.0,
1534
+ "train_loss": 2.5398848939386913,
1535
+ "train_runtime": 5263.4986,
1536
+ "train_samples_per_second": 3.217,
1537
+ "train_steps_per_second": 0.804
1538
+ }
1539
+ ],
1540
+ "logging_steps": 25,
1541
+ "max_steps": 4233,
1542
+ "num_input_tokens_seen": 0,
1543
+ "num_train_epochs": 3,
1544
+ "save_steps": 100,
1545
+ "stateful_callbacks": {
1546
+ "TrainerControl": {
1547
+ "args": {
1548
+ "should_epoch_stop": false,
1549
+ "should_evaluate": false,
1550
+ "should_log": false,
1551
+ "should_save": true,
1552
+ "should_training_stop": true
1553
+ },
1554
+ "attributes": {}
1555
+ }
1556
+ },
1557
+ "total_flos": 9152314711474176.0,
1558
+ "train_batch_size": 4,
1559
+ "trial_name": null,
1560
+ "trial_params": null
1561
+ }