Karimsliti commited on
Commit
24db02e
1 Parent(s): 922952b

Upload trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +461 -0
trainer_state.json ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0202020202020203,
5
+ "eval_steps": 20,
6
+ "global_step": 400,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.050505050505050504,
13
+ "grad_norm": 0.1097809225320816,
14
+ "learning_rate": 2.9999999999999997e-05,
15
+ "loss": 1.2164,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.10101010101010101,
20
+ "grad_norm": 0.16844922304153442,
21
+ "learning_rate": 5.9999999999999995e-05,
22
+ "loss": 1.1774,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.10101010101010101,
27
+ "eval_loss": 1.1594151258468628,
28
+ "eval_runtime": 295.5081,
29
+ "eval_samples_per_second": 2.382,
30
+ "eval_steps_per_second": 0.298,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.15151515151515152,
35
+ "grad_norm": 0.3893352150917053,
36
+ "learning_rate": 8.999999999999999e-05,
37
+ "loss": 0.9942,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 0.20202020202020202,
42
+ "grad_norm": 0.4363686740398407,
43
+ "learning_rate": 0.00011999999999999999,
44
+ "loss": 0.7259,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 0.20202020202020202,
49
+ "eval_loss": 0.6028452515602112,
50
+ "eval_runtime": 294.8084,
51
+ "eval_samples_per_second": 2.388,
52
+ "eval_steps_per_second": 0.298,
53
+ "step": 40
54
+ },
55
+ {
56
+ "epoch": 0.25252525252525254,
57
+ "grad_norm": 0.40602898597717285,
58
+ "learning_rate": 0.00015,
59
+ "loss": 0.4454,
60
+ "step": 50
61
+ },
62
+ {
63
+ "epoch": 0.30303030303030304,
64
+ "grad_norm": 0.2177702635526657,
65
+ "learning_rate": 0.00017999999999999998,
66
+ "loss": 0.5139,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 0.30303030303030304,
71
+ "eval_loss": 0.3834875822067261,
72
+ "eval_runtime": 295.1108,
73
+ "eval_samples_per_second": 2.386,
74
+ "eval_steps_per_second": 0.298,
75
+ "step": 60
76
+ },
77
+ {
78
+ "epoch": 0.35353535353535354,
79
+ "grad_norm": 0.1777501404285431,
80
+ "learning_rate": 0.00020999999999999998,
81
+ "loss": 0.4096,
82
+ "step": 70
83
+ },
84
+ {
85
+ "epoch": 0.40404040404040403,
86
+ "grad_norm": 0.1749625951051712,
87
+ "learning_rate": 0.00023999999999999998,
88
+ "loss": 0.2589,
89
+ "step": 80
90
+ },
91
+ {
92
+ "epoch": 0.40404040404040403,
93
+ "eval_loss": 0.28596043586730957,
94
+ "eval_runtime": 294.9045,
95
+ "eval_samples_per_second": 2.387,
96
+ "eval_steps_per_second": 0.298,
97
+ "step": 80
98
+ },
99
+ {
100
+ "epoch": 0.45454545454545453,
101
+ "grad_norm": 0.2074197232723236,
102
+ "learning_rate": 0.00027,
103
+ "loss": 0.1932,
104
+ "step": 90
105
+ },
106
+ {
107
+ "epoch": 0.5050505050505051,
108
+ "grad_norm": 0.36931025981903076,
109
+ "learning_rate": 0.0003,
110
+ "loss": 0.2177,
111
+ "step": 100
112
+ },
113
+ {
114
+ "epoch": 0.5050505050505051,
115
+ "eval_loss": 0.25518086552619934,
116
+ "eval_runtime": 295.271,
117
+ "eval_samples_per_second": 2.384,
118
+ "eval_steps_per_second": 0.298,
119
+ "step": 100
120
+ },
121
+ {
122
+ "epoch": 0.5555555555555556,
123
+ "grad_norm": 0.20555801689624786,
124
+ "learning_rate": 0.00029,
125
+ "loss": 0.299,
126
+ "step": 110
127
+ },
128
+ {
129
+ "epoch": 0.6060606060606061,
130
+ "grad_norm": 0.14223843812942505,
131
+ "learning_rate": 0.00028,
132
+ "loss": 0.2508,
133
+ "step": 120
134
+ },
135
+ {
136
+ "epoch": 0.6060606060606061,
137
+ "eval_loss": 0.2106790542602539,
138
+ "eval_runtime": 294.889,
139
+ "eval_samples_per_second": 2.387,
140
+ "eval_steps_per_second": 0.298,
141
+ "step": 120
142
+ },
143
+ {
144
+ "epoch": 0.6565656565656566,
145
+ "grad_norm": 0.14103205502033234,
146
+ "learning_rate": 0.00027,
147
+ "loss": 0.1666,
148
+ "step": 130
149
+ },
150
+ {
151
+ "epoch": 0.7070707070707071,
152
+ "grad_norm": 0.1655016392469406,
153
+ "learning_rate": 0.00026,
154
+ "loss": 0.1511,
155
+ "step": 140
156
+ },
157
+ {
158
+ "epoch": 0.7070707070707071,
159
+ "eval_loss": 0.20343247056007385,
160
+ "eval_runtime": 294.7497,
161
+ "eval_samples_per_second": 2.388,
162
+ "eval_steps_per_second": 0.299,
163
+ "step": 140
164
+ },
165
+ {
166
+ "epoch": 0.7575757575757576,
167
+ "grad_norm": 0.33550018072128296,
168
+ "learning_rate": 0.00025,
169
+ "loss": 0.1798,
170
+ "step": 150
171
+ },
172
+ {
173
+ "epoch": 0.8080808080808081,
174
+ "grad_norm": 0.18753309547901154,
175
+ "learning_rate": 0.00023999999999999998,
176
+ "loss": 0.245,
177
+ "step": 160
178
+ },
179
+ {
180
+ "epoch": 0.8080808080808081,
181
+ "eval_loss": 0.19205082952976227,
182
+ "eval_runtime": 294.8122,
183
+ "eval_samples_per_second": 2.388,
184
+ "eval_steps_per_second": 0.298,
185
+ "step": 160
186
+ },
187
+ {
188
+ "epoch": 0.8585858585858586,
189
+ "grad_norm": 0.18931691348552704,
190
+ "learning_rate": 0.00023,
191
+ "loss": 0.2125,
192
+ "step": 170
193
+ },
194
+ {
195
+ "epoch": 0.9090909090909091,
196
+ "grad_norm": 0.1642046868801117,
197
+ "learning_rate": 0.00021999999999999995,
198
+ "loss": 0.1479,
199
+ "step": 180
200
+ },
201
+ {
202
+ "epoch": 0.9090909090909091,
203
+ "eval_loss": 0.1822979599237442,
204
+ "eval_runtime": 295.0975,
205
+ "eval_samples_per_second": 2.386,
206
+ "eval_steps_per_second": 0.298,
207
+ "step": 180
208
+ },
209
+ {
210
+ "epoch": 0.9595959595959596,
211
+ "grad_norm": 0.14441154897212982,
212
+ "learning_rate": 0.00020999999999999998,
213
+ "loss": 0.1399,
214
+ "step": 190
215
+ },
216
+ {
217
+ "epoch": 1.0101010101010102,
218
+ "grad_norm": 0.20781415700912476,
219
+ "learning_rate": 0.00019999999999999998,
220
+ "loss": 0.17,
221
+ "step": 200
222
+ },
223
+ {
224
+ "epoch": 1.0101010101010102,
225
+ "eval_loss": 0.18901540338993073,
226
+ "eval_runtime": 294.9368,
227
+ "eval_samples_per_second": 2.387,
228
+ "eval_steps_per_second": 0.298,
229
+ "step": 200
230
+ },
231
+ {
232
+ "epoch": 1.0606060606060606,
233
+ "grad_norm": 0.18098795413970947,
234
+ "learning_rate": 0.00018999999999999998,
235
+ "loss": 0.2102,
236
+ "step": 210
237
+ },
238
+ {
239
+ "epoch": 1.1111111111111112,
240
+ "grad_norm": 0.12826304137706757,
241
+ "learning_rate": 0.00017999999999999998,
242
+ "loss": 0.2085,
243
+ "step": 220
244
+ },
245
+ {
246
+ "epoch": 1.1111111111111112,
247
+ "eval_loss": 0.17017190158367157,
248
+ "eval_runtime": 295.0936,
249
+ "eval_samples_per_second": 2.386,
250
+ "eval_steps_per_second": 0.298,
251
+ "step": 220
252
+ },
253
+ {
254
+ "epoch": 1.1616161616161615,
255
+ "grad_norm": 0.13167431950569153,
256
+ "learning_rate": 0.00016999999999999999,
257
+ "loss": 0.1399,
258
+ "step": 230
259
+ },
260
+ {
261
+ "epoch": 1.2121212121212122,
262
+ "grad_norm": 0.13884004950523376,
263
+ "learning_rate": 0.00015999999999999999,
264
+ "loss": 0.1311,
265
+ "step": 240
266
+ },
267
+ {
268
+ "epoch": 1.2121212121212122,
269
+ "eval_loss": 0.16837279498577118,
270
+ "eval_runtime": 294.8077,
271
+ "eval_samples_per_second": 2.388,
272
+ "eval_steps_per_second": 0.298,
273
+ "step": 240
274
+ },
275
+ {
276
+ "epoch": 1.2626262626262625,
277
+ "grad_norm": 0.17478157579898834,
278
+ "learning_rate": 0.00015,
279
+ "loss": 0.1559,
280
+ "step": 250
281
+ },
282
+ {
283
+ "epoch": 1.3131313131313131,
284
+ "grad_norm": 0.16669002175331116,
285
+ "learning_rate": 0.00014,
286
+ "loss": 0.205,
287
+ "step": 260
288
+ },
289
+ {
290
+ "epoch": 1.3131313131313131,
291
+ "eval_loss": 0.16558928787708282,
292
+ "eval_runtime": 294.9081,
293
+ "eval_samples_per_second": 2.387,
294
+ "eval_steps_per_second": 0.298,
295
+ "step": 260
296
+ },
297
+ {
298
+ "epoch": 1.3636363636363638,
299
+ "grad_norm": 0.14852865040302277,
300
+ "learning_rate": 0.00013,
301
+ "loss": 0.1826,
302
+ "step": 270
303
+ },
304
+ {
305
+ "epoch": 1.4141414141414141,
306
+ "grad_norm": 0.14237315952777863,
307
+ "learning_rate": 0.00011999999999999999,
308
+ "loss": 0.1329,
309
+ "step": 280
310
+ },
311
+ {
312
+ "epoch": 1.4141414141414141,
313
+ "eval_loss": 0.15846213698387146,
314
+ "eval_runtime": 295.0174,
315
+ "eval_samples_per_second": 2.386,
316
+ "eval_steps_per_second": 0.298,
317
+ "step": 280
318
+ },
319
+ {
320
+ "epoch": 1.4646464646464645,
321
+ "grad_norm": 0.1401522010564804,
322
+ "learning_rate": 0.00010999999999999998,
323
+ "loss": 0.1205,
324
+ "step": 290
325
+ },
326
+ {
327
+ "epoch": 1.5151515151515151,
328
+ "grad_norm": 0.18983517587184906,
329
+ "learning_rate": 9.999999999999999e-05,
330
+ "loss": 0.1607,
331
+ "step": 300
332
+ },
333
+ {
334
+ "epoch": 1.5151515151515151,
335
+ "eval_loss": 0.16163967549800873,
336
+ "eval_runtime": 294.807,
337
+ "eval_samples_per_second": 2.388,
338
+ "eval_steps_per_second": 0.299,
339
+ "step": 300
340
+ },
341
+ {
342
+ "epoch": 1.5656565656565657,
343
+ "grad_norm": 0.1500634402036667,
344
+ "learning_rate": 8.999999999999999e-05,
345
+ "loss": 0.2121,
346
+ "step": 310
347
+ },
348
+ {
349
+ "epoch": 1.6161616161616161,
350
+ "grad_norm": 0.15094490349292755,
351
+ "learning_rate": 7.999999999999999e-05,
352
+ "loss": 0.1713,
353
+ "step": 320
354
+ },
355
+ {
356
+ "epoch": 1.6161616161616161,
357
+ "eval_loss": 0.1540195345878601,
358
+ "eval_runtime": 295.157,
359
+ "eval_samples_per_second": 2.385,
360
+ "eval_steps_per_second": 0.298,
361
+ "step": 320
362
+ },
363
+ {
364
+ "epoch": 1.6666666666666665,
365
+ "grad_norm": 0.15443100035190582,
366
+ "learning_rate": 7e-05,
367
+ "loss": 0.129,
368
+ "step": 330
369
+ },
370
+ {
371
+ "epoch": 1.7171717171717171,
372
+ "grad_norm": 0.1568954586982727,
373
+ "learning_rate": 5.9999999999999995e-05,
374
+ "loss": 0.1196,
375
+ "step": 340
376
+ },
377
+ {
378
+ "epoch": 1.7171717171717171,
379
+ "eval_loss": 0.15283828973770142,
380
+ "eval_runtime": 295.3671,
381
+ "eval_samples_per_second": 2.383,
382
+ "eval_steps_per_second": 0.298,
383
+ "step": 340
384
+ },
385
+ {
386
+ "epoch": 1.7676767676767677,
387
+ "grad_norm": 0.19448110461235046,
388
+ "learning_rate": 4.9999999999999996e-05,
389
+ "loss": 0.1603,
390
+ "step": 350
391
+ },
392
+ {
393
+ "epoch": 1.8181818181818183,
394
+ "grad_norm": 0.12574005126953125,
395
+ "learning_rate": 3.9999999999999996e-05,
396
+ "loss": 0.1722,
397
+ "step": 360
398
+ },
399
+ {
400
+ "epoch": 1.8181818181818183,
401
+ "eval_loss": 0.15110255777835846,
402
+ "eval_runtime": 295.1101,
403
+ "eval_samples_per_second": 2.386,
404
+ "eval_steps_per_second": 0.298,
405
+ "step": 360
406
+ },
407
+ {
408
+ "epoch": 1.8686868686868687,
409
+ "grad_norm": 0.11745467782020569,
410
+ "learning_rate": 2.9999999999999997e-05,
411
+ "loss": 0.1669,
412
+ "step": 370
413
+ },
414
+ {
415
+ "epoch": 1.9191919191919191,
416
+ "grad_norm": 0.1326703280210495,
417
+ "learning_rate": 1.9999999999999998e-05,
418
+ "loss": 0.1243,
419
+ "step": 380
420
+ },
421
+ {
422
+ "epoch": 1.9191919191919191,
423
+ "eval_loss": 0.1487378031015396,
424
+ "eval_runtime": 295.0041,
425
+ "eval_samples_per_second": 2.386,
426
+ "eval_steps_per_second": 0.298,
427
+ "step": 380
428
+ },
429
+ {
430
+ "epoch": 1.9696969696969697,
431
+ "grad_norm": 0.1454104483127594,
432
+ "learning_rate": 9.999999999999999e-06,
433
+ "loss": 0.1148,
434
+ "step": 390
435
+ },
436
+ {
437
+ "epoch": 2.0202020202020203,
438
+ "grad_norm": 0.1772913634777069,
439
+ "learning_rate": 0.0,
440
+ "loss": 0.1633,
441
+ "step": 400
442
+ },
443
+ {
444
+ "epoch": 2.0202020202020203,
445
+ "eval_loss": 0.14871937036514282,
446
+ "eval_runtime": 295.1345,
447
+ "eval_samples_per_second": 2.385,
448
+ "eval_steps_per_second": 0.298,
449
+ "step": 400
450
+ }
451
+ ],
452
+ "logging_steps": 10,
453
+ "max_steps": 400,
454
+ "num_input_tokens_seen": 0,
455
+ "num_train_epochs": 3,
456
+ "save_steps": 20,
457
+ "total_flos": 2.2256168195948544e+17,
458
+ "train_batch_size": 8,
459
+ "trial_name": null,
460
+ "trial_params": null
461
+ }