File size: 10,718 Bytes
5ae3b1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 100,
  "global_step": 2345,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21321961620469082,
      "grad_norm": 1.6159120798110962,
      "learning_rate": 0.0007658848614072496,
      "loss": 0.026,
      "step": 100
    },
    {
      "epoch": 0.21321961620469082,
      "eval_accuracy": 0.9867,
      "eval_loss": 0.04582102224230766,
      "eval_runtime": 5.154,
      "eval_samples_per_second": 1940.235,
      "eval_steps_per_second": 15.328,
      "step": 100
    },
    {
      "epoch": 0.42643923240938164,
      "grad_norm": 0.22664281725883484,
      "learning_rate": 0.000731769722814499,
      "loss": 0.0232,
      "step": 200
    },
    {
      "epoch": 0.42643923240938164,
      "eval_accuracy": 0.9867,
      "eval_loss": 0.04527696222066879,
      "eval_runtime": 5.4041,
      "eval_samples_per_second": 1850.446,
      "eval_steps_per_second": 14.619,
      "step": 200
    },
    {
      "epoch": 0.6396588486140725,
      "grad_norm": 1.3475489616394043,
      "learning_rate": 0.0006976545842217485,
      "loss": 0.0277,
      "step": 300
    },
    {
      "epoch": 0.6396588486140725,
      "eval_accuracy": 0.9863,
      "eval_loss": 0.04839787632226944,
      "eval_runtime": 5.3353,
      "eval_samples_per_second": 1874.323,
      "eval_steps_per_second": 14.807,
      "step": 300
    },
    {
      "epoch": 0.8528784648187633,
      "grad_norm": 0.9408140182495117,
      "learning_rate": 0.0006635394456289979,
      "loss": 0.0293,
      "step": 400
    },
    {
      "epoch": 0.8528784648187633,
      "eval_accuracy": 0.9865,
      "eval_loss": 0.046898942440748215,
      "eval_runtime": 5.1935,
      "eval_samples_per_second": 1925.498,
      "eval_steps_per_second": 15.211,
      "step": 400
    },
    {
      "epoch": 1.0660980810234542,
      "grad_norm": 0.6018700003623962,
      "learning_rate": 0.0006294243070362473,
      "loss": 0.0235,
      "step": 500
    },
    {
      "epoch": 1.0660980810234542,
      "eval_accuracy": 0.9899,
      "eval_loss": 0.028819510713219643,
      "eval_runtime": 5.1407,
      "eval_samples_per_second": 1945.277,
      "eval_steps_per_second": 15.368,
      "step": 500
    },
    {
      "epoch": 1.279317697228145,
      "grad_norm": 0.7029837369918823,
      "learning_rate": 0.0005953091684434968,
      "loss": 0.0203,
      "step": 600
    },
    {
      "epoch": 1.279317697228145,
      "eval_accuracy": 0.9924,
      "eval_loss": 0.02526690438389778,
      "eval_runtime": 4.869,
      "eval_samples_per_second": 2053.81,
      "eval_steps_per_second": 16.225,
      "step": 600
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 0.17393378913402557,
      "learning_rate": 0.0005611940298507463,
      "loss": 0.0182,
      "step": 700
    },
    {
      "epoch": 1.4925373134328357,
      "eval_accuracy": 0.9916,
      "eval_loss": 0.028603948652744293,
      "eval_runtime": 4.6847,
      "eval_samples_per_second": 2134.586,
      "eval_steps_per_second": 16.863,
      "step": 700
    },
    {
      "epoch": 1.7057569296375266,
      "grad_norm": 0.9470173120498657,
      "learning_rate": 0.0005270788912579957,
      "loss": 0.0205,
      "step": 800
    },
    {
      "epoch": 1.7057569296375266,
      "eval_accuracy": 0.9935,
      "eval_loss": 0.02031560428440571,
      "eval_runtime": 5.0835,
      "eval_samples_per_second": 1967.155,
      "eval_steps_per_second": 15.541,
      "step": 800
    },
    {
      "epoch": 1.9189765458422174,
      "grad_norm": 1.4821853637695312,
      "learning_rate": 0.0004929637526652453,
      "loss": 0.0162,
      "step": 900
    },
    {
      "epoch": 1.9189765458422174,
      "eval_accuracy": 0.9913,
      "eval_loss": 0.023791342973709106,
      "eval_runtime": 5.2196,
      "eval_samples_per_second": 1915.863,
      "eval_steps_per_second": 15.135,
      "step": 900
    },
    {
      "epoch": 2.1321961620469083,
      "grad_norm": 1.0076653957366943,
      "learning_rate": 0.0004588486140724947,
      "loss": 0.0118,
      "step": 1000
    },
    {
      "epoch": 2.1321961620469083,
      "eval_accuracy": 0.9916,
      "eval_loss": 0.024731909856200218,
      "eval_runtime": 5.8209,
      "eval_samples_per_second": 1717.944,
      "eval_steps_per_second": 13.572,
      "step": 1000
    },
    {
      "epoch": 2.345415778251599,
      "grad_norm": 0.39255017042160034,
      "learning_rate": 0.0004247334754797441,
      "loss": 0.0121,
      "step": 1100
    },
    {
      "epoch": 2.345415778251599,
      "eval_accuracy": 0.9932,
      "eval_loss": 0.019426949322223663,
      "eval_runtime": 5.5038,
      "eval_samples_per_second": 1816.931,
      "eval_steps_per_second": 14.354,
      "step": 1100
    },
    {
      "epoch": 2.55863539445629,
      "grad_norm": 0.4671665132045746,
      "learning_rate": 0.0003906183368869936,
      "loss": 0.0154,
      "step": 1200
    },
    {
      "epoch": 2.55863539445629,
      "eval_accuracy": 0.9933,
      "eval_loss": 0.01936325989663601,
      "eval_runtime": 5.4304,
      "eval_samples_per_second": 1841.5,
      "eval_steps_per_second": 14.548,
      "step": 1200
    },
    {
      "epoch": 2.771855010660981,
      "grad_norm": 0.17253048717975616,
      "learning_rate": 0.0003565031982942431,
      "loss": 0.015,
      "step": 1300
    },
    {
      "epoch": 2.771855010660981,
      "eval_accuracy": 0.9933,
      "eval_loss": 0.02162059210240841,
      "eval_runtime": 5.3729,
      "eval_samples_per_second": 1861.177,
      "eval_steps_per_second": 14.703,
      "step": 1300
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.8343185782432556,
      "learning_rate": 0.00032238805970149256,
      "loss": 0.0145,
      "step": 1400
    },
    {
      "epoch": 2.9850746268656714,
      "eval_accuracy": 0.9919,
      "eval_loss": 0.02381654642522335,
      "eval_runtime": 5.1099,
      "eval_samples_per_second": 1957.0,
      "eval_steps_per_second": 15.46,
      "step": 1400
    },
    {
      "epoch": 3.1982942430703627,
      "grad_norm": 0.3106481432914734,
      "learning_rate": 0.000288272921108742,
      "loss": 0.0098,
      "step": 1500
    },
    {
      "epoch": 3.1982942430703627,
      "eval_accuracy": 0.993,
      "eval_loss": 0.020756520330905914,
      "eval_runtime": 4.9767,
      "eval_samples_per_second": 2009.353,
      "eval_steps_per_second": 15.874,
      "step": 1500
    },
    {
      "epoch": 3.411513859275053,
      "grad_norm": 0.03616774454712868,
      "learning_rate": 0.0002541577825159915,
      "loss": 0.0093,
      "step": 1600
    },
    {
      "epoch": 3.411513859275053,
      "eval_accuracy": 0.9929,
      "eval_loss": 0.021822581067681313,
      "eval_runtime": 5.4965,
      "eval_samples_per_second": 1819.356,
      "eval_steps_per_second": 14.373,
      "step": 1600
    },
    {
      "epoch": 3.624733475479744,
      "grad_norm": 0.6045345067977905,
      "learning_rate": 0.00022004264392324095,
      "loss": 0.0073,
      "step": 1700
    },
    {
      "epoch": 3.624733475479744,
      "eval_accuracy": 0.9933,
      "eval_loss": 0.018862707540392876,
      "eval_runtime": 5.7766,
      "eval_samples_per_second": 1731.12,
      "eval_steps_per_second": 13.676,
      "step": 1700
    },
    {
      "epoch": 3.837953091684435,
      "grad_norm": 0.631215512752533,
      "learning_rate": 0.0001859275053304904,
      "loss": 0.008,
      "step": 1800
    },
    {
      "epoch": 3.837953091684435,
      "eval_accuracy": 0.9932,
      "eval_loss": 0.01944512128829956,
      "eval_runtime": 5.1522,
      "eval_samples_per_second": 1940.922,
      "eval_steps_per_second": 15.333,
      "step": 1800
    },
    {
      "epoch": 4.051172707889126,
      "grad_norm": 0.5996153950691223,
      "learning_rate": 0.0001518123667377399,
      "loss": 0.006,
      "step": 1900
    },
    {
      "epoch": 4.051172707889126,
      "eval_accuracy": 0.9938,
      "eval_loss": 0.018317226320505142,
      "eval_runtime": 4.9544,
      "eval_samples_per_second": 2018.428,
      "eval_steps_per_second": 15.946,
      "step": 1900
    },
    {
      "epoch": 4.264392324093817,
      "grad_norm": 0.0069321137852966785,
      "learning_rate": 0.00011769722814498933,
      "loss": 0.0063,
      "step": 2000
    },
    {
      "epoch": 4.264392324093817,
      "eval_accuracy": 0.9934,
      "eval_loss": 0.018384862691164017,
      "eval_runtime": 4.7636,
      "eval_samples_per_second": 2099.232,
      "eval_steps_per_second": 16.584,
      "step": 2000
    },
    {
      "epoch": 4.477611940298507,
      "grad_norm": 0.06509185582399368,
      "learning_rate": 8.392324093816631e-05,
      "loss": 0.0043,
      "step": 2100
    },
    {
      "epoch": 4.477611940298507,
      "eval_accuracy": 0.9932,
      "eval_loss": 0.018380142748355865,
      "eval_runtime": 4.6951,
      "eval_samples_per_second": 2129.883,
      "eval_steps_per_second": 16.826,
      "step": 2100
    },
    {
      "epoch": 4.690831556503198,
      "grad_norm": 0.17103737592697144,
      "learning_rate": 4.980810234541578e-05,
      "loss": 0.0035,
      "step": 2200
    },
    {
      "epoch": 4.690831556503198,
      "eval_accuracy": 0.9931,
      "eval_loss": 0.018344268202781677,
      "eval_runtime": 4.6229,
      "eval_samples_per_second": 2163.133,
      "eval_steps_per_second": 17.089,
      "step": 2200
    },
    {
      "epoch": 4.904051172707889,
      "grad_norm": 0.6465263962745667,
      "learning_rate": 1.5692963752665246e-05,
      "loss": 0.0061,
      "step": 2300
    },
    {
      "epoch": 4.904051172707889,
      "eval_accuracy": 0.9931,
      "eval_loss": 0.018412619829177856,
      "eval_runtime": 4.7593,
      "eval_samples_per_second": 2101.13,
      "eval_steps_per_second": 16.599,
      "step": 2300
    },
    {
      "epoch": 5.0,
      "step": 2345,
      "total_flos": 1346208595200000.0,
      "train_loss": 0.014373551363121472,
      "train_runtime": 335.5598,
      "train_samples_per_second": 894.028,
      "train_steps_per_second": 6.988
    }
  ],
  "logging_steps": 100,
  "max_steps": 2345,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1346208595200000.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}