Daniil Larionov committed on
Commit
3d6becb
1 Parent(s): e45f954

End of training

Browse files
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9306930899620056,
4
+ "eval_loss": 0.39733296632766724,
5
+ "eval_runtime": 0.4549,
6
+ "eval_samples": 101,
7
+ "eval_samples_per_second": 222.004,
8
+ "eval_steps_per_second": 28.575,
9
+ "train_loss": 0.1360975634233144,
10
+ "train_runtime": 211.9781,
11
+ "train_samples": 908,
12
+ "train_samples_per_second": 42.835,
13
+ "train_steps_per_second": 2.689
14
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.9306930899620056,
4
+ "eval_loss": 0.39733296632766724,
5
+ "eval_runtime": 0.4549,
6
+ "eval_samples": 101,
7
+ "eval_samples_per_second": 222.004,
8
+ "eval_steps_per_second": 28.575
9
+ }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fff84e5e67955a0f2abce1f5d2ee40fbb4d2327d0dd86f77e6e07cabf2a0a6e
3
  size 713322669
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eb89ee804cc02c1c4c21c5890fc3226645b832719184d7fb5cf4e1c123736ac
3
  size 713322669
runs/events.out.tfevents.1636571350.5690110ef819.12031.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a3e221bc7dc371af3fe8b0a1c290293bd424c88eb72953e6e2dc525ec420b2
3
- size 5871
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87c197d2235d8ffd5d993a98eea6166d153c051ff9586290ceeb376a05fceb9a
3
+ size 25255
runs/events.out.tfevents.1636571613.5690110ef819.12031.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200a6591b465c775454bb582f74f4cfaf88dfac2c18b68f92e570bad196f57f0
3
+ size 363
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.1360975634233144,
4
+ "train_runtime": 211.9781,
5
+ "train_samples": 908,
6
+ "train_samples_per_second": 42.835,
7
+ "train_steps_per_second": 2.689
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 570,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 1.4285714285714286e-06,
13
+ "loss": 1.0416,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.09,
18
+ "learning_rate": 7.142857142857143e-06,
19
+ "loss": 1.0953,
20
+ "step": 5
21
+ },
22
+ {
23
+ "epoch": 0.18,
24
+ "learning_rate": 1.4285714285714285e-05,
25
+ "loss": 0.9549,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 0.26,
30
+ "learning_rate": 2.1428571428571428e-05,
31
+ "loss": 0.8765,
32
+ "step": 15
33
+ },
34
+ {
35
+ "epoch": 0.35,
36
+ "learning_rate": 2.857142857142857e-05,
37
+ "loss": 0.7896,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.44,
42
+ "learning_rate": 3.571428571428572e-05,
43
+ "loss": 0.8036,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.53,
48
+ "learning_rate": 4.2857142857142856e-05,
49
+ "loss": 0.6975,
50
+ "step": 30
51
+ },
52
+ {
53
+ "epoch": 0.61,
54
+ "learning_rate": 5e-05,
55
+ "loss": 0.7023,
56
+ "step": 35
57
+ },
58
+ {
59
+ "epoch": 0.7,
60
+ "learning_rate": 4.998922515567496e-05,
61
+ "loss": 0.6305,
62
+ "step": 40
63
+ },
64
+ {
65
+ "epoch": 0.79,
66
+ "learning_rate": 4.995690991048146e-05,
67
+ "loss": 0.7052,
68
+ "step": 45
69
+ },
70
+ {
71
+ "epoch": 0.88,
72
+ "learning_rate": 4.99030821197584e-05,
73
+ "loss": 0.6331,
74
+ "step": 50
75
+ },
76
+ {
77
+ "epoch": 0.96,
78
+ "learning_rate": 4.982778818239101e-05,
79
+ "loss": 0.4892,
80
+ "step": 55
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_accuracy": 0.7524752616882324,
85
+ "eval_loss": 0.5470823049545288,
86
+ "eval_runtime": 0.4397,
87
+ "eval_samples_per_second": 229.701,
88
+ "eval_steps_per_second": 29.566,
89
+ "step": 57
90
+ },
91
+ {
92
+ "epoch": 1.05,
93
+ "learning_rate": 4.97310930008156e-05,
94
+ "loss": 0.4139,
95
+ "step": 60
96
+ },
97
+ {
98
+ "epoch": 1.14,
99
+ "learning_rate": 4.961307992507443e-05,
100
+ "loss": 0.3817,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 1.23,
105
+ "learning_rate": 4.947385068096907e-05,
106
+ "loss": 0.2255,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 1.32,
111
+ "learning_rate": 4.9313525282373974e-05,
112
+ "loss": 0.3455,
113
+ "step": 75
114
+ },
115
+ {
116
+ "epoch": 1.4,
117
+ "learning_rate": 4.9132241927786035e-05,
118
+ "loss": 0.3717,
119
+ "step": 80
120
+ },
121
+ {
122
+ "epoch": 1.49,
123
+ "learning_rate": 4.893015688119921e-05,
124
+ "loss": 0.4358,
125
+ "step": 85
126
+ },
127
+ {
128
+ "epoch": 1.58,
129
+ "learning_rate": 4.870744433740688e-05,
130
+ "loss": 0.3544,
131
+ "step": 90
132
+ },
133
+ {
134
+ "epoch": 1.67,
135
+ "learning_rate": 4.8464296271848155e-05,
136
+ "loss": 0.1876,
137
+ "step": 95
138
+ },
139
+ {
140
+ "epoch": 1.75,
141
+ "learning_rate": 4.8200922275127355e-05,
142
+ "loss": 0.1961,
143
+ "step": 100
144
+ },
145
+ {
146
+ "epoch": 1.84,
147
+ "learning_rate": 4.7917549372349616e-05,
148
+ "loss": 0.2548,
149
+ "step": 105
150
+ },
151
+ {
152
+ "epoch": 1.93,
153
+ "learning_rate": 4.761442182742799e-05,
154
+ "loss": 0.2302,
155
+ "step": 110
156
+ },
157
+ {
158
+ "epoch": 2.0,
159
+ "eval_accuracy": 0.8613861203193665,
160
+ "eval_loss": 0.42704179883003235,
161
+ "eval_runtime": 0.4285,
162
+ "eval_samples_per_second": 235.733,
163
+ "eval_steps_per_second": 30.342,
164
+ "step": 114
165
+ },
166
+ {
167
+ "epoch": 2.02,
168
+ "learning_rate": 4.7291800932531064e-05,
169
+ "loss": 0.3154,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 2.11,
174
+ "learning_rate": 4.694996478285231e-05,
175
+ "loss": 0.2103,
176
+ "step": 120
177
+ },
178
+ {
179
+ "epoch": 2.19,
180
+ "learning_rate": 4.6589208036895535e-05,
181
+ "loss": 0.1499,
182
+ "step": 125
183
+ },
184
+ {
185
+ "epoch": 2.28,
186
+ "learning_rate": 4.620984166248288e-05,
187
+ "loss": 0.1851,
188
+ "step": 130
189
+ },
190
+ {
191
+ "epoch": 2.37,
192
+ "learning_rate": 4.581219266870446e-05,
193
+ "loss": 0.0972,
194
+ "step": 135
195
+ },
196
+ {
197
+ "epoch": 2.46,
198
+ "learning_rate": 4.53966038240406e-05,
199
+ "loss": 0.2495,
200
+ "step": 140
201
+ },
202
+ {
203
+ "epoch": 2.54,
204
+ "learning_rate": 4.496343336089965e-05,
205
+ "loss": 0.0528,
206
+ "step": 145
207
+ },
208
+ {
209
+ "epoch": 2.63,
210
+ "learning_rate": 4.4513054666826146e-05,
211
+ "loss": 0.127,
212
+ "step": 150
213
+ },
214
+ {
215
+ "epoch": 2.72,
216
+ "learning_rate": 4.4045855962645363e-05,
217
+ "loss": 0.2142,
218
+ "step": 155
219
+ },
220
+ {
221
+ "epoch": 2.81,
222
+ "learning_rate": 4.3562239967821805e-05,
223
+ "loss": 0.1373,
224
+ "step": 160
225
+ },
226
+ {
227
+ "epoch": 2.89,
228
+ "learning_rate": 4.306262355332006e-05,
229
+ "loss": 0.0635,
230
+ "step": 165
231
+ },
232
+ {
233
+ "epoch": 2.98,
234
+ "learning_rate": 4.254743738226721e-05,
235
+ "loss": 0.1744,
236
+ "step": 170
237
+ },
238
+ {
239
+ "epoch": 3.0,
240
+ "eval_accuracy": 0.9306930899620056,
241
+ "eval_loss": 0.24064408242702484,
242
+ "eval_runtime": 1.6792,
243
+ "eval_samples_per_second": 60.146,
244
+ "eval_steps_per_second": 7.742,
245
+ "step": 171
246
+ },
247
+ {
248
+ "epoch": 3.07,
249
+ "learning_rate": 4.201712553872658e-05,
250
+ "loss": 0.0273,
251
+ "step": 175
252
+ },
253
+ {
254
+ "epoch": 3.16,
255
+ "learning_rate": 4.147214514490278e-05,
256
+ "loss": 0.0421,
257
+ "step": 180
258
+ },
259
+ {
260
+ "epoch": 3.25,
261
+ "learning_rate": 4.0912965967108125e-05,
262
+ "loss": 0.0495,
263
+ "step": 185
264
+ },
265
+ {
266
+ "epoch": 3.33,
267
+ "learning_rate": 4.034007001082985e-05,
268
+ "loss": 0.0192,
269
+ "step": 190
270
+ },
271
+ {
272
+ "epoch": 3.42,
273
+ "learning_rate": 3.975395110524742e-05,
274
+ "loss": 0.0252,
275
+ "step": 195
276
+ },
277
+ {
278
+ "epoch": 3.51,
279
+ "learning_rate": 3.9155114477557933e-05,
280
+ "loss": 0.0318,
281
+ "step": 200
282
+ },
283
+ {
284
+ "epoch": 3.6,
285
+ "learning_rate": 3.854407631747654e-05,
286
+ "loss": 0.0873,
287
+ "step": 205
288
+ },
289
+ {
290
+ "epoch": 3.68,
291
+ "learning_rate": 3.792136333228735e-05,
292
+ "loss": 0.0091,
293
+ "step": 210
294
+ },
295
+ {
296
+ "epoch": 3.77,
297
+ "learning_rate": 3.728751229282836e-05,
298
+ "loss": 0.0402,
299
+ "step": 215
300
+ },
301
+ {
302
+ "epoch": 3.86,
303
+ "learning_rate": 3.664306957080159e-05,
304
+ "loss": 0.1102,
305
+ "step": 220
306
+ },
307
+ {
308
+ "epoch": 3.95,
309
+ "learning_rate": 3.598859066780754e-05,
310
+ "loss": 0.0497,
311
+ "step": 225
312
+ },
313
+ {
314
+ "epoch": 4.0,
315
+ "eval_accuracy": 0.9009901285171509,
316
+ "eval_loss": 0.5820234417915344,
317
+ "eval_runtime": 0.4317,
318
+ "eval_samples_per_second": 233.942,
319
+ "eval_steps_per_second": 30.111,
320
+ "step": 228
321
+ },
322
+ {
323
+ "epoch": 4.04,
324
+ "learning_rate": 3.5324639736509714e-05,
325
+ "loss": 0.1968,
326
+ "step": 230
327
+ },
328
+ {
329
+ "epoch": 4.12,
330
+ "learning_rate": 3.4651789094342044e-05,
331
+ "loss": 0.0267,
332
+ "step": 235
333
+ },
334
+ {
335
+ "epoch": 4.21,
336
+ "learning_rate": 3.39706187301784e-05,
337
+ "loss": 0.0339,
338
+ "step": 240
339
+ },
340
+ {
341
+ "epoch": 4.3,
342
+ "learning_rate": 3.3281715804389403e-05,
343
+ "loss": 0.0614,
344
+ "step": 245
345
+ },
346
+ {
347
+ "epoch": 4.39,
348
+ "learning_rate": 3.258567414271748e-05,
349
+ "loss": 0.0682,
350
+ "step": 250
351
+ },
352
+ {
353
+ "epoch": 4.47,
354
+ "learning_rate": 3.18830937244065e-05,
355
+ "loss": 0.0068,
356
+ "step": 255
357
+ },
358
+ {
359
+ "epoch": 4.56,
360
+ "learning_rate": 3.117458016502711e-05,
361
+ "loss": 0.0028,
362
+ "step": 260
363
+ },
364
+ {
365
+ "epoch": 4.65,
366
+ "learning_rate": 3.046074419444366e-05,
367
+ "loss": 0.0069,
368
+ "step": 265
369
+ },
370
+ {
371
+ "epoch": 4.74,
372
+ "learning_rate": 2.9742201130372693e-05,
373
+ "loss": 0.0019,
374
+ "step": 270
375
+ },
376
+ {
377
+ "epoch": 4.82,
378
+ "learning_rate": 2.901957034798671e-05,
379
+ "loss": 0.0868,
380
+ "step": 275
381
+ },
382
+ {
383
+ "epoch": 4.91,
384
+ "learning_rate": 2.8293474746020472e-05,
385
+ "loss": 0.002,
386
+ "step": 280
387
+ },
388
+ {
389
+ "epoch": 5.0,
390
+ "learning_rate": 2.756454020984009e-05,
391
+ "loss": 0.0207,
392
+ "step": 285
393
+ },
394
+ {
395
+ "epoch": 5.0,
396
+ "eval_accuracy": 0.9306930899620056,
397
+ "eval_loss": 0.3814234733581543,
398
+ "eval_runtime": 0.4243,
399
+ "eval_samples_per_second": 238.063,
400
+ "eval_steps_per_second": 30.642,
401
+ "step": 285
402
+ },
403
+ {
404
+ "epoch": 5.09,
405
+ "learning_rate": 2.68333950719376e-05,
406
+ "loss": 0.0021,
407
+ "step": 290
408
+ },
409
+ {
410
+ "epoch": 5.18,
411
+ "learning_rate": 2.6100669570316195e-05,
412
+ "loss": 0.0019,
413
+ "step": 295
414
+ },
415
+ {
416
+ "epoch": 5.26,
417
+ "learning_rate": 2.5366995305232916e-05,
418
+ "loss": 0.0016,
419
+ "step": 300
420
+ },
421
+ {
422
+ "epoch": 5.35,
423
+ "learning_rate": 2.463300469476709e-05,
424
+ "loss": 0.0798,
425
+ "step": 305
426
+ },
427
+ {
428
+ "epoch": 5.44,
429
+ "learning_rate": 2.3899330429683807e-05,
430
+ "loss": 0.071,
431
+ "step": 310
432
+ },
433
+ {
434
+ "epoch": 5.53,
435
+ "learning_rate": 2.3166604928062406e-05,
436
+ "loss": 0.0023,
437
+ "step": 315
438
+ },
439
+ {
440
+ "epoch": 5.61,
441
+ "learning_rate": 2.243545979015992e-05,
442
+ "loss": 0.0024,
443
+ "step": 320
444
+ },
445
+ {
446
+ "epoch": 5.7,
447
+ "learning_rate": 2.1706525253979534e-05,
448
+ "loss": 0.002,
449
+ "step": 325
450
+ },
451
+ {
452
+ "epoch": 5.79,
453
+ "learning_rate": 2.0980429652013297e-05,
454
+ "loss": 0.0019,
455
+ "step": 330
456
+ },
457
+ {
458
+ "epoch": 5.88,
459
+ "learning_rate": 2.025779886962731e-05,
460
+ "loss": 0.0138,
461
+ "step": 335
462
+ },
463
+ {
464
+ "epoch": 5.96,
465
+ "learning_rate": 1.9539255805556344e-05,
466
+ "loss": 0.0019,
467
+ "step": 340
468
+ },
469
+ {
470
+ "epoch": 6.0,
471
+ "eval_accuracy": 0.9306930899620056,
472
+ "eval_loss": 0.39340826869010925,
473
+ "eval_runtime": 0.4347,
474
+ "eval_samples_per_second": 232.361,
475
+ "eval_steps_per_second": 29.908,
476
+ "step": 342
477
+ },
478
+ {
479
+ "epoch": 6.05,
480
+ "learning_rate": 1.8825419834972902e-05,
481
+ "loss": 0.1167,
482
+ "step": 345
483
+ },
484
+ {
485
+ "epoch": 6.14,
486
+ "learning_rate": 1.811690627559351e-05,
487
+ "loss": 0.0018,
488
+ "step": 350
489
+ },
490
+ {
491
+ "epoch": 6.23,
492
+ "learning_rate": 1.7414325857282526e-05,
493
+ "loss": 0.0019,
494
+ "step": 355
495
+ },
496
+ {
497
+ "epoch": 6.32,
498
+ "learning_rate": 1.6718284195610606e-05,
499
+ "loss": 0.0521,
500
+ "step": 360
501
+ },
502
+ {
503
+ "epoch": 6.4,
504
+ "learning_rate": 1.6029381269821604e-05,
505
+ "loss": 0.0957,
506
+ "step": 365
507
+ },
508
+ {
509
+ "epoch": 6.49,
510
+ "learning_rate": 1.534821090565796e-05,
511
+ "loss": 0.0021,
512
+ "step": 370
513
+ },
514
+ {
515
+ "epoch": 6.58,
516
+ "learning_rate": 1.4675360263490295e-05,
517
+ "loss": 0.0017,
518
+ "step": 375
519
+ },
520
+ {
521
+ "epoch": 6.67,
522
+ "learning_rate": 1.4011409332192472e-05,
523
+ "loss": 0.0015,
524
+ "step": 380
525
+ },
526
+ {
527
+ "epoch": 6.75,
528
+ "learning_rate": 1.335693042919841e-05,
529
+ "loss": 0.0029,
530
+ "step": 385
531
+ },
532
+ {
533
+ "epoch": 6.84,
534
+ "learning_rate": 1.2712487707171645e-05,
535
+ "loss": 0.0782,
536
+ "step": 390
537
+ },
538
+ {
539
+ "epoch": 6.93,
540
+ "learning_rate": 1.2078636667712649e-05,
541
+ "loss": 0.0053,
542
+ "step": 395
543
+ },
544
+ {
545
+ "epoch": 7.0,
546
+ "eval_accuracy": 0.9306930899620056,
547
+ "eval_loss": 0.3684821128845215,
548
+ "eval_runtime": 0.4335,
549
+ "eval_samples_per_second": 232.965,
550
+ "eval_steps_per_second": 29.986,
551
+ "step": 399
552
+ },
553
+ {
554
+ "epoch": 7.02,
555
+ "learning_rate": 1.1455923682523475e-05,
556
+ "loss": 0.003,
557
+ "step": 400
558
+ },
559
+ {
560
+ "epoch": 7.11,
561
+ "learning_rate": 1.0844885522442074e-05,
562
+ "loss": 0.0016,
563
+ "step": 405
564
+ },
565
+ {
566
+ "epoch": 7.19,
567
+ "learning_rate": 1.0246048894752589e-05,
568
+ "loss": 0.0014,
569
+ "step": 410
570
+ },
571
+ {
572
+ "epoch": 7.28,
573
+ "learning_rate": 9.659929989170154e-06,
574
+ "loss": 0.0014,
575
+ "step": 415
576
+ },
577
+ {
578
+ "epoch": 7.37,
579
+ "learning_rate": 9.087034032891883e-06,
580
+ "loss": 0.0012,
581
+ "step": 420
582
+ },
583
+ {
584
+ "epoch": 7.46,
585
+ "learning_rate": 8.527854855097225e-06,
586
+ "loss": 0.0016,
587
+ "step": 425
588
+ },
589
+ {
590
+ "epoch": 7.54,
591
+ "learning_rate": 7.982874461273438e-06,
592
+ "loss": 0.0838,
593
+ "step": 430
594
+ },
595
+ {
596
+ "epoch": 7.63,
597
+ "learning_rate": 7.452562617732794e-06,
598
+ "loss": 0.0012,
599
+ "step": 435
600
+ },
601
+ {
602
+ "epoch": 7.72,
603
+ "learning_rate": 6.93737644667995e-06,
604
+ "loss": 0.0013,
605
+ "step": 440
606
+ },
607
+ {
608
+ "epoch": 7.81,
609
+ "learning_rate": 6.4377600321782e-06,
610
+ "loss": 0.0013,
611
+ "step": 445
612
+ },
613
+ {
614
+ "epoch": 7.89,
615
+ "learning_rate": 5.954144037354645e-06,
616
+ "loss": 0.0015,
617
+ "step": 450
618
+ },
619
+ {
620
+ "epoch": 7.98,
621
+ "learning_rate": 5.486945333173851e-06,
622
+ "loss": 0.0012,
623
+ "step": 455
624
+ },
625
+ {
626
+ "epoch": 8.0,
627
+ "eval_accuracy": 0.9306930899620056,
628
+ "eval_loss": 0.3639465570449829,
629
+ "eval_runtime": 0.4448,
630
+ "eval_samples_per_second": 227.087,
631
+ "eval_steps_per_second": 29.229,
632
+ "step": 456
633
+ },
634
+ {
635
+ "epoch": 8.07,
636
+ "learning_rate": 5.036566639100351e-06,
637
+ "loss": 0.0012,
638
+ "step": 460
639
+ },
640
+ {
641
+ "epoch": 8.16,
642
+ "learning_rate": 4.603396175959404e-06,
643
+ "loss": 0.0013,
644
+ "step": 465
645
+ },
646
+ {
647
+ "epoch": 8.25,
648
+ "learning_rate": 4.187807331295549e-06,
649
+ "loss": 0.0013,
650
+ "step": 470
651
+ },
652
+ {
653
+ "epoch": 8.33,
654
+ "learning_rate": 3.7901583375171273e-06,
655
+ "loss": 0.001,
656
+ "step": 475
657
+ },
658
+ {
659
+ "epoch": 8.42,
660
+ "learning_rate": 3.4107919631044732e-06,
661
+ "loss": 0.001,
662
+ "step": 480
663
+ },
664
+ {
665
+ "epoch": 8.51,
666
+ "learning_rate": 3.0500352171476897e-06,
667
+ "loss": 0.001,
668
+ "step": 485
669
+ },
670
+ {
671
+ "epoch": 8.6,
672
+ "learning_rate": 2.708199067468939e-06,
673
+ "loss": 0.0009,
674
+ "step": 490
675
+ },
676
+ {
677
+ "epoch": 8.68,
678
+ "learning_rate": 2.385578172572009e-06,
679
+ "loss": 0.0008,
680
+ "step": 495
681
+ },
682
+ {
683
+ "epoch": 8.77,
684
+ "learning_rate": 2.0824506276503897e-06,
685
+ "loss": 0.0594,
686
+ "step": 500
687
+ },
688
+ {
689
+ "epoch": 8.86,
690
+ "learning_rate": 1.7990777248726442e-06,
691
+ "loss": 0.0011,
692
+ "step": 505
693
+ },
694
+ {
695
+ "epoch": 8.95,
696
+ "learning_rate": 1.5357037281518522e-06,
697
+ "loss": 0.001,
698
+ "step": 510
699
+ },
700
+ {
701
+ "epoch": 9.0,
702
+ "eval_accuracy": 0.9306930899620056,
703
+ "eval_loss": 0.39552733302116394,
704
+ "eval_runtime": 0.4382,
705
+ "eval_samples_per_second": 230.47,
706
+ "eval_steps_per_second": 29.664,
707
+ "step": 513
708
+ },
709
+ {
710
+ "epoch": 9.04,
711
+ "learning_rate": 1.2925556625931173e-06,
712
+ "loss": 0.0019,
713
+ "step": 515
714
+ },
715
+ {
716
+ "epoch": 9.12,
717
+ "learning_rate": 1.0698431188007952e-06,
718
+ "loss": 0.0009,
719
+ "step": 520
720
+ },
721
+ {
722
+ "epoch": 9.21,
723
+ "learning_rate": 8.677580722139672e-07,
724
+ "loss": 0.001,
725
+ "step": 525
726
+ },
727
+ {
728
+ "epoch": 9.3,
729
+ "learning_rate": 6.864747176260289e-07,
730
+ "loss": 0.0009,
731
+ "step": 530
732
+ },
733
+ {
734
+ "epoch": 9.39,
735
+ "learning_rate": 5.261493190309303e-07,
736
+ "loss": 0.0008,
737
+ "step": 535
738
+ },
739
+ {
740
+ "epoch": 9.47,
741
+ "learning_rate": 3.8692007492557024e-07,
742
+ "loss": 0.0011,
743
+ "step": 540
744
+ },
745
+ {
746
+ "epoch": 9.56,
747
+ "learning_rate": 2.6890699918440676e-07,
748
+ "loss": 0.0016,
749
+ "step": 545
750
+ },
751
+ {
752
+ "epoch": 9.65,
753
+ "learning_rate": 1.7221181760899152e-07,
754
+ "loss": 0.0009,
755
+ "step": 550
756
+ },
757
+ {
758
+ "epoch": 9.74,
759
+ "learning_rate": 9.691788024160376e-08,
760
+ "loss": 0.001,
761
+ "step": 555
762
+ },
763
+ {
764
+ "epoch": 9.82,
765
+ "learning_rate": 4.3090089518540987e-08,
766
+ "loss": 0.045,
767
+ "step": 560
768
+ },
769
+ {
770
+ "epoch": 9.91,
771
+ "learning_rate": 1.0774844325039946e-08,
772
+ "loss": 0.0009,
773
+ "step": 565
774
+ },
775
+ {
776
+ "epoch": 10.0,
777
+ "learning_rate": 0.0,
778
+ "loss": 0.0009,
779
+ "step": 570
780
+ },
781
+ {
782
+ "epoch": 10.0,
783
+ "eval_accuracy": 0.9306930899620056,
784
+ "eval_loss": 0.39733296632766724,
785
+ "eval_runtime": 1.6316,
786
+ "eval_samples_per_second": 61.901,
787
+ "eval_steps_per_second": 7.968,
788
+ "step": 570
789
+ },
790
+ {
791
+ "epoch": 10.0,
792
+ "step": 570,
793
+ "total_flos": 197222006749320.0,
794
+ "train_loss": 0.1360975634233144,
795
+ "train_runtime": 211.9781,
796
+ "train_samples_per_second": 42.835,
797
+ "train_steps_per_second": 2.689
798
+ }
799
+ ],
800
+ "max_steps": 570,
801
+ "num_train_epochs": 10,
802
+ "total_flos": 197222006749320.0,
803
+ "trial_name": null,
804
+ "trial_params": null
805
+ }