tiendoan committed on
Commit a73000d
1 Parent(s): 70f1975

Training in progress, step 352

all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 4.0,
+ "total_flos": 3.4828624117074493e+18,
+ "train_loss": 0.40373469023457303,
+ "train_runtime": 1995.1511,
+ "train_samples_per_second": 22.527,
+ "train_steps_per_second": 0.706
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b252f90d6f06b178343a72a7a4d16d1986f8cc8307b69c1006547908dc83bac1
+ oid sha256:1d3c42fdb32fba9b201cecb12dbc5314c3699f4eb4a43f6d070041f7a0aa0ecc
  size 343230128
runs/Nov07_15-25-23_509d87c7c7ea/events.out.tfevents.1730993124.509d87c7c7ea.30.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b68070a7005ea78360e178257b816af32338a29f712bf1b6edf0f427d61f47d
+ size 12869
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 4.0,
+ "total_flos": 3.4828624117074493e+18,
+ "train_loss": 0.40373469023457303,
+ "train_runtime": 1995.1511,
+ "train_samples_per_second": 22.527,
+ "train_steps_per_second": 0.706
+ }
trainer_state.json ADDED
@@ -0,0 +1,1049 @@
+ {
+ "best_metric": 0.0417679101228714,
+ "best_model_checkpoint": "./finetune-vit-base-patch16-224/checkpoint-1200",
+ "epoch": 4.0,
+ "eval_steps": 400,
+ "global_step": 1408,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.028409090909090908,
+ "grad_norm": 246460.15625,
+ "learning_rate": 4.9644886363636365e-05,
+ "loss": 1.1228,
+ "step": 10
+ },
+ {
+ "epoch": 0.056818181818181816,
+ "grad_norm": 226149.65625,
+ "learning_rate": 4.9289772727272735e-05,
+ "loss": 0.9359,
+ "step": 20
+ },
+ {
+ "epoch": 0.08522727272727272,
+ "grad_norm": 229797.6875,
+ "learning_rate": 4.893465909090909e-05,
+ "loss": 0.9185,
+ "step": 30
+ },
+ {
+ "epoch": 0.11363636363636363,
+ "grad_norm": 274111.03125,
+ "learning_rate": 4.857954545454545e-05,
+ "loss": 0.9599,
+ "step": 40
+ },
+ {
+ "epoch": 0.14204545454545456,
+ "grad_norm": 189042.953125,
+ "learning_rate": 4.822443181818182e-05,
+ "loss": 0.9459,
+ "step": 50
+ },
+ {
+ "epoch": 0.17045454545454544,
+ "grad_norm": 233362.859375,
+ "learning_rate": 4.7869318181818185e-05,
+ "loss": 0.9634,
+ "step": 60
+ },
+ {
+ "epoch": 0.19886363636363635,
+ "grad_norm": 267175.90625,
+ "learning_rate": 4.751420454545455e-05,
+ "loss": 0.8705,
+ "step": 70
+ },
+ {
+ "epoch": 0.22727272727272727,
+ "grad_norm": 211430.734375,
+ "learning_rate": 4.715909090909091e-05,
+ "loss": 0.9014,
+ "step": 80
+ },
+ {
+ "epoch": 0.2556818181818182,
+ "grad_norm": 238574.546875,
+ "learning_rate": 4.6803977272727274e-05,
+ "loss": 0.8607,
+ "step": 90
+ },
+ {
+ "epoch": 0.2840909090909091,
+ "grad_norm": 260448.125,
+ "learning_rate": 4.6448863636363636e-05,
+ "loss": 0.8127,
+ "step": 100
+ },
+ {
+ "epoch": 0.3125,
+ "grad_norm": 168009.265625,
+ "learning_rate": 4.609375e-05,
+ "loss": 0.8228,
+ "step": 110
+ },
+ {
+ "epoch": 0.3409090909090909,
+ "grad_norm": 232205.125,
+ "learning_rate": 4.573863636363637e-05,
+ "loss": 0.8704,
+ "step": 120
+ },
+ {
+ "epoch": 0.3693181818181818,
+ "grad_norm": 302465.9375,
+ "learning_rate": 4.538352272727273e-05,
+ "loss": 0.8896,
+ "step": 130
+ },
+ {
+ "epoch": 0.3977272727272727,
+ "grad_norm": 210630.53125,
+ "learning_rate": 4.5028409090909094e-05,
+ "loss": 0.8732,
+ "step": 140
+ },
+ {
+ "epoch": 0.42613636363636365,
+ "grad_norm": 171584.9375,
+ "learning_rate": 4.4673295454545457e-05,
+ "loss": 0.7886,
+ "step": 150
+ },
+ {
+ "epoch": 0.45454545454545453,
+ "grad_norm": 255000.359375,
+ "learning_rate": 4.431818181818182e-05,
+ "loss": 0.9411,
+ "step": 160
+ },
+ {
+ "epoch": 0.48295454545454547,
+ "grad_norm": 244293.703125,
+ "learning_rate": 4.396306818181818e-05,
+ "loss": 0.8608,
+ "step": 170
+ },
+ {
+ "epoch": 0.5113636363636364,
+ "grad_norm": 235527.875,
+ "learning_rate": 4.360795454545455e-05,
+ "loss": 0.8106,
+ "step": 180
+ },
+ {
+ "epoch": 0.5397727272727273,
+ "grad_norm": 234210.1875,
+ "learning_rate": 4.3252840909090914e-05,
+ "loss": 0.795,
+ "step": 190
+ },
+ {
+ "epoch": 0.5681818181818182,
+ "grad_norm": 182797.875,
+ "learning_rate": 4.289772727272727e-05,
+ "loss": 0.7926,
+ "step": 200
+ },
+ {
+ "epoch": 0.5965909090909091,
+ "grad_norm": 324642.5,
+ "learning_rate": 4.254261363636364e-05,
+ "loss": 0.778,
+ "step": 210
+ },
+ {
+ "epoch": 0.625,
+ "grad_norm": 359272.71875,
+ "learning_rate": 4.21875e-05,
+ "loss": 0.7829,
+ "step": 220
+ },
+ {
+ "epoch": 0.6534090909090909,
+ "grad_norm": 279676.875,
+ "learning_rate": 4.1832386363636365e-05,
+ "loss": 0.8244,
+ "step": 230
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 259783.71875,
+ "learning_rate": 4.1477272727272734e-05,
+ "loss": 0.7465,
+ "step": 240
+ },
+ {
+ "epoch": 0.7102272727272727,
+ "grad_norm": 184817.609375,
+ "learning_rate": 4.112215909090909e-05,
+ "loss": 0.7447,
+ "step": 250
+ },
+ {
+ "epoch": 0.7386363636363636,
+ "grad_norm": 221672.1875,
+ "learning_rate": 4.076704545454545e-05,
+ "loss": 0.8206,
+ "step": 260
+ },
+ {
+ "epoch": 0.7670454545454546,
+ "grad_norm": 251710.0,
+ "learning_rate": 4.041193181818182e-05,
+ "loss": 0.8222,
+ "step": 270
+ },
+ {
+ "epoch": 0.7954545454545454,
+ "grad_norm": 287394.75,
+ "learning_rate": 4.0056818181818185e-05,
+ "loss": 0.8751,
+ "step": 280
+ },
+ {
+ "epoch": 0.8238636363636364,
+ "grad_norm": 261405.84375,
+ "learning_rate": 3.970170454545455e-05,
+ "loss": 0.8049,
+ "step": 290
+ },
+ {
+ "epoch": 0.8522727272727273,
+ "grad_norm": 339216.5,
+ "learning_rate": 3.934659090909091e-05,
+ "loss": 0.7734,
+ "step": 300
+ },
+ {
+ "epoch": 0.8806818181818182,
+ "grad_norm": 253168.921875,
+ "learning_rate": 3.899147727272727e-05,
+ "loss": 0.7916,
+ "step": 310
+ },
+ {
+ "epoch": 0.9090909090909091,
+ "grad_norm": 243938.09375,
+ "learning_rate": 3.8636363636363636e-05,
+ "loss": 0.8075,
+ "step": 320
+ },
+ {
+ "epoch": 0.9375,
+ "grad_norm": 224975.296875,
+ "learning_rate": 3.828125e-05,
+ "loss": 0.724,
+ "step": 330
+ },
+ {
+ "epoch": 0.9659090909090909,
+ "grad_norm": 314409.71875,
+ "learning_rate": 3.792613636363637e-05,
+ "loss": 0.86,
+ "step": 340
+ },
+ {
+ "epoch": 0.9943181818181818,
+ "grad_norm": 254573.59375,
+ "learning_rate": 3.757102272727273e-05,
+ "loss": 0.7882,
+ "step": 350
+ },
+ {
+ "epoch": 1.0227272727272727,
+ "grad_norm": 239098.109375,
+ "learning_rate": 3.721590909090909e-05,
+ "loss": 0.5987,
+ "step": 360
+ },
+ {
+ "epoch": 1.0511363636363635,
+ "grad_norm": 212271.015625,
+ "learning_rate": 3.6860795454545456e-05,
+ "loss": 0.5594,
+ "step": 370
+ },
+ {
+ "epoch": 1.0795454545454546,
+ "grad_norm": 258443.203125,
+ "learning_rate": 3.650568181818182e-05,
+ "loss": 0.5778,
+ "step": 380
+ },
+ {
+ "epoch": 1.1079545454545454,
+ "grad_norm": 251415.8125,
+ "learning_rate": 3.615056818181818e-05,
+ "loss": 0.5707,
+ "step": 390
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 191828.046875,
+ "learning_rate": 3.579545454545455e-05,
+ "loss": 0.6151,
+ "step": 400
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "eval_f1": 0.7879138483446066,
+ "eval_loss": 0.5355119705200195,
+ "eval_runtime": 204.0556,
+ "eval_samples_per_second": 55.063,
+ "eval_steps_per_second": 3.445,
+ "step": 400
+ },
+ {
+ "epoch": 1.1647727272727273,
+ "grad_norm": 233159.53125,
+ "learning_rate": 3.5440340909090914e-05,
+ "loss": 0.548,
+ "step": 410
+ },
+ {
+ "epoch": 1.1931818181818181,
+ "grad_norm": 206000.609375,
+ "learning_rate": 3.508522727272727e-05,
+ "loss": 0.5118,
+ "step": 420
+ },
+ {
+ "epoch": 1.2215909090909092,
+ "grad_norm": 262176.0625,
+ "learning_rate": 3.473011363636364e-05,
+ "loss": 0.5221,
+ "step": 430
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 225265.671875,
+ "learning_rate": 3.4375e-05,
+ "loss": 0.5489,
+ "step": 440
+ },
+ {
+ "epoch": 1.2784090909090908,
+ "grad_norm": 261512.140625,
+ "learning_rate": 3.4019886363636365e-05,
+ "loss": 0.5682,
+ "step": 450
+ },
+ {
+ "epoch": 1.3068181818181819,
+ "grad_norm": 336397.46875,
+ "learning_rate": 3.3664772727272734e-05,
+ "loss": 0.5585,
+ "step": 460
+ },
+ {
+ "epoch": 1.3352272727272727,
+ "grad_norm": 253634.796875,
+ "learning_rate": 3.330965909090909e-05,
+ "loss": 0.5239,
+ "step": 470
+ },
+ {
+ "epoch": 1.3636363636363638,
+ "grad_norm": 253387.1875,
+ "learning_rate": 3.295454545454545e-05,
+ "loss": 0.5411,
+ "step": 480
+ },
+ {
+ "epoch": 1.3920454545454546,
+ "grad_norm": 175611.75,
+ "learning_rate": 3.259943181818182e-05,
+ "loss": 0.4704,
+ "step": 490
+ },
+ {
+ "epoch": 1.4204545454545454,
+ "grad_norm": 210382.125,
+ "learning_rate": 3.2244318181818185e-05,
+ "loss": 0.4668,
+ "step": 500
+ },
+ {
+ "epoch": 1.4488636363636362,
+ "grad_norm": 207340.484375,
+ "learning_rate": 3.188920454545455e-05,
+ "loss": 0.5243,
+ "step": 510
+ },
+ {
+ "epoch": 1.4772727272727273,
+ "grad_norm": 211227.53125,
+ "learning_rate": 3.153409090909091e-05,
+ "loss": 0.5158,
+ "step": 520
+ },
+ {
+ "epoch": 1.5056818181818183,
+ "grad_norm": 263875.125,
+ "learning_rate": 3.117897727272727e-05,
+ "loss": 0.5264,
+ "step": 530
+ },
+ {
+ "epoch": 1.5340909090909092,
+ "grad_norm": 250973.984375,
+ "learning_rate": 3.0823863636363636e-05,
+ "loss": 0.4892,
+ "step": 540
+ },
+ {
+ "epoch": 1.5625,
+ "grad_norm": 210192.90625,
+ "learning_rate": 3.0468750000000002e-05,
+ "loss": 0.565,
+ "step": 550
+ },
+ {
+ "epoch": 1.5909090909090908,
+ "grad_norm": 277090.34375,
+ "learning_rate": 3.0113636363636365e-05,
+ "loss": 0.5501,
+ "step": 560
+ },
+ {
+ "epoch": 1.6193181818181817,
+ "grad_norm": 262420.625,
+ "learning_rate": 2.975852272727273e-05,
+ "loss": 0.4802,
+ "step": 570
+ },
+ {
+ "epoch": 1.6477272727272727,
+ "grad_norm": 247244.59375,
+ "learning_rate": 2.940340909090909e-05,
+ "loss": 0.4778,
+ "step": 580
+ },
+ {
+ "epoch": 1.6761363636363638,
+ "grad_norm": 238716.140625,
+ "learning_rate": 2.9048295454545453e-05,
+ "loss": 0.4998,
+ "step": 590
+ },
+ {
+ "epoch": 1.7045454545454546,
+ "grad_norm": 288676.875,
+ "learning_rate": 2.869318181818182e-05,
+ "loss": 0.4763,
+ "step": 600
+ },
+ {
+ "epoch": 1.7329545454545454,
+ "grad_norm": 254478.03125,
+ "learning_rate": 2.8338068181818185e-05,
+ "loss": 0.4912,
+ "step": 610
+ },
+ {
+ "epoch": 1.7613636363636362,
+ "grad_norm": 295674.3125,
+ "learning_rate": 2.7982954545454548e-05,
+ "loss": 0.4892,
+ "step": 620
+ },
+ {
+ "epoch": 1.7897727272727273,
+ "grad_norm": 279737.21875,
+ "learning_rate": 2.7627840909090914e-05,
+ "loss": 0.4677,
+ "step": 630
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 325599.34375,
+ "learning_rate": 2.7272727272727273e-05,
+ "loss": 0.4977,
+ "step": 640
+ },
+ {
+ "epoch": 1.8465909090909092,
+ "grad_norm": 303249.375,
+ "learning_rate": 2.6917613636363636e-05,
+ "loss": 0.5212,
+ "step": 650
+ },
+ {
+ "epoch": 1.875,
+ "grad_norm": 269595.21875,
+ "learning_rate": 2.6562500000000002e-05,
+ "loss": 0.5283,
+ "step": 660
+ },
+ {
+ "epoch": 1.9034090909090908,
+ "grad_norm": 274965.3125,
+ "learning_rate": 2.6207386363636365e-05,
+ "loss": 0.5194,
+ "step": 670
+ },
+ {
+ "epoch": 1.9318181818181817,
+ "grad_norm": 250650.328125,
+ "learning_rate": 2.585227272727273e-05,
+ "loss": 0.5274,
+ "step": 680
+ },
+ {
+ "epoch": 1.9602272727272727,
+ "grad_norm": 232058.15625,
+ "learning_rate": 2.549715909090909e-05,
+ "loss": 0.5002,
+ "step": 690
+ },
+ {
+ "epoch": 1.9886363636363638,
+ "grad_norm": 251402.0,
+ "learning_rate": 2.5142045454545453e-05,
+ "loss": 0.4618,
+ "step": 700
+ },
+ {
+ "epoch": 2.0170454545454546,
+ "grad_norm": 192832.578125,
+ "learning_rate": 2.478693181818182e-05,
+ "loss": 0.3425,
+ "step": 710
+ },
+ {
+ "epoch": 2.0454545454545454,
+ "grad_norm": 200086.390625,
+ "learning_rate": 2.4431818181818185e-05,
+ "loss": 0.2832,
+ "step": 720
+ },
+ {
+ "epoch": 2.0738636363636362,
+ "grad_norm": 162459.609375,
+ "learning_rate": 2.4076704545454544e-05,
+ "loss": 0.2102,
+ "step": 730
+ },
+ {
+ "epoch": 2.102272727272727,
+ "grad_norm": 132360.765625,
+ "learning_rate": 2.372159090909091e-05,
+ "loss": 0.2097,
+ "step": 740
+ },
+ {
+ "epoch": 2.1306818181818183,
+ "grad_norm": 146930.046875,
+ "learning_rate": 2.3366477272727273e-05,
+ "loss": 0.1884,
+ "step": 750
+ },
+ {
+ "epoch": 2.159090909090909,
+ "grad_norm": 246238.796875,
+ "learning_rate": 2.3011363636363636e-05,
+ "loss": 0.1969,
+ "step": 760
+ },
+ {
+ "epoch": 2.1875,
+ "grad_norm": 232657.203125,
+ "learning_rate": 2.2656250000000002e-05,
+ "loss": 0.1925,
+ "step": 770
+ },
+ {
+ "epoch": 2.215909090909091,
+ "grad_norm": 227103.3125,
+ "learning_rate": 2.2301136363636365e-05,
+ "loss": 0.1851,
+ "step": 780
+ },
+ {
+ "epoch": 2.2443181818181817,
+ "grad_norm": 171326.71875,
+ "learning_rate": 2.1946022727272727e-05,
+ "loss": 0.2253,
+ "step": 790
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 121495.1953125,
+ "learning_rate": 2.1590909090909093e-05,
+ "loss": 0.1867,
+ "step": 800
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "eval_f1": 0.9550551797792809,
+ "eval_loss": 0.17148956656455994,
+ "eval_runtime": 203.949,
+ "eval_samples_per_second": 55.092,
+ "eval_steps_per_second": 3.447,
+ "step": 800
+ },
+ {
+ "epoch": 2.3011363636363638,
+ "grad_norm": 238023.546875,
+ "learning_rate": 2.1235795454545456e-05,
+ "loss": 0.2143,
+ "step": 810
+ },
+ {
+ "epoch": 2.3295454545454546,
+ "grad_norm": 215472.78125,
+ "learning_rate": 2.088068181818182e-05,
+ "loss": 0.1681,
+ "step": 820
+ },
+ {
+ "epoch": 2.3579545454545454,
+ "grad_norm": 185951.046875,
+ "learning_rate": 2.0525568181818185e-05,
+ "loss": 0.2,
+ "step": 830
+ },
+ {
+ "epoch": 2.3863636363636362,
+ "grad_norm": 288287.34375,
+ "learning_rate": 2.0170454545454544e-05,
+ "loss": 0.1899,
+ "step": 840
+ },
+ {
+ "epoch": 2.4147727272727275,
+ "grad_norm": 184342.796875,
+ "learning_rate": 1.981534090909091e-05,
+ "loss": 0.1898,
+ "step": 850
+ },
+ {
+ "epoch": 2.4431818181818183,
+ "grad_norm": 143657.375,
+ "learning_rate": 1.9460227272727273e-05,
+ "loss": 0.1707,
+ "step": 860
+ },
+ {
+ "epoch": 2.471590909090909,
+ "grad_norm": 142439.578125,
+ "learning_rate": 1.9105113636363636e-05,
+ "loss": 0.1505,
+ "step": 870
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 255553.71875,
+ "learning_rate": 1.8750000000000002e-05,
+ "loss": 0.2047,
+ "step": 880
+ },
+ {
+ "epoch": 2.528409090909091,
+ "grad_norm": 217335.078125,
+ "learning_rate": 1.8394886363636364e-05,
+ "loss": 0.18,
+ "step": 890
+ },
+ {
+ "epoch": 2.5568181818181817,
+ "grad_norm": 143375.3125,
+ "learning_rate": 1.8039772727272727e-05,
+ "loss": 0.2372,
+ "step": 900
+ },
+ {
+ "epoch": 2.5852272727272725,
+ "grad_norm": 325331.0625,
+ "learning_rate": 1.7684659090909093e-05,
+ "loss": 0.2047,
+ "step": 910
+ },
+ {
+ "epoch": 2.6136363636363638,
+ "grad_norm": 160601.78125,
+ "learning_rate": 1.7329545454545456e-05,
+ "loss": 0.1999,
+ "step": 920
+ },
+ {
+ "epoch": 2.6420454545454546,
+ "grad_norm": 114873.859375,
+ "learning_rate": 1.697443181818182e-05,
+ "loss": 0.1736,
+ "step": 930
+ },
+ {
+ "epoch": 2.6704545454545454,
+ "grad_norm": 191060.78125,
+ "learning_rate": 1.6619318181818185e-05,
+ "loss": 0.1809,
+ "step": 940
+ },
+ {
+ "epoch": 2.6988636363636362,
+ "grad_norm": 303838.96875,
+ "learning_rate": 1.6264204545454544e-05,
+ "loss": 0.238,
+ "step": 950
+ },
+ {
+ "epoch": 2.7272727272727275,
+ "grad_norm": 92415.265625,
+ "learning_rate": 1.590909090909091e-05,
+ "loss": 0.137,
+ "step": 960
+ },
+ {
+ "epoch": 2.7556818181818183,
+ "grad_norm": 227939.296875,
+ "learning_rate": 1.5553977272727273e-05,
+ "loss": 0.1811,
+ "step": 970
+ },
+ {
+ "epoch": 2.784090909090909,
+ "grad_norm": 244860.359375,
+ "learning_rate": 1.5198863636363636e-05,
+ "loss": 0.2235,
+ "step": 980
+ },
+ {
+ "epoch": 2.8125,
+ "grad_norm": 199524.078125,
+ "learning_rate": 1.484375e-05,
+ "loss": 0.1885,
+ "step": 990
+ },
+ {
+ "epoch": 2.840909090909091,
+ "grad_norm": 245456.046875,
+ "learning_rate": 1.4488636363636366e-05,
+ "loss": 0.2261,
+ "step": 1000
+ },
+ {
+ "epoch": 2.8693181818181817,
+ "grad_norm": 291130.96875,
+ "learning_rate": 1.4133522727272727e-05,
+ "loss": 0.1767,
+ "step": 1010
+ },
+ {
+ "epoch": 2.8977272727272725,
+ "grad_norm": 119223.3046875,
+ "learning_rate": 1.3778409090909091e-05,
+ "loss": 0.1589,
+ "step": 1020
+ },
+ {
+ "epoch": 2.9261363636363638,
+ "grad_norm": 205424.078125,
+ "learning_rate": 1.3423295454545456e-05,
+ "loss": 0.1666,
+ "step": 1030
+ },
+ {
+ "epoch": 2.9545454545454546,
+ "grad_norm": 177895.84375,
+ "learning_rate": 1.3068181818181819e-05,
+ "loss": 0.1572,
+ "step": 1040
+ },
+ {
+ "epoch": 2.9829545454545454,
+ "grad_norm": 337598.78125,
+ "learning_rate": 1.2713068181818183e-05,
+ "loss": 0.1938,
+ "step": 1050
+ },
+ {
+ "epoch": 3.0113636363636362,
+ "grad_norm": 173000.0,
+ "learning_rate": 1.2357954545454546e-05,
+ "loss": 0.1126,
+ "step": 1060
+ },
+ {
+ "epoch": 3.039772727272727,
+ "grad_norm": 97144.171875,
+ "learning_rate": 1.200284090909091e-05,
+ "loss": 0.0462,
+ "step": 1070
+ },
+ {
+ "epoch": 3.0681818181818183,
+ "grad_norm": 54899.234375,
+ "learning_rate": 1.1647727272727273e-05,
+ "loss": 0.0615,
+ "step": 1080
+ },
+ {
+ "epoch": 3.096590909090909,
+ "grad_norm": 36492.046875,
+ "learning_rate": 1.1292613636363637e-05,
+ "loss": 0.0491,
+ "step": 1090
+ },
+ {
+ "epoch": 3.125,
+ "grad_norm": 37996.1953125,
+ "learning_rate": 1.09375e-05,
+ "loss": 0.0562,
+ "step": 1100
+ },
+ {
+ "epoch": 3.153409090909091,
+ "grad_norm": 190393.703125,
+ "learning_rate": 1.0582386363636364e-05,
+ "loss": 0.054,
+ "step": 1110
+ },
+ {
+ "epoch": 3.1818181818181817,
+ "grad_norm": 179904.40625,
+ "learning_rate": 1.0227272727272729e-05,
+ "loss": 0.0728,
+ "step": 1120
+ },
+ {
+ "epoch": 3.210227272727273,
+ "grad_norm": 100628.515625,
+ "learning_rate": 9.872159090909091e-06,
+ "loss": 0.0625,
+ "step": 1130
+ },
+ {
+ "epoch": 3.2386363636363638,
+ "grad_norm": 118374.3984375,
+ "learning_rate": 9.517045454545454e-06,
+ "loss": 0.0569,
+ "step": 1140
+ },
+ {
+ "epoch": 3.2670454545454546,
+ "grad_norm": 75175.8359375,
+ "learning_rate": 9.161931818181818e-06,
+ "loss": 0.0436,
+ "step": 1150
+ },
+ {
+ "epoch": 3.2954545454545454,
+ "grad_norm": 158238.78125,
+ "learning_rate": 8.806818181818183e-06,
+ "loss": 0.079,
+ "step": 1160
+ },
+ {
+ "epoch": 3.3238636363636362,
+ "grad_norm": 68349.515625,
+ "learning_rate": 8.451704545454546e-06,
+ "loss": 0.056,
+ "step": 1170
+ },
+ {
+ "epoch": 3.3522727272727275,
+ "grad_norm": 43816.8671875,
+ "learning_rate": 8.09659090909091e-06,
+ "loss": 0.0443,
+ "step": 1180
+ },
+ {
+ "epoch": 3.3806818181818183,
+ "grad_norm": 61632.68359375,
+ "learning_rate": 7.741477272727273e-06,
+ "loss": 0.0554,
+ "step": 1190
+ },
+ {
+ "epoch": 3.409090909090909,
+ "grad_norm": 60831.44140625,
+ "learning_rate": 7.386363636363637e-06,
+ "loss": 0.0871,
+ "step": 1200
+ },
+ {
+ "epoch": 3.409090909090909,
+ "eval_f1": 0.9917230331078676,
+ "eval_loss": 0.0417679101228714,
+ "eval_runtime": 204.1321,
+ "eval_samples_per_second": 55.043,
+ "eval_steps_per_second": 3.444,
+ "step": 1200
+ },
+ {
+ "epoch": 3.4375,
+ "grad_norm": 90207.28125,
+ "learning_rate": 7.031250000000001e-06,
+ "loss": 0.0676,
+ "step": 1210
+ },
+ {
+ "epoch": 3.465909090909091,
+ "grad_norm": 63487.5546875,
+ "learning_rate": 6.676136363636363e-06,
+ "loss": 0.0346,
+ "step": 1220
+ },
+ {
+ "epoch": 3.4943181818181817,
+ "grad_norm": 83902.515625,
+ "learning_rate": 6.321022727272729e-06,
+ "loss": 0.0587,
+ "step": 1230
+ },
+ {
+ "epoch": 3.5227272727272725,
+ "grad_norm": 26082.44921875,
+ "learning_rate": 5.965909090909091e-06,
+ "loss": 0.0385,
+ "step": 1240
+ },
+ {
+ "epoch": 3.5511363636363638,
+ "grad_norm": 71738.4140625,
+ "learning_rate": 5.610795454545455e-06,
+ "loss": 0.0497,
+ "step": 1250
+ },
+ {
+ "epoch": 3.5795454545454546,
+ "grad_norm": 115759.3671875,
+ "learning_rate": 5.255681818181818e-06,
+ "loss": 0.0679,
+ "step": 1260
+ },
+ {
+ "epoch": 3.6079545454545454,
+ "grad_norm": 49416.74609375,
+ "learning_rate": 4.900568181818182e-06,
+ "loss": 0.0565,
+ "step": 1270
+ },
+ {
+ "epoch": 3.6363636363636362,
+ "grad_norm": 164339.484375,
+ "learning_rate": 4.5454545454545455e-06,
+ "loss": 0.0374,
+ "step": 1280
+ },
+ {
+ "epoch": 3.6647727272727275,
+ "grad_norm": 74746.796875,
+ "learning_rate": 4.190340909090909e-06,
+ "loss": 0.0382,
+ "step": 1290
+ },
+ {
+ "epoch": 3.6931818181818183,
+ "grad_norm": 29929.04296875,
+ "learning_rate": 3.835227272727273e-06,
+ "loss": 0.039,
+ "step": 1300
+ },
+ {
+ "epoch": 3.721590909090909,
+ "grad_norm": 59106.06640625,
+ "learning_rate": 3.480113636363636e-06,
+ "loss": 0.0376,
+ "step": 1310
+ },
+ {
+ "epoch": 3.75,
+ "grad_norm": 187797.71875,
+ "learning_rate": 3.125e-06,
+ "loss": 0.056,
+ "step": 1320
+ },
+ {
+ "epoch": 3.778409090909091,
+ "grad_norm": 42829.46875,
+ "learning_rate": 2.7698863636363637e-06,
+ "loss": 0.0434,
+ "step": 1330
+ },
+ {
+ "epoch": 3.8068181818181817,
+ "grad_norm": 252679.109375,
+ "learning_rate": 2.4147727272727273e-06,
+ "loss": 0.0502,
+ "step": 1340
+ },
+ {
+ "epoch": 3.8352272727272725,
+ "grad_norm": 35090.86328125,
+ "learning_rate": 2.059659090909091e-06,
+ "loss": 0.0686,
+ "step": 1350
+ },
+ {
+ "epoch": 3.8636363636363638,
+ "grad_norm": 287442.9375,
+ "learning_rate": 1.7045454545454546e-06,
+ "loss": 0.0579,
+ "step": 1360
+ },
+ {
+ "epoch": 3.8920454545454546,
+ "grad_norm": 241179.890625,
+ "learning_rate": 1.3494318181818183e-06,
+ "loss": 0.065,
+ "step": 1370
+ },
+ {
+ "epoch": 3.9204545454545454,
+ "grad_norm": 20388.59765625,
+ "learning_rate": 9.943181818181819e-07,
+ "loss": 0.0281,
+ "step": 1380
+ },
+ {
+ "epoch": 3.9488636363636362,
+ "grad_norm": 44893.046875,
+ "learning_rate": 6.392045454545455e-07,
+ "loss": 0.0297,
+ "step": 1390
+ },
+ {
+ "epoch": 3.9772727272727275,
+ "grad_norm": 30813.0546875,
+ "learning_rate": 2.840909090909091e-07,
+ "loss": 0.048,
+ "step": 1400
+ },
+ {
+ "epoch": 4.0,
+ "step": 1408,
+ "total_flos": 3.4828624117074493e+18,
+ "train_loss": 0.40373469023457303,
+ "train_runtime": 1995.1511,
+ "train_samples_per_second": 22.527,
+ "train_steps_per_second": 0.706
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1408,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 400,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.4828624117074493e+18,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:93c94b1e2af28ebd0231de3ea12cc2305ae0723ff20e56cce8aa6dee01ecbdfc
+ oid sha256:0b584c7abff93919a16bdd8a77a0bf32e568b3b8bfccee1f93593599a2675fc3
  size 5240