AlekseyKorshuk commited on
Commit
1b6f2d8
1 Parent(s): 59e1032

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/morgenshtern")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/26pogqcg/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on MORGENSHTERN's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/2f54nbz5) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/2f54nbz5/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
 
45
  dataset = load_dataset("huggingartists/morgenshtern")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2g9p829k/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on MORGENSHTERN's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/3cvafvz3) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/3cvafvz3/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 1.004758358001709, "eval_runtime": 6.3083, "eval_samples_per_second": 20.766, "eval_steps_per_second": 2.695, "epoch": 14.0}
 
1
+ {"eval_loss": 1.0789222717285156, "eval_runtime": 7.246, "eval_samples_per_second": 20.701, "eval_steps_per_second": 2.622, "epoch": 16.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73f4525a915f9e923ec8c1948b7703789a43fc65ae0c0ce7a70b29cc32f99056
3
  size 497764120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9654bfbbbc16c001bf3ca7f1bde7b170d24b36f207e71c7d64f5e319546120e9
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3189c4875db3d6c2430dd21cc7aaf58494cb3ea16a1644a9eba14559109a180
3
  size 995604017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d310145beb326e0a9d566eb5622927be8d76bfb9430b33c32dd51b5c7a06790
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77256682836976672d4132cd08e64ed8578aca99003f5b1ad793033c0c217d56
3
  size 510403817
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec149b6dfd697bf23788055aefe01a7ddb91b8831442e474a5c68c182988271b
3
  size 510403817
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c58bc32277fe141391df912d7cedc705240c20ad9546d00ca7addc7fbb5b47a9
3
- size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:160d145d45171c3680d1a0f735e7dba3ead27f25402db68bfdfe08b789e7823a
3
+ size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7618b7dada376a378c38d9aa39cd28c5eded00be56339f85d3ea9724b270283c
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82347c45c6ff6d74c3b0b79bef318683f5527c56da1c685e3b3ba8d34edddd5
3
  size 623
trainer_state.json CHANGED
@@ -1,1728 +1,138 @@
1
  {
2
- "best_metric": 1.004758358001709,
3
- "best_model_checkpoint": "output/morgenshtern/checkpoint-1300",
4
- "epoch": 13.0,
5
- "global_step": 1300,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.05,
12
- "learning_rate": 0.00013638815138477438,
13
- "loss": 2.534,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.1,
18
- "learning_rate": 0.00013397182122930294,
19
- "loss": 2.1207,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.15,
24
- "learning_rate": 0.0001300082017869573,
25
- "loss": 1.9437,
26
  "step": 15
27
  },
28
  {
29
- "epoch": 0.2,
30
- "learning_rate": 0.00012459110818763681,
31
- "loss": 2.0463,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 0.2,
36
- "eval_loss": 1.9048744440078735,
37
- "eval_runtime": 2.5437,
38
- "eval_samples_per_second": 44.03,
39
- "eval_steps_per_second": 5.504,
40
- "step": 20
41
- },
42
- {
43
- "epoch": 0.25,
44
- "learning_rate": 0.00011784875792222071,
45
- "loss": 1.953,
46
  "step": 25
47
  },
48
  {
49
- "epoch": 0.29,
50
- "learning_rate": 0.000109940736055617,
51
- "loss": 2.0751,
52
  "step": 30
53
  },
54
  {
55
- "epoch": 0.34,
56
- "learning_rate": 0.0001010542179989503,
57
- "loss": 1.9436,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.39,
62
- "learning_rate": 9.139953924430466e-05,
63
- "loss": 1.8745,
64
- "step": 40
65
- },
66
- {
67
- "epoch": 0.39,
68
- "eval_loss": 1.7966899871826172,
69
- "eval_runtime": 2.5987,
70
- "eval_samples_per_second": 43.099,
71
- "eval_steps_per_second": 5.387,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.44,
76
- "learning_rate": 8.120521692221673e-05,
77
- "loss": 1.8989,
78
  "step": 45
79
  },
80
  {
81
- "epoch": 0.49,
82
- "learning_rate": 7.071254101695329e-05,
83
- "loss": 1.8346,
84
  "step": 50
85
  },
86
  {
87
- "epoch": 0.54,
88
- "learning_rate": 6.016986326040062e-05,
89
- "loss": 1.8576,
90
  "step": 55
91
  },
92
  {
93
- "epoch": 0.59,
94
- "learning_rate": 4.982671888105512e-05,
95
- "loss": 1.8876,
96
- "step": 60
97
- },
98
- {
99
- "epoch": 0.59,
100
- "eval_loss": 1.760327935218811,
101
- "eval_runtime": 2.6932,
102
- "eval_samples_per_second": 41.586,
103
- "eval_steps_per_second": 5.198,
104
  "step": 60
105
  },
106
  {
107
- "epoch": 0.64,
108
- "learning_rate": 3.992792034076668e-05,
109
- "loss": 1.7386,
110
  "step": 65
111
  },
112
  {
113
- "epoch": 0.69,
114
- "learning_rate": 3.0707762854909304e-05,
115
- "loss": 1.9309,
116
  "step": 70
117
  },
118
  {
119
- "epoch": 0.74,
120
- "learning_rate": 2.2384478845846175e-05,
121
- "loss": 1.6283,
122
  "step": 75
123
  },
124
  {
125
- "epoch": 0.78,
126
- "learning_rate": 1.5155072587539005e-05,
127
- "loss": 1.9742,
128
- "step": 80
129
- },
130
- {
131
- "epoch": 0.78,
132
- "eval_loss": 1.7276561260223389,
133
- "eval_runtime": 2.6936,
134
- "eval_samples_per_second": 41.581,
135
- "eval_steps_per_second": 5.198,
136
  "step": 80
137
  },
138
- {
139
- "epoch": 0.83,
140
- "learning_rate": 9.190657300387505e-06,
141
- "loss": 1.8267,
142
- "step": 85
143
- },
144
  {
145
  "epoch": 0.88,
146
- "learning_rate": 4.6324050628611986e-06,
147
- "loss": 1.7887,
148
- "step": 90
149
  },
150
  {
151
  "epoch": 0.93,
152
- "learning_rate": 1.5882054016913933e-06,
153
- "loss": 1.8381,
154
- "step": 95
155
- },
156
- {
157
- "epoch": 0.98,
158
- "learning_rate": 1.3011164863877445e-07,
159
- "loss": 1.8954,
160
- "step": 100
161
  },
162
  {
163
  "epoch": 0.98,
164
- "eval_loss": 1.7208999395370483,
165
- "eval_runtime": 2.7574,
166
- "eval_samples_per_second": 40.618,
167
- "eval_steps_per_second": 5.077,
168
- "step": 100
169
- },
170
- {
171
- "epoch": 0.91,
172
- "learning_rate": 3.0216830127274476e-06,
173
- "loss": 1.8376,
174
- "step": 105
175
- },
176
- {
177
- "epoch": 0.95,
178
- "learning_rate": 9.037005536513067e-07,
179
- "loss": 1.7024,
180
- "step": 110
181
- },
182
- {
183
- "epoch": 0.99,
184
- "learning_rate": 2.515656508272057e-08,
185
- "loss": 1.7911,
186
- "step": 115
187
- },
188
- {
189
- "epoch": 1.03,
190
- "learning_rate": 4.0213613921093164e-07,
191
- "loss": 1.8512,
192
- "step": 120
193
- },
194
- {
195
- "epoch": 1.08,
196
- "learning_rate": 2.0277372298297e-06,
197
- "loss": 1.7573,
198
- "step": 125
199
- },
200
- {
201
- "epoch": 1.12,
202
- "learning_rate": 4.8721970205680935e-06,
203
- "loss": 1.7902,
204
- "step": 130
205
- },
206
- {
207
- "epoch": 1.16,
208
- "learning_rate": 8.88343684654658e-06,
209
- "loss": 1.7602,
210
- "step": 135
211
- },
212
- {
213
- "epoch": 1.21,
214
- "learning_rate": 1.3988015692592823e-05,
215
- "loss": 1.8606,
216
- "step": 140
217
- },
218
- {
219
- "epoch": 1.25,
220
- "learning_rate": 2.009247481060283e-05,
221
- "loss": 1.6102,
222
- "step": 145
223
- },
224
- {
225
- "epoch": 1.29,
226
- "learning_rate": 2.708504883770769e-05,
227
- "loss": 1.8574,
228
- "step": 150
229
- },
230
- {
231
- "epoch": 1.34,
232
- "learning_rate": 3.483771208671411e-05,
233
- "loss": 1.6927,
234
- "step": 155
235
- },
236
- {
237
- "epoch": 1.38,
238
- "learning_rate": 4.320852254368187e-05,
239
- "loss": 1.7203,
240
- "step": 160
241
- },
242
- {
243
- "epoch": 1.42,
244
- "learning_rate": 5.204422065684016e-05,
245
- "loss": 1.8592,
246
- "step": 165
247
- },
248
- {
249
- "epoch": 1.47,
250
- "learning_rate": 6.118303533611755e-05,
251
- "loss": 1.7338,
252
- "step": 170
253
- },
254
- {
255
- "epoch": 1.51,
256
- "learning_rate": 7.045764578878282e-05,
257
- "loss": 1.7386,
258
- "step": 175
259
- },
260
- {
261
- "epoch": 1.55,
262
- "learning_rate": 7.969824496351964e-05,
263
- "loss": 1.6874,
264
- "step": 180
265
- },
266
- {
267
- "epoch": 1.59,
268
- "learning_rate": 8.873564851492995e-05,
269
- "loss": 1.8691,
270
- "step": 185
271
- },
272
- {
273
- "epoch": 1.64,
274
- "learning_rate": 9.740439236703416e-05,
275
- "loss": 1.7808,
276
- "step": 190
277
- },
278
- {
279
- "epoch": 1.68,
280
- "learning_rate": 0.00010554576216307802,
281
- "loss": 1.8296,
282
- "step": 195
283
- },
284
- {
285
- "epoch": 1.72,
286
- "learning_rate": 0.00011301069913603334,
287
- "loss": 1.737,
288
- "step": 200
289
- },
290
- {
291
- "epoch": 1.77,
292
- "learning_rate": 0.0001196625291967717,
293
- "loss": 1.8,
294
- "step": 205
295
- },
296
- {
297
- "epoch": 1.81,
298
- "learning_rate": 0.00012537946527356269,
299
- "loss": 1.6787,
300
- "step": 210
301
- },
302
- {
303
- "epoch": 1.85,
304
- "learning_rate": 0.000130056837088046,
305
- "loss": 1.664,
306
- "step": 215
307
- },
308
- {
309
- "epoch": 1.9,
310
- "learning_rate": 0.00013360900754314024,
311
- "loss": 1.5839,
312
- "step": 220
313
- },
314
- {
315
- "epoch": 1.94,
316
- "learning_rate": 0.0001359709406361119,
317
- "loss": 1.8525,
318
- "step": 225
319
- },
320
- {
321
- "epoch": 1.98,
322
- "learning_rate": 0.0001370993921901871,
323
- "loss": 1.7228,
324
- "step": 230
325
- },
326
- {
327
- "epoch": 2.4,
328
- "learning_rate": 9.021642375642038e-05,
329
- "loss": 1.6079,
330
- "step": 235
331
- },
332
- {
333
- "epoch": 2.45,
334
- "learning_rate": 7.954855279928984e-05,
335
- "loss": 1.6691,
336
- "step": 240
337
- },
338
- {
339
- "epoch": 2.5,
340
- "learning_rate": 6.860000000000001e-05,
341
- "loss": 1.7047,
342
- "step": 245
343
- },
344
- {
345
- "epoch": 2.55,
346
- "learning_rate": 5.765144720071019e-05,
347
- "loss": 1.6921,
348
- "step": 250
349
- },
350
- {
351
- "epoch": 2.6,
352
- "learning_rate": 4.698357624357961e-05,
353
- "loss": 1.5894,
354
- "step": 255
355
- },
356
- {
357
- "epoch": 2.65,
358
- "learning_rate": 3.686987328947878e-05,
359
- "loss": 1.6388,
360
- "step": 260
361
- },
362
- {
363
- "epoch": 2.7,
364
- "learning_rate": 2.7569617608302645e-05,
365
- "loss": 1.6748,
366
- "step": 265
367
- },
368
- {
369
- "epoch": 2.76,
370
- "learning_rate": 1.932123458329584e-05,
371
- "loss": 1.6765,
372
- "step": 270
373
- },
374
- {
375
- "epoch": 2.81,
376
- "learning_rate": 1.233618333464885e-05,
377
- "loss": 1.6658,
378
- "step": 275
379
- },
380
- {
381
- "epoch": 2.86,
382
- "learning_rate": 6.793535661894062e-06,
383
- "loss": 1.5677,
384
- "step": 280
385
- },
386
- {
387
- "epoch": 2.91,
388
- "learning_rate": 2.8353852816850843e-06,
389
- "loss": 1.6118,
390
- "step": 285
391
- },
392
- {
393
- "epoch": 2.96,
394
- "learning_rate": 5.632050517253132e-07,
395
- "loss": 1.552,
396
- "step": 290
397
- },
398
- {
399
- "epoch": 3.0,
400
- "eval_loss": 1.546966552734375,
401
- "eval_runtime": 6.307,
402
- "eval_samples_per_second": 22.99,
403
- "eval_steps_per_second": 3.013,
404
- "step": 294
405
- },
406
- {
407
- "epoch": 3.01,
408
- "learning_rate": 3.52455686328105e-08,
409
- "loss": 1.4805,
410
- "step": 295
411
- },
412
- {
413
- "epoch": 3.06,
414
- "learning_rate": 1.2650418304129032e-06,
415
- "loss": 1.6327,
416
- "step": 300
417
- },
418
- {
419
- "epoch": 3.11,
420
- "learning_rate": 4.221066247386418e-06,
421
- "loss": 1.7416,
422
- "step": 305
423
- },
424
- {
425
- "epoch": 3.16,
426
- "learning_rate": 8.827536897135471e-06,
427
- "loss": 1.6078,
428
- "step": 310
429
- },
430
- {
431
- "epoch": 3.21,
432
- "learning_rate": 1.496636030269314e-05,
433
- "loss": 1.5727,
434
- "step": 315
435
- },
436
- {
437
- "epoch": 3.27,
438
- "learning_rate": 2.2480158928073662e-05,
439
- "loss": 1.6548,
440
- "step": 320
441
- },
442
- {
443
- "epoch": 3.32,
444
- "learning_rate": 3.117630577695637e-05,
445
- "loss": 1.596,
446
- "step": 325
447
- },
448
- {
449
- "epoch": 3.37,
450
- "learning_rate": 4.08318626618038e-05,
451
- "loss": 1.4367,
452
- "step": 330
453
- },
454
- {
455
- "epoch": 3.42,
456
- "learning_rate": 5.119929554380771e-05,
457
- "loss": 1.621,
458
- "step": 335
459
- },
460
- {
461
- "epoch": 3.47,
462
- "learning_rate": 6.201282042273297e-05,
463
- "loss": 1.7111,
464
- "step": 340
465
- },
466
- {
467
- "epoch": 3.52,
468
- "learning_rate": 7.299521709067686e-05,
469
- "loss": 1.6959,
470
- "step": 345
471
- },
472
- {
473
- "epoch": 3.57,
474
- "learning_rate": 8.386493606940314e-05,
475
- "loss": 1.7452,
476
- "step": 350
477
- },
478
- {
479
- "epoch": 3.62,
480
- "learning_rate": 9.434331653472505e-05,
481
- "loss": 1.6352,
482
- "step": 355
483
- },
484
- {
485
- "epoch": 3.67,
486
- "learning_rate": 0.00010416173018610202,
487
- "loss": 1.5285,
488
- "step": 360
489
- },
490
- {
491
- "epoch": 3.72,
492
- "learning_rate": 0.00011306846791811431,
493
- "loss": 1.7379,
494
- "step": 365
495
- },
496
- {
497
- "epoch": 3.78,
498
- "learning_rate": 0.00012083519274412256,
499
- "loss": 1.7212,
500
- "step": 370
501
- },
502
- {
503
- "epoch": 3.83,
504
- "learning_rate": 0.0001272627935421667,
505
- "loss": 1.6984,
506
- "step": 375
507
- },
508
- {
509
- "epoch": 3.88,
510
- "learning_rate": 0.00013218648955393709,
511
- "loss": 1.5563,
512
- "step": 380
513
- },
514
- {
515
- "epoch": 3.93,
516
- "learning_rate": 0.0001354800547756731,
517
- "loss": 1.6133,
518
- "step": 385
519
- },
520
- {
521
- "epoch": 3.98,
522
- "learning_rate": 0.00013705905394267309,
523
- "loss": 1.6524,
524
- "step": 390
525
- },
526
- {
527
- "epoch": 4.0,
528
- "eval_loss": 1.4850682020187378,
529
- "eval_runtime": 6.9944,
530
- "eval_samples_per_second": 21.303,
531
- "eval_steps_per_second": 2.716,
532
- "step": 392
533
- },
534
- {
535
- "epoch": 4.03,
536
- "learning_rate": 0.000136883007148315,
537
- "loss": 1.6487,
538
- "step": 395
539
- },
540
- {
541
- "epoch": 4.08,
542
- "learning_rate": 0.00013495642760447742,
543
- "loss": 1.5179,
544
- "step": 400
545
- },
546
- {
547
- "epoch": 4.13,
548
- "learning_rate": 0.00013132870593888477,
549
- "loss": 1.3698,
550
- "step": 405
551
- },
552
- {
553
- "epoch": 4.18,
554
- "learning_rate": 0.00012609284399558033,
555
- "loss": 1.6439,
556
- "step": 410
557
- },
558
- {
559
- "epoch": 4.23,
560
- "learning_rate": 0.00011938307059936668,
561
- "loss": 1.5471,
562
- "step": 415
563
- },
564
- {
565
- "epoch": 4.29,
566
- "learning_rate": 0.00011137140040750914,
567
- "loss": 1.6368,
568
- "step": 420
569
- },
570
- {
571
- "epoch": 4.34,
572
- "learning_rate": 0.00010226322406747016,
573
- "loss": 1.5678,
574
- "step": 425
575
- },
576
- {
577
- "epoch": 4.39,
578
- "learning_rate": 9.229204273330182e-05,
579
- "loss": 1.6766,
580
- "step": 430
581
- },
582
- {
583
- "epoch": 4.44,
584
- "learning_rate": 8.171348192891424e-05,
585
- "loss": 1.6611,
586
- "step": 435
587
- },
588
- {
589
- "epoch": 4.49,
590
- "learning_rate": 7.079873822141565e-05,
591
- "loss": 1.6183,
592
- "step": 440
593
- },
594
- {
595
- "epoch": 4.54,
596
- "learning_rate": 5.982762670844296e-05,
597
- "loss": 1.5438,
598
- "step": 445
599
- },
600
- {
601
- "epoch": 4.59,
602
- "learning_rate": 4.908140755711112e-05,
603
- "loss": 1.6215,
604
- "step": 450
605
- },
606
- {
607
- "epoch": 4.64,
608
- "learning_rate": 3.883557549653544e-05,
609
- "loss": 1.5207,
610
- "step": 455
611
- },
612
- {
613
- "epoch": 4.69,
614
- "learning_rate": 2.9352797115619177e-05,
615
- "loss": 1.4362,
616
- "step": 460
617
- },
618
- {
619
- "epoch": 4.74,
620
- "learning_rate": 2.0876177028600835e-05,
621
- "loss": 1.5917,
622
- "step": 465
623
- },
624
- {
625
- "epoch": 4.8,
626
- "learning_rate": 1.3623025539858168e-05,
627
- "loss": 1.4801,
628
- "step": 470
629
- },
630
- {
631
- "epoch": 4.85,
632
- "learning_rate": 7.779287582812185e-06,
633
- "loss": 1.4835,
634
- "step": 475
635
- },
636
- {
637
- "epoch": 4.9,
638
- "learning_rate": 3.494775755068154e-06,
639
- "loss": 1.5351,
640
- "step": 480
641
- },
642
- {
643
- "epoch": 4.95,
644
- "learning_rate": 8.793296577687332e-07,
645
- "loss": 1.4665,
646
- "step": 485
647
- },
648
- {
649
- "epoch": 5.0,
650
- "learning_rate": 0.0,
651
- "loss": 1.5628,
652
- "step": 490
653
- },
654
- {
655
- "epoch": 5.0,
656
- "eval_loss": 1.417140007019043,
657
- "eval_runtime": 6.9215,
658
- "eval_samples_per_second": 21.527,
659
- "eval_steps_per_second": 2.745,
660
- "step": 490
661
- },
662
- {
663
- "epoch": 5.05,
664
- "learning_rate": 8.79329657768718e-07,
665
- "loss": 1.3775,
666
- "step": 495
667
- },
668
- {
669
- "epoch": 5.1,
670
- "learning_rate": 3.4947757550681237e-06,
671
- "loss": 1.5294,
672
- "step": 500
673
- },
674
- {
675
- "epoch": 5.15,
676
- "learning_rate": 7.7792875828122e-06,
677
- "loss": 1.436,
678
- "step": 505
679
- },
680
- {
681
- "epoch": 5.2,
682
- "learning_rate": 1.3623025539858192e-05,
683
- "loss": 1.5282,
684
- "step": 510
685
- },
686
- {
687
- "epoch": 5.26,
688
- "learning_rate": 2.087617702860069e-05,
689
- "loss": 1.5277,
690
- "step": 515
691
- },
692
- {
693
- "epoch": 5.31,
694
- "learning_rate": 2.9352797115619008e-05,
695
- "loss": 1.6218,
696
- "step": 520
697
- },
698
- {
699
- "epoch": 5.36,
700
- "learning_rate": 3.8835575496535365e-05,
701
- "loss": 1.4628,
702
- "step": 525
703
- },
704
- {
705
- "epoch": 5.41,
706
- "learning_rate": 4.9081407557111025e-05,
707
- "loss": 1.4702,
708
- "step": 530
709
- },
710
- {
711
- "epoch": 5.46,
712
- "learning_rate": 5.9827626708442996e-05,
713
- "loss": 1.3757,
714
- "step": 535
715
- },
716
- {
717
- "epoch": 5.51,
718
- "learning_rate": 7.079873822141567e-05,
719
- "loss": 1.3535,
720
- "step": 540
721
- },
722
- {
723
- "epoch": 5.56,
724
- "learning_rate": 8.171348192891405e-05,
725
- "loss": 1.4668,
726
- "step": 545
727
- },
728
- {
729
- "epoch": 5.61,
730
- "learning_rate": 9.229204273330163e-05,
731
- "loss": 1.4463,
732
- "step": 550
733
- },
734
- {
735
- "epoch": 5.66,
736
- "learning_rate": 0.00010226322406747008,
737
- "loss": 1.3922,
738
- "step": 555
739
- },
740
- {
741
- "epoch": 5.71,
742
- "learning_rate": 0.00011137140040750908,
743
- "loss": 1.4607,
744
- "step": 560
745
- },
746
- {
747
- "epoch": 5.77,
748
- "learning_rate": 0.00011938307059936662,
749
- "loss": 1.533,
750
- "step": 565
751
- },
752
- {
753
- "epoch": 5.82,
754
- "learning_rate": 0.00012609284399558025,
755
- "loss": 1.5392,
756
- "step": 570
757
- },
758
- {
759
- "epoch": 5.87,
760
- "learning_rate": 0.00013132870593888474,
761
- "loss": 1.6347,
762
- "step": 575
763
- },
764
- {
765
- "epoch": 5.92,
766
- "learning_rate": 0.00013495642760447742,
767
- "loss": 1.6022,
768
- "step": 580
769
- },
770
- {
771
- "epoch": 5.97,
772
- "learning_rate": 0.000136883007148315,
773
- "loss": 1.4927,
774
- "step": 585
775
- },
776
- {
777
- "epoch": 6.0,
778
- "eval_loss": 1.4285378456115723,
779
- "eval_runtime": 6.9515,
780
- "eval_samples_per_second": 21.434,
781
- "eval_steps_per_second": 2.733,
782
- "step": 588
783
- },
784
- {
785
- "epoch": 6.02,
786
- "learning_rate": 0.00013705905394267309,
787
- "loss": 1.415,
788
- "step": 590
789
- },
790
- {
791
- "epoch": 6.07,
792
- "learning_rate": 0.00013548005477567314,
793
- "loss": 1.4139,
794
- "step": 595
795
- },
796
- {
797
- "epoch": 6.12,
798
- "learning_rate": 0.00013218648955393714,
799
- "loss": 1.522,
800
- "step": 600
801
- },
802
- {
803
- "epoch": 6.17,
804
- "learning_rate": 0.00012726279354216682,
805
- "loss": 1.4313,
806
- "step": 605
807
- },
808
- {
809
- "epoch": 6.22,
810
- "learning_rate": 0.0001208351927441227,
811
- "loss": 1.3831,
812
- "step": 610
813
- },
814
- {
815
- "epoch": 6.28,
816
- "learning_rate": 0.00011306846791811419,
817
- "loss": 1.4225,
818
- "step": 615
819
- },
820
- {
821
- "epoch": 6.33,
822
- "learning_rate": 0.0001041617301861021,
823
- "loss": 1.5253,
824
- "step": 620
825
- },
826
- {
827
- "epoch": 6.38,
828
- "learning_rate": 9.434331653472514e-05,
829
- "loss": 1.3459,
830
- "step": 625
831
- },
832
- {
833
- "epoch": 6.43,
834
- "learning_rate": 8.386493606940322e-05,
835
- "loss": 1.3962,
836
- "step": 630
837
- },
838
- {
839
- "epoch": 6.48,
840
- "learning_rate": 7.299521709067695e-05,
841
- "loss": 1.3858,
842
- "step": 635
843
- },
844
- {
845
- "epoch": 6.53,
846
- "learning_rate": 6.201282042273305e-05,
847
- "loss": 1.4634,
848
- "step": 640
849
- },
850
- {
851
- "epoch": 6.58,
852
- "learning_rate": 5.11992955438078e-05,
853
- "loss": 1.4289,
854
- "step": 645
855
- },
856
- {
857
- "epoch": 6.63,
858
- "learning_rate": 4.0831862661803776e-05,
859
- "loss": 1.4365,
860
- "step": 650
861
- },
862
- {
863
- "epoch": 6.68,
864
- "learning_rate": 3.1176305776956335e-05,
865
- "loss": 1.4483,
866
- "step": 655
867
- },
868
- {
869
- "epoch": 6.73,
870
- "learning_rate": 2.248015892807363e-05,
871
- "loss": 1.387,
872
- "step": 660
873
- },
874
- {
875
- "epoch": 6.79,
876
- "learning_rate": 1.496636030269327e-05,
877
- "loss": 1.4562,
878
- "step": 665
879
- },
880
- {
881
- "epoch": 6.84,
882
- "learning_rate": 8.827536897135571e-06,
883
- "loss": 1.4136,
884
- "step": 670
885
- },
886
- {
887
- "epoch": 6.89,
888
- "learning_rate": 4.221066247386487e-06,
889
- "loss": 1.3972,
890
- "step": 675
891
- },
892
- {
893
- "epoch": 6.94,
894
- "learning_rate": 1.2650418304129413e-06,
895
- "loss": 1.4283,
896
- "step": 680
897
- },
898
- {
899
- "epoch": 6.99,
900
- "learning_rate": 3.5245568632818114e-08,
901
- "loss": 1.4148,
902
- "step": 685
903
- },
904
- {
905
- "epoch": 7.0,
906
- "eval_loss": 1.388408899307251,
907
- "eval_runtime": 6.9772,
908
- "eval_samples_per_second": 21.355,
909
- "eval_steps_per_second": 2.723,
910
- "step": 686
911
- },
912
- {
913
- "epoch": 7.04,
914
- "learning_rate": 5.632050517253056e-07,
915
- "loss": 1.4365,
916
- "step": 690
917
- },
918
- {
919
- "epoch": 7.09,
920
- "learning_rate": 2.8353852816850615e-06,
921
- "loss": 1.2923,
922
- "step": 695
923
- },
924
- {
925
- "epoch": 7.14,
926
- "learning_rate": 6.793535661894024e-06,
927
- "loss": 1.3917,
928
- "step": 700
929
- },
930
- {
931
- "epoch": 7.19,
932
- "learning_rate": 1.2336183334648805e-05,
933
- "loss": 1.4588,
934
- "step": 705
935
- },
936
- {
937
- "epoch": 7.24,
938
- "learning_rate": 1.932123458329587e-05,
939
- "loss": 1.3238,
940
- "step": 710
941
- },
942
- {
943
- "epoch": 7.3,
944
- "learning_rate": 2.7569617608302577e-05,
945
- "loss": 1.3604,
946
- "step": 715
947
- },
948
- {
949
- "epoch": 7.35,
950
- "learning_rate": 3.686987328947871e-05,
951
- "loss": 1.3916,
952
- "step": 720
953
- },
954
- {
955
- "epoch": 7.4,
956
- "learning_rate": 4.698357624357965e-05,
957
- "loss": 1.3839,
958
- "step": 725
959
- },
960
- {
961
- "epoch": 7.45,
962
- "learning_rate": 5.7651447200710234e-05,
963
- "loss": 1.3886,
964
- "step": 730
965
- },
966
- {
967
- "epoch": 7.5,
968
- "learning_rate": 6.859999999999982e-05,
969
- "loss": 1.3475,
970
- "step": 735
971
- },
972
- {
973
- "epoch": 7.55,
974
- "learning_rate": 7.954855279928965e-05,
975
- "loss": 1.3304,
976
- "step": 740
977
- },
978
- {
979
- "epoch": 7.6,
980
- "learning_rate": 9.021642375642024e-05,
981
- "loss": 1.3287,
982
- "step": 745
983
- },
984
- {
985
- "epoch": 7.65,
986
- "learning_rate": 0.00010033012671052118,
987
- "loss": 1.4354,
988
- "step": 750
989
- },
990
- {
991
- "epoch": 7.7,
992
- "learning_rate": 0.00010963038239169733,
993
- "loss": 1.3597,
994
- "step": 755
995
- },
996
- {
997
- "epoch": 7.76,
998
- "learning_rate": 0.00011787876541670406,
999
- "loss": 1.3221,
1000
- "step": 760
1001
- },
1002
- {
1003
- "epoch": 7.81,
1004
- "learning_rate": 0.00012486381666535114,
1005
- "loss": 1.3458,
1006
- "step": 765
1007
- },
1008
- {
1009
- "epoch": 7.86,
1010
- "learning_rate": 0.00013040646433810593,
1011
- "loss": 1.4032,
1012
- "step": 770
1013
- },
1014
- {
1015
- "epoch": 7.91,
1016
- "learning_rate": 0.00013436461471831492,
1017
- "loss": 1.403,
1018
- "step": 775
1019
- },
1020
- {
1021
- "epoch": 7.96,
1022
- "learning_rate": 0.00013663679494827467,
1023
- "loss": 1.4234,
1024
- "step": 780
1025
- },
1026
- {
1027
- "epoch": 8.0,
1028
- "eval_loss": 1.4104630947113037,
1029
- "eval_runtime": 6.9539,
1030
- "eval_samples_per_second": 21.427,
1031
- "eval_steps_per_second": 2.732,
1032
- "step": 784
1033
- },
1034
- {
1035
- "epoch": 8.01,
1036
- "learning_rate": 0.0001371647544313672,
1037
- "loss": 1.1334,
1038
- "step": 785
1039
- },
1040
- {
1041
- "epoch": 8.06,
1042
- "learning_rate": 0.0001359349581695871,
1043
- "loss": 1.2244,
1044
- "step": 790
1045
- },
1046
- {
1047
- "epoch": 8.11,
1048
- "learning_rate": 0.00013297893375261365,
1049
- "loss": 1.361,
1050
- "step": 795
1051
- },
1052
- {
1053
- "epoch": 8.16,
1054
- "learning_rate": 0.00012837246310286448,
1055
- "loss": 1.3544,
1056
- "step": 800
1057
- },
1058
- {
1059
- "epoch": 8.21,
1060
- "learning_rate": 0.00012223363969730697,
1061
- "loss": 1.1642,
1062
- "step": 805
1063
- },
1064
- {
1065
- "epoch": 8.27,
1066
- "learning_rate": 0.00011471984107192647,
1067
- "loss": 1.3799,
1068
- "step": 810
1069
- },
1070
- {
1071
- "epoch": 8.32,
1072
- "learning_rate": 0.00010602369422304377,
1073
- "loss": 1.3705,
1074
- "step": 815
1075
- },
1076
- {
1077
- "epoch": 8.37,
1078
- "learning_rate": 9.636813733819635e-05,
1079
- "loss": 1.2818,
1080
- "step": 820
1081
- },
1082
- {
1083
- "epoch": 8.42,
1084
- "learning_rate": 8.600070445619209e-05,
1085
- "loss": 1.3989,
1086
- "step": 825
1087
- },
1088
- {
1089
- "epoch": 8.47,
1090
- "learning_rate": 7.518717957726708e-05,
1091
- "loss": 1.2671,
1092
- "step": 830
1093
- },
1094
- {
1095
- "epoch": 8.52,
1096
- "learning_rate": 6.420478290932294e-05,
1097
- "loss": 1.359,
1098
- "step": 835
1099
- },
1100
- {
1101
- "epoch": 8.57,
1102
- "learning_rate": 5.33350639305969e-05,
1103
- "loss": 1.3868,
1104
- "step": 840
1105
- },
1106
- {
1107
- "epoch": 8.62,
1108
- "learning_rate": 4.285668346527499e-05,
1109
- "loss": 1.4007,
1110
- "step": 845
1111
- },
1112
- {
1113
- "epoch": 8.67,
1114
- "learning_rate": 3.3038269813898015e-05,
1115
- "loss": 1.351,
1116
- "step": 850
1117
- },
1118
- {
1119
- "epoch": 8.72,
1120
- "learning_rate": 2.413153208188573e-05,
1121
- "loss": 1.317,
1122
- "step": 855
1123
- },
1124
- {
1125
- "epoch": 8.78,
1126
- "learning_rate": 1.636480725587754e-05,
1127
- "loss": 1.2524,
1128
- "step": 860
1129
- },
1130
- {
1131
- "epoch": 8.83,
1132
- "learning_rate": 9.937206457833243e-06,
1133
- "loss": 1.3045,
1134
- "step": 865
1135
- },
1136
- {
1137
- "epoch": 8.88,
1138
- "learning_rate": 5.013510446062984e-06,
1139
- "loss": 1.4154,
1140
- "step": 870
1141
- },
1142
- {
1143
- "epoch": 8.93,
1144
- "learning_rate": 1.719945224326892e-06,
1145
- "loss": 1.2452,
1146
- "step": 875
1147
- },
1148
- {
1149
- "epoch": 8.98,
1150
- "learning_rate": 1.4094605732693502e-07,
1151
- "loss": 1.3688,
1152
- "step": 880
1153
- },
1154
- {
1155
- "epoch": 9.0,
1156
- "eval_loss": 1.3758981227874756,
1157
- "eval_runtime": 6.9963,
1158
- "eval_samples_per_second": 21.297,
1159
- "eval_steps_per_second": 2.716,
1160
- "step": 882
1161
- },
1162
- {
1163
- "epoch": 9.03,
1164
- "learning_rate": 3.169928516849862e-07,
1165
- "loss": 1.4731,
1166
- "step": 885
1167
- },
1168
- {
1169
- "epoch": 9.08,
1170
- "learning_rate": 2.2435723955225417e-06,
1171
- "loss": 1.217,
1172
- "step": 890
1173
- },
1174
- {
1175
- "epoch": 9.13,
1176
- "learning_rate": 5.8712940611152096e-06,
1177
- "loss": 1.2492,
1178
- "step": 895
1179
- },
1180
- {
1181
- "epoch": 9.18,
1182
- "learning_rate": 1.1107156004419803e-05,
1183
- "loss": 1.2326,
1184
- "step": 900
1185
- },
1186
- {
1187
- "epoch": 9.23,
1188
- "learning_rate": 1.7816929400633287e-05,
1189
- "loss": 1.2444,
1190
- "step": 905
1191
- },
1192
- {
1193
- "epoch": 9.29,
1194
- "learning_rate": 2.582859959249101e-05,
1195
- "loss": 1.2759,
1196
- "step": 910
1197
- },
1198
- {
1199
- "epoch": 9.34,
1200
- "learning_rate": 3.493677593252981e-05,
1201
- "loss": 1.2315,
1202
- "step": 915
1203
- },
1204
- {
1205
- "epoch": 9.39,
1206
- "learning_rate": 4.490795726669825e-05,
1207
- "loss": 1.2353,
1208
- "step": 920
1209
- },
1210
- {
1211
- "epoch": 9.44,
1212
- "learning_rate": 5.548651807108583e-05,
1213
- "loss": 1.2733,
1214
- "step": 925
1215
- },
1216
- {
1217
- "epoch": 9.49,
1218
- "learning_rate": 6.640126177858445e-05,
1219
- "loss": 1.205,
1220
- "step": 930
1221
- },
1222
- {
1223
- "epoch": 9.54,
1224
- "learning_rate": 7.737237329155688e-05,
1225
- "loss": 1.2441,
1226
- "step": 935
1227
- },
1228
- {
1229
- "epoch": 9.59,
1230
- "learning_rate": 8.811859244288885e-05,
1231
- "loss": 1.2469,
1232
- "step": 940
1233
- },
1234
- {
1235
- "epoch": 9.64,
1236
- "learning_rate": 9.83644245034643e-05,
1237
- "loss": 1.2763,
1238
- "step": 945
1239
- },
1240
- {
1241
- "epoch": 9.69,
1242
- "learning_rate": 0.00010784720288438088,
1243
- "loss": 1.1781,
1244
- "step": 950
1245
- },
1246
- {
1247
- "epoch": 9.74,
1248
- "learning_rate": 0.00011632382297139905,
1249
- "loss": 1.2822,
1250
- "step": 955
1251
- },
1252
- {
1253
- "epoch": 9.8,
1254
- "learning_rate": 0.00012357697446014173,
1255
- "loss": 1.2663,
1256
- "step": 960
1257
- },
1258
- {
1259
- "epoch": 9.85,
1260
- "learning_rate": 0.00012942071241718773,
1261
- "loss": 1.3906,
1262
- "step": 965
1263
- },
1264
- {
1265
- "epoch": 9.9,
1266
- "learning_rate": 0.00013370522424493184,
1267
- "loss": 1.3015,
1268
- "step": 970
1269
- },
1270
- {
1271
- "epoch": 9.95,
1272
- "learning_rate": 0.00013632067034223124,
1273
- "loss": 1.2348,
1274
- "step": 975
1275
- },
1276
- {
1277
- "epoch": 10.0,
1278
- "learning_rate": 0.0001372,
1279
- "loss": 1.2793,
1280
- "step": 980
1281
- },
1282
- {
1283
- "epoch": 10.0,
1284
- "eval_loss": 1.3976633548736572,
1285
- "eval_runtime": 7.0041,
1286
- "eval_samples_per_second": 21.273,
1287
- "eval_steps_per_second": 2.713,
1288
- "step": 980
1289
- },
1290
- {
1291
- "epoch": 10.05,
1292
- "learning_rate": 0.00013632067034223126,
1293
- "loss": 1.2203,
1294
- "step": 985
1295
- },
1296
- {
1297
- "epoch": 10.1,
1298
- "learning_rate": 0.0001337052242449319,
1299
- "loss": 1.3016,
1300
- "step": 990
1301
- },
1302
- {
1303
- "epoch": 10.15,
1304
- "learning_rate": 0.00012942071241718781,
1305
- "loss": 1.2078,
1306
- "step": 995
1307
- },
1308
- {
1309
- "epoch": 10.2,
1310
- "learning_rate": 0.00012357697446014183,
1311
- "loss": 1.2408,
1312
- "step": 1000
1313
- },
1314
- {
1315
- "epoch": 10.26,
1316
- "learning_rate": 0.00011632382297139899,
1317
- "loss": 1.2247,
1318
- "step": 1005
1319
- },
1320
- {
1321
- "epoch": 10.31,
1322
- "learning_rate": 0.00010784720288438083,
1323
- "loss": 1.0939,
1324
- "step": 1010
1325
- },
1326
- {
1327
- "epoch": 10.36,
1328
- "learning_rate": 9.836442450346467e-05,
1329
- "loss": 1.1944,
1330
- "step": 1015
1331
- },
1332
- {
1333
- "epoch": 10.41,
1334
- "learning_rate": 8.811859244288877e-05,
1335
- "loss": 1.2171,
1336
- "step": 1020
1337
- },
1338
- {
1339
- "epoch": 10.46,
1340
- "learning_rate": 7.737237329155728e-05,
1341
- "loss": 1.2546,
1342
- "step": 1025
1343
- },
1344
- {
1345
- "epoch": 10.51,
1346
- "learning_rate": 6.640126177858486e-05,
1347
- "loss": 1.1607,
1348
- "step": 1030
1349
- },
1350
- {
1351
- "epoch": 10.56,
1352
- "learning_rate": 5.5486518071086e-05,
1353
- "loss": 1.2547,
1354
- "step": 1035
1355
- },
1356
- {
1357
- "epoch": 10.61,
1358
- "learning_rate": 4.4907957266698644e-05,
1359
- "loss": 1.2295,
1360
- "step": 1040
1361
- },
1362
- {
1363
- "epoch": 10.66,
1364
- "learning_rate": 3.493677593252996e-05,
1365
- "loss": 1.2566,
1366
- "step": 1045
1367
- },
1368
- {
1369
- "epoch": 10.71,
1370
- "learning_rate": 2.5828599592491143e-05,
1371
- "loss": 1.2949,
1372
- "step": 1050
1373
- },
1374
- {
1375
- "epoch": 10.77,
1376
- "learning_rate": 1.7816929400633402e-05,
1377
- "loss": 1.2886,
1378
- "step": 1055
1379
- },
1380
- {
1381
- "epoch": 10.82,
1382
- "learning_rate": 1.1107156004419895e-05,
1383
- "loss": 1.1858,
1384
- "step": 1060
1385
- },
1386
- {
1387
- "epoch": 10.87,
1388
- "learning_rate": 5.871294061115278e-06,
1389
- "loss": 1.2322,
1390
- "step": 1065
1391
- },
1392
- {
1393
- "epoch": 10.92,
1394
- "learning_rate": 2.2435723955225265e-06,
1395
- "loss": 1.0434,
1396
- "step": 1070
1397
- },
1398
- {
1399
- "epoch": 10.97,
1400
- "learning_rate": 3.169928516850014e-07,
1401
- "loss": 1.2505,
1402
- "step": 1075
1403
- },
1404
- {
1405
- "epoch": 11.0,
1406
- "eval_loss": 1.3747466802597046,
1407
- "eval_runtime": 7.0127,
1408
- "eval_samples_per_second": 21.247,
1409
- "eval_steps_per_second": 2.709,
1410
- "step": 1078
1411
- },
1412
- {
1413
- "epoch": 11.37,
1414
- "learning_rate": 4.1043693868806304e-05,
1415
- "loss": 1.1875,
1416
- "step": 1080
1417
- },
1418
- {
1419
- "epoch": 11.42,
1420
- "learning_rate": 5.17596955821411e-05,
1421
- "loss": 1.3125,
1422
- "step": 1085
1423
- },
1424
- {
1425
- "epoch": 11.47,
1426
- "learning_rate": 6.293505690059783e-05,
1427
- "loss": 1.2444,
1428
- "step": 1090
1429
- },
1430
- {
1431
- "epoch": 11.53,
1432
- "learning_rate": 7.426494309940222e-05,
1433
- "loss": 1.3268,
1434
- "step": 1095
1435
- },
1436
- {
1437
- "epoch": 11.58,
1438
- "learning_rate": 8.544030441785894e-05,
1439
- "loss": 1.3066,
1440
- "step": 1100
1441
- },
1442
- {
1443
- "epoch": 11.63,
1444
- "learning_rate": 9.615630613119375e-05,
1445
- "loss": 1.2542,
1446
- "step": 1105
1447
- },
1448
- {
1449
- "epoch": 11.68,
1450
- "learning_rate": 0.00010612064364719844,
1451
- "loss": 1.246,
1452
- "step": 1110
1453
- },
1454
- {
1455
- "epoch": 11.74,
1456
- "learning_rate": 0.00011506151581352574,
1457
- "loss": 1.2672,
1458
- "step": 1115
1459
- },
1460
- {
1461
- "epoch": 11.79,
1462
- "learning_rate": 0.00012273503894459246,
1463
- "loss": 1.2702,
1464
- "step": 1120
1465
- },
1466
- {
1467
- "epoch": 11.84,
1468
- "learning_rate": 0.00012893189933276523,
1469
- "loss": 1.3417,
1470
- "step": 1125
1471
- },
1472
- {
1473
- "epoch": 11.89,
1474
- "learning_rate": 0.00013348306278066356,
1475
- "loss": 1.2473,
1476
- "step": 1130
1477
- },
1478
- {
1479
- "epoch": 11.95,
1480
- "learning_rate": 0.00013626438541342674,
1481
- "loss": 1.1708,
1482
- "step": 1135
1483
- },
1484
- {
1485
- "epoch": 12.0,
1486
- "learning_rate": 0.0001372,
1487
- "loss": 1.1564,
1488
- "step": 1140
1489
- },
1490
- {
1491
- "epoch": 12.0,
1492
- "eval_loss": 1.1428929567337036,
1493
- "eval_runtime": 7.9608,
1494
- "eval_samples_per_second": 21.48,
1495
- "eval_steps_per_second": 2.764,
1496
- "step": 1140
1497
- },
1498
- {
1499
- "epoch": 11.8,
1500
- "learning_rate": 0.00012461626728572456,
1501
- "loss": 1.2708,
1502
- "step": 1145
1503
- },
1504
- {
1505
- "epoch": 11.86,
1506
- "learning_rate": 0.000130268089438458,
1507
- "loss": 1.2385,
1508
- "step": 1150
1509
- },
1510
- {
1511
- "epoch": 11.91,
1512
- "learning_rate": 0.00013430626843929596,
1513
- "loss": 1.2608,
1514
- "step": 1155
1515
- },
1516
- {
1517
- "epoch": 11.96,
1518
- "learning_rate": 0.00013662513894413278,
1519
- "loss": 1.2132,
1520
- "step": 1160
1521
- },
1522
- {
1523
- "epoch": 12.0,
1524
- "eval_loss": 1.1082079410552979,
1525
- "eval_runtime": 6.721,
1526
- "eval_samples_per_second": 23.211,
1527
- "eval_steps_per_second": 2.976,
1528
- "step": 1164
1529
- },
1530
- {
1531
- "epoch": 12.01,
1532
- "learning_rate": 0.00013716402403652231,
1533
- "loss": 1.2701,
1534
- "step": 1165
1535
- },
1536
- {
1537
- "epoch": 12.06,
1538
- "learning_rate": 0.0001359088229352192,
1539
- "loss": 1.2274,
1540
- "step": 1170
1541
- },
1542
- {
1543
- "epoch": 12.11,
1544
- "learning_rate": 0.0001328923799634352,
1545
- "loss": 1.1978,
1546
- "step": 1175
1547
- },
1548
- {
1549
- "epoch": 12.16,
1550
- "learning_rate": 0.0001281936251251452,
1551
- "loss": 1.0879,
1552
- "step": 1180
1553
- },
1554
- {
1555
- "epoch": 12.22,
1556
- "learning_rate": 0.00012193550877662404,
1557
- "loss": 1.2841,
1558
- "step": 1185
1559
- },
1560
- {
1561
- "epoch": 12.27,
1562
- "learning_rate": 0.00011428178443580113,
1563
- "loss": 1.166,
1564
- "step": 1190
1565
- },
1566
- {
1567
- "epoch": 12.32,
1568
- "learning_rate": 0.0001054327239123201,
1569
- "loss": 1.2385,
1570
- "step": 1195
1571
- },
1572
- {
1573
- "epoch": 12.37,
1574
- "learning_rate": 9.561987687870095e-05,
1575
- "loss": 1.1758,
1576
- "step": 1200
1577
- },
1578
- {
1579
- "epoch": 12.42,
1580
- "learning_rate": 8.51000120067249e-05,
1581
- "loss": 1.1698,
1582
- "step": 1205
1583
- },
1584
- {
1585
- "epoch": 12.47,
1586
- "learning_rate": 7.414839820879227e-05,
1587
- "loss": 1.3722,
1588
- "step": 1210
1589
- },
1590
- {
1591
- "epoch": 12.53,
1592
- "learning_rate": 6.305160179120769e-05,
1593
- "loss": 1.1787,
1594
- "step": 1215
1595
- },
1596
- {
1597
- "epoch": 12.58,
1598
- "learning_rate": 5.209998799327507e-05,
1599
- "loss": 1.2814,
1600
- "step": 1220
1601
- },
1602
- {
1603
- "epoch": 12.63,
1604
- "learning_rate": 4.158012312129902e-05,
1605
- "loss": 1.2154,
1606
- "step": 1225
1607
- },
1608
- {
1609
- "epoch": 12.68,
1610
- "learning_rate": 3.176727608767987e-05,
1611
- "loss": 1.2798,
1612
- "step": 1230
1613
- },
1614
- {
1615
- "epoch": 12.73,
1616
- "learning_rate": 2.291821556419886e-05,
1617
- "loss": 1.2241,
1618
- "step": 1235
1619
- },
1620
- {
1621
- "epoch": 12.78,
1622
- "learning_rate": 1.5264491223375942e-05,
1623
- "loss": 1.1194,
1624
- "step": 1240
1625
- },
1626
- {
1627
- "epoch": 12.84,
1628
- "learning_rate": 9.006374874854777e-06,
1629
- "loss": 1.1957,
1630
- "step": 1245
1631
- },
1632
- {
1633
- "epoch": 12.89,
1634
- "learning_rate": 4.3076200365648044e-06,
1635
- "loss": 1.1553,
1636
- "step": 1250
1637
- },
1638
- {
1639
- "epoch": 12.94,
1640
- "learning_rate": 1.2911770647808012e-06,
1641
- "loss": 1.1644,
1642
- "step": 1255
1643
- },
1644
- {
1645
- "epoch": 12.99,
1646
- "learning_rate": 3.597596347767558e-08,
1647
- "loss": 1.1646,
1648
- "step": 1260
1649
- },
1650
- {
1651
- "epoch": 13.0,
1652
- "eval_loss": 1.0746197700500488,
1653
- "eval_runtime": 7.0141,
1654
- "eval_samples_per_second": 22.241,
1655
- "eval_steps_per_second": 2.851,
1656
- "step": 1261
1657
- },
1658
- {
1659
- "epoch": 12.91,
1660
- "learning_rate": 2.835385281685176e-06,
1661
- "loss": 1.0594,
1662
- "step": 1265
1663
- },
1664
- {
1665
- "epoch": 12.96,
1666
- "learning_rate": 5.632050517253284e-07,
1667
- "loss": 1.1527,
1668
- "step": 1270
1669
- },
1670
- {
1671
- "epoch": 13.0,
1672
- "eval_loss": 1.029100775718689,
1673
- "eval_runtime": 6.9675,
1674
- "eval_samples_per_second": 21.242,
1675
- "eval_steps_per_second": 2.727,
1676
- "step": 1274
1677
- },
1678
- {
1679
- "epoch": 12.75,
1680
- "learning_rate": 2.0092474810602958e-05,
1681
- "loss": 1.2194,
1682
- "step": 1275
1683
- },
1684
- {
1685
- "epoch": 12.8,
1686
- "learning_rate": 1.3101434185878674e-05,
1687
- "loss": 1.0397,
1688
- "step": 1280
1689
- },
1690
- {
1691
- "epoch": 12.85,
1692
- "learning_rate": 7.476952440677985e-06,
1693
- "loss": 1.0537,
1694
- "step": 1285
1695
- },
1696
- {
1697
- "epoch": 12.9,
1698
- "learning_rate": 3.35752298215246e-06,
1699
- "loss": 1.2597,
1700
- "step": 1290
1701
- },
1702
- {
1703
- "epoch": 12.95,
1704
- "learning_rate": 8.445798351736176e-07,
1705
- "loss": 1.0949,
1706
- "step": 1295
1707
- },
1708
- {
1709
- "epoch": 13.0,
1710
- "learning_rate": 0.0,
1711
- "loss": 1.1523,
1712
- "step": 1300
1713
  },
1714
  {
1715
- "epoch": 13.0,
1716
- "eval_loss": 1.004758358001709,
1717
- "eval_runtime": 6.2971,
1718
- "eval_samples_per_second": 20.803,
1719
- "eval_steps_per_second": 2.7,
1720
- "step": 1300
1721
  }
1722
  ],
1723
- "max_steps": 1400,
1724
- "num_train_epochs": 14,
1725
- "total_flos": 1346568486912000.0,
1726
  "trial_name": null,
1727
  "trial_params": null
1728
  }
 
1
  {
2
+ "best_metric": 1.0789222717285156,
3
+ "best_model_checkpoint": "output/morgenshtern/checkpoint-97",
4
+ "epoch": 1.0,
5
+ "global_step": 97,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
  "epoch": 0.05,
12
+ "learning_rate": 0.00013630248621914493,
13
+ "loss": 1.1495,
14
  "step": 5
15
  },
16
  {
17
  "epoch": 0.1,
18
+ "learning_rate": 0.00013363342974500022,
19
+ "loss": 1.133,
20
  "step": 10
21
  },
22
  {
23
  "epoch": 0.15,
24
+ "learning_rate": 0.00012926267066407974,
25
+ "loss": 1.1617,
26
  "step": 15
27
  },
28
  {
29
+ "epoch": 0.21,
30
+ "learning_rate": 0.00012330457680460286,
31
+ "loss": 1.1061,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 0.26,
36
+ "learning_rate": 0.0001159150511212779,
37
+ "loss": 1.2273,
 
 
 
 
 
 
 
 
38
  "step": 25
39
  },
40
  {
41
+ "epoch": 0.31,
42
+ "learning_rate": 0.00010728745224777217,
43
+ "loss": 1.0743,
44
  "step": 30
45
  },
46
  {
47
+ "epoch": 0.36,
48
+ "learning_rate": 9.764753496206386e-05,
49
+ "loss": 1.2278,
50
  "step": 35
51
  },
52
  {
53
+ "epoch": 0.41,
54
+ "learning_rate": 8.724754295541321e-05,
55
+ "loss": 1.1852,
 
 
 
 
 
 
 
 
56
  "step": 40
57
  },
58
  {
59
+ "epoch": 0.46,
60
+ "learning_rate": 7.635960847699664e-05,
61
+ "loss": 1.1321,
62
  "step": 45
63
  },
64
  {
65
+ "epoch": 0.52,
66
+ "learning_rate": 6.526863156294906e-05,
67
+ "loss": 1.0477,
68
  "step": 50
69
  },
70
  {
71
+ "epoch": 0.57,
72
+ "learning_rate": 5.426482517605479e-05,
73
+ "loss": 1.1788,
74
  "step": 55
75
  },
76
  {
77
+ "epoch": 0.62,
78
+ "learning_rate": 4.363612132430507e-05,
79
+ "loss": 1.0456,
 
 
 
 
 
 
 
 
80
  "step": 60
81
  },
82
  {
83
+ "epoch": 0.67,
84
+ "learning_rate": 3.36606368642472e-05,
85
+ "loss": 1.1629,
86
  "step": 65
87
  },
88
  {
89
+ "epoch": 0.72,
90
+ "learning_rate": 2.4599396133304144e-05,
91
+ "loss": 1.1195,
92
  "step": 70
93
  },
94
  {
95
+ "epoch": 0.77,
96
+ "learning_rate": 1.6689500834906116e-05,
97
+ "loss": 0.9719,
98
  "step": 75
99
  },
100
  {
101
+ "epoch": 0.82,
102
+ "learning_rate": 1.0137925897200545e-05,
103
+ "loss": 1.159,
 
 
 
 
 
 
 
 
104
  "step": 80
105
  },
 
 
 
 
 
 
106
  {
107
  "epoch": 0.88,
108
+ "learning_rate": 5.116103646510543e-06,
109
+ "loss": 1.0041,
110
+ "step": 85
111
  },
112
  {
113
  "epoch": 0.93,
114
+ "learning_rate": 1.7554380092209137e-06,
115
+ "loss": 1.0917,
116
+ "step": 90
 
 
 
 
 
 
117
  },
118
  {
119
  "epoch": 0.98,
120
+ "learning_rate": 1.4386612009299613e-07,
121
+ "loss": 1.0484,
122
+ "step": 95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  },
124
  {
125
+ "epoch": 1.0,
126
+ "eval_loss": 1.0789222717285156,
127
+ "eval_runtime": 7.1308,
128
+ "eval_samples_per_second": 21.035,
129
+ "eval_steps_per_second": 2.664,
130
+ "step": 97
131
  }
132
  ],
133
+ "max_steps": 1552,
134
+ "num_train_epochs": 16,
135
+ "total_flos": 101381308416000.0,
136
  "trial_name": null,
137
  "trial_params": null
138
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e678611f04c51489e9a36568716af96aebefa9660b4f8c5449f0a99f9629d37
3
  size 2671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5497b3aaf0bdb0eca71885ab53b83d8d3468893d10b8e506f44d16425afcbf44
3
  size 2671