TinyPixel commited on
Commit
9de6c0c
·
1 Parent(s): 3771be8

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -16,10 +16,10 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "dense",
20
- "dense_h_to_4h",
21
  "dense_4h_to_h",
22
- "query_key_value"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "query_key_value",
20
  "dense",
 
21
  "dense_4h_to_h",
22
+ "dense_h_to_4h"
23
  ],
24
  "task_type": "CAUSAL_LM"
25
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87ed2acf6125adcd0b24b33bc85700484416c3f26f621297c7c4b99078d32734
3
  size 134235712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a7a9747374432a0f6b7788c3655afa6f8be8dfa37512bcbd1a41edc5c6837fb
3
  size 134235712
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ce2abea04ce84564218c3cf846e1bf599b1376d44e961f38757081a2ec3ee3c
3
  size 268515002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8658d8be4f72b390d027ba69ffb967e257b8cf3f32e53fb8c2b198bb6292c987
3
  size 268515002
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5df93484ff14822d285a99b747b77d524936bc9c829bc86f91863db23667392
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2447a2ffeb2e0455ab321301be0814daf3bad517ee0376a7befef6219cb56a0
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b5b7f865fff447a834d063766ffbff2b06cf8776b1ce383609115b7efb4a180
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3b02631b1745c2ee48fc6f33fd739a472feb6116dcb1bf398a6d4c6e32417c0
3
  size 1064
special_tokens_map.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "bos_token": "<|endoftext|>",
3
  "eos_token": "<|endoftext|>",
4
- "pad_token": "[PAD]",
5
  "unk_token": "<|endoftext|>"
6
  }
 
1
  {
2
  "bos_token": "<|endoftext|>",
3
  "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
  "unk_token": "<|endoftext|>"
6
  }
tokenizer_config.json CHANGED
@@ -206,7 +206,7 @@
206
  "clean_up_tokenization_spaces": true,
207
  "eos_token": "<|endoftext|>",
208
  "model_max_length": 1000000000000000019884624838656,
209
- "pad_token": "[PAD]",
210
  "tokenizer_class": "GPTNeoXTokenizer",
211
  "unk_token": "<|endoftext|>"
212
  }
 
206
  "clean_up_tokenization_spaces": true,
207
  "eos_token": "<|endoftext|>",
208
  "model_max_length": 1000000000000000019884624838656,
209
+ "pad_token": "<|endoftext|>",
210
  "tokenizer_class": "GPTNeoXTokenizer",
211
  "unk_token": "<|endoftext|>"
212
  }
trainer_state.json CHANGED
@@ -1,1225 +1,2833 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.994408201304753,
5
  "eval_steps": 500,
6
- "global_step": 402,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
- "learning_rate": 3.0769230769230774e-06,
14
- "loss": 2.7224,
15
  "step": 2
16
  },
17
  {
18
- "epoch": 0.06,
19
- "learning_rate": 6.153846153846155e-06,
20
- "loss": 2.023,
21
  "step": 4
22
  },
23
  {
24
- "epoch": 0.09,
25
- "learning_rate": 9.230769230769232e-06,
26
- "loss": 2.011,
27
  "step": 6
28
  },
29
  {
30
- "epoch": 0.12,
31
- "learning_rate": 1.230769230769231e-05,
32
- "loss": 2.0488,
33
  "step": 8
34
  },
35
  {
36
- "epoch": 0.15,
37
- "learning_rate": 1.5384615384615387e-05,
38
- "loss": 1.806,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.18,
43
- "learning_rate": 1.8461538461538465e-05,
44
- "loss": 2.3016,
45
  "step": 12
46
  },
47
  {
48
- "epoch": 0.21,
49
- "learning_rate": 1.9999673886943734e-05,
50
- "loss": 2.5816,
51
  "step": 14
52
  },
53
  {
54
- "epoch": 0.24,
55
- "learning_rate": 1.9997065110111884e-05,
56
- "loss": 3.187,
57
  "step": 16
58
  },
59
  {
60
- "epoch": 0.27,
61
- "learning_rate": 1.9991848237042037e-05,
62
- "loss": 2.1655,
63
  "step": 18
64
  },
65
  {
66
- "epoch": 0.3,
67
- "learning_rate": 1.998402462874433e-05,
68
- "loss": 1.9262,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.33,
73
- "learning_rate": 1.9973596326290136e-05,
74
- "loss": 1.9535,
75
  "step": 22
76
  },
77
  {
78
- "epoch": 0.36,
79
- "learning_rate": 1.9960566050279568e-05,
80
- "loss": 1.9173,
81
  "step": 24
82
  },
83
  {
84
- "epoch": 0.39,
85
- "learning_rate": 1.994493720013169e-05,
86
- "loss": 1.9742,
87
  "step": 26
88
  },
89
  {
90
- "epoch": 0.42,
91
- "learning_rate": 1.9926713853197696e-05,
92
- "loss": 2.3098,
93
  "step": 28
94
  },
95
  {
96
- "epoch": 0.45,
97
- "learning_rate": 1.9905900763697152e-05,
98
- "loss": 2.6274,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.48,
103
- "learning_rate": 1.9882503361477707e-05,
104
- "loss": 2.8624,
105
  "step": 32
106
  },
107
  {
108
- "epoch": 0.51,
109
- "learning_rate": 1.9856527750598493e-05,
110
- "loss": 2.5503,
111
  "step": 34
112
  },
113
  {
114
- "epoch": 0.54,
115
- "learning_rate": 1.9827980707737704e-05,
116
- "loss": 2.179,
117
  "step": 36
118
  },
119
  {
120
- "epoch": 0.57,
121
- "learning_rate": 1.979686968042461e-05,
122
- "loss": 1.8671,
123
  "step": 38
124
  },
125
  {
126
- "epoch": 0.6,
127
- "learning_rate": 1.976320278509663e-05,
128
- "loss": 1.9073,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.63,
133
- "learning_rate": 1.9726988804981847e-05,
134
- "loss": 1.7712,
135
  "step": 42
136
  },
137
  {
138
- "epoch": 0.66,
139
- "learning_rate": 1.9688237187807594e-05,
140
- "loss": 2.0556,
141
  "step": 44
142
  },
143
  {
144
- "epoch": 0.69,
145
- "learning_rate": 1.9646958043335678e-05,
146
- "loss": 2.3458,
147
  "step": 46
148
  },
149
  {
150
- "epoch": 0.72,
151
- "learning_rate": 1.9603162140724863e-05,
152
- "loss": 2.4738,
153
  "step": 48
154
  },
155
  {
156
- "epoch": 0.75,
157
- "learning_rate": 1.9556860905721363e-05,
158
- "loss": 2.671,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 0.78,
163
- "learning_rate": 1.950806641767802e-05,
164
- "loss": 1.9429,
165
  "step": 52
166
  },
167
  {
168
- "epoch": 0.81,
169
- "learning_rate": 1.9456791406402964e-05,
170
- "loss": 1.8356,
171
  "step": 54
172
  },
173
  {
174
- "epoch": 0.84,
175
- "learning_rate": 1.940304924883858e-05,
176
- "loss": 1.8648,
177
  "step": 56
178
  },
179
  {
180
- "epoch": 0.86,
181
- "learning_rate": 1.934685396557165e-05,
182
- "loss": 1.8207,
183
  "step": 58
184
  },
185
  {
186
- "epoch": 0.89,
187
- "learning_rate": 1.9288220217175583e-05,
188
- "loss": 1.7519,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 0.92,
193
- "learning_rate": 1.9227163300385662e-05,
194
- "loss": 1.9448,
195
  "step": 62
196
  },
197
  {
198
- "epoch": 0.95,
199
- "learning_rate": 1.9163699144108343e-05,
200
- "loss": 2.1599,
201
  "step": 64
202
  },
203
  {
204
- "epoch": 0.98,
205
- "learning_rate": 1.9097844305265625e-05,
206
- "loss": 1.7499,
207
  "step": 66
208
  },
209
  {
210
- "epoch": 1.01,
211
- "learning_rate": 1.9029615964475572e-05,
212
- "loss": 2.3244,
213
  "step": 68
214
  },
215
  {
216
- "epoch": 1.04,
217
- "learning_rate": 1.8959031921570136e-05,
218
- "loss": 2.0381,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 1.07,
223
- "learning_rate": 1.8886110590951417e-05,
224
- "loss": 1.7528,
225
  "step": 72
226
  },
227
  {
228
- "epoch": 1.1,
229
- "learning_rate": 1.88108709967876e-05,
230
- "loss": 1.9127,
231
  "step": 74
232
  },
233
  {
234
- "epoch": 1.13,
235
- "learning_rate": 1.873333276804983e-05,
236
- "loss": 1.654,
237
  "step": 76
238
  },
239
  {
240
- "epoch": 1.16,
241
- "learning_rate": 1.865351613339125e-05,
242
- "loss": 1.642,
243
  "step": 78
244
  },
245
  {
246
- "epoch": 1.19,
247
- "learning_rate": 1.8571441915869663e-05,
248
- "loss": 1.9048,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 1.22,
253
- "learning_rate": 1.848713152751506e-05,
254
- "loss": 2.04,
255
  "step": 82
256
  },
257
  {
258
- "epoch": 1.25,
259
- "learning_rate": 1.8400606963743517e-05,
260
- "loss": 2.168,
261
  "step": 84
262
  },
263
  {
264
- "epoch": 1.28,
265
- "learning_rate": 1.8311890797618918e-05,
266
- "loss": 2.0551,
267
  "step": 86
268
  },
269
  {
270
- "epoch": 1.31,
271
- "learning_rate": 1.822100617396391e-05,
272
- "loss": 1.8066,
273
  "step": 88
274
  },
275
  {
276
- "epoch": 1.34,
277
- "learning_rate": 1.8127976803321793e-05,
278
- "loss": 1.7716,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 1.37,
283
- "learning_rate": 1.8032826955770723e-05,
284
- "loss": 1.7329,
285
  "step": 92
286
  },
287
  {
288
- "epoch": 1.4,
289
- "learning_rate": 1.7935581454592005e-05,
290
- "loss": 1.6597,
291
  "step": 94
292
  },
293
  {
294
- "epoch": 1.43,
295
- "learning_rate": 1.7836265669794032e-05,
296
- "loss": 1.7824,
297
  "step": 96
298
  },
299
  {
300
- "epoch": 1.46,
301
- "learning_rate": 1.7734905511493614e-05,
302
- "loss": 1.9783,
303
  "step": 98
304
  },
305
  {
306
- "epoch": 1.49,
307
- "learning_rate": 1.763152742315637e-05,
308
- "loss": 1.8963,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 1.52,
313
- "learning_rate": 1.7526158374697997e-05,
314
- "loss": 2.5619,
315
  "step": 102
316
  },
317
  {
318
- "epoch": 1.55,
319
- "learning_rate": 1.7418825855448208e-05,
320
- "loss": 1.6946,
321
  "step": 104
322
  },
323
  {
324
- "epoch": 1.58,
325
- "learning_rate": 1.7309557866979113e-05,
326
- "loss": 1.7272,
327
  "step": 106
328
  },
329
  {
330
- "epoch": 1.61,
331
- "learning_rate": 1.7198382915800034e-05,
332
- "loss": 1.6596,
333
  "step": 108
334
  },
335
  {
336
- "epoch": 1.64,
337
- "learning_rate": 1.7085330005920516e-05,
338
- "loss": 1.5296,
339
  "step": 110
340
  },
341
  {
342
- "epoch": 1.67,
343
- "learning_rate": 1.6970428631283602e-05,
344
- "loss": 1.7269,
345
  "step": 112
346
  },
347
  {
348
- "epoch": 1.7,
349
- "learning_rate": 1.6853708768071265e-05,
350
- "loss": 2.0672,
351
  "step": 114
352
  },
353
  {
354
- "epoch": 1.73,
355
- "learning_rate": 1.6735200866884037e-05,
356
- "loss": 2.3257,
357
  "step": 116
358
  },
359
  {
360
- "epoch": 1.76,
361
- "learning_rate": 1.6614935844796863e-05,
362
- "loss": 2.3116,
363
  "step": 118
364
  },
365
  {
366
- "epoch": 1.79,
367
- "learning_rate": 1.649294507729327e-05,
368
- "loss": 1.6772,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 1.82,
373
- "learning_rate": 1.6369260390079933e-05,
374
- "loss": 1.7853,
375
  "step": 122
376
  },
377
  {
378
- "epoch": 1.85,
379
- "learning_rate": 1.6243914050783783e-05,
380
- "loss": 1.6112,
381
  "step": 124
382
  },
383
  {
384
- "epoch": 1.88,
385
- "learning_rate": 1.6116938760533843e-05,
386
- "loss": 1.5548,
387
  "step": 126
388
  },
389
  {
390
- "epoch": 1.91,
391
- "learning_rate": 1.5988367645429938e-05,
392
- "loss": 1.8593,
393
  "step": 128
394
  },
395
  {
396
- "epoch": 1.94,
397
- "learning_rate": 1.585823424790056e-05,
398
- "loss": 2.0341,
399
  "step": 130
400
  },
401
  {
402
- "epoch": 1.97,
403
- "learning_rate": 1.5726572517952122e-05,
404
- "loss": 1.5791,
405
  "step": 132
406
  },
407
  {
408
- "epoch": 2.0,
409
- "learning_rate": 1.559341680431185e-05,
410
- "loss": 1.7591,
411
  "step": 134
412
  },
413
  {
414
- "epoch": 2.03,
415
- "learning_rate": 1.545880184546669e-05,
416
- "loss": 2.2547,
417
  "step": 136
418
  },
419
  {
420
- "epoch": 2.06,
421
- "learning_rate": 1.532276276060051e-05,
422
- "loss": 1.8692,
423
  "step": 138
424
  },
425
  {
426
- "epoch": 2.09,
427
- "learning_rate": 1.518533504043199e-05,
428
- "loss": 1.7114,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 2.12,
433
- "learning_rate": 1.5046554537955587e-05,
434
- "loss": 1.6847,
435
  "step": 142
436
  },
437
  {
438
- "epoch": 2.15,
439
- "learning_rate": 1.4906457459087977e-05,
440
- "loss": 1.6882,
441
  "step": 144
442
  },
443
  {
444
- "epoch": 2.18,
445
- "learning_rate": 1.4765080353222447e-05,
446
- "loss": 1.6619,
447
  "step": 146
448
  },
449
  {
450
- "epoch": 2.21,
451
- "learning_rate": 1.462246010369364e-05,
452
- "loss": 1.7873,
453
  "step": 148
454
  },
455
  {
456
- "epoch": 2.24,
457
- "learning_rate": 1.4478633918155216e-05,
458
- "loss": 1.6883,
459
  "step": 150
460
  },
461
  {
462
- "epoch": 2.27,
463
- "learning_rate": 1.4333639318872891e-05,
464
- "loss": 2.4291,
465
  "step": 152
466
  },
467
  {
468
- "epoch": 2.3,
469
- "learning_rate": 1.4187514132935393e-05,
470
- "loss": 2.2567,
471
  "step": 154
472
  },
473
  {
474
- "epoch": 2.33,
475
- "learning_rate": 1.4040296482385893e-05,
476
- "loss": 1.6691,
477
  "step": 156
478
  },
479
  {
480
- "epoch": 2.36,
481
- "learning_rate": 1.3892024774276496e-05,
482
- "loss": 1.7297,
483
  "step": 158
484
  },
485
  {
486
- "epoch": 2.39,
487
- "learning_rate": 1.3742737690648362e-05,
488
- "loss": 1.5559,
489
  "step": 160
490
  },
491
  {
492
- "epoch": 2.42,
493
- "learning_rate": 1.3592474178440116e-05,
494
- "loss": 1.6777,
495
  "step": 162
496
  },
497
  {
498
- "epoch": 2.45,
499
- "learning_rate": 1.34412734393271e-05,
500
- "loss": 1.8319,
501
  "step": 164
502
  },
503
  {
504
- "epoch": 2.48,
505
- "learning_rate": 1.3289174919494228e-05,
506
- "loss": 1.8486,
507
  "step": 166
508
  },
509
  {
510
- "epoch": 2.51,
511
- "learning_rate": 1.3136218299344993e-05,
512
- "loss": 2.2382,
513
  "step": 168
514
  },
515
  {
516
- "epoch": 2.53,
517
- "learning_rate": 1.2982443483149423e-05,
518
- "loss": 1.6871,
519
  "step": 170
520
  },
521
  {
522
- "epoch": 2.56,
523
- "learning_rate": 1.2827890588633589e-05,
524
- "loss": 1.8091,
525
  "step": 172
526
  },
527
  {
528
- "epoch": 2.59,
529
- "learning_rate": 1.267259993651345e-05,
530
- "loss": 1.7956,
531
  "step": 174
532
  },
533
  {
534
- "epoch": 2.62,
535
- "learning_rate": 1.2516612039975745e-05,
536
- "loss": 1.5036,
537
  "step": 176
538
  },
539
  {
540
- "epoch": 2.65,
541
- "learning_rate": 1.2359967594108643e-05,
542
- "loss": 1.5951,
543
  "step": 178
544
  },
545
  {
546
- "epoch": 2.68,
547
- "learning_rate": 1.2202707465284973e-05,
548
- "loss": 1.8153,
549
  "step": 180
550
  },
551
  {
552
- "epoch": 2.71,
553
- "learning_rate": 1.2044872680500743e-05,
554
- "loss": 1.9109,
555
  "step": 182
556
  },
557
  {
558
- "epoch": 2.74,
559
- "learning_rate": 1.188650441667177e-05,
560
- "loss": 2.602,
561
  "step": 184
562
  },
563
  {
564
- "epoch": 2.77,
565
- "learning_rate": 1.172764398989118e-05,
566
- "loss": 1.7423,
567
  "step": 186
568
  },
569
  {
570
- "epoch": 2.8,
571
- "learning_rate": 1.1568332844650623e-05,
572
- "loss": 1.7171,
573
  "step": 188
574
  },
575
  {
576
- "epoch": 2.83,
577
- "learning_rate": 1.1408612543027963e-05,
578
- "loss": 1.7039,
579
  "step": 190
580
  },
581
  {
582
- "epoch": 2.86,
583
- "learning_rate": 1.1248524753844325e-05,
584
- "loss": 1.3292,
585
  "step": 192
586
  },
587
  {
588
- "epoch": 2.89,
589
- "learning_rate": 1.1088111241793258e-05,
590
- "loss": 1.7061,
591
  "step": 194
592
  },
593
  {
594
- "epoch": 2.92,
595
- "learning_rate": 1.0927413856544906e-05,
596
- "loss": 1.9157,
597
  "step": 196
598
  },
599
  {
600
- "epoch": 2.95,
601
- "learning_rate": 1.0766474521828022e-05,
602
- "loss": 1.8149,
603
  "step": 198
604
  },
605
  {
606
- "epoch": 2.98,
607
- "learning_rate": 1.0605335224492617e-05,
608
- "loss": 1.4948,
609
  "step": 200
610
  },
611
  {
612
- "epoch": 3.01,
613
- "learning_rate": 1.0444038003556201e-05,
614
- "loss": 1.8947,
615
  "step": 202
616
  },
617
  {
618
- "epoch": 3.04,
619
- "learning_rate": 1.0282624939236367e-05,
620
- "loss": 2.2848,
621
  "step": 204
622
  },
623
  {
624
- "epoch": 3.07,
625
- "learning_rate": 1.0121138141972649e-05,
626
- "loss": 1.6915,
627
  "step": 206
628
  },
629
  {
630
- "epoch": 3.1,
631
- "learning_rate": 9.959619741440486e-06,
632
- "loss": 1.6911,
633
  "step": 208
634
  },
635
  {
636
- "epoch": 3.13,
637
- "learning_rate": 9.798111875560167e-06,
638
- "loss": 1.6485,
639
  "step": 210
640
  },
641
  {
642
- "epoch": 3.16,
643
- "learning_rate": 9.636656679503647e-06,
644
- "loss": 1.4867,
645
  "step": 212
646
  },
647
  {
648
- "epoch": 3.19,
649
- "learning_rate": 9.475296274702044e-06,
650
- "loss": 1.7333,
651
  "step": 214
652
  },
653
  {
654
- "epoch": 3.22,
655
- "learning_rate": 9.314072757856752e-06,
656
- "loss": 1.9008,
657
  "step": 216
658
  },
659
  {
660
- "epoch": 3.25,
661
- "learning_rate": 9.153028189956986e-06,
662
- "loss": 1.8957,
663
  "step": 218
664
  },
665
  {
666
- "epoch": 3.28,
667
- "learning_rate": 8.99220458530664e-06,
668
- "loss": 2.1675,
669
  "step": 220
670
  },
671
  {
672
- "epoch": 3.31,
673
- "learning_rate": 8.831643900563372e-06,
674
- "loss": 1.6982,
675
  "step": 222
676
  },
677
  {
678
- "epoch": 3.34,
679
- "learning_rate": 8.671388023792642e-06,
680
- "loss": 1.7398,
681
  "step": 224
682
  },
683
  {
684
- "epoch": 3.37,
685
- "learning_rate": 8.511478763539737e-06,
686
- "loss": 1.9094,
687
  "step": 226
688
  },
689
  {
690
- "epoch": 3.4,
691
- "learning_rate": 8.351957837922467e-06,
692
- "loss": 1.4749,
693
  "step": 228
694
  },
695
  {
696
- "epoch": 3.43,
697
- "learning_rate": 8.192866863747516e-06,
698
- "loss": 1.6732,
699
  "step": 230
700
  },
701
  {
702
- "epoch": 3.46,
703
- "learning_rate": 8.034247345653148e-06,
704
- "loss": 1.8966,
705
  "step": 232
706
  },
707
  {
708
- "epoch": 3.49,
709
- "learning_rate": 7.876140665281273e-06,
710
- "loss": 1.9921,
711
  "step": 234
712
  },
713
  {
714
- "epoch": 3.52,
715
- "learning_rate": 7.718588070481501e-06,
716
- "loss": 1.9,
717
  "step": 236
718
  },
719
  {
720
- "epoch": 3.55,
721
- "learning_rate": 7.561630664550179e-06,
722
- "loss": 1.6396,
723
  "step": 238
724
  },
725
  {
726
- "epoch": 3.58,
727
- "learning_rate": 7.405309395507098e-06,
728
- "loss": 1.672,
729
  "step": 240
730
  },
731
  {
732
- "epoch": 3.61,
733
- "learning_rate": 7.249665045412704e-06,
734
- "loss": 1.6482,
735
  "step": 242
736
  },
737
  {
738
- "epoch": 3.64,
739
- "learning_rate": 7.0947382197286566e-06,
740
- "loss": 1.5786,
741
  "step": 244
742
  },
743
  {
744
- "epoch": 3.67,
745
- "learning_rate": 6.94056933672439e-06,
746
- "loss": 1.7509,
747
  "step": 246
748
  },
749
  {
750
- "epoch": 3.7,
751
- "learning_rate": 6.787198616932571e-06,
752
- "loss": 1.803,
753
  "step": 248
754
  },
755
  {
756
- "epoch": 3.73,
757
- "learning_rate": 6.634666072656097e-06,
758
- "loss": 1.5227,
759
  "step": 250
760
  },
761
  {
762
- "epoch": 3.76,
763
- "learning_rate": 6.483011497529457e-06,
764
- "loss": 2.5859,
765
  "step": 252
766
  },
767
  {
768
- "epoch": 3.79,
769
- "learning_rate": 6.332274456137097e-06,
770
- "loss": 1.7433,
771
  "step": 254
772
  },
773
  {
774
- "epoch": 3.82,
775
- "learning_rate": 6.182494273691602e-06,
776
- "loss": 1.6223,
777
  "step": 256
778
  },
779
  {
780
- "epoch": 3.85,
781
- "learning_rate": 6.033710025774253e-06,
782
- "loss": 1.5475,
783
  "step": 258
784
  },
785
  {
786
- "epoch": 3.88,
787
- "learning_rate": 5.885960528140784e-06,
788
- "loss": 1.3609,
789
  "step": 260
790
  },
791
  {
792
- "epoch": 3.91,
793
- "learning_rate": 5.739284326594845e-06,
794
- "loss": 1.6793,
795
  "step": 262
796
  },
797
  {
798
- "epoch": 3.94,
799
- "learning_rate": 5.59371968693198e-06,
800
- "loss": 1.8558,
801
  "step": 264
802
  },
803
  {
804
- "epoch": 3.97,
805
- "learning_rate": 5.449304584956582e-06,
806
- "loss": 1.8424,
807
  "step": 266
808
  },
809
  {
810
- "epoch": 4.0,
811
- "learning_rate": 5.306076696574522e-06,
812
- "loss": 1.672,
813
  "step": 268
814
  },
815
  {
816
- "epoch": 4.03,
817
- "learning_rate": 5.164073387964057e-06,
818
- "loss": 2.6413,
819
  "step": 270
820
  },
821
  {
822
- "epoch": 4.06,
823
- "learning_rate": 5.023331705827477e-06,
824
- "loss": 1.8991,
825
  "step": 272
826
  },
827
  {
828
- "epoch": 4.09,
829
- "learning_rate": 4.883888367726153e-06,
830
- "loss": 1.7208,
831
  "step": 274
832
  },
833
  {
834
- "epoch": 4.12,
835
- "learning_rate": 4.74577975250143e-06,
836
- "loss": 1.6942,
837
  "step": 276
838
  },
839
  {
840
- "epoch": 4.15,
841
- "learning_rate": 4.609041890783882e-06,
842
- "loss": 1.5046,
843
  "step": 278
844
  },
845
  {
846
- "epoch": 4.18,
847
- "learning_rate": 4.473710455593416e-06,
848
- "loss": 1.484,
849
  "step": 280
850
  },
851
  {
852
- "epoch": 4.21,
853
- "learning_rate": 4.339820753032692e-06,
854
- "loss": 1.7197,
855
  "step": 282
856
  },
857
  {
858
- "epoch": 4.23,
859
- "learning_rate": 4.207407713076221e-06,
860
- "loss": 1.9587,
861
  "step": 284
862
  },
863
  {
864
- "epoch": 4.26,
865
- "learning_rate": 4.076505880457642e-06,
866
- "loss": 2.2569,
867
  "step": 286
868
  },
869
  {
870
- "epoch": 4.29,
871
- "learning_rate": 3.947149405657469e-06,
872
- "loss": 1.8487,
873
  "step": 288
874
  },
875
  {
876
- "epoch": 4.32,
877
- "learning_rate": 3.8193720359936905e-06,
878
- "loss": 1.7394,
879
  "step": 290
880
  },
881
  {
882
- "epoch": 4.35,
883
- "learning_rate": 3.69320710681758e-06,
884
- "loss": 1.7156,
885
  "step": 292
886
  },
887
  {
888
- "epoch": 4.38,
889
- "learning_rate": 3.5686875328169513e-06,
890
- "loss": 1.3529,
891
  "step": 294
892
  },
893
  {
894
- "epoch": 4.41,
895
- "learning_rate": 3.4458457994291763e-06,
896
- "loss": 1.6026,
897
  "step": 296
898
  },
899
  {
900
- "epoch": 4.44,
901
- "learning_rate": 3.324713954366171e-06,
902
- "loss": 1.8169,
903
  "step": 298
904
  },
905
  {
906
- "epoch": 4.47,
907
- "learning_rate": 3.2053235992536137e-06,
908
- "loss": 1.5503,
909
  "step": 300
910
  },
911
  {
912
- "epoch": 4.5,
913
- "learning_rate": 3.0877058813864856e-06,
914
- "loss": 2.1322,
915
  "step": 302
916
  },
917
  {
918
- "epoch": 4.53,
919
- "learning_rate": 2.9718914856032033e-06,
920
- "loss": 1.6235,
921
  "step": 304
922
  },
923
  {
924
- "epoch": 4.56,
925
- "learning_rate": 2.8579106262803467e-06,
926
- "loss": 1.7218,
927
  "step": 306
928
  },
929
  {
930
- "epoch": 4.59,
931
- "learning_rate": 2.7457930394501564e-06,
932
- "loss": 1.6783,
933
  "step": 308
934
  },
935
  {
936
- "epoch": 4.62,
937
- "learning_rate": 2.635567975042809e-06,
938
- "loss": 1.6301,
939
  "step": 310
940
  },
941
  {
942
- "epoch": 4.65,
943
- "learning_rate": 2.527264189255507e-06,
944
- "loss": 1.6768,
945
  "step": 312
946
  },
947
  {
948
- "epoch": 4.68,
949
- "learning_rate": 2.420909937050405e-06,
950
- "loss": 1.7345,
951
  "step": 314
952
  },
953
  {
954
- "epoch": 4.71,
955
- "learning_rate": 2.3165329647832525e-06,
956
- "loss": 1.8255,
957
  "step": 316
958
  },
959
  {
960
- "epoch": 4.74,
961
- "learning_rate": 2.214160502964783e-06,
962
- "loss": 2.1617,
963
  "step": 318
964
  },
965
  {
966
- "epoch": 4.77,
967
- "learning_rate": 2.1138192591566177e-06,
968
- "loss": 2.0365,
969
  "step": 320
970
  },
971
  {
972
- "epoch": 4.8,
973
- "learning_rate": 2.0155354110036607e-06,
974
- "loss": 1.7143,
975
  "step": 322
976
  },
977
  {
978
- "epoch": 4.83,
979
- "learning_rate": 1.9193345994046965e-06,
980
- "loss": 1.7533,
981
  "step": 324
982
  },
983
  {
984
- "epoch": 4.86,
985
- "learning_rate": 1.8252419218230389e-06,
986
- "loss": 1.6113,
987
  "step": 326
988
  },
989
  {
990
- "epoch": 4.89,
991
- "learning_rate": 1.7332819257389388e-06,
992
- "loss": 1.3852,
993
  "step": 328
994
  },
995
  {
996
- "epoch": 4.92,
997
- "learning_rate": 1.6434786022455073e-06,
998
- "loss": 1.6639,
999
  "step": 330
1000
  },
1001
  {
1002
- "epoch": 4.95,
1003
- "learning_rate": 1.5558553797897469e-06,
1004
- "loss": 1.6325,
1005
  "step": 332
1006
  },
1007
  {
1008
- "epoch": 4.98,
1009
- "learning_rate": 1.4704351180604126e-06,
1010
- "loss": 1.7177,
1011
  "step": 334
1012
  },
1013
  {
1014
- "epoch": 5.01,
1015
- "learning_rate": 1.3872401020242222e-06,
1016
- "loss": 1.9718,
1017
  "step": 336
1018
  },
1019
  {
1020
- "epoch": 5.04,
1021
- "learning_rate": 1.3062920361120224e-06,
1022
- "loss": 2.3978,
1023
  "step": 338
1024
  },
1025
  {
1026
- "epoch": 5.07,
1027
- "learning_rate": 1.2276120385564006e-06,
1028
- "loss": 1.6484,
1029
  "step": 340
1030
  },
1031
  {
1032
- "epoch": 5.1,
1033
- "learning_rate": 1.1512206358822264e-06,
1034
- "loss": 1.6426,
1035
  "step": 342
1036
  },
1037
  {
1038
- "epoch": 5.13,
1039
- "learning_rate": 1.077137757551573e-06,
1040
- "loss": 1.5856,
1041
  "step": 344
1042
  },
1043
  {
1044
- "epoch": 5.16,
1045
- "learning_rate": 1.005382730764386e-06,
1046
- "loss": 1.5543,
1047
  "step": 346
1048
  },
1049
  {
1050
- "epoch": 5.19,
1051
- "learning_rate": 9.359742754162926e-07,
1052
- "loss": 1.5946,
1053
  "step": 348
1054
  },
1055
  {
1056
- "epoch": 5.22,
1057
- "learning_rate": 8.689304992148285e-07,
1058
- "loss": 1.6595,
1059
  "step": 350
1060
  },
1061
  {
1062
- "epoch": 5.25,
1063
- "learning_rate": 8.042688929554076e-07,
1064
- "loss": 1.8412,
1065
  "step": 352
1066
  },
1067
  {
1068
- "epoch": 5.28,
1069
- "learning_rate": 7.420063259581856e-07,
1070
- "loss": 2.3088,
1071
  "step": 354
1072
  },
1073
  {
1074
- "epoch": 5.31,
1075
- "learning_rate": 6.821590416671108e-07,
1076
- "loss": 1.8214,
1077
  "step": 356
1078
  },
1079
  {
1080
- "epoch": 5.34,
1081
- "learning_rate": 6.247426534122292e-07,
1082
- "loss": 1.8167,
1083
  "step": 358
1084
  },
1085
  {
1086
- "epoch": 5.37,
1087
- "learning_rate": 5.697721403363699e-07,
1088
- "loss": 1.5789,
1089
  "step": 360
1090
  },
1091
  {
1092
- "epoch": 5.4,
1093
- "learning_rate": 5.172618434873112e-07,
1094
- "loss": 1.7484,
1095
  "step": 362
1096
  },
1097
  {
1098
- "epoch": 5.43,
1099
- "learning_rate": 4.672254620763839e-07,
1100
- "loss": 1.728,
1101
  "step": 364
1102
  },
1103
  {
1104
- "epoch": 5.46,
1105
- "learning_rate": 4.196760499045505e-07,
1106
- "loss": 1.9324,
1107
  "step": 366
1108
  },
1109
  {
1110
- "epoch": 5.49,
1111
- "learning_rate": 3.746260119568368e-07,
1112
- "loss": 1.4902,
1113
  "step": 368
1114
  },
1115
  {
1116
- "epoch": 5.52,
1117
- "learning_rate": 3.320871011660498e-07,
1118
- "loss": 2.3606,
1119
  "step": 370
1120
  },
1121
  {
1122
- "epoch": 5.55,
1123
- "learning_rate": 2.920704153465936e-07,
1124
- "loss": 1.7926,
1125
  "step": 372
1126
  },
1127
  {
1128
- "epoch": 5.58,
1129
- "learning_rate": 2.5458639429921105e-07,
1130
- "loss": 1.8292,
1131
  "step": 374
1132
  },
1133
  {
1134
- "epoch": 5.61,
1135
- "learning_rate": 2.196448170873755e-07,
1136
- "loss": 1.6128,
1137
  "step": 376
1138
  },
1139
  {
1140
- "epoch": 5.64,
1141
- "learning_rate": 1.8725479948607515e-07,
1142
- "loss": 1.3591,
1143
  "step": 378
1144
  },
1145
  {
1146
- "epoch": 5.67,
1147
- "learning_rate": 1.5742479160362978e-07,
1148
- "loss": 1.4791,
1149
  "step": 380
1150
  },
1151
  {
1152
- "epoch": 5.7,
1153
- "learning_rate": 1.3016257567717295e-07,
1154
- "loss": 1.8504,
1155
  "step": 382
1156
  },
1157
  {
1158
- "epoch": 5.73,
1159
- "learning_rate": 1.054752640423784e-07,
1160
- "loss": 1.7373,
1161
  "step": 384
1162
  },
1163
  {
1164
- "epoch": 5.76,
1165
- "learning_rate": 8.336929727794318e-08,
1166
- "loss": 2.1627,
1167
  "step": 386
1168
  },
1169
  {
1170
- "epoch": 5.79,
1171
- "learning_rate": 6.385044252533723e-08,
1172
- "loss": 1.6952,
1173
  "step": 388
1174
  },
1175
  {
1176
- "epoch": 5.82,
1177
- "learning_rate": 4.692379198422803e-08,
1178
- "loss": 1.7199,
1179
  "step": 390
1180
  },
1181
  {
1182
- "epoch": 5.85,
1183
- "learning_rate": 3.259376158400329e-08,
1184
- "loss": 1.7157,
1185
  "step": 392
1186
  },
1187
  {
1188
- "epoch": 5.88,
1189
- "learning_rate": 2.0864089831711398e-08,
1190
- "loss": 1.4137,
1191
  "step": 394
1192
  },
1193
  {
1194
- "epoch": 5.9,
1195
- "learning_rate": 1.1737836836737126e-08,
1196
- "loss": 1.6499,
1197
  "step": 396
1198
  },
1199
  {
1200
- "epoch": 5.93,
1201
- "learning_rate": 5.217383512463592e-09,
1202
- "loss": 1.7672,
1203
  "step": 398
1204
  },
1205
  {
1206
- "epoch": 5.96,
1207
- "learning_rate": 1.3044309551213385e-09,
1208
- "loss": 1.5814,
1209
  "step": 400
1210
  },
1211
  {
1212
- "epoch": 5.99,
1213
- "learning_rate": 0.0,
1214
- "loss": 1.6152,
1215
  "step": 402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216
  }
1217
  ],
1218
  "logging_steps": 2,
1219
- "max_steps": 402,
1220
- "num_train_epochs": 6,
1221
  "save_steps": 500,
1222
- "total_flos": 1.7781494793805824e+16,
1223
  "trial_name": null,
1224
  "trial_params": null
1225
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9998001465591899,
5
  "eval_steps": 500,
6
+ "global_step": 938,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
13
+ "learning_rate": 4.705882352941177e-06,
14
+ "loss": 2.3894,
15
  "step": 2
16
  },
17
  {
18
+ "epoch": 0.0,
19
+ "learning_rate": 9.411764705882354e-06,
20
+ "loss": 2.4461,
21
  "step": 4
22
  },
23
  {
24
+ "epoch": 0.01,
25
+ "learning_rate": 1.411764705882353e-05,
26
+ "loss": 2.5984,
27
  "step": 6
28
  },
29
  {
30
+ "epoch": 0.01,
31
+ "learning_rate": 1.8823529411764708e-05,
32
+ "loss": 2.7012,
33
  "step": 8
34
  },
35
  {
36
+ "epoch": 0.01,
37
+ "learning_rate": 2.3529411764705884e-05,
38
+ "loss": 2.5558,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.01,
43
+ "learning_rate": 2.823529411764706e-05,
44
+ "loss": 2.7513,
45
  "step": 12
46
  },
47
  {
48
+ "epoch": 0.01,
49
+ "learning_rate": 3.294117647058824e-05,
50
+ "loss": 2.7527,
51
  "step": 14
52
  },
53
  {
54
+ "epoch": 0.02,
55
+ "learning_rate": 3.7647058823529415e-05,
56
+ "loss": 2.6276,
57
  "step": 16
58
  },
59
  {
60
+ "epoch": 0.02,
61
+ "learning_rate": 4.235294117647059e-05,
62
+ "loss": 2.6711,
63
  "step": 18
64
  },
65
  {
66
+ "epoch": 0.02,
67
+ "learning_rate": 4.705882352941177e-05,
68
+ "loss": 2.6975,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.02,
73
+ "learning_rate": 5.176470588235295e-05,
74
+ "loss": 2.5489,
75
  "step": 22
76
  },
77
  {
78
+ "epoch": 0.03,
79
+ "learning_rate": 5.647058823529412e-05,
80
+ "loss": 2.5387,
81
  "step": 24
82
  },
83
  {
84
+ "epoch": 0.03,
85
+ "learning_rate": 6.11764705882353e-05,
86
+ "loss": 2.5694,
87
  "step": 26
88
  },
89
  {
90
+ "epoch": 0.03,
91
+ "learning_rate": 6.588235294117648e-05,
92
+ "loss": 2.6244,
93
  "step": 28
94
  },
95
  {
96
+ "epoch": 0.03,
97
+ "learning_rate": 7.058823529411765e-05,
98
+ "loss": 2.5018,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.03,
103
+ "learning_rate": 7.529411764705883e-05,
104
+ "loss": 2.4123,
105
  "step": 32
106
  },
107
  {
108
+ "epoch": 0.04,
109
+ "learning_rate": 8e-05,
110
+ "loss": 2.3234,
111
  "step": 34
112
  },
113
  {
114
+ "epoch": 0.04,
115
+ "learning_rate": 8.470588235294118e-05,
116
+ "loss": 2.0958,
117
  "step": 36
118
  },
119
  {
120
+ "epoch": 0.04,
121
+ "learning_rate": 8.941176470588236e-05,
122
+ "loss": 2.2023,
123
  "step": 38
124
  },
125
  {
126
+ "epoch": 0.04,
127
+ "learning_rate": 9.411764705882353e-05,
128
+ "loss": 2.1864,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.04,
133
+ "learning_rate": 9.882352941176471e-05,
134
+ "loss": 2.0768,
135
  "step": 42
136
  },
137
  {
138
+ "epoch": 0.05,
139
+ "learning_rate": 0.0001035294117647059,
140
+ "loss": 2.189,
141
  "step": 44
142
  },
143
  {
144
+ "epoch": 0.05,
145
+ "learning_rate": 0.00010823529411764706,
146
+ "loss": 1.9094,
147
  "step": 46
148
  },
149
  {
150
+ "epoch": 0.05,
151
+ "learning_rate": 0.00011294117647058824,
152
+ "loss": 2.0136,
153
  "step": 48
154
  },
155
  {
156
+ "epoch": 0.05,
157
+ "learning_rate": 0.00011764705882352942,
158
+ "loss": 1.7677,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.06,
163
+ "learning_rate": 0.0001223529411764706,
164
+ "loss": 2.3789,
165
  "step": 52
166
  },
167
  {
168
+ "epoch": 0.06,
169
+ "learning_rate": 0.00012705882352941175,
170
+ "loss": 2.2419,
171
  "step": 54
172
  },
173
  {
174
+ "epoch": 0.06,
175
+ "learning_rate": 0.00013176470588235296,
176
+ "loss": 2.2654,
177
  "step": 56
178
  },
179
  {
180
+ "epoch": 0.06,
181
+ "learning_rate": 0.00013647058823529413,
182
+ "loss": 2.3797,
183
  "step": 58
184
  },
185
  {
186
+ "epoch": 0.06,
187
+ "learning_rate": 0.0001411764705882353,
188
+ "loss": 2.319,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.07,
193
+ "learning_rate": 0.00014588235294117646,
194
+ "loss": 2.3527,
195
  "step": 62
196
  },
197
  {
198
+ "epoch": 0.07,
199
+ "learning_rate": 0.00015058823529411766,
200
+ "loss": 2.129,
201
  "step": 64
202
  },
203
  {
204
+ "epoch": 0.07,
205
+ "learning_rate": 0.00015529411764705884,
206
+ "loss": 2.2285,
207
  "step": 66
208
  },
209
  {
210
+ "epoch": 0.07,
211
+ "learning_rate": 0.00016,
212
+ "loss": 2.2231,
213
  "step": 68
214
  },
215
  {
216
+ "epoch": 0.07,
217
+ "learning_rate": 0.0001647058823529412,
218
+ "loss": 2.0318,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 0.08,
223
+ "learning_rate": 0.00016941176470588237,
224
+ "loss": 2.2135,
225
  "step": 72
226
  },
227
  {
228
+ "epoch": 0.08,
229
+ "learning_rate": 0.00017411764705882354,
230
+ "loss": 2.093,
231
  "step": 74
232
  },
233
  {
234
+ "epoch": 0.08,
235
+ "learning_rate": 0.00017882352941176472,
236
+ "loss": 2.0507,
237
  "step": 76
238
  },
239
  {
240
+ "epoch": 0.08,
241
+ "learning_rate": 0.0001835294117647059,
242
+ "loss": 2.115,
243
  "step": 78
244
  },
245
  {
246
+ "epoch": 0.09,
247
+ "learning_rate": 0.00018823529411764707,
248
+ "loss": 2.1991,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 0.09,
253
+ "learning_rate": 0.00019294117647058825,
254
+ "loss": 2.1561,
255
  "step": 82
256
  },
257
  {
258
+ "epoch": 0.09,
259
+ "learning_rate": 0.00019764705882352942,
260
+ "loss": 2.1816,
261
  "step": 84
262
  },
263
  {
264
+ "epoch": 0.09,
265
+ "learning_rate": 0.00019999993373829138,
266
+ "loss": 1.9079,
267
  "step": 86
268
  },
269
  {
270
+ "epoch": 0.09,
271
+ "learning_rate": 0.00019999940364514925,
272
+ "loss": 2.1371,
273
  "step": 88
274
  },
275
  {
276
+ "epoch": 0.1,
277
+ "learning_rate": 0.00019999834346167496,
278
+ "loss": 2.0254,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 0.1,
283
+ "learning_rate": 0.00019999675319348848,
284
+ "loss": 1.9081,
285
  "step": 92
286
  },
287
  {
288
+ "epoch": 0.1,
289
+ "learning_rate": 0.0001999946328490197,
290
+ "loss": 1.9681,
291
  "step": 94
292
  },
293
  {
294
+ "epoch": 0.1,
295
+ "learning_rate": 0.0001999919824395085,
296
+ "loss": 2.1084,
297
  "step": 96
298
  },
299
  {
300
+ "epoch": 0.1,
301
+ "learning_rate": 0.00019998880197900446,
302
+ "loss": 1.8421,
303
  "step": 98
304
  },
305
  {
306
+ "epoch": 0.11,
307
+ "learning_rate": 0.00019998509148436697,
308
+ "loss": 2.0253,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 0.11,
313
+ "learning_rate": 0.0001999808509752652,
314
+ "loss": 2.2719,
315
  "step": 102
316
  },
317
  {
318
+ "epoch": 0.11,
319
+ "learning_rate": 0.00019997608047417776,
320
+ "loss": 2.3961,
321
  "step": 104
322
  },
323
  {
324
+ "epoch": 0.11,
325
+ "learning_rate": 0.0001999707800063928,
326
+ "loss": 2.2332,
327
  "step": 106
328
  },
329
  {
330
+ "epoch": 0.12,
331
+ "learning_rate": 0.00019996494960000774,
332
+ "loss": 2.3148,
333
  "step": 108
334
  },
335
  {
336
+ "epoch": 0.12,
337
+ "learning_rate": 0.00019995858928592916,
338
+ "loss": 2.2659,
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 0.12,
343
+ "learning_rate": 0.00019995169909787271,
344
+ "loss": 2.3069,
345
  "step": 112
346
  },
347
  {
348
+ "epoch": 0.12,
349
+ "learning_rate": 0.0001999442790723628,
350
+ "loss": 2.1792,
351
  "step": 114
352
  },
353
  {
354
+ "epoch": 0.12,
355
+ "learning_rate": 0.0001999363292487325,
356
+ "loss": 2.2628,
357
  "step": 116
358
  },
359
  {
360
+ "epoch": 0.13,
361
+ "learning_rate": 0.00019992784966912333,
362
+ "loss": 2.3282,
363
  "step": 118
364
  },
365
  {
366
+ "epoch": 0.13,
367
+ "learning_rate": 0.00019991884037848497,
368
+ "loss": 2.286,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 0.13,
373
+ "learning_rate": 0.00019990930142457515,
374
+ "loss": 2.2392,
375
  "step": 122
376
  },
377
  {
378
+ "epoch": 0.13,
379
+ "learning_rate": 0.00019989923285795914,
380
+ "loss": 2.1468,
381
  "step": 124
382
  },
383
  {
384
+ "epoch": 0.13,
385
+ "learning_rate": 0.0001998886347320098,
386
+ "loss": 2.2453,
387
  "step": 126
388
  },
389
  {
390
+ "epoch": 0.14,
391
+ "learning_rate": 0.00019987750710290713,
392
+ "loss": 2.1149,
393
  "step": 128
394
  },
395
  {
396
+ "epoch": 0.14,
397
+ "learning_rate": 0.00019986585002963793,
398
+ "loss": 2.1051,
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 0.14,
403
+ "learning_rate": 0.00019985366357399564,
404
+ "loss": 2.0007,
405
  "step": 132
406
  },
407
  {
408
+ "epoch": 0.14,
409
+ "learning_rate": 0.00019984094780057978,
410
+ "loss": 1.963,
411
  "step": 134
412
  },
413
  {
414
+ "epoch": 0.14,
415
+ "learning_rate": 0.00019982770277679596,
416
+ "loss": 1.9235,
417
  "step": 136
418
  },
419
  {
420
+ "epoch": 0.15,
421
+ "learning_rate": 0.00019981392857285505,
422
+ "loss": 1.861,
423
  "step": 138
424
  },
425
  {
426
+ "epoch": 0.15,
427
+ "learning_rate": 0.0001997996252617733,
428
+ "loss": 2.1247,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 0.15,
433
+ "learning_rate": 0.00019978479291937165,
434
+ "loss": 1.7524,
435
  "step": 142
436
  },
437
  {
438
+ "epoch": 0.15,
439
+ "learning_rate": 0.0001997694316242753,
440
+ "loss": 1.9346,
441
  "step": 144
442
  },
443
  {
444
+ "epoch": 0.16,
445
+ "learning_rate": 0.00019975354145791355,
446
+ "loss": 1.6186,
447
  "step": 146
448
  },
449
  {
450
+ "epoch": 0.16,
451
+ "learning_rate": 0.00019973712250451908,
452
+ "loss": 1.9757,
453
  "step": 148
454
  },
455
  {
456
+ "epoch": 0.16,
457
+ "learning_rate": 0.00019972017485112774,
458
+ "loss": 1.9019,
459
  "step": 150
460
  },
461
  {
462
+ "epoch": 0.16,
463
+ "learning_rate": 0.00019970269858757787,
464
+ "loss": 2.2553,
465
  "step": 152
466
  },
467
  {
468
+ "epoch": 0.16,
469
+ "learning_rate": 0.00019968469380651015,
470
+ "loss": 2.2539,
471
  "step": 154
472
  },
473
  {
474
+ "epoch": 0.17,
475
+ "learning_rate": 0.00019966616060336655,
476
+ "loss": 2.173,
477
  "step": 156
478
  },
479
  {
480
+ "epoch": 0.17,
481
+ "learning_rate": 0.00019964709907639057,
482
+ "loss": 2.1496,
483
  "step": 158
484
  },
485
  {
486
+ "epoch": 0.17,
487
+ "learning_rate": 0.000199627509326626,
488
+ "loss": 2.2138,
489
  "step": 160
490
  },
491
  {
492
+ "epoch": 0.17,
493
+ "learning_rate": 0.00019960739145791684,
494
+ "loss": 2.2994,
495
  "step": 162
496
  },
497
  {
498
+ "epoch": 0.17,
499
+ "learning_rate": 0.00019958674557690666,
500
+ "loss": 2.2498,
501
  "step": 164
502
  },
503
  {
504
+ "epoch": 0.18,
505
+ "learning_rate": 0.00019956557179303788,
506
+ "loss": 2.2241,
507
  "step": 166
508
  },
509
  {
510
+ "epoch": 0.18,
511
+ "learning_rate": 0.00019954387021855138,
512
+ "loss": 2.257,
513
  "step": 168
514
  },
515
  {
516
+ "epoch": 0.18,
517
+ "learning_rate": 0.00019952164096848578,
518
+ "loss": 2.1482,
519
  "step": 170
520
  },
521
  {
522
+ "epoch": 0.18,
523
+ "learning_rate": 0.00019949888416067688,
524
+ "loss": 2.1853,
525
  "step": 172
526
  },
527
  {
528
+ "epoch": 0.19,
529
+ "learning_rate": 0.00019947559991575706,
530
+ "loss": 2.1136,
531
  "step": 174
532
  },
533
  {
534
+ "epoch": 0.19,
535
+ "learning_rate": 0.00019945178835715458,
536
+ "loss": 2.1496,
537
  "step": 176
538
  },
539
  {
540
+ "epoch": 0.19,
541
+ "learning_rate": 0.00019942744961109297,
542
+ "loss": 2.0761,
543
  "step": 178
544
  },
545
  {
546
+ "epoch": 0.19,
547
+ "learning_rate": 0.0001994025838065903,
548
+ "loss": 2.0665,
549
  "step": 180
550
  },
551
  {
552
+ "epoch": 0.19,
553
+ "learning_rate": 0.00019937719107545864,
554
+ "loss": 2.1202,
555
  "step": 182
556
  },
557
  {
558
+ "epoch": 0.2,
559
+ "learning_rate": 0.00019935127155230314,
560
+ "loss": 1.9078,
561
  "step": 184
562
  },
563
  {
564
+ "epoch": 0.2,
565
+ "learning_rate": 0.0001993248253745216,
566
+ "loss": 1.9247,
567
  "step": 186
568
  },
569
  {
570
+ "epoch": 0.2,
571
+ "learning_rate": 0.0001992978526823034,
572
+ "loss": 1.8159,
573
  "step": 188
574
  },
575
  {
576
+ "epoch": 0.2,
577
+ "learning_rate": 0.00019927035361862904,
578
+ "loss": 1.7471,
579
  "step": 190
580
  },
581
  {
582
+ "epoch": 0.2,
583
+ "learning_rate": 0.0001992423283292693,
584
+ "loss": 1.8876,
585
  "step": 192
586
  },
587
  {
588
+ "epoch": 0.21,
589
+ "learning_rate": 0.00019921377696278437,
590
+ "loss": 1.771,
591
  "step": 194
592
  },
593
  {
594
+ "epoch": 0.21,
595
+ "learning_rate": 0.00019918469967052327,
596
+ "loss": 1.7633,
597
  "step": 196
598
  },
599
  {
600
+ "epoch": 0.21,
601
+ "learning_rate": 0.00019915509660662275,
602
+ "loss": 1.7469,
603
  "step": 198
604
  },
605
  {
606
+ "epoch": 0.21,
607
+ "learning_rate": 0.00019912496792800677,
608
+ "loss": 1.9769,
609
  "step": 200
610
  },
611
  {
612
+ "epoch": 0.22,
613
+ "learning_rate": 0.00019909431379438544,
614
+ "loss": 2.2334,
615
  "step": 202
616
  },
617
  {
618
+ "epoch": 0.22,
619
+ "learning_rate": 0.0001990631343682544,
620
+ "loss": 2.226,
621
  "step": 204
622
  },
623
  {
624
+ "epoch": 0.22,
625
+ "learning_rate": 0.00019903142981489373,
626
+ "loss": 2.129,
627
  "step": 206
628
  },
629
  {
630
+ "epoch": 0.22,
631
+ "learning_rate": 0.0001989992003023672,
632
+ "loss": 2.2408,
633
  "step": 208
634
  },
635
  {
636
+ "epoch": 0.22,
637
+ "learning_rate": 0.00019896644600152135,
638
+ "loss": 2.142,
639
  "step": 210
640
  },
641
  {
642
+ "epoch": 0.23,
643
+ "learning_rate": 0.0001989331670859846,
644
+ "loss": 2.3385,
645
  "step": 212
646
  },
647
  {
648
+ "epoch": 0.23,
649
+ "learning_rate": 0.00019889936373216634,
650
+ "loss": 2.2516,
651
  "step": 214
652
  },
653
  {
654
+ "epoch": 0.23,
655
+ "learning_rate": 0.0001988650361192559,
656
+ "loss": 2.2209,
657
  "step": 216
658
  },
659
  {
660
+ "epoch": 0.23,
661
+ "learning_rate": 0.00019883018442922178,
662
+ "loss": 2.1603,
663
  "step": 218
664
  },
665
  {
666
+ "epoch": 0.23,
667
+ "learning_rate": 0.0001987948088468105,
668
+ "loss": 2.2952,
669
  "step": 220
670
  },
671
  {
672
+ "epoch": 0.24,
673
+ "learning_rate": 0.00019875890955954573,
674
+ "loss": 2.2628,
675
  "step": 222
676
  },
677
  {
678
+ "epoch": 0.24,
679
+ "learning_rate": 0.00019872248675772722,
680
+ "loss": 2.2564,
681
  "step": 224
682
  },
683
  {
684
+ "epoch": 0.24,
685
+ "learning_rate": 0.0001986855406344299,
686
+ "loss": 2.0969,
687
  "step": 226
688
  },
689
  {
690
+ "epoch": 0.24,
691
+ "learning_rate": 0.00019864807138550273,
692
+ "loss": 2.1252,
693
  "step": 228
694
  },
695
  {
696
+ "epoch": 0.25,
697
+ "learning_rate": 0.00019861007920956786,
698
+ "loss": 1.9448,
699
  "step": 230
700
  },
701
  {
702
+ "epoch": 0.25,
703
+ "learning_rate": 0.0001985715643080192,
704
+ "loss": 1.9488,
705
  "step": 232
706
  },
707
  {
708
+ "epoch": 0.25,
709
+ "learning_rate": 0.00019853252688502187,
710
+ "loss": 1.9668,
711
  "step": 234
712
  },
713
  {
714
+ "epoch": 0.25,
715
+ "learning_rate": 0.00019849296714751063,
716
+ "loss": 1.8091,
717
  "step": 236
718
  },
719
  {
720
+ "epoch": 0.25,
721
+ "learning_rate": 0.0001984528853051891,
722
+ "loss": 1.9742,
723
  "step": 238
724
  },
725
  {
726
+ "epoch": 0.26,
727
+ "learning_rate": 0.00019841228157052853,
728
+ "loss": 1.6913,
729
  "step": 240
730
  },
731
  {
732
+ "epoch": 0.26,
733
+ "learning_rate": 0.00019837115615876664,
734
+ "loss": 1.8882,
735
  "step": 242
736
  },
737
  {
738
+ "epoch": 0.26,
739
+ "learning_rate": 0.00019832950928790657,
740
+ "loss": 1.5621,
741
  "step": 244
742
  },
743
  {
744
+ "epoch": 0.26,
745
+ "learning_rate": 0.0001982873411787157,
746
+ "loss": 1.6418,
747
  "step": 246
748
  },
749
  {
750
+ "epoch": 0.26,
751
+ "learning_rate": 0.0001982446520547244,
752
+ "loss": 1.837,
753
  "step": 248
754
  },
755
  {
756
+ "epoch": 0.27,
757
+ "learning_rate": 0.00019820144214222497,
758
+ "loss": 1.8345,
759
  "step": 250
760
  },
761
  {
762
+ "epoch": 0.27,
763
+ "learning_rate": 0.00019815771167027034,
764
+ "loss": 2.2542,
765
  "step": 252
766
  },
767
  {
768
+ "epoch": 0.27,
769
+ "learning_rate": 0.00019811346087067287,
770
+ "loss": 2.3098,
771
  "step": 254
772
  },
773
  {
774
+ "epoch": 0.27,
775
+ "learning_rate": 0.00019806868997800317,
776
+ "loss": 2.1615,
777
  "step": 256
778
  },
779
  {
780
+ "epoch": 0.27,
781
+ "learning_rate": 0.0001980233992295889,
782
+ "loss": 2.314,
783
  "step": 258
784
  },
785
  {
786
+ "epoch": 0.28,
787
+ "learning_rate": 0.00019797758886551324,
788
+ "loss": 2.2309,
789
  "step": 260
790
  },
791
  {
792
+ "epoch": 0.28,
793
+ "learning_rate": 0.0001979312591286141,
794
+ "loss": 2.1631,
795
  "step": 262
796
  },
797
  {
798
+ "epoch": 0.28,
799
+ "learning_rate": 0.00019788441026448225,
800
+ "loss": 2.0791,
801
  "step": 264
802
  },
803
  {
804
+ "epoch": 0.28,
805
+ "learning_rate": 0.0001978370425214606,
806
+ "loss": 2.2653,
807
  "step": 266
808
  },
809
  {
810
+ "epoch": 0.29,
811
+ "learning_rate": 0.0001977891561506424,
812
+ "loss": 2.2547,
813
  "step": 268
814
  },
815
  {
816
+ "epoch": 0.29,
817
+ "learning_rate": 0.00019774075140587024,
818
+ "loss": 2.2326,
819
  "step": 270
820
  },
821
  {
822
+ "epoch": 0.29,
823
+ "learning_rate": 0.00019769182854373444,
824
+ "loss": 2.0206,
825
  "step": 272
826
  },
827
  {
828
+ "epoch": 0.29,
829
+ "learning_rate": 0.000197642387823572,
830
+ "loss": 2.0665,
831
  "step": 274
832
  },
833
  {
834
+ "epoch": 0.29,
835
+ "learning_rate": 0.00019759242950746487,
836
+ "loss": 2.1532,
837
  "step": 276
838
  },
839
  {
840
+ "epoch": 0.3,
841
+ "learning_rate": 0.0001975419538602389,
842
+ "loss": 2.0988,
843
  "step": 278
844
  },
845
  {
846
+ "epoch": 0.3,
847
+ "learning_rate": 0.0001974909611494622,
848
+ "loss": 1.937,
849
  "step": 280
850
  },
851
  {
852
+ "epoch": 0.3,
853
+ "learning_rate": 0.0001974394516454438,
854
+ "loss": 1.9614,
855
  "step": 282
856
  },
857
  {
858
+ "epoch": 0.3,
859
+ "learning_rate": 0.00019738742562123225,
860
+ "loss": 1.8518,
861
  "step": 284
862
  },
863
  {
864
+ "epoch": 0.3,
865
+ "learning_rate": 0.00019733488335261408,
866
+ "loss": 1.7994,
867
  "step": 286
868
  },
869
  {
870
+ "epoch": 0.31,
871
+ "learning_rate": 0.00019728182511811245,
872
+ "loss": 1.9275,
873
  "step": 288
874
  },
875
  {
876
+ "epoch": 0.31,
877
+ "learning_rate": 0.00019722825119898566,
878
+ "loss": 1.9991,
879
  "step": 290
880
  },
881
  {
882
+ "epoch": 0.31,
883
+ "learning_rate": 0.0001971741618792255,
884
+ "loss": 1.6737,
885
  "step": 292
886
  },
887
  {
888
+ "epoch": 0.31,
889
+ "learning_rate": 0.00019711955744555594,
890
+ "loss": 1.4669,
891
  "step": 294
892
  },
893
  {
894
+ "epoch": 0.32,
895
+ "learning_rate": 0.0001970644381874316,
896
+ "loss": 1.5002,
897
  "step": 296
898
  },
899
  {
900
+ "epoch": 0.32,
901
+ "learning_rate": 0.00019700880439703602,
902
+ "loss": 1.9151,
903
  "step": 298
904
  },
905
  {
906
+ "epoch": 0.32,
907
+ "learning_rate": 0.00019695265636928032,
908
+ "loss": 1.7493,
909
  "step": 300
910
  },
911
  {
912
+ "epoch": 0.32,
913
+ "learning_rate": 0.00019689599440180153,
914
+ "loss": 2.2535,
915
  "step": 302
916
  },
917
  {
918
+ "epoch": 0.32,
919
+ "learning_rate": 0.00019683881879496107,
920
+ "loss": 2.2597,
921
  "step": 304
922
  },
923
  {
924
+ "epoch": 0.33,
925
+ "learning_rate": 0.00019678112985184308,
926
+ "loss": 2.3117,
927
  "step": 306
928
  },
929
  {
930
+ "epoch": 0.33,
931
+ "learning_rate": 0.00019672292787825292,
932
+ "loss": 2.1624,
933
  "step": 308
934
  },
935
  {
936
+ "epoch": 0.33,
937
+ "learning_rate": 0.00019666421318271547,
938
+ "loss": 2.1857,
939
  "step": 310
940
  },
941
  {
942
+ "epoch": 0.33,
943
+ "learning_rate": 0.0001966049860764735,
944
+ "loss": 2.2101,
945
  "step": 312
946
  },
947
  {
948
+ "epoch": 0.33,
949
+ "learning_rate": 0.00019654524687348607,
950
+ "loss": 2.1749,
951
  "step": 314
952
  },
953
  {
954
+ "epoch": 0.34,
955
+ "learning_rate": 0.00019648499589042676,
956
+ "loss": 2.1557,
957
  "step": 316
958
  },
959
  {
960
+ "epoch": 0.34,
961
+ "learning_rate": 0.00019642423344668218,
962
+ "loss": 2.2259,
963
  "step": 318
964
  },
965
  {
966
+ "epoch": 0.34,
967
+ "learning_rate": 0.00019636295986435003,
968
+ "loss": 2.1723,
969
  "step": 320
970
  },
971
  {
972
+ "epoch": 0.34,
973
+ "learning_rate": 0.00019630117546823759,
974
+ "loss": 2.294,
975
  "step": 322
976
  },
977
  {
978
+ "epoch": 0.35,
979
+ "learning_rate": 0.00019623888058585993,
980
+ "loss": 2.2241,
981
  "step": 324
982
  },
983
  {
984
+ "epoch": 0.35,
985
+ "learning_rate": 0.00019617607554743818,
986
+ "loss": 2.1496,
987
  "step": 326
988
  },
989
  {
990
+ "epoch": 0.35,
991
+ "learning_rate": 0.00019611276068589776,
992
+ "loss": 2.1116,
993
  "step": 328
994
  },
995
  {
996
+ "epoch": 0.35,
997
+ "learning_rate": 0.00019604893633686662,
998
+ "loss": 2.1723,
999
  "step": 330
1000
  },
1001
  {
1002
+ "epoch": 0.35,
1003
+ "learning_rate": 0.0001959846028386735,
1004
+ "loss": 2.0301,
1005
  "step": 332
1006
  },
1007
  {
1008
+ "epoch": 0.36,
1009
+ "learning_rate": 0.00019591976053234608,
1010
+ "loss": 1.8651,
1011
  "step": 334
1012
  },
1013
  {
1014
+ "epoch": 0.36,
1015
+ "learning_rate": 0.0001958544097616092,
1016
+ "loss": 1.8286,
1017
  "step": 336
1018
  },
1019
  {
1020
+ "epoch": 0.36,
1021
+ "learning_rate": 0.00019578855087288302,
1022
+ "loss": 1.9081,
1023
  "step": 338
1024
  },
1025
  {
1026
+ "epoch": 0.36,
1027
+ "learning_rate": 0.0001957221842152813,
1028
+ "loss": 1.8546,
1029
  "step": 340
1030
  },
1031
  {
1032
+ "epoch": 0.36,
1033
+ "learning_rate": 0.0001956553101406093,
1034
+ "loss": 1.7914,
1035
  "step": 342
1036
  },
1037
  {
1038
+ "epoch": 0.37,
1039
+ "learning_rate": 0.00019558792900336216,
1040
+ "loss": 1.7997,
1041
  "step": 344
1042
  },
1043
  {
1044
+ "epoch": 0.37,
1045
+ "learning_rate": 0.00019552004116072294,
1046
+ "loss": 1.3596,
1047
  "step": 346
1048
  },
1049
  {
1050
+ "epoch": 0.37,
1051
+ "learning_rate": 0.0001954516469725606,
1052
+ "loss": 1.6549,
1053
  "step": 348
1054
  },
1055
  {
1056
+ "epoch": 0.37,
1057
+ "learning_rate": 0.00019538274680142834,
1058
+ "loss": 1.6592,
1059
  "step": 350
1060
  },
1061
  {
1062
+ "epoch": 0.38,
1063
+ "learning_rate": 0.00019531334101256147,
1064
+ "loss": 2.2759,
1065
  "step": 352
1066
  },
1067
  {
1068
+ "epoch": 0.38,
1069
+ "learning_rate": 0.00019524342997387557,
1070
+ "loss": 2.1805,
1071
  "step": 354
1072
  },
1073
  {
1074
+ "epoch": 0.38,
1075
+ "learning_rate": 0.0001951730140559645,
1076
+ "loss": 2.2585,
1077
  "step": 356
1078
  },
1079
  {
1080
+ "epoch": 0.38,
1081
+ "learning_rate": 0.00019510209363209847,
1082
+ "loss": 2.2573,
1083
  "step": 358
1084
  },
1085
  {
1086
+ "epoch": 0.38,
1087
+ "learning_rate": 0.00019503066907822198,
1088
+ "loss": 2.3154,
1089
  "step": 360
1090
  },
1091
  {
1092
+ "epoch": 0.39,
1093
+ "learning_rate": 0.000194958740772952,
1094
+ "loss": 2.2306,
1095
  "step": 362
1096
  },
1097
  {
1098
+ "epoch": 0.39,
1099
+ "learning_rate": 0.00019488630909757579,
1100
+ "loss": 2.3178,
1101
  "step": 364
1102
  },
1103
  {
1104
+ "epoch": 0.39,
1105
+ "learning_rate": 0.00019481337443604893,
1106
+ "loss": 2.2691,
1107
  "step": 366
1108
  },
1109
  {
1110
+ "epoch": 0.39,
1111
+ "learning_rate": 0.0001947399371749933,
1112
+ "loss": 2.153,
1113
  "step": 368
1114
  },
1115
  {
1116
+ "epoch": 0.39,
1117
+ "learning_rate": 0.00019466599770369509,
1118
+ "loss": 2.0471,
1119
  "step": 370
1120
  },
1121
  {
1122
+ "epoch": 0.4,
1123
+ "learning_rate": 0.00019459155641410257,
1124
+ "loss": 2.3142,
1125
  "step": 372
1126
  },
1127
  {
1128
+ "epoch": 0.4,
1129
+ "learning_rate": 0.00019451661370082426,
1130
+ "loss": 2.1241,
1131
  "step": 374
1132
  },
1133
  {
1134
+ "epoch": 0.4,
1135
+ "learning_rate": 0.0001944411699611265,
1136
+ "loss": 2.1079,
1137
  "step": 376
1138
  },
1139
  {
1140
+ "epoch": 0.4,
1141
+ "learning_rate": 0.0001943652255949317,
1142
+ "loss": 2.063,
1143
  "step": 378
1144
  },
1145
  {
1146
+ "epoch": 0.41,
1147
+ "learning_rate": 0.00019428878100481606,
1148
+ "loss": 2.0062,
1149
  "step": 380
1150
  },
1151
  {
1152
+ "epoch": 0.41,
1153
+ "learning_rate": 0.00019421183659600725,
1154
+ "loss": 1.9,
1155
  "step": 382
1156
  },
1157
  {
1158
+ "epoch": 0.41,
1159
+ "learning_rate": 0.00019413439277638265,
1160
+ "loss": 1.9723,
1161
  "step": 384
1162
  },
1163
  {
1164
+ "epoch": 0.41,
1165
+ "learning_rate": 0.00019405644995646696,
1166
+ "loss": 1.969,
1167
  "step": 386
1168
  },
1169
  {
1170
+ "epoch": 0.41,
1171
+ "learning_rate": 0.00019397800854942986,
1172
+ "loss": 2.0202,
1173
  "step": 388
1174
  },
1175
  {
1176
+ "epoch": 0.42,
1177
+ "learning_rate": 0.00019389906897108428,
1178
+ "loss": 1.9028,
1179
  "step": 390
1180
  },
1181
  {
1182
+ "epoch": 0.42,
1183
+ "learning_rate": 0.0001938196316398837,
1184
+ "loss": 1.8172,
1185
  "step": 392
1186
  },
1187
  {
1188
+ "epoch": 0.42,
1189
+ "learning_rate": 0.00019373969697692028,
1190
+ "loss": 1.7243,
1191
  "step": 394
1192
  },
1193
  {
1194
+ "epoch": 0.42,
1195
+ "learning_rate": 0.00019365926540592247,
1196
+ "loss": 1.4621,
1197
  "step": 396
1198
  },
1199
  {
1200
+ "epoch": 0.42,
1201
+ "learning_rate": 0.0001935783373532528,
1202
+ "loss": 1.7987,
1203
  "step": 398
1204
  },
1205
  {
1206
+ "epoch": 0.43,
1207
+ "learning_rate": 0.00019349691324790555,
1208
+ "loss": 1.7935,
1209
  "step": 400
1210
  },
1211
  {
1212
+ "epoch": 0.43,
1213
+ "learning_rate": 0.0001934149935215047,
1214
+ "loss": 2.2858,
1215
  "step": 402
1216
+ },
1217
+ {
1218
+ "epoch": 0.43,
1219
+ "learning_rate": 0.00019333257860830135,
1220
+ "loss": 2.3798,
1221
+ "step": 404
1222
+ },
1223
+ {
1224
+ "epoch": 0.43,
1225
+ "learning_rate": 0.00019324966894517155,
1226
+ "loss": 2.1534,
1227
+ "step": 406
1228
+ },
1229
+ {
1230
+ "epoch": 0.43,
1231
+ "learning_rate": 0.00019316626497161408,
1232
+ "loss": 2.2648,
1233
+ "step": 408
1234
+ },
1235
+ {
1236
+ "epoch": 0.44,
1237
+ "learning_rate": 0.00019308236712974795,
1238
+ "loss": 2.2946,
1239
+ "step": 410
1240
+ },
1241
+ {
1242
+ "epoch": 0.44,
1243
+ "learning_rate": 0.0001929979758643102,
1244
+ "loss": 2.319,
1245
+ "step": 412
1246
+ },
1247
+ {
1248
+ "epoch": 0.44,
1249
+ "learning_rate": 0.00019291309162265338,
1250
+ "loss": 2.2271,
1251
+ "step": 414
1252
+ },
1253
+ {
1254
+ "epoch": 0.44,
1255
+ "learning_rate": 0.0001928277148547434,
1256
+ "loss": 2.0746,
1257
+ "step": 416
1258
+ },
1259
+ {
1260
+ "epoch": 0.45,
1261
+ "learning_rate": 0.00019274184601315687,
1262
+ "loss": 2.1231,
1263
+ "step": 418
1264
+ },
1265
+ {
1266
+ "epoch": 0.45,
1267
+ "learning_rate": 0.000192655485553079,
1268
+ "loss": 2.3176,
1269
+ "step": 420
1270
+ },
1271
+ {
1272
+ "epoch": 0.45,
1273
+ "learning_rate": 0.00019256863393230094,
1274
+ "loss": 2.1657,
1275
+ "step": 422
1276
+ },
1277
+ {
1278
+ "epoch": 0.45,
1279
+ "learning_rate": 0.00019248129161121748,
1280
+ "loss": 2.2696,
1281
+ "step": 424
1282
+ },
1283
+ {
1284
+ "epoch": 0.45,
1285
+ "learning_rate": 0.0001923934590528246,
1286
+ "loss": 2.2132,
1287
+ "step": 426
1288
+ },
1289
+ {
1290
+ "epoch": 0.46,
1291
+ "learning_rate": 0.00019230513672271698,
1292
+ "loss": 2.104,
1293
+ "step": 428
1294
+ },
1295
+ {
1296
+ "epoch": 0.46,
1297
+ "learning_rate": 0.0001922163250890855,
1298
+ "loss": 1.9319,
1299
+ "step": 430
1300
+ },
1301
+ {
1302
+ "epoch": 0.46,
1303
+ "learning_rate": 0.0001921270246227149,
1304
+ "loss": 1.9152,
1305
+ "step": 432
1306
+ },
1307
+ {
1308
+ "epoch": 0.46,
1309
+ "learning_rate": 0.00019203723579698108,
1310
+ "loss": 1.8615,
1311
+ "step": 434
1312
+ },
1313
+ {
1314
+ "epoch": 0.46,
1315
+ "learning_rate": 0.00019194695908784882,
1316
+ "loss": 2.0695,
1317
+ "step": 436
1318
+ },
1319
+ {
1320
+ "epoch": 0.47,
1321
+ "learning_rate": 0.0001918561949738691,
1322
+ "loss": 1.8626,
1323
+ "step": 438
1324
+ },
1325
+ {
1326
+ "epoch": 0.47,
1327
+ "learning_rate": 0.0001917649439361765,
1328
+ "loss": 1.7538,
1329
+ "step": 440
1330
+ },
1331
+ {
1332
+ "epoch": 0.47,
1333
+ "learning_rate": 0.00019167320645848695,
1334
+ "loss": 2.0082,
1335
+ "step": 442
1336
+ },
1337
+ {
1338
+ "epoch": 0.47,
1339
+ "learning_rate": 0.00019158098302709476,
1340
+ "loss": 1.6011,
1341
+ "step": 444
1342
+ },
1343
+ {
1344
+ "epoch": 0.48,
1345
+ "learning_rate": 0.00019148827413087034,
1346
+ "loss": 1.5286,
1347
+ "step": 446
1348
+ },
1349
+ {
1350
+ "epoch": 0.48,
1351
+ "learning_rate": 0.00019139508026125754,
1352
+ "loss": 1.8795,
1353
+ "step": 448
1354
+ },
1355
+ {
1356
+ "epoch": 0.48,
1357
+ "learning_rate": 0.000191301401912271,
1358
+ "loss": 1.7509,
1359
+ "step": 450
1360
+ },
1361
+ {
1362
+ "epoch": 0.48,
1363
+ "learning_rate": 0.00019120723958049353,
1364
+ "loss": 2.3155,
1365
+ "step": 452
1366
+ },
1367
+ {
1368
+ "epoch": 0.48,
1369
+ "learning_rate": 0.0001911125937650736,
1370
+ "loss": 2.2125,
1371
+ "step": 454
1372
+ },
1373
+ {
1374
+ "epoch": 0.49,
1375
+ "learning_rate": 0.00019101746496772242,
1376
+ "loss": 2.2262,
1377
+ "step": 456
1378
+ },
1379
+ {
1380
+ "epoch": 0.49,
1381
+ "learning_rate": 0.0001909218536927116,
1382
+ "loss": 2.3185,
1383
+ "step": 458
1384
+ },
1385
+ {
1386
+ "epoch": 0.49,
1387
+ "learning_rate": 0.0001908257604468703,
1388
+ "loss": 2.2496,
1389
+ "step": 460
1390
+ },
1391
+ {
1392
+ "epoch": 0.49,
1393
+ "learning_rate": 0.00019072918573958254,
1394
+ "loss": 2.3569,
1395
+ "step": 462
1396
+ },
1397
+ {
1398
+ "epoch": 0.49,
1399
+ "learning_rate": 0.0001906321300827846,
1400
+ "loss": 2.1697,
1401
+ "step": 464
1402
+ },
1403
+ {
1404
+ "epoch": 0.5,
1405
+ "learning_rate": 0.00019053459399096215,
1406
+ "loss": 2.197,
1407
+ "step": 466
1408
+ },
1409
+ {
1410
+ "epoch": 0.5,
1411
+ "learning_rate": 0.00019043657798114766,
1412
+ "loss": 1.9929,
1413
+ "step": 468
1414
+ },
1415
+ {
1416
+ "epoch": 0.5,
1417
+ "learning_rate": 0.00019033808257291768,
1418
+ "loss": 2.1488,
1419
+ "step": 470
1420
+ },
1421
+ {
1422
+ "epoch": 0.5,
1423
+ "learning_rate": 0.0001902391082883899,
1424
+ "loss": 2.2224,
1425
+ "step": 472
1426
+ },
1427
+ {
1428
+ "epoch": 0.51,
1429
+ "learning_rate": 0.00019013965565222062,
1430
+ "loss": 2.0495,
1431
+ "step": 474
1432
+ },
1433
+ {
1434
+ "epoch": 0.51,
1435
+ "learning_rate": 0.00019003972519160178,
1436
+ "loss": 2.0964,
1437
+ "step": 476
1438
+ },
1439
+ {
1440
+ "epoch": 0.51,
1441
+ "learning_rate": 0.0001899393174362582,
1442
+ "loss": 1.8708,
1443
+ "step": 478
1444
+ },
1445
+ {
1446
+ "epoch": 0.51,
1447
+ "learning_rate": 0.00018983843291844492,
1448
+ "loss": 1.9741,
1449
+ "step": 480
1450
+ },
1451
+ {
1452
+ "epoch": 0.51,
1453
+ "learning_rate": 0.00018973707217294415,
1454
+ "loss": 1.9908,
1455
+ "step": 482
1456
+ },
1457
+ {
1458
+ "epoch": 0.52,
1459
+ "learning_rate": 0.00018963523573706264,
1460
+ "loss": 1.7973,
1461
+ "step": 484
1462
+ },
1463
+ {
1464
+ "epoch": 0.52,
1465
+ "learning_rate": 0.0001895329241506287,
1466
+ "loss": 1.9293,
1467
+ "step": 486
1468
+ },
1469
+ {
1470
+ "epoch": 0.52,
1471
+ "learning_rate": 0.00018943013795598944,
1472
+ "loss": 1.94,
1473
+ "step": 488
1474
+ },
1475
+ {
1476
+ "epoch": 0.52,
1477
+ "learning_rate": 0.00018932687769800767,
1478
+ "loss": 1.9435,
1479
+ "step": 490
1480
+ },
1481
+ {
1482
+ "epoch": 0.52,
1483
+ "learning_rate": 0.00018922314392405944,
1484
+ "loss": 1.7907,
1485
+ "step": 492
1486
+ },
1487
+ {
1488
+ "epoch": 0.53,
1489
+ "learning_rate": 0.00018911893718403063,
1490
+ "loss": 1.6344,
1491
+ "step": 494
1492
+ },
1493
+ {
1494
+ "epoch": 0.53,
1495
+ "learning_rate": 0.00018901425803031447,
1496
+ "loss": 1.3563,
1497
+ "step": 496
1498
+ },
1499
+ {
1500
+ "epoch": 0.53,
1501
+ "learning_rate": 0.00018890910701780826,
1502
+ "loss": 1.7,
1503
+ "step": 498
1504
+ },
1505
+ {
1506
+ "epoch": 0.53,
1507
+ "learning_rate": 0.00018880348470391077,
1508
+ "loss": 1.7682,
1509
+ "step": 500
1510
+ },
1511
+ {
1512
+ "epoch": 0.54,
1513
+ "learning_rate": 0.00018869739164851889,
1514
+ "loss": 2.2041,
1515
+ "step": 502
1516
+ },
1517
+ {
1518
+ "epoch": 0.54,
1519
+ "learning_rate": 0.00018859082841402513,
1520
+ "loss": 2.237,
1521
+ "step": 504
1522
+ },
1523
+ {
1524
+ "epoch": 0.54,
1525
+ "learning_rate": 0.0001884837955653142,
1526
+ "loss": 2.1694,
1527
+ "step": 506
1528
+ },
1529
+ {
1530
+ "epoch": 0.54,
1531
+ "learning_rate": 0.00018837629366976025,
1532
+ "loss": 2.2373,
1533
+ "step": 508
1534
+ },
1535
+ {
1536
+ "epoch": 0.54,
1537
+ "learning_rate": 0.0001882683232972239,
1538
+ "loss": 2.2511,
1539
+ "step": 510
1540
+ },
1541
+ {
1542
+ "epoch": 0.55,
1543
+ "learning_rate": 0.000188159885020049,
1544
+ "loss": 2.2474,
1545
+ "step": 512
1546
+ },
1547
+ {
1548
+ "epoch": 0.55,
1549
+ "learning_rate": 0.00018805097941305984,
1550
+ "loss": 2.1938,
1551
+ "step": 514
1552
+ },
1553
+ {
1554
+ "epoch": 0.55,
1555
+ "learning_rate": 0.00018794160705355796,
1556
+ "loss": 2.0874,
1557
+ "step": 516
1558
+ },
1559
+ {
1560
+ "epoch": 0.55,
1561
+ "learning_rate": 0.00018783176852131908,
1562
+ "loss": 2.2795,
1563
+ "step": 518
1564
+ },
1565
+ {
1566
+ "epoch": 0.55,
1567
+ "learning_rate": 0.00018772146439859015,
1568
+ "loss": 2.179,
1569
+ "step": 520
1570
+ },
1571
+ {
1572
+ "epoch": 0.56,
1573
+ "learning_rate": 0.00018761069527008613,
1574
+ "loss": 2.0959,
1575
+ "step": 522
1576
+ },
1577
+ {
1578
+ "epoch": 0.56,
1579
+ "learning_rate": 0.00018749946172298698,
1580
+ "loss": 2.3345,
1581
+ "step": 524
1582
+ },
1583
+ {
1584
+ "epoch": 0.56,
1585
+ "learning_rate": 0.00018738776434693447,
1586
+ "loss": 2.0522,
1587
+ "step": 526
1588
+ },
1589
+ {
1590
+ "epoch": 0.56,
1591
+ "learning_rate": 0.00018727560373402917,
1592
+ "loss": 2.1055,
1593
+ "step": 528
1594
+ },
1595
+ {
1596
+ "epoch": 0.56,
1597
+ "learning_rate": 0.00018716298047882714,
1598
+ "loss": 2.0651,
1599
+ "step": 530
1600
+ },
1601
+ {
1602
+ "epoch": 0.57,
1603
+ "learning_rate": 0.00018704989517833695,
1604
+ "loss": 2.0961,
1605
+ "step": 532
1606
+ },
1607
+ {
1608
+ "epoch": 0.57,
1609
+ "learning_rate": 0.0001869363484320164,
1610
+ "loss": 1.9879,
1611
+ "step": 534
1612
+ },
1613
+ {
1614
+ "epoch": 0.57,
1615
+ "learning_rate": 0.00018682234084176945,
1616
+ "loss": 2.0118,
1617
+ "step": 536
1618
+ },
1619
+ {
1620
+ "epoch": 0.57,
1621
+ "learning_rate": 0.0001867078730119429,
1622
+ "loss": 1.9059,
1623
+ "step": 538
1624
+ },
1625
+ {
1626
+ "epoch": 0.58,
1627
+ "learning_rate": 0.00018659294554932324,
1628
+ "loss": 1.8227,
1629
+ "step": 540
1630
+ },
1631
+ {
1632
+ "epoch": 0.58,
1633
+ "learning_rate": 0.00018647755906313348,
1634
+ "loss": 1.8794,
1635
+ "step": 542
1636
+ },
1637
+ {
1638
+ "epoch": 0.58,
1639
+ "learning_rate": 0.0001863617141650299,
1640
+ "loss": 1.6243,
1641
+ "step": 544
1642
+ },
1643
+ {
1644
+ "epoch": 0.58,
1645
+ "learning_rate": 0.00018624541146909873,
1646
+ "loss": 1.6429,
1647
+ "step": 546
1648
+ },
1649
+ {
1650
+ "epoch": 0.58,
1651
+ "learning_rate": 0.00018612865159185304,
1652
+ "loss": 2.03,
1653
+ "step": 548
1654
+ },
1655
+ {
1656
+ "epoch": 0.59,
1657
+ "learning_rate": 0.0001860114351522293,
1658
+ "loss": 1.7897,
1659
+ "step": 550
1660
+ },
1661
+ {
1662
+ "epoch": 0.59,
1663
+ "learning_rate": 0.00018589376277158425,
1664
+ "loss": 2.2226,
1665
+ "step": 552
1666
+ },
1667
+ {
1668
+ "epoch": 0.59,
1669
+ "learning_rate": 0.00018577563507369153,
1670
+ "loss": 2.2998,
1671
+ "step": 554
1672
+ },
1673
+ {
1674
+ "epoch": 0.59,
1675
+ "learning_rate": 0.00018565705268473837,
1676
+ "loss": 2.1385,
1677
+ "step": 556
1678
+ },
1679
+ {
1680
+ "epoch": 0.59,
1681
+ "learning_rate": 0.0001855380162333223,
1682
+ "loss": 2.1793,
1683
+ "step": 558
1684
+ },
1685
+ {
1686
+ "epoch": 0.6,
1687
+ "learning_rate": 0.0001854185263504478,
1688
+ "loss": 2.2456,
1689
+ "step": 560
1690
+ },
1691
+ {
1692
+ "epoch": 0.6,
1693
+ "learning_rate": 0.00018529858366952298,
1694
+ "loss": 2.162,
1695
+ "step": 562
1696
+ },
1697
+ {
1698
+ "epoch": 0.6,
1699
+ "learning_rate": 0.00018517818882635617,
1700
+ "loss": 2.1046,
1701
+ "step": 564
1702
+ },
1703
+ {
1704
+ "epoch": 0.6,
1705
+ "learning_rate": 0.0001850573424591526,
1706
+ "loss": 2.1761,
1707
+ "step": 566
1708
+ },
1709
+ {
1710
+ "epoch": 0.61,
1711
+ "learning_rate": 0.00018493604520851097,
1712
+ "loss": 2.2593,
1713
+ "step": 568
1714
+ },
1715
+ {
1716
+ "epoch": 0.61,
1717
+ "learning_rate": 0.00018481429771742018,
1718
+ "loss": 2.2067,
1719
+ "step": 570
1720
+ },
1721
+ {
1722
+ "epoch": 0.61,
1723
+ "learning_rate": 0.00018469210063125572,
1724
+ "loss": 2.1257,
1725
+ "step": 572
1726
+ },
1727
+ {
1728
+ "epoch": 0.61,
1729
+ "learning_rate": 0.00018456945459777643,
1730
+ "loss": 2.2823,
1731
+ "step": 574
1732
+ },
1733
+ {
1734
+ "epoch": 0.61,
1735
+ "learning_rate": 0.0001844463602671209,
1736
+ "loss": 2.2942,
1737
+ "step": 576
1738
+ },
1739
+ {
1740
+ "epoch": 0.62,
1741
+ "learning_rate": 0.0001843228182918042,
1742
+ "loss": 2.1126,
1743
+ "step": 578
1744
+ },
1745
+ {
1746
+ "epoch": 0.62,
1747
+ "learning_rate": 0.0001841988293267143,
1748
+ "loss": 2.2014,
1749
+ "step": 580
1750
+ },
1751
+ {
1752
+ "epoch": 0.62,
1753
+ "learning_rate": 0.00018407439402910858,
1754
+ "loss": 2.0244,
1755
+ "step": 582
1756
+ },
1757
+ {
1758
+ "epoch": 0.62,
1759
+ "learning_rate": 0.00018394951305861055,
1760
+ "loss": 1.9842,
1761
+ "step": 584
1762
+ },
1763
+ {
1764
+ "epoch": 0.62,
1765
+ "learning_rate": 0.00018382418707720604,
1766
+ "loss": 1.8637,
1767
+ "step": 586
1768
+ },
1769
+ {
1770
+ "epoch": 0.63,
1771
+ "learning_rate": 0.00018369841674923998,
1772
+ "loss": 1.8229,
1773
+ "step": 588
1774
+ },
1775
+ {
1776
+ "epoch": 0.63,
1777
+ "learning_rate": 0.00018357220274141262,
1778
+ "loss": 1.8023,
1779
+ "step": 590
1780
+ },
1781
+ {
1782
+ "epoch": 0.63,
1783
+ "learning_rate": 0.00018344554572277628,
1784
+ "loss": 1.6493,
1785
+ "step": 592
1786
+ },
1787
+ {
1788
+ "epoch": 0.63,
1789
+ "learning_rate": 0.00018331844636473152,
1790
+ "loss": 1.4272,
1791
+ "step": 594
1792
+ },
1793
+ {
1794
+ "epoch": 0.64,
1795
+ "learning_rate": 0.00018319090534102381,
1796
+ "loss": 1.5226,
1797
+ "step": 596
1798
+ },
1799
+ {
1800
+ "epoch": 0.64,
1801
+ "learning_rate": 0.0001830629233277398,
1802
+ "loss": 1.5868,
1803
+ "step": 598
1804
+ },
1805
+ {
1806
+ "epoch": 0.64,
1807
+ "learning_rate": 0.00018293450100330375,
1808
+ "loss": 1.6859,
1809
+ "step": 600
1810
+ },
1811
+ {
1812
+ "epoch": 0.64,
1813
+ "learning_rate": 0.00018280563904847415,
1814
+ "loss": 2.2781,
1815
+ "step": 602
1816
+ },
1817
+ {
1818
+ "epoch": 0.64,
1819
+ "learning_rate": 0.0001826763381463398,
1820
+ "loss": 2.2743,
1821
+ "step": 604
1822
+ },
1823
+ {
1824
+ "epoch": 0.65,
1825
+ "learning_rate": 0.0001825465989823164,
1826
+ "loss": 2.2974,
1827
+ "step": 606
1828
+ },
1829
+ {
1830
+ "epoch": 0.65,
1831
+ "learning_rate": 0.00018241642224414272,
1832
+ "loss": 2.089,
1833
+ "step": 608
1834
+ },
1835
+ {
1836
+ "epoch": 0.65,
1837
+ "learning_rate": 0.00018228580862187727,
1838
+ "loss": 2.3559,
1839
+ "step": 610
1840
+ },
1841
+ {
1842
+ "epoch": 0.65,
1843
+ "learning_rate": 0.00018215475880789433,
1844
+ "loss": 2.2152,
1845
+ "step": 612
1846
+ },
1847
+ {
1848
+ "epoch": 0.65,
1849
+ "learning_rate": 0.00018202327349688043,
1850
+ "loss": 2.1726,
1851
+ "step": 614
1852
+ },
1853
+ {
1854
+ "epoch": 0.66,
1855
+ "learning_rate": 0.00018189135338583066,
1856
+ "loss": 2.2242,
1857
+ "step": 616
1858
+ },
1859
+ {
1860
+ "epoch": 0.66,
1861
+ "learning_rate": 0.00018175899917404492,
1862
+ "loss": 2.2506,
1863
+ "step": 618
1864
+ },
1865
+ {
1866
+ "epoch": 0.66,
1867
+ "learning_rate": 0.00018162621156312433,
1868
+ "loss": 2.1324,
1869
+ "step": 620
1870
+ },
1871
+ {
1872
+ "epoch": 0.66,
1873
+ "learning_rate": 0.00018149299125696735,
1874
+ "loss": 2.006,
1875
+ "step": 622
1876
+ },
1877
+ {
1878
+ "epoch": 0.67,
1879
+ "learning_rate": 0.00018135933896176612,
1880
+ "loss": 2.2178,
1881
+ "step": 624
1882
+ },
1883
+ {
1884
+ "epoch": 0.67,
1885
+ "learning_rate": 0.00018122525538600282,
1886
+ "loss": 2.0817,
1887
+ "step": 626
1888
+ },
1889
+ {
1890
+ "epoch": 0.67,
1891
+ "learning_rate": 0.00018109074124044572,
1892
+ "loss": 2.0006,
1893
+ "step": 628
1894
+ },
1895
+ {
1896
+ "epoch": 0.67,
1897
+ "learning_rate": 0.00018095579723814557,
1898
+ "loss": 1.9424,
1899
+ "step": 630
1900
+ },
1901
+ {
1902
+ "epoch": 0.67,
1903
+ "learning_rate": 0.00018082042409443174,
1904
+ "loss": 1.9661,
1905
+ "step": 632
1906
+ },
1907
+ {
1908
+ "epoch": 0.68,
1909
+ "learning_rate": 0.00018068462252690843,
1910
+ "loss": 1.8678,
1911
+ "step": 634
1912
+ },
1913
+ {
1914
+ "epoch": 0.68,
1915
+ "learning_rate": 0.00018054839325545096,
1916
+ "loss": 1.8343,
1917
+ "step": 636
1918
+ },
1919
+ {
1920
+ "epoch": 0.68,
1921
+ "learning_rate": 0.0001804117370022018,
1922
+ "loss": 1.8003,
1923
+ "step": 638
1924
+ },
1925
+ {
1926
+ "epoch": 0.68,
1927
+ "learning_rate": 0.0001802746544915669,
1928
+ "loss": 1.7928,
1929
+ "step": 640
1930
+ },
1931
+ {
1932
+ "epoch": 0.68,
1933
+ "learning_rate": 0.00018013714645021166,
1934
+ "loss": 1.8922,
1935
+ "step": 642
1936
+ },
1937
+ {
1938
+ "epoch": 0.69,
1939
+ "learning_rate": 0.00017999921360705733,
1940
+ "loss": 1.6961,
1941
+ "step": 644
1942
+ },
1943
+ {
1944
+ "epoch": 0.69,
1945
+ "learning_rate": 0.0001798608566932769,
1946
+ "loss": 1.3043,
1947
+ "step": 646
1948
+ },
1949
+ {
1950
+ "epoch": 0.69,
1951
+ "learning_rate": 0.00017972207644229138,
1952
+ "loss": 1.8295,
1953
+ "step": 648
1954
+ },
1955
+ {
1956
+ "epoch": 0.69,
1957
+ "learning_rate": 0.0001795828735897658,
1958
+ "loss": 1.7105,
1959
+ "step": 650
1960
+ },
1961
+ {
1962
+ "epoch": 0.69,
1963
+ "learning_rate": 0.00017944324887360553,
1964
+ "loss": 2.1881,
1965
+ "step": 652
1966
+ },
1967
+ {
1968
+ "epoch": 0.7,
1969
+ "learning_rate": 0.000179303203033952,
1970
+ "loss": 2.2434,
1971
+ "step": 654
1972
+ },
1973
+ {
1974
+ "epoch": 0.7,
1975
+ "learning_rate": 0.0001791627368131792,
1976
+ "loss": 2.3566,
1977
+ "step": 656
1978
+ },
1979
+ {
1980
+ "epoch": 0.7,
1981
+ "learning_rate": 0.00017902185095588927,
1982
+ "loss": 2.3222,
1983
+ "step": 658
1984
+ },
1985
+ {
1986
+ "epoch": 0.7,
1987
+ "learning_rate": 0.00017888054620890915,
1988
+ "loss": 2.181,
1989
+ "step": 660
1990
+ },
1991
+ {
1992
+ "epoch": 0.71,
1993
+ "learning_rate": 0.00017873882332128597,
1994
+ "loss": 2.3261,
1995
+ "step": 662
1996
+ },
1997
+ {
1998
+ "epoch": 0.71,
1999
+ "learning_rate": 0.00017859668304428365,
2000
+ "loss": 2.2798,
2001
+ "step": 664
2002
+ },
2003
+ {
2004
+ "epoch": 0.71,
2005
+ "learning_rate": 0.00017845412613137844,
2006
+ "loss": 2.0487,
2007
+ "step": 666
2008
+ },
2009
+ {
2010
+ "epoch": 0.71,
2011
+ "learning_rate": 0.00017831115333825535,
2012
+ "loss": 2.1863,
2013
+ "step": 668
2014
+ },
2015
+ {
2016
+ "epoch": 0.71,
2017
+ "learning_rate": 0.00017816776542280377,
2018
+ "loss": 2.1308,
2019
+ "step": 670
2020
+ },
2021
+ {
2022
+ "epoch": 0.72,
2023
+ "learning_rate": 0.0001780239631451138,
2024
+ "loss": 2.3122,
2025
+ "step": 672
2026
+ },
2027
+ {
2028
+ "epoch": 0.72,
2029
+ "learning_rate": 0.0001778797472674719,
2030
+ "loss": 2.2757,
2031
+ "step": 674
2032
+ },
2033
+ {
2034
+ "epoch": 0.72,
2035
+ "learning_rate": 0.00017773511855435708,
2036
+ "loss": 2.0241,
2037
+ "step": 676
2038
+ },
2039
+ {
2040
+ "epoch": 0.72,
2041
+ "learning_rate": 0.00017759007777243672,
2042
+ "loss": 1.9896,
2043
+ "step": 678
2044
+ },
2045
+ {
2046
+ "epoch": 0.72,
2047
+ "learning_rate": 0.00017744462569056256,
2048
+ "loss": 1.9607,
2049
+ "step": 680
2050
+ },
2051
+ {
2052
+ "epoch": 0.73,
2053
+ "learning_rate": 0.00017729876307976663,
2054
+ "loss": 1.9798,
2055
+ "step": 682
2056
+ },
2057
+ {
2058
+ "epoch": 0.73,
2059
+ "learning_rate": 0.00017715249071325717,
2060
+ "loss": 1.9075,
2061
+ "step": 684
2062
+ },
2063
+ {
2064
+ "epoch": 0.73,
2065
+ "learning_rate": 0.00017700580936641443,
2066
+ "loss": 2.0141,
2067
+ "step": 686
2068
+ },
2069
+ {
2070
+ "epoch": 0.73,
2071
+ "learning_rate": 0.00017685871981678672,
2072
+ "loss": 1.9238,
2073
+ "step": 688
2074
+ },
2075
+ {
2076
+ "epoch": 0.74,
2077
+ "learning_rate": 0.00017671122284408614,
2078
+ "loss": 1.9244,
2079
+ "step": 690
2080
+ },
2081
+ {
2082
+ "epoch": 0.74,
2083
+ "learning_rate": 0.00017656331923018457,
2084
+ "loss": 1.6621,
2085
+ "step": 692
2086
+ },
2087
+ {
2088
+ "epoch": 0.74,
2089
+ "learning_rate": 0.00017641500975910945,
2090
+ "loss": 1.7402,
2091
+ "step": 694
2092
+ },
2093
+ {
2094
+ "epoch": 0.74,
2095
+ "learning_rate": 0.0001762662952170396,
2096
+ "loss": 1.3913,
2097
+ "step": 696
2098
+ },
2099
+ {
2100
+ "epoch": 0.74,
2101
+ "learning_rate": 0.0001761171763923012,
2102
+ "loss": 1.5825,
2103
+ "step": 698
2104
+ },
2105
+ {
2106
+ "epoch": 0.75,
2107
+ "learning_rate": 0.0001759676540753634,
2108
+ "loss": 1.6809,
2109
+ "step": 700
2110
+ },
2111
+ {
2112
+ "epoch": 0.75,
2113
+ "learning_rate": 0.00017581772905883423,
2114
+ "loss": 2.3459,
2115
+ "step": 702
2116
+ },
2117
+ {
2118
+ "epoch": 0.75,
2119
+ "learning_rate": 0.00017566740213745648,
2120
+ "loss": 2.1963,
2121
+ "step": 704
2122
+ },
2123
+ {
2124
+ "epoch": 0.75,
2125
+ "learning_rate": 0.00017551667410810337,
2126
+ "loss": 2.1334,
2127
+ "step": 706
2128
+ },
2129
+ {
2130
+ "epoch": 0.75,
2131
+ "learning_rate": 0.00017536554576977442,
2132
+ "loss": 2.3778,
2133
+ "step": 708
2134
+ },
2135
+ {
2136
+ "epoch": 0.76,
2137
+ "learning_rate": 0.00017521401792359108,
2138
+ "loss": 2.183,
2139
+ "step": 710
2140
+ },
2141
+ {
2142
+ "epoch": 0.76,
2143
+ "learning_rate": 0.0001750620913727926,
2144
+ "loss": 2.1674,
2145
+ "step": 712
2146
+ },
2147
+ {
2148
+ "epoch": 0.76,
2149
+ "learning_rate": 0.00017490976692273176,
2150
+ "loss": 2.3534,
2151
+ "step": 714
2152
+ },
2153
+ {
2154
+ "epoch": 0.76,
2155
+ "learning_rate": 0.00017475704538087055,
2156
+ "loss": 2.1677,
2157
+ "step": 716
2158
+ },
2159
+ {
2160
+ "epoch": 0.77,
2161
+ "learning_rate": 0.00017460392755677592,
2162
+ "loss": 2.1642,
2163
+ "step": 718
2164
+ },
2165
+ {
2166
+ "epoch": 0.77,
2167
+ "learning_rate": 0.0001744504142621155,
2168
+ "loss": 2.1983,
2169
+ "step": 720
2170
+ },
2171
+ {
2172
+ "epoch": 0.77,
2173
+ "learning_rate": 0.0001742965063106533,
2174
+ "loss": 2.1661,
2175
+ "step": 722
2176
+ },
2177
+ {
2178
+ "epoch": 0.77,
2179
+ "learning_rate": 0.0001741422045182453,
2180
+ "loss": 2.2135,
2181
+ "step": 724
2182
+ },
2183
+ {
2184
+ "epoch": 0.77,
2185
+ "learning_rate": 0.00017398750970283532,
2186
+ "loss": 2.1288,
2187
+ "step": 726
2188
+ },
2189
+ {
2190
+ "epoch": 0.78,
2191
+ "learning_rate": 0.00017383242268445047,
2192
+ "loss": 1.9906,
2193
+ "step": 728
2194
+ },
2195
+ {
2196
+ "epoch": 0.78,
2197
+ "learning_rate": 0.00017367694428519696,
2198
+ "loss": 1.9031,
2199
+ "step": 730
2200
+ },
2201
+ {
2202
+ "epoch": 0.78,
2203
+ "learning_rate": 0.00017352107532925569,
2204
+ "loss": 1.831,
2205
+ "step": 732
2206
+ },
2207
+ {
2208
+ "epoch": 0.78,
2209
+ "learning_rate": 0.00017336481664287777,
2210
+ "loss": 1.9116,
2211
+ "step": 734
2212
+ },
2213
+ {
2214
+ "epoch": 0.78,
2215
+ "learning_rate": 0.00017320816905438044,
2216
+ "loss": 1.8241,
2217
+ "step": 736
2218
+ },
2219
+ {
2220
+ "epoch": 0.79,
2221
+ "learning_rate": 0.0001730511333941423,
2222
+ "loss": 1.8745,
2223
+ "step": 738
2224
+ },
2225
+ {
2226
+ "epoch": 0.79,
2227
+ "learning_rate": 0.00017289371049459922,
2228
+ "loss": 1.8306,
2229
+ "step": 740
2230
+ },
2231
+ {
2232
+ "epoch": 0.79,
2233
+ "learning_rate": 0.00017273590119023968,
2234
+ "loss": 1.6032,
2235
+ "step": 742
2236
+ },
2237
+ {
2238
+ "epoch": 0.79,
2239
+ "learning_rate": 0.00017257770631760058,
2240
+ "loss": 1.3571,
2241
+ "step": 744
2242
+ },
2243
+ {
2244
+ "epoch": 0.8,
2245
+ "learning_rate": 0.00017241912671526265,
2246
+ "loss": 1.6282,
2247
+ "step": 746
2248
+ },
2249
+ {
2250
+ "epoch": 0.8,
2251
+ "learning_rate": 0.00017226016322384604,
2252
+ "loss": 1.5942,
2253
+ "step": 748
2254
+ },
2255
+ {
2256
+ "epoch": 0.8,
2257
+ "learning_rate": 0.00017210081668600586,
2258
+ "loss": 1.6988,
2259
+ "step": 750
2260
+ },
2261
+ {
2262
+ "epoch": 0.8,
2263
+ "learning_rate": 0.00017194108794642775,
2264
+ "loss": 2.334,
2265
+ "step": 752
2266
+ },
2267
+ {
2268
+ "epoch": 0.8,
2269
+ "learning_rate": 0.00017178097785182337,
2270
+ "loss": 2.2731,
2271
+ "step": 754
2272
+ },
2273
+ {
2274
+ "epoch": 0.81,
2275
+ "learning_rate": 0.0001716204872509259,
2276
+ "loss": 2.1642,
2277
+ "step": 756
2278
+ },
2279
+ {
2280
+ "epoch": 0.81,
2281
+ "learning_rate": 0.00017145961699448559,
2282
+ "loss": 2.4107,
2283
+ "step": 758
2284
+ },
2285
+ {
2286
+ "epoch": 0.81,
2287
+ "learning_rate": 0.00017129836793526517,
2288
+ "loss": 2.2767,
2289
+ "step": 760
2290
+ },
2291
+ {
2292
+ "epoch": 0.81,
2293
+ "learning_rate": 0.00017113674092803543,
2294
+ "loss": 2.3137,
2295
+ "step": 762
2296
+ },
2297
+ {
2298
+ "epoch": 0.81,
2299
+ "learning_rate": 0.00017097473682957067,
2300
+ "loss": 2.3095,
2301
+ "step": 764
2302
+ },
2303
+ {
2304
+ "epoch": 0.82,
2305
+ "learning_rate": 0.00017081235649864395,
2306
+ "loss": 2.1327,
2307
+ "step": 766
2308
+ },
2309
+ {
2310
+ "epoch": 0.82,
2311
+ "learning_rate": 0.00017064960079602297,
2312
+ "loss": 2.2666,
2313
+ "step": 768
2314
+ },
2315
+ {
2316
+ "epoch": 0.82,
2317
+ "learning_rate": 0.00017048647058446505,
2318
+ "loss": 2.2871,
2319
+ "step": 770
2320
+ },
2321
+ {
2322
+ "epoch": 0.82,
2323
+ "learning_rate": 0.00017032296672871283,
2324
+ "loss": 2.2055,
2325
+ "step": 772
2326
+ },
2327
+ {
2328
+ "epoch": 0.82,
2329
+ "learning_rate": 0.00017015909009548966,
2330
+ "loss": 2.1066,
2331
+ "step": 774
2332
+ },
2333
+ {
2334
+ "epoch": 0.83,
2335
+ "learning_rate": 0.00016999484155349483,
2336
+ "loss": 2.1887,
2337
+ "step": 776
2338
+ },
2339
+ {
2340
+ "epoch": 0.83,
2341
+ "learning_rate": 0.00016983022197339923,
2342
+ "loss": 2.1014,
2343
+ "step": 778
2344
+ },
2345
+ {
2346
+ "epoch": 0.83,
2347
+ "learning_rate": 0.00016966523222784058,
2348
+ "loss": 2.1138,
2349
+ "step": 780
2350
+ },
2351
+ {
2352
+ "epoch": 0.83,
2353
+ "learning_rate": 0.00016949987319141868,
2354
+ "loss": 1.8663,
2355
+ "step": 782
2356
+ },
2357
+ {
2358
+ "epoch": 0.84,
2359
+ "learning_rate": 0.0001693341457406911,
2360
+ "loss": 1.7421,
2361
+ "step": 784
2362
+ },
2363
+ {
2364
+ "epoch": 0.84,
2365
+ "learning_rate": 0.00016916805075416823,
2366
+ "loss": 1.8293,
2367
+ "step": 786
2368
+ },
2369
+ {
2370
+ "epoch": 0.84,
2371
+ "learning_rate": 0.0001690015891123088,
2372
+ "loss": 1.8993,
2373
+ "step": 788
2374
+ },
2375
+ {
2376
+ "epoch": 0.84,
2377
+ "learning_rate": 0.00016883476169751518,
2378
+ "loss": 1.8486,
2379
+ "step": 790
2380
+ },
2381
+ {
2382
+ "epoch": 0.84,
2383
+ "learning_rate": 0.0001686675693941286,
2384
+ "loss": 1.7033,
2385
+ "step": 792
2386
+ },
2387
+ {
2388
+ "epoch": 0.85,
2389
+ "learning_rate": 0.00016850001308842458,
2390
+ "loss": 1.5906,
2391
+ "step": 794
2392
+ },
2393
+ {
2394
+ "epoch": 0.85,
2395
+ "learning_rate": 0.00016833209366860826,
2396
+ "loss": 1.4477,
2397
+ "step": 796
2398
+ },
2399
+ {
2400
+ "epoch": 0.85,
2401
+ "learning_rate": 0.00016816381202480946,
2402
+ "loss": 1.8339,
2403
+ "step": 798
2404
+ },
2405
+ {
2406
+ "epoch": 0.85,
2407
+ "learning_rate": 0.00016799516904907828,
2408
+ "loss": 1.6691,
2409
+ "step": 800
2410
+ },
2411
+ {
2412
+ "epoch": 0.85,
2413
+ "learning_rate": 0.00016782616563538016,
2414
+ "loss": 2.4258,
2415
+ "step": 802
2416
+ },
2417
+ {
2418
+ "epoch": 0.86,
2419
+ "learning_rate": 0.0001676568026795912,
2420
+ "loss": 2.3445,
2421
+ "step": 804
2422
+ },
2423
+ {
2424
+ "epoch": 0.86,
2425
+ "learning_rate": 0.00016748708107949337,
2426
+ "loss": 2.137,
2427
+ "step": 806
2428
+ },
2429
+ {
2430
+ "epoch": 0.86,
2431
+ "learning_rate": 0.00016731700173476988,
2432
+ "loss": 2.2656,
2433
+ "step": 808
2434
+ },
2435
+ {
2436
+ "epoch": 0.86,
2437
+ "learning_rate": 0.00016714656554700022,
2438
+ "loss": 2.2945,
2439
+ "step": 810
2440
+ },
2441
+ {
2442
+ "epoch": 0.87,
2443
+ "learning_rate": 0.0001669757734196556,
2444
+ "loss": 2.0641,
2445
+ "step": 812
2446
+ },
2447
+ {
2448
+ "epoch": 0.87,
2449
+ "learning_rate": 0.0001668046262580939,
2450
+ "loss": 2.0698,
2451
+ "step": 814
2452
+ },
2453
+ {
2454
+ "epoch": 0.87,
2455
+ "learning_rate": 0.00016663312496955517,
2456
+ "loss": 2.1787,
2457
+ "step": 816
2458
+ },
2459
+ {
2460
+ "epoch": 0.87,
2461
+ "learning_rate": 0.00016646127046315653,
2462
+ "loss": 2.0295,
2463
+ "step": 818
2464
+ },
2465
+ {
2466
+ "epoch": 0.87,
2467
+ "learning_rate": 0.0001662890636498875,
2468
+ "loss": 2.2736,
2469
+ "step": 820
2470
+ },
2471
+ {
2472
+ "epoch": 0.88,
2473
+ "learning_rate": 0.00016611650544260526,
2474
+ "loss": 2.1205,
2475
+ "step": 822
2476
+ },
2477
+ {
2478
+ "epoch": 0.88,
2479
+ "learning_rate": 0.00016594359675602962,
2480
+ "loss": 2.1249,
2481
+ "step": 824
2482
+ },
2483
+ {
2484
+ "epoch": 0.88,
2485
+ "learning_rate": 0.00016577033850673824,
2486
+ "loss": 1.9355,
2487
+ "step": 826
2488
+ },
2489
+ {
2490
+ "epoch": 0.88,
2491
+ "learning_rate": 0.00016559673161316188,
2492
+ "loss": 1.8423,
2493
+ "step": 828
2494
+ },
2495
+ {
2496
+ "epoch": 0.88,
2497
+ "learning_rate": 0.00016542277699557934,
2498
+ "loss": 2.0979,
2499
+ "step": 830
2500
+ },
2501
+ {
2502
+ "epoch": 0.89,
2503
+ "learning_rate": 0.00016524847557611278,
2504
+ "loss": 1.7747,
2505
+ "step": 832
2506
+ },
2507
+ {
2508
+ "epoch": 0.89,
2509
+ "learning_rate": 0.00016507382827872264,
2510
+ "loss": 1.7137,
2511
+ "step": 834
2512
+ },
2513
+ {
2514
+ "epoch": 0.89,
2515
+ "learning_rate": 0.0001648988360292029,
2516
+ "loss": 1.9926,
2517
+ "step": 836
2518
+ },
2519
+ {
2520
+ "epoch": 0.89,
2521
+ "learning_rate": 0.0001647234997551761,
2522
+ "loss": 2.033,
2523
+ "step": 838
2524
+ },
2525
+ {
2526
+ "epoch": 0.9,
2527
+ "learning_rate": 0.00016454782038608835,
2528
+ "loss": 1.7298,
2529
+ "step": 840
2530
+ },
2531
+ {
2532
+ "epoch": 0.9,
2533
+ "learning_rate": 0.00016437179885320466,
2534
+ "loss": 1.8326,
2535
+ "step": 842
2536
+ },
2537
+ {
2538
+ "epoch": 0.9,
2539
+ "learning_rate": 0.00016419543608960367,
2540
+ "loss": 1.5744,
2541
+ "step": 844
2542
+ },
2543
+ {
2544
+ "epoch": 0.9,
2545
+ "learning_rate": 0.00016401873303017287,
2546
+ "loss": 1.471,
2547
+ "step": 846
2548
+ },
2549
+ {
2550
+ "epoch": 0.9,
2551
+ "learning_rate": 0.00016384169061160376,
2552
+ "loss": 1.7169,
2553
+ "step": 848
2554
+ },
2555
+ {
2556
+ "epoch": 0.91,
2557
+ "learning_rate": 0.00016366430977238667,
2558
+ "loss": 1.5636,
2559
+ "step": 850
2560
+ },
2561
+ {
2562
+ "epoch": 0.91,
2563
+ "learning_rate": 0.00016348659145280585,
2564
+ "loss": 2.3071,
2565
+ "step": 852
2566
+ },
2567
+ {
2568
+ "epoch": 0.91,
2569
+ "learning_rate": 0.00016330853659493456,
2570
+ "loss": 2.1993,
2571
+ "step": 854
2572
+ },
2573
+ {
2574
+ "epoch": 0.91,
2575
+ "learning_rate": 0.00016313014614263003,
2576
+ "loss": 2.1583,
2577
+ "step": 856
2578
+ },
2579
+ {
2580
+ "epoch": 0.91,
2581
+ "learning_rate": 0.0001629514210415284,
2582
+ "loss": 2.1996,
2583
+ "step": 858
2584
+ },
2585
+ {
2586
+ "epoch": 0.92,
2587
+ "learning_rate": 0.00016277236223903986,
2588
+ "loss": 2.309,
2589
+ "step": 860
2590
+ },
2591
+ {
2592
+ "epoch": 0.92,
2593
+ "learning_rate": 0.00016259297068434343,
2594
+ "loss": 2.2155,
2595
+ "step": 862
2596
+ },
2597
+ {
2598
+ "epoch": 0.92,
2599
+ "learning_rate": 0.0001624132473283821,
2600
+ "loss": 2.2474,
2601
+ "step": 864
2602
+ },
2603
+ {
2604
+ "epoch": 0.92,
2605
+ "learning_rate": 0.00016223319312385766,
2606
+ "loss": 2.2034,
2607
+ "step": 866
2608
+ },
2609
+ {
2610
+ "epoch": 0.93,
2611
+ "learning_rate": 0.00016205280902522576,
2612
+ "loss": 2.2016,
2613
+ "step": 868
2614
+ },
2615
+ {
2616
+ "epoch": 0.93,
2617
+ "learning_rate": 0.00016187209598869074,
2618
+ "loss": 2.133,
2619
+ "step": 870
2620
+ },
2621
+ {
2622
+ "epoch": 0.93,
2623
+ "learning_rate": 0.00016169105497220064,
2624
+ "loss": 2.1746,
2625
+ "step": 872
2626
+ },
2627
+ {
2628
+ "epoch": 0.93,
2629
+ "learning_rate": 0.00016150968693544215,
2630
+ "loss": 2.0806,
2631
+ "step": 874
2632
+ },
2633
+ {
2634
+ "epoch": 0.93,
2635
+ "learning_rate": 0.00016132799283983542,
2636
+ "loss": 2.2318,
2637
+ "step": 876
2638
+ },
2639
+ {
2640
+ "epoch": 0.94,
2641
+ "learning_rate": 0.000161145973648529,
2642
+ "loss": 2.0729,
2643
+ "step": 878
2644
+ },
2645
+ {
2646
+ "epoch": 0.94,
2647
+ "learning_rate": 0.0001609636303263948,
2648
+ "loss": 1.8199,
2649
+ "step": 880
2650
+ },
2651
+ {
2652
+ "epoch": 0.94,
2653
+ "learning_rate": 0.00016078096384002292,
2654
+ "loss": 2.0973,
2655
+ "step": 882
2656
+ },
2657
+ {
2658
+ "epoch": 0.94,
2659
+ "learning_rate": 0.00016059797515771652,
2660
+ "loss": 1.7302,
2661
+ "step": 884
2662
+ },
2663
+ {
2664
+ "epoch": 0.94,
2665
+ "learning_rate": 0.00016041466524948663,
2666
+ "loss": 1.9608,
2667
+ "step": 886
2668
+ },
2669
+ {
2670
+ "epoch": 0.95,
2671
+ "learning_rate": 0.00016023103508704725,
2672
+ "loss": 1.8483,
2673
+ "step": 888
2674
+ },
2675
+ {
2676
+ "epoch": 0.95,
2677
+ "learning_rate": 0.00016004708564380985,
2678
+ "loss": 1.9501,
2679
+ "step": 890
2680
+ },
2681
+ {
2682
+ "epoch": 0.95,
2683
+ "learning_rate": 0.0001598628178948785,
2684
+ "loss": 1.6526,
2685
+ "step": 892
2686
+ },
2687
+ {
2688
+ "epoch": 0.95,
2689
+ "learning_rate": 0.0001596782328170445,
2690
+ "loss": 1.4811,
2691
+ "step": 894
2692
+ },
2693
+ {
2694
+ "epoch": 0.96,
2695
+ "learning_rate": 0.00015949333138878138,
2696
+ "loss": 1.4048,
2697
+ "step": 896
2698
+ },
2699
+ {
2700
+ "epoch": 0.96,
2701
+ "learning_rate": 0.00015930811459023957,
2702
+ "loss": 1.5489,
2703
+ "step": 898
2704
+ },
2705
+ {
2706
+ "epoch": 0.96,
2707
+ "learning_rate": 0.00015912258340324126,
2708
+ "loss": 1.7197,
2709
+ "step": 900
2710
+ },
2711
+ {
2712
+ "epoch": 0.96,
2713
+ "learning_rate": 0.00015893673881127524,
2714
+ "loss": 2.1217,
2715
+ "step": 902
2716
+ },
2717
+ {
2718
+ "epoch": 0.96,
2719
+ "learning_rate": 0.00015875058179949151,
2720
+ "loss": 2.2086,
2721
+ "step": 904
2722
+ },
2723
+ {
2724
+ "epoch": 0.97,
2725
+ "learning_rate": 0.00015856411335469638,
2726
+ "loss": 2.094,
2727
+ "step": 906
2728
+ },
2729
+ {
2730
+ "epoch": 0.97,
2731
+ "learning_rate": 0.00015837733446534688,
2732
+ "loss": 2.3415,
2733
+ "step": 908
2734
+ },
2735
+ {
2736
+ "epoch": 0.97,
2737
+ "learning_rate": 0.00015819024612154575,
2738
+ "loss": 2.2378,
2739
+ "step": 910
2740
+ },
2741
+ {
2742
+ "epoch": 0.97,
2743
+ "learning_rate": 0.00015800284931503618,
2744
+ "loss": 2.1351,
2745
+ "step": 912
2746
+ },
2747
+ {
2748
+ "epoch": 0.97,
2749
+ "learning_rate": 0.0001578151450391964,
2750
+ "loss": 2.1795,
2751
+ "step": 914
2752
+ },
2753
+ {
2754
+ "epoch": 0.98,
2755
+ "learning_rate": 0.00015762713428903454,
2756
+ "loss": 2.0282,
2757
+ "step": 916
2758
+ },
2759
+ {
2760
+ "epoch": 0.98,
2761
+ "learning_rate": 0.00015743881806118342,
2762
+ "loss": 2.2858,
2763
+ "step": 918
2764
+ },
2765
+ {
2766
+ "epoch": 0.98,
2767
+ "learning_rate": 0.00015725019735389503,
2768
+ "loss": 2.0929,
2769
+ "step": 920
2770
+ },
2771
+ {
2772
+ "epoch": 0.98,
2773
+ "learning_rate": 0.00015706127316703557,
2774
+ "loss": 1.9913,
2775
+ "step": 922
2776
+ },
2777
+ {
2778
+ "epoch": 0.98,
2779
+ "learning_rate": 0.0001568720465020798,
2780
+ "loss": 2.0128,
2781
+ "step": 924
2782
+ },
2783
+ {
2784
+ "epoch": 0.99,
2785
+ "learning_rate": 0.00015668251836210595,
2786
+ "loss": 1.8792,
2787
+ "step": 926
2788
+ },
2789
+ {
2790
+ "epoch": 0.99,
2791
+ "learning_rate": 0.0001564926897517904,
2792
+ "loss": 1.6978,
2793
+ "step": 928
2794
+ },
2795
+ {
2796
+ "epoch": 0.99,
2797
+ "learning_rate": 0.0001563025616774022,
2798
+ "loss": 2.0335,
2799
+ "step": 930
2800
+ },
2801
+ {
2802
+ "epoch": 0.99,
2803
+ "learning_rate": 0.0001561121351467979,
2804
+ "loss": 1.845,
2805
+ "step": 932
2806
+ },
2807
+ {
2808
+ "epoch": 1.0,
2809
+ "learning_rate": 0.00015592141116941628,
2810
+ "loss": 1.3927,
2811
+ "step": 934
2812
+ },
2813
+ {
2814
+ "epoch": 1.0,
2815
+ "learning_rate": 0.00015573039075627256,
2816
+ "loss": 1.6165,
2817
+ "step": 936
2818
+ },
2819
+ {
2820
+ "epoch": 1.0,
2821
+ "learning_rate": 0.00015553907491995365,
2822
+ "loss": 1.6737,
2823
+ "step": 938
2824
  }
2825
  ],
2826
  "logging_steps": 2,
2827
+ "max_steps": 2814,
2828
+ "num_train_epochs": 3,
2829
  "save_steps": 500,
2830
+ "total_flos": 1.51669593729024e+16,
2831
  "trial_name": null,
2832
  "trial_params": null
2833
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:239e6ed22c6da37e9830926f0efad13bc0fcad9d2c09e0f0964ed8fccabea5b1
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eac5a6e647e0711f34f242674d22d25ea75ac515d1a47bb64e3ecc9e0b716063
3
  size 4600