iamnguyen commited on
Commit
b3da762
·
verified ·
1 Parent(s): 31948c0

Training in progress, step 16, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "google-t5/t5-small",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
  "classifier_dropout": 0.0,
7
- "d_ff": 2048,
8
  "d_kv": 64,
9
- "d_model": 512,
10
  "decoder_start_token_id": 0,
11
  "dense_act_fn": "relu",
12
  "dropout_rate": 0.1,
@@ -18,44 +18,15 @@
18
  "layer_norm_epsilon": 1e-06,
19
  "model_type": "t5",
20
  "n_positions": 512,
21
- "num_decoder_layers": 6,
22
- "num_heads": 8,
23
- "num_layers": 6,
24
  "output_past": true,
25
  "pad_token_id": 0,
26
  "relative_attention_max_distance": 128,
27
  "relative_attention_num_buckets": 32,
28
- "task_specific_params": {
29
- "summarization": {
30
- "early_stopping": true,
31
- "length_penalty": 2.0,
32
- "max_length": 200,
33
- "min_length": 30,
34
- "no_repeat_ngram_size": 3,
35
- "num_beams": 4,
36
- "prefix": "summarize: "
37
- },
38
- "translation_en_to_de": {
39
- "early_stopping": true,
40
- "max_length": 300,
41
- "num_beams": 4,
42
- "prefix": "translate English to German: "
43
- },
44
- "translation_en_to_fr": {
45
- "early_stopping": true,
46
- "max_length": 300,
47
- "num_beams": 4,
48
- "prefix": "translate English to French: "
49
- },
50
- "translation_en_to_ro": {
51
- "early_stopping": true,
52
- "max_length": 300,
53
- "num_beams": 4,
54
- "prefix": "translate English to Romanian: "
55
- }
56
- },
57
  "torch_dtype": "float32",
58
  "transformers_version": "4.42.3",
59
  "use_cache": true,
60
- "vocab_size": 32128
61
  }
 
1
  {
2
+ "_name_or_path": "VietAI/vit5-base",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
6
  "classifier_dropout": 0.0,
7
+ "d_ff": 3072,
8
  "d_kv": 64,
9
+ "d_model": 768,
10
  "decoder_start_token_id": 0,
11
  "dense_act_fn": "relu",
12
  "dropout_rate": 0.1,
 
18
  "layer_norm_epsilon": 1e-06,
19
  "model_type": "t5",
20
  "n_positions": 512,
21
+ "num_decoder_layers": 12,
22
+ "num_heads": 12,
23
+ "num_layers": 12,
24
  "output_past": true,
25
  "pad_token_id": 0,
26
  "relative_attention_max_distance": 128,
27
  "relative_attention_num_buckets": 32,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "torch_dtype": "float32",
29
  "transformers_version": "4.42.3",
30
  "use_cache": true,
31
+ "vocab_size": 36096
32
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67cde00778fd9b31e92769f2fa5fb22957b3c18857b8fa15489d34bf37fd4392
3
- size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119eea453bd7b4029fcdef03fe0a7106f3b8552ddeebbf478f83f4f695d0bc27
3
+ size 903834408
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a268ea2d87ebd5586132b48f679290592ca529d7cd515b00d80780d58f6b8ca
3
- size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097f2da6ca0542c33e534858431ad08295fb5b03025fd8e94bda6c3ada974000
3
+ size 1807824186
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:369ba992403565a78cd72e1427835ccc8b05eba0f71afd8552e75e7e6ea369c4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fd40d17872e1d1f44fd19772092ee3ba49e037d722a9306b6de7d846efa46dc
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba478b7a054120870328b778675384e38480f575773ac8971e03a9ab54918ff3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e37c62b1a4bff2525d39b9bf2be166315395310f82e667777248052632e706f7
3
  size 1064
last-checkpoint/special_tokens_map.json CHANGED
@@ -95,31 +95,9 @@
95
  "<extra_id_92>",
96
  "<extra_id_93>",
97
  "<extra_id_94>",
98
- "<extra_id_95>",
99
- "<extra_id_96>",
100
- "<extra_id_97>",
101
- "<extra_id_98>",
102
- "<extra_id_99>"
103
  ],
104
- "eos_token": {
105
- "content": "</s>",
106
- "lstrip": false,
107
- "normalized": false,
108
- "rstrip": false,
109
- "single_word": false
110
- },
111
- "pad_token": {
112
- "content": "<pad>",
113
- "lstrip": false,
114
- "normalized": false,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "unk_token": {
119
- "content": "<unk>",
120
- "lstrip": false,
121
- "normalized": false,
122
- "rstrip": false,
123
- "single_word": false
124
- }
125
  }
 
95
  "<extra_id_92>",
96
  "<extra_id_93>",
97
  "<extra_id_94>",
98
+ "<extra_id_95>"
 
 
 
 
99
  ],
100
+ "eos_token": "</s>",
101
+ "pad_token": "<pad>",
102
+ "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  }
last-checkpoint/spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
- size 791656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59986b62f9f0b90edafb9b073ea7b93d21114a5841219a1ea2399ade73f729c6
3
+ size 820370
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json CHANGED
@@ -24,39 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "32000": {
28
- "content": "<extra_id_99>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- },
35
- "32001": {
36
- "content": "<extra_id_98>",
37
- "lstrip": false,
38
- "normalized": false,
39
- "rstrip": false,
40
- "single_word": false,
41
- "special": true
42
- },
43
- "32002": {
44
- "content": "<extra_id_97>",
45
- "lstrip": false,
46
- "normalized": false,
47
- "rstrip": false,
48
- "single_word": false,
49
- "special": true
50
- },
51
- "32003": {
52
- "content": "<extra_id_96>",
53
- "lstrip": false,
54
- "normalized": false,
55
- "rstrip": false,
56
- "single_word": false,
57
- "special": true
58
- },
59
- "32004": {
60
  "content": "<extra_id_95>",
61
  "lstrip": false,
62
  "normalized": false,
@@ -64,7 +32,7 @@
64
  "single_word": false,
65
  "special": true
66
  },
67
- "32005": {
68
  "content": "<extra_id_94>",
69
  "lstrip": false,
70
  "normalized": false,
@@ -72,7 +40,7 @@
72
  "single_word": false,
73
  "special": true
74
  },
75
- "32006": {
76
  "content": "<extra_id_93>",
77
  "lstrip": false,
78
  "normalized": false,
@@ -80,7 +48,7 @@
80
  "single_word": false,
81
  "special": true
82
  },
83
- "32007": {
84
  "content": "<extra_id_92>",
85
  "lstrip": false,
86
  "normalized": false,
@@ -88,7 +56,7 @@
88
  "single_word": false,
89
  "special": true
90
  },
91
- "32008": {
92
  "content": "<extra_id_91>",
93
  "lstrip": false,
94
  "normalized": false,
@@ -96,7 +64,7 @@
96
  "single_word": false,
97
  "special": true
98
  },
99
- "32009": {
100
  "content": "<extra_id_90>",
101
  "lstrip": false,
102
  "normalized": false,
@@ -104,7 +72,7 @@
104
  "single_word": false,
105
  "special": true
106
  },
107
- "32010": {
108
  "content": "<extra_id_89>",
109
  "lstrip": false,
110
  "normalized": false,
@@ -112,7 +80,7 @@
112
  "single_word": false,
113
  "special": true
114
  },
115
- "32011": {
116
  "content": "<extra_id_88>",
117
  "lstrip": false,
118
  "normalized": false,
@@ -120,7 +88,7 @@
120
  "single_word": false,
121
  "special": true
122
  },
123
- "32012": {
124
  "content": "<extra_id_87>",
125
  "lstrip": false,
126
  "normalized": false,
@@ -128,7 +96,7 @@
128
  "single_word": false,
129
  "special": true
130
  },
131
- "32013": {
132
  "content": "<extra_id_86>",
133
  "lstrip": false,
134
  "normalized": false,
@@ -136,7 +104,7 @@
136
  "single_word": false,
137
  "special": true
138
  },
139
- "32014": {
140
  "content": "<extra_id_85>",
141
  "lstrip": false,
142
  "normalized": false,
@@ -144,7 +112,7 @@
144
  "single_word": false,
145
  "special": true
146
  },
147
- "32015": {
148
  "content": "<extra_id_84>",
149
  "lstrip": false,
150
  "normalized": false,
@@ -152,7 +120,7 @@
152
  "single_word": false,
153
  "special": true
154
  },
155
- "32016": {
156
  "content": "<extra_id_83>",
157
  "lstrip": false,
158
  "normalized": false,
@@ -160,7 +128,7 @@
160
  "single_word": false,
161
  "special": true
162
  },
163
- "32017": {
164
  "content": "<extra_id_82>",
165
  "lstrip": false,
166
  "normalized": false,
@@ -168,7 +136,7 @@
168
  "single_word": false,
169
  "special": true
170
  },
171
- "32018": {
172
  "content": "<extra_id_81>",
173
  "lstrip": false,
174
  "normalized": false,
@@ -176,7 +144,7 @@
176
  "single_word": false,
177
  "special": true
178
  },
179
- "32019": {
180
  "content": "<extra_id_80>",
181
  "lstrip": false,
182
  "normalized": false,
@@ -184,7 +152,7 @@
184
  "single_word": false,
185
  "special": true
186
  },
187
- "32020": {
188
  "content": "<extra_id_79>",
189
  "lstrip": false,
190
  "normalized": false,
@@ -192,7 +160,7 @@
192
  "single_word": false,
193
  "special": true
194
  },
195
- "32021": {
196
  "content": "<extra_id_78>",
197
  "lstrip": false,
198
  "normalized": false,
@@ -200,7 +168,7 @@
200
  "single_word": false,
201
  "special": true
202
  },
203
- "32022": {
204
  "content": "<extra_id_77>",
205
  "lstrip": false,
206
  "normalized": false,
@@ -208,7 +176,7 @@
208
  "single_word": false,
209
  "special": true
210
  },
211
- "32023": {
212
  "content": "<extra_id_76>",
213
  "lstrip": false,
214
  "normalized": false,
@@ -216,7 +184,7 @@
216
  "single_word": false,
217
  "special": true
218
  },
219
- "32024": {
220
  "content": "<extra_id_75>",
221
  "lstrip": false,
222
  "normalized": false,
@@ -224,7 +192,7 @@
224
  "single_word": false,
225
  "special": true
226
  },
227
- "32025": {
228
  "content": "<extra_id_74>",
229
  "lstrip": false,
230
  "normalized": false,
@@ -232,7 +200,7 @@
232
  "single_word": false,
233
  "special": true
234
  },
235
- "32026": {
236
  "content": "<extra_id_73>",
237
  "lstrip": false,
238
  "normalized": false,
@@ -240,7 +208,7 @@
240
  "single_word": false,
241
  "special": true
242
  },
243
- "32027": {
244
  "content": "<extra_id_72>",
245
  "lstrip": false,
246
  "normalized": false,
@@ -248,7 +216,7 @@
248
  "single_word": false,
249
  "special": true
250
  },
251
- "32028": {
252
  "content": "<extra_id_71>",
253
  "lstrip": false,
254
  "normalized": false,
@@ -256,7 +224,7 @@
256
  "single_word": false,
257
  "special": true
258
  },
259
- "32029": {
260
  "content": "<extra_id_70>",
261
  "lstrip": false,
262
  "normalized": false,
@@ -264,7 +232,7 @@
264
  "single_word": false,
265
  "special": true
266
  },
267
- "32030": {
268
  "content": "<extra_id_69>",
269
  "lstrip": false,
270
  "normalized": false,
@@ -272,7 +240,7 @@
272
  "single_word": false,
273
  "special": true
274
  },
275
- "32031": {
276
  "content": "<extra_id_68>",
277
  "lstrip": false,
278
  "normalized": false,
@@ -280,7 +248,7 @@
280
  "single_word": false,
281
  "special": true
282
  },
283
- "32032": {
284
  "content": "<extra_id_67>",
285
  "lstrip": false,
286
  "normalized": false,
@@ -288,7 +256,7 @@
288
  "single_word": false,
289
  "special": true
290
  },
291
- "32033": {
292
  "content": "<extra_id_66>",
293
  "lstrip": false,
294
  "normalized": false,
@@ -296,7 +264,7 @@
296
  "single_word": false,
297
  "special": true
298
  },
299
- "32034": {
300
  "content": "<extra_id_65>",
301
  "lstrip": false,
302
  "normalized": false,
@@ -304,7 +272,7 @@
304
  "single_word": false,
305
  "special": true
306
  },
307
- "32035": {
308
  "content": "<extra_id_64>",
309
  "lstrip": false,
310
  "normalized": false,
@@ -312,7 +280,7 @@
312
  "single_word": false,
313
  "special": true
314
  },
315
- "32036": {
316
  "content": "<extra_id_63>",
317
  "lstrip": false,
318
  "normalized": false,
@@ -320,7 +288,7 @@
320
  "single_word": false,
321
  "special": true
322
  },
323
- "32037": {
324
  "content": "<extra_id_62>",
325
  "lstrip": false,
326
  "normalized": false,
@@ -328,7 +296,7 @@
328
  "single_word": false,
329
  "special": true
330
  },
331
- "32038": {
332
  "content": "<extra_id_61>",
333
  "lstrip": false,
334
  "normalized": false,
@@ -336,7 +304,7 @@
336
  "single_word": false,
337
  "special": true
338
  },
339
- "32039": {
340
  "content": "<extra_id_60>",
341
  "lstrip": false,
342
  "normalized": false,
@@ -344,7 +312,7 @@
344
  "single_word": false,
345
  "special": true
346
  },
347
- "32040": {
348
  "content": "<extra_id_59>",
349
  "lstrip": false,
350
  "normalized": false,
@@ -352,7 +320,7 @@
352
  "single_word": false,
353
  "special": true
354
  },
355
- "32041": {
356
  "content": "<extra_id_58>",
357
  "lstrip": false,
358
  "normalized": false,
@@ -360,7 +328,7 @@
360
  "single_word": false,
361
  "special": true
362
  },
363
- "32042": {
364
  "content": "<extra_id_57>",
365
  "lstrip": false,
366
  "normalized": false,
@@ -368,7 +336,7 @@
368
  "single_word": false,
369
  "special": true
370
  },
371
- "32043": {
372
  "content": "<extra_id_56>",
373
  "lstrip": false,
374
  "normalized": false,
@@ -376,7 +344,7 @@
376
  "single_word": false,
377
  "special": true
378
  },
379
- "32044": {
380
  "content": "<extra_id_55>",
381
  "lstrip": false,
382
  "normalized": false,
@@ -384,7 +352,7 @@
384
  "single_word": false,
385
  "special": true
386
  },
387
- "32045": {
388
  "content": "<extra_id_54>",
389
  "lstrip": false,
390
  "normalized": false,
@@ -392,7 +360,7 @@
392
  "single_word": false,
393
  "special": true
394
  },
395
- "32046": {
396
  "content": "<extra_id_53>",
397
  "lstrip": false,
398
  "normalized": false,
@@ -400,7 +368,7 @@
400
  "single_word": false,
401
  "special": true
402
  },
403
- "32047": {
404
  "content": "<extra_id_52>",
405
  "lstrip": false,
406
  "normalized": false,
@@ -408,7 +376,7 @@
408
  "single_word": false,
409
  "special": true
410
  },
411
- "32048": {
412
  "content": "<extra_id_51>",
413
  "lstrip": false,
414
  "normalized": false,
@@ -416,7 +384,7 @@
416
  "single_word": false,
417
  "special": true
418
  },
419
- "32049": {
420
  "content": "<extra_id_50>",
421
  "lstrip": false,
422
  "normalized": false,
@@ -424,7 +392,7 @@
424
  "single_word": false,
425
  "special": true
426
  },
427
- "32050": {
428
  "content": "<extra_id_49>",
429
  "lstrip": false,
430
  "normalized": false,
@@ -432,7 +400,7 @@
432
  "single_word": false,
433
  "special": true
434
  },
435
- "32051": {
436
  "content": "<extra_id_48>",
437
  "lstrip": false,
438
  "normalized": false,
@@ -440,7 +408,7 @@
440
  "single_word": false,
441
  "special": true
442
  },
443
- "32052": {
444
  "content": "<extra_id_47>",
445
  "lstrip": false,
446
  "normalized": false,
@@ -448,7 +416,7 @@
448
  "single_word": false,
449
  "special": true
450
  },
451
- "32053": {
452
  "content": "<extra_id_46>",
453
  "lstrip": false,
454
  "normalized": false,
@@ -456,7 +424,7 @@
456
  "single_word": false,
457
  "special": true
458
  },
459
- "32054": {
460
  "content": "<extra_id_45>",
461
  "lstrip": false,
462
  "normalized": false,
@@ -464,7 +432,7 @@
464
  "single_word": false,
465
  "special": true
466
  },
467
- "32055": {
468
  "content": "<extra_id_44>",
469
  "lstrip": false,
470
  "normalized": false,
@@ -472,7 +440,7 @@
472
  "single_word": false,
473
  "special": true
474
  },
475
- "32056": {
476
  "content": "<extra_id_43>",
477
  "lstrip": false,
478
  "normalized": false,
@@ -480,7 +448,7 @@
480
  "single_word": false,
481
  "special": true
482
  },
483
- "32057": {
484
  "content": "<extra_id_42>",
485
  "lstrip": false,
486
  "normalized": false,
@@ -488,7 +456,7 @@
488
  "single_word": false,
489
  "special": true
490
  },
491
- "32058": {
492
  "content": "<extra_id_41>",
493
  "lstrip": false,
494
  "normalized": false,
@@ -496,7 +464,7 @@
496
  "single_word": false,
497
  "special": true
498
  },
499
- "32059": {
500
  "content": "<extra_id_40>",
501
  "lstrip": false,
502
  "normalized": false,
@@ -504,7 +472,7 @@
504
  "single_word": false,
505
  "special": true
506
  },
507
- "32060": {
508
  "content": "<extra_id_39>",
509
  "lstrip": false,
510
  "normalized": false,
@@ -512,7 +480,7 @@
512
  "single_word": false,
513
  "special": true
514
  },
515
- "32061": {
516
  "content": "<extra_id_38>",
517
  "lstrip": false,
518
  "normalized": false,
@@ -520,7 +488,7 @@
520
  "single_word": false,
521
  "special": true
522
  },
523
- "32062": {
524
  "content": "<extra_id_37>",
525
  "lstrip": false,
526
  "normalized": false,
@@ -528,7 +496,7 @@
528
  "single_word": false,
529
  "special": true
530
  },
531
- "32063": {
532
  "content": "<extra_id_36>",
533
  "lstrip": false,
534
  "normalized": false,
@@ -536,7 +504,7 @@
536
  "single_word": false,
537
  "special": true
538
  },
539
- "32064": {
540
  "content": "<extra_id_35>",
541
  "lstrip": false,
542
  "normalized": false,
@@ -544,7 +512,7 @@
544
  "single_word": false,
545
  "special": true
546
  },
547
- "32065": {
548
  "content": "<extra_id_34>",
549
  "lstrip": false,
550
  "normalized": false,
@@ -552,7 +520,7 @@
552
  "single_word": false,
553
  "special": true
554
  },
555
- "32066": {
556
  "content": "<extra_id_33>",
557
  "lstrip": false,
558
  "normalized": false,
@@ -560,7 +528,7 @@
560
  "single_word": false,
561
  "special": true
562
  },
563
- "32067": {
564
  "content": "<extra_id_32>",
565
  "lstrip": false,
566
  "normalized": false,
@@ -568,7 +536,7 @@
568
  "single_word": false,
569
  "special": true
570
  },
571
- "32068": {
572
  "content": "<extra_id_31>",
573
  "lstrip": false,
574
  "normalized": false,
@@ -576,7 +544,7 @@
576
  "single_word": false,
577
  "special": true
578
  },
579
- "32069": {
580
  "content": "<extra_id_30>",
581
  "lstrip": false,
582
  "normalized": false,
@@ -584,7 +552,7 @@
584
  "single_word": false,
585
  "special": true
586
  },
587
- "32070": {
588
  "content": "<extra_id_29>",
589
  "lstrip": false,
590
  "normalized": false,
@@ -592,7 +560,7 @@
592
  "single_word": false,
593
  "special": true
594
  },
595
- "32071": {
596
  "content": "<extra_id_28>",
597
  "lstrip": false,
598
  "normalized": false,
@@ -600,7 +568,7 @@
600
  "single_word": false,
601
  "special": true
602
  },
603
- "32072": {
604
  "content": "<extra_id_27>",
605
  "lstrip": false,
606
  "normalized": false,
@@ -608,7 +576,7 @@
608
  "single_word": false,
609
  "special": true
610
  },
611
- "32073": {
612
  "content": "<extra_id_26>",
613
  "lstrip": false,
614
  "normalized": false,
@@ -616,7 +584,7 @@
616
  "single_word": false,
617
  "special": true
618
  },
619
- "32074": {
620
  "content": "<extra_id_25>",
621
  "lstrip": false,
622
  "normalized": false,
@@ -624,7 +592,7 @@
624
  "single_word": false,
625
  "special": true
626
  },
627
- "32075": {
628
  "content": "<extra_id_24>",
629
  "lstrip": false,
630
  "normalized": false,
@@ -632,7 +600,7 @@
632
  "single_word": false,
633
  "special": true
634
  },
635
- "32076": {
636
  "content": "<extra_id_23>",
637
  "lstrip": false,
638
  "normalized": false,
@@ -640,7 +608,7 @@
640
  "single_word": false,
641
  "special": true
642
  },
643
- "32077": {
644
  "content": "<extra_id_22>",
645
  "lstrip": false,
646
  "normalized": false,
@@ -648,7 +616,7 @@
648
  "single_word": false,
649
  "special": true
650
  },
651
- "32078": {
652
  "content": "<extra_id_21>",
653
  "lstrip": false,
654
  "normalized": false,
@@ -656,7 +624,7 @@
656
  "single_word": false,
657
  "special": true
658
  },
659
- "32079": {
660
  "content": "<extra_id_20>",
661
  "lstrip": false,
662
  "normalized": false,
@@ -664,7 +632,7 @@
664
  "single_word": false,
665
  "special": true
666
  },
667
- "32080": {
668
  "content": "<extra_id_19>",
669
  "lstrip": false,
670
  "normalized": false,
@@ -672,7 +640,7 @@
672
  "single_word": false,
673
  "special": true
674
  },
675
- "32081": {
676
  "content": "<extra_id_18>",
677
  "lstrip": false,
678
  "normalized": false,
@@ -680,7 +648,7 @@
680
  "single_word": false,
681
  "special": true
682
  },
683
- "32082": {
684
  "content": "<extra_id_17>",
685
  "lstrip": false,
686
  "normalized": false,
@@ -688,7 +656,7 @@
688
  "single_word": false,
689
  "special": true
690
  },
691
- "32083": {
692
  "content": "<extra_id_16>",
693
  "lstrip": false,
694
  "normalized": false,
@@ -696,7 +664,7 @@
696
  "single_word": false,
697
  "special": true
698
  },
699
- "32084": {
700
  "content": "<extra_id_15>",
701
  "lstrip": false,
702
  "normalized": false,
@@ -704,7 +672,7 @@
704
  "single_word": false,
705
  "special": true
706
  },
707
- "32085": {
708
  "content": "<extra_id_14>",
709
  "lstrip": false,
710
  "normalized": false,
@@ -712,7 +680,7 @@
712
  "single_word": false,
713
  "special": true
714
  },
715
- "32086": {
716
  "content": "<extra_id_13>",
717
  "lstrip": false,
718
  "normalized": false,
@@ -720,7 +688,7 @@
720
  "single_word": false,
721
  "special": true
722
  },
723
- "32087": {
724
  "content": "<extra_id_12>",
725
  "lstrip": false,
726
  "normalized": false,
@@ -728,7 +696,7 @@
728
  "single_word": false,
729
  "special": true
730
  },
731
- "32088": {
732
  "content": "<extra_id_11>",
733
  "lstrip": false,
734
  "normalized": false,
@@ -736,7 +704,7 @@
736
  "single_word": false,
737
  "special": true
738
  },
739
- "32089": {
740
  "content": "<extra_id_10>",
741
  "lstrip": false,
742
  "normalized": false,
@@ -744,7 +712,7 @@
744
  "single_word": false,
745
  "special": true
746
  },
747
- "32090": {
748
  "content": "<extra_id_9>",
749
  "lstrip": false,
750
  "normalized": false,
@@ -752,7 +720,7 @@
752
  "single_word": false,
753
  "special": true
754
  },
755
- "32091": {
756
  "content": "<extra_id_8>",
757
  "lstrip": false,
758
  "normalized": false,
@@ -760,7 +728,7 @@
760
  "single_word": false,
761
  "special": true
762
  },
763
- "32092": {
764
  "content": "<extra_id_7>",
765
  "lstrip": false,
766
  "normalized": false,
@@ -768,7 +736,7 @@
768
  "single_word": false,
769
  "special": true
770
  },
771
- "32093": {
772
  "content": "<extra_id_6>",
773
  "lstrip": false,
774
  "normalized": false,
@@ -776,7 +744,7 @@
776
  "single_word": false,
777
  "special": true
778
  },
779
- "32094": {
780
  "content": "<extra_id_5>",
781
  "lstrip": false,
782
  "normalized": false,
@@ -784,7 +752,7 @@
784
  "single_word": false,
785
  "special": true
786
  },
787
- "32095": {
788
  "content": "<extra_id_4>",
789
  "lstrip": false,
790
  "normalized": false,
@@ -792,7 +760,7 @@
792
  "single_word": false,
793
  "special": true
794
  },
795
- "32096": {
796
  "content": "<extra_id_3>",
797
  "lstrip": false,
798
  "normalized": false,
@@ -800,7 +768,7 @@
800
  "single_word": false,
801
  "special": true
802
  },
803
- "32097": {
804
  "content": "<extra_id_2>",
805
  "lstrip": false,
806
  "normalized": false,
@@ -808,7 +776,7 @@
808
  "single_word": false,
809
  "special": true
810
  },
811
- "32098": {
812
  "content": "<extra_id_1>",
813
  "lstrip": false,
814
  "normalized": false,
@@ -816,7 +784,7 @@
816
  "single_word": false,
817
  "special": true
818
  },
819
- "32099": {
820
  "content": "<extra_id_0>",
821
  "lstrip": false,
822
  "normalized": false,
@@ -921,17 +889,14 @@
921
  "<extra_id_92>",
922
  "<extra_id_93>",
923
  "<extra_id_94>",
924
- "<extra_id_95>",
925
- "<extra_id_96>",
926
- "<extra_id_97>",
927
- "<extra_id_98>",
928
- "<extra_id_99>"
929
  ],
930
  "clean_up_tokenization_spaces": true,
931
  "eos_token": "</s>",
932
- "extra_ids": 100,
933
- "model_max_length": 512,
934
  "pad_token": "<pad>",
 
935
  "tokenizer_class": "T5Tokenizer",
936
  "unk_token": "<unk>"
937
  }
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "36000": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "content": "<extra_id_95>",
29
  "lstrip": false,
30
  "normalized": false,
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "36001": {
36
  "content": "<extra_id_94>",
37
  "lstrip": false,
38
  "normalized": false,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "36002": {
44
  "content": "<extra_id_93>",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "36003": {
52
  "content": "<extra_id_92>",
53
  "lstrip": false,
54
  "normalized": false,
 
56
  "single_word": false,
57
  "special": true
58
  },
59
+ "36004": {
60
  "content": "<extra_id_91>",
61
  "lstrip": false,
62
  "normalized": false,
 
64
  "single_word": false,
65
  "special": true
66
  },
67
+ "36005": {
68
  "content": "<extra_id_90>",
69
  "lstrip": false,
70
  "normalized": false,
 
72
  "single_word": false,
73
  "special": true
74
  },
75
+ "36006": {
76
  "content": "<extra_id_89>",
77
  "lstrip": false,
78
  "normalized": false,
 
80
  "single_word": false,
81
  "special": true
82
  },
83
+ "36007": {
84
  "content": "<extra_id_88>",
85
  "lstrip": false,
86
  "normalized": false,
 
88
  "single_word": false,
89
  "special": true
90
  },
91
+ "36008": {
92
  "content": "<extra_id_87>",
93
  "lstrip": false,
94
  "normalized": false,
 
96
  "single_word": false,
97
  "special": true
98
  },
99
+ "36009": {
100
  "content": "<extra_id_86>",
101
  "lstrip": false,
102
  "normalized": false,
 
104
  "single_word": false,
105
  "special": true
106
  },
107
+ "36010": {
108
  "content": "<extra_id_85>",
109
  "lstrip": false,
110
  "normalized": false,
 
112
  "single_word": false,
113
  "special": true
114
  },
115
+ "36011": {
116
  "content": "<extra_id_84>",
117
  "lstrip": false,
118
  "normalized": false,
 
120
  "single_word": false,
121
  "special": true
122
  },
123
+ "36012": {
124
  "content": "<extra_id_83>",
125
  "lstrip": false,
126
  "normalized": false,
 
128
  "single_word": false,
129
  "special": true
130
  },
131
+ "36013": {
132
  "content": "<extra_id_82>",
133
  "lstrip": false,
134
  "normalized": false,
 
136
  "single_word": false,
137
  "special": true
138
  },
139
+ "36014": {
140
  "content": "<extra_id_81>",
141
  "lstrip": false,
142
  "normalized": false,
 
144
  "single_word": false,
145
  "special": true
146
  },
147
+ "36015": {
148
  "content": "<extra_id_80>",
149
  "lstrip": false,
150
  "normalized": false,
 
152
  "single_word": false,
153
  "special": true
154
  },
155
+ "36016": {
156
  "content": "<extra_id_79>",
157
  "lstrip": false,
158
  "normalized": false,
 
160
  "single_word": false,
161
  "special": true
162
  },
163
+ "36017": {
164
  "content": "<extra_id_78>",
165
  "lstrip": false,
166
  "normalized": false,
 
168
  "single_word": false,
169
  "special": true
170
  },
171
+ "36018": {
172
  "content": "<extra_id_77>",
173
  "lstrip": false,
174
  "normalized": false,
 
176
  "single_word": false,
177
  "special": true
178
  },
179
+ "36019": {
180
  "content": "<extra_id_76>",
181
  "lstrip": false,
182
  "normalized": false,
 
184
  "single_word": false,
185
  "special": true
186
  },
187
+ "36020": {
188
  "content": "<extra_id_75>",
189
  "lstrip": false,
190
  "normalized": false,
 
192
  "single_word": false,
193
  "special": true
194
  },
195
+ "36021": {
196
  "content": "<extra_id_74>",
197
  "lstrip": false,
198
  "normalized": false,
 
200
  "single_word": false,
201
  "special": true
202
  },
203
+ "36022": {
204
  "content": "<extra_id_73>",
205
  "lstrip": false,
206
  "normalized": false,
 
208
  "single_word": false,
209
  "special": true
210
  },
211
+ "36023": {
212
  "content": "<extra_id_72>",
213
  "lstrip": false,
214
  "normalized": false,
 
216
  "single_word": false,
217
  "special": true
218
  },
219
+ "36024": {
220
  "content": "<extra_id_71>",
221
  "lstrip": false,
222
  "normalized": false,
 
224
  "single_word": false,
225
  "special": true
226
  },
227
+ "36025": {
228
  "content": "<extra_id_70>",
229
  "lstrip": false,
230
  "normalized": false,
 
232
  "single_word": false,
233
  "special": true
234
  },
235
+ "36026": {
236
  "content": "<extra_id_69>",
237
  "lstrip": false,
238
  "normalized": false,
 
240
  "single_word": false,
241
  "special": true
242
  },
243
+ "36027": {
244
  "content": "<extra_id_68>",
245
  "lstrip": false,
246
  "normalized": false,
 
248
  "single_word": false,
249
  "special": true
250
  },
251
+ "36028": {
252
  "content": "<extra_id_67>",
253
  "lstrip": false,
254
  "normalized": false,
 
256
  "single_word": false,
257
  "special": true
258
  },
259
+ "36029": {
260
  "content": "<extra_id_66>",
261
  "lstrip": false,
262
  "normalized": false,
 
264
  "single_word": false,
265
  "special": true
266
  },
267
+ "36030": {
268
  "content": "<extra_id_65>",
269
  "lstrip": false,
270
  "normalized": false,
 
272
  "single_word": false,
273
  "special": true
274
  },
275
+ "36031": {
276
  "content": "<extra_id_64>",
277
  "lstrip": false,
278
  "normalized": false,
 
280
  "single_word": false,
281
  "special": true
282
  },
283
+ "36032": {
284
  "content": "<extra_id_63>",
285
  "lstrip": false,
286
  "normalized": false,
 
288
  "single_word": false,
289
  "special": true
290
  },
291
+ "36033": {
292
  "content": "<extra_id_62>",
293
  "lstrip": false,
294
  "normalized": false,
 
296
  "single_word": false,
297
  "special": true
298
  },
299
+ "36034": {
300
  "content": "<extra_id_61>",
301
  "lstrip": false,
302
  "normalized": false,
 
304
  "single_word": false,
305
  "special": true
306
  },
307
+ "36035": {
308
  "content": "<extra_id_60>",
309
  "lstrip": false,
310
  "normalized": false,
 
312
  "single_word": false,
313
  "special": true
314
  },
315
+ "36036": {
316
  "content": "<extra_id_59>",
317
  "lstrip": false,
318
  "normalized": false,
 
320
  "single_word": false,
321
  "special": true
322
  },
323
+ "36037": {
324
  "content": "<extra_id_58>",
325
  "lstrip": false,
326
  "normalized": false,
 
328
  "single_word": false,
329
  "special": true
330
  },
331
+ "36038": {
332
  "content": "<extra_id_57>",
333
  "lstrip": false,
334
  "normalized": false,
 
336
  "single_word": false,
337
  "special": true
338
  },
339
+ "36039": {
340
  "content": "<extra_id_56>",
341
  "lstrip": false,
342
  "normalized": false,
 
344
  "single_word": false,
345
  "special": true
346
  },
347
+ "36040": {
348
  "content": "<extra_id_55>",
349
  "lstrip": false,
350
  "normalized": false,
 
352
  "single_word": false,
353
  "special": true
354
  },
355
+ "36041": {
356
  "content": "<extra_id_54>",
357
  "lstrip": false,
358
  "normalized": false,
 
360
  "single_word": false,
361
  "special": true
362
  },
363
+ "36042": {
364
  "content": "<extra_id_53>",
365
  "lstrip": false,
366
  "normalized": false,
 
368
  "single_word": false,
369
  "special": true
370
  },
371
+ "36043": {
372
  "content": "<extra_id_52>",
373
  "lstrip": false,
374
  "normalized": false,
 
376
  "single_word": false,
377
  "special": true
378
  },
379
+ "36044": {
380
  "content": "<extra_id_51>",
381
  "lstrip": false,
382
  "normalized": false,
 
384
  "single_word": false,
385
  "special": true
386
  },
387
+ "36045": {
388
  "content": "<extra_id_50>",
389
  "lstrip": false,
390
  "normalized": false,
 
392
  "single_word": false,
393
  "special": true
394
  },
395
+ "36046": {
396
  "content": "<extra_id_49>",
397
  "lstrip": false,
398
  "normalized": false,
 
400
  "single_word": false,
401
  "special": true
402
  },
403
+ "36047": {
404
  "content": "<extra_id_48>",
405
  "lstrip": false,
406
  "normalized": false,
 
408
  "single_word": false,
409
  "special": true
410
  },
411
+ "36048": {
412
  "content": "<extra_id_47>",
413
  "lstrip": false,
414
  "normalized": false,
 
416
  "single_word": false,
417
  "special": true
418
  },
419
+ "36049": {
420
  "content": "<extra_id_46>",
421
  "lstrip": false,
422
  "normalized": false,
 
424
  "single_word": false,
425
  "special": true
426
  },
427
+ "36050": {
428
  "content": "<extra_id_45>",
429
  "lstrip": false,
430
  "normalized": false,
 
432
  "single_word": false,
433
  "special": true
434
  },
435
+ "36051": {
436
  "content": "<extra_id_44>",
437
  "lstrip": false,
438
  "normalized": false,
 
440
  "single_word": false,
441
  "special": true
442
  },
443
+ "36052": {
444
  "content": "<extra_id_43>",
445
  "lstrip": false,
446
  "normalized": false,
 
448
  "single_word": false,
449
  "special": true
450
  },
451
+ "36053": {
452
  "content": "<extra_id_42>",
453
  "lstrip": false,
454
  "normalized": false,
 
456
  "single_word": false,
457
  "special": true
458
  },
459
+ "36054": {
460
  "content": "<extra_id_41>",
461
  "lstrip": false,
462
  "normalized": false,
 
464
  "single_word": false,
465
  "special": true
466
  },
467
+ "36055": {
468
  "content": "<extra_id_40>",
469
  "lstrip": false,
470
  "normalized": false,
 
472
  "single_word": false,
473
  "special": true
474
  },
475
+ "36056": {
476
  "content": "<extra_id_39>",
477
  "lstrip": false,
478
  "normalized": false,
 
480
  "single_word": false,
481
  "special": true
482
  },
483
+ "36057": {
484
  "content": "<extra_id_38>",
485
  "lstrip": false,
486
  "normalized": false,
 
488
  "single_word": false,
489
  "special": true
490
  },
491
+ "36058": {
492
  "content": "<extra_id_37>",
493
  "lstrip": false,
494
  "normalized": false,
 
496
  "single_word": false,
497
  "special": true
498
  },
499
+ "36059": {
500
  "content": "<extra_id_36>",
501
  "lstrip": false,
502
  "normalized": false,
 
504
  "single_word": false,
505
  "special": true
506
  },
507
+ "36060": {
508
  "content": "<extra_id_35>",
509
  "lstrip": false,
510
  "normalized": false,
 
512
  "single_word": false,
513
  "special": true
514
  },
515
+ "36061": {
516
  "content": "<extra_id_34>",
517
  "lstrip": false,
518
  "normalized": false,
 
520
  "single_word": false,
521
  "special": true
522
  },
523
+ "36062": {
524
  "content": "<extra_id_33>",
525
  "lstrip": false,
526
  "normalized": false,
 
528
  "single_word": false,
529
  "special": true
530
  },
531
+ "36063": {
532
  "content": "<extra_id_32>",
533
  "lstrip": false,
534
  "normalized": false,
 
536
  "single_word": false,
537
  "special": true
538
  },
539
+ "36064": {
540
  "content": "<extra_id_31>",
541
  "lstrip": false,
542
  "normalized": false,
 
544
  "single_word": false,
545
  "special": true
546
  },
547
+ "36065": {
548
  "content": "<extra_id_30>",
549
  "lstrip": false,
550
  "normalized": false,
 
552
  "single_word": false,
553
  "special": true
554
  },
555
+ "36066": {
556
  "content": "<extra_id_29>",
557
  "lstrip": false,
558
  "normalized": false,
 
560
  "single_word": false,
561
  "special": true
562
  },
563
+ "36067": {
564
  "content": "<extra_id_28>",
565
  "lstrip": false,
566
  "normalized": false,
 
568
  "single_word": false,
569
  "special": true
570
  },
571
+ "36068": {
572
  "content": "<extra_id_27>",
573
  "lstrip": false,
574
  "normalized": false,
 
576
  "single_word": false,
577
  "special": true
578
  },
579
+ "36069": {
580
  "content": "<extra_id_26>",
581
  "lstrip": false,
582
  "normalized": false,
 
584
  "single_word": false,
585
  "special": true
586
  },
587
+ "36070": {
588
  "content": "<extra_id_25>",
589
  "lstrip": false,
590
  "normalized": false,
 
592
  "single_word": false,
593
  "special": true
594
  },
595
+ "36071": {
596
  "content": "<extra_id_24>",
597
  "lstrip": false,
598
  "normalized": false,
 
600
  "single_word": false,
601
  "special": true
602
  },
603
+ "36072": {
604
  "content": "<extra_id_23>",
605
  "lstrip": false,
606
  "normalized": false,
 
608
  "single_word": false,
609
  "special": true
610
  },
611
+ "36073": {
612
  "content": "<extra_id_22>",
613
  "lstrip": false,
614
  "normalized": false,
 
616
  "single_word": false,
617
  "special": true
618
  },
619
+ "36074": {
620
  "content": "<extra_id_21>",
621
  "lstrip": false,
622
  "normalized": false,
 
624
  "single_word": false,
625
  "special": true
626
  },
627
+ "36075": {
628
  "content": "<extra_id_20>",
629
  "lstrip": false,
630
  "normalized": false,
 
632
  "single_word": false,
633
  "special": true
634
  },
635
+ "36076": {
636
  "content": "<extra_id_19>",
637
  "lstrip": false,
638
  "normalized": false,
 
640
  "single_word": false,
641
  "special": true
642
  },
643
+ "36077": {
644
  "content": "<extra_id_18>",
645
  "lstrip": false,
646
  "normalized": false,
 
648
  "single_word": false,
649
  "special": true
650
  },
651
+ "36078": {
652
  "content": "<extra_id_17>",
653
  "lstrip": false,
654
  "normalized": false,
 
656
  "single_word": false,
657
  "special": true
658
  },
659
+ "36079": {
660
  "content": "<extra_id_16>",
661
  "lstrip": false,
662
  "normalized": false,
 
664
  "single_word": false,
665
  "special": true
666
  },
667
+ "36080": {
668
  "content": "<extra_id_15>",
669
  "lstrip": false,
670
  "normalized": false,
 
672
  "single_word": false,
673
  "special": true
674
  },
675
+ "36081": {
676
  "content": "<extra_id_14>",
677
  "lstrip": false,
678
  "normalized": false,
 
680
  "single_word": false,
681
  "special": true
682
  },
683
+ "36082": {
684
  "content": "<extra_id_13>",
685
  "lstrip": false,
686
  "normalized": false,
 
688
  "single_word": false,
689
  "special": true
690
  },
691
+ "36083": {
692
  "content": "<extra_id_12>",
693
  "lstrip": false,
694
  "normalized": false,
 
696
  "single_word": false,
697
  "special": true
698
  },
699
+ "36084": {
700
  "content": "<extra_id_11>",
701
  "lstrip": false,
702
  "normalized": false,
 
704
  "single_word": false,
705
  "special": true
706
  },
707
+ "36085": {
708
  "content": "<extra_id_10>",
709
  "lstrip": false,
710
  "normalized": false,
 
712
  "single_word": false,
713
  "special": true
714
  },
715
+ "36086": {
716
  "content": "<extra_id_9>",
717
  "lstrip": false,
718
  "normalized": false,
 
720
  "single_word": false,
721
  "special": true
722
  },
723
+ "36087": {
724
  "content": "<extra_id_8>",
725
  "lstrip": false,
726
  "normalized": false,
 
728
  "single_word": false,
729
  "special": true
730
  },
731
+ "36088": {
732
  "content": "<extra_id_7>",
733
  "lstrip": false,
734
  "normalized": false,
 
736
  "single_word": false,
737
  "special": true
738
  },
739
+ "36089": {
740
  "content": "<extra_id_6>",
741
  "lstrip": false,
742
  "normalized": false,
 
744
  "single_word": false,
745
  "special": true
746
  },
747
+ "36090": {
748
  "content": "<extra_id_5>",
749
  "lstrip": false,
750
  "normalized": false,
 
752
  "single_word": false,
753
  "special": true
754
  },
755
+ "36091": {
756
  "content": "<extra_id_4>",
757
  "lstrip": false,
758
  "normalized": false,
 
760
  "single_word": false,
761
  "special": true
762
  },
763
+ "36092": {
764
  "content": "<extra_id_3>",
765
  "lstrip": false,
766
  "normalized": false,
 
768
  "single_word": false,
769
  "special": true
770
  },
771
+ "36093": {
772
  "content": "<extra_id_2>",
773
  "lstrip": false,
774
  "normalized": false,
 
776
  "single_word": false,
777
  "special": true
778
  },
779
+ "36094": {
780
  "content": "<extra_id_1>",
781
  "lstrip": false,
782
  "normalized": false,
 
784
  "single_word": false,
785
  "special": true
786
  },
787
+ "36095": {
788
  "content": "<extra_id_0>",
789
  "lstrip": false,
790
  "normalized": false,
 
889
  "<extra_id_92>",
890
  "<extra_id_93>",
891
  "<extra_id_94>",
892
+ "<extra_id_95>"
 
 
 
 
893
  ],
894
  "clean_up_tokenization_spaces": true,
895
  "eos_token": "</s>",
896
+ "extra_ids": 96,
897
+ "model_max_length": 1000000000000000019884624838656,
898
  "pad_token": "<pad>",
899
+ "sp_model_kwargs": {},
900
  "tokenizer_class": "T5Tokenizer",
901
  "unk_token": "<unk>"
902
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.013446464646464646,
5
  "eval_steps": 16,
6
- "global_step": 208,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12,1580 +12,128 @@
12
  "epoch": 6.464646464646465e-05,
13
  "grad_norm": NaN,
14
  "learning_rate": 0.0,
15
- "loss": 12.4995,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0001292929292929293,
20
  "grad_norm": NaN,
21
  "learning_rate": 0.0,
22
- "loss": 12.5063,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.00019393939393939395,
27
- "grad_norm": 68.83582305908203,
28
- "learning_rate": 1.2903225806451614e-06,
29
- "loss": 12.8615,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.0002585858585858586,
34
- "grad_norm": 57.923553466796875,
35
- "learning_rate": 2.580645161290323e-06,
36
- "loss": 12.6171,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.00032323232323232324,
41
- "grad_norm": NaN,
42
- "learning_rate": 2.580645161290323e-06,
43
- "loss": 12.4297,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.0003878787878787879,
48
- "grad_norm": 61.84542465209961,
49
- "learning_rate": 3.870967741935484e-06,
50
- "loss": 13.0576,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0004525252525252525,
55
- "grad_norm": 54.698020935058594,
56
- "learning_rate": 5.161290322580646e-06,
57
- "loss": 12.7474,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.0005171717171717172,
62
- "grad_norm": NaN,
63
- "learning_rate": 5.161290322580646e-06,
64
- "loss": 12.7983,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.0005818181818181818,
69
- "grad_norm": 60.97978591918945,
70
- "learning_rate": 6.451612903225806e-06,
71
- "loss": 12.524,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.0006464646464646465,
76
- "grad_norm": 54.93958282470703,
77
- "learning_rate": 7.741935483870968e-06,
78
- "loss": 13.3508,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.0007111111111111111,
83
- "grad_norm": 95.04920196533203,
84
- "learning_rate": 9.03225806451613e-06,
85
- "loss": 12.7488,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.0007757575757575758,
90
- "grad_norm": 146.7698516845703,
91
- "learning_rate": 1.0322580645161291e-05,
92
- "loss": 12.3513,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.0008404040404040404,
97
- "grad_norm": 61.88846206665039,
98
- "learning_rate": 1.1612903225806453e-05,
99
- "loss": 12.392,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.000905050505050505,
104
- "grad_norm": 84.30684661865234,
105
- "learning_rate": 1.2903225806451613e-05,
106
- "loss": 12.0805,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.0009696969696969697,
111
- "grad_norm": 166.33363342285156,
112
- "learning_rate": 1.4193548387096774e-05,
113
- "loss": 11.7633,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.0010343434343434343,
118
- "grad_norm": 70.05492401123047,
119
- "learning_rate": 1.5483870967741936e-05,
120
- "loss": 11.2654,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.0010343434343434343,
125
- "eval_bleu": 4.01633632443273,
126
- "eval_loss": 12.072832107543945,
127
- "eval_runtime": 1.3625,
128
- "eval_samples_per_second": 11.743,
129
- "eval_steps_per_second": 2.936,
130
  "step": 16
131
- },
132
- {
133
- "epoch": 0.0010989898989898989,
134
- "grad_norm": NaN,
135
- "learning_rate": 1.5483870967741936e-05,
136
- "loss": 11.3409,
137
- "step": 17
138
- },
139
- {
140
- "epoch": 0.0011636363636363637,
141
- "grad_norm": NaN,
142
- "learning_rate": 1.5483870967741936e-05,
143
- "loss": 10.8596,
144
- "step": 18
145
- },
146
- {
147
- "epoch": 0.0012282828282828282,
148
- "grad_norm": 191.51080322265625,
149
- "learning_rate": 1.6774193548387098e-05,
150
- "loss": 11.4091,
151
- "step": 19
152
- },
153
- {
154
- "epoch": 0.001292929292929293,
155
- "grad_norm": 60.890865325927734,
156
- "learning_rate": 1.806451612903226e-05,
157
- "loss": 11.6678,
158
- "step": 20
159
- },
160
- {
161
- "epoch": 0.0013575757575757575,
162
- "grad_norm": 86.45341491699219,
163
- "learning_rate": 1.935483870967742e-05,
164
- "loss": 11.0757,
165
- "step": 21
166
- },
167
- {
168
- "epoch": 0.0014222222222222223,
169
- "grad_norm": 157.23501586914062,
170
- "learning_rate": 2.0645161290322582e-05,
171
- "loss": 10.7504,
172
- "step": 22
173
- },
174
- {
175
- "epoch": 0.0014868686868686868,
176
- "grad_norm": 63.56692123413086,
177
- "learning_rate": 2.1935483870967744e-05,
178
- "loss": 10.3107,
179
- "step": 23
180
- },
181
- {
182
- "epoch": 0.0015515151515151516,
183
- "grad_norm": 134.14984130859375,
184
- "learning_rate": 2.3225806451612906e-05,
185
- "loss": 9.6532,
186
- "step": 24
187
- },
188
- {
189
- "epoch": 0.0016161616161616162,
190
- "grad_norm": 58.10708999633789,
191
- "learning_rate": 2.4516129032258064e-05,
192
- "loss": 9.6044,
193
- "step": 25
194
- },
195
- {
196
- "epoch": 0.0016808080808080807,
197
- "grad_norm": 66.19347381591797,
198
- "learning_rate": 2.5806451612903226e-05,
199
- "loss": 9.1942,
200
- "step": 26
201
- },
202
- {
203
- "epoch": 0.0017454545454545455,
204
- "grad_norm": 60.57972717285156,
205
- "learning_rate": 2.709677419354839e-05,
206
- "loss": 9.2875,
207
- "step": 27
208
- },
209
- {
210
- "epoch": 0.00181010101010101,
211
- "grad_norm": 58.872520446777344,
212
- "learning_rate": 2.838709677419355e-05,
213
- "loss": 7.7579,
214
- "step": 28
215
- },
216
- {
217
- "epoch": 0.0018747474747474748,
218
- "grad_norm": 63.92591094970703,
219
- "learning_rate": 2.967741935483871e-05,
220
- "loss": 7.4737,
221
- "step": 29
222
- },
223
- {
224
- "epoch": 0.0019393939393939393,
225
- "grad_norm": 59.86642837524414,
226
- "learning_rate": 3.096774193548387e-05,
227
- "loss": 6.8395,
228
- "step": 30
229
- },
230
- {
231
- "epoch": 0.002004040404040404,
232
- "grad_norm": 61.897117614746094,
233
- "learning_rate": 3.2258064516129034e-05,
234
- "loss": 6.788,
235
- "step": 31
236
- },
237
- {
238
- "epoch": 0.0020686868686868687,
239
- "grad_norm": 68.02660369873047,
240
- "learning_rate": 3.3548387096774195e-05,
241
- "loss": 6.5021,
242
- "step": 32
243
- },
244
- {
245
- "epoch": 0.0020686868686868687,
246
- "eval_bleu": 3.747450816674144,
247
- "eval_loss": 3.330573558807373,
248
- "eval_runtime": 1.3643,
249
- "eval_samples_per_second": 11.728,
250
- "eval_steps_per_second": 2.932,
251
- "step": 32
252
- },
253
- {
254
- "epoch": 0.0021333333333333334,
255
- "grad_norm": 59.149505615234375,
256
- "learning_rate": 3.483870967741936e-05,
257
- "loss": 5.7595,
258
- "step": 33
259
- },
260
- {
261
- "epoch": 0.0021979797979797978,
262
- "grad_norm": 62.66898727416992,
263
- "learning_rate": 3.612903225806452e-05,
264
- "loss": 5.3139,
265
- "step": 34
266
- },
267
- {
268
- "epoch": 0.0022626262626262625,
269
- "grad_norm": 59.711551666259766,
270
- "learning_rate": 3.741935483870968e-05,
271
- "loss": 5.1228,
272
- "step": 35
273
- },
274
- {
275
- "epoch": 0.0023272727272727273,
276
- "grad_norm": 55.77004623413086,
277
- "learning_rate": 3.870967741935484e-05,
278
- "loss": 4.1742,
279
- "step": 36
280
- },
281
- {
282
- "epoch": 0.002391919191919192,
283
- "grad_norm": 52.294471740722656,
284
- "learning_rate": 4e-05,
285
- "loss": 4.2128,
286
- "step": 37
287
- },
288
- {
289
- "epoch": 0.0024565656565656564,
290
- "grad_norm": 51.9980583190918,
291
- "learning_rate": 4.1290322580645165e-05,
292
- "loss": 3.3269,
293
- "step": 38
294
- },
295
- {
296
- "epoch": 0.002521212121212121,
297
- "grad_norm": 40.03384017944336,
298
- "learning_rate": 4.258064516129032e-05,
299
- "loss": 2.7555,
300
- "step": 39
301
- },
302
- {
303
- "epoch": 0.002585858585858586,
304
- "grad_norm": 33.953269958496094,
305
- "learning_rate": 4.387096774193549e-05,
306
- "loss": 2.4034,
307
- "step": 40
308
- },
309
- {
310
- "epoch": 0.0026505050505050507,
311
- "grad_norm": 29.490108489990234,
312
- "learning_rate": 4.516129032258064e-05,
313
- "loss": 2.1995,
314
- "step": 41
315
- },
316
- {
317
- "epoch": 0.002715151515151515,
318
- "grad_norm": 56.527854919433594,
319
- "learning_rate": 4.645161290322581e-05,
320
- "loss": 1.7884,
321
- "step": 42
322
- },
323
- {
324
- "epoch": 0.00277979797979798,
325
- "grad_norm": 21.921388626098633,
326
- "learning_rate": 4.774193548387097e-05,
327
- "loss": 1.8252,
328
- "step": 43
329
- },
330
- {
331
- "epoch": 0.0028444444444444446,
332
- "grad_norm": 18.03729820251465,
333
- "learning_rate": 4.903225806451613e-05,
334
- "loss": 1.706,
335
- "step": 44
336
- },
337
- {
338
- "epoch": 0.002909090909090909,
339
- "grad_norm": 11.620859146118164,
340
- "learning_rate": 5.032258064516129e-05,
341
- "loss": 1.4186,
342
- "step": 45
343
- },
344
- {
345
- "epoch": 0.0029737373737373737,
346
- "grad_norm": 10.846900939941406,
347
- "learning_rate": 5.161290322580645e-05,
348
- "loss": 1.3585,
349
- "step": 46
350
- },
351
- {
352
- "epoch": 0.0030383838383838385,
353
- "grad_norm": 9.786290168762207,
354
- "learning_rate": 5.290322580645162e-05,
355
- "loss": 1.345,
356
- "step": 47
357
- },
358
- {
359
- "epoch": 0.0031030303030303032,
360
- "grad_norm": 27.742897033691406,
361
- "learning_rate": 5.419354838709678e-05,
362
- "loss": 1.1557,
363
- "step": 48
364
- },
365
- {
366
- "epoch": 0.0031030303030303032,
367
- "eval_bleu": 0.0,
368
- "eval_loss": 0.3357515335083008,
369
- "eval_runtime": 1.3804,
370
- "eval_samples_per_second": 11.591,
371
- "eval_steps_per_second": 2.898,
372
- "step": 48
373
- },
374
- {
375
- "epoch": 0.0031676767676767676,
376
- "grad_norm": 3.792893409729004,
377
- "learning_rate": 5.5483870967741936e-05,
378
- "loss": 1.0484,
379
- "step": 49
380
- },
381
- {
382
- "epoch": 0.0032323232323232323,
383
- "grad_norm": 6.693092346191406,
384
- "learning_rate": 5.67741935483871e-05,
385
- "loss": 1.1635,
386
- "step": 50
387
- },
388
- {
389
- "epoch": 0.003296969696969697,
390
- "grad_norm": 3.844968795776367,
391
- "learning_rate": 5.8064516129032266e-05,
392
- "loss": 1.0061,
393
- "step": 51
394
- },
395
- {
396
- "epoch": 0.0033616161616161614,
397
- "grad_norm": 4.14786958694458,
398
- "learning_rate": 5.935483870967742e-05,
399
- "loss": 0.9577,
400
- "step": 52
401
- },
402
- {
403
- "epoch": 0.003426262626262626,
404
- "grad_norm": 4.455865859985352,
405
- "learning_rate": 6.064516129032258e-05,
406
- "loss": 0.8501,
407
- "step": 53
408
- },
409
- {
410
- "epoch": 0.003490909090909091,
411
- "grad_norm": 8.088818550109863,
412
- "learning_rate": 6.193548387096774e-05,
413
- "loss": 0.7826,
414
- "step": 54
415
- },
416
- {
417
- "epoch": 0.0035555555555555557,
418
- "grad_norm": 3.6797592639923096,
419
- "learning_rate": 6.32258064516129e-05,
420
- "loss": 0.7374,
421
- "step": 55
422
- },
423
- {
424
- "epoch": 0.00362020202020202,
425
- "grad_norm": 3.0284504890441895,
426
- "learning_rate": 6.451612903225807e-05,
427
- "loss": 0.6646,
428
- "step": 56
429
- },
430
- {
431
- "epoch": 0.003684848484848485,
432
- "grad_norm": 2.8636531829833984,
433
- "learning_rate": 6.580645161290323e-05,
434
- "loss": 0.6316,
435
- "step": 57
436
- },
437
- {
438
- "epoch": 0.0037494949494949496,
439
- "grad_norm": 2.4601149559020996,
440
- "learning_rate": 6.709677419354839e-05,
441
- "loss": 0.5411,
442
- "step": 58
443
- },
444
- {
445
- "epoch": 0.003814141414141414,
446
- "grad_norm": 2.3120172023773193,
447
- "learning_rate": 6.838709677419355e-05,
448
- "loss": 0.4993,
449
- "step": 59
450
- },
451
- {
452
- "epoch": 0.0038787878787878787,
453
- "grad_norm": 1.8615421056747437,
454
- "learning_rate": 6.967741935483871e-05,
455
- "loss": 0.4766,
456
- "step": 60
457
- },
458
- {
459
- "epoch": 0.0039434343434343435,
460
- "grad_norm": 1.5349754095077515,
461
- "learning_rate": 7.096774193548388e-05,
462
- "loss": 0.4203,
463
- "step": 61
464
- },
465
- {
466
- "epoch": 0.004008080808080808,
467
- "grad_norm": 1.7214694023132324,
468
- "learning_rate": 7.225806451612904e-05,
469
- "loss": 0.4294,
470
- "step": 62
471
- },
472
- {
473
- "epoch": 0.004072727272727273,
474
- "grad_norm": 1.5352543592453003,
475
- "learning_rate": 7.35483870967742e-05,
476
- "loss": 0.3648,
477
- "step": 63
478
- },
479
- {
480
- "epoch": 0.004137373737373737,
481
- "grad_norm": 0.8122034072875977,
482
- "learning_rate": 7.483870967741936e-05,
483
- "loss": 0.3224,
484
- "step": 64
485
- },
486
- {
487
- "epoch": 0.004137373737373737,
488
- "eval_bleu": 0.0,
489
- "eval_loss": 0.24285614490509033,
490
- "eval_runtime": 1.3711,
491
- "eval_samples_per_second": 11.67,
492
- "eval_steps_per_second": 2.917,
493
- "step": 64
494
- },
495
- {
496
- "epoch": 0.004202020202020202,
497
- "grad_norm": 0.8955532312393188,
498
- "learning_rate": 7.612903225806451e-05,
499
- "loss": 0.347,
500
- "step": 65
501
- },
502
- {
503
- "epoch": 0.004266666666666667,
504
- "grad_norm": 0.5511079430580139,
505
- "learning_rate": 7.741935483870968e-05,
506
- "loss": 0.3468,
507
- "step": 66
508
- },
509
- {
510
- "epoch": 0.004331313131313131,
511
- "grad_norm": 0.8366074562072754,
512
- "learning_rate": 7.870967741935484e-05,
513
- "loss": 0.2938,
514
- "step": 67
515
- },
516
- {
517
- "epoch": 0.0043959595959595955,
518
- "grad_norm": 0.43903565406799316,
519
- "learning_rate": 8e-05,
520
- "loss": 0.2681,
521
- "step": 68
522
- },
523
- {
524
- "epoch": 0.004460606060606061,
525
- "grad_norm": 0.5412452816963196,
526
- "learning_rate": 8.129032258064517e-05,
527
- "loss": 0.2725,
528
- "step": 69
529
- },
530
- {
531
- "epoch": 0.004525252525252525,
532
- "grad_norm": 0.4017622470855713,
533
- "learning_rate": 8.258064516129033e-05,
534
- "loss": 0.2347,
535
- "step": 70
536
- },
537
- {
538
- "epoch": 0.00458989898989899,
539
- "grad_norm": 22.646089553833008,
540
- "learning_rate": 8.387096774193549e-05,
541
- "loss": 0.2836,
542
- "step": 71
543
- },
544
- {
545
- "epoch": 0.004654545454545455,
546
- "grad_norm": 0.3233143091201782,
547
- "learning_rate": 8.516129032258064e-05,
548
- "loss": 0.2306,
549
- "step": 72
550
- },
551
- {
552
- "epoch": 0.004719191919191919,
553
- "grad_norm": 0.3327657878398895,
554
- "learning_rate": 8.645161290322581e-05,
555
- "loss": 0.2516,
556
- "step": 73
557
- },
558
- {
559
- "epoch": 0.004783838383838384,
560
- "grad_norm": 0.32695600390434265,
561
- "learning_rate": 8.774193548387098e-05,
562
- "loss": 0.2525,
563
- "step": 74
564
- },
565
- {
566
- "epoch": 0.0048484848484848485,
567
- "grad_norm": 0.2796344459056854,
568
- "learning_rate": 8.903225806451614e-05,
569
- "loss": 0.2679,
570
- "step": 75
571
- },
572
- {
573
- "epoch": 0.004913131313131313,
574
- "grad_norm": 0.26281794905662537,
575
- "learning_rate": 9.032258064516129e-05,
576
- "loss": 0.2468,
577
- "step": 76
578
- },
579
- {
580
- "epoch": 0.004977777777777778,
581
- "grad_norm": 0.2546544075012207,
582
- "learning_rate": 9.161290322580646e-05,
583
- "loss": 0.237,
584
- "step": 77
585
- },
586
- {
587
- "epoch": 0.005042424242424242,
588
- "grad_norm": 0.24231921136379242,
589
- "learning_rate": 9.290322580645162e-05,
590
- "loss": 0.2529,
591
- "step": 78
592
- },
593
- {
594
- "epoch": 0.005107070707070707,
595
- "grad_norm": 0.24682262539863586,
596
- "learning_rate": 9.419354838709677e-05,
597
- "loss": 0.2292,
598
- "step": 79
599
- },
600
- {
601
- "epoch": 0.005171717171717172,
602
- "grad_norm": 0.30323857069015503,
603
- "learning_rate": 9.548387096774195e-05,
604
- "loss": 0.2191,
605
- "step": 80
606
- },
607
- {
608
- "epoch": 0.005171717171717172,
609
- "eval_bleu": 0.0,
610
- "eval_loss": 0.19482067227363586,
611
- "eval_runtime": 1.3813,
612
- "eval_samples_per_second": 11.584,
613
- "eval_steps_per_second": 2.896,
614
- "step": 80
615
- },
616
- {
617
- "epoch": 0.005236363636363636,
618
- "grad_norm": 0.21761713922023773,
619
- "learning_rate": 9.677419354838711e-05,
620
- "loss": 0.2414,
621
- "step": 81
622
- },
623
- {
624
- "epoch": 0.005301010101010101,
625
- "grad_norm": 0.23040293157100677,
626
- "learning_rate": 9.806451612903226e-05,
627
- "loss": 0.2274,
628
- "step": 82
629
- },
630
- {
631
- "epoch": 0.005365656565656566,
632
- "grad_norm": 0.23759864270687103,
633
- "learning_rate": 9.935483870967742e-05,
634
- "loss": 0.2126,
635
- "step": 83
636
- },
637
- {
638
- "epoch": 0.00543030303030303,
639
- "grad_norm": 0.19173021614551544,
640
- "learning_rate": 0.00010064516129032258,
641
- "loss": 0.2219,
642
- "step": 84
643
- },
644
- {
645
- "epoch": 0.005494949494949495,
646
- "grad_norm": 0.19772595167160034,
647
- "learning_rate": 0.00010193548387096774,
648
- "loss": 0.2046,
649
- "step": 85
650
- },
651
- {
652
- "epoch": 0.00555959595959596,
653
- "grad_norm": 0.19209372997283936,
654
- "learning_rate": 0.0001032258064516129,
655
- "loss": 0.2202,
656
- "step": 86
657
- },
658
- {
659
- "epoch": 0.005624242424242424,
660
- "grad_norm": 0.18713383376598358,
661
- "learning_rate": 0.00010451612903225806,
662
- "loss": 0.2279,
663
- "step": 87
664
- },
665
- {
666
- "epoch": 0.005688888888888889,
667
- "grad_norm": 0.20889417827129364,
668
- "learning_rate": 0.00010580645161290324,
669
- "loss": 0.1787,
670
- "step": 88
671
- },
672
- {
673
- "epoch": 0.0057535353535353535,
674
- "grad_norm": 0.17321723699569702,
675
- "learning_rate": 0.0001070967741935484,
676
- "loss": 0.2047,
677
- "step": 89
678
- },
679
- {
680
- "epoch": 0.005818181818181818,
681
- "grad_norm": 0.25413277745246887,
682
- "learning_rate": 0.00010838709677419356,
683
- "loss": 0.2226,
684
- "step": 90
685
- },
686
- {
687
- "epoch": 0.005882828282828283,
688
- "grad_norm": 0.1873357743024826,
689
- "learning_rate": 0.00010967741935483871,
690
- "loss": 0.1974,
691
- "step": 91
692
- },
693
- {
694
- "epoch": 0.005947474747474747,
695
- "grad_norm": 0.14157669246196747,
696
- "learning_rate": 0.00011096774193548387,
697
- "loss": 0.1757,
698
- "step": 92
699
- },
700
- {
701
- "epoch": 0.006012121212121212,
702
- "grad_norm": 0.18100616335868835,
703
- "learning_rate": 0.00011225806451612903,
704
- "loss": 0.193,
705
- "step": 93
706
- },
707
- {
708
- "epoch": 0.006076767676767677,
709
- "grad_norm": 0.17187540233135223,
710
- "learning_rate": 0.0001135483870967742,
711
- "loss": 0.1996,
712
- "step": 94
713
- },
714
- {
715
- "epoch": 0.006141414141414141,
716
- "grad_norm": 0.25643497705459595,
717
- "learning_rate": 0.00011483870967741937,
718
- "loss": 0.1944,
719
- "step": 95
720
- },
721
- {
722
- "epoch": 0.0062060606060606064,
723
- "grad_norm": 0.3475594222545624,
724
- "learning_rate": 0.00011612903225806453,
725
- "loss": 0.1821,
726
- "step": 96
727
- },
728
- {
729
- "epoch": 0.0062060606060606064,
730
- "eval_bleu": 0.0,
731
- "eval_loss": 0.15860861539840698,
732
- "eval_runtime": 1.3903,
733
- "eval_samples_per_second": 11.508,
734
- "eval_steps_per_second": 2.877,
735
- "step": 96
736
- },
737
- {
738
- "epoch": 0.006270707070707071,
739
- "grad_norm": 0.17301534116268158,
740
- "learning_rate": 0.00011741935483870967,
741
- "loss": 0.1964,
742
- "step": 97
743
- },
744
- {
745
- "epoch": 0.006335353535353535,
746
- "grad_norm": 0.13940928876399994,
747
- "learning_rate": 0.00011870967741935484,
748
- "loss": 0.1702,
749
- "step": 98
750
- },
751
- {
752
- "epoch": 0.0064,
753
- "grad_norm": 0.1584329903125763,
754
- "learning_rate": 0.00012,
755
- "loss": 0.1667,
756
- "step": 99
757
- },
758
- {
759
- "epoch": 0.006464646464646465,
760
- "grad_norm": 0.19155238568782806,
761
- "learning_rate": 0.00012129032258064516,
762
- "loss": 0.1941,
763
- "step": 100
764
- },
765
- {
766
- "epoch": 0.006529292929292929,
767
- "grad_norm": 0.1830209642648697,
768
- "learning_rate": 0.00012258064516129034,
769
- "loss": 0.1794,
770
- "step": 101
771
- },
772
- {
773
- "epoch": 0.006593939393939394,
774
- "grad_norm": 0.9416115880012512,
775
- "learning_rate": 0.0001238709677419355,
776
- "loss": 0.1799,
777
- "step": 102
778
- },
779
- {
780
- "epoch": 0.0066585858585858585,
781
- "grad_norm": 0.13209928572177887,
782
- "learning_rate": 0.00012516129032258066,
783
- "loss": 0.1715,
784
- "step": 103
785
- },
786
- {
787
- "epoch": 0.006723232323232323,
788
- "grad_norm": 0.15749603509902954,
789
- "learning_rate": 0.0001264516129032258,
790
- "loss": 0.1809,
791
- "step": 104
792
- },
793
- {
794
- "epoch": 0.006787878787878788,
795
- "grad_norm": 0.1440904438495636,
796
- "learning_rate": 0.00012774193548387096,
797
- "loss": 0.1883,
798
- "step": 105
799
- },
800
- {
801
- "epoch": 0.006852525252525252,
802
- "grad_norm": 0.13838624954223633,
803
- "learning_rate": 0.00012903225806451613,
804
- "loss": 0.1835,
805
- "step": 106
806
- },
807
- {
808
- "epoch": 0.006917171717171718,
809
- "grad_norm": 0.13904741406440735,
810
- "learning_rate": 0.0001303225806451613,
811
- "loss": 0.1723,
812
- "step": 107
813
- },
814
- {
815
- "epoch": 0.006981818181818182,
816
- "grad_norm": 0.18367218971252441,
817
- "learning_rate": 0.00013161290322580646,
818
- "loss": 0.1888,
819
- "step": 108
820
- },
821
- {
822
- "epoch": 0.007046464646464646,
823
- "grad_norm": 0.12487432360649109,
824
- "learning_rate": 0.00013290322580645163,
825
- "loss": 0.1821,
826
- "step": 109
827
- },
828
- {
829
- "epoch": 0.0071111111111111115,
830
- "grad_norm": 0.13827675580978394,
831
- "learning_rate": 0.00013419354838709678,
832
- "loss": 0.1733,
833
- "step": 110
834
- },
835
- {
836
- "epoch": 0.007175757575757576,
837
- "grad_norm": 0.12497523427009583,
838
- "learning_rate": 0.00013548387096774193,
839
- "loss": 0.1804,
840
- "step": 111
841
- },
842
- {
843
- "epoch": 0.00724040404040404,
844
- "grad_norm": 0.11209689825773239,
845
- "learning_rate": 0.0001367741935483871,
846
- "loss": 0.1782,
847
- "step": 112
848
- },
849
- {
850
- "epoch": 0.00724040404040404,
851
- "eval_bleu": 0.0,
852
- "eval_loss": 0.14075997471809387,
853
- "eval_runtime": 1.3802,
854
- "eval_samples_per_second": 11.593,
855
- "eval_steps_per_second": 2.898,
856
- "step": 112
857
- },
858
- {
859
- "epoch": 0.007305050505050505,
860
- "grad_norm": 0.1200064942240715,
861
- "learning_rate": 0.00013806451612903225,
862
- "loss": 0.1665,
863
- "step": 113
864
- },
865
- {
866
- "epoch": 0.00736969696969697,
867
- "grad_norm": 0.2869766652584076,
868
- "learning_rate": 0.00013935483870967743,
869
- "loss": 0.164,
870
- "step": 114
871
- },
872
- {
873
- "epoch": 0.007434343434343434,
874
- "grad_norm": 0.12517796456813812,
875
- "learning_rate": 0.0001406451612903226,
876
- "loss": 0.1584,
877
- "step": 115
878
- },
879
- {
880
- "epoch": 0.007498989898989899,
881
- "grad_norm": 0.11323254555463791,
882
- "learning_rate": 0.00014193548387096775,
883
- "loss": 0.1593,
884
- "step": 116
885
- },
886
- {
887
- "epoch": 0.0075636363636363635,
888
- "grad_norm": 0.23315002024173737,
889
- "learning_rate": 0.00014322580645161293,
890
- "loss": 0.1725,
891
- "step": 117
892
- },
893
- {
894
- "epoch": 0.007628282828282828,
895
- "grad_norm": 0.11487537622451782,
896
- "learning_rate": 0.00014451612903225807,
897
- "loss": 0.1554,
898
- "step": 118
899
- },
900
- {
901
- "epoch": 0.007692929292929293,
902
- "grad_norm": 0.11991633474826813,
903
- "learning_rate": 0.00014580645161290322,
904
- "loss": 0.1525,
905
- "step": 119
906
- },
907
- {
908
- "epoch": 0.007757575757575757,
909
- "grad_norm": 0.17173829674720764,
910
- "learning_rate": 0.0001470967741935484,
911
- "loss": 0.1961,
912
- "step": 120
913
- },
914
- {
915
- "epoch": 0.007822222222222222,
916
- "grad_norm": 0.22421815991401672,
917
- "learning_rate": 0.00014838709677419355,
918
- "loss": 0.1491,
919
- "step": 121
920
- },
921
- {
922
- "epoch": 0.007886868686868687,
923
- "grad_norm": 0.10590796172618866,
924
- "learning_rate": 0.00014967741935483872,
925
- "loss": 0.1466,
926
- "step": 122
927
- },
928
- {
929
- "epoch": 0.007951515151515152,
930
- "grad_norm": 0.15446847677230835,
931
- "learning_rate": 0.0001509677419354839,
932
- "loss": 0.1506,
933
- "step": 123
934
- },
935
- {
936
- "epoch": 0.008016161616161616,
937
- "grad_norm": 0.1181936264038086,
938
- "learning_rate": 0.00015225806451612902,
939
- "loss": 0.1483,
940
- "step": 124
941
- },
942
- {
943
- "epoch": 0.00808080808080808,
944
- "grad_norm": 0.12403552234172821,
945
- "learning_rate": 0.0001535483870967742,
946
- "loss": 0.1602,
947
- "step": 125
948
- },
949
- {
950
- "epoch": 0.008145454545454546,
951
- "grad_norm": 0.12927326560020447,
952
- "learning_rate": 0.00015483870967741937,
953
- "loss": 0.173,
954
- "step": 126
955
- },
956
- {
957
- "epoch": 0.00821010101010101,
958
- "grad_norm": 0.7120084762573242,
959
- "learning_rate": 0.00015612903225806451,
960
- "loss": 0.1561,
961
- "step": 127
962
- },
963
- {
964
- "epoch": 0.008274747474747475,
965
- "grad_norm": 0.09934462606906891,
966
- "learning_rate": 0.0001574193548387097,
967
- "loss": 0.1433,
968
- "step": 128
969
- },
970
- {
971
- "epoch": 0.008274747474747475,
972
- "eval_bleu": 0.0,
973
- "eval_loss": 0.12477699667215347,
974
- "eval_runtime": 1.3776,
975
- "eval_samples_per_second": 11.614,
976
- "eval_steps_per_second": 2.904,
977
- "step": 128
978
- },
979
- {
980
- "epoch": 0.00833939393939394,
981
- "grad_norm": 0.11033691465854645,
982
- "learning_rate": 0.00015870967741935487,
983
- "loss": 0.1615,
984
- "step": 129
985
- },
986
- {
987
- "epoch": 0.008404040404040403,
988
- "grad_norm": 0.12420456856489182,
989
- "learning_rate": 0.00016,
990
- "loss": 0.1477,
991
- "step": 130
992
- },
993
- {
994
- "epoch": 0.008468686868686869,
995
- "grad_norm": 0.1626136600971222,
996
- "learning_rate": 0.00016129032258064516,
997
- "loss": 0.1624,
998
- "step": 131
999
- },
1000
- {
1001
- "epoch": 0.008533333333333334,
1002
- "grad_norm": 0.1007506251335144,
1003
- "learning_rate": 0.00016258064516129034,
1004
- "loss": 0.1499,
1005
- "step": 132
1006
- },
1007
- {
1008
- "epoch": 0.008597979797979797,
1009
- "grad_norm": 0.35909000039100647,
1010
- "learning_rate": 0.00016387096774193548,
1011
- "loss": 0.1646,
1012
- "step": 133
1013
- },
1014
- {
1015
- "epoch": 0.008662626262626262,
1016
- "grad_norm": 0.139847531914711,
1017
- "learning_rate": 0.00016516129032258066,
1018
- "loss": 0.1531,
1019
- "step": 134
1020
- },
1021
- {
1022
- "epoch": 0.008727272727272728,
1023
- "grad_norm": 0.14874647557735443,
1024
- "learning_rate": 0.0001664516129032258,
1025
- "loss": 0.1518,
1026
- "step": 135
1027
- },
1028
- {
1029
- "epoch": 0.008791919191919191,
1030
- "grad_norm": 0.0868111178278923,
1031
- "learning_rate": 0.00016774193548387098,
1032
- "loss": 0.1429,
1033
- "step": 136
1034
- },
1035
- {
1036
- "epoch": 0.008856565656565656,
1037
- "grad_norm": 0.13478216528892517,
1038
- "learning_rate": 0.00016903225806451616,
1039
- "loss": 0.1591,
1040
- "step": 137
1041
- },
1042
- {
1043
- "epoch": 0.008921212121212121,
1044
- "grad_norm": 0.27142342925071716,
1045
- "learning_rate": 0.00017032258064516128,
1046
- "loss": 0.1421,
1047
- "step": 138
1048
- },
1049
- {
1050
- "epoch": 0.008985858585858587,
1051
- "grad_norm": 0.10351862758398056,
1052
- "learning_rate": 0.00017161290322580645,
1053
- "loss": 0.1314,
1054
- "step": 139
1055
- },
1056
- {
1057
- "epoch": 0.00905050505050505,
1058
- "grad_norm": 0.2667485177516937,
1059
- "learning_rate": 0.00017290322580645163,
1060
- "loss": 0.1383,
1061
- "step": 140
1062
- },
1063
- {
1064
- "epoch": 0.009115151515151515,
1065
- "grad_norm": 0.12030935287475586,
1066
- "learning_rate": 0.00017419354838709678,
1067
- "loss": 0.1532,
1068
- "step": 141
1069
- },
1070
- {
1071
- "epoch": 0.00917979797979798,
1072
- "grad_norm": 0.11514189839363098,
1073
- "learning_rate": 0.00017548387096774195,
1074
- "loss": 0.1478,
1075
- "step": 142
1076
- },
1077
- {
1078
- "epoch": 0.009244444444444444,
1079
- "grad_norm": 0.09005405008792877,
1080
- "learning_rate": 0.0001767741935483871,
1081
- "loss": 0.1323,
1082
- "step": 143
1083
- },
1084
- {
1085
- "epoch": 0.00930909090909091,
1086
- "grad_norm": 0.09564518928527832,
1087
- "learning_rate": 0.00017806451612903228,
1088
- "loss": 0.1591,
1089
- "step": 144
1090
- },
1091
- {
1092
- "epoch": 0.00930909090909091,
1093
- "eval_bleu": 0.0,
1094
- "eval_loss": 0.11460547149181366,
1095
- "eval_runtime": 1.359,
1096
- "eval_samples_per_second": 11.773,
1097
- "eval_steps_per_second": 2.943,
1098
- "step": 144
1099
- },
1100
- {
1101
- "epoch": 0.009373737373737374,
1102
- "grad_norm": 0.0995207279920578,
1103
- "learning_rate": 0.00017935483870967742,
1104
- "loss": 0.1577,
1105
- "step": 145
1106
- },
1107
- {
1108
- "epoch": 0.009438383838383838,
1109
- "grad_norm": 0.4307728707790375,
1110
- "learning_rate": 0.00018064516129032257,
1111
- "loss": 0.1381,
1112
- "step": 146
1113
- },
1114
- {
1115
- "epoch": 0.009503030303030303,
1116
- "grad_norm": 0.10841380804777145,
1117
- "learning_rate": 0.00018193548387096775,
1118
- "loss": 0.1695,
1119
- "step": 147
1120
- },
1121
- {
1122
- "epoch": 0.009567676767676768,
1123
- "grad_norm": 0.08941018581390381,
1124
- "learning_rate": 0.00018322580645161292,
1125
- "loss": 0.1407,
1126
- "step": 148
1127
- },
1128
- {
1129
- "epoch": 0.009632323232323232,
1130
- "grad_norm": 0.09527455270290375,
1131
- "learning_rate": 0.00018451612903225807,
1132
- "loss": 0.1515,
1133
- "step": 149
1134
- },
1135
- {
1136
- "epoch": 0.009696969696969697,
1137
- "grad_norm": 0.07641109079122543,
1138
- "learning_rate": 0.00018580645161290325,
1139
- "loss": 0.1433,
1140
- "step": 150
1141
- },
1142
- {
1143
- "epoch": 0.009761616161616162,
1144
- "grad_norm": 0.1487646847963333,
1145
- "learning_rate": 0.0001870967741935484,
1146
- "loss": 0.1474,
1147
- "step": 151
1148
- },
1149
- {
1150
- "epoch": 0.009826262626262626,
1151
- "grad_norm": 0.08308811485767365,
1152
- "learning_rate": 0.00018838709677419354,
1153
- "loss": 0.1323,
1154
- "step": 152
1155
- },
1156
- {
1157
- "epoch": 0.00989090909090909,
1158
- "grad_norm": 0.06572406738996506,
1159
- "learning_rate": 0.00018967741935483872,
1160
- "loss": 0.1407,
1161
- "step": 153
1162
- },
1163
- {
1164
- "epoch": 0.009955555555555556,
1165
- "grad_norm": 0.24972431361675262,
1166
- "learning_rate": 0.0001909677419354839,
1167
- "loss": 0.1385,
1168
- "step": 154
1169
- },
1170
- {
1171
- "epoch": 0.01002020202020202,
1172
- "grad_norm": 0.07581052929162979,
1173
- "learning_rate": 0.00019225806451612904,
1174
- "loss": 0.1507,
1175
- "step": 155
1176
- },
1177
- {
1178
- "epoch": 0.010084848484848485,
1179
- "grad_norm": 0.1980135142803192,
1180
- "learning_rate": 0.00019354838709677422,
1181
- "loss": 0.1419,
1182
- "step": 156
1183
- },
1184
- {
1185
- "epoch": 0.01014949494949495,
1186
- "grad_norm": 0.06879571825265884,
1187
- "learning_rate": 0.00019483870967741936,
1188
- "loss": 0.1371,
1189
- "step": 157
1190
- },
1191
- {
1192
- "epoch": 0.010214141414141413,
1193
- "grad_norm": 0.08168785274028778,
1194
- "learning_rate": 0.0001961290322580645,
1195
- "loss": 0.143,
1196
- "step": 158
1197
- },
1198
- {
1199
- "epoch": 0.010278787878787879,
1200
- "grad_norm": 0.13444702327251434,
1201
- "learning_rate": 0.00019741935483870969,
1202
- "loss": 0.1458,
1203
- "step": 159
1204
- },
1205
- {
1206
- "epoch": 0.010343434343434344,
1207
- "grad_norm": 0.09177995473146439,
1208
- "learning_rate": 0.00019870967741935483,
1209
- "loss": 0.1302,
1210
- "step": 160
1211
- },
1212
- {
1213
- "epoch": 0.010343434343434344,
1214
- "eval_bleu": 0.1344542592045913,
1215
- "eval_loss": 0.10595569759607315,
1216
- "eval_runtime": 1.353,
1217
- "eval_samples_per_second": 11.825,
1218
- "eval_steps_per_second": 2.956,
1219
- "step": 160
1220
- },
1221
- {
1222
- "epoch": 0.010408080808080807,
1223
- "grad_norm": 0.059201959520578384,
1224
- "learning_rate": 0.0002,
1225
- "loss": 0.1317,
1226
- "step": 161
1227
- },
1228
- {
1229
- "epoch": 0.010472727272727272,
1230
- "grad_norm": 0.1885806769132614,
1231
- "learning_rate": 0.00019999999789549876,
1232
- "loss": 0.1319,
1233
- "step": 162
1234
- },
1235
- {
1236
- "epoch": 0.010537373737373738,
1237
- "grad_norm": 0.06697044521570206,
1238
- "learning_rate": 0.0001999999915819952,
1239
- "loss": 0.1136,
1240
- "step": 163
1241
- },
1242
- {
1243
- "epoch": 0.010602020202020203,
1244
- "grad_norm": 0.0689595639705658,
1245
- "learning_rate": 0.00019999998105948953,
1246
- "loss": 0.1189,
1247
- "step": 164
1248
- },
1249
- {
1250
- "epoch": 0.010666666666666666,
1251
- "grad_norm": 0.07707302272319794,
1252
- "learning_rate": 0.00019999996632798217,
1253
- "loss": 0.1412,
1254
- "step": 165
1255
- },
1256
- {
1257
- "epoch": 0.010731313131313132,
1258
- "grad_norm": 0.22306282818317413,
1259
- "learning_rate": 0.00019999994738747378,
1260
- "loss": 0.1657,
1261
- "step": 166
1262
- },
1263
- {
1264
- "epoch": 0.010795959595959597,
1265
- "grad_norm": 0.09084911644458771,
1266
- "learning_rate": 0.00019999992423796515,
1267
- "loss": 0.126,
1268
- "step": 167
1269
- },
1270
- {
1271
- "epoch": 0.01086060606060606,
1272
- "grad_norm": 0.09681031852960587,
1273
- "learning_rate": 0.00019999989687945728,
1274
- "loss": 0.1303,
1275
- "step": 168
1276
- },
1277
- {
1278
- "epoch": 0.010925252525252525,
1279
- "grad_norm": 0.12961797416210175,
1280
- "learning_rate": 0.0001999998653119513,
1281
- "loss": 0.1231,
1282
- "step": 169
1283
- },
1284
- {
1285
- "epoch": 0.01098989898989899,
1286
- "grad_norm": 0.07255159318447113,
1287
- "learning_rate": 0.00019999982953544852,
1288
- "loss": 0.1324,
1289
- "step": 170
1290
- },
1291
- {
1292
- "epoch": 0.011054545454545454,
1293
- "grad_norm": 0.07213090360164642,
1294
- "learning_rate": 0.00019999978954995045,
1295
- "loss": 0.1243,
1296
- "step": 171
1297
- },
1298
- {
1299
- "epoch": 0.01111919191919192,
1300
- "grad_norm": 0.058742836117744446,
1301
- "learning_rate": 0.0001999997453554588,
1302
- "loss": 0.1196,
1303
- "step": 172
1304
- },
1305
- {
1306
- "epoch": 0.011183838383838384,
1307
- "grad_norm": 0.06557495146989822,
1308
- "learning_rate": 0.00019999969695197543,
1309
- "loss": 0.1234,
1310
- "step": 173
1311
- },
1312
- {
1313
- "epoch": 0.011248484848484848,
1314
- "grad_norm": 0.05962904542684555,
1315
- "learning_rate": 0.00019999964433950235,
1316
- "loss": 0.1465,
1317
- "step": 174
1318
- },
1319
- {
1320
- "epoch": 0.011313131313131313,
1321
- "grad_norm": 0.11403318494558334,
1322
- "learning_rate": 0.00019999958751804178,
1323
- "loss": 0.1147,
1324
- "step": 175
1325
- },
1326
- {
1327
- "epoch": 0.011377777777777778,
1328
- "grad_norm": 0.06283015757799149,
1329
- "learning_rate": 0.0001999995264875961,
1330
- "loss": 0.1051,
1331
- "step": 176
1332
- },
1333
- {
1334
- "epoch": 0.011377777777777778,
1335
- "eval_bleu": 0.34108656655271324,
1336
- "eval_loss": 0.10132479667663574,
1337
- "eval_runtime": 1.4086,
1338
- "eval_samples_per_second": 11.359,
1339
- "eval_steps_per_second": 2.84,
1340
- "step": 176
1341
- },
1342
- {
1343
- "epoch": 0.011442424242424242,
1344
- "grad_norm": 0.07057774811983109,
1345
- "learning_rate": 0.00019999946124816794,
1346
- "loss": 0.1159,
1347
- "step": 177
1348
- },
1349
- {
1350
- "epoch": 0.011507070707070707,
1351
- "grad_norm": 0.1344994753599167,
1352
- "learning_rate": 0.00019999939179975997,
1353
- "loss": 0.1123,
1354
- "step": 178
1355
- },
1356
- {
1357
- "epoch": 0.011571717171717172,
1358
- "grad_norm": 0.053567882627248764,
1359
- "learning_rate": 0.00019999931814237515,
1360
- "loss": 0.1319,
1361
- "step": 179
1362
- },
1363
- {
1364
- "epoch": 0.011636363636363636,
1365
- "grad_norm": 0.05020461976528168,
1366
- "learning_rate": 0.0001999992402760166,
1367
- "loss": 0.1315,
1368
- "step": 180
1369
- },
1370
- {
1371
- "epoch": 0.0117010101010101,
1372
- "grad_norm": 0.09403225779533386,
1373
- "learning_rate": 0.00019999915820068757,
1374
- "loss": 0.1275,
1375
- "step": 181
1376
- },
1377
- {
1378
- "epoch": 0.011765656565656566,
1379
- "grad_norm": 0.07833687961101532,
1380
- "learning_rate": 0.0001999990719163915,
1381
- "loss": 0.1216,
1382
- "step": 182
1383
- },
1384
- {
1385
- "epoch": 0.01183030303030303,
1386
- "grad_norm": 0.03991740942001343,
1387
- "learning_rate": 0.00019999898142313206,
1388
- "loss": 0.1142,
1389
- "step": 183
1390
- },
1391
- {
1392
- "epoch": 0.011894949494949495,
1393
- "grad_norm": 0.04619375616312027,
1394
- "learning_rate": 0.00019999888672091304,
1395
- "loss": 0.1103,
1396
- "step": 184
1397
- },
1398
- {
1399
- "epoch": 0.01195959595959596,
1400
- "grad_norm": 0.037316370755434036,
1401
- "learning_rate": 0.0001999987878097384,
1402
- "loss": 0.1168,
1403
- "step": 185
1404
- },
1405
- {
1406
- "epoch": 0.012024242424242423,
1407
- "grad_norm": 0.04291122406721115,
1408
- "learning_rate": 0.00019999868468961233,
1409
- "loss": 0.1198,
1410
- "step": 186
1411
- },
1412
- {
1413
- "epoch": 0.012088888888888889,
1414
- "grad_norm": 0.07236277312040329,
1415
- "learning_rate": 0.00019999857736053918,
1416
- "loss": 0.12,
1417
- "step": 187
1418
- },
1419
- {
1420
- "epoch": 0.012153535353535354,
1421
- "grad_norm": 0.04166350141167641,
1422
- "learning_rate": 0.0001999984658225235,
1423
- "loss": 0.1312,
1424
- "step": 188
1425
- },
1426
- {
1427
- "epoch": 0.012218181818181819,
1428
- "grad_norm": 0.04327237978577614,
1429
- "learning_rate": 0.00019999835007556986,
1430
- "loss": 0.138,
1431
- "step": 189
1432
- },
1433
- {
1434
- "epoch": 0.012282828282828282,
1435
- "grad_norm": 0.13315382599830627,
1436
- "learning_rate": 0.00019999823011968327,
1437
- "loss": 0.1395,
1438
- "step": 190
1439
- },
1440
- {
1441
- "epoch": 0.012347474747474748,
1442
- "grad_norm": 0.04486257955431938,
1443
- "learning_rate": 0.0001999981059548687,
1444
- "loss": 0.1281,
1445
- "step": 191
1446
- },
1447
- {
1448
- "epoch": 0.012412121212121213,
1449
- "grad_norm": 0.06127722188830376,
1450
- "learning_rate": 0.0001999979775811314,
1451
- "loss": 0.1295,
1452
- "step": 192
1453
- },
1454
- {
1455
- "epoch": 0.012412121212121213,
1456
- "eval_bleu": 2.797660142073947,
1457
- "eval_loss": 0.09842301905155182,
1458
- "eval_runtime": 1.3435,
1459
- "eval_samples_per_second": 11.909,
1460
- "eval_steps_per_second": 2.977,
1461
- "step": 192
1462
- },
1463
- {
1464
- "epoch": 0.012476767676767676,
1465
- "grad_norm": 0.10934247821569443,
1466
- "learning_rate": 0.00019999784499847678,
1467
- "loss": 0.119,
1468
- "step": 193
1469
- },
1470
- {
1471
- "epoch": 0.012541414141414142,
1472
- "grad_norm": 0.04266177862882614,
1473
- "learning_rate": 0.0001999977082069104,
1474
- "loss": 0.1094,
1475
- "step": 194
1476
- },
1477
- {
1478
- "epoch": 0.012606060606060607,
1479
- "grad_norm": 0.10852430015802383,
1480
- "learning_rate": 0.00019999756720643803,
1481
- "loss": 0.1118,
1482
- "step": 195
1483
- },
1484
- {
1485
- "epoch": 0.01267070707070707,
1486
- "grad_norm": 0.06190445274114609,
1487
- "learning_rate": 0.0001999974219970656,
1488
- "loss": 0.129,
1489
- "step": 196
1490
- },
1491
- {
1492
- "epoch": 0.012735353535353535,
1493
- "grad_norm": 0.04268389567732811,
1494
- "learning_rate": 0.00019999727257879923,
1495
- "loss": 0.1149,
1496
- "step": 197
1497
- },
1498
- {
1499
- "epoch": 0.0128,
1500
- "grad_norm": 0.04210319742560387,
1501
- "learning_rate": 0.0001999971189516452,
1502
- "loss": 0.1231,
1503
- "step": 198
1504
- },
1505
- {
1506
- "epoch": 0.012864646464646464,
1507
- "grad_norm": 0.07176094502210617,
1508
- "learning_rate": 0.00019999696111561,
1509
- "loss": 0.1123,
1510
- "step": 199
1511
- },
1512
- {
1513
- "epoch": 0.01292929292929293,
1514
- "grad_norm": 0.04062803462147713,
1515
- "learning_rate": 0.00019999679907070023,
1516
- "loss": 0.1225,
1517
- "step": 200
1518
- },
1519
- {
1520
- "epoch": 0.012993939393939394,
1521
- "grad_norm": 0.04266968369483948,
1522
- "learning_rate": 0.00019999663281692275,
1523
- "loss": 0.1259,
1524
- "step": 201
1525
- },
1526
- {
1527
- "epoch": 0.013058585858585858,
1528
- "grad_norm": 0.045373089611530304,
1529
- "learning_rate": 0.00019999646235428452,
1530
- "loss": 0.1353,
1531
- "step": 202
1532
- },
1533
- {
1534
- "epoch": 0.013123232323232323,
1535
- "grad_norm": 0.04623784124851227,
1536
- "learning_rate": 0.00019999628768279276,
1537
- "loss": 0.1224,
1538
- "step": 203
1539
- },
1540
- {
1541
- "epoch": 0.013187878787878788,
1542
- "grad_norm": 0.03664301335811615,
1543
- "learning_rate": 0.0001999961088024548,
1544
- "loss": 0.1361,
1545
- "step": 204
1546
- },
1547
- {
1548
- "epoch": 0.013252525252525252,
1549
- "grad_norm": 0.03849755972623825,
1550
- "learning_rate": 0.00019999592571327815,
1551
- "loss": 0.1307,
1552
- "step": 205
1553
- },
1554
- {
1555
- "epoch": 0.013317171717171717,
1556
- "grad_norm": 0.03995022922754288,
1557
- "learning_rate": 0.00019999573841527054,
1558
- "loss": 0.1079,
1559
- "step": 206
1560
- },
1561
- {
1562
- "epoch": 0.013381818181818182,
1563
- "grad_norm": 0.039675675332546234,
1564
- "learning_rate": 0.00019999554690843988,
1565
- "loss": 0.1212,
1566
- "step": 207
1567
- },
1568
- {
1569
- "epoch": 0.013446464646464646,
1570
- "grad_norm": 0.05080877244472504,
1571
- "learning_rate": 0.00019999535119279415,
1572
- "loss": 0.0991,
1573
- "step": 208
1574
- },
1575
- {
1576
- "epoch": 0.013446464646464646,
1577
- "eval_bleu": 3.66705872401506,
1578
- "eval_loss": 0.09652489423751831,
1579
- "eval_runtime": 1.3632,
1580
- "eval_samples_per_second": 11.737,
1581
- "eval_steps_per_second": 2.934,
1582
- "step": 208
1583
  }
1584
  ],
1585
  "logging_steps": 1,
1586
- "max_steps": 15468,
1587
  "num_input_tokens_seen": 0,
1588
- "num_train_epochs": 1,
1589
  "save_steps": 16,
1590
  "stateful_callbacks": {
1591
  "TrainerControl": {
@@ -1599,8 +147,8 @@
1599
  "attributes": {}
1600
  }
1601
  },
1602
- "total_flos": 1801670061195264.0,
1603
- "train_batch_size": 4,
1604
  "trial_name": null,
1605
  "trial_params": null
1606
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0010343434343434343,
5
  "eval_steps": 16,
6
+ "global_step": 16,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12
  "epoch": 6.464646464646465e-05,
13
  "grad_norm": NaN,
14
  "learning_rate": 0.0,
15
+ "loss": 29.6213,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0001292929292929293,
20
  "grad_norm": NaN,
21
  "learning_rate": 0.0,
22
+ "loss": 29.6208,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.00019393939393939395,
27
+ "grad_norm": 129.40235900878906,
28
+ "learning_rate": 4.3010752688172043e-07,
29
+ "loss": 29.5846,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.0002585858585858586,
34
+ "grad_norm": Infinity,
35
+ "learning_rate": 4.3010752688172043e-07,
36
+ "loss": 29.7161,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.00032323232323232324,
41
+ "grad_norm": 130.79031372070312,
42
+ "learning_rate": 8.602150537634409e-07,
43
+ "loss": 29.7196,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.0003878787878787879,
48
+ "grad_norm": 123.62369537353516,
49
+ "learning_rate": 1.2903225806451614e-06,
50
+ "loss": 29.2487,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.0004525252525252525,
55
+ "grad_norm": 135.1348876953125,
56
+ "learning_rate": 1.7204301075268817e-06,
57
+ "loss": 29.6055,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.0005171717171717172,
62
+ "grad_norm": Infinity,
63
+ "learning_rate": 1.7204301075268817e-06,
64
+ "loss": 28.568,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.0005818181818181818,
69
+ "grad_norm": 270.385498046875,
70
+ "learning_rate": 2.1505376344086023e-06,
71
+ "loss": 29.7127,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.0006464646464646465,
76
+ "grad_norm": 109.7217788696289,
77
+ "learning_rate": 2.580645161290323e-06,
78
+ "loss": 29.2279,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.0007111111111111111,
83
+ "grad_norm": 403.46337890625,
84
+ "learning_rate": 3.0107526881720433e-06,
85
+ "loss": 28.7925,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.0007757575757575758,
90
+ "grad_norm": 114.81087493896484,
91
+ "learning_rate": 3.4408602150537635e-06,
92
+ "loss": 28.0663,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.0008404040404040404,
97
+ "grad_norm": 240.21282958984375,
98
+ "learning_rate": 3.870967741935484e-06,
99
+ "loss": 27.3475,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.000905050505050505,
104
+ "grad_norm": 236.48675537109375,
105
+ "learning_rate": 4.3010752688172045e-06,
106
+ "loss": 26.9302,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.0009696969696969697,
111
+ "grad_norm": 203.8461456298828,
112
+ "learning_rate": 4.731182795698925e-06,
113
+ "loss": 24.0135,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.0010343434343434343,
118
+ "grad_norm": 156.15663146972656,
119
+ "learning_rate": 5.161290322580646e-06,
120
+ "loss": 22.7445,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.0010343434343434343,
125
+ "eval_bleu": 0.11919568898736486,
126
+ "eval_loss": 21.227251052856445,
127
+ "eval_runtime": 2.9098,
128
+ "eval_samples_per_second": 10.997,
129
+ "eval_steps_per_second": 1.375,
130
  "step": 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "logging_steps": 1,
134
+ "max_steps": 46404,
135
  "num_input_tokens_seen": 0,
136
+ "num_train_epochs": 3,
137
  "save_steps": 16,
138
  "stateful_callbacks": {
139
  "TrainerControl": {
 
147
  "attributes": {}
148
  }
149
  },
150
+ "total_flos": 311786439966720.0,
151
+ "train_batch_size": 8,
152
  "trial_name": null,
153
  "trial_params": null
154
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc65473306b4a3f5fae99fa380d17d29a861d4b0c3d748ffda65119a04290231
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f3a0792a04fef5ea618963940e094b8c3947590c45a58f3ea50f2fde6e0f3e8
3
  size 5240