Jellyfish042 commited on
Commit
a71e823
1 Parent(s): b853386

Upload 8 files

Browse files
checkpoint-15000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/mt5-small",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 1024,
7
+ "d_kv": 64,
8
+ "d_model": 512,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 8,
20
+ "num_heads": 6,
21
+ "num_layers": 8,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "tokenizer_class": "T5Tokenizer",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.28.0",
29
+ "use_cache": true,
30
+ "vocab_size": 250112
31
+ }
checkpoint-15000/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.28.0"
7
+ }
checkpoint-15000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0831c45eef4cb373d82a2158ac609ab45ac5d4d1db6209eb53cfbd94637349d
3
+ size 2401525449
checkpoint-15000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b543b1ab713dc4649af695e62925584fd9c95a3a1e827cd2b0cf2dd263fb5c3
3
+ size 1200770757
checkpoint-15000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db24715fa1d181eb24364f66201adf2589f55a4fda0da6d4a2413db2c6f3a048
3
+ size 14503
checkpoint-15000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79ed0ebfeed37d4fae67cc150c5ffff9a16ccedbf987c79b5e3d183dd9c1434d
3
+ size 623
checkpoint-15000/trainer_state.json ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.7971753420380452,
5
+ "global_step": 15000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 1.9957482993197282e-05,
13
+ "loss": 31.2602,
14
+ "step": 200
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 1.991496598639456e-05,
19
+ "loss": 18.3425,
20
+ "step": 400
21
+ },
22
+ {
23
+ "epoch": 0.03,
24
+ "learning_rate": 1.987244897959184e-05,
25
+ "loss": 11.5784,
26
+ "step": 600
27
+ },
28
+ {
29
+ "epoch": 0.04,
30
+ "learning_rate": 1.982993197278912e-05,
31
+ "loss": 7.843,
32
+ "step": 800
33
+ },
34
+ {
35
+ "epoch": 0.05,
36
+ "learning_rate": 1.9787414965986397e-05,
37
+ "loss": 4.9763,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "learning_rate": 1.9744897959183677e-05,
43
+ "loss": 3.365,
44
+ "step": 1200
45
+ },
46
+ {
47
+ "epoch": 0.07,
48
+ "learning_rate": 1.9702380952380954e-05,
49
+ "loss": 2.5671,
50
+ "step": 1400
51
+ },
52
+ {
53
+ "epoch": 0.09,
54
+ "learning_rate": 1.965986394557823e-05,
55
+ "loss": 1.9747,
56
+ "step": 1600
57
+ },
58
+ {
59
+ "epoch": 0.1,
60
+ "learning_rate": 1.961734693877551e-05,
61
+ "loss": 1.528,
62
+ "step": 1800
63
+ },
64
+ {
65
+ "epoch": 0.11,
66
+ "learning_rate": 1.9574829931972788e-05,
67
+ "loss": 1.1564,
68
+ "step": 2000
69
+ },
70
+ {
71
+ "epoch": 0.12,
72
+ "learning_rate": 1.953231292517007e-05,
73
+ "loss": 0.8765,
74
+ "step": 2200
75
+ },
76
+ {
77
+ "epoch": 0.13,
78
+ "learning_rate": 1.948979591836735e-05,
79
+ "loss": 0.6853,
80
+ "step": 2400
81
+ },
82
+ {
83
+ "epoch": 0.14,
84
+ "learning_rate": 1.9447278911564626e-05,
85
+ "loss": 0.5731,
86
+ "step": 2600
87
+ },
88
+ {
89
+ "epoch": 0.15,
90
+ "learning_rate": 1.9404761904761906e-05,
91
+ "loss": 0.4927,
92
+ "step": 2800
93
+ },
94
+ {
95
+ "epoch": 0.16,
96
+ "learning_rate": 1.9362244897959186e-05,
97
+ "loss": 0.4354,
98
+ "step": 3000
99
+ },
100
+ {
101
+ "epoch": 0.17,
102
+ "learning_rate": 1.9319727891156463e-05,
103
+ "loss": 0.3948,
104
+ "step": 3200
105
+ },
106
+ {
107
+ "epoch": 0.18,
108
+ "learning_rate": 1.9277210884353744e-05,
109
+ "loss": 0.3723,
110
+ "step": 3400
111
+ },
112
+ {
113
+ "epoch": 0.19,
114
+ "learning_rate": 1.9234693877551024e-05,
115
+ "loss": 0.346,
116
+ "step": 3600
117
+ },
118
+ {
119
+ "epoch": 0.2,
120
+ "learning_rate": 1.91921768707483e-05,
121
+ "loss": 0.3295,
122
+ "step": 3800
123
+ },
124
+ {
125
+ "epoch": 0.21,
126
+ "learning_rate": 1.914965986394558e-05,
127
+ "loss": 0.3101,
128
+ "step": 4000
129
+ },
130
+ {
131
+ "epoch": 0.21,
132
+ "eval_loss": 0.16350167989730835,
133
+ "eval_runtime": 1650.684,
134
+ "eval_samples_per_second": 81.06,
135
+ "eval_steps_per_second": 20.266,
136
+ "step": 4000
137
+ },
138
+ {
139
+ "epoch": 0.22,
140
+ "learning_rate": 1.910714285714286e-05,
141
+ "loss": 0.2984,
142
+ "step": 4200
143
+ },
144
+ {
145
+ "epoch": 0.23,
146
+ "learning_rate": 1.906462585034014e-05,
147
+ "loss": 0.2811,
148
+ "step": 4400
149
+ },
150
+ {
151
+ "epoch": 0.24,
152
+ "learning_rate": 1.9022108843537416e-05,
153
+ "loss": 0.2704,
154
+ "step": 4600
155
+ },
156
+ {
157
+ "epoch": 0.26,
158
+ "learning_rate": 1.8979591836734696e-05,
159
+ "loss": 0.2612,
160
+ "step": 4800
161
+ },
162
+ {
163
+ "epoch": 0.27,
164
+ "learning_rate": 1.8937074829931973e-05,
165
+ "loss": 0.2441,
166
+ "step": 5000
167
+ },
168
+ {
169
+ "epoch": 0.28,
170
+ "learning_rate": 1.8894557823129253e-05,
171
+ "loss": 0.2384,
172
+ "step": 5200
173
+ },
174
+ {
175
+ "epoch": 0.29,
176
+ "learning_rate": 1.885204081632653e-05,
177
+ "loss": 0.2265,
178
+ "step": 5400
179
+ },
180
+ {
181
+ "epoch": 0.3,
182
+ "learning_rate": 1.880952380952381e-05,
183
+ "loss": 0.2226,
184
+ "step": 5600
185
+ },
186
+ {
187
+ "epoch": 0.31,
188
+ "learning_rate": 1.876700680272109e-05,
189
+ "loss": 0.2197,
190
+ "step": 5800
191
+ },
192
+ {
193
+ "epoch": 0.32,
194
+ "learning_rate": 1.8724489795918368e-05,
195
+ "loss": 0.2079,
196
+ "step": 6000
197
+ },
198
+ {
199
+ "epoch": 0.33,
200
+ "learning_rate": 1.8681972789115648e-05,
201
+ "loss": 0.204,
202
+ "step": 6200
203
+ },
204
+ {
205
+ "epoch": 0.34,
206
+ "learning_rate": 1.863945578231293e-05,
207
+ "loss": 0.1986,
208
+ "step": 6400
209
+ },
210
+ {
211
+ "epoch": 0.35,
212
+ "learning_rate": 1.8596938775510206e-05,
213
+ "loss": 0.1965,
214
+ "step": 6600
215
+ },
216
+ {
217
+ "epoch": 0.36,
218
+ "learning_rate": 1.8554421768707486e-05,
219
+ "loss": 0.1874,
220
+ "step": 6800
221
+ },
222
+ {
223
+ "epoch": 0.37,
224
+ "learning_rate": 1.8511904761904763e-05,
225
+ "loss": 0.1889,
226
+ "step": 7000
227
+ },
228
+ {
229
+ "epoch": 0.38,
230
+ "learning_rate": 1.8469387755102043e-05,
231
+ "loss": 0.1841,
232
+ "step": 7200
233
+ },
234
+ {
235
+ "epoch": 0.39,
236
+ "learning_rate": 1.842687074829932e-05,
237
+ "loss": 0.1808,
238
+ "step": 7400
239
+ },
240
+ {
241
+ "epoch": 0.4,
242
+ "learning_rate": 1.83843537414966e-05,
243
+ "loss": 0.1778,
244
+ "step": 7600
245
+ },
246
+ {
247
+ "epoch": 0.41,
248
+ "learning_rate": 1.8341836734693877e-05,
249
+ "loss": 0.1715,
250
+ "step": 7800
251
+ },
252
+ {
253
+ "epoch": 0.43,
254
+ "learning_rate": 1.8299319727891158e-05,
255
+ "loss": 0.1717,
256
+ "step": 8000
257
+ },
258
+ {
259
+ "epoch": 0.43,
260
+ "eval_loss": 0.1131439059972763,
261
+ "eval_runtime": 1642.8688,
262
+ "eval_samples_per_second": 81.446,
263
+ "eval_steps_per_second": 20.362,
264
+ "step": 8000
265
+ },
266
+ {
267
+ "epoch": 0.44,
268
+ "learning_rate": 1.8256802721088435e-05,
269
+ "loss": 0.1673,
270
+ "step": 8200
271
+ },
272
+ {
273
+ "epoch": 0.45,
274
+ "learning_rate": 1.8214285714285715e-05,
275
+ "loss": 0.1633,
276
+ "step": 8400
277
+ },
278
+ {
279
+ "epoch": 0.46,
280
+ "learning_rate": 1.8171768707482995e-05,
281
+ "loss": 0.1608,
282
+ "step": 8600
283
+ },
284
+ {
285
+ "epoch": 0.47,
286
+ "learning_rate": 1.8129251700680272e-05,
287
+ "loss": 0.1623,
288
+ "step": 8800
289
+ },
290
+ {
291
+ "epoch": 0.48,
292
+ "learning_rate": 1.8086734693877553e-05,
293
+ "loss": 0.1583,
294
+ "step": 9000
295
+ },
296
+ {
297
+ "epoch": 0.49,
298
+ "learning_rate": 1.8044217687074833e-05,
299
+ "loss": 0.1548,
300
+ "step": 9200
301
+ },
302
+ {
303
+ "epoch": 0.5,
304
+ "learning_rate": 1.800170068027211e-05,
305
+ "loss": 0.1553,
306
+ "step": 9400
307
+ },
308
+ {
309
+ "epoch": 0.51,
310
+ "learning_rate": 1.795918367346939e-05,
311
+ "loss": 0.1491,
312
+ "step": 9600
313
+ },
314
+ {
315
+ "epoch": 0.52,
316
+ "learning_rate": 1.7916666666666667e-05,
317
+ "loss": 0.1529,
318
+ "step": 9800
319
+ },
320
+ {
321
+ "epoch": 0.53,
322
+ "learning_rate": 1.7874149659863948e-05,
323
+ "loss": 0.1489,
324
+ "step": 10000
325
+ },
326
+ {
327
+ "epoch": 0.54,
328
+ "learning_rate": 1.7831632653061225e-05,
329
+ "loss": 0.1454,
330
+ "step": 10200
331
+ },
332
+ {
333
+ "epoch": 0.55,
334
+ "learning_rate": 1.7789115646258505e-05,
335
+ "loss": 0.1458,
336
+ "step": 10400
337
+ },
338
+ {
339
+ "epoch": 0.56,
340
+ "learning_rate": 1.7746598639455782e-05,
341
+ "loss": 0.1401,
342
+ "step": 10600
343
+ },
344
+ {
345
+ "epoch": 0.57,
346
+ "learning_rate": 1.7704081632653062e-05,
347
+ "loss": 0.1407,
348
+ "step": 10800
349
+ },
350
+ {
351
+ "epoch": 0.58,
352
+ "learning_rate": 1.7661564625850343e-05,
353
+ "loss": 0.1377,
354
+ "step": 11000
355
+ },
356
+ {
357
+ "epoch": 0.6,
358
+ "learning_rate": 1.761904761904762e-05,
359
+ "loss": 0.1378,
360
+ "step": 11200
361
+ },
362
+ {
363
+ "epoch": 0.61,
364
+ "learning_rate": 1.75765306122449e-05,
365
+ "loss": 0.1344,
366
+ "step": 11400
367
+ },
368
+ {
369
+ "epoch": 0.62,
370
+ "learning_rate": 1.7534013605442177e-05,
371
+ "loss": 0.1331,
372
+ "step": 11600
373
+ },
374
+ {
375
+ "epoch": 0.63,
376
+ "learning_rate": 1.7491496598639457e-05,
377
+ "loss": 0.1333,
378
+ "step": 11800
379
+ },
380
+ {
381
+ "epoch": 0.64,
382
+ "learning_rate": 1.7448979591836738e-05,
383
+ "loss": 0.1317,
384
+ "step": 12000
385
+ },
386
+ {
387
+ "epoch": 0.64,
388
+ "eval_loss": 0.09435752034187317,
389
+ "eval_runtime": 1637.6898,
390
+ "eval_samples_per_second": 81.704,
391
+ "eval_steps_per_second": 20.426,
392
+ "step": 12000
393
+ },
394
+ {
395
+ "epoch": 0.65,
396
+ "learning_rate": 1.7406462585034015e-05,
397
+ "loss": 0.1309,
398
+ "step": 12200
399
+ },
400
+ {
401
+ "epoch": 0.66,
402
+ "learning_rate": 1.7363945578231295e-05,
403
+ "loss": 0.1308,
404
+ "step": 12400
405
+ },
406
+ {
407
+ "epoch": 0.67,
408
+ "learning_rate": 1.7321428571428572e-05,
409
+ "loss": 0.1303,
410
+ "step": 12600
411
+ },
412
+ {
413
+ "epoch": 0.68,
414
+ "learning_rate": 1.7278911564625852e-05,
415
+ "loss": 0.1258,
416
+ "step": 12800
417
+ },
418
+ {
419
+ "epoch": 0.69,
420
+ "learning_rate": 1.723639455782313e-05,
421
+ "loss": 0.1261,
422
+ "step": 13000
423
+ },
424
+ {
425
+ "epoch": 0.7,
426
+ "learning_rate": 1.719387755102041e-05,
427
+ "loss": 0.1261,
428
+ "step": 13200
429
+ },
430
+ {
431
+ "epoch": 0.71,
432
+ "learning_rate": 1.7151360544217686e-05,
433
+ "loss": 0.1236,
434
+ "step": 13400
435
+ },
436
+ {
437
+ "epoch": 0.72,
438
+ "learning_rate": 1.7108843537414967e-05,
439
+ "loss": 0.1219,
440
+ "step": 13600
441
+ },
442
+ {
443
+ "epoch": 0.73,
444
+ "learning_rate": 1.7066326530612247e-05,
445
+ "loss": 0.1223,
446
+ "step": 13800
447
+ },
448
+ {
449
+ "epoch": 0.74,
450
+ "learning_rate": 1.7023809523809524e-05,
451
+ "loss": 0.1219,
452
+ "step": 14000
453
+ },
454
+ {
455
+ "epoch": 0.75,
456
+ "learning_rate": 1.6981292517006804e-05,
457
+ "loss": 0.1188,
458
+ "step": 14200
459
+ },
460
+ {
461
+ "epoch": 0.77,
462
+ "learning_rate": 1.6938775510204085e-05,
463
+ "loss": 0.119,
464
+ "step": 14400
465
+ },
466
+ {
467
+ "epoch": 0.78,
468
+ "learning_rate": 1.6896258503401362e-05,
469
+ "loss": 0.118,
470
+ "step": 14600
471
+ },
472
+ {
473
+ "epoch": 0.79,
474
+ "learning_rate": 1.6853741496598642e-05,
475
+ "loss": 0.118,
476
+ "step": 14800
477
+ },
478
+ {
479
+ "epoch": 0.8,
480
+ "learning_rate": 1.681122448979592e-05,
481
+ "loss": 0.1156,
482
+ "step": 15000
483
+ }
484
+ ],
485
+ "max_steps": 94080,
486
+ "num_train_epochs": 5,
487
+ "total_flos": 5.075997032448e+17,
488
+ "trial_name": null,
489
+ "trial_params": null
490
+ }
checkpoint-15000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c7a14b68a242d9adb4a5aed9daebd4e60b09b655362ead363872702c8708f4d
3
+ size 3695