houck2040 committed on
Commit
8eac77b
·
1 Parent(s): d748196

Upload 7 files

Browse files
Files changed (7) hide show
  1. config.json +27 -0
  2. optimizer.pt +3 -0
  3. pytorch_model.bin +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +550 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilroberta-base",
3
+ "architectures": [
4
+ "RobertaForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "roberta",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 6,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.29.2",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 50265
27
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:708ea052ccd9e7a849d1996aae6b8dcb964857c986245dfe257a08a7500fb5ba
3
+ size 657423685
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7d23eaa24906ba4d3625f1194a5496f9042b12a265b4e39bc39df570d473851
3
+ size 328721273
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:191fa8bc45c5a8971d5fe7c55e541bbde67d9e26cbdd7014604ce098c7889fe4
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5cdb69e9300a3691c36bdd8e1eae0c655ce61ed34a28580d6828d75ac320681
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.95603827256271,
5
+ "global_step": 38500,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.13,
12
+ "learning_rate": 1.9741401603310062e-05,
13
+ "loss": 1.9914,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.26,
18
+ "learning_rate": 1.948280320662012e-05,
19
+ "loss": 1.606,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.39,
24
+ "learning_rate": 1.922420480993018e-05,
25
+ "loss": 1.4903,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.52,
30
+ "learning_rate": 1.896560641324024e-05,
31
+ "loss": 1.4065,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.65,
36
+ "learning_rate": 1.87070080165503e-05,
37
+ "loss": 1.3202,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.78,
42
+ "learning_rate": 1.844840961986036e-05,
43
+ "loss": 1.288,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.91,
48
+ "learning_rate": 1.8189811223170417e-05,
49
+ "loss": 1.2592,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 1.0,
54
+ "eval_loss": 1.1307439804077148,
55
+ "eval_runtime": 12.893,
56
+ "eval_samples_per_second": 1193.128,
57
+ "eval_steps_per_second": 149.151,
58
+ "step": 3867
59
+ },
60
+ {
61
+ "epoch": 1.03,
62
+ "learning_rate": 1.7931212826480477e-05,
63
+ "loss": 1.2107,
64
+ "step": 4000
65
+ },
66
+ {
67
+ "epoch": 1.16,
68
+ "learning_rate": 1.7672614429790537e-05,
69
+ "loss": 1.1851,
70
+ "step": 4500
71
+ },
72
+ {
73
+ "epoch": 1.29,
74
+ "learning_rate": 1.7414016033100598e-05,
75
+ "loss": 1.1513,
76
+ "step": 5000
77
+ },
78
+ {
79
+ "epoch": 1.42,
80
+ "learning_rate": 1.7155417636410658e-05,
81
+ "loss": 1.1181,
82
+ "step": 5500
83
+ },
84
+ {
85
+ "epoch": 1.55,
86
+ "learning_rate": 1.6896819239720715e-05,
87
+ "loss": 1.0948,
88
+ "step": 6000
89
+ },
90
+ {
91
+ "epoch": 1.68,
92
+ "learning_rate": 1.6638220843030775e-05,
93
+ "loss": 1.0955,
94
+ "step": 6500
95
+ },
96
+ {
97
+ "epoch": 1.81,
98
+ "learning_rate": 1.6379622446340835e-05,
99
+ "loss": 1.0562,
100
+ "step": 7000
101
+ },
102
+ {
103
+ "epoch": 1.94,
104
+ "learning_rate": 1.6121024049650892e-05,
105
+ "loss": 1.0578,
106
+ "step": 7500
107
+ },
108
+ {
109
+ "epoch": 2.0,
110
+ "eval_loss": 0.9538503885269165,
111
+ "eval_runtime": 12.8064,
112
+ "eval_samples_per_second": 1201.199,
113
+ "eval_steps_per_second": 150.16,
114
+ "step": 7734
115
+ },
116
+ {
117
+ "epoch": 2.07,
118
+ "learning_rate": 1.5862425652960952e-05,
119
+ "loss": 1.0484,
120
+ "step": 8000
121
+ },
122
+ {
123
+ "epoch": 2.2,
124
+ "learning_rate": 1.5603827256271013e-05,
125
+ "loss": 1.0226,
126
+ "step": 8500
127
+ },
128
+ {
129
+ "epoch": 2.33,
130
+ "learning_rate": 1.5345228859581073e-05,
131
+ "loss": 1.0189,
132
+ "step": 9000
133
+ },
134
+ {
135
+ "epoch": 2.46,
136
+ "learning_rate": 1.5086630462891131e-05,
137
+ "loss": 0.9812,
138
+ "step": 9500
139
+ },
140
+ {
141
+ "epoch": 2.59,
142
+ "learning_rate": 1.482803206620119e-05,
143
+ "loss": 0.9543,
144
+ "step": 10000
145
+ },
146
+ {
147
+ "epoch": 2.72,
148
+ "learning_rate": 1.456943366951125e-05,
149
+ "loss": 0.9917,
150
+ "step": 10500
151
+ },
152
+ {
153
+ "epoch": 2.84,
154
+ "learning_rate": 1.431083527282131e-05,
155
+ "loss": 0.9423,
156
+ "step": 11000
157
+ },
158
+ {
159
+ "epoch": 2.97,
160
+ "learning_rate": 1.4052236876131369e-05,
161
+ "loss": 0.9628,
162
+ "step": 11500
163
+ },
164
+ {
165
+ "epoch": 3.0,
166
+ "eval_loss": 0.8621994256973267,
167
+ "eval_runtime": 12.1534,
168
+ "eval_samples_per_second": 1265.738,
169
+ "eval_steps_per_second": 158.228,
170
+ "step": 11601
171
+ },
172
+ {
173
+ "epoch": 3.1,
174
+ "learning_rate": 1.379363847944143e-05,
175
+ "loss": 0.9401,
176
+ "step": 12000
177
+ },
178
+ {
179
+ "epoch": 3.23,
180
+ "learning_rate": 1.3535040082751488e-05,
181
+ "loss": 0.9208,
182
+ "step": 12500
183
+ },
184
+ {
185
+ "epoch": 3.36,
186
+ "learning_rate": 1.3276441686061548e-05,
187
+ "loss": 0.9068,
188
+ "step": 13000
189
+ },
190
+ {
191
+ "epoch": 3.49,
192
+ "learning_rate": 1.3017843289371609e-05,
193
+ "loss": 0.9114,
194
+ "step": 13500
195
+ },
196
+ {
197
+ "epoch": 3.62,
198
+ "learning_rate": 1.2759244892681665e-05,
199
+ "loss": 0.9146,
200
+ "step": 14000
201
+ },
202
+ {
203
+ "epoch": 3.75,
204
+ "learning_rate": 1.2500646495991726e-05,
205
+ "loss": 0.8964,
206
+ "step": 14500
207
+ },
208
+ {
209
+ "epoch": 3.88,
210
+ "learning_rate": 1.2242048099301784e-05,
211
+ "loss": 0.9084,
212
+ "step": 15000
213
+ },
214
+ {
215
+ "epoch": 4.0,
216
+ "eval_loss": 0.8124715685844421,
217
+ "eval_runtime": 12.1415,
218
+ "eval_samples_per_second": 1266.974,
219
+ "eval_steps_per_second": 158.382,
220
+ "step": 15468
221
+ },
222
+ {
223
+ "epoch": 4.01,
224
+ "learning_rate": 1.1983449702611844e-05,
225
+ "loss": 0.8739,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 4.14,
230
+ "learning_rate": 1.1724851305921905e-05,
231
+ "loss": 0.8625,
232
+ "step": 16000
233
+ },
234
+ {
235
+ "epoch": 4.27,
236
+ "learning_rate": 1.1466252909231963e-05,
237
+ "loss": 0.8507,
238
+ "step": 16500
239
+ },
240
+ {
241
+ "epoch": 4.4,
242
+ "learning_rate": 1.1207654512542024e-05,
243
+ "loss": 0.8759,
244
+ "step": 17000
245
+ },
246
+ {
247
+ "epoch": 4.53,
248
+ "learning_rate": 1.0949056115852084e-05,
249
+ "loss": 0.8383,
250
+ "step": 17500
251
+ },
252
+ {
253
+ "epoch": 4.65,
254
+ "learning_rate": 1.0690457719162142e-05,
255
+ "loss": 0.8487,
256
+ "step": 18000
257
+ },
258
+ {
259
+ "epoch": 4.78,
260
+ "learning_rate": 1.0431859322472203e-05,
261
+ "loss": 0.8595,
262
+ "step": 18500
263
+ },
264
+ {
265
+ "epoch": 4.91,
266
+ "learning_rate": 1.017326092578226e-05,
267
+ "loss": 0.8374,
268
+ "step": 19000
269
+ },
270
+ {
271
+ "epoch": 5.0,
272
+ "eval_loss": 0.7769160270690918,
273
+ "eval_runtime": 12.8439,
274
+ "eval_samples_per_second": 1197.691,
275
+ "eval_steps_per_second": 149.721,
276
+ "step": 19335
277
+ },
278
+ {
279
+ "epoch": 5.04,
280
+ "learning_rate": 9.91466252909232e-06,
281
+ "loss": 0.8352,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 5.17,
286
+ "learning_rate": 9.65606413240238e-06,
287
+ "loss": 0.8663,
288
+ "step": 20000
289
+ },
290
+ {
291
+ "epoch": 5.3,
292
+ "learning_rate": 9.397465735712439e-06,
293
+ "loss": 0.8169,
294
+ "step": 20500
295
+ },
296
+ {
297
+ "epoch": 5.43,
298
+ "learning_rate": 9.138867339022499e-06,
299
+ "loss": 0.8187,
300
+ "step": 21000
301
+ },
302
+ {
303
+ "epoch": 5.56,
304
+ "learning_rate": 8.880268942332558e-06,
305
+ "loss": 0.8153,
306
+ "step": 21500
307
+ },
308
+ {
309
+ "epoch": 5.69,
310
+ "learning_rate": 8.621670545642618e-06,
311
+ "loss": 0.8133,
312
+ "step": 22000
313
+ },
314
+ {
315
+ "epoch": 5.82,
316
+ "learning_rate": 8.363072148952676e-06,
317
+ "loss": 0.7901,
318
+ "step": 22500
319
+ },
320
+ {
321
+ "epoch": 5.95,
322
+ "learning_rate": 8.104473752262737e-06,
323
+ "loss": 0.8139,
324
+ "step": 23000
325
+ },
326
+ {
327
+ "epoch": 6.0,
328
+ "eval_loss": 0.7311471104621887,
329
+ "eval_runtime": 12.8348,
330
+ "eval_samples_per_second": 1198.535,
331
+ "eval_steps_per_second": 149.827,
332
+ "step": 23202
333
+ },
334
+ {
335
+ "epoch": 6.08,
336
+ "learning_rate": 7.845875355572797e-06,
337
+ "loss": 0.7811,
338
+ "step": 23500
339
+ },
340
+ {
341
+ "epoch": 6.21,
342
+ "learning_rate": 7.5872769588828555e-06,
343
+ "loss": 0.8063,
344
+ "step": 24000
345
+ },
346
+ {
347
+ "epoch": 6.34,
348
+ "learning_rate": 7.328678562192915e-06,
349
+ "loss": 0.785,
350
+ "step": 24500
351
+ },
352
+ {
353
+ "epoch": 6.46,
354
+ "learning_rate": 7.070080165502975e-06,
355
+ "loss": 0.7909,
356
+ "step": 25000
357
+ },
358
+ {
359
+ "epoch": 6.59,
360
+ "learning_rate": 6.811481768813034e-06,
361
+ "loss": 0.7727,
362
+ "step": 25500
363
+ },
364
+ {
365
+ "epoch": 6.72,
366
+ "learning_rate": 6.552883372123093e-06,
367
+ "loss": 0.7779,
368
+ "step": 26000
369
+ },
370
+ {
371
+ "epoch": 6.85,
372
+ "learning_rate": 6.294284975433153e-06,
373
+ "loss": 0.7627,
374
+ "step": 26500
375
+ },
376
+ {
377
+ "epoch": 6.98,
378
+ "learning_rate": 6.035686578743212e-06,
379
+ "loss": 0.7577,
380
+ "step": 27000
381
+ },
382
+ {
383
+ "epoch": 7.0,
384
+ "eval_loss": 0.6985681653022766,
385
+ "eval_runtime": 12.1148,
386
+ "eval_samples_per_second": 1269.772,
387
+ "eval_steps_per_second": 158.732,
388
+ "step": 27069
389
+ },
390
+ {
391
+ "epoch": 7.11,
392
+ "learning_rate": 5.777088182053272e-06,
393
+ "loss": 0.7761,
394
+ "step": 27500
395
+ },
396
+ {
397
+ "epoch": 7.24,
398
+ "learning_rate": 5.518489785363331e-06,
399
+ "loss": 0.7782,
400
+ "step": 28000
401
+ },
402
+ {
403
+ "epoch": 7.37,
404
+ "learning_rate": 5.25989138867339e-06,
405
+ "loss": 0.7335,
406
+ "step": 28500
407
+ },
408
+ {
409
+ "epoch": 7.5,
410
+ "learning_rate": 5.00129299198345e-06,
411
+ "loss": 0.7656,
412
+ "step": 29000
413
+ },
414
+ {
415
+ "epoch": 7.63,
416
+ "learning_rate": 4.742694595293509e-06,
417
+ "loss": 0.761,
418
+ "step": 29500
419
+ },
420
+ {
421
+ "epoch": 7.76,
422
+ "learning_rate": 4.484096198603569e-06,
423
+ "loss": 0.7517,
424
+ "step": 30000
425
+ },
426
+ {
427
+ "epoch": 7.89,
428
+ "learning_rate": 4.225497801913628e-06,
429
+ "loss": 0.75,
430
+ "step": 30500
431
+ },
432
+ {
433
+ "epoch": 8.0,
434
+ "eval_loss": 0.6888388991355896,
435
+ "eval_runtime": 12.1237,
436
+ "eval_samples_per_second": 1268.839,
437
+ "eval_steps_per_second": 158.615,
438
+ "step": 30936
439
+ },
440
+ {
441
+ "epoch": 8.02,
442
+ "learning_rate": 3.966899405223688e-06,
443
+ "loss": 0.7545,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 8.15,
448
+ "learning_rate": 3.708301008533747e-06,
449
+ "loss": 0.723,
450
+ "step": 31500
451
+ },
452
+ {
453
+ "epoch": 8.28,
454
+ "learning_rate": 3.449702611843807e-06,
455
+ "loss": 0.7352,
456
+ "step": 32000
457
+ },
458
+ {
459
+ "epoch": 8.4,
460
+ "learning_rate": 3.191104215153866e-06,
461
+ "loss": 0.7253,
462
+ "step": 32500
463
+ },
464
+ {
465
+ "epoch": 8.53,
466
+ "learning_rate": 2.932505818463926e-06,
467
+ "loss": 0.7248,
468
+ "step": 33000
469
+ },
470
+ {
471
+ "epoch": 8.66,
472
+ "learning_rate": 2.6739074217739853e-06,
473
+ "loss": 0.7371,
474
+ "step": 33500
475
+ },
476
+ {
477
+ "epoch": 8.79,
478
+ "learning_rate": 2.4153090250840447e-06,
479
+ "loss": 0.7284,
480
+ "step": 34000
481
+ },
482
+ {
483
+ "epoch": 8.92,
484
+ "learning_rate": 2.156710628394104e-06,
485
+ "loss": 0.7205,
486
+ "step": 34500
487
+ },
488
+ {
489
+ "epoch": 9.0,
490
+ "eval_loss": 0.6802728772163391,
491
+ "eval_runtime": 12.1115,
492
+ "eval_samples_per_second": 1270.111,
493
+ "eval_steps_per_second": 158.774,
494
+ "step": 34803
495
+ },
496
+ {
497
+ "epoch": 9.05,
498
+ "learning_rate": 1.8981122317041636e-06,
499
+ "loss": 0.7059,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 9.18,
504
+ "learning_rate": 1.6395138350142232e-06,
505
+ "loss": 0.7321,
506
+ "step": 35500
507
+ },
508
+ {
509
+ "epoch": 9.31,
510
+ "learning_rate": 1.3809154383242826e-06,
511
+ "loss": 0.7392,
512
+ "step": 36000
513
+ },
514
+ {
515
+ "epoch": 9.44,
516
+ "learning_rate": 1.122317041634342e-06,
517
+ "loss": 0.7314,
518
+ "step": 36500
519
+ },
520
+ {
521
+ "epoch": 9.57,
522
+ "learning_rate": 8.637186449444015e-07,
523
+ "loss": 0.73,
524
+ "step": 37000
525
+ },
526
+ {
527
+ "epoch": 9.7,
528
+ "learning_rate": 6.051202482544609e-07,
529
+ "loss": 0.7199,
530
+ "step": 37500
531
+ },
532
+ {
533
+ "epoch": 9.83,
534
+ "learning_rate": 3.465218515645203e-07,
535
+ "loss": 0.7258,
536
+ "step": 38000
537
+ },
538
+ {
539
+ "epoch": 9.96,
540
+ "learning_rate": 8.792345487457979e-08,
541
+ "loss": 0.7562,
542
+ "step": 38500
543
+ }
544
+ ],
545
+ "max_steps": 38670,
546
+ "num_train_epochs": 10,
547
+ "total_flos": 1994099670728550.0,
548
+ "trial_name": null,
549
+ "trial_params": null
550
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e865e111c033ba88540c2ee9cd8f8ff3925894b1823fb5e55c1dfac2c22959ef
3
+ size 3899