ndaheim commited on
Commit
8ad164d
1 Parent(s): 7526f27

initial commit

Browse files
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
29
+ training_args.bin filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<user>": 50273, "<knowledge_sep>": 50271, "<Other>": 50268, "<Hint/Information_Reveal>": 50267, "<Question>": 50269, "<Correction>": 50266, "<knowledge_tag>": 50272, "<Confirmation>": 50265, "<agent>": 50270}
config.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/bart-base",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "BartForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.1,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 6,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 12,
23
+ "encoder_ffn_dim": 3072,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 6,
26
+ "eos_token_id": 2,
27
+ "forced_bos_token_id": 0,
28
+ "forced_eos_token_id": 2,
29
+ "gradient_checkpointing": false,
30
+ "id2label": {
31
+ "0": "LABEL_0",
32
+ "1": "LABEL_1",
33
+ "2": "LABEL_2"
34
+ },
35
+ "init_std": 0.02,
36
+ "is_encoder_decoder": true,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1,
40
+ "LABEL_2": 2
41
+ },
42
+ "max_length": 60,
43
+ "max_position_embeddings": 1024,
44
+ "model_type": "bart",
45
+ "no_repeat_ngram_size": 3,
46
+ "normalize_before": false,
47
+ "normalize_embedding": true,
48
+ "num_beams": 10,
49
+ "num_hidden_layers": 6,
50
+ "pad_token_id": 1,
51
+ "scale_embedding": false,
52
+ "task_specific_params": {
53
+ "summarization": {
54
+ "length_penalty": 1.0,
55
+ "max_length": 128,
56
+ "min_length": 12,
57
+ "num_beams": 4
58
+ },
59
+ "summarization_cnn": {
60
+ "length_penalty": 2.0,
61
+ "max_length": 142,
62
+ "min_length": 56,
63
+ "num_beams": 4
64
+ },
65
+ "summarization_xsum": {
66
+ "length_penalty": 1.0,
67
+ "max_length": 62,
68
+ "min_length": 11,
69
+ "num_beams": 6
70
+ }
71
+ },
72
+ "torch_dtype": "float32",
73
+ "transformers_version": "4.9.0",
74
+ "uid_regularization": 0.0,
75
+ "use_cache": true,
76
+ "vocab_size": 50274
77
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9da260309d32e347bcddd04513805ab97fbdd5dcb8697e3b4457015beeb764f1
3
+ size 1115581221
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:651ce0c707bd4267aa6895e4822d3d361a3a0f88fa335a167501deda59592ee1
3
+ size 558013395
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4023d58fe9b64875419226c1e2779de7fe40dfe32dec6e4b9a8dce64f042f152
3
+ size 14593
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bb9bdc4c1c89f937942a1165786f8e682f0330541a622942e057de47622afdf
3
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}, "additional_special_tokens": ["<Confirmation>", "<Correction>", "<Hint/Information_Reveal>", "<Other>", "<Question>", "<agent>", "<knowledge_sep>", "<knowledge_tag>", "<user>"]}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "facebook/bart-base", "tokenizer_class": "BartTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 16.99889502762431,
5
+ "global_step": 5763,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.38,
12
+ "gpu_memory": 2825061888,
13
+ "learning_rate": 8.32e-06,
14
+ "loss": 4.6062,
15
+ "step": 128
16
+ },
17
+ {
18
+ "epoch": 0.75,
19
+ "gpu_memory": 2903643648,
20
+ "learning_rate": 1.664e-05,
21
+ "loss": 2.7746,
22
+ "step": 256
23
+ },
24
+ {
25
+ "epoch": 1.0,
26
+ "eval_bp": 0.021341648192077716,
27
+ "eval_counts": [
28
+ 342,
29
+ 58,
30
+ 18,
31
+ 6
32
+ ],
33
+ "eval_loss": 2.035790205001831,
34
+ "eval_precisions": [
35
+ 34.862385321100916,
36
+ 8.516886930983848,
37
+ 4.651162790697675,
38
+ 2.3529411764705883
39
+ ],
40
+ "eval_ref_len": 4755,
41
+ "eval_runtime": 35.7733,
42
+ "eval_samples_per_second": 8.386,
43
+ "eval_score": 0.16113155714674393,
44
+ "eval_steps_per_second": 8.386,
45
+ "eval_sys_len": 981,
46
+ "eval_totals": [
47
+ 981,
48
+ 681,
49
+ 387,
50
+ 255
51
+ ],
52
+ "gpu_memory": 2903643648,
53
+ "step": 339
54
+ },
55
+ {
56
+ "epoch": 1.13,
57
+ "gpu_memory": 2903643648,
58
+ "learning_rate": 2.4959999999999998e-05,
59
+ "loss": 2.2201,
60
+ "step": 384
61
+ },
62
+ {
63
+ "epoch": 1.51,
64
+ "gpu_memory": 2903643648,
65
+ "learning_rate": 3.2437898089171974e-05,
66
+ "loss": 1.9599,
67
+ "step": 512
68
+ },
69
+ {
70
+ "epoch": 1.89,
71
+ "gpu_memory": 2903643648,
72
+ "learning_rate": 3.1775477707006364e-05,
73
+ "loss": 1.8228,
74
+ "step": 640
75
+ },
76
+ {
77
+ "epoch": 2.0,
78
+ "eval_bp": 0.1919535866757935,
79
+ "eval_counts": [
80
+ 640,
81
+ 199,
82
+ 91,
83
+ 36
84
+ ],
85
+ "eval_loss": 1.740516185760498,
86
+ "eval_precisions": [
87
+ 35.67447045707915,
88
+ 13.319946452476573,
89
+ 7.526881720430108,
90
+ 3.896103896103896
91
+ ],
92
+ "eval_ref_len": 4755,
93
+ "eval_runtime": 51.5291,
94
+ "eval_samples_per_second": 5.822,
95
+ "eval_score": 2.0855597670386987,
96
+ "eval_steps_per_second": 5.822,
97
+ "eval_sys_len": 1794,
98
+ "eval_totals": [
99
+ 1794,
100
+ 1494,
101
+ 1209,
102
+ 924
103
+ ],
104
+ "gpu_memory": 2903643648,
105
+ "step": 678
106
+ },
107
+ {
108
+ "epoch": 2.27,
109
+ "gpu_memory": 2903643648,
110
+ "learning_rate": 3.111305732484076e-05,
111
+ "loss": 1.7275,
112
+ "step": 768
113
+ },
114
+ {
115
+ "epoch": 2.64,
116
+ "gpu_memory": 2903643648,
117
+ "learning_rate": 3.0450636942675155e-05,
118
+ "loss": 1.614,
119
+ "step": 896
120
+ },
121
+ {
122
+ "epoch": 3.0,
123
+ "eval_bp": 0.157930307305936,
124
+ "eval_counts": [
125
+ 662,
126
+ 239,
127
+ 127,
128
+ 66
129
+ ],
130
+ "eval_loss": 1.6653738021850586,
131
+ "eval_precisions": [
132
+ 39.61699581089168,
133
+ 17.432530999270604,
134
+ 11.598173515981735,
135
+ 8.02919708029197
136
+ ],
137
+ "eval_ref_len": 4755,
138
+ "eval_runtime": 46.1755,
139
+ "eval_samples_per_second": 6.497,
140
+ "eval_score": 2.515019790343611,
141
+ "eval_steps_per_second": 6.497,
142
+ "eval_sys_len": 1671,
143
+ "eval_totals": [
144
+ 1671,
145
+ 1371,
146
+ 1095,
147
+ 822
148
+ ],
149
+ "gpu_memory": 2903643648,
150
+ "step": 1017
151
+ },
152
+ {
153
+ "epoch": 3.02,
154
+ "gpu_memory": 2903643648,
155
+ "learning_rate": 2.9788216560509553e-05,
156
+ "loss": 1.561,
157
+ "step": 1024
158
+ },
159
+ {
160
+ "epoch": 3.4,
161
+ "gpu_memory": 2903643648,
162
+ "learning_rate": 2.9125796178343946e-05,
163
+ "loss": 1.4029,
164
+ "step": 1152
165
+ },
166
+ {
167
+ "epoch": 3.77,
168
+ "gpu_memory": 2903643648,
169
+ "learning_rate": 2.8463375796178344e-05,
170
+ "loss": 1.4541,
171
+ "step": 1280
172
+ },
173
+ {
174
+ "epoch": 4.0,
175
+ "eval_bp": 0.06814983706797134,
176
+ "eval_counts": [
177
+ 481,
178
+ 162,
179
+ 79,
180
+ 37
181
+ ],
182
+ "eval_loss": 1.6631227731704712,
183
+ "eval_precisions": [
184
+ 37.286821705426355,
185
+ 16.363636363636363,
186
+ 10.881542699724518,
187
+ 6.630824372759856
188
+ ],
189
+ "eval_ref_len": 4755,
190
+ "eval_runtime": 45.6186,
191
+ "eval_samples_per_second": 6.576,
192
+ "eval_score": 0.9871612910485801,
193
+ "eval_steps_per_second": 6.576,
194
+ "eval_sys_len": 1290,
195
+ "eval_totals": [
196
+ 1290,
197
+ 990,
198
+ 726,
199
+ 558
200
+ ],
201
+ "gpu_memory": 2903643648,
202
+ "step": 1356
203
+ },
204
+ {
205
+ "epoch": 4.15,
206
+ "gpu_memory": 2903643648,
207
+ "learning_rate": 2.7800955414012737e-05,
208
+ "loss": 1.4088,
209
+ "step": 1408
210
+ },
211
+ {
212
+ "epoch": 4.53,
213
+ "gpu_memory": 2903643648,
214
+ "learning_rate": 2.713853503184713e-05,
215
+ "loss": 1.3351,
216
+ "step": 1536
217
+ },
218
+ {
219
+ "epoch": 4.91,
220
+ "gpu_memory": 2903643648,
221
+ "learning_rate": 2.647611464968153e-05,
222
+ "loss": 1.3229,
223
+ "step": 1664
224
+ },
225
+ {
226
+ "epoch": 5.0,
227
+ "eval_bp": 0.23640264658354365,
228
+ "eval_counts": [
229
+ 633,
230
+ 216,
231
+ 105,
232
+ 58
233
+ ],
234
+ "eval_loss": 1.6731408834457397,
235
+ "eval_precisions": [
236
+ 32.5115562403698,
237
+ 13.114754098360656,
238
+ 7.658643326039387,
239
+ 5.239385727190605
240
+ ],
241
+ "eval_ref_len": 4755,
242
+ "eval_runtime": 51.3595,
243
+ "eval_samples_per_second": 5.841,
244
+ "eval_score": 2.703708498377427,
245
+ "eval_steps_per_second": 5.841,
246
+ "eval_sys_len": 1947,
247
+ "eval_totals": [
248
+ 1947,
249
+ 1647,
250
+ 1371,
251
+ 1107
252
+ ],
253
+ "gpu_memory": 2903643648,
254
+ "step": 1695
255
+ },
256
+ {
257
+ "epoch": 5.29,
258
+ "gpu_memory": 2903643648,
259
+ "learning_rate": 2.5813694267515922e-05,
260
+ "loss": 1.2429,
261
+ "step": 1792
262
+ },
263
+ {
264
+ "epoch": 5.66,
265
+ "gpu_memory": 2903643648,
266
+ "learning_rate": 2.515127388535032e-05,
267
+ "loss": 1.2329,
268
+ "step": 1920
269
+ },
270
+ {
271
+ "epoch": 6.0,
272
+ "eval_bp": 0.07532276614122083,
273
+ "eval_counts": [
274
+ 579,
275
+ 202,
276
+ 98,
277
+ 55
278
+ ],
279
+ "eval_loss": 1.6539884805679321,
280
+ "eval_precisions": [
281
+ 43.665158371040725,
282
+ 19.68810916179337,
283
+ 13.01460823373174,
284
+ 9.499136442141623
285
+ ],
286
+ "eval_ref_len": 4755,
287
+ "eval_runtime": 46.8126,
288
+ "eval_samples_per_second": 6.409,
289
+ "eval_score": 1.3600028829560191,
290
+ "eval_steps_per_second": 6.409,
291
+ "eval_sys_len": 1326,
292
+ "eval_totals": [
293
+ 1326,
294
+ 1026,
295
+ 753,
296
+ 579
297
+ ],
298
+ "gpu_memory": 2903643648,
299
+ "step": 2034
300
+ },
301
+ {
302
+ "epoch": 6.04,
303
+ "gpu_memory": 2903643648,
304
+ "learning_rate": 2.4488853503184713e-05,
305
+ "loss": 1.2504,
306
+ "step": 2048
307
+ },
308
+ {
309
+ "epoch": 6.42,
310
+ "gpu_memory": 2903643648,
311
+ "learning_rate": 2.3826433121019104e-05,
312
+ "loss": 1.1421,
313
+ "step": 2176
314
+ },
315
+ {
316
+ "epoch": 6.8,
317
+ "gpu_memory": 2903643648,
318
+ "learning_rate": 2.31640127388535e-05,
319
+ "loss": 1.1795,
320
+ "step": 2304
321
+ },
322
+ {
323
+ "epoch": 7.0,
324
+ "eval_bp": 0.17181721996808308,
325
+ "eval_counts": [
326
+ 768,
327
+ 262,
328
+ 133,
329
+ 70
330
+ ],
331
+ "eval_loss": 1.667359471321106,
332
+ "eval_precisions": [
333
+ 44.599303135888505,
334
+ 18.424753867791843,
335
+ 11.697449428320141,
336
+ 8.018327605956472
337
+ ],
338
+ "eval_ref_len": 4755,
339
+ "eval_runtime": 50.5053,
340
+ "eval_samples_per_second": 5.94,
341
+ "eval_score": 2.862812289607837,
342
+ "eval_steps_per_second": 5.94,
343
+ "eval_sys_len": 1722,
344
+ "eval_totals": [
345
+ 1722,
346
+ 1422,
347
+ 1137,
348
+ 873
349
+ ],
350
+ "gpu_memory": 2903643648,
351
+ "step": 2373
352
+ },
353
+ {
354
+ "epoch": 7.17,
355
+ "gpu_memory": 2903643648,
356
+ "learning_rate": 2.2501592356687895e-05,
357
+ "loss": 1.0902,
358
+ "step": 2432
359
+ },
360
+ {
361
+ "epoch": 7.55,
362
+ "gpu_memory": 2903643648,
363
+ "learning_rate": 2.183917197452229e-05,
364
+ "loss": 1.0705,
365
+ "step": 2560
366
+ },
367
+ {
368
+ "epoch": 7.93,
369
+ "gpu_memory": 2903643648,
370
+ "learning_rate": 2.1176751592356686e-05,
371
+ "loss": 1.1128,
372
+ "step": 2688
373
+ },
374
+ {
375
+ "epoch": 8.0,
376
+ "eval_bp": 0.2669632643662467,
377
+ "eval_counts": [
378
+ 866,
379
+ 300,
380
+ 163,
381
+ 96
382
+ ],
383
+ "eval_loss": 1.708727240562439,
384
+ "eval_precisions": [
385
+ 42.26451927769644,
386
+ 17.152658662092623,
387
+ 11.20274914089347,
388
+ 8.226221079691516
389
+ ],
390
+ "eval_ref_len": 4755,
391
+ "eval_runtime": 53.4181,
392
+ "eval_samples_per_second": 5.616,
393
+ "eval_score": 4.291998839505449,
394
+ "eval_steps_per_second": 5.616,
395
+ "eval_sys_len": 2049,
396
+ "eval_totals": [
397
+ 2049,
398
+ 1749,
399
+ 1455,
400
+ 1167
401
+ ],
402
+ "gpu_memory": 2903643648,
403
+ "step": 2712
404
+ },
405
+ {
406
+ "epoch": 8.31,
407
+ "gpu_memory": 2903643648,
408
+ "learning_rate": 2.051433121019108e-05,
409
+ "loss": 1.0162,
410
+ "step": 2816
411
+ },
412
+ {
413
+ "epoch": 8.68,
414
+ "gpu_memory": 2903643648,
415
+ "learning_rate": 1.9851910828025477e-05,
416
+ "loss": 1.0183,
417
+ "step": 2944
418
+ },
419
+ {
420
+ "epoch": 9.0,
421
+ "eval_bp": 0.09731210069014802,
422
+ "eval_counts": [
423
+ 678,
424
+ 233,
425
+ 102,
426
+ 45
427
+ ],
428
+ "eval_loss": 1.7135441303253174,
429
+ "eval_precisions": [
430
+ 47.47899159663866,
431
+ 20.656028368794328,
432
+ 12.23021582733813,
433
+ 7.142857142857143
434
+ ],
435
+ "eval_ref_len": 4755,
436
+ "eval_runtime": 50.1778,
437
+ "eval_samples_per_second": 5.979,
438
+ "eval_score": 1.664870454299152,
439
+ "eval_steps_per_second": 5.979,
440
+ "eval_sys_len": 1428,
441
+ "eval_totals": [
442
+ 1428,
443
+ 1128,
444
+ 834,
445
+ 630
446
+ ],
447
+ "gpu_memory": 2903643648,
448
+ "step": 3051
449
+ },
450
+ {
451
+ "epoch": 9.06,
452
+ "gpu_memory": 2903643648,
453
+ "learning_rate": 1.918949044585987e-05,
454
+ "loss": 1.0367,
455
+ "step": 3072
456
+ },
457
+ {
458
+ "epoch": 9.44,
459
+ "gpu_memory": 2903643648,
460
+ "learning_rate": 1.8527070063694264e-05,
461
+ "loss": 0.9645,
462
+ "step": 3200
463
+ },
464
+ {
465
+ "epoch": 9.82,
466
+ "gpu_memory": 2903643648,
467
+ "learning_rate": 1.786464968152866e-05,
468
+ "loss": 0.9616,
469
+ "step": 3328
470
+ },
471
+ {
472
+ "epoch": 10.0,
473
+ "eval_bp": 0.22930577411313655,
474
+ "eval_counts": [
475
+ 768,
476
+ 280,
477
+ 145,
478
+ 80
479
+ ],
480
+ "eval_loss": 1.736754298210144,
481
+ "eval_precisions": [
482
+ 39.93759750390016,
483
+ 17.25200246457178,
484
+ 10.837070254110612,
485
+ 7.428040854224698
486
+ ],
487
+ "eval_ref_len": 4755,
488
+ "eval_runtime": 57.956,
489
+ "eval_samples_per_second": 5.176,
490
+ "eval_score": 3.518980787396955,
491
+ "eval_steps_per_second": 5.176,
492
+ "eval_sys_len": 1923,
493
+ "eval_totals": [
494
+ 1923,
495
+ 1623,
496
+ 1338,
497
+ 1077
498
+ ],
499
+ "gpu_memory": 2903643648,
500
+ "step": 3390
501
+ },
502
+ {
503
+ "epoch": 10.19,
504
+ "gpu_memory": 2903643648,
505
+ "learning_rate": 1.7202229299363055e-05,
506
+ "loss": 0.9403,
507
+ "step": 3456
508
+ },
509
+ {
510
+ "epoch": 10.57,
511
+ "gpu_memory": 2903643648,
512
+ "learning_rate": 1.6539808917197452e-05,
513
+ "loss": 0.9059,
514
+ "step": 3584
515
+ },
516
+ {
517
+ "epoch": 10.95,
518
+ "gpu_memory": 2903643648,
519
+ "learning_rate": 1.5877388535031846e-05,
520
+ "loss": 0.9249,
521
+ "step": 3712
522
+ },
523
+ {
524
+ "epoch": 11.0,
525
+ "eval_bp": 0.1751321349922995,
526
+ "eval_counts": [
527
+ 748,
528
+ 240,
529
+ 115,
530
+ 63
531
+ ],
532
+ "eval_loss": 1.782728672027588,
533
+ "eval_precisions": [
534
+ 43.13725490196079,
535
+ 16.736401673640167,
536
+ 10.008703220191471,
537
+ 7.11864406779661
538
+ ],
539
+ "eval_ref_len": 4755,
540
+ "eval_runtime": 54.5903,
541
+ "eval_samples_per_second": 5.495,
542
+ "eval_score": 2.6374744638290037,
543
+ "eval_steps_per_second": 5.495,
544
+ "eval_sys_len": 1734,
545
+ "eval_totals": [
546
+ 1734,
547
+ 1434,
548
+ 1149,
549
+ 885
550
+ ],
551
+ "gpu_memory": 2903643648,
552
+ "step": 3729
553
+ },
554
+ {
555
+ "epoch": 11.33,
556
+ "gpu_memory": 2903643648,
557
+ "learning_rate": 1.5214968152866242e-05,
558
+ "loss": 0.8587,
559
+ "step": 3840
560
+ },
561
+ {
562
+ "epoch": 11.7,
563
+ "gpu_memory": 2903643648,
564
+ "learning_rate": 1.4552547770700635e-05,
565
+ "loss": 0.8739,
566
+ "step": 3968
567
+ },
568
+ {
569
+ "epoch": 12.0,
570
+ "eval_bp": 0.1555153512571023,
571
+ "eval_counts": [
572
+ 739,
573
+ 267,
574
+ 125,
575
+ 60
576
+ ],
577
+ "eval_loss": 1.8148356676101685,
578
+ "eval_precisions": [
579
+ 44.46450060168472,
580
+ 19.60352422907489,
581
+ 11.671335200746965,
582
+ 7.462686567164179
583
+ ],
584
+ "eval_ref_len": 4755,
585
+ "eval_runtime": 53.3032,
586
+ "eval_samples_per_second": 5.628,
587
+ "eval_score": 2.581452241674501,
588
+ "eval_steps_per_second": 5.628,
589
+ "eval_sys_len": 1662,
590
+ "eval_totals": [
591
+ 1662,
592
+ 1362,
593
+ 1071,
594
+ 804
595
+ ],
596
+ "gpu_memory": 2903643648,
597
+ "step": 4068
598
+ },
599
+ {
600
+ "epoch": 12.08,
601
+ "gpu_memory": 2903643648,
602
+ "learning_rate": 1.3890127388535031e-05,
603
+ "loss": 0.8413,
604
+ "step": 4096
605
+ },
606
+ {
607
+ "epoch": 12.46,
608
+ "gpu_memory": 2903643648,
609
+ "learning_rate": 1.3227707006369426e-05,
610
+ "loss": 0.8195,
611
+ "step": 4224
612
+ },
613
+ {
614
+ "epoch": 12.84,
615
+ "gpu_memory": 2903643648,
616
+ "learning_rate": 1.2565286624203822e-05,
617
+ "loss": 0.823,
618
+ "step": 4352
619
+ },
620
+ {
621
+ "epoch": 13.0,
622
+ "eval_bp": 0.2187397058134024,
623
+ "eval_counts": [
624
+ 843,
625
+ 326,
626
+ 173,
627
+ 91
628
+ ],
629
+ "eval_loss": 1.8146471977233887,
630
+ "eval_precisions": [
631
+ 44.67408585055644,
632
+ 20.5419029615627,
633
+ 13.442113442113442,
634
+ 9.027777777777779
635
+ ],
636
+ "eval_ref_len": 4755,
637
+ "eval_runtime": 55.2439,
638
+ "eval_samples_per_second": 5.43,
639
+ "eval_score": 3.995892671984357,
640
+ "eval_steps_per_second": 5.43,
641
+ "eval_sys_len": 1887,
642
+ "eval_totals": [
643
+ 1887,
644
+ 1587,
645
+ 1287,
646
+ 1008
647
+ ],
648
+ "gpu_memory": 2903643648,
649
+ "step": 4407
650
+ },
651
+ {
652
+ "epoch": 13.22,
653
+ "gpu_memory": 2903643648,
654
+ "learning_rate": 1.1902866242038214e-05,
655
+ "loss": 0.7992,
656
+ "step": 4480
657
+ },
658
+ {
659
+ "epoch": 13.59,
660
+ "gpu_memory": 2903643648,
661
+ "learning_rate": 1.124044585987261e-05,
662
+ "loss": 0.7702,
663
+ "step": 4608
664
+ },
665
+ {
666
+ "epoch": 13.97,
667
+ "gpu_memory": 2903643648,
668
+ "learning_rate": 1.0578025477707005e-05,
669
+ "loss": 0.7824,
670
+ "step": 4736
671
+ },
672
+ {
673
+ "epoch": 14.0,
674
+ "eval_bp": 0.16524048903893263,
675
+ "eval_counts": [
676
+ 719,
677
+ 244,
678
+ 108,
679
+ 52
680
+ ],
681
+ "eval_loss": 1.8748054504394531,
682
+ "eval_precisions": [
683
+ 42.34393404004712,
684
+ 17.453505007153076,
685
+ 9.72972972972973,
686
+ 6.081871345029239
687
+ ],
688
+ "eval_ref_len": 4755,
689
+ "eval_runtime": 54.7238,
690
+ "eval_samples_per_second": 5.482,
691
+ "eval_score": 2.389568242739576,
692
+ "eval_steps_per_second": 5.482,
693
+ "eval_sys_len": 1698,
694
+ "eval_totals": [
695
+ 1698,
696
+ 1398,
697
+ 1110,
698
+ 855
699
+ ],
700
+ "gpu_memory": 2903643648,
701
+ "step": 4746
702
+ },
703
+ {
704
+ "epoch": 14.35,
705
+ "gpu_memory": 2903643648,
706
+ "learning_rate": 9.9156050955414e-06,
707
+ "loss": 0.7425,
708
+ "step": 4864
709
+ },
710
+ {
711
+ "epoch": 14.72,
712
+ "gpu_memory": 2903643648,
713
+ "learning_rate": 9.253184713375794e-06,
714
+ "loss": 0.7501,
715
+ "step": 4992
716
+ },
717
+ {
718
+ "epoch": 15.0,
719
+ "eval_bp": 0.1953640836862138,
720
+ "eval_counts": [
721
+ 762,
722
+ 263,
723
+ 131,
724
+ 74
725
+ ],
726
+ "eval_loss": 1.9026106595993042,
727
+ "eval_precisions": [
728
+ 42.19269102990033,
729
+ 17.46347941567065,
730
+ 10.835401157981803,
731
+ 7.781282860147213
732
+ ],
733
+ "eval_ref_len": 4755,
734
+ "eval_runtime": 56.8759,
735
+ "eval_samples_per_second": 5.275,
736
+ "eval_score": 3.0843295492719487,
737
+ "eval_steps_per_second": 5.275,
738
+ "eval_sys_len": 1806,
739
+ "eval_totals": [
740
+ 1806,
741
+ 1506,
742
+ 1209,
743
+ 951
744
+ ],
745
+ "gpu_memory": 2903643648,
746
+ "step": 5085
747
+ },
748
+ {
749
+ "epoch": 15.1,
750
+ "gpu_memory": 2903643648,
751
+ "learning_rate": 8.59076433121019e-06,
752
+ "loss": 0.7315,
753
+ "step": 5120
754
+ },
755
+ {
756
+ "epoch": 15.48,
757
+ "gpu_memory": 2903643648,
758
+ "learning_rate": 7.928343949044585e-06,
759
+ "loss": 0.7011,
760
+ "step": 5248
761
+ },
762
+ {
763
+ "epoch": 15.86,
764
+ "gpu_memory": 2903643648,
765
+ "learning_rate": 7.265923566878981e-06,
766
+ "loss": 0.7139,
767
+ "step": 5376
768
+ },
769
+ {
770
+ "epoch": 16.0,
771
+ "eval_bp": 0.23551335586741148,
772
+ "eval_counts": [
773
+ 816,
774
+ 277,
775
+ 129,
776
+ 72
777
+ ],
778
+ "eval_loss": 1.9286922216415405,
779
+ "eval_precisions": [
780
+ 41.97530864197531,
781
+ 16.849148418491485,
782
+ 9.57683741648107,
783
+ 6.70391061452514
784
+ ],
785
+ "eval_ref_len": 4755,
786
+ "eval_runtime": 58.3566,
787
+ "eval_samples_per_second": 5.141,
788
+ "eval_score": 3.4379225352028846,
789
+ "eval_steps_per_second": 5.141,
790
+ "eval_sys_len": 1944,
791
+ "eval_totals": [
792
+ 1944,
793
+ 1644,
794
+ 1347,
795
+ 1074
796
+ ],
797
+ "gpu_memory": 2903643648,
798
+ "step": 5424
799
+ },
800
+ {
801
+ "epoch": 16.24,
802
+ "gpu_memory": 2903643648,
803
+ "learning_rate": 6.6035031847133755e-06,
804
+ "loss": 0.689,
805
+ "step": 5504
806
+ },
807
+ {
808
+ "epoch": 16.61,
809
+ "gpu_memory": 2903643648,
810
+ "learning_rate": 5.94108280254777e-06,
811
+ "loss": 0.6788,
812
+ "step": 5632
813
+ },
814
+ {
815
+ "epoch": 16.99,
816
+ "gpu_memory": 2903643648,
817
+ "learning_rate": 5.278662420382165e-06,
818
+ "loss": 0.7053,
819
+ "step": 5760
820
+ },
821
+ {
822
+ "epoch": 17.0,
823
+ "eval_bp": 0.2934278208519596,
824
+ "eval_counts": [
825
+ 886,
826
+ 340,
827
+ 171,
828
+ 99
829
+ ],
830
+ "eval_loss": 1.9354726076126099,
831
+ "eval_precisions": [
832
+ 41.47940074906367,
833
+ 18.51851851851852,
834
+ 11.089494163424124,
835
+ 7.746478873239437
836
+ ],
837
+ "eval_ref_len": 4755,
838
+ "eval_runtime": 60.6492,
839
+ "eval_samples_per_second": 4.946,
840
+ "eval_score": 4.702891790634525,
841
+ "eval_steps_per_second": 4.946,
842
+ "eval_sys_len": 2136,
843
+ "eval_totals": [
844
+ 2136,
845
+ 1836,
846
+ 1542,
847
+ 1278
848
+ ],
849
+ "gpu_memory": 2903643648,
850
+ "step": 5763
851
+ }
852
+ ],
853
+ "max_steps": 6780,
854
+ "num_train_epochs": 20,
855
+ "total_flos": 1765580040806400.0,
856
+ "trial_name": null,
857
+ "trial_params": null
858
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e786bd1ef537a382285a95ac7f452a91d1521b490bec8b1018dacd87e6012b0b
3
+ size 2927
vocab.json ADDED
The diff for this file is too large to render. See raw diff