josecannete commited on
Commit
6d12749
·
1 Parent(s): 201eed7

adding model finetuned on PAWS-X

Browse files
all_results.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.809499979019165,
4
+ "eval_loss": 0.42281684279441833,
5
+ "eval_runtime": 0.5205,
6
+ "eval_samples": 2000,
7
+ "eval_samples_per_second": 3842.632,
8
+ "eval_steps_per_second": 240.165,
9
+ "train_loss": 0.24578420763806358,
10
+ "train_runtime": 1895.6753,
11
+ "train_samples": 49401,
12
+ "train_samples_per_second": 104.239,
13
+ "train_steps_per_second": 6.516
14
+ }
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "CenIA/albert_tiny_spanish",
3
+ "architectures": [
4
+ "AlbertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0,
7
+ "bos_token_id": 2,
8
+ "classifier_dropout_prob": 0.1,
9
+ "down_scale_factor": 1,
10
+ "embedding_size": 128,
11
+ "eos_token_id": 3,
12
+ "gap_size": 0,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0,
15
+ "hidden_size": 312,
16
+ "initializer_range": 0.02,
17
+ "inner_group_num": 1,
18
+ "intermediate_size": 1248,
19
+ "layer_norm_eps": 1e-12,
20
+ "max_position_embeddings": 512,
21
+ "model_type": "albert",
22
+ "net_structure_type": 0,
23
+ "num_attention_heads": 12,
24
+ "num_hidden_groups": 1,
25
+ "num_hidden_layers": 4,
26
+ "num_memory_blocks": 0,
27
+ "pad_token_id": 0,
28
+ "position_embedding_type": "absolute",
29
+ "problem_type": "single_label_classification",
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.12.5",
32
+ "type_vocab_size": 2,
33
+ "vocab_size": 31000
34
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.809499979019165,
4
+ "eval_loss": 0.42281684279441833,
5
+ "eval_runtime": 0.5205,
6
+ "eval_samples": 2000,
7
+ "eval_samples_per_second": 3842.632,
8
+ "eval_steps_per_second": 240.165
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e72f236fd70a343cc55abe6e3b8f1de6073b85f9de8c82c4b24fd96b8b59a9
3
+ size 21393937
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "CenIA/albert_tiny_spanish", "tokenizer_class": "AlbertTokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "train_loss": 0.24578420763806358,
4
+ "train_runtime": 1895.6753,
5
+ "train_samples": 49401,
6
+ "train_samples_per_second": 104.239,
7
+ "train_steps_per_second": 6.516
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.42281684279441833,
3
+ "best_model_checkpoint": "/data/jcanete/all_results/pawsx/albeto_tiny/epochs_4_bs_16_lr_5e-5/checkpoint-3000",
4
+ "epoch": 4.0,
5
+ "global_step": 12352,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.1,
12
+ "eval_accuracy": 0.5764999985694885,
13
+ "eval_loss": 0.6765033006668091,
14
+ "eval_runtime": 0.5222,
15
+ "eval_samples_per_second": 3830.009,
16
+ "eval_steps_per_second": 239.376,
17
+ "step": 300
18
+ },
19
+ {
20
+ "epoch": 0.16,
21
+ "learning_rate": 4.798413212435233e-05,
22
+ "loss": 0.6642,
23
+ "step": 500
24
+ },
25
+ {
26
+ "epoch": 0.19,
27
+ "eval_accuracy": 0.7055000066757202,
28
+ "eval_loss": 0.5674529671669006,
29
+ "eval_runtime": 1.3054,
30
+ "eval_samples_per_second": 1532.102,
31
+ "eval_steps_per_second": 95.756,
32
+ "step": 600
33
+ },
34
+ {
35
+ "epoch": 0.29,
36
+ "eval_accuracy": 0.737500011920929,
37
+ "eval_loss": 0.5434138178825378,
38
+ "eval_runtime": 0.5173,
39
+ "eval_samples_per_second": 3866.263,
40
+ "eval_steps_per_second": 241.641,
41
+ "step": 900
42
+ },
43
+ {
44
+ "epoch": 0.32,
45
+ "learning_rate": 4.596421632124353e-05,
46
+ "loss": 0.5097,
47
+ "step": 1000
48
+ },
49
+ {
50
+ "epoch": 0.39,
51
+ "eval_accuracy": 0.7609999775886536,
52
+ "eval_loss": 0.523171603679657,
53
+ "eval_runtime": 1.4935,
54
+ "eval_samples_per_second": 1339.147,
55
+ "eval_steps_per_second": 83.697,
56
+ "step": 1200
57
+ },
58
+ {
59
+ "epoch": 0.49,
60
+ "learning_rate": 4.394025259067357e-05,
61
+ "loss": 0.4473,
62
+ "step": 1500
63
+ },
64
+ {
65
+ "epoch": 0.49,
66
+ "eval_accuracy": 0.7864999771118164,
67
+ "eval_loss": 0.46667101979255676,
68
+ "eval_runtime": 0.5272,
69
+ "eval_samples_per_second": 3793.714,
70
+ "eval_steps_per_second": 237.107,
71
+ "step": 1500
72
+ },
73
+ {
74
+ "epoch": 0.58,
75
+ "eval_accuracy": 0.8009999990463257,
76
+ "eval_loss": 0.4371644854545593,
77
+ "eval_runtime": 0.5192,
78
+ "eval_samples_per_second": 3852.095,
79
+ "eval_steps_per_second": 240.756,
80
+ "step": 1800
81
+ },
82
+ {
83
+ "epoch": 0.65,
84
+ "learning_rate": 4.191628886010363e-05,
85
+ "loss": 0.3934,
86
+ "step": 2000
87
+ },
88
+ {
89
+ "epoch": 0.68,
90
+ "eval_accuracy": 0.7894999980926514,
91
+ "eval_loss": 0.4638134837150574,
92
+ "eval_runtime": 0.5279,
93
+ "eval_samples_per_second": 3788.294,
94
+ "eval_steps_per_second": 236.768,
95
+ "step": 2100
96
+ },
97
+ {
98
+ "epoch": 0.78,
99
+ "eval_accuracy": 0.7885000109672546,
100
+ "eval_loss": 0.4593009650707245,
101
+ "eval_runtime": 0.5306,
102
+ "eval_samples_per_second": 3769.222,
103
+ "eval_steps_per_second": 235.576,
104
+ "step": 2400
105
+ },
106
+ {
107
+ "epoch": 0.81,
108
+ "learning_rate": 3.989637305699482e-05,
109
+ "loss": 0.3659,
110
+ "step": 2500
111
+ },
112
+ {
113
+ "epoch": 0.87,
114
+ "eval_accuracy": 0.7960000038146973,
115
+ "eval_loss": 0.4648894965648651,
116
+ "eval_runtime": 0.5101,
117
+ "eval_samples_per_second": 3920.44,
118
+ "eval_steps_per_second": 245.027,
119
+ "step": 2700
120
+ },
121
+ {
122
+ "epoch": 0.97,
123
+ "learning_rate": 3.787240932642487e-05,
124
+ "loss": 0.3408,
125
+ "step": 3000
126
+ },
127
+ {
128
+ "epoch": 0.97,
129
+ "eval_accuracy": 0.809499979019165,
130
+ "eval_loss": 0.42281684279441833,
131
+ "eval_runtime": 1.3845,
132
+ "eval_samples_per_second": 1444.57,
133
+ "eval_steps_per_second": 90.286,
134
+ "step": 3000
135
+ },
136
+ {
137
+ "epoch": 1.07,
138
+ "eval_accuracy": 0.8059999942779541,
139
+ "eval_loss": 0.4953405261039734,
140
+ "eval_runtime": 0.5733,
141
+ "eval_samples_per_second": 3488.535,
142
+ "eval_steps_per_second": 218.033,
143
+ "step": 3300
144
+ },
145
+ {
146
+ "epoch": 1.13,
147
+ "learning_rate": 3.5848445595854926e-05,
148
+ "loss": 0.2812,
149
+ "step": 3500
150
+ },
151
+ {
152
+ "epoch": 1.17,
153
+ "eval_accuracy": 0.7994999885559082,
154
+ "eval_loss": 0.5381343364715576,
155
+ "eval_runtime": 0.5151,
156
+ "eval_samples_per_second": 3882.955,
157
+ "eval_steps_per_second": 242.685,
158
+ "step": 3600
159
+ },
160
+ {
161
+ "epoch": 1.26,
162
+ "eval_accuracy": 0.8054999709129333,
163
+ "eval_loss": 0.4457036256790161,
164
+ "eval_runtime": 0.5211,
165
+ "eval_samples_per_second": 3838.159,
166
+ "eval_steps_per_second": 239.885,
167
+ "step": 3900
168
+ },
169
+ {
170
+ "epoch": 1.3,
171
+ "learning_rate": 3.382448186528497e-05,
172
+ "loss": 0.2545,
173
+ "step": 4000
174
+ },
175
+ {
176
+ "epoch": 1.36,
177
+ "eval_accuracy": 0.8149999976158142,
178
+ "eval_loss": 0.4693449139595032,
179
+ "eval_runtime": 0.5216,
180
+ "eval_samples_per_second": 3834.156,
181
+ "eval_steps_per_second": 239.635,
182
+ "step": 4200
183
+ },
184
+ {
185
+ "epoch": 1.46,
186
+ "learning_rate": 3.180051813471503e-05,
187
+ "loss": 0.2588,
188
+ "step": 4500
189
+ },
190
+ {
191
+ "epoch": 1.46,
192
+ "eval_accuracy": 0.8220000267028809,
193
+ "eval_loss": 0.48552772402763367,
194
+ "eval_runtime": 2.4236,
195
+ "eval_samples_per_second": 825.207,
196
+ "eval_steps_per_second": 51.575,
197
+ "step": 4500
198
+ },
199
+ {
200
+ "epoch": 1.55,
201
+ "eval_accuracy": 0.8184999823570251,
202
+ "eval_loss": 0.48397132754325867,
203
+ "eval_runtime": 0.5143,
204
+ "eval_samples_per_second": 3888.874,
205
+ "eval_steps_per_second": 243.055,
206
+ "step": 4800
207
+ },
208
+ {
209
+ "epoch": 1.62,
210
+ "learning_rate": 2.9780602331606216e-05,
211
+ "loss": 0.2436,
212
+ "step": 5000
213
+ },
214
+ {
215
+ "epoch": 1.65,
216
+ "eval_accuracy": 0.8080000281333923,
217
+ "eval_loss": 0.5025840997695923,
218
+ "eval_runtime": 0.5261,
219
+ "eval_samples_per_second": 3801.816,
220
+ "eval_steps_per_second": 237.613,
221
+ "step": 5100
222
+ },
223
+ {
224
+ "epoch": 1.75,
225
+ "eval_accuracy": 0.8140000104904175,
226
+ "eval_loss": 0.4753943681716919,
227
+ "eval_runtime": 0.5312,
228
+ "eval_samples_per_second": 3765.313,
229
+ "eval_steps_per_second": 235.332,
230
+ "step": 5400
231
+ },
232
+ {
233
+ "epoch": 1.78,
234
+ "learning_rate": 2.7756638601036272e-05,
235
+ "loss": 0.2414,
236
+ "step": 5500
237
+ },
238
+ {
239
+ "epoch": 1.85,
240
+ "eval_accuracy": 0.8034999966621399,
241
+ "eval_loss": 0.48492932319641113,
242
+ "eval_runtime": 0.5256,
243
+ "eval_samples_per_second": 3805.048,
244
+ "eval_steps_per_second": 237.815,
245
+ "step": 5700
246
+ },
247
+ {
248
+ "epoch": 1.94,
249
+ "learning_rate": 2.573267487046632e-05,
250
+ "loss": 0.2527,
251
+ "step": 6000
252
+ },
253
+ {
254
+ "epoch": 1.94,
255
+ "eval_accuracy": 0.8100000023841858,
256
+ "eval_loss": 0.48092204332351685,
257
+ "eval_runtime": 0.52,
258
+ "eval_samples_per_second": 3846.385,
259
+ "eval_steps_per_second": 240.399,
260
+ "step": 6000
261
+ },
262
+ {
263
+ "epoch": 2.04,
264
+ "eval_accuracy": 0.8134999871253967,
265
+ "eval_loss": 0.5660321116447449,
266
+ "eval_runtime": 0.5164,
267
+ "eval_samples_per_second": 3873.13,
268
+ "eval_steps_per_second": 242.071,
269
+ "step": 6300
270
+ },
271
+ {
272
+ "epoch": 2.1,
273
+ "learning_rate": 2.3708711139896374e-05,
274
+ "loss": 0.1818,
275
+ "step": 6500
276
+ },
277
+ {
278
+ "epoch": 2.14,
279
+ "eval_accuracy": 0.8180000185966492,
280
+ "eval_loss": 0.6352373361587524,
281
+ "eval_runtime": 0.5235,
282
+ "eval_samples_per_second": 3820.269,
283
+ "eval_steps_per_second": 238.767,
284
+ "step": 6600
285
+ },
286
+ {
287
+ "epoch": 2.23,
288
+ "eval_accuracy": 0.8209999799728394,
289
+ "eval_loss": 0.6510393619537354,
290
+ "eval_runtime": 0.5162,
291
+ "eval_samples_per_second": 3874.656,
292
+ "eval_steps_per_second": 242.166,
293
+ "step": 6900
294
+ },
295
+ {
296
+ "epoch": 2.27,
297
+ "learning_rate": 2.1692843264248704e-05,
298
+ "loss": 0.1776,
299
+ "step": 7000
300
+ },
301
+ {
302
+ "epoch": 2.33,
303
+ "eval_accuracy": 0.8065000176429749,
304
+ "eval_loss": 0.684609591960907,
305
+ "eval_runtime": 0.5072,
306
+ "eval_samples_per_second": 3943.283,
307
+ "eval_steps_per_second": 246.455,
308
+ "step": 7200
309
+ },
310
+ {
311
+ "epoch": 2.43,
312
+ "learning_rate": 1.9668879533678756e-05,
313
+ "loss": 0.1754,
314
+ "step": 7500
315
+ },
316
+ {
317
+ "epoch": 2.43,
318
+ "eval_accuracy": 0.8100000023841858,
319
+ "eval_loss": 0.6463525295257568,
320
+ "eval_runtime": 1.4644,
321
+ "eval_samples_per_second": 1365.751,
322
+ "eval_steps_per_second": 85.359,
323
+ "step": 7500
324
+ },
325
+ {
326
+ "epoch": 2.53,
327
+ "eval_accuracy": 0.8125,
328
+ "eval_loss": 0.6290258169174194,
329
+ "eval_runtime": 3.6012,
330
+ "eval_samples_per_second": 555.376,
331
+ "eval_steps_per_second": 34.711,
332
+ "step": 7800
333
+ },
334
+ {
335
+ "epoch": 2.59,
336
+ "learning_rate": 1.764491580310881e-05,
337
+ "loss": 0.1763,
338
+ "step": 8000
339
+ },
340
+ {
341
+ "epoch": 2.62,
342
+ "eval_accuracy": 0.8144999742507935,
343
+ "eval_loss": 0.6613443493843079,
344
+ "eval_runtime": 2.9055,
345
+ "eval_samples_per_second": 688.341,
346
+ "eval_steps_per_second": 43.021,
347
+ "step": 8100
348
+ },
349
+ {
350
+ "epoch": 2.72,
351
+ "eval_accuracy": 0.8224999904632568,
352
+ "eval_loss": 0.6761817336082458,
353
+ "eval_runtime": 1.1073,
354
+ "eval_samples_per_second": 1806.22,
355
+ "eval_steps_per_second": 112.889,
356
+ "step": 8400
357
+ },
358
+ {
359
+ "epoch": 2.75,
360
+ "learning_rate": 1.5620952072538862e-05,
361
+ "loss": 0.1853,
362
+ "step": 8500
363
+ },
364
+ {
365
+ "epoch": 2.82,
366
+ "eval_accuracy": 0.8119999766349792,
367
+ "eval_loss": 0.6429938077926636,
368
+ "eval_runtime": 0.5254,
369
+ "eval_samples_per_second": 3806.417,
370
+ "eval_steps_per_second": 237.901,
371
+ "step": 8700
372
+ },
373
+ {
374
+ "epoch": 2.91,
375
+ "learning_rate": 1.3596988341968913e-05,
376
+ "loss": 0.1804,
377
+ "step": 9000
378
+ },
379
+ {
380
+ "epoch": 2.91,
381
+ "eval_accuracy": 0.8255000114440918,
382
+ "eval_loss": 0.6622639298439026,
383
+ "eval_runtime": 1.2718,
384
+ "eval_samples_per_second": 1572.63,
385
+ "eval_steps_per_second": 98.289,
386
+ "step": 9000
387
+ },
388
+ {
389
+ "epoch": 3.01,
390
+ "eval_accuracy": 0.8245000243186951,
391
+ "eval_loss": 0.6402567028999329,
392
+ "eval_runtime": 0.5322,
393
+ "eval_samples_per_second": 3757.904,
394
+ "eval_steps_per_second": 234.869,
395
+ "step": 9300
396
+ },
397
+ {
398
+ "epoch": 3.08,
399
+ "learning_rate": 1.1577072538860104e-05,
400
+ "loss": 0.1321,
401
+ "step": 9500
402
+ },
403
+ {
404
+ "epoch": 3.11,
405
+ "eval_accuracy": 0.8184999823570251,
406
+ "eval_loss": 0.76594477891922,
407
+ "eval_runtime": 1.2705,
408
+ "eval_samples_per_second": 1574.241,
409
+ "eval_steps_per_second": 98.39,
410
+ "step": 9600
411
+ },
412
+ {
413
+ "epoch": 3.21,
414
+ "eval_accuracy": 0.8264999985694885,
415
+ "eval_loss": 0.7427929043769836,
416
+ "eval_runtime": 0.5386,
417
+ "eval_samples_per_second": 3713.17,
418
+ "eval_steps_per_second": 232.073,
419
+ "step": 9900
420
+ },
421
+ {
422
+ "epoch": 3.24,
423
+ "learning_rate": 9.553108808290157e-06,
424
+ "loss": 0.1135,
425
+ "step": 10000
426
+ },
427
+ {
428
+ "epoch": 3.3,
429
+ "eval_accuracy": 0.8144999742507935,
430
+ "eval_loss": 0.7777068018913269,
431
+ "eval_runtime": 2.4153,
432
+ "eval_samples_per_second": 828.039,
433
+ "eval_steps_per_second": 51.752,
434
+ "step": 10200
435
+ },
436
+ {
437
+ "epoch": 3.4,
438
+ "learning_rate": 7.529145077720208e-06,
439
+ "loss": 0.1182,
440
+ "step": 10500
441
+ },
442
+ {
443
+ "epoch": 3.4,
444
+ "eval_accuracy": 0.8199999928474426,
445
+ "eval_loss": 0.7448311448097229,
446
+ "eval_runtime": 0.5257,
447
+ "eval_samples_per_second": 3804.49,
448
+ "eval_steps_per_second": 237.781,
449
+ "step": 10500
450
+ },
451
+ {
452
+ "epoch": 3.5,
453
+ "eval_accuracy": 0.824999988079071,
454
+ "eval_loss": 0.7690622806549072,
455
+ "eval_runtime": 0.5391,
456
+ "eval_samples_per_second": 3709.781,
457
+ "eval_steps_per_second": 231.861,
458
+ "step": 10800
459
+ },
460
+ {
461
+ "epoch": 3.56,
462
+ "learning_rate": 5.50518134715026e-06,
463
+ "loss": 0.1114,
464
+ "step": 11000
465
+ },
466
+ {
467
+ "epoch": 3.59,
468
+ "eval_accuracy": 0.8195000290870667,
469
+ "eval_loss": 0.7780025601387024,
470
+ "eval_runtime": 0.5331,
471
+ "eval_samples_per_second": 3751.575,
472
+ "eval_steps_per_second": 234.473,
473
+ "step": 11100
474
+ },
475
+ {
476
+ "epoch": 3.69,
477
+ "eval_accuracy": 0.8220000267028809,
478
+ "eval_loss": 0.7819697260856628,
479
+ "eval_runtime": 0.5267,
480
+ "eval_samples_per_second": 3797.154,
481
+ "eval_steps_per_second": 237.322,
482
+ "step": 11400
483
+ },
484
+ {
485
+ "epoch": 3.72,
486
+ "learning_rate": 3.4852655440414507e-06,
487
+ "loss": 0.0992,
488
+ "step": 11500
489
+ },
490
+ {
491
+ "epoch": 3.79,
492
+ "eval_accuracy": 0.8215000033378601,
493
+ "eval_loss": 0.79557865858078,
494
+ "eval_runtime": 0.537,
495
+ "eval_samples_per_second": 3724.493,
496
+ "eval_steps_per_second": 232.781,
497
+ "step": 11700
498
+ },
499
+ {
500
+ "epoch": 3.89,
501
+ "learning_rate": 1.4613018134715026e-06,
502
+ "loss": 0.1068,
503
+ "step": 12000
504
+ },
505
+ {
506
+ "epoch": 3.89,
507
+ "eval_accuracy": 0.8224999904632568,
508
+ "eval_loss": 0.793406069278717,
509
+ "eval_runtime": 0.5363,
510
+ "eval_samples_per_second": 3729.122,
511
+ "eval_steps_per_second": 233.07,
512
+ "step": 12000
513
+ },
514
+ {
515
+ "epoch": 3.98,
516
+ "eval_accuracy": 0.8215000033378601,
517
+ "eval_loss": 0.8001092672348022,
518
+ "eval_runtime": 0.5353,
519
+ "eval_samples_per_second": 3736.045,
520
+ "eval_steps_per_second": 233.503,
521
+ "step": 12300
522
+ },
523
+ {
524
+ "epoch": 4.0,
525
+ "step": 12352,
526
+ "total_flos": 144723222180000.0,
527
+ "train_loss": 0.24578420763806358,
528
+ "train_runtime": 1895.6753,
529
+ "train_samples_per_second": 104.239,
530
+ "train_steps_per_second": 6.516
531
+ }
532
+ ],
533
+ "max_steps": 12352,
534
+ "num_train_epochs": 4,
535
+ "total_flos": 144723222180000.0,
536
+ "trial_name": null,
537
+ "trial_params": null
538
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fedf9ecaa2d365a3b667011a6cc527dad72f1ad0726c031e1c0a91aa48e4499
3
+ size 2863