SansarK committed on
Commit
8beb841
1 Parent(s): d67e7e6

Upload folder using huggingface_hub

Files changed (29)
  1. distilbert_ranking.onnx +3 -0
  2. distilbert_ranking/checkpoint-1000/config.json +25 -0
  3. distilbert_ranking/checkpoint-1000/model.safetensors +3 -0
  4. distilbert_ranking/checkpoint-1000/optimizer.pt +3 -0
  5. distilbert_ranking/checkpoint-1000/rng_state.pth +3 -0
  6. distilbert_ranking/checkpoint-1000/scheduler.pt +3 -0
  7. distilbert_ranking/checkpoint-1000/special_tokens_map.json +7 -0
  8. distilbert_ranking/checkpoint-1000/tokenizer_config.json +57 -0
  9. distilbert_ranking/checkpoint-1000/trainer_state.json +749 -0
  10. distilbert_ranking/checkpoint-1000/training_args.bin +3 -0
  11. distilbert_ranking/checkpoint-1000/vocab.txt +0 -0
  12. distilbert_ranking/checkpoint-1182/config.json +25 -0
  13. distilbert_ranking/checkpoint-1182/model.safetensors +3 -0
  14. distilbert_ranking/checkpoint-1182/optimizer.pt +3 -0
  15. distilbert_ranking/checkpoint-1182/rng_state.pth +3 -0
  16. distilbert_ranking/checkpoint-1182/scheduler.pt +3 -0
  17. distilbert_ranking/checkpoint-1182/special_tokens_map.json +7 -0
  18. distilbert_ranking/checkpoint-1182/tokenizer_config.json +57 -0
  19. distilbert_ranking/checkpoint-1182/trainer_state.json +875 -0
  20. distilbert_ranking/checkpoint-1182/training_args.bin +3 -0
  21. distilbert_ranking/checkpoint-1182/vocab.txt +0 -0
  22. distilbert_ranking_final/config.json +25 -0
  23. distilbert_ranking_final/model.safetensors +3 -0
  24. distilbert_ranking_final/special_tokens_map.json +7 -0
  25. distilbert_ranking_final/tokenizer_config.json +57 -0
  26. distilbert_ranking_final/training_args.bin +3 -0
  27. distilbert_ranking_final/vocab.txt +0 -0
  28. logs/events.out.tfevents.1726929739.9016b8275454.37.0 +3 -0
  29. logs/events.out.tfevents.1726929842.9016b8275454.37.1 +3 -0
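The commit message says the folder was pushed with huggingface_hub. A minimal sketch of such an upload (the local path and repo ID below are placeholders, not values taken from this commit):

from huggingface_hub import HfApi

api = HfApi()  # picks up the token stored by `huggingface-cli login`
api.upload_folder(
    folder_path="./distilbert_ranking_artifacts",  # placeholder: local folder with the files listed above
    repo_id="SansarK/distilbert-ranking",          # placeholder repo ID
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)

Large binaries in the folder (.onnx, .safetensors, .pt, .pth, .bin) are tracked through Git LFS, which is why their diffs below show only a pointer (spec version, SHA-256 oid, and size) rather than file contents.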
distilbert_ranking.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8b293d0e8db625b0db17d9dc242bf6530672c55b61a451427773be61631c30cf
+ size 267918628
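distilbert_ranking.onnx is the exported inference graph. A hedged scoring sketch with onnxruntime, assuming the export uses the conventional input_ids/attention_mask input names and int64 inputs (neither is confirmed by this diff; session.get_inputs() shows the real names):

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("distilbert_ranking.onnx")
tokenizer = AutoTokenizer.from_pretrained("distilbert_ranking_final")  # tokenizer files from this repo, downloaded locally

enc = tokenizer("example query", "example document", return_tensors="np")
feeds = {
    "input_ids": enc["input_ids"].astype(np.int64),            # assumed input name and dtype
    "attention_mask": enc["attention_mask"].astype(np.int64),  # assumed input name and dtype
}
logits = session.run(None, feeds)[0]
print(logits)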
distilbert_ranking/checkpoint-1000/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "distilbert-base-uncased",
+ "activation": "gelu",
+ "architectures": [
+ "DistilBertForSequenceClassification"
+ ],
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "initializer_range": 0.02,
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "pad_token_id": 0,
+ "problem_type": "single_label_classification",
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "tie_weights_": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.44.0",
+ "vocab_size": 30522
+ }
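The config identifies this checkpoint as a DistilBertForSequenceClassification head fine-tuned from distilbert-base-uncased for single-label classification. A minimal loading sketch, assuming the checkpoint directory has been downloaded locally:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

ckpt = "distilbert_ranking/checkpoint-1000"  # local copy of the files in this commit
model = AutoModelForSequenceClassification.from_pretrained(ckpt).eval()
tokenizer = AutoTokenizer.from_pretrained(ckpt)
print(model.config.architectures, model.config.num_labels)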
distilbert_ranking/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87484e1fcb9e4a77a3aacc978e17b48b0163a497cf37575c1faba69e14cd6ecc
+ size 267832560
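model.safetensors, like the other large binaries here, is committed as a Git LFS pointer: the repository stores only the spec URL, the SHA-256 of the real file, and its size in bytes. A small sketch for checking a downloaded copy against the pointer above:

import hashlib
from pathlib import Path

def matches_lfs_pointer(path, expected_oid, expected_size):
    data = Path(path).read_bytes()  # note: reads the whole file (~256 MB here) into memory
    return len(data) == expected_size and hashlib.sha256(data).hexdigest() == expected_oid

print(matches_lfs_pointer(
    "distilbert_ranking/checkpoint-1000/model.safetensors",
    "87484e1fcb9e4a77a3aacc978e17b48b0163a497cf37575c1faba69e14cd6ecc",
    267832560,
))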
distilbert_ranking/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:559bfc82138d61aad7d9834a82a6aeea2f6c269a8eaf4ecfc7fa8c1123fe9f25
+ size 535727290
distilbert_ranking/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94b7e075cf7126042f3936c52d3f56f4486b118a14fb9c3146f36cd69794b084
+ size 14244
distilbert_ranking/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e766cedce67a15b0fb8efe9df650d7363b9dd7e5e567404d4e819fa24811fee
+ size 1064
distilbert_ranking/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
distilbert_ranking/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "DistilBertTokenizer",
+ "unk_token": "[UNK]"
+ }
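The tokenizer is the stock uncased WordPiece setup (DistilBertTokenizer, do_lower_case true, 512-token limit). For a ranking classifier the usual pattern is to encode query and candidate as a sentence pair; that pairing is an assumption about how the model was trained, not something recorded in this commit:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

ckpt = "distilbert_ranking/checkpoint-1000"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt).eval()

enc = tokenizer(
    "what is git lfs?",                              # query (hypothetical example text)
    "Git LFS stores large files outside the repo.",  # candidate document
    truncation=True, max_length=512, return_tensors="pt",
)
with torch.no_grad():
    probs = model(**enc).logits.softmax(dim=-1)
print(probs)  # class probabilities; which index means "relevant" is not stated in the config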
distilbert_ranking/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,749 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.5380710659898478,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.025380710659898477,
13
+ "grad_norm": 2.13912296295166,
14
+ "learning_rate": 4.957698815566836e-05,
15
+ "loss": 0.4596,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.050761421319796954,
20
+ "grad_norm": 1.5265041589736938,
21
+ "learning_rate": 4.9153976311336716e-05,
22
+ "loss": 0.5119,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.07614213197969544,
27
+ "grad_norm": 3.402026891708374,
28
+ "learning_rate": 4.873096446700508e-05,
29
+ "loss": 0.5408,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.10152284263959391,
34
+ "grad_norm": 1.8265776634216309,
35
+ "learning_rate": 4.8307952622673436e-05,
36
+ "loss": 0.5605,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.12690355329949238,
41
+ "grad_norm": 1.4988059997558594,
42
+ "learning_rate": 4.7884940778341796e-05,
43
+ "loss": 0.5919,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.15228426395939088,
48
+ "grad_norm": 1.6790786981582642,
49
+ "learning_rate": 4.746192893401015e-05,
50
+ "loss": 0.5605,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.17766497461928935,
55
+ "grad_norm": 2.108412742614746,
56
+ "learning_rate": 4.7038917089678517e-05,
57
+ "loss": 0.5024,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.20304568527918782,
62
+ "grad_norm": 1.2667797803878784,
63
+ "learning_rate": 4.661590524534687e-05,
64
+ "loss": 0.6062,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.22842639593908629,
69
+ "grad_norm": 1.2963333129882812,
70
+ "learning_rate": 4.619289340101523e-05,
71
+ "loss": 0.5623,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.25380710659898476,
76
+ "grad_norm": 4.575770378112793,
77
+ "learning_rate": 4.576988155668359e-05,
78
+ "loss": 0.5304,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.27918781725888325,
83
+ "grad_norm": 1.6193996667861938,
84
+ "learning_rate": 4.534686971235195e-05,
85
+ "loss": 0.5276,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.30456852791878175,
90
+ "grad_norm": 1.5291751623153687,
91
+ "learning_rate": 4.492385786802031e-05,
92
+ "loss": 0.4513,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.3299492385786802,
97
+ "grad_norm": 2.916069984436035,
98
+ "learning_rate": 4.4500846023688664e-05,
99
+ "loss": 0.4674,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.3553299492385787,
104
+ "grad_norm": 2.216944456100464,
105
+ "learning_rate": 4.4077834179357024e-05,
106
+ "loss": 0.455,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.38071065989847713,
111
+ "grad_norm": 2.572436571121216,
112
+ "learning_rate": 4.365482233502538e-05,
113
+ "loss": 0.5409,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.40609137055837563,
118
+ "grad_norm": 2.5171351432800293,
119
+ "learning_rate": 4.3231810490693744e-05,
120
+ "loss": 0.5328,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.43147208121827413,
125
+ "grad_norm": 2.056858777999878,
126
+ "learning_rate": 4.2808798646362104e-05,
127
+ "loss": 0.515,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.45685279187817257,
132
+ "grad_norm": 1.575071930885315,
133
+ "learning_rate": 4.238578680203046e-05,
134
+ "loss": 0.5201,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.48223350253807107,
139
+ "grad_norm": 3.0955426692962646,
140
+ "learning_rate": 4.196277495769882e-05,
141
+ "loss": 0.4181,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.5076142131979695,
146
+ "grad_norm": 2.798530340194702,
147
+ "learning_rate": 4.153976311336718e-05,
148
+ "loss": 0.3292,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.5329949238578681,
153
+ "grad_norm": 2.3137614727020264,
154
+ "learning_rate": 4.111675126903554e-05,
155
+ "loss": 0.6242,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.5583756345177665,
160
+ "grad_norm": 1.7211440801620483,
161
+ "learning_rate": 4.069373942470389e-05,
162
+ "loss": 0.5459,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.583756345177665,
167
+ "grad_norm": 1.9022053480148315,
168
+ "learning_rate": 4.027072758037225e-05,
169
+ "loss": 0.4379,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.6091370558375635,
174
+ "grad_norm": 1.4369803667068481,
175
+ "learning_rate": 3.9847715736040605e-05,
176
+ "loss": 0.5092,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.6345177664974619,
181
+ "grad_norm": 1.8029989004135132,
182
+ "learning_rate": 3.942470389170897e-05,
183
+ "loss": 0.5678,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.6598984771573604,
188
+ "grad_norm": 2.690417528152466,
189
+ "learning_rate": 3.900169204737733e-05,
190
+ "loss": 0.4386,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.6852791878172588,
195
+ "grad_norm": 3.036839485168457,
196
+ "learning_rate": 3.8578680203045685e-05,
197
+ "loss": 0.5936,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.7106598984771574,
202
+ "grad_norm": 1.6595145463943481,
203
+ "learning_rate": 3.8155668358714046e-05,
204
+ "loss": 0.5406,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.7360406091370558,
209
+ "grad_norm": 4.433804035186768,
210
+ "learning_rate": 3.7732656514382406e-05,
211
+ "loss": 0.4874,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.7614213197969543,
216
+ "grad_norm": 2.043555498123169,
217
+ "learning_rate": 3.7309644670050766e-05,
218
+ "loss": 0.4603,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.7868020304568528,
223
+ "grad_norm": 2.553321123123169,
224
+ "learning_rate": 3.688663282571912e-05,
225
+ "loss": 0.5858,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.8121827411167513,
230
+ "grad_norm": 2.677241563796997,
231
+ "learning_rate": 3.646362098138748e-05,
232
+ "loss": 0.4095,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.8375634517766497,
237
+ "grad_norm": 3.408170461654663,
238
+ "learning_rate": 3.604060913705584e-05,
239
+ "loss": 0.6228,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.8629441624365483,
244
+ "grad_norm": 3.3228750228881836,
245
+ "learning_rate": 3.56175972927242e-05,
246
+ "loss": 0.4735,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.8883248730964467,
251
+ "grad_norm": 4.007043838500977,
252
+ "learning_rate": 3.519458544839256e-05,
253
+ "loss": 0.51,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.9137055837563451,
258
+ "grad_norm": 1.5816287994384766,
259
+ "learning_rate": 3.477157360406091e-05,
260
+ "loss": 0.5176,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.9390862944162437,
265
+ "grad_norm": 1.4616796970367432,
266
+ "learning_rate": 3.434856175972927e-05,
267
+ "loss": 0.286,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.9644670050761421,
272
+ "grad_norm": 4.507604122161865,
273
+ "learning_rate": 3.3925549915397633e-05,
274
+ "loss": 0.6015,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.9898477157360406,
279
+ "grad_norm": 1.7000436782836914,
280
+ "learning_rate": 3.3502538071065994e-05,
281
+ "loss": 0.3593,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 1.0,
286
+ "eval_loss": 0.512408971786499,
287
+ "eval_runtime": 8.5129,
288
+ "eval_samples_per_second": 92.565,
289
+ "eval_steps_per_second": 11.629,
290
+ "step": 394
291
+ },
292
+ {
293
+ "epoch": 1.015228426395939,
294
+ "grad_norm": 3.039973497390747,
295
+ "learning_rate": 3.307952622673435e-05,
296
+ "loss": 0.5087,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 1.0406091370558375,
301
+ "grad_norm": 2.5285613536834717,
302
+ "learning_rate": 3.265651438240271e-05,
303
+ "loss": 0.4335,
304
+ "step": 410
305
+ },
306
+ {
307
+ "epoch": 1.0659898477157361,
308
+ "grad_norm": 1.5140420198440552,
309
+ "learning_rate": 3.223350253807107e-05,
310
+ "loss": 0.3759,
311
+ "step": 420
312
+ },
313
+ {
314
+ "epoch": 1.0913705583756346,
315
+ "grad_norm": 2.0796151161193848,
316
+ "learning_rate": 3.181049069373943e-05,
317
+ "loss": 0.6431,
318
+ "step": 430
319
+ },
320
+ {
321
+ "epoch": 1.116751269035533,
322
+ "grad_norm": 3.370028018951416,
323
+ "learning_rate": 3.138747884940779e-05,
324
+ "loss": 0.4801,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 1.1421319796954315,
329
+ "grad_norm": 7.150363445281982,
330
+ "learning_rate": 3.096446700507614e-05,
331
+ "loss": 0.4856,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 1.16751269035533,
336
+ "grad_norm": 4.6761860847473145,
337
+ "learning_rate": 3.05414551607445e-05,
338
+ "loss": 0.5973,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 1.1928934010152283,
343
+ "grad_norm": 1.6876248121261597,
344
+ "learning_rate": 3.0118443316412858e-05,
345
+ "loss": 0.4265,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 1.218274111675127,
350
+ "grad_norm": 2.2751359939575195,
351
+ "learning_rate": 2.969543147208122e-05,
352
+ "loss": 0.4307,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 1.2436548223350254,
357
+ "grad_norm": 4.351472854614258,
358
+ "learning_rate": 2.927241962774958e-05,
359
+ "loss": 0.4978,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 1.2690355329949239,
364
+ "grad_norm": 3.3015713691711426,
365
+ "learning_rate": 2.8849407783417938e-05,
366
+ "loss": 0.4159,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 1.2944162436548223,
371
+ "grad_norm": 1.4904918670654297,
372
+ "learning_rate": 2.84263959390863e-05,
373
+ "loss": 0.512,
374
+ "step": 510
375
+ },
376
+ {
377
+ "epoch": 1.3197969543147208,
378
+ "grad_norm": 2.306755542755127,
379
+ "learning_rate": 2.800338409475465e-05,
380
+ "loss": 0.5745,
381
+ "step": 520
382
+ },
383
+ {
384
+ "epoch": 1.3451776649746192,
385
+ "grad_norm": 3.0485565662384033,
386
+ "learning_rate": 2.7580372250423015e-05,
387
+ "loss": 0.4243,
388
+ "step": 530
389
+ },
390
+ {
391
+ "epoch": 1.3705583756345177,
392
+ "grad_norm": 5.009148597717285,
393
+ "learning_rate": 2.715736040609137e-05,
394
+ "loss": 0.5187,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 1.3959390862944163,
399
+ "grad_norm": 3.1525375843048096,
400
+ "learning_rate": 2.6734348561759732e-05,
401
+ "loss": 0.4693,
402
+ "step": 550
403
+ },
404
+ {
405
+ "epoch": 1.4213197969543148,
406
+ "grad_norm": 2.2495410442352295,
407
+ "learning_rate": 2.6311336717428085e-05,
408
+ "loss": 0.5011,
409
+ "step": 560
410
+ },
411
+ {
412
+ "epoch": 1.4467005076142132,
413
+ "grad_norm": 2.390629291534424,
414
+ "learning_rate": 2.588832487309645e-05,
415
+ "loss": 0.6348,
416
+ "step": 570
417
+ },
418
+ {
419
+ "epoch": 1.4720812182741116,
420
+ "grad_norm": 1.7083393335342407,
421
+ "learning_rate": 2.546531302876481e-05,
422
+ "loss": 0.4737,
423
+ "step": 580
424
+ },
425
+ {
426
+ "epoch": 1.49746192893401,
427
+ "grad_norm": 2.4492344856262207,
428
+ "learning_rate": 2.5042301184433166e-05,
429
+ "loss": 0.5164,
430
+ "step": 590
431
+ },
432
+ {
433
+ "epoch": 1.5228426395939088,
434
+ "grad_norm": 8.137948989868164,
435
+ "learning_rate": 2.4619289340101523e-05,
436
+ "loss": 0.4653,
437
+ "step": 600
438
+ },
439
+ {
440
+ "epoch": 1.548223350253807,
441
+ "grad_norm": 2.3943252563476562,
442
+ "learning_rate": 2.4196277495769883e-05,
443
+ "loss": 0.4019,
444
+ "step": 610
445
+ },
446
+ {
447
+ "epoch": 1.5736040609137056,
448
+ "grad_norm": 1.8179367780685425,
449
+ "learning_rate": 2.3773265651438243e-05,
450
+ "loss": 0.3428,
451
+ "step": 620
452
+ },
453
+ {
454
+ "epoch": 1.598984771573604,
455
+ "grad_norm": 2.3452229499816895,
456
+ "learning_rate": 2.33502538071066e-05,
457
+ "loss": 0.5092,
458
+ "step": 630
459
+ },
460
+ {
461
+ "epoch": 1.6243654822335025,
462
+ "grad_norm": 1.6915607452392578,
463
+ "learning_rate": 2.292724196277496e-05,
464
+ "loss": 0.5964,
465
+ "step": 640
466
+ },
467
+ {
468
+ "epoch": 1.649746192893401,
469
+ "grad_norm": 3.0646092891693115,
470
+ "learning_rate": 2.2504230118443317e-05,
471
+ "loss": 0.4237,
472
+ "step": 650
473
+ },
474
+ {
475
+ "epoch": 1.6751269035532994,
476
+ "grad_norm": 2.2048497200012207,
477
+ "learning_rate": 2.2081218274111677e-05,
478
+ "loss": 0.3713,
479
+ "step": 660
480
+ },
481
+ {
482
+ "epoch": 1.700507614213198,
483
+ "grad_norm": 3.50895094871521,
484
+ "learning_rate": 2.1658206429780033e-05,
485
+ "loss": 0.4798,
486
+ "step": 670
487
+ },
488
+ {
489
+ "epoch": 1.7258883248730963,
490
+ "grad_norm": 2.0043256282806396,
491
+ "learning_rate": 2.1235194585448394e-05,
492
+ "loss": 0.583,
493
+ "step": 680
494
+ },
495
+ {
496
+ "epoch": 1.751269035532995,
497
+ "grad_norm": 1.845831274986267,
498
+ "learning_rate": 2.0812182741116754e-05,
499
+ "loss": 0.5429,
500
+ "step": 690
501
+ },
502
+ {
503
+ "epoch": 1.7766497461928934,
504
+ "grad_norm": 1.7165807485580444,
505
+ "learning_rate": 2.038917089678511e-05,
506
+ "loss": 0.4464,
507
+ "step": 700
508
+ },
509
+ {
510
+ "epoch": 1.8020304568527918,
511
+ "grad_norm": 1.361329436302185,
512
+ "learning_rate": 1.996615905245347e-05,
513
+ "loss": 0.4246,
514
+ "step": 710
515
+ },
516
+ {
517
+ "epoch": 1.8274111675126905,
518
+ "grad_norm": 2.407137870788574,
519
+ "learning_rate": 1.9543147208121827e-05,
520
+ "loss": 0.4645,
521
+ "step": 720
522
+ },
523
+ {
524
+ "epoch": 1.8527918781725887,
525
+ "grad_norm": 2.324209213256836,
526
+ "learning_rate": 1.9120135363790187e-05,
527
+ "loss": 0.5234,
528
+ "step": 730
529
+ },
530
+ {
531
+ "epoch": 1.8781725888324874,
532
+ "grad_norm": 3.296576738357544,
533
+ "learning_rate": 1.8697123519458544e-05,
534
+ "loss": 0.557,
535
+ "step": 740
536
+ },
537
+ {
538
+ "epoch": 1.9035532994923858,
539
+ "grad_norm": 2.2372303009033203,
540
+ "learning_rate": 1.8274111675126904e-05,
541
+ "loss": 0.4279,
542
+ "step": 750
543
+ },
544
+ {
545
+ "epoch": 1.9289340101522843,
546
+ "grad_norm": 4.05513334274292,
547
+ "learning_rate": 1.785109983079526e-05,
548
+ "loss": 0.4958,
549
+ "step": 760
550
+ },
551
+ {
552
+ "epoch": 1.9543147208121827,
553
+ "grad_norm": 1.2267847061157227,
554
+ "learning_rate": 1.7428087986463625e-05,
555
+ "loss": 0.4711,
556
+ "step": 770
557
+ },
558
+ {
559
+ "epoch": 1.9796954314720812,
560
+ "grad_norm": 2.452728033065796,
561
+ "learning_rate": 1.700507614213198e-05,
562
+ "loss": 0.4696,
563
+ "step": 780
564
+ },
565
+ {
566
+ "epoch": 2.0,
567
+ "eval_loss": 0.47782018780708313,
568
+ "eval_runtime": 8.5194,
569
+ "eval_samples_per_second": 92.495,
570
+ "eval_steps_per_second": 11.621,
571
+ "step": 788
572
+ },
573
+ {
574
+ "epoch": 2.00507614213198,
575
+ "grad_norm": 1.9264096021652222,
576
+ "learning_rate": 1.658206429780034e-05,
577
+ "loss": 0.4061,
578
+ "step": 790
579
+ },
580
+ {
581
+ "epoch": 2.030456852791878,
582
+ "grad_norm": 1.578764796257019,
583
+ "learning_rate": 1.6159052453468698e-05,
584
+ "loss": 0.4168,
585
+ "step": 800
586
+ },
587
+ {
588
+ "epoch": 2.0558375634517767,
589
+ "grad_norm": 5.699263095855713,
590
+ "learning_rate": 1.5736040609137055e-05,
591
+ "loss": 0.4339,
592
+ "step": 810
593
+ },
594
+ {
595
+ "epoch": 2.081218274111675,
596
+ "grad_norm": 3.1477041244506836,
597
+ "learning_rate": 1.5313028764805415e-05,
598
+ "loss": 0.3623,
599
+ "step": 820
600
+ },
601
+ {
602
+ "epoch": 2.1065989847715736,
603
+ "grad_norm": 2.8351192474365234,
604
+ "learning_rate": 1.4890016920473774e-05,
605
+ "loss": 0.446,
606
+ "step": 830
607
+ },
608
+ {
609
+ "epoch": 2.1319796954314723,
610
+ "grad_norm": 6.005784511566162,
611
+ "learning_rate": 1.4467005076142132e-05,
612
+ "loss": 0.3802,
613
+ "step": 840
614
+ },
615
+ {
616
+ "epoch": 2.1573604060913705,
617
+ "grad_norm": 3.5085108280181885,
618
+ "learning_rate": 1.404399323181049e-05,
619
+ "loss": 0.4335,
620
+ "step": 850
621
+ },
622
+ {
623
+ "epoch": 2.182741116751269,
624
+ "grad_norm": 2.457703113555908,
625
+ "learning_rate": 1.362098138747885e-05,
626
+ "loss": 0.3866,
627
+ "step": 860
628
+ },
629
+ {
630
+ "epoch": 2.2081218274111674,
631
+ "grad_norm": 6.848071098327637,
632
+ "learning_rate": 1.3197969543147209e-05,
633
+ "loss": 0.3146,
634
+ "step": 870
635
+ },
636
+ {
637
+ "epoch": 2.233502538071066,
638
+ "grad_norm": 2.1162843704223633,
639
+ "learning_rate": 1.2774957698815568e-05,
640
+ "loss": 0.3747,
641
+ "step": 880
642
+ },
643
+ {
644
+ "epoch": 2.2588832487309647,
645
+ "grad_norm": 2.40061354637146,
646
+ "learning_rate": 1.2351945854483926e-05,
647
+ "loss": 0.5355,
648
+ "step": 890
649
+ },
650
+ {
651
+ "epoch": 2.284263959390863,
652
+ "grad_norm": 12.88298225402832,
653
+ "learning_rate": 1.1928934010152284e-05,
654
+ "loss": 0.3908,
655
+ "step": 900
656
+ },
657
+ {
658
+ "epoch": 2.3096446700507616,
659
+ "grad_norm": 6.1054463386535645,
660
+ "learning_rate": 1.1505922165820643e-05,
661
+ "loss": 0.3276,
662
+ "step": 910
663
+ },
664
+ {
665
+ "epoch": 2.33502538071066,
666
+ "grad_norm": 16.432769775390625,
667
+ "learning_rate": 1.1082910321489003e-05,
668
+ "loss": 0.2436,
669
+ "step": 920
670
+ },
671
+ {
672
+ "epoch": 2.3604060913705585,
673
+ "grad_norm": 9.989404678344727,
674
+ "learning_rate": 1.0659898477157361e-05,
675
+ "loss": 0.5455,
676
+ "step": 930
677
+ },
678
+ {
679
+ "epoch": 2.3857868020304567,
680
+ "grad_norm": 3.592336893081665,
681
+ "learning_rate": 1.023688663282572e-05,
682
+ "loss": 0.467,
683
+ "step": 940
684
+ },
685
+ {
686
+ "epoch": 2.4111675126903553,
687
+ "grad_norm": 4.606255531311035,
688
+ "learning_rate": 9.813874788494078e-06,
689
+ "loss": 0.4826,
690
+ "step": 950
691
+ },
692
+ {
693
+ "epoch": 2.436548223350254,
694
+ "grad_norm": 2.08891224861145,
695
+ "learning_rate": 9.390862944162437e-06,
696
+ "loss": 0.2995,
697
+ "step": 960
698
+ },
699
+ {
700
+ "epoch": 2.4619289340101522,
701
+ "grad_norm": 4.196601867675781,
702
+ "learning_rate": 8.967851099830795e-06,
703
+ "loss": 0.5598,
704
+ "step": 970
705
+ },
706
+ {
707
+ "epoch": 2.487309644670051,
708
+ "grad_norm": 2.653444290161133,
709
+ "learning_rate": 8.544839255499154e-06,
710
+ "loss": 0.4471,
711
+ "step": 980
712
+ },
713
+ {
714
+ "epoch": 2.512690355329949,
715
+ "grad_norm": 1.8868201971054077,
716
+ "learning_rate": 8.121827411167512e-06,
717
+ "loss": 0.4616,
718
+ "step": 990
719
+ },
720
+ {
721
+ "epoch": 2.5380710659898478,
722
+ "grad_norm": 5.2379045486450195,
723
+ "learning_rate": 7.698815566835872e-06,
724
+ "loss": 0.2389,
725
+ "step": 1000
726
+ }
727
+ ],
728
+ "logging_steps": 10,
729
+ "max_steps": 1182,
730
+ "num_input_tokens_seen": 0,
731
+ "num_train_epochs": 3,
732
+ "save_steps": 500,
733
+ "stateful_callbacks": {
734
+ "TrainerControl": {
735
+ "args": {
736
+ "should_epoch_stop": false,
737
+ "should_evaluate": false,
738
+ "should_log": false,
739
+ "should_save": true,
740
+ "should_training_stop": false
741
+ },
742
+ "attributes": {}
743
+ }
744
+ },
745
+ "total_flos": 1059739189248000.0,
746
+ "train_batch_size": 8,
747
+ "trial_name": null,
748
+ "trial_params": null
749
+ }
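The trainer state pins down most of the run configuration: per-device batch size 8, 3 epochs (1,182 optimizer steps), logging every 10 steps, a checkpoint every 500 steps, an evaluation at each epoch boundary (steps 394 and 788), and a learning rate decaying linearly from 5e-5. A sketch of TrainingArguments consistent with those values; anything not visible in the state is left at its default or is a placeholder:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="distilbert_ranking",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,     # the logged rates match a linear decay from 5e-5 with no warmup
    logging_steps=10,
    save_steps=500,
    eval_strategy="epoch",  # eval_loss appears exactly at the epoch ends
    logging_dir="logs",
)
print(args.lr_scheduler_type, args.warmup_steps)

Because the checkpoint also stores optimizer.pt, scheduler.pt, and rng_state.pth, the run can be continued with trainer.train(resume_from_checkpoint="distilbert_ranking/checkpoint-1000") once a Trainer has been rebuilt with the same arguments and datasets.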
distilbert_ranking/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b84117e184b790860c5640a9d712589f6b0aca90747571d9d65b366b184da82f
+ size 5176
distilbert_ranking/checkpoint-1000/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
distilbert_ranking/checkpoint-1182/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "distilbert-base-uncased",
+ "activation": "gelu",
+ "architectures": [
+ "DistilBertForSequenceClassification"
+ ],
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "initializer_range": 0.02,
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "pad_token_id": 0,
+ "problem_type": "single_label_classification",
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "tie_weights_": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.44.0",
+ "vocab_size": 30522
+ }
distilbert_ranking/checkpoint-1182/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edf736462c0a29c2ac0441cd6fe59ee3f26b4bfdb6893e6961af067becbf5d40
+ size 267832560
distilbert_ranking/checkpoint-1182/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42e5a6c616d3f203fa5a3115a82289ae4d9105dc6dc97a47f728e22ccedd1c6a
+ size 535727290
distilbert_ranking/checkpoint-1182/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29864a4a6b087769eea302e2e6c88ecb8d62dc63de83bd6b04bb031be1781af5
+ size 14244
distilbert_ranking/checkpoint-1182/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4a380e9d800c0f6bcb243e98aeb9297684e495632edfc300873b77fcb450a6c
+ size 1064
distilbert_ranking/checkpoint-1182/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
distilbert_ranking/checkpoint-1182/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "DistilBertTokenizer",
+ "unk_token": "[UNK]"
+ }
distilbert_ranking/checkpoint-1182/trainer_state.json ADDED
@@ -0,0 +1,875 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1182,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.025380710659898477,
13
+ "grad_norm": 2.13912296295166,
14
+ "learning_rate": 4.957698815566836e-05,
15
+ "loss": 0.4596,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.050761421319796954,
20
+ "grad_norm": 1.5265041589736938,
21
+ "learning_rate": 4.9153976311336716e-05,
22
+ "loss": 0.5119,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.07614213197969544,
27
+ "grad_norm": 3.402026891708374,
28
+ "learning_rate": 4.873096446700508e-05,
29
+ "loss": 0.5408,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.10152284263959391,
34
+ "grad_norm": 1.8265776634216309,
35
+ "learning_rate": 4.8307952622673436e-05,
36
+ "loss": 0.5605,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.12690355329949238,
41
+ "grad_norm": 1.4988059997558594,
42
+ "learning_rate": 4.7884940778341796e-05,
43
+ "loss": 0.5919,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.15228426395939088,
48
+ "grad_norm": 1.6790786981582642,
49
+ "learning_rate": 4.746192893401015e-05,
50
+ "loss": 0.5605,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.17766497461928935,
55
+ "grad_norm": 2.108412742614746,
56
+ "learning_rate": 4.7038917089678517e-05,
57
+ "loss": 0.5024,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.20304568527918782,
62
+ "grad_norm": 1.2667797803878784,
63
+ "learning_rate": 4.661590524534687e-05,
64
+ "loss": 0.6062,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.22842639593908629,
69
+ "grad_norm": 1.2963333129882812,
70
+ "learning_rate": 4.619289340101523e-05,
71
+ "loss": 0.5623,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.25380710659898476,
76
+ "grad_norm": 4.575770378112793,
77
+ "learning_rate": 4.576988155668359e-05,
78
+ "loss": 0.5304,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.27918781725888325,
83
+ "grad_norm": 1.6193996667861938,
84
+ "learning_rate": 4.534686971235195e-05,
85
+ "loss": 0.5276,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.30456852791878175,
90
+ "grad_norm": 1.5291751623153687,
91
+ "learning_rate": 4.492385786802031e-05,
92
+ "loss": 0.4513,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.3299492385786802,
97
+ "grad_norm": 2.916069984436035,
98
+ "learning_rate": 4.4500846023688664e-05,
99
+ "loss": 0.4674,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.3553299492385787,
104
+ "grad_norm": 2.216944456100464,
105
+ "learning_rate": 4.4077834179357024e-05,
106
+ "loss": 0.455,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.38071065989847713,
111
+ "grad_norm": 2.572436571121216,
112
+ "learning_rate": 4.365482233502538e-05,
113
+ "loss": 0.5409,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.40609137055837563,
118
+ "grad_norm": 2.5171351432800293,
119
+ "learning_rate": 4.3231810490693744e-05,
120
+ "loss": 0.5328,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.43147208121827413,
125
+ "grad_norm": 2.056858777999878,
126
+ "learning_rate": 4.2808798646362104e-05,
127
+ "loss": 0.515,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.45685279187817257,
132
+ "grad_norm": 1.575071930885315,
133
+ "learning_rate": 4.238578680203046e-05,
134
+ "loss": 0.5201,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.48223350253807107,
139
+ "grad_norm": 3.0955426692962646,
140
+ "learning_rate": 4.196277495769882e-05,
141
+ "loss": 0.4181,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.5076142131979695,
146
+ "grad_norm": 2.798530340194702,
147
+ "learning_rate": 4.153976311336718e-05,
148
+ "loss": 0.3292,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.5329949238578681,
153
+ "grad_norm": 2.3137614727020264,
154
+ "learning_rate": 4.111675126903554e-05,
155
+ "loss": 0.6242,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.5583756345177665,
160
+ "grad_norm": 1.7211440801620483,
161
+ "learning_rate": 4.069373942470389e-05,
162
+ "loss": 0.5459,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.583756345177665,
167
+ "grad_norm": 1.9022053480148315,
168
+ "learning_rate": 4.027072758037225e-05,
169
+ "loss": 0.4379,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.6091370558375635,
174
+ "grad_norm": 1.4369803667068481,
175
+ "learning_rate": 3.9847715736040605e-05,
176
+ "loss": 0.5092,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.6345177664974619,
181
+ "grad_norm": 1.8029989004135132,
182
+ "learning_rate": 3.942470389170897e-05,
183
+ "loss": 0.5678,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.6598984771573604,
188
+ "grad_norm": 2.690417528152466,
189
+ "learning_rate": 3.900169204737733e-05,
190
+ "loss": 0.4386,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.6852791878172588,
195
+ "grad_norm": 3.036839485168457,
196
+ "learning_rate": 3.8578680203045685e-05,
197
+ "loss": 0.5936,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.7106598984771574,
202
+ "grad_norm": 1.6595145463943481,
203
+ "learning_rate": 3.8155668358714046e-05,
204
+ "loss": 0.5406,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.7360406091370558,
209
+ "grad_norm": 4.433804035186768,
210
+ "learning_rate": 3.7732656514382406e-05,
211
+ "loss": 0.4874,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.7614213197969543,
216
+ "grad_norm": 2.043555498123169,
217
+ "learning_rate": 3.7309644670050766e-05,
218
+ "loss": 0.4603,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.7868020304568528,
223
+ "grad_norm": 2.553321123123169,
224
+ "learning_rate": 3.688663282571912e-05,
225
+ "loss": 0.5858,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.8121827411167513,
230
+ "grad_norm": 2.677241563796997,
231
+ "learning_rate": 3.646362098138748e-05,
232
+ "loss": 0.4095,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.8375634517766497,
237
+ "grad_norm": 3.408170461654663,
238
+ "learning_rate": 3.604060913705584e-05,
239
+ "loss": 0.6228,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.8629441624365483,
244
+ "grad_norm": 3.3228750228881836,
245
+ "learning_rate": 3.56175972927242e-05,
246
+ "loss": 0.4735,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.8883248730964467,
251
+ "grad_norm": 4.007043838500977,
252
+ "learning_rate": 3.519458544839256e-05,
253
+ "loss": 0.51,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.9137055837563451,
258
+ "grad_norm": 1.5816287994384766,
259
+ "learning_rate": 3.477157360406091e-05,
260
+ "loss": 0.5176,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.9390862944162437,
265
+ "grad_norm": 1.4616796970367432,
266
+ "learning_rate": 3.434856175972927e-05,
267
+ "loss": 0.286,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.9644670050761421,
272
+ "grad_norm": 4.507604122161865,
273
+ "learning_rate": 3.3925549915397633e-05,
274
+ "loss": 0.6015,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.9898477157360406,
279
+ "grad_norm": 1.7000436782836914,
280
+ "learning_rate": 3.3502538071065994e-05,
281
+ "loss": 0.3593,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 1.0,
286
+ "eval_loss": 0.512408971786499,
287
+ "eval_runtime": 8.5129,
288
+ "eval_samples_per_second": 92.565,
289
+ "eval_steps_per_second": 11.629,
290
+ "step": 394
291
+ },
292
+ {
293
+ "epoch": 1.015228426395939,
294
+ "grad_norm": 3.039973497390747,
295
+ "learning_rate": 3.307952622673435e-05,
296
+ "loss": 0.5087,
297
+ "step": 400
298
+ },
299
+ {
300
+ "epoch": 1.0406091370558375,
301
+ "grad_norm": 2.5285613536834717,
302
+ "learning_rate": 3.265651438240271e-05,
303
+ "loss": 0.4335,
304
+ "step": 410
305
+ },
306
+ {
307
+ "epoch": 1.0659898477157361,
308
+ "grad_norm": 1.5140420198440552,
309
+ "learning_rate": 3.223350253807107e-05,
310
+ "loss": 0.3759,
311
+ "step": 420
312
+ },
313
+ {
314
+ "epoch": 1.0913705583756346,
315
+ "grad_norm": 2.0796151161193848,
316
+ "learning_rate": 3.181049069373943e-05,
317
+ "loss": 0.6431,
318
+ "step": 430
319
+ },
320
+ {
321
+ "epoch": 1.116751269035533,
322
+ "grad_norm": 3.370028018951416,
323
+ "learning_rate": 3.138747884940779e-05,
324
+ "loss": 0.4801,
325
+ "step": 440
326
+ },
327
+ {
328
+ "epoch": 1.1421319796954315,
329
+ "grad_norm": 7.150363445281982,
330
+ "learning_rate": 3.096446700507614e-05,
331
+ "loss": 0.4856,
332
+ "step": 450
333
+ },
334
+ {
335
+ "epoch": 1.16751269035533,
336
+ "grad_norm": 4.6761860847473145,
337
+ "learning_rate": 3.05414551607445e-05,
338
+ "loss": 0.5973,
339
+ "step": 460
340
+ },
341
+ {
342
+ "epoch": 1.1928934010152283,
343
+ "grad_norm": 1.6876248121261597,
344
+ "learning_rate": 3.0118443316412858e-05,
345
+ "loss": 0.4265,
346
+ "step": 470
347
+ },
348
+ {
349
+ "epoch": 1.218274111675127,
350
+ "grad_norm": 2.2751359939575195,
351
+ "learning_rate": 2.969543147208122e-05,
352
+ "loss": 0.4307,
353
+ "step": 480
354
+ },
355
+ {
356
+ "epoch": 1.2436548223350254,
357
+ "grad_norm": 4.351472854614258,
358
+ "learning_rate": 2.927241962774958e-05,
359
+ "loss": 0.4978,
360
+ "step": 490
361
+ },
362
+ {
363
+ "epoch": 1.2690355329949239,
364
+ "grad_norm": 3.3015713691711426,
365
+ "learning_rate": 2.8849407783417938e-05,
366
+ "loss": 0.4159,
367
+ "step": 500
368
+ },
369
+ {
370
+ "epoch": 1.2944162436548223,
371
+ "grad_norm": 1.4904918670654297,
372
+ "learning_rate": 2.84263959390863e-05,
373
+ "loss": 0.512,
374
+ "step": 510
375
+ },
376
+ {
377
+ "epoch": 1.3197969543147208,
378
+ "grad_norm": 2.306755542755127,
379
+ "learning_rate": 2.800338409475465e-05,
380
+ "loss": 0.5745,
381
+ "step": 520
382
+ },
383
+ {
384
+ "epoch": 1.3451776649746192,
385
+ "grad_norm": 3.0485565662384033,
386
+ "learning_rate": 2.7580372250423015e-05,
387
+ "loss": 0.4243,
388
+ "step": 530
389
+ },
390
+ {
391
+ "epoch": 1.3705583756345177,
392
+ "grad_norm": 5.009148597717285,
393
+ "learning_rate": 2.715736040609137e-05,
394
+ "loss": 0.5187,
395
+ "step": 540
396
+ },
397
+ {
398
+ "epoch": 1.3959390862944163,
399
+ "grad_norm": 3.1525375843048096,
400
+ "learning_rate": 2.6734348561759732e-05,
401
+ "loss": 0.4693,
402
+ "step": 550
403
+ },
404
+ {
405
+ "epoch": 1.4213197969543148,
406
+ "grad_norm": 2.2495410442352295,
407
+ "learning_rate": 2.6311336717428085e-05,
408
+ "loss": 0.5011,
409
+ "step": 560
410
+ },
411
+ {
412
+ "epoch": 1.4467005076142132,
413
+ "grad_norm": 2.390629291534424,
414
+ "learning_rate": 2.588832487309645e-05,
415
+ "loss": 0.6348,
416
+ "step": 570
417
+ },
418
+ {
419
+ "epoch": 1.4720812182741116,
420
+ "grad_norm": 1.7083393335342407,
421
+ "learning_rate": 2.546531302876481e-05,
422
+ "loss": 0.4737,
423
+ "step": 580
424
+ },
425
+ {
426
+ "epoch": 1.49746192893401,
427
+ "grad_norm": 2.4492344856262207,
428
+ "learning_rate": 2.5042301184433166e-05,
429
+ "loss": 0.5164,
430
+ "step": 590
431
+ },
432
+ {
433
+ "epoch": 1.5228426395939088,
434
+ "grad_norm": 8.137948989868164,
435
+ "learning_rate": 2.4619289340101523e-05,
436
+ "loss": 0.4653,
437
+ "step": 600
438
+ },
439
+ {
440
+ "epoch": 1.548223350253807,
441
+ "grad_norm": 2.3943252563476562,
442
+ "learning_rate": 2.4196277495769883e-05,
443
+ "loss": 0.4019,
444
+ "step": 610
445
+ },
446
+ {
447
+ "epoch": 1.5736040609137056,
448
+ "grad_norm": 1.8179367780685425,
449
+ "learning_rate": 2.3773265651438243e-05,
450
+ "loss": 0.3428,
451
+ "step": 620
452
+ },
453
+ {
454
+ "epoch": 1.598984771573604,
455
+ "grad_norm": 2.3452229499816895,
456
+ "learning_rate": 2.33502538071066e-05,
457
+ "loss": 0.5092,
458
+ "step": 630
459
+ },
460
+ {
461
+ "epoch": 1.6243654822335025,
462
+ "grad_norm": 1.6915607452392578,
463
+ "learning_rate": 2.292724196277496e-05,
464
+ "loss": 0.5964,
465
+ "step": 640
466
+ },
467
+ {
468
+ "epoch": 1.649746192893401,
469
+ "grad_norm": 3.0646092891693115,
470
+ "learning_rate": 2.2504230118443317e-05,
471
+ "loss": 0.4237,
472
+ "step": 650
473
+ },
474
+ {
475
+ "epoch": 1.6751269035532994,
476
+ "grad_norm": 2.2048497200012207,
477
+ "learning_rate": 2.2081218274111677e-05,
478
+ "loss": 0.3713,
479
+ "step": 660
480
+ },
481
+ {
482
+ "epoch": 1.700507614213198,
483
+ "grad_norm": 3.50895094871521,
484
+ "learning_rate": 2.1658206429780033e-05,
485
+ "loss": 0.4798,
486
+ "step": 670
487
+ },
488
+ {
489
+ "epoch": 1.7258883248730963,
490
+ "grad_norm": 2.0043256282806396,
491
+ "learning_rate": 2.1235194585448394e-05,
492
+ "loss": 0.583,
493
+ "step": 680
494
+ },
495
+ {
496
+ "epoch": 1.751269035532995,
497
+ "grad_norm": 1.845831274986267,
498
+ "learning_rate": 2.0812182741116754e-05,
499
+ "loss": 0.5429,
500
+ "step": 690
501
+ },
502
+ {
503
+ "epoch": 1.7766497461928934,
504
+ "grad_norm": 1.7165807485580444,
505
+ "learning_rate": 2.038917089678511e-05,
506
+ "loss": 0.4464,
507
+ "step": 700
508
+ },
509
+ {
510
+ "epoch": 1.8020304568527918,
511
+ "grad_norm": 1.361329436302185,
512
+ "learning_rate": 1.996615905245347e-05,
513
+ "loss": 0.4246,
514
+ "step": 710
515
+ },
516
+ {
517
+ "epoch": 1.8274111675126905,
518
+ "grad_norm": 2.407137870788574,
519
+ "learning_rate": 1.9543147208121827e-05,
520
+ "loss": 0.4645,
521
+ "step": 720
522
+ },
523
+ {
524
+ "epoch": 1.8527918781725887,
525
+ "grad_norm": 2.324209213256836,
526
+ "learning_rate": 1.9120135363790187e-05,
527
+ "loss": 0.5234,
528
+ "step": 730
529
+ },
530
+ {
531
+ "epoch": 1.8781725888324874,
532
+ "grad_norm": 3.296576738357544,
533
+ "learning_rate": 1.8697123519458544e-05,
534
+ "loss": 0.557,
535
+ "step": 740
536
+ },
537
+ {
538
+ "epoch": 1.9035532994923858,
539
+ "grad_norm": 2.2372303009033203,
540
+ "learning_rate": 1.8274111675126904e-05,
541
+ "loss": 0.4279,
542
+ "step": 750
543
+ },
544
+ {
545
+ "epoch": 1.9289340101522843,
546
+ "grad_norm": 4.05513334274292,
547
+ "learning_rate": 1.785109983079526e-05,
548
+ "loss": 0.4958,
549
+ "step": 760
550
+ },
551
+ {
552
+ "epoch": 1.9543147208121827,
553
+ "grad_norm": 1.2267847061157227,
554
+ "learning_rate": 1.7428087986463625e-05,
555
+ "loss": 0.4711,
556
+ "step": 770
557
+ },
558
+ {
559
+ "epoch": 1.9796954314720812,
560
+ "grad_norm": 2.452728033065796,
561
+ "learning_rate": 1.700507614213198e-05,
562
+ "loss": 0.4696,
563
+ "step": 780
564
+ },
565
+ {
566
+ "epoch": 2.0,
567
+ "eval_loss": 0.47782018780708313,
568
+ "eval_runtime": 8.5194,
569
+ "eval_samples_per_second": 92.495,
570
+ "eval_steps_per_second": 11.621,
571
+ "step": 788
572
+ },
573
+ {
574
+ "epoch": 2.00507614213198,
575
+ "grad_norm": 1.9264096021652222,
576
+ "learning_rate": 1.658206429780034e-05,
577
+ "loss": 0.4061,
578
+ "step": 790
579
+ },
580
+ {
581
+ "epoch": 2.030456852791878,
582
+ "grad_norm": 1.578764796257019,
583
+ "learning_rate": 1.6159052453468698e-05,
584
+ "loss": 0.4168,
585
+ "step": 800
586
+ },
587
+ {
588
+ "epoch": 2.0558375634517767,
589
+ "grad_norm": 5.699263095855713,
590
+ "learning_rate": 1.5736040609137055e-05,
591
+ "loss": 0.4339,
592
+ "step": 810
593
+ },
594
+ {
595
+ "epoch": 2.081218274111675,
596
+ "grad_norm": 3.1477041244506836,
597
+ "learning_rate": 1.5313028764805415e-05,
598
+ "loss": 0.3623,
599
+ "step": 820
600
+ },
601
+ {
602
+ "epoch": 2.1065989847715736,
603
+ "grad_norm": 2.8351192474365234,
604
+ "learning_rate": 1.4890016920473774e-05,
605
+ "loss": 0.446,
606
+ "step": 830
607
+ },
608
+ {
609
+ "epoch": 2.1319796954314723,
610
+ "grad_norm": 6.005784511566162,
611
+ "learning_rate": 1.4467005076142132e-05,
612
+ "loss": 0.3802,
613
+ "step": 840
614
+ },
615
+ {
616
+ "epoch": 2.1573604060913705,
617
+ "grad_norm": 3.5085108280181885,
618
+ "learning_rate": 1.404399323181049e-05,
619
+ "loss": 0.4335,
620
+ "step": 850
621
+ },
622
+ {
623
+ "epoch": 2.182741116751269,
624
+ "grad_norm": 2.457703113555908,
625
+ "learning_rate": 1.362098138747885e-05,
626
+ "loss": 0.3866,
627
+ "step": 860
628
+ },
629
+ {
630
+ "epoch": 2.2081218274111674,
631
+ "grad_norm": 6.848071098327637,
632
+ "learning_rate": 1.3197969543147209e-05,
633
+ "loss": 0.3146,
634
+ "step": 870
635
+ },
636
+ {
637
+ "epoch": 2.233502538071066,
638
+ "grad_norm": 2.1162843704223633,
639
+ "learning_rate": 1.2774957698815568e-05,
640
+ "loss": 0.3747,
641
+ "step": 880
642
+ },
643
+ {
644
+ "epoch": 2.2588832487309647,
645
+ "grad_norm": 2.40061354637146,
646
+ "learning_rate": 1.2351945854483926e-05,
647
+ "loss": 0.5355,
648
+ "step": 890
649
+ },
650
+ {
651
+ "epoch": 2.284263959390863,
652
+ "grad_norm": 12.88298225402832,
653
+ "learning_rate": 1.1928934010152284e-05,
654
+ "loss": 0.3908,
655
+ "step": 900
656
+ },
657
+ {
658
+ "epoch": 2.3096446700507616,
659
+ "grad_norm": 6.1054463386535645,
660
+ "learning_rate": 1.1505922165820643e-05,
661
+ "loss": 0.3276,
662
+ "step": 910
663
+ },
664
+ {
665
+ "epoch": 2.33502538071066,
666
+ "grad_norm": 16.432769775390625,
667
+ "learning_rate": 1.1082910321489003e-05,
668
+ "loss": 0.2436,
669
+ "step": 920
670
+ },
671
+ {
672
+ "epoch": 2.3604060913705585,
673
+ "grad_norm": 9.989404678344727,
674
+ "learning_rate": 1.0659898477157361e-05,
675
+ "loss": 0.5455,
676
+ "step": 930
677
+ },
678
+ {
679
+ "epoch": 2.3857868020304567,
680
+ "grad_norm": 3.592336893081665,
681
+ "learning_rate": 1.023688663282572e-05,
682
+ "loss": 0.467,
683
+ "step": 940
684
+ },
685
+ {
686
+ "epoch": 2.4111675126903553,
687
+ "grad_norm": 4.606255531311035,
688
+ "learning_rate": 9.813874788494078e-06,
689
+ "loss": 0.4826,
690
+ "step": 950
691
+ },
692
+ {
693
+ "epoch": 2.436548223350254,
694
+ "grad_norm": 2.08891224861145,
695
+ "learning_rate": 9.390862944162437e-06,
696
+ "loss": 0.2995,
697
+ "step": 960
698
+ },
699
+ {
700
+ "epoch": 2.4619289340101522,
701
+ "grad_norm": 4.196601867675781,
702
+ "learning_rate": 8.967851099830795e-06,
703
+ "loss": 0.5598,
704
+ "step": 970
705
+ },
706
+ {
707
+ "epoch": 2.487309644670051,
708
+ "grad_norm": 2.653444290161133,
709
+ "learning_rate": 8.544839255499154e-06,
710
+ "loss": 0.4471,
711
+ "step": 980
712
+ },
713
+ {
714
+ "epoch": 2.512690355329949,
715
+ "grad_norm": 1.8868201971054077,
716
+ "learning_rate": 8.121827411167512e-06,
717
+ "loss": 0.4616,
718
+ "step": 990
719
+ },
720
+ {
721
+ "epoch": 2.5380710659898478,
722
+ "grad_norm": 5.2379045486450195,
723
+ "learning_rate": 7.698815566835872e-06,
724
+ "loss": 0.2389,
725
+ "step": 1000
726
+ },
727
+ {
728
+ "epoch": 2.563451776649746,
729
+ "grad_norm": 3.2069485187530518,
730
+ "learning_rate": 7.275803722504231e-06,
731
+ "loss": 0.3617,
732
+ "step": 1010
733
+ },
734
+ {
735
+ "epoch": 2.5888324873096447,
736
+ "grad_norm": 7.2437286376953125,
737
+ "learning_rate": 6.852791878172589e-06,
738
+ "loss": 0.6099,
739
+ "step": 1020
740
+ },
741
+ {
742
+ "epoch": 2.6142131979695433,
743
+ "grad_norm": 1.056522250175476,
744
+ "learning_rate": 6.429780033840948e-06,
745
+ "loss": 0.1708,
746
+ "step": 1030
747
+ },
748
+ {
749
+ "epoch": 2.6395939086294415,
750
+ "grad_norm": 7.495356559753418,
751
+ "learning_rate": 6.006768189509306e-06,
752
+ "loss": 0.3881,
753
+ "step": 1040
754
+ },
755
+ {
756
+ "epoch": 2.66497461928934,
757
+ "grad_norm": 4.80796480178833,
758
+ "learning_rate": 5.583756345177665e-06,
759
+ "loss": 0.3701,
760
+ "step": 1050
761
+ },
762
+ {
763
+ "epoch": 2.6903553299492384,
764
+ "grad_norm": 4.566274642944336,
765
+ "learning_rate": 5.160744500846024e-06,
766
+ "loss": 0.2963,
767
+ "step": 1060
768
+ },
769
+ {
770
+ "epoch": 2.715736040609137,
771
+ "grad_norm": 5.139068603515625,
772
+ "learning_rate": 4.737732656514383e-06,
773
+ "loss": 0.4711,
774
+ "step": 1070
775
+ },
776
+ {
777
+ "epoch": 2.7411167512690353,
778
+ "grad_norm": 5.687432289123535,
779
+ "learning_rate": 4.3147208121827415e-06,
780
+ "loss": 0.3127,
781
+ "step": 1080
782
+ },
783
+ {
784
+ "epoch": 2.766497461928934,
785
+ "grad_norm": 2.3243467807769775,
786
+ "learning_rate": 3.8917089678511e-06,
787
+ "loss": 0.3814,
788
+ "step": 1090
789
+ },
790
+ {
791
+ "epoch": 2.7918781725888326,
792
+ "grad_norm": 3.6257083415985107,
793
+ "learning_rate": 3.4686971235194584e-06,
794
+ "loss": 0.3983,
795
+ "step": 1100
796
+ },
797
+ {
798
+ "epoch": 2.817258883248731,
799
+ "grad_norm": 7.288547992706299,
800
+ "learning_rate": 3.0456852791878177e-06,
801
+ "loss": 0.3005,
802
+ "step": 1110
803
+ },
804
+ {
805
+ "epoch": 2.8426395939086295,
806
+ "grad_norm": 2.3332388401031494,
807
+ "learning_rate": 2.622673434856176e-06,
808
+ "loss": 0.4473,
809
+ "step": 1120
810
+ },
811
+ {
812
+ "epoch": 2.868020304568528,
813
+ "grad_norm": 14.024263381958008,
814
+ "learning_rate": 2.199661590524535e-06,
815
+ "loss": 0.4144,
816
+ "step": 1130
817
+ },
818
+ {
819
+ "epoch": 2.8934010152284264,
820
+ "grad_norm": 4.453007698059082,
821
+ "learning_rate": 1.7766497461928936e-06,
822
+ "loss": 0.2824,
823
+ "step": 1140
824
+ },
825
+ {
826
+ "epoch": 2.9187817258883246,
827
+ "grad_norm": 19.20343017578125,
828
+ "learning_rate": 1.353637901861252e-06,
829
+ "loss": 0.4371,
830
+ "step": 1150
831
+ },
832
+ {
833
+ "epoch": 2.9441624365482233,
834
+ "grad_norm": 8.754419326782227,
835
+ "learning_rate": 9.306260575296109e-07,
836
+ "loss": 0.3963,
837
+ "step": 1160
838
+ },
839
+ {
840
+ "epoch": 2.969543147208122,
841
+ "grad_norm": 4.8408589363098145,
842
+ "learning_rate": 5.076142131979695e-07,
843
+ "loss": 0.3653,
844
+ "step": 1170
845
+ },
846
+ {
847
+ "epoch": 2.99492385786802,
848
+ "grad_norm": 3.477670431137085,
849
+ "learning_rate": 8.460236886632826e-08,
850
+ "loss": 0.4175,
851
+ "step": 1180
852
+ }
853
+ ],
854
+ "logging_steps": 10,
855
+ "max_steps": 1182,
856
+ "num_input_tokens_seen": 0,
857
+ "num_train_epochs": 3,
858
+ "save_steps": 500,
859
+ "stateful_callbacks": {
860
+ "TrainerControl": {
861
+ "args": {
862
+ "should_epoch_stop": false,
863
+ "should_evaluate": false,
864
+ "should_log": false,
865
+ "should_save": true,
866
+ "should_training_stop": true
867
+ },
868
+ "attributes": {}
869
+ }
870
+ },
871
+ "total_flos": 1252611721691136.0,
872
+ "train_batch_size": 8,
873
+ "trial_name": null,
874
+ "trial_params": null
875
+ }
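This is the state at the end of the full run (step 1182, should_training_stop true); its log_history carries the complete loss curve, including the per-epoch eval_loss values (0.5124 after epoch 1, 0.4778 after epoch 2). A small sketch for reading those numbers back out of the file:

import json

with open("distilbert_ranking/checkpoint-1182/trainer_state.json") as f:
    state = json.load(f)

train_logs = [e for e in state["log_history"] if "loss" in e]       # per-step training entries
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]   # per-epoch evaluation entries

print(f"{len(train_logs)} logged training steps, last loss {train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"epoch {e['epoch']:.0f}: eval_loss {e['eval_loss']:.4f}")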
distilbert_ranking/checkpoint-1182/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b84117e184b790860c5640a9d712589f6b0aca90747571d9d65b366b184da82f
+ size 5176
distilbert_ranking/checkpoint-1182/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
distilbert_ranking_final/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "distilbert-base-uncased",
+ "activation": "gelu",
+ "architectures": [
+ "DistilBertForSequenceClassification"
+ ],
+ "attention_dropout": 0.1,
+ "dim": 768,
+ "dropout": 0.1,
+ "hidden_dim": 3072,
+ "initializer_range": 0.02,
+ "max_position_embeddings": 512,
+ "model_type": "distilbert",
+ "n_heads": 12,
+ "n_layers": 6,
+ "pad_token_id": 0,
+ "problem_type": "single_label_classification",
+ "qa_dropout": 0.1,
+ "seq_classif_dropout": 0.2,
+ "sinusoidal_pos_embds": false,
+ "tie_weights_": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.44.0",
+ "vocab_size": 30522
+ }
distilbert_ranking_final/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edf736462c0a29c2ac0441cd6fe59ee3f26b4bfdb6893e6961af067becbf5d40
+ size 267832560
distilbert_ranking_final/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
distilbert_ranking_final/tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_basic_tokenize": true,
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "never_split": null,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "DistilBertTokenizer",
+ "unk_token": "[UNK]"
+ }
distilbert_ranking_final/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b84117e184b790860c5640a9d712589f6b0aca90747571d9d65b366b184da82f
+ size 5176
distilbert_ranking_final/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
logs/events.out.tfevents.1726929739.9016b8275454.37.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:123b45f8fcfcc8a176830de916e90eed0fd85d65d7fcc94733603f297038f768
+ size 4763
logs/events.out.tfevents.1726929842.9016b8275454.37.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bce0c61835db5d6ef38227aee7c058076ec573effdfe9f3da4ba6e5da72ebaf
+ size 30767
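The two files under logs/ are TensorBoard event logs written during training. They can be opened with `tensorboard --logdir logs`, or read programmatically; a minimal sketch using TensorBoard's event reader (the available tag names depend on what the Trainer logged and are not shown in this diff):

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("logs")  # directory containing the events.out.tfevents.* files
acc.Reload()
print(acc.Tags())               # lists the scalar/tensor tags recorded in the event files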