TinyPixel committed on
Commit
b95b093
1 Parent(s): 8acd751

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -19,9 +19,9 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "dense_4h_to_h",
23
- "dense",
24
  "query_key_value",
 
 
25
  "dense_h_to_4h"
26
  ],
27
  "task_type": "CAUSAL_LM"
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
 
 
22
  "query_key_value",
23
+ "dense",
24
+ "dense_4h_to_h",
25
  "dense_h_to_4h"
26
  ],
27
  "task_type": "CAUSAL_LM"
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8bf82283c20826d6a4a34abaeea3f35b5c712d8aba82cad8dbe52e58882f6afc
3
  size 134235712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb7d053c5db954d9c46c98a456aa9c81806762e0254afbc771a6070b93b7b458
3
  size 134235712
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23f6e9d6616ed7000e7d8f69286e68440bfa3a52b5dd41b8f82d9c38fa8e9406
3
  size 268514874
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecb2bec54ed43396c2e0c0727d03f4f2ba604833feaa0ad1554b4d1b10bde55f
3
  size 268514874
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbd4b572d58637569be1d3beaf58aef4ab1657fe3e125abb46978c795638ef8b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd810bc6da3808c3f1be86070a1d84bdaa55b1f1611659061bb046875486c2d4
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68ca608482c17f9314e1c94cd309a18be088851d4c0591a9306e6a01c952c9f3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5096170d1d34df4d83b07caa0860cfb37d7b3659bf8c113929aaa3a4f71f7094
3
  size 1064
trainer_state.json CHANGED
@@ -1,451 +1,619 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.953846153846154,
5
  "eval_steps": 500,
6
- "global_step": 144,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04,
13
- "learning_rate": 8e-05,
14
- "loss": 2.2151,
15
  "step": 2
16
  },
17
  {
18
- "epoch": 0.08,
19
- "learning_rate": 0.00016,
20
- "loss": 2.3087,
21
  "step": 4
22
  },
23
  {
24
- "epoch": 0.12,
25
- "learning_rate": 0.0001999744599547812,
26
- "loss": 2.3667,
27
  "step": 6
28
  },
29
  {
30
- "epoch": 0.16,
31
- "learning_rate": 0.00019977021786163598,
32
- "loss": 2.5011,
33
  "step": 8
34
  },
35
  {
36
- "epoch": 0.21,
37
- "learning_rate": 0.00019936215093023884,
38
- "loss": 2.4286,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.25,
43
- "learning_rate": 0.00019875109281794825,
44
- "loss": 2.4203,
45
  "step": 12
46
  },
47
  {
48
- "epoch": 0.29,
49
- "learning_rate": 0.00019793829188147406,
50
- "loss": 2.2626,
51
  "step": 14
52
  },
53
  {
54
- "epoch": 0.33,
55
- "learning_rate": 0.00019692540862655585,
56
- "loss": 2.2,
57
  "step": 16
58
  },
59
  {
60
- "epoch": 0.37,
61
- "learning_rate": 0.00019571451231564525,
62
- "loss": 2.5432,
63
  "step": 18
64
  },
65
  {
66
- "epoch": 0.41,
67
- "learning_rate": 0.00019430807674052092,
68
- "loss": 2.399,
69
  "step": 20
70
  },
71
  {
72
- "epoch": 0.45,
73
- "learning_rate": 0.00019270897516847403,
74
- "loss": 2.4603,
75
  "step": 22
76
  },
77
  {
78
- "epoch": 0.49,
79
- "learning_rate": 0.00019092047447238773,
80
- "loss": 2.2575,
81
  "step": 24
82
  },
83
  {
84
- "epoch": 0.53,
85
- "learning_rate": 0.00018894622845670283,
86
- "loss": 2.2481,
87
  "step": 26
88
  },
89
  {
90
- "epoch": 0.57,
91
- "learning_rate": 0.00018679027039290497,
92
- "loss": 2.2629,
93
  "step": 28
94
  },
95
  {
96
- "epoch": 0.62,
97
- "learning_rate": 0.00018445700477978205,
98
- "loss": 2.5683,
99
  "step": 30
100
  },
101
  {
102
- "epoch": 0.66,
103
- "learning_rate": 0.00018195119834528534,
104
- "loss": 2.3422,
105
  "step": 32
106
  },
107
  {
108
- "epoch": 0.7,
109
- "learning_rate": 0.00017927797030837768,
110
- "loss": 2.3204,
111
  "step": 34
112
  },
113
  {
114
- "epoch": 0.74,
115
- "learning_rate": 0.0001764427819207624,
116
- "loss": 2.318,
117
  "step": 36
118
  },
119
  {
120
- "epoch": 0.78,
121
- "learning_rate": 0.00017345142530985887,
122
- "loss": 2.2508,
123
  "step": 38
124
  },
125
  {
126
- "epoch": 0.82,
127
- "learning_rate": 0.00017031001164581828,
128
- "loss": 2.2968,
129
  "step": 40
130
  },
131
  {
132
- "epoch": 0.86,
133
- "learning_rate": 0.0001670249586567531,
134
- "loss": 2.2603,
135
  "step": 42
136
  },
137
  {
138
- "epoch": 0.9,
139
- "learning_rate": 0.0001636029775176862,
140
- "loss": 2.4402,
141
  "step": 44
142
  },
143
  {
144
- "epoch": 0.94,
145
- "learning_rate": 0.00016005105914000507,
146
- "loss": 2.3397,
147
  "step": 46
148
  },
149
  {
150
- "epoch": 0.98,
151
- "learning_rate": 0.0001563764598894301,
152
- "loss": 2.2608,
153
  "step": 48
154
  },
155
  {
156
- "epoch": 1.03,
157
- "learning_rate": 0.00015258668676167546,
158
- "loss": 2.0417,
159
  "step": 50
160
  },
161
  {
162
- "epoch": 1.07,
163
- "learning_rate": 0.000148689482046087,
164
- "loss": 2.297,
165
  "step": 52
166
  },
167
  {
168
- "epoch": 1.11,
169
- "learning_rate": 0.00014469280750858854,
170
- "loss": 2.1616,
171
  "step": 54
172
  },
173
  {
174
- "epoch": 1.15,
175
- "learning_rate": 0.00014060482812625055,
176
- "loss": 2.2691,
177
  "step": 56
178
  },
179
  {
180
- "epoch": 1.19,
181
- "learning_rate": 0.00013643389540670962,
182
- "loss": 2.2967,
183
  "step": 58
184
  },
185
  {
186
- "epoch": 1.23,
187
- "learning_rate": 0.0001321885303265172,
188
- "loss": 2.1486,
189
  "step": 60
190
  },
191
  {
192
- "epoch": 1.27,
193
- "learning_rate": 0.0001278774059232723,
194
- "loss": 2.0864,
195
  "step": 62
196
  },
197
  {
198
- "epoch": 1.31,
199
- "learning_rate": 0.0001235093295771032,
200
- "loss": 2.044,
201
  "step": 64
202
  },
203
  {
204
- "epoch": 1.35,
205
- "learning_rate": 0.00011909322501769406,
206
- "loss": 2.3136,
207
  "step": 66
208
  },
209
  {
210
- "epoch": 1.39,
211
- "learning_rate": 0.00011463811409361667,
212
- "loss": 2.3556,
213
  "step": 68
214
  },
215
  {
216
- "epoch": 1.44,
217
- "learning_rate": 0.00011015309834121081,
218
- "loss": 2.4312,
219
  "step": 70
220
  },
221
  {
222
- "epoch": 1.48,
223
- "learning_rate": 0.00010564734039066699,
224
- "loss": 2.1922,
225
  "step": 72
226
  },
227
  {
228
- "epoch": 1.52,
229
- "learning_rate": 0.00010113004524729799,
230
- "loss": 2.0871,
231
  "step": 74
232
  },
233
  {
234
- "epoch": 1.56,
235
- "learning_rate": 9.661044148624037e-05,
236
- "loss": 2.1988,
237
  "step": 76
238
  },
239
  {
240
- "epoch": 1.6,
241
- "learning_rate": 9.209776239900453e-05,
242
- "loss": 2.255,
243
  "step": 78
244
  },
245
  {
246
- "epoch": 1.64,
247
- "learning_rate": 8.760122713038881e-05,
248
- "loss": 2.3782,
249
  "step": 80
250
  },
251
  {
252
- "epoch": 1.68,
253
- "learning_rate": 8.313002184429529e-05,
254
- "loss": 2.196,
255
  "step": 82
256
  },
257
  {
258
- "epoch": 1.72,
259
- "learning_rate": 7.869328095692312e-05,
260
- "loss": 2.3759,
261
  "step": 84
262
  },
263
  {
264
- "epoch": 1.76,
265
- "learning_rate": 7.430006847567972e-05,
266
- "loss": 2.1787,
267
  "step": 86
268
  },
269
  {
270
- "epoch": 1.81,
271
- "learning_rate": 6.995935948193294e-05,
272
- "loss": 2.2547,
273
  "step": 88
274
  },
275
  {
276
- "epoch": 1.85,
277
- "learning_rate": 6.568002179543409e-05,
278
- "loss": 2.4197,
279
  "step": 90
280
  },
281
  {
282
- "epoch": 1.89,
283
- "learning_rate": 6.147079785787038e-05,
284
- "loss": 2.3039,
285
  "step": 92
286
  },
287
  {
288
- "epoch": 1.93,
289
- "learning_rate": 5.734028687255751e-05,
290
- "loss": 2.2513,
291
  "step": 94
292
  },
293
  {
294
- "epoch": 1.97,
295
- "learning_rate": 5.329692723675994e-05,
296
- "loss": 2.1533,
297
  "step": 96
298
  },
299
  {
300
- "epoch": 2.01,
301
- "learning_rate": 4.934897930252886e-05,
302
- "loss": 2.0451,
303
  "step": 98
304
  },
305
  {
306
- "epoch": 2.05,
307
- "learning_rate": 4.550450850127625e-05,
308
- "loss": 2.02,
309
  "step": 100
310
  },
311
  {
312
- "epoch": 2.09,
313
- "learning_rate": 4.1771368866560665e-05,
314
- "loss": 2.2731,
315
  "step": 102
316
  },
317
  {
318
- "epoch": 2.13,
319
- "learning_rate": 3.815718698874672e-05,
320
- "loss": 2.4117,
321
  "step": 104
322
  },
323
  {
324
- "epoch": 2.17,
325
- "learning_rate": 3.466934643431795e-05,
326
- "loss": 2.2761,
327
  "step": 106
328
  },
329
  {
330
- "epoch": 2.22,
331
- "learning_rate": 3.131497266167357e-05,
332
- "loss": 2.3457,
333
  "step": 108
334
  },
335
  {
336
- "epoch": 2.26,
337
- "learning_rate": 2.81009184642253e-05,
338
- "loss": 1.9155,
339
  "step": 110
340
  },
341
  {
342
- "epoch": 2.3,
343
- "learning_rate": 2.5033749970533015e-05,
344
- "loss": 2.1173,
345
  "step": 112
346
  },
347
  {
348
- "epoch": 2.34,
349
- "learning_rate": 2.2119733230080408e-05,
350
- "loss": 2.2136,
351
  "step": 114
352
  },
353
  {
354
- "epoch": 2.38,
355
- "learning_rate": 1.9364821412094857e-05,
356
- "loss": 2.432,
357
  "step": 116
358
  },
359
  {
360
- "epoch": 2.42,
361
- "learning_rate": 1.6774642643563953e-05,
362
- "loss": 2.2785,
363
  "step": 118
364
  },
365
  {
366
- "epoch": 2.46,
367
- "learning_rate": 1.4354488511294417e-05,
368
- "loss": 2.1617,
369
  "step": 120
370
  },
371
  {
372
- "epoch": 2.5,
373
- "learning_rate": 1.2109303251503434e-05,
374
- "loss": 2.1157,
375
  "step": 122
376
  },
377
  {
378
- "epoch": 2.54,
379
- "learning_rate": 1.0043673649027518e-05,
380
- "loss": 2.1812,
381
  "step": 124
382
  },
383
  {
384
- "epoch": 2.58,
385
- "learning_rate": 8.161819666783888e-06,
386
- "loss": 2.0722,
387
  "step": 126
388
  },
389
  {
390
- "epoch": 2.63,
391
- "learning_rate": 6.467585824627887e-06,
392
- "loss": 2.3153,
393
  "step": 128
394
  },
395
  {
396
- "epoch": 2.67,
397
- "learning_rate": 4.964433345219355e-06,
398
- "loss": 2.2884,
399
  "step": 130
400
  },
401
  {
402
- "epoch": 2.71,
403
- "learning_rate": 3.655433082942972e-06,
404
- "loss": 2.0102,
405
  "step": 132
406
  },
407
  {
408
- "epoch": 2.75,
409
- "learning_rate": 2.5432592503288e-06,
410
- "loss": 1.9512,
411
  "step": 134
412
  },
413
  {
414
- "epoch": 2.79,
415
- "learning_rate": 1.6301839547892328e-06,
416
- "loss": 2.2504,
417
  "step": 136
418
  },
419
  {
420
- "epoch": 2.83,
421
- "learning_rate": 9.180725568338044e-07,
422
- "loss": 2.217,
423
  "step": 138
424
  },
425
  {
426
- "epoch": 2.87,
427
- "learning_rate": 4.0837985924448984e-07,
428
- "loss": 2.331,
429
  "step": 140
430
  },
431
  {
432
- "epoch": 2.91,
433
- "learning_rate": 1.0214713499706597e-07,
434
- "loss": 2.1635,
435
  "step": 142
436
  },
437
  {
438
- "epoch": 2.95,
439
- "learning_rate": 0.0,
440
- "loss": 2.3516,
441
  "step": 144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  }
443
  ],
444
  "logging_steps": 2,
445
- "max_steps": 144,
446
  "num_train_epochs": 3,
447
  "save_steps": 500,
448
- "total_flos": 6393487599157248.0,
449
  "trial_name": null,
450
  "trial_params": null
451
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9972041006523766,
5
  "eval_steps": 500,
6
+ "global_step": 201,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 5.714285714285714e-05,
14
+ "loss": 2.3586,
15
  "step": 2
16
  },
17
  {
18
+ "epoch": 0.06,
19
+ "learning_rate": 0.00011428571428571428,
20
+ "loss": 2.162,
21
  "step": 4
22
  },
23
  {
24
+ "epoch": 0.09,
25
+ "learning_rate": 0.00017142857142857143,
26
+ "loss": 2.0177,
27
  "step": 6
28
  },
29
  {
30
+ "epoch": 0.12,
31
+ "learning_rate": 0.00019998688836656323,
32
+ "loss": 1.9121,
33
  "step": 8
34
  },
35
  {
36
+ "epoch": 0.15,
37
+ "learning_rate": 0.0001998820159279591,
38
+ "loss": 1.7096,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.18,
43
+ "learning_rate": 0.00019967238104745696,
44
+ "loss": 1.962,
45
  "step": 12
46
  },
47
  {
48
+ "epoch": 0.21,
49
+ "learning_rate": 0.00019935820360309777,
50
+ "loss": 1.9894,
51
  "step": 14
52
  },
53
  {
54
+ "epoch": 0.24,
55
+ "learning_rate": 0.00019893981312363562,
56
+ "loss": 1.9644,
57
  "step": 16
58
  },
59
  {
60
+ "epoch": 0.27,
61
+ "learning_rate": 0.00019841764844290744,
62
+ "loss": 2.76,
63
  "step": 18
64
  },
65
  {
66
+ "epoch": 0.3,
67
+ "learning_rate": 0.00019779225723955707,
68
+ "loss": 1.9172,
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.33,
73
+ "learning_rate": 0.00019706429546259593,
74
+ "loss": 1.7235,
75
  "step": 22
76
  },
77
  {
78
+ "epoch": 0.36,
79
+ "learning_rate": 0.00019623452664340306,
80
+ "loss": 1.7437,
81
  "step": 24
82
  },
83
  {
84
+ "epoch": 0.39,
85
+ "learning_rate": 0.0001953038210948861,
86
+ "loss": 1.6974,
87
  "step": 26
88
  },
89
  {
90
+ "epoch": 0.42,
91
+ "learning_rate": 0.00019427315499864344,
92
+ "loss": 1.6471,
93
  "step": 28
94
  },
95
  {
96
+ "epoch": 0.45,
97
+ "learning_rate": 0.00019314360938108425,
98
+ "loss": 1.795,
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.48,
103
+ "learning_rate": 0.00019191636897958122,
104
+ "loss": 1.7032,
105
  "step": 32
106
  },
107
  {
108
+ "epoch": 0.51,
109
+ "learning_rate": 0.0001905927209998447,
110
+ "loss": 1.9127,
111
  "step": 34
112
  },
113
  {
114
+ "epoch": 0.54,
115
+ "learning_rate": 0.00018917405376582145,
116
+ "loss": 1.7574,
117
  "step": 36
118
  },
119
  {
120
+ "epoch": 0.57,
121
+ "learning_rate": 0.0001876618552635348,
122
+ "loss": 1.6811,
123
  "step": 38
124
  },
125
  {
126
+ "epoch": 0.6,
127
+ "learning_rate": 0.00018605771158039253,
128
+ "loss": 1.725,
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.63,
133
+ "learning_rate": 0.00018436330524160047,
134
+ "loss": 1.4664,
135
  "step": 42
136
  },
137
  {
138
+ "epoch": 0.66,
139
+ "learning_rate": 0.00018258041344542566,
140
+ "loss": 1.6107,
141
  "step": 44
142
  },
143
  {
144
+ "epoch": 0.69,
145
+ "learning_rate": 0.00018071090619916093,
146
+ "loss": 1.8359,
147
  "step": 46
148
  },
149
  {
150
+ "epoch": 0.72,
151
+ "learning_rate": 0.00017875674435774547,
152
+ "loss": 1.6842,
153
  "step": 48
154
  },
155
  {
156
+ "epoch": 0.75,
157
+ "learning_rate": 0.00017671997756709863,
158
+ "loss": 2.5867,
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.78,
163
+ "learning_rate": 0.0001746027421143246,
164
+ "loss": 1.7532,
165
  "step": 52
166
  },
167
  {
168
+ "epoch": 0.81,
169
+ "learning_rate": 0.00017240725868704218,
170
+ "loss": 1.7108,
171
  "step": 54
172
  },
173
  {
174
+ "epoch": 0.84,
175
+ "learning_rate": 0.00017013583004418993,
176
+ "loss": 1.8096,
177
  "step": 56
178
  },
179
  {
180
+ "epoch": 0.86,
181
+ "learning_rate": 0.00016779083860075033,
182
+ "loss": 1.3886,
183
  "step": 58
184
  },
185
  {
186
+ "epoch": 0.89,
187
+ "learning_rate": 0.00016537474392892528,
188
+ "loss": 1.6365,
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.92,
193
+ "learning_rate": 0.00016289008017838445,
194
+ "loss": 1.5767,
195
  "step": 62
196
  },
197
  {
198
+ "epoch": 0.95,
199
+ "learning_rate": 0.00016033945341829248,
200
+ "loss": 1.5843,
201
  "step": 64
202
  },
203
  {
204
+ "epoch": 0.98,
205
+ "learning_rate": 0.00015772553890390197,
206
+ "loss": 1.8543,
207
  "step": 66
208
  },
209
  {
210
+ "epoch": 1.01,
211
+ "learning_rate": 0.00015505107827058036,
212
+ "loss": 2.2743,
213
  "step": 68
214
  },
215
  {
216
+ "epoch": 1.04,
217
+ "learning_rate": 0.000152318876658213,
218
+ "loss": 2.2094,
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 1.07,
223
+ "learning_rate": 0.00014953179976899878,
224
+ "loss": 1.6459,
225
  "step": 72
226
  },
227
  {
228
+ "epoch": 1.1,
229
+ "learning_rate": 0.00014669277086172406,
230
+ "loss": 1.7053,
231
  "step": 74
232
  },
233
  {
234
+ "epoch": 1.13,
235
+ "learning_rate": 0.00014380476768566824,
236
+ "loss": 1.5764,
237
  "step": 76
238
  },
239
  {
240
+ "epoch": 1.16,
241
+ "learning_rate": 0.00014087081935735564,
242
+ "loss": 1.3565,
243
  "step": 78
244
  },
245
  {
246
+ "epoch": 1.19,
247
+ "learning_rate": 0.00013789400318343068,
248
+ "loss": 1.4764,
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 1.22,
253
+ "learning_rate": 0.00013487744143298822,
254
+ "loss": 1.5535,
255
  "step": 82
256
  },
257
  {
258
+ "epoch": 1.25,
259
+ "learning_rate": 0.0001318242980627444,
260
+ "loss": 1.5302,
261
  "step": 84
262
  },
263
  {
264
+ "epoch": 1.28,
265
+ "learning_rate": 0.00012873777539848283,
266
+ "loss": 1.8395,
267
  "step": 86
268
  },
269
  {
270
+ "epoch": 1.31,
271
+ "learning_rate": 0.00012562111077625722,
272
+ "loss": 1.6544,
273
  "step": 88
274
  },
275
  {
276
+ "epoch": 1.34,
277
+ "learning_rate": 0.00012247757314687297,
278
+ "loss": 1.574,
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 1.37,
283
+ "learning_rate": 0.00011931045964720881,
284
+ "loss": 1.5317,
285
  "step": 92
286
  },
287
  {
288
+ "epoch": 1.4,
289
+ "learning_rate": 0.00011612309214197599,
290
+ "loss": 1.3428,
291
  "step": 94
292
  },
293
  {
294
+ "epoch": 1.43,
295
+ "learning_rate": 0.00011291881373954065,
296
+ "loss": 1.5308,
297
  "step": 96
298
  },
299
  {
300
+ "epoch": 1.46,
301
+ "learning_rate": 0.00010970098528546481,
302
+ "loss": 1.5653,
303
  "step": 98
304
  },
305
  {
306
+ "epoch": 1.49,
307
+ "learning_rate": 0.00010647298183744359,
308
+ "loss": 1.5217,
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 1.52,
313
+ "learning_rate": 0.00010323818912533561,
314
+ "loss": 2.0985,
315
  "step": 102
316
  },
317
  {
318
+ "epoch": 1.55,
319
+ "learning_rate": 0.0001,
320
+ "loss": 1.6621,
321
  "step": 104
322
  },
323
  {
324
+ "epoch": 1.58,
325
+ "learning_rate": 9.676181087466444e-05,
326
+ "loss": 1.5929,
327
  "step": 106
328
  },
329
  {
330
+ "epoch": 1.61,
331
+ "learning_rate": 9.352701816255643e-05,
332
+ "loss": 1.6037,
333
  "step": 108
334
  },
335
  {
336
+ "epoch": 1.64,
337
+ "learning_rate": 9.02990147145352e-05,
338
+ "loss": 1.3565,
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 1.67,
343
+ "learning_rate": 8.70811862604594e-05,
344
+ "loss": 1.4116,
345
  "step": 112
346
  },
347
  {
348
+ "epoch": 1.7,
349
+ "learning_rate": 8.387690785802402e-05,
350
+ "loss": 1.6237,
351
  "step": 114
352
  },
353
  {
354
+ "epoch": 1.73,
355
+ "learning_rate": 8.068954035279121e-05,
356
+ "loss": 1.7385,
357
  "step": 116
358
  },
359
  {
360
+ "epoch": 1.76,
361
+ "learning_rate": 7.75224268531271e-05,
362
+ "loss": 2.2754,
363
  "step": 118
364
  },
365
  {
366
+ "epoch": 1.79,
367
+ "learning_rate": 7.437888922374276e-05,
368
+ "loss": 1.5661,
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 1.82,
373
+ "learning_rate": 7.126222460151719e-05,
374
+ "loss": 1.7311,
375
  "step": 122
376
  },
377
  {
378
+ "epoch": 1.85,
379
+ "learning_rate": 6.817570193725564e-05,
380
+ "loss": 1.5312,
381
  "step": 124
382
  },
383
  {
384
+ "epoch": 1.88,
385
+ "learning_rate": 6.512255856701177e-05,
386
+ "loss": 1.391,
387
  "step": 126
388
  },
389
  {
390
+ "epoch": 1.91,
391
+ "learning_rate": 6.210599681656933e-05,
392
+ "loss": 1.3985,
393
  "step": 128
394
  },
395
  {
396
+ "epoch": 1.94,
397
+ "learning_rate": 5.9129180642644414e-05,
398
+ "loss": 1.5069,
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 1.97,
403
+ "learning_rate": 5.6195232314331766e-05,
404
+ "loss": 1.4259,
405
  "step": 132
406
  },
407
  {
408
+ "epoch": 2.0,
409
+ "learning_rate": 5.3307229138275936e-05,
410
+ "loss": 1.5473,
411
  "step": 134
412
  },
413
  {
414
+ "epoch": 2.03,
415
+ "learning_rate": 5.0468200231001286e-05,
416
+ "loss": 1.9502,
417
  "step": 136
418
  },
419
  {
420
+ "epoch": 2.06,
421
+ "learning_rate": 4.768112334178699e-05,
422
+ "loss": 1.6592,
423
  "step": 138
424
  },
425
  {
426
+ "epoch": 2.09,
427
+ "learning_rate": 4.494892172941965e-05,
428
+ "loss": 1.5564,
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 2.12,
433
+ "learning_rate": 4.227446109609809e-05,
434
+ "loss": 1.6098,
435
  "step": 142
436
  },
437
  {
438
+ "epoch": 2.15,
439
+ "learning_rate": 3.966054658170754e-05,
440
+ "loss": 1.2925,
441
  "step": 144
442
+ },
443
+ {
444
+ "epoch": 2.18,
445
+ "learning_rate": 3.710991982161555e-05,
446
+ "loss": 1.4613,
447
+ "step": 146
448
+ },
449
+ {
450
+ "epoch": 2.21,
451
+ "learning_rate": 3.4625256071074773e-05,
452
+ "loss": 1.4148,
453
+ "step": 148
454
+ },
455
+ {
456
+ "epoch": 2.24,
457
+ "learning_rate": 3.2209161399249674e-05,
458
+ "loss": 1.3662,
459
+ "step": 150
460
+ },
461
+ {
462
+ "epoch": 2.27,
463
+ "learning_rate": 2.9864169955810084e-05,
464
+ "loss": 2.1277,
465
+ "step": 152
466
+ },
467
+ {
468
+ "epoch": 2.3,
469
+ "learning_rate": 2.759274131295787e-05,
470
+ "loss": 1.8296,
471
+ "step": 154
472
+ },
473
+ {
474
+ "epoch": 2.33,
475
+ "learning_rate": 2.5397257885675397e-05,
476
+ "loss": 1.609,
477
+ "step": 156
478
+ },
479
+ {
480
+ "epoch": 2.36,
481
+ "learning_rate": 2.3280022432901383e-05,
482
+ "loss": 1.6705,
483
+ "step": 158
484
+ },
485
+ {
486
+ "epoch": 2.39,
487
+ "learning_rate": 2.1243255642254578e-05,
488
+ "loss": 1.4399,
489
+ "step": 160
490
+ },
491
+ {
492
+ "epoch": 2.42,
493
+ "learning_rate": 1.9289093800839066e-05,
494
+ "loss": 1.271,
495
+ "step": 162
496
+ },
497
+ {
498
+ "epoch": 2.45,
499
+ "learning_rate": 1.741958655457436e-05,
500
+ "loss": 1.3362,
501
+ "step": 164
502
+ },
503
+ {
504
+ "epoch": 2.48,
505
+ "learning_rate": 1.563669475839956e-05,
506
+ "loss": 1.3983,
507
+ "step": 166
508
+ },
509
+ {
510
+ "epoch": 2.51,
511
+ "learning_rate": 1.3942288419607475e-05,
512
+ "loss": 2.5072,
513
+ "step": 168
514
+ },
515
+ {
516
+ "epoch": 2.53,
517
+ "learning_rate": 1.233814473646524e-05,
518
+ "loss": 1.8437,
519
+ "step": 170
520
+ },
521
+ {
522
+ "epoch": 2.56,
523
+ "learning_rate": 1.0825946234178574e-05,
524
+ "loss": 1.6483,
525
+ "step": 172
526
+ },
527
+ {
528
+ "epoch": 2.59,
529
+ "learning_rate": 9.407279000155312e-06,
530
+ "loss": 1.5843,
531
+ "step": 174
532
+ },
533
+ {
534
+ "epoch": 2.62,
535
+ "learning_rate": 8.083631020418791e-06,
536
+ "loss": 1.231,
537
+ "step": 176
538
+ },
539
+ {
540
+ "epoch": 2.65,
541
+ "learning_rate": 6.856390618915775e-06,
542
+ "loss": 1.3115,
543
+ "step": 178
544
+ },
545
+ {
546
+ "epoch": 2.68,
547
+ "learning_rate": 5.726845001356573e-06,
548
+ "loss": 1.4052,
549
+ "step": 180
550
+ },
551
+ {
552
+ "epoch": 2.71,
553
+ "learning_rate": 4.6961789051139124e-06,
554
+ "loss": 1.0685,
555
+ "step": 182
556
+ },
557
+ {
558
+ "epoch": 2.74,
559
+ "learning_rate": 3.7654733565969826e-06,
560
+ "loss": 2.13,
561
+ "step": 184
562
+ },
563
+ {
564
+ "epoch": 2.77,
565
+ "learning_rate": 2.9357045374040825e-06,
566
+ "loss": 1.6736,
567
+ "step": 186
568
+ },
569
+ {
570
+ "epoch": 2.8,
571
+ "learning_rate": 2.2077427604429433e-06,
572
+ "loss": 1.6618,
573
+ "step": 188
574
+ },
575
+ {
576
+ "epoch": 2.83,
577
+ "learning_rate": 1.5823515570925763e-06,
578
+ "loss": 1.6082,
579
+ "step": 190
580
+ },
581
+ {
582
+ "epoch": 2.86,
583
+ "learning_rate": 1.0601868763643996e-06,
584
+ "loss": 1.4342,
585
+ "step": 192
586
+ },
587
+ {
588
+ "epoch": 2.89,
589
+ "learning_rate": 6.41796396902239e-07,
590
+ "loss": 1.2327,
591
+ "step": 194
592
+ },
593
+ {
594
+ "epoch": 2.92,
595
+ "learning_rate": 3.2761895254306287e-07,
596
+ "loss": 1.4817,
597
+ "step": 196
598
+ },
599
+ {
600
+ "epoch": 2.95,
601
+ "learning_rate": 1.179840720409331e-07,
602
+ "loss": 1.015,
603
+ "step": 198
604
+ },
605
+ {
606
+ "epoch": 2.98,
607
+ "learning_rate": 1.3111633436779791e-08,
608
+ "loss": 1.6181,
609
+ "step": 200
610
  }
611
  ],
612
  "logging_steps": 2,
613
+ "max_steps": 201,
614
  "num_train_epochs": 3,
615
  "save_steps": 500,
616
+ "total_flos": 8958427177402368.0,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fe2e827eb6206105b07079a3b30f17479fed73be174922d87993b1034d0bd65
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b37df8cd675a96fd62739792548139099638fde02917db6a4e57e9644200937
3
  size 4600