Files changed (7) hide show
  1. optimizer.pt +3 -0
  2. pytorch_model.bin +3 -0
  3. rng_state.pth +3 -0
  4. scaler.pt +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +442 -0
  7. training_args.bin +3 -0
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ca2f652b50161f1f365baf8d7263a0de21b273291ddee69e160ae8137f775f4
3
+ size 160172869
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2216405c6e3b4b6c318003c5e38e29c1a7a0ce2eb21abf474114378e8c274c9f
3
+ size 80114765
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9defd0d2ac7cea09127fefb20385f7a54eb2860c6bd9a32502c3c5a52f9ad55c
3
+ size 14575
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e8bfa24714ec568e1cb11f40b8e7b5945102f5516627c6bc17c400ab29513c9
3
+ size 557
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9074f5d5b0ae79eb8de0c0b12be33d42a5246074a74de638af3c1b2e70535446
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.110647439956665,
3
+ "best_model_checkpoint": "outs/instuct_chat_50k/checkpoint-2000",
4
+ "epoch": 0.23353573096683794,
5
+ "global_step": 3000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.0,
12
+ "learning_rate": 1.5568240788790864e-07,
13
+ "loss": 1.1388,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 1.5412558380902958e-05,
19
+ "loss": 1.5551,
20
+ "step": 100
21
+ },
22
+ {
23
+ "epoch": 0.01,
24
+ "eval_loss": 1.2936956882476807,
25
+ "eval_runtime": 31.4406,
26
+ "eval_samples_per_second": 6.361,
27
+ "eval_steps_per_second": 0.795,
28
+ "step": 100
29
+ },
30
+ {
31
+ "epoch": 0.02,
32
+ "learning_rate": 3.098079916969382e-05,
33
+ "loss": 1.4423,
34
+ "step": 200
35
+ },
36
+ {
37
+ "epoch": 0.02,
38
+ "eval_loss": 1.227220892906189,
39
+ "eval_runtime": 31.4526,
40
+ "eval_samples_per_second": 6.359,
41
+ "eval_steps_per_second": 0.795,
42
+ "step": 200
43
+ },
44
+ {
45
+ "epoch": 0.02,
46
+ "learning_rate": 4.6393357550596784e-05,
47
+ "loss": 1.3335,
48
+ "step": 300
49
+ },
50
+ {
51
+ "epoch": 0.02,
52
+ "eval_loss": 1.2049953937530518,
53
+ "eval_runtime": 31.9522,
54
+ "eval_samples_per_second": 6.259,
55
+ "eval_steps_per_second": 0.782,
56
+ "step": 300
57
+ },
58
+ {
59
+ "epoch": 0.03,
60
+ "learning_rate": 6.165023352361183e-05,
61
+ "loss": 1.4225,
62
+ "step": 400
63
+ },
64
+ {
65
+ "epoch": 0.03,
66
+ "eval_loss": 1.1863670349121094,
67
+ "eval_runtime": 31.5148,
68
+ "eval_samples_per_second": 6.346,
69
+ "eval_steps_per_second": 0.793,
70
+ "step": 400
71
+ },
72
+ {
73
+ "epoch": 0.04,
74
+ "learning_rate": 7.721847431240269e-05,
75
+ "loss": 1.3618,
76
+ "step": 500
77
+ },
78
+ {
79
+ "epoch": 0.04,
80
+ "eval_loss": 1.1773875951766968,
81
+ "eval_runtime": 31.4521,
82
+ "eval_samples_per_second": 6.359,
83
+ "eval_steps_per_second": 0.795,
84
+ "step": 500
85
+ },
86
+ {
87
+ "epoch": 0.05,
88
+ "learning_rate": 9.278671510119357e-05,
89
+ "loss": 1.3262,
90
+ "step": 600
91
+ },
92
+ {
93
+ "epoch": 0.05,
94
+ "eval_loss": 1.168226718902588,
95
+ "eval_runtime": 31.5121,
96
+ "eval_samples_per_second": 6.347,
97
+ "eval_steps_per_second": 0.793,
98
+ "step": 600
99
+ },
100
+ {
101
+ "epoch": 0.05,
102
+ "learning_rate": 0.00010835495588998442,
103
+ "loss": 1.2133,
104
+ "step": 700
105
+ },
106
+ {
107
+ "epoch": 0.05,
108
+ "eval_loss": 1.1481703519821167,
109
+ "eval_runtime": 31.504,
110
+ "eval_samples_per_second": 6.348,
111
+ "eval_steps_per_second": 0.794,
112
+ "step": 700
113
+ },
114
+ {
115
+ "epoch": 0.06,
116
+ "learning_rate": 0.00012392319667877528,
117
+ "loss": 1.2034,
118
+ "step": 800
119
+ },
120
+ {
121
+ "epoch": 0.06,
122
+ "eval_loss": 1.13801908493042,
123
+ "eval_runtime": 31.5183,
124
+ "eval_samples_per_second": 6.346,
125
+ "eval_steps_per_second": 0.793,
126
+ "step": 800
127
+ },
128
+ {
129
+ "epoch": 0.07,
130
+ "learning_rate": 0.00013949143746756616,
131
+ "loss": 1.2159,
132
+ "step": 900
133
+ },
134
+ {
135
+ "epoch": 0.07,
136
+ "eval_loss": 1.1303695440292358,
137
+ "eval_runtime": 31.4531,
138
+ "eval_samples_per_second": 6.359,
139
+ "eval_steps_per_second": 0.795,
140
+ "step": 900
141
+ },
142
+ {
143
+ "epoch": 0.08,
144
+ "learning_rate": 0.00015505967825635704,
145
+ "loss": 1.1612,
146
+ "step": 1000
147
+ },
148
+ {
149
+ "epoch": 0.08,
150
+ "eval_loss": 1.1280632019042969,
151
+ "eval_runtime": 31.434,
152
+ "eval_samples_per_second": 6.363,
153
+ "eval_steps_per_second": 0.795,
154
+ "step": 1000
155
+ },
156
+ {
157
+ "epoch": 0.09,
158
+ "learning_rate": 0.00017062791904514786,
159
+ "loss": 1.1549,
160
+ "step": 1100
161
+ },
162
+ {
163
+ "epoch": 0.09,
164
+ "eval_loss": 1.1211422681808472,
165
+ "eval_runtime": 31.5065,
166
+ "eval_samples_per_second": 6.348,
167
+ "eval_steps_per_second": 0.793,
168
+ "step": 1100
169
+ },
170
+ {
171
+ "epoch": 0.09,
172
+ "learning_rate": 0.00018619615983393875,
173
+ "loss": 1.2038,
174
+ "step": 1200
175
+ },
176
+ {
177
+ "epoch": 0.09,
178
+ "eval_loss": 1.126134991645813,
179
+ "eval_runtime": 31.4919,
180
+ "eval_samples_per_second": 6.351,
181
+ "eval_steps_per_second": 0.794,
182
+ "step": 1200
183
+ },
184
+ {
185
+ "epoch": 0.1,
186
+ "learning_rate": 0.00020176440062272963,
187
+ "loss": 1.1784,
188
+ "step": 1300
189
+ },
190
+ {
191
+ "epoch": 0.1,
192
+ "eval_loss": 1.1223646402359009,
193
+ "eval_runtime": 31.4108,
194
+ "eval_samples_per_second": 6.367,
195
+ "eval_steps_per_second": 0.796,
196
+ "step": 1300
197
+ },
198
+ {
199
+ "epoch": 0.11,
200
+ "learning_rate": 0.00021733264141152048,
201
+ "loss": 1.1834,
202
+ "step": 1400
203
+ },
204
+ {
205
+ "epoch": 0.11,
206
+ "eval_loss": 1.113663911819458,
207
+ "eval_runtime": 31.463,
208
+ "eval_samples_per_second": 6.357,
209
+ "eval_steps_per_second": 0.795,
210
+ "step": 1400
211
+ },
212
+ {
213
+ "epoch": 0.12,
214
+ "learning_rate": 0.00023290088220031133,
215
+ "loss": 1.1974,
216
+ "step": 1500
217
+ },
218
+ {
219
+ "epoch": 0.12,
220
+ "eval_loss": 1.1265443563461304,
221
+ "eval_runtime": 31.4893,
222
+ "eval_samples_per_second": 6.351,
223
+ "eval_steps_per_second": 0.794,
224
+ "step": 1500
225
+ },
226
+ {
227
+ "epoch": 0.12,
228
+ "learning_rate": 0.0002484691229891022,
229
+ "loss": 1.2174,
230
+ "step": 1600
231
+ },
232
+ {
233
+ "epoch": 0.12,
234
+ "eval_loss": 1.1237465143203735,
235
+ "eval_runtime": 31.5226,
236
+ "eval_samples_per_second": 6.345,
237
+ "eval_steps_per_second": 0.793,
238
+ "step": 1600
239
+ },
240
+ {
241
+ "epoch": 0.13,
242
+ "learning_rate": 0.00026403736377789307,
243
+ "loss": 1.2019,
244
+ "step": 1700
245
+ },
246
+ {
247
+ "epoch": 0.13,
248
+ "eval_loss": 1.1198475360870361,
249
+ "eval_runtime": 31.4658,
250
+ "eval_samples_per_second": 6.356,
251
+ "eval_steps_per_second": 0.795,
252
+ "step": 1700
253
+ },
254
+ {
255
+ "epoch": 0.14,
256
+ "learning_rate": 0.0002796056045666839,
257
+ "loss": 1.2114,
258
+ "step": 1800
259
+ },
260
+ {
261
+ "epoch": 0.14,
262
+ "eval_loss": 1.1201978921890259,
263
+ "eval_runtime": 31.5138,
264
+ "eval_samples_per_second": 6.346,
265
+ "eval_steps_per_second": 0.793,
266
+ "step": 1800
267
+ },
268
+ {
269
+ "epoch": 0.15,
270
+ "learning_rate": 0.00029517384535547483,
271
+ "loss": 1.1848,
272
+ "step": 1900
273
+ },
274
+ {
275
+ "epoch": 0.15,
276
+ "eval_loss": 1.1198968887329102,
277
+ "eval_runtime": 31.1932,
278
+ "eval_samples_per_second": 6.412,
279
+ "eval_steps_per_second": 0.801,
280
+ "step": 1900
281
+ },
282
+ {
283
+ "epoch": 0.16,
284
+ "learning_rate": 0.0002994345961596241,
285
+ "loss": 1.129,
286
+ "step": 2000
287
+ },
288
+ {
289
+ "epoch": 0.16,
290
+ "eval_loss": 1.110647439956665,
291
+ "eval_runtime": 31.1909,
292
+ "eval_samples_per_second": 6.412,
293
+ "eval_steps_per_second": 0.802,
294
+ "step": 2000
295
+ },
296
+ {
297
+ "epoch": 0.16,
298
+ "learning_rate": 0.00029861517030400697,
299
+ "loss": 1.1864,
300
+ "step": 2100
301
+ },
302
+ {
303
+ "epoch": 0.16,
304
+ "eval_loss": 1.1134731769561768,
305
+ "eval_runtime": 31.169,
306
+ "eval_samples_per_second": 6.417,
307
+ "eval_steps_per_second": 0.802,
308
+ "step": 2100
309
+ },
310
+ {
311
+ "epoch": 0.17,
312
+ "learning_rate": 0.0002977957444483898,
313
+ "loss": 1.1725,
314
+ "step": 2200
315
+ },
316
+ {
317
+ "epoch": 0.17,
318
+ "eval_loss": 1.1226890087127686,
319
+ "eval_runtime": 31.1975,
320
+ "eval_samples_per_second": 6.411,
321
+ "eval_steps_per_second": 0.801,
322
+ "step": 2200
323
+ },
324
+ {
325
+ "epoch": 0.18,
326
+ "learning_rate": 0.0002969763185927726,
327
+ "loss": 1.1363,
328
+ "step": 2300
329
+ },
330
+ {
331
+ "epoch": 0.18,
332
+ "eval_loss": 1.1255100965499878,
333
+ "eval_runtime": 31.237,
334
+ "eval_samples_per_second": 6.403,
335
+ "eval_steps_per_second": 0.8,
336
+ "step": 2300
337
+ },
338
+ {
339
+ "epoch": 0.19,
340
+ "learning_rate": 0.00029615689273715547,
341
+ "loss": 1.204,
342
+ "step": 2400
343
+ },
344
+ {
345
+ "epoch": 0.19,
346
+ "eval_loss": 1.1148998737335205,
347
+ "eval_runtime": 31.2369,
348
+ "eval_samples_per_second": 6.403,
349
+ "eval_steps_per_second": 0.8,
350
+ "step": 2400
351
+ },
352
+ {
353
+ "epoch": 0.19,
354
+ "learning_rate": 0.0002953374668815383,
355
+ "loss": 1.1465,
356
+ "step": 2500
357
+ },
358
+ {
359
+ "epoch": 0.19,
360
+ "eval_loss": 1.1103030443191528,
361
+ "eval_runtime": 31.2141,
362
+ "eval_samples_per_second": 6.407,
363
+ "eval_steps_per_second": 0.801,
364
+ "step": 2500
365
+ },
366
+ {
367
+ "epoch": 0.2,
368
+ "learning_rate": 0.0002945180410259212,
369
+ "loss": 1.2058,
370
+ "step": 2600
371
+ },
372
+ {
373
+ "epoch": 0.2,
374
+ "eval_loss": 1.1112116575241089,
375
+ "eval_runtime": 31.6225,
376
+ "eval_samples_per_second": 6.325,
377
+ "eval_steps_per_second": 0.791,
378
+ "step": 2600
379
+ },
380
+ {
381
+ "epoch": 0.21,
382
+ "learning_rate": 0.00029369861517030397,
383
+ "loss": 1.1662,
384
+ "step": 2700
385
+ },
386
+ {
387
+ "epoch": 0.21,
388
+ "eval_loss": 1.1122115850448608,
389
+ "eval_runtime": 31.1829,
390
+ "eval_samples_per_second": 6.414,
391
+ "eval_steps_per_second": 0.802,
392
+ "step": 2700
393
+ },
394
+ {
395
+ "epoch": 0.22,
396
+ "learning_rate": 0.0002928791893146868,
397
+ "loss": 1.1372,
398
+ "step": 2800
399
+ },
400
+ {
401
+ "epoch": 0.22,
402
+ "eval_loss": 1.1113629341125488,
403
+ "eval_runtime": 31.6588,
404
+ "eval_samples_per_second": 6.317,
405
+ "eval_steps_per_second": 0.79,
406
+ "step": 2800
407
+ },
408
+ {
409
+ "epoch": 0.23,
410
+ "learning_rate": 0.0002920597634590696,
411
+ "loss": 1.1693,
412
+ "step": 2900
413
+ },
414
+ {
415
+ "epoch": 0.23,
416
+ "eval_loss": 1.1073808670043945,
417
+ "eval_runtime": 31.5886,
418
+ "eval_samples_per_second": 6.331,
419
+ "eval_steps_per_second": 0.791,
420
+ "step": 2900
421
+ },
422
+ {
423
+ "epoch": 0.23,
424
+ "learning_rate": 0.0002912403376034525,
425
+ "loss": 1.1776,
426
+ "step": 3000
427
+ },
428
+ {
429
+ "epoch": 0.23,
430
+ "eval_loss": 1.1112266778945923,
431
+ "eval_runtime": 31.3957,
432
+ "eval_samples_per_second": 6.37,
433
+ "eval_steps_per_second": 0.796,
434
+ "step": 3000
435
+ }
436
+ ],
437
+ "max_steps": 38538,
438
+ "num_train_epochs": 3,
439
+ "total_flos": 4.679774978918646e+17,
440
+ "trial_name": null,
441
+ "trial_params": null
442
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5bb6ba6f5e12c684d8d83e0b5f07bf755523076c01cd9fe3b53e7d228104ff0
3
+ size 3579