chansung committed on
Commit
4ab1fb8
1 Parent(s): 17358be

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.6643
24
 
25
  ## Model description
26
 
@@ -51,17 +51,13 @@ The following hyperparameters were used during training:
51
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
- - num_epochs: 5
55
 
56
  ### Training results
57
 
58
- | Training Loss | Epoch | Step | Validation Loss |
59
- |:-------------:|:------:|:----:|:---------------:|
60
- | 2.9056 | 0.9924 | 65 | 2.6113 |
61
- | 1.8271 | 2.0 | 131 | 1.8230 |
62
- | 1.7019 | 2.9924 | 196 | 1.7041 |
63
- | 1.7024 | 4.0 | 262 | 1.6962 |
64
- | 1.6463 | 4.9618 | 325 | 1.6643 |
65
 
66
 
67
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.6616
24
 
25
  ## Model description
26
 
 
51
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
52
  - lr_scheduler_type: cosine
53
  - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 1
55
 
56
  ### Training results
57
 
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:-----:|:----:|:---------------:|
60
+ | 1.6542 | 1.0 | 140 | 1.6616 |
 
 
 
 
61
 
62
 
63
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 4.961832061068702,
3
- "eval_loss": 1.664337158203125,
4
- "eval_runtime": 19.0212,
5
- "eval_samples": 5201,
6
- "eval_samples_per_second": 48.42,
7
- "eval_steps_per_second": 0.789,
8
- "total_flos": 9.909828121379471e+17,
9
- "train_loss": 5.476599056537335,
10
- "train_runtime": 4095.1846,
11
- "train_samples": 46801,
12
- "train_samples_per_second": 10.222,
13
- "train_steps_per_second": 0.079
14
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 4.268849030789857e+17,
4
+ "train_loss": 5.964417205538068,
5
+ "train_runtime": 1743.5737,
6
+ "train_samples": 51241,
7
+ "train_samples_per_second": 10.269,
8
+ "train_steps_per_second": 0.08
 
 
 
 
 
9
  }
runs/Nov15_21-21-31_main-lora-mistral-alpaca-0-0/events.out.tfevents.1731724272.main-lora-mistral-alpaca-0-0.456.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:509cbd7f8d7f8e46314c0198f31a922c767fdf00a6a24d9aebc80e23c745dcf3
3
- size 12677
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d66dcd64dfa383a706ccb760e61ce343eda49a6adcba7ff73cabb9e80d7c6f
3
+ size 13302
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 4.961832061068702,
3
- "total_flos": 9.909828121379471e+17,
4
- "train_loss": 5.476599056537335,
5
- "train_runtime": 4095.1846,
6
- "train_samples": 46801,
7
- "train_samples_per_second": 10.222,
8
- "train_steps_per_second": 0.079
9
  }
 
1
  {
2
+ "epoch": 1.0,
3
+ "total_flos": 4.268849030789857e+17,
4
+ "train_loss": 5.964417205538068,
5
+ "train_runtime": 1743.5737,
6
+ "train_samples": 51241,
7
+ "train_samples_per_second": 10.269,
8
+ "train_steps_per_second": 0.08
9
  }
trainer_state.json CHANGED
@@ -1,529 +1,238 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.961832061068702,
5
  "eval_steps": 500,
6
- "global_step": 325,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.015267175572519083,
13
- "grad_norm": 183.11753845214844,
14
- "learning_rate": 6.060606060606061e-06,
15
- "loss": 46.1063,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.07633587786259542,
20
- "grad_norm": 136.03738403320312,
21
- "learning_rate": 3.0303030303030306e-05,
22
- "loss": 44.0302,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.15267175572519084,
27
- "grad_norm": 69.2432632446289,
28
- "learning_rate": 6.060606060606061e-05,
29
- "loss": 38.4659,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.22900763358778625,
34
- "grad_norm": 17.486797332763672,
35
- "learning_rate": 9.090909090909092e-05,
36
- "loss": 30.3029,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.3053435114503817,
41
- "grad_norm": 13.530756950378418,
42
- "learning_rate": 0.00012121212121212122,
43
- "loss": 26.6709,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.3816793893129771,
48
- "grad_norm": 7.521498680114746,
49
- "learning_rate": 0.00015151515151515152,
50
- "loss": 24.4319,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.4580152671755725,
55
- "grad_norm": 5.912084102630615,
56
- "learning_rate": 0.00018181818181818183,
57
- "loss": 22.862,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.5343511450381679,
62
- "grad_norm": 10.610209465026855,
63
- "learning_rate": 0.00019997685019798912,
64
- "loss": 21.5999,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.6106870229007634,
69
- "grad_norm": 20.944725036621094,
70
- "learning_rate": 0.0001997165380022878,
71
- "loss": 19.4719,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.6870229007633588,
76
- "grad_norm": 34.12383270263672,
77
- "learning_rate": 0.000199167731989929,
78
- "loss": 14.6832,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.7633587786259542,
83
- "grad_norm": 42.86738204956055,
84
- "learning_rate": 0.0001983320199330545,
85
- "loss": 8.7569,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.8396946564885496,
90
- "grad_norm": 12.474686622619629,
91
- "learning_rate": 0.00019721181966290613,
92
- "loss": 4.3457,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.916030534351145,
97
- "grad_norm": 9.623456954956055,
98
- "learning_rate": 0.00019581037207470382,
99
- "loss": 3.4309,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.9923664122137404,
104
- "grad_norm": 3.5216312408447266,
105
- "learning_rate": 0.00019413173175128473,
106
- "loss": 2.9056,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.9923664122137404,
111
- "eval_loss": 2.611328125,
112
- "eval_runtime": 19.2134,
113
- "eval_samples_per_second": 47.935,
114
- "eval_steps_per_second": 0.781,
115
- "step": 65
116
- },
117
- {
118
- "epoch": 1.0687022900763359,
119
- "grad_norm": 2.9582359790802,
120
- "learning_rate": 0.00019218075523263104,
121
- "loss": 2.7809,
122
  "step": 70
123
  },
124
  {
125
- "epoch": 1.1450381679389312,
126
- "grad_norm": 2.319239616394043,
127
- "learning_rate": 0.00018996308696522433,
128
- "loss": 2.3224,
129
  "step": 75
130
  },
131
  {
132
- "epoch": 1.2213740458015268,
133
- "grad_norm": 1.3839267492294312,
134
- "learning_rate": 0.00018748514297187648,
135
- "loss": 2.2039,
136
  "step": 80
137
  },
138
  {
139
- "epoch": 1.297709923664122,
140
- "grad_norm": 0.5840837955474854,
141
- "learning_rate": 0.00018475409228928312,
142
- "loss": 2.1174,
143
  "step": 85
144
  },
145
  {
146
- "epoch": 1.3740458015267176,
147
- "grad_norm": 1.5493711233139038,
148
- "learning_rate": 0.00018177783622700327,
149
- "loss": 2.0565,
150
  "step": 90
151
  },
152
  {
153
- "epoch": 1.450381679389313,
154
- "grad_norm": 0.7415986657142639,
155
- "learning_rate": 0.00017856498550787144,
156
- "loss": 2.003,
157
  "step": 95
158
  },
159
  {
160
- "epoch": 1.5267175572519083,
161
- "grad_norm": 0.6342356204986572,
162
- "learning_rate": 0.00017512483535597867,
163
- "loss": 1.9686,
164
  "step": 100
165
  },
166
  {
167
- "epoch": 1.6030534351145038,
168
- "grad_norm": 1.0893248319625854,
169
- "learning_rate": 0.00017146733860429612,
170
- "loss": 1.9499,
171
  "step": 105
172
  },
173
  {
174
- "epoch": 1.6793893129770994,
175
- "grad_norm": 1.233128547668457,
176
- "learning_rate": 0.0001676030768997445,
177
- "loss": 1.9192,
178
  "step": 110
179
  },
180
  {
181
- "epoch": 1.7557251908396947,
182
- "grad_norm": 0.7829602360725403,
183
- "learning_rate": 0.00016354323008901776,
184
- "loss": 1.8934,
185
  "step": 115
186
  },
187
  {
188
- "epoch": 1.83206106870229,
189
- "grad_norm": 1.0393383502960205,
190
- "learning_rate": 0.00015929954387373103,
191
- "loss": 1.8579,
192
  "step": 120
193
  },
194
  {
195
- "epoch": 1.9083969465648853,
196
- "grad_norm": 2.433302879333496,
197
- "learning_rate": 0.00015488429582847192,
198
- "loss": 1.8576,
199
  "step": 125
200
  },
201
  {
202
- "epoch": 1.984732824427481,
203
- "grad_norm": 1.2537367343902588,
204
- "learning_rate": 0.00015031025988006936,
205
- "loss": 1.8271,
206
  "step": 130
207
  },
208
  {
209
- "epoch": 2.0,
210
- "eval_loss": 1.8229883909225464,
211
- "eval_runtime": 19.0953,
212
- "eval_samples_per_second": 48.232,
213
- "eval_steps_per_second": 0.786,
214
- "step": 131
215
- },
216
- {
217
- "epoch": 2.0610687022900764,
218
- "grad_norm": 1.04417085647583,
219
- "learning_rate": 0.00014559066935084588,
220
- "loss": 1.975,
221
  "step": 135
222
  },
223
  {
224
- "epoch": 2.1374045801526718,
225
- "grad_norm": 0.9754623174667358,
226
- "learning_rate": 0.00014073917867277557,
227
- "loss": 1.7901,
228
- "step": 140
229
- },
230
- {
231
- "epoch": 2.213740458015267,
232
- "grad_norm": 0.6031882762908936,
233
- "learning_rate": 0.0001357698238833126,
234
- "loss": 1.7584,
235
- "step": 145
236
- },
237
- {
238
- "epoch": 2.2900763358778624,
239
- "grad_norm": 1.7654844522476196,
240
- "learning_rate": 0.000130696982017182,
241
- "loss": 1.7665,
242
- "step": 150
243
- },
244
- {
245
- "epoch": 2.366412213740458,
246
- "grad_norm": 1.8184305429458618,
247
- "learning_rate": 0.0001255353295116187,
248
- "loss": 1.7496,
249
- "step": 155
250
- },
251
- {
252
- "epoch": 2.4427480916030535,
253
- "grad_norm": 2.4291305541992188,
254
- "learning_rate": 0.00012029979974539234,
255
- "loss": 1.7389,
256
- "step": 160
257
- },
258
- {
259
- "epoch": 2.519083969465649,
260
- "grad_norm": 0.7844381928443909,
261
- "learning_rate": 0.00011500553983446527,
262
- "loss": 1.7327,
263
- "step": 165
264
- },
265
- {
266
- "epoch": 2.595419847328244,
267
- "grad_norm": 1.0221455097198486,
268
- "learning_rate": 0.00010966786680927874,
269
- "loss": 1.7365,
270
- "step": 170
271
- },
272
- {
273
- "epoch": 2.67175572519084,
274
- "grad_norm": 1.1956524848937988,
275
- "learning_rate": 0.00010430222330045304,
276
- "loss": 1.7204,
277
- "step": 175
278
- },
279
- {
280
- "epoch": 2.7480916030534353,
281
- "grad_norm": 0.7325518131256104,
282
- "learning_rate": 9.892413286110886e-05,
283
- "loss": 1.7177,
284
- "step": 180
285
- },
286
- {
287
- "epoch": 2.8244274809160306,
288
- "grad_norm": 0.8538561463356018,
289
- "learning_rate": 9.354915505506839e-05,
290
- "loss": 1.7193,
291
- "step": 185
292
- },
293
- {
294
- "epoch": 2.900763358778626,
295
- "grad_norm": 1.252325415611267,
296
- "learning_rate": 8.81928404408726e-05,
297
- "loss": 1.7058,
298
- "step": 190
299
- },
300
- {
301
- "epoch": 2.9770992366412212,
302
- "grad_norm": 0.7734937071800232,
303
- "learning_rate": 8.287068558185225e-05,
304
- "loss": 1.7019,
305
- "step": 195
306
- },
307
- {
308
- "epoch": 2.9923664122137406,
309
- "eval_loss": 1.7041354179382324,
310
- "eval_runtime": 19.3108,
311
- "eval_samples_per_second": 47.694,
312
- "eval_steps_per_second": 0.777,
313
- "step": 196
314
- },
315
- {
316
- "epoch": 3.053435114503817,
317
- "grad_norm": 0.6631619334220886,
318
- "learning_rate": 7.759808821241406e-05,
319
- "loss": 1.8697,
320
- "step": 200
321
- },
322
- {
323
- "epoch": 3.1297709923664123,
324
- "grad_norm": 0.7187236547470093,
325
- "learning_rate": 7.239030269025311e-05,
326
- "loss": 1.7181,
327
- "step": 205
328
- },
329
- {
330
- "epoch": 3.2061068702290076,
331
- "grad_norm": 0.5320985913276672,
332
- "learning_rate": 6.726239586337408e-05,
333
- "loss": 1.7351,
334
- "step": 210
335
- },
336
- {
337
- "epoch": 3.282442748091603,
338
- "grad_norm": 0.43638336658477783,
339
- "learning_rate": 6.22292034796035e-05,
340
- "loss": 1.7156,
341
- "step": 215
342
- },
343
- {
344
- "epoch": 3.3587786259541983,
345
- "grad_norm": 0.3966742753982544,
346
- "learning_rate": 5.730528726470792e-05,
347
- "loss": 1.7158,
348
- "step": 220
349
- },
350
- {
351
- "epoch": 3.435114503816794,
352
- "grad_norm": 0.326159805059433,
353
- "learning_rate": 5.2504892793295e-05,
354
- "loss": 1.7055,
355
- "step": 225
356
- },
357
- {
358
- "epoch": 3.5114503816793894,
359
- "grad_norm": 0.4766685664653778,
360
- "learning_rate": 4.7841908274384616e-05,
361
- "loss": 1.7006,
362
- "step": 230
363
- },
364
- {
365
- "epoch": 3.5877862595419847,
366
- "grad_norm": 0.41363418102264404,
367
- "learning_rate": 4.332982437088825e-05,
368
- "loss": 1.7106,
369
- "step": 235
370
- },
371
- {
372
- "epoch": 3.66412213740458,
373
- "grad_norm": 0.5006980299949646,
374
- "learning_rate": 3.898169516924398e-05,
375
- "loss": 1.6938,
376
- "step": 240
377
- },
378
- {
379
- "epoch": 3.7404580152671754,
380
- "grad_norm": 0.4720315933227539,
381
- "learning_rate": 3.4810100412128747e-05,
382
- "loss": 1.6886,
383
- "step": 245
384
- },
385
- {
386
- "epoch": 3.816793893129771,
387
- "grad_norm": 0.5057269334793091,
388
- "learning_rate": 3.0827109103512643e-05,
389
- "loss": 1.6912,
390
- "step": 250
391
- },
392
- {
393
- "epoch": 3.8931297709923665,
394
- "grad_norm": 0.38378995656967163,
395
- "learning_rate": 2.7044244591351232e-05,
396
- "loss": 1.7001,
397
- "step": 255
398
- },
399
- {
400
- "epoch": 3.969465648854962,
401
- "grad_norm": 0.3008043169975281,
402
- "learning_rate": 2.3472451228937253e-05,
403
- "loss": 1.7024,
404
- "step": 260
405
- },
406
- {
407
- "epoch": 4.0,
408
- "eval_loss": 1.6962379217147827,
409
- "eval_runtime": 18.9852,
410
- "eval_samples_per_second": 48.512,
411
- "eval_steps_per_second": 0.79,
412
- "step": 262
413
- },
414
- {
415
- "epoch": 4.0458015267175576,
416
- "grad_norm": 0.9348434805870056,
417
- "learning_rate": 2.0122062711363532e-05,
418
- "loss": 1.8574,
419
- "step": 265
420
- },
421
- {
422
- "epoch": 4.122137404580153,
423
- "grad_norm": 0.7455368638038635,
424
- "learning_rate": 1.7002772178705716e-05,
425
- "loss": 1.6594,
426
- "step": 270
427
- },
428
- {
429
- "epoch": 4.198473282442748,
430
- "grad_norm": 0.5774383544921875,
431
- "learning_rate": 1.4123604172419713e-05,
432
- "loss": 1.6527,
433
- "step": 275
434
- },
435
- {
436
- "epoch": 4.2748091603053435,
437
- "grad_norm": 0.5370898842811584,
438
- "learning_rate": 1.149288852608743e-05,
439
- "loss": 1.6587,
440
- "step": 280
441
- },
442
- {
443
- "epoch": 4.351145038167939,
444
- "grad_norm": 0.7321135997772217,
445
- "learning_rate": 9.118236266049707e-06,
446
- "loss": 1.6676,
447
- "step": 285
448
- },
449
- {
450
- "epoch": 4.427480916030534,
451
- "grad_norm": 0.5155964493751526,
452
- "learning_rate": 7.0065175916482095e-06,
453
- "loss": 1.6579,
454
- "step": 290
455
- },
456
- {
457
- "epoch": 4.5038167938931295,
458
- "grad_norm": 0.6737932562828064,
459
- "learning_rate": 5.163841998782837e-06,
460
- "loss": 1.6508,
461
- "step": 295
462
- },
463
- {
464
- "epoch": 4.580152671755725,
465
- "grad_norm": 0.9017395377159119,
466
- "learning_rate": 3.595540604290437e-06,
467
- "loss": 1.6375,
468
- "step": 300
469
- },
470
- {
471
- "epoch": 4.65648854961832,
472
- "grad_norm": 0.5460083484649658,
473
- "learning_rate": 2.30615072228183e-06,
474
- "loss": 1.6522,
475
- "step": 305
476
- },
477
- {
478
- "epoch": 4.732824427480916,
479
- "grad_norm": 0.5443113446235657,
480
- "learning_rate": 1.2994027370611173e-06,
481
- "loss": 1.648,
482
- "step": 310
483
- },
484
- {
485
- "epoch": 4.809160305343512,
486
- "grad_norm": 0.6177972555160522,
487
- "learning_rate": 5.782093106048159e-07,
488
- "loss": 1.6559,
489
- "step": 315
490
- },
491
- {
492
- "epoch": 4.885496183206107,
493
- "grad_norm": 0.4734289050102234,
494
- "learning_rate": 1.446569558255395e-07,
495
- "loss": 1.6443,
496
- "step": 320
497
- },
498
- {
499
- "epoch": 4.961832061068702,
500
- "grad_norm": 0.6619871854782104,
501
  "learning_rate": 0.0,
502
- "loss": 1.6463,
503
- "step": 325
504
  },
505
  {
506
- "epoch": 4.961832061068702,
507
- "eval_loss": 1.664337158203125,
508
- "eval_runtime": 18.9808,
509
- "eval_samples_per_second": 48.523,
510
- "eval_steps_per_second": 0.79,
511
- "step": 325
512
  },
513
  {
514
- "epoch": 4.961832061068702,
515
- "step": 325,
516
- "total_flos": 9.909828121379471e+17,
517
- "train_loss": 5.476599056537335,
518
- "train_runtime": 4095.1846,
519
- "train_samples_per_second": 10.222,
520
- "train_steps_per_second": 0.079
521
  }
522
  ],
523
  "logging_steps": 5,
524
- "max_steps": 325,
525
  "num_input_tokens_seen": 0,
526
- "num_train_epochs": 5,
527
  "save_steps": 100,
528
  "stateful_callbacks": {
529
  "TrainerControl": {
@@ -537,7 +246,7 @@
537
  "attributes": {}
538
  }
539
  },
540
- "total_flos": 9.909828121379471e+17,
541
  "train_batch_size": 8,
542
  "trial_name": null,
543
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 140,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.007142857142857143,
13
+ "grad_norm": 106.10796356201172,
14
+ "learning_rate": 1.4285714285714285e-05,
15
+ "loss": 27.4831,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.03571428571428571,
20
+ "grad_norm": 46.88632583618164,
21
+ "learning_rate": 7.142857142857143e-05,
22
+ "loss": 26.3105,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.07142857142857142,
27
+ "grad_norm": 16.438461303710938,
28
+ "learning_rate": 0.00014285714285714287,
29
+ "loss": 20.4923,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.10714285714285714,
34
+ "grad_norm": 9.703349113464355,
35
+ "learning_rate": 0.00019996891820008164,
36
+ "loss": 16.0475,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.14285714285714285,
41
+ "grad_norm": 3.3252718448638916,
42
+ "learning_rate": 0.00019888308262251285,
43
+ "loss": 13.4483,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.17857142857142858,
48
+ "grad_norm": 3.251009941101074,
49
+ "learning_rate": 0.0001962624246950012,
50
+ "loss": 12.6172,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.21428571428571427,
55
+ "grad_norm": 5.38721227645874,
56
+ "learning_rate": 0.00019214762118704076,
57
+ "loss": 11.9827,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.25,
62
+ "grad_norm": 10.532784461975098,
63
+ "learning_rate": 0.00018660254037844388,
64
+ "loss": 11.0145,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.2857142857142857,
69
+ "grad_norm": 14.15513801574707,
70
+ "learning_rate": 0.00017971325072229226,
71
+ "loss": 9.3353,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.32142857142857145,
76
+ "grad_norm": 21.529788970947266,
77
+ "learning_rate": 0.00017158668492597186,
78
+ "loss": 7.229,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.35714285714285715,
83
+ "grad_norm": 15.934834480285645,
84
+ "learning_rate": 0.00016234898018587337,
85
+ "loss": 4.7381,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.39285714285714285,
90
+ "grad_norm": 5.8390045166015625,
91
+ "learning_rate": 0.0001521435203379498,
92
+ "loss": 2.8963,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.42857142857142855,
97
+ "grad_norm": 4.718578815460205,
98
+ "learning_rate": 0.00014112871031306119,
99
+ "loss": 2.4009,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.4642857142857143,
104
+ "grad_norm": 3.6623828411102295,
105
+ "learning_rate": 0.00012947551744109043,
106
+ "loss": 2.1721,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.5,
111
+ "grad_norm": 2.086202383041382,
112
+ "learning_rate": 0.00011736481776669306,
113
+ "loss": 2.0155,
 
 
 
 
 
 
 
 
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.5357142857142857,
118
+ "grad_norm": 1.1254757642745972,
119
+ "learning_rate": 0.00010498458856606972,
120
+ "loss": 1.9116,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.5714285714285714,
125
+ "grad_norm": 1.9386184215545654,
126
+ "learning_rate": 9.252699064135758e-05,
127
+ "loss": 1.8129,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.6071428571428571,
132
+ "grad_norm": 1.5302088260650635,
133
+ "learning_rate": 8.018538568006027e-05,
134
+ "loss": 1.7831,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.6428571428571429,
139
+ "grad_norm": 1.059885859489441,
140
+ "learning_rate": 6.815133497483157e-05,
141
+ "loss": 1.7466,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.6785714285714286,
146
+ "grad_norm": 1.2457741498947144,
147
+ "learning_rate": 5.6611626088244194e-05,
148
+ "loss": 1.7392,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.7142857142857143,
153
+ "grad_norm": 1.7660045623779297,
154
+ "learning_rate": 4.574537361342407e-05,
155
+ "loss": 1.6987,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.75,
160
+ "grad_norm": 0.7355481386184692,
161
+ "learning_rate": 3.5721239031346066e-05,
162
+ "loss": 1.6811,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.7857142857142857,
167
+ "grad_norm": 0.7224046587944031,
168
+ "learning_rate": 2.669481281701739e-05,
169
+ "loss": 1.7008,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.8214285714285714,
174
+ "grad_norm": 0.7918136715888977,
175
+ "learning_rate": 1.880619942841435e-05,
176
+ "loss": 1.6759,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.8571428571428571,
181
+ "grad_norm": 1.5361932516098022,
182
+ "learning_rate": 1.2177842662977135e-05,
183
+ "loss": 1.6748,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.8928571428571429,
188
+ "grad_norm": 1.0850142240524292,
189
+ "learning_rate": 6.9126251355795864e-06,
190
+ "loss": 1.6777,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.9285714285714286,
195
+ "grad_norm": 0.7962830662727356,
196
+ "learning_rate": 3.092271377092215e-06,
197
+ "loss": 1.6705,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.9642857142857143,
202
+ "grad_norm": 0.7234132885932922,
203
+ "learning_rate": 7.760793399827937e-07,
204
+ "loss": 1.6417,
 
 
 
 
 
 
 
 
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 1.0,
209
+ "grad_norm": 0.7538830637931824,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  "learning_rate": 0.0,
211
+ "loss": 1.6542,
212
+ "step": 140
213
  },
214
  {
215
+ "epoch": 1.0,
216
+ "eval_loss": 1.6615785360336304,
217
+ "eval_runtime": 3.8187,
218
+ "eval_samples_per_second": 46.613,
219
+ "eval_steps_per_second": 0.786,
220
+ "step": 140
221
  },
222
  {
223
+ "epoch": 1.0,
224
+ "step": 140,
225
+ "total_flos": 4.268849030789857e+17,
226
+ "train_loss": 5.964417205538068,
227
+ "train_runtime": 1743.5737,
228
+ "train_samples_per_second": 10.269,
229
+ "train_steps_per_second": 0.08
230
  }
231
  ],
232
  "logging_steps": 5,
233
+ "max_steps": 140,
234
  "num_input_tokens_seen": 0,
235
+ "num_train_epochs": 1,
236
  "save_steps": 100,
237
  "stateful_callbacks": {
238
  "TrainerControl": {
 
246
  "attributes": {}
247
  }
248
  },
249
+ "total_flos": 4.268849030789857e+17,
250
  "train_batch_size": 8,
251
  "trial_name": null,
252
  "trial_params": null