RoyJoy commited on
Commit
5fcf671
1 Parent(s): 6769a67

Training in progress, step 25, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "down_proj",
24
- "o_proj",
25
  "gate_proj",
26
- "k_proj",
27
  "v_proj",
28
- "up_proj",
29
- "q_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "up_proj",
 
24
  "gate_proj",
25
+ "o_proj",
26
  "v_proj",
27
+ "k_proj",
28
+ "q_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e07b5035ffb180931a60c0f114ef7216a354c482f95cd25ca93960a8ebf1c535
3
  size 323014168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71533fe03d6d5298967c52fe8a3adcd9f58f9bfde5ea1ca92fded50af052345
3
  size 323014168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3b5f5bb59cebe737dc3be6e482be6849e6e1825609a4a24df7ab1326e900710
3
  size 646253418
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97709bd8d8c4f3d5fbc7ea6c9b936a9128092e61fc480fb684fc8b8aa374bc4a
3
  size 646253418
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af25ef66e3578b088787098691ff18f3d160f468c01678789a770159b408e9ff
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6d31cc46b3fdac785b4e257da9a155cd9ffc56f4117a3380bad48dd22d6e21
3
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3cb1a771d842cefd502b9e7b043d798857a051d7ea04a1aca476720f6bce1e4
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d95845af24bb237001e703b7e7c7500ff206982797e8188e4d4583e54688da29
3
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c5fc06ca6e201f6cf92a9a131de3c7777a41831442b3ee77f257c6e125e5136
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a9523f33dac34eaa51608e01e3a143afa57e5b297a1d3e88c82c8eac8e33a19
3
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce1df0fc9a8b7b175429da9e9454ce253fb484f5186e19d58b68bb4f04eaed68
3
  size 15024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37851911ccb33a0de34736e35c192d969c66ca74673f9cfdb04e6055cfc12252
3
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:051dee7dfbeecb34b46e8409ffafec324501f465585234624669bc8c9e863ae4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:107607e336ff02e6e3fc135f6b55fc089901c7172808564e7046a8e836a537c9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "best_metric": 0.7247900366783142,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
- "epoch": 0.03545392098207361,
5
  "eval_steps": 25,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0003545392098207361,
13
- "grad_norm": 4.592109680175781,
14
  "learning_rate": 2e-05,
15
  "loss": 5.0876,
16
  "step": 1
@@ -18,735 +18,186 @@
18
  {
19
  "epoch": 0.0003545392098207361,
20
  "eval_loss": 6.216423511505127,
21
- "eval_runtime": 1.4239,
22
- "eval_samples_per_second": 35.114,
23
- "eval_steps_per_second": 9.13,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0007090784196414722,
28
- "grad_norm": 4.908637523651123,
29
  "learning_rate": 4e-05,
30
  "loss": 4.9835,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0010636176294622083,
35
- "grad_norm": 5.108648777008057,
36
  "learning_rate": 6e-05,
37
- "loss": 5.0016,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0014181568392829444,
42
- "grad_norm": 5.53975248336792,
43
  "learning_rate": 8e-05,
44
- "loss": 4.8283,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.0017726960491036806,
49
- "grad_norm": 5.778749465942383,
50
  "learning_rate": 0.0001,
51
- "loss": 3.61,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.0021272352589244165,
56
- "grad_norm": 5.146139144897461,
57
  "learning_rate": 9.997539658034168e-05,
58
- "loss": 2.5626,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.002481774468745153,
63
- "grad_norm": 3.335517644882202,
64
  "learning_rate": 9.990161322484486e-05,
65
- "loss": 1.6114,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.002836313678565889,
70
- "grad_norm": 1.6765196323394775,
71
  "learning_rate": 9.977873061452552e-05,
72
- "loss": 1.2627,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.003190852888386625,
77
- "grad_norm": 1.5829001665115356,
78
  "learning_rate": 9.96068831197139e-05,
79
- "loss": 1.0987,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.003545392098207361,
84
- "grad_norm": 2.0118753910064697,
85
  "learning_rate": 9.938625865312251e-05,
86
- "loss": 0.8517,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.003899931308028097,
91
- "grad_norm": 2.000054121017456,
92
  "learning_rate": 9.911709846436641e-05,
93
- "loss": 0.7161,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.004254470517848833,
98
- "grad_norm": 1.3442065715789795,
99
  "learning_rate": 9.879969687616027e-05,
100
- "loss": 0.7239,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.004609009727669569,
105
- "grad_norm": 1.3330413103103638,
106
  "learning_rate": 9.84344009624807e-05,
107
- "loss": 0.8929,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.004963548937490306,
112
- "grad_norm": 1.584913969039917,
113
  "learning_rate": 9.80216101690461e-05,
114
- "loss": 1.193,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.005318088147311041,
119
- "grad_norm": 0.9643686413764954,
120
  "learning_rate": 9.756177587652856e-05,
121
- "loss": 1.0059,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.005672627357131778,
126
- "grad_norm": 1.0433870553970337,
127
  "learning_rate": 9.705540090697575e-05,
128
  "loss": 1.1119,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.006027166566952514,
133
- "grad_norm": 1.0804919004440308,
134
  "learning_rate": 9.650303897398232e-05,
135
- "loss": 0.8664,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.00638170577677325,
140
- "grad_norm": 0.9156429767608643,
141
  "learning_rate": 9.590529407721231e-05,
142
- "loss": 0.8146,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.006736244986593986,
147
- "grad_norm": 0.7510998845100403,
148
  "learning_rate": 9.526281984193436e-05,
149
- "loss": 0.7862,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.007090784196414722,
154
- "grad_norm": 0.8360267877578735,
155
  "learning_rate": 9.4576318804292e-05,
156
- "loss": 0.7927,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.007445323406235459,
161
- "grad_norm": 0.7520768046379089,
162
  "learning_rate": 9.384654164309083e-05,
163
  "loss": 0.6681,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.007799862616056194,
168
- "grad_norm": 0.5856723189353943,
169
  "learning_rate": 9.30742863589421e-05,
170
- "loss": 0.6741,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.008154401825876931,
175
- "grad_norm": 0.716020405292511,
176
  "learning_rate": 9.226039740166091e-05,
177
- "loss": 0.6866,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.008508941035697666,
182
- "grad_norm": 0.6712137460708618,
183
  "learning_rate": 9.140576474687264e-05,
184
- "loss": 0.4932,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.008863480245518402,
189
- "grad_norm": 0.7443585395812988,
190
  "learning_rate": 9.051132292283771e-05,
191
- "loss": 0.4627,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.008863480245518402,
196
- "eval_loss": 0.7844333052635193,
197
- "eval_runtime": 1.4306,
198
- "eval_samples_per_second": 34.95,
199
- "eval_steps_per_second": 9.087,
200
  "step": 25
201
- },
202
- {
203
- "epoch": 0.009218019455339139,
204
- "grad_norm": 0.6638553142547607,
205
- "learning_rate": 8.957804998855866e-05,
206
- "loss": 1.0456,
207
- "step": 26
208
- },
209
- {
210
- "epoch": 0.009572558665159875,
211
- "grad_norm": 0.8564605116844177,
212
- "learning_rate": 8.860696646428693e-05,
213
- "loss": 1.0947,
214
- "step": 27
215
- },
216
- {
217
- "epoch": 0.009927097874980612,
218
- "grad_norm": 0.7077105641365051,
219
- "learning_rate": 8.759913421559902e-05,
220
- "loss": 1.0131,
221
- "step": 28
222
- },
223
- {
224
- "epoch": 0.010281637084801348,
225
- "grad_norm": 0.8055078387260437,
226
- "learning_rate": 8.655565529226198e-05,
227
- "loss": 0.949,
228
- "step": 29
229
- },
230
- {
231
- "epoch": 0.010636176294622083,
232
- "grad_norm": 0.846217155456543,
233
- "learning_rate": 8.547767072315835e-05,
234
- "loss": 0.7614,
235
- "step": 30
236
- },
237
- {
238
- "epoch": 0.010990715504442819,
239
- "grad_norm": 0.7835873365402222,
240
- "learning_rate": 8.436635926858759e-05,
241
- "loss": 0.7072,
242
- "step": 31
243
- },
244
- {
245
- "epoch": 0.011345254714263555,
246
- "grad_norm": 0.6457473039627075,
247
- "learning_rate": 8.322293613130917e-05,
248
- "loss": 0.6011,
249
- "step": 32
250
- },
251
- {
252
- "epoch": 0.011699793924084292,
253
- "grad_norm": 0.6326347589492798,
254
- "learning_rate": 8.204865162773613e-05,
255
- "loss": 0.7046,
256
- "step": 33
257
- },
258
- {
259
- "epoch": 0.012054333133905028,
260
- "grad_norm": 0.6413611173629761,
261
- "learning_rate": 8.084478982073247e-05,
262
- "loss": 0.5783,
263
- "step": 34
264
- },
265
- {
266
- "epoch": 0.012408872343725764,
267
- "grad_norm": 0.7243404388427734,
268
- "learning_rate": 7.961266711550922e-05,
269
- "loss": 0.4478,
270
- "step": 35
271
- },
272
- {
273
- "epoch": 0.0127634115535465,
274
- "grad_norm": 0.565170168876648,
275
- "learning_rate": 7.835363082015468e-05,
276
- "loss": 0.4311,
277
- "step": 36
278
- },
279
- {
280
- "epoch": 0.013117950763367235,
281
- "grad_norm": 0.6162042021751404,
282
- "learning_rate": 7.706905767237288e-05,
283
- "loss": 0.3916,
284
- "step": 37
285
- },
286
- {
287
- "epoch": 0.013472489973187972,
288
- "grad_norm": 0.7090334892272949,
289
- "learning_rate": 7.576035233404096e-05,
290
- "loss": 0.7694,
291
- "step": 38
292
- },
293
- {
294
- "epoch": 0.013827029183008708,
295
- "grad_norm": 0.7033197283744812,
296
- "learning_rate": 7.442894585523218e-05,
297
- "loss": 0.9693,
298
- "step": 39
299
- },
300
- {
301
- "epoch": 0.014181568392829445,
302
- "grad_norm": 0.7213008999824524,
303
- "learning_rate": 7.307629410938363e-05,
304
- "loss": 0.8617,
305
- "step": 40
306
- },
307
- {
308
- "epoch": 0.014536107602650181,
309
- "grad_norm": 0.6332206726074219,
310
- "learning_rate": 7.170387620131993e-05,
311
- "loss": 0.7386,
312
- "step": 41
313
- },
314
- {
315
- "epoch": 0.014890646812470917,
316
- "grad_norm": 0.8468403220176697,
317
- "learning_rate": 7.031319284987394e-05,
318
- "loss": 0.7881,
319
- "step": 42
320
- },
321
- {
322
- "epoch": 0.015245186022291652,
323
- "grad_norm": 0.7750101089477539,
324
- "learning_rate": 6.890576474687263e-05,
325
- "loss": 0.6851,
326
- "step": 43
327
- },
328
- {
329
- "epoch": 0.015599725232112388,
330
- "grad_norm": 0.7253099679946899,
331
- "learning_rate": 6.7483130894283e-05,
332
- "loss": 0.6545,
333
- "step": 44
334
- },
335
- {
336
- "epoch": 0.015954264441933125,
337
- "grad_norm": 0.717294454574585,
338
- "learning_rate": 6.604684692133597e-05,
339
- "loss": 0.6447,
340
- "step": 45
341
- },
342
- {
343
- "epoch": 0.016308803651753863,
344
- "grad_norm": 0.6965174078941345,
345
- "learning_rate": 6.459848338346861e-05,
346
- "loss": 0.5076,
347
- "step": 46
348
- },
349
- {
350
- "epoch": 0.016663342861574598,
351
- "grad_norm": 0.7776918411254883,
352
- "learning_rate": 6.313962404494496e-05,
353
- "loss": 0.5284,
354
- "step": 47
355
- },
356
- {
357
- "epoch": 0.017017882071395332,
358
- "grad_norm": 0.9502537846565247,
359
- "learning_rate": 6.167186414703289e-05,
360
- "loss": 0.5736,
361
- "step": 48
362
- },
363
- {
364
- "epoch": 0.01737242128121607,
365
- "grad_norm": 0.8803554177284241,
366
- "learning_rate": 6.019680866363139e-05,
367
- "loss": 0.576,
368
- "step": 49
369
- },
370
- {
371
- "epoch": 0.017726960491036805,
372
- "grad_norm": 0.6308343410491943,
373
- "learning_rate": 5.8716070546254966e-05,
374
- "loss": 0.3936,
375
- "step": 50
376
- },
377
- {
378
- "epoch": 0.017726960491036805,
379
- "eval_loss": 0.7665626406669617,
380
- "eval_runtime": 1.431,
381
- "eval_samples_per_second": 34.942,
382
- "eval_steps_per_second": 9.085,
383
- "step": 50
384
- },
385
- {
386
- "epoch": 0.018081499700857543,
387
- "grad_norm": 0.7196190357208252,
388
- "learning_rate": 5.7231268960295e-05,
389
- "loss": 0.96,
390
- "step": 51
391
- },
392
- {
393
- "epoch": 0.018436038910678278,
394
- "grad_norm": 0.7429586052894592,
395
- "learning_rate": 5.574402751448614e-05,
396
- "loss": 0.9205,
397
- "step": 52
398
- },
399
- {
400
- "epoch": 0.018790578120499012,
401
- "grad_norm": 0.7085890173912048,
402
- "learning_rate": 5.425597248551387e-05,
403
- "loss": 0.7904,
404
- "step": 53
405
- },
406
- {
407
- "epoch": 0.01914511733031975,
408
- "grad_norm": 0.775666356086731,
409
- "learning_rate": 5.2768731039705e-05,
410
- "loss": 0.8087,
411
- "step": 54
412
- },
413
- {
414
- "epoch": 0.019499656540140485,
415
- "grad_norm": 0.8202411532402039,
416
- "learning_rate": 5.128392945374505e-05,
417
- "loss": 0.7787,
418
- "step": 55
419
- },
420
- {
421
- "epoch": 0.019854195749961223,
422
- "grad_norm": 0.7467792630195618,
423
- "learning_rate": 4.980319133636863e-05,
424
- "loss": 0.672,
425
- "step": 56
426
- },
427
- {
428
- "epoch": 0.020208734959781958,
429
- "grad_norm": 0.756299614906311,
430
- "learning_rate": 4.83281358529671e-05,
431
- "loss": 0.6991,
432
- "step": 57
433
- },
434
- {
435
- "epoch": 0.020563274169602696,
436
- "grad_norm": 0.7745821475982666,
437
- "learning_rate": 4.686037595505507e-05,
438
- "loss": 0.6683,
439
- "step": 58
440
- },
441
- {
442
- "epoch": 0.02091781337942343,
443
- "grad_norm": 0.6686888933181763,
444
- "learning_rate": 4.54015166165314e-05,
445
- "loss": 0.5444,
446
- "step": 59
447
- },
448
- {
449
- "epoch": 0.021272352589244165,
450
- "grad_norm": 0.6725859045982361,
451
- "learning_rate": 4.395315307866405e-05,
452
- "loss": 0.468,
453
- "step": 60
454
- },
455
- {
456
- "epoch": 0.021626891799064903,
457
- "grad_norm": 0.6770166754722595,
458
- "learning_rate": 4.2516869105717004e-05,
459
- "loss": 0.4267,
460
- "step": 61
461
- },
462
- {
463
- "epoch": 0.021981431008885638,
464
- "grad_norm": 0.8719369769096375,
465
- "learning_rate": 4.109423525312738e-05,
466
- "loss": 0.4629,
467
- "step": 62
468
- },
469
- {
470
- "epoch": 0.022335970218706376,
471
- "grad_norm": 0.6566356420516968,
472
- "learning_rate": 3.968680715012606e-05,
473
- "loss": 0.5978,
474
- "step": 63
475
- },
476
- {
477
- "epoch": 0.02269050942852711,
478
- "grad_norm": 0.7933331727981567,
479
- "learning_rate": 3.829612379868006e-05,
480
- "loss": 1.0035,
481
- "step": 64
482
- },
483
- {
484
- "epoch": 0.02304504863834785,
485
- "grad_norm": 0.8843840956687927,
486
- "learning_rate": 3.692370589061639e-05,
487
- "loss": 0.9023,
488
- "step": 65
489
- },
490
- {
491
- "epoch": 0.023399587848168583,
492
- "grad_norm": 1.0728343725204468,
493
- "learning_rate": 3.557105414476782e-05,
494
- "loss": 0.9255,
495
- "step": 66
496
- },
497
- {
498
- "epoch": 0.023754127057989318,
499
- "grad_norm": 0.7340516448020935,
500
- "learning_rate": 3.423964766595906e-05,
501
- "loss": 0.7142,
502
- "step": 67
503
- },
504
- {
505
- "epoch": 0.024108666267810056,
506
- "grad_norm": 0.8005227446556091,
507
- "learning_rate": 3.293094232762715e-05,
508
- "loss": 0.6483,
509
- "step": 68
510
- },
511
- {
512
- "epoch": 0.02446320547763079,
513
- "grad_norm": 0.7076966762542725,
514
- "learning_rate": 3.164636917984534e-05,
515
- "loss": 0.6988,
516
- "step": 69
517
- },
518
- {
519
- "epoch": 0.02481774468745153,
520
- "grad_norm": 0.6570345163345337,
521
- "learning_rate": 3.0387332884490805e-05,
522
- "loss": 0.636,
523
- "step": 70
524
- },
525
- {
526
- "epoch": 0.025172283897272264,
527
- "grad_norm": 0.6902582049369812,
528
- "learning_rate": 2.9155210179267546e-05,
529
- "loss": 0.4957,
530
- "step": 71
531
- },
532
- {
533
- "epoch": 0.025526823107093,
534
- "grad_norm": 0.7639642953872681,
535
- "learning_rate": 2.7951348372263875e-05,
536
- "loss": 0.5997,
537
- "step": 72
538
- },
539
- {
540
- "epoch": 0.025881362316913736,
541
- "grad_norm": 0.7231664061546326,
542
- "learning_rate": 2.677706386869083e-05,
543
- "loss": 0.4678,
544
- "step": 73
545
- },
546
- {
547
- "epoch": 0.02623590152673447,
548
- "grad_norm": 0.7691690325737,
549
- "learning_rate": 2.5633640731412412e-05,
550
- "loss": 0.5281,
551
- "step": 74
552
- },
553
- {
554
- "epoch": 0.02659044073655521,
555
- "grad_norm": 0.6874270439147949,
556
- "learning_rate": 2.4522329276841663e-05,
557
- "loss": 0.307,
558
- "step": 75
559
- },
560
- {
561
- "epoch": 0.02659044073655521,
562
- "eval_loss": 0.7375490665435791,
563
- "eval_runtime": 1.43,
564
- "eval_samples_per_second": 34.966,
565
- "eval_steps_per_second": 9.091,
566
- "step": 75
567
- },
568
- {
569
- "epoch": 0.026944979946375944,
570
- "grad_norm": 0.689096212387085,
571
- "learning_rate": 2.3444344707738015e-05,
572
- "loss": 0.9376,
573
- "step": 76
574
- },
575
- {
576
- "epoch": 0.027299519156196682,
577
- "grad_norm": 0.7334891557693481,
578
- "learning_rate": 2.2400865784401e-05,
579
- "loss": 0.8911,
580
- "step": 77
581
- },
582
- {
583
- "epoch": 0.027654058366017416,
584
- "grad_norm": 0.6938356757164001,
585
- "learning_rate": 2.1393033535713093e-05,
586
- "loss": 0.7595,
587
- "step": 78
588
- },
589
- {
590
- "epoch": 0.02800859757583815,
591
- "grad_norm": 0.6203986406326294,
592
- "learning_rate": 2.0421950011441354e-05,
593
- "loss": 0.6434,
594
- "step": 79
595
- },
596
- {
597
- "epoch": 0.02836313678565889,
598
- "grad_norm": 1.0269389152526855,
599
- "learning_rate": 1.9488677077162295e-05,
600
- "loss": 0.5729,
601
- "step": 80
602
- },
603
- {
604
- "epoch": 0.028717675995479624,
605
- "grad_norm": 0.7017802596092224,
606
- "learning_rate": 1.8594235253127375e-05,
607
- "loss": 0.6952,
608
- "step": 81
609
- },
610
- {
611
- "epoch": 0.029072215205300362,
612
- "grad_norm": 0.823606550693512,
613
- "learning_rate": 1.77396025983391e-05,
614
- "loss": 0.7631,
615
- "step": 82
616
- },
617
- {
618
- "epoch": 0.029426754415121097,
619
- "grad_norm": 0.6583719253540039,
620
- "learning_rate": 1.6925713641057904e-05,
621
- "loss": 0.5065,
622
- "step": 83
623
- },
624
- {
625
- "epoch": 0.029781293624941835,
626
- "grad_norm": 0.7148926854133606,
627
- "learning_rate": 1.6153458356909176e-05,
628
- "loss": 0.6016,
629
- "step": 84
630
- },
631
- {
632
- "epoch": 0.03013583283476257,
633
- "grad_norm": 0.7244095206260681,
634
- "learning_rate": 1.5423681195707997e-05,
635
- "loss": 0.511,
636
- "step": 85
637
- },
638
- {
639
- "epoch": 0.030490372044583304,
640
- "grad_norm": 0.7808502912521362,
641
- "learning_rate": 1.4737180158065644e-05,
642
- "loss": 0.4341,
643
- "step": 86
644
- },
645
- {
646
- "epoch": 0.030844911254404042,
647
- "grad_norm": 0.6845284104347229,
648
- "learning_rate": 1.4094705922787687e-05,
649
- "loss": 0.438,
650
- "step": 87
651
- },
652
- {
653
- "epoch": 0.031199450464224777,
654
- "grad_norm": 0.7536430358886719,
655
- "learning_rate": 1.3496961026017687e-05,
656
- "loss": 0.7226,
657
- "step": 88
658
- },
659
- {
660
- "epoch": 0.03155398967404551,
661
- "grad_norm": 0.7071179747581482,
662
- "learning_rate": 1.2944599093024267e-05,
663
- "loss": 0.8611,
664
- "step": 89
665
- },
666
- {
667
- "epoch": 0.03190852888386625,
668
- "grad_norm": 0.7080587148666382,
669
- "learning_rate": 1.2438224123471442e-05,
670
- "loss": 0.7928,
671
- "step": 90
672
- },
673
- {
674
- "epoch": 0.03226306809368699,
675
- "grad_norm": 0.8562609553337097,
676
- "learning_rate": 1.1978389830953907e-05,
677
- "loss": 0.8498,
678
- "step": 91
679
- },
680
- {
681
- "epoch": 0.032617607303507726,
682
- "grad_norm": 0.7281271815299988,
683
- "learning_rate": 1.1565599037519316e-05,
684
- "loss": 0.7436,
685
- "step": 92
686
- },
687
- {
688
- "epoch": 0.03297214651332846,
689
- "grad_norm": 0.7492510676383972,
690
- "learning_rate": 1.1200303123839742e-05,
691
- "loss": 0.6069,
692
- "step": 93
693
- },
694
- {
695
- "epoch": 0.033326685723149195,
696
- "grad_norm": 0.7654992938041687,
697
- "learning_rate": 1.088290153563358e-05,
698
- "loss": 0.6479,
699
- "step": 94
700
- },
701
- {
702
- "epoch": 0.03368122493296993,
703
- "grad_norm": 0.8193288445472717,
704
- "learning_rate": 1.0613741346877497e-05,
705
- "loss": 0.6431,
706
- "step": 95
707
- },
708
- {
709
- "epoch": 0.034035764142790664,
710
- "grad_norm": 0.9601457715034485,
711
- "learning_rate": 1.0393116880286118e-05,
712
- "loss": 0.6555,
713
- "step": 96
714
- },
715
- {
716
- "epoch": 0.0343903033526114,
717
- "grad_norm": 0.820478081703186,
718
- "learning_rate": 1.0221269385474488e-05,
719
- "loss": 0.5409,
720
- "step": 97
721
- },
722
- {
723
- "epoch": 0.03474484256243214,
724
- "grad_norm": 0.8726357817649841,
725
- "learning_rate": 1.0098386775155147e-05,
726
- "loss": 0.4809,
727
- "step": 98
728
- },
729
- {
730
- "epoch": 0.03509938177225287,
731
- "grad_norm": 0.7600851058959961,
732
- "learning_rate": 1.0024603419658329e-05,
733
- "loss": 0.4056,
734
- "step": 99
735
- },
736
- {
737
- "epoch": 0.03545392098207361,
738
- "grad_norm": 0.6082343459129333,
739
- "learning_rate": 1e-05,
740
- "loss": 0.2769,
741
- "step": 100
742
- },
743
- {
744
- "epoch": 0.03545392098207361,
745
- "eval_loss": 0.7247900366783142,
746
- "eval_runtime": 1.4297,
747
- "eval_samples_per_second": 34.973,
748
- "eval_steps_per_second": 9.093,
749
- "step": 100
750
  }
751
  ],
752
  "logging_steps": 1,
@@ -770,12 +221,12 @@
770
  "should_evaluate": false,
771
  "should_log": false,
772
  "should_save": true,
773
- "should_training_stop": true
774
  },
775
  "attributes": {}
776
  }
777
  },
778
- "total_flos": 1.1249873143390536e+18,
779
  "train_batch_size": 1,
780
  "trial_name": null,
781
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7849562168121338,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-25",
4
+ "epoch": 0.008863480245518402,
5
  "eval_steps": 25,
6
+ "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0003545392098207361,
13
+ "grad_norm": 4.848531246185303,
14
  "learning_rate": 2e-05,
15
  "loss": 5.0876,
16
  "step": 1
 
18
  {
19
  "epoch": 0.0003545392098207361,
20
  "eval_loss": 6.216423511505127,
21
+ "eval_runtime": 1.4346,
22
+ "eval_samples_per_second": 34.853,
23
+ "eval_steps_per_second": 9.062,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.0007090784196414722,
28
+ "grad_norm": 5.189554691314697,
29
  "learning_rate": 4e-05,
30
  "loss": 4.9835,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.0010636176294622083,
35
+ "grad_norm": 5.395487308502197,
36
  "learning_rate": 6e-05,
37
+ "loss": 5.0002,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.0014181568392829444,
42
+ "grad_norm": 5.79182767868042,
43
  "learning_rate": 8e-05,
44
+ "loss": 4.815,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.0017726960491036806,
49
+ "grad_norm": 5.924882411956787,
50
  "learning_rate": 0.0001,
51
+ "loss": 3.5888,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.0021272352589244165,
56
+ "grad_norm": 5.22299861907959,
57
  "learning_rate": 9.997539658034168e-05,
58
+ "loss": 2.5276,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.002481774468745153,
63
+ "grad_norm": 3.220250129699707,
64
  "learning_rate": 9.990161322484486e-05,
65
+ "loss": 1.5851,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.002836313678565889,
70
+ "grad_norm": 1.6759289503097534,
71
  "learning_rate": 9.977873061452552e-05,
72
+ "loss": 1.2539,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.003190852888386625,
77
+ "grad_norm": 1.6135218143463135,
78
  "learning_rate": 9.96068831197139e-05,
79
+ "loss": 1.0919,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.003545392098207361,
84
+ "grad_norm": 1.962496042251587,
85
  "learning_rate": 9.938625865312251e-05,
86
+ "loss": 0.8399,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.003899931308028097,
91
+ "grad_norm": 2.1563808917999268,
92
  "learning_rate": 9.911709846436641e-05,
93
+ "loss": 0.7126,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.004254470517848833,
98
+ "grad_norm": 1.3578228950500488,
99
  "learning_rate": 9.879969687616027e-05,
100
+ "loss": 0.7229,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.004609009727669569,
105
+ "grad_norm": 1.3786768913269043,
106
  "learning_rate": 9.84344009624807e-05,
107
+ "loss": 0.8936,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.004963548937490306,
112
+ "grad_norm": 1.6054021120071411,
113
  "learning_rate": 9.80216101690461e-05,
114
+ "loss": 1.1911,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.005318088147311041,
119
+ "grad_norm": 0.9699863791465759,
120
  "learning_rate": 9.756177587652856e-05,
121
+ "loss": 1.0015,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.005672627357131778,
126
+ "grad_norm": 1.0598102807998657,
127
  "learning_rate": 9.705540090697575e-05,
128
  "loss": 1.1119,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.006027166566952514,
133
+ "grad_norm": 1.0990904569625854,
134
  "learning_rate": 9.650303897398232e-05,
135
+ "loss": 0.8657,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.00638170577677325,
140
+ "grad_norm": 0.9300882816314697,
141
  "learning_rate": 9.590529407721231e-05,
142
+ "loss": 0.8144,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.006736244986593986,
147
+ "grad_norm": 0.7585983276367188,
148
  "learning_rate": 9.526281984193436e-05,
149
+ "loss": 0.7869,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.007090784196414722,
154
+ "grad_norm": 0.83787602186203,
155
  "learning_rate": 9.4576318804292e-05,
156
+ "loss": 0.7916,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.007445323406235459,
161
+ "grad_norm": 0.7473427653312683,
162
  "learning_rate": 9.384654164309083e-05,
163
  "loss": 0.6681,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.007799862616056194,
168
+ "grad_norm": 0.5894711017608643,
169
  "learning_rate": 9.30742863589421e-05,
170
+ "loss": 0.675,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.008154401825876931,
175
+ "grad_norm": 0.715570867061615,
176
  "learning_rate": 9.226039740166091e-05,
177
+ "loss": 0.6853,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.008508941035697666,
182
+ "grad_norm": 0.6784505844116211,
183
  "learning_rate": 9.140576474687264e-05,
184
+ "loss": 0.4911,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.008863480245518402,
189
+ "grad_norm": 0.7639828324317932,
190
  "learning_rate": 9.051132292283771e-05,
191
+ "loss": 0.4635,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.008863480245518402,
196
+ "eval_loss": 0.7849562168121338,
197
+ "eval_runtime": 1.4348,
198
+ "eval_samples_per_second": 34.849,
199
+ "eval_steps_per_second": 9.061,
200
  "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
  ],
203
  "logging_steps": 1,
 
221
  "should_evaluate": false,
222
  "should_log": false,
223
  "should_save": true,
224
+ "should_training_stop": false
225
  },
226
  "attributes": {}
227
  }
228
  },
229
+ "total_flos": 2.812907666370724e+17,
230
  "train_batch_size": 1,
231
  "trial_name": null,
232
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e13ebf621f3ae7801a8fa1b74c6bbd62a569669ff2bc7e880f5897ba3c972d81
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05bf92cdc9776646a10f781dcc857c6462d8f9a3727f2361008d8e455efc9baa
3
  size 6840