g4rg commited on
Commit
afb3759
1 Parent(s): d7f9919

Training in progress, step 66, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "down_proj",
24
  "k_proj",
25
- "gate_proj",
26
  "o_proj",
27
- "v_proj",
28
- "up_proj",
29
- "q_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "v_proj",
24
+ "up_proj",
25
  "down_proj",
26
  "k_proj",
27
+ "q_proj",
28
  "o_proj",
29
+ "gate_proj"
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:651204470d974333e74132ca634e50c46cab4f71d2b3bef1ed0dec3eb6aba04d
3
  size 763470136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cde4c0dc915fef419c1193ae86c0d6cad089c08b2c9fd319eb8d1cfc01feab3
3
  size 763470136
last-checkpoint/global_step66/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:856881d6095b0839d3bd7514110d2cdcdc0559f6fc8cb267bc5141b3bb8fb130
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9840574e09fbed5b59e8b2d691a2786e8f5468f915d6e3b34e6dc4661ae4dae3
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3ec5485157f503118c0e48f554d1f5520735c3097bc76d41a8443b455963ffb
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eec6a976f669c5c23f4a573acee68f5dc79be13f41ecf84e66b7f38d49858897
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c5675f6c03826d269461d7b487eeb925e2dd98d3705817b40437ec232b0b7a1
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3655589db9599c3e8670860cdac9edd731baa863f80f3f88344e718ec06abac4
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f588dd44a055da72c47094f640c1b63e398913c5459f8a5ee48af60cb02399e2
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c4db552f94915bba454e5ae77fcad81ea17e80836e32cdf0ea547a32387741e
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24c8a76833729b7e642158d26cd8ecc63eb5c89c0149c1072a38619d99b3ad10
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a8a1e6366f9dfae16c9009c6f3b42a191a4af102cbf8d75bb73e1d4fe3c91ea
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53431f389a44260329ba5449b3c3cdd854a928d7cffcabb54e25b959e1ed251e
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821ae30ad41e9d9810083e01b5731bcffd77ea75d52c583dea3c1a2d1b975025
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ad56bb2d798741728bf4f2e6df097e85c333f37686dd375d9f8ef96f29a457d
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23d2c20a129743b3a6fd6428968e3af4b9fdcbaf2e8a84ee3c622133af982fee
3
  size 289064656
last-checkpoint/global_step66/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cde8be6f46a02f5fa7d6a09a30b5cc4236445fcb495af50fd87624b38a4d7b4
3
  size 289064656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fac5a84153eac1748035cfa3327dcb6140f616f9be68bb6b50dae39c6a4fae53
3
  size 289064656
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9ce871037e8d378408842390a351a4bb8856d71d37389bd1055187b26f84d4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6530e0522c975674706d8073e33fa508580e6b794aaf4f3e6111389796f319e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -10,479 +10,479 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003067484662576687,
13
- "grad_norm": 0.9516617278813834,
14
- "learning_rate": 1.25e-05,
15
  "loss": 1.9557,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.003067484662576687,
20
  "eval_loss": 2.6437082290649414,
21
- "eval_runtime": 55.5495,
22
- "eval_samples_per_second": 1.8,
23
  "eval_steps_per_second": 0.126,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.006134969325153374,
28
- "grad_norm": 0.515521728634264,
29
- "learning_rate": 2.5e-05,
30
  "loss": 1.9268,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.009202453987730062,
35
- "grad_norm": 1.0602168628533477,
36
- "learning_rate": 3.75e-05,
37
- "loss": 1.9644,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.012269938650306749,
42
- "grad_norm": 0.5232804296238467,
43
- "learning_rate": 5e-05,
44
- "loss": 1.9174,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.015337423312883436,
49
- "grad_norm": 0.6049728735982117,
50
- "learning_rate": 6.25e-05,
51
- "loss": 1.9183,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.018404907975460124,
56
- "grad_norm": 0.44617735370287787,
57
- "learning_rate": 7.5e-05,
58
- "loss": 1.9016,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.02147239263803681,
63
- "grad_norm": 0.5041842596415366,
64
- "learning_rate": 8.75e-05,
65
- "loss": 1.9706,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.024539877300613498,
70
- "grad_norm": 0.5697227180606876,
71
- "learning_rate": 0.0001,
72
- "loss": 1.9105,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.027607361963190184,
77
- "grad_norm": 0.3797683389810269,
78
- "learning_rate": 0.00011250000000000001,
79
- "loss": 1.9351,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.03067484662576687,
84
- "grad_norm": 0.3464113535012369,
85
- "learning_rate": 0.000125,
86
- "loss": 1.9347,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.03374233128834356,
91
- "grad_norm": 1.038453745480312,
92
- "learning_rate": 0.0001375,
93
- "loss": 1.9008,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03680981595092025,
98
- "grad_norm": 0.5222824963828644,
99
- "learning_rate": 0.00015,
100
- "loss": 1.9251,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.03987730061349693,
105
- "grad_norm": 0.5129473208257509,
106
- "learning_rate": 0.00016250000000000002,
107
- "loss": 1.8613,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.04294478527607362,
112
- "grad_norm": 0.7292233670769845,
113
- "learning_rate": 0.000175,
114
- "loss": 1.9507,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.046012269938650305,
119
- "grad_norm": 0.6360368446619434,
120
- "learning_rate": 0.0001875,
121
- "loss": 1.9512,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.049079754601226995,
126
- "grad_norm": 0.48214017101050627,
127
- "learning_rate": 0.0002,
128
- "loss": 1.961,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.05214723926380368,
133
- "grad_norm": 0.4394229337647846,
134
- "learning_rate": 0.0002125,
135
- "loss": 1.9704,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.05521472392638037,
140
- "grad_norm": 0.3796994442046945,
141
- "learning_rate": 0.00022500000000000002,
142
- "loss": 1.8925,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.05828220858895705,
147
- "grad_norm": 0.3188673935343497,
148
- "learning_rate": 0.0002375,
149
- "loss": 1.969,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.06134969325153374,
154
- "grad_norm": 0.9883905241335006,
155
- "learning_rate": 0.00025,
156
- "loss": 1.9734,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.06441717791411043,
161
- "grad_norm": 0.42956410678121015,
162
- "learning_rate": 0.000249994071079807,
163
- "loss": 1.9632,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.06748466257668712,
168
- "grad_norm": 0.5580696830715027,
169
- "learning_rate": 0.00024997628494415405,
170
- "loss": 1.8911,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.0705521472392638,
175
- "grad_norm": 0.4247455273508192,
176
- "learning_rate": 0.00024994664346775366,
177
- "loss": 1.9549,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.0736196319018405,
182
- "grad_norm": 0.5638089571797716,
183
- "learning_rate": 0.0002499051497749072,
184
- "loss": 1.8903,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.07668711656441718,
189
- "grad_norm": 0.3337856270380794,
190
- "learning_rate": 0.00024985180823917534,
191
- "loss": 1.9817,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.07975460122699386,
196
- "grad_norm": 0.5964071002925826,
197
- "learning_rate": 0.00024978662448291747,
198
- "loss": 2.0113,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.08282208588957055,
203
- "grad_norm": 1.3108802906417165,
204
- "learning_rate": 0.0002497096053766986,
205
- "loss": 1.9136,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.08588957055214724,
210
- "grad_norm": 1.9953168113527813,
211
- "learning_rate": 0.0002496207590385656,
212
- "loss": 2.0042,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.08895705521472393,
217
- "grad_norm": 4.7288777305801615,
218
- "learning_rate": 0.00024952009483319136,
219
- "loss": 2.0138,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.09202453987730061,
224
- "grad_norm": 0.9466987524434748,
225
- "learning_rate": 0.0002494076233708877,
226
- "loss": 2.027,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.0950920245398773,
231
- "grad_norm": 0.5335701207102423,
232
- "learning_rate": 0.000249283356506487,
233
- "loss": 1.9497,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.09815950920245399,
238
- "grad_norm": 0.4530090911036831,
239
- "learning_rate": 0.0002491473073380928,
240
- "loss": 1.8991,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.10122699386503067,
245
- "grad_norm": 0.43545874771481075,
246
- "learning_rate": 0.000248999490205699,
247
- "loss": 2.0384,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.10429447852760736,
252
- "grad_norm": 3.1727083160562874,
253
- "learning_rate": 0.00024883992068967873,
254
- "loss": 1.9743,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.10736196319018405,
259
- "grad_norm": 5.481030996815809,
260
- "learning_rate": 0.0002486686156091417,
261
- "loss": 2.0054,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.11042944785276074,
266
- "grad_norm": 9.756683051815624,
267
- "learning_rate": 0.0002484855930201617,
268
- "loss": 1.9805,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.11349693251533742,
273
- "grad_norm": 0.5694221348977583,
274
- "learning_rate": 0.0002482908722138734,
275
- "loss": 1.9495,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.1165644171779141,
280
- "grad_norm": 0.4781718005749317,
281
- "learning_rate": 0.00024808447371443896,
282
- "loss": 2.0154,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.1196319018404908,
287
- "grad_norm": 2.070517297643313,
288
- "learning_rate": 0.00024786641927688466,
289
- "loss": 1.9294,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.12269938650306748,
294
- "grad_norm": 0.4269552409103539,
295
- "learning_rate": 0.000247636731884808,
296
- "loss": 1.9768,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.12576687116564417,
301
- "grad_norm": 0.39633691656297887,
302
- "learning_rate": 0.0002473954357479551,
303
- "loss": 1.9978,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.12883435582822086,
308
- "grad_norm": 0.5628682021190763,
309
- "learning_rate": 0.0002471425562996688,
310
- "loss": 1.9877,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.13190184049079753,
315
- "grad_norm": 0.4235604267637786,
316
- "learning_rate": 0.00024687812019420806,
317
- "loss": 1.9601,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.13496932515337423,
322
- "grad_norm": 1.955262598542115,
323
- "learning_rate": 0.0002466021553039386,
324
- "loss": 1.9665,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.13803680981595093,
329
- "grad_norm": 0.5343967332691423,
330
- "learning_rate": 0.0002463146907163947,
331
- "loss": 1.9132,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.1411042944785276,
336
- "grad_norm": 0.35886735373161066,
337
- "learning_rate": 0.0002460157567312137,
338
- "loss": 1.9353,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.1441717791411043,
343
- "grad_norm": 0.49035062436723287,
344
- "learning_rate": 0.00024570538485694214,
345
- "loss": 1.9721,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.147239263803681,
350
- "grad_norm": 0.3404214165006091,
351
- "learning_rate": 0.00024538360780771465,
352
- "loss": 1.9382,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.15030674846625766,
357
- "grad_norm": 0.5345047082277987,
358
- "learning_rate": 0.00024505045949980574,
359
- "loss": 1.9566,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.15337423312883436,
364
- "grad_norm": 0.33138829718017737,
365
- "learning_rate": 0.00024470597504805516,
366
- "loss": 1.9025,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.15644171779141106,
371
- "grad_norm": 0.3960289244574568,
372
- "learning_rate": 0.00024435019076216627,
373
- "loss": 1.9338,
374
  "step": 51
375
  },
376
  {
377
  "epoch": 0.15950920245398773,
378
- "grad_norm": 0.44538606572029693,
379
- "learning_rate": 0.00024398314414287938,
380
- "loss": 1.9495,
381
  "step": 52
382
  },
383
  {
384
  "epoch": 0.16257668711656442,
385
- "grad_norm": 0.38091673390175385,
386
- "learning_rate": 0.00024360487387801872,
387
- "loss": 1.9579,
388
  "step": 53
389
  },
390
  {
391
  "epoch": 0.1656441717791411,
392
- "grad_norm": 0.3786713587133258,
393
- "learning_rate": 0.00024321541983841468,
394
- "loss": 1.9606,
395
  "step": 54
396
  },
397
  {
398
  "epoch": 0.1687116564417178,
399
- "grad_norm": 0.34787683708853046,
400
- "learning_rate": 0.00024281482307370142,
401
- "loss": 1.9642,
402
  "step": 55
403
  },
404
  {
405
  "epoch": 0.17177914110429449,
406
- "grad_norm": 0.7739678290668914,
407
- "learning_rate": 0.00024240312580799,
408
- "loss": 1.9082,
409
  "step": 56
410
  },
411
  {
412
  "epoch": 0.17484662576687116,
413
- "grad_norm": 0.8090200041147584,
414
- "learning_rate": 0.00024198037143541792,
415
- "loss": 1.9458,
416
  "step": 57
417
  },
418
  {
419
  "epoch": 0.17791411042944785,
420
- "grad_norm": 0.38965067919011226,
421
- "learning_rate": 0.00024154660451557508,
422
- "loss": 1.9724,
423
  "step": 58
424
  },
425
  {
426
  "epoch": 0.18098159509202455,
427
- "grad_norm": 0.525233423512868,
428
- "learning_rate": 0.0002411018707688073,
429
- "loss": 1.9726,
430
  "step": 59
431
  },
432
  {
433
  "epoch": 0.18404907975460122,
434
- "grad_norm": 0.6309847144158074,
435
- "learning_rate": 0.00024064621707139708,
436
- "loss": 1.8999,
437
  "step": 60
438
  },
439
  {
440
  "epoch": 0.18711656441717792,
441
- "grad_norm": 0.8241404186554419,
442
- "learning_rate": 0.00024017969145062278,
443
- "loss": 1.927,
444
  "step": 61
445
  },
446
  {
447
  "epoch": 0.1901840490797546,
448
- "grad_norm": 0.3936537378135966,
449
- "learning_rate": 0.0002397023430796964,
450
- "loss": 1.9457,
451
  "step": 62
452
  },
453
  {
454
  "epoch": 0.19325153374233128,
455
- "grad_norm": 0.5030215425538933,
456
- "learning_rate": 0.0002392142222725805,
457
- "loss": 1.9413,
458
  "step": 63
459
  },
460
  {
461
  "epoch": 0.19631901840490798,
462
- "grad_norm": 0.82199867849235,
463
- "learning_rate": 0.00023871538047868512,
464
- "loss": 1.8935,
465
  "step": 64
466
  },
467
  {
468
  "epoch": 0.19938650306748465,
469
- "grad_norm": 0.36522090025587745,
470
- "learning_rate": 0.00023820587027744452,
471
- "loss": 1.8778,
472
  "step": 65
473
  },
474
  {
475
  "epoch": 0.20245398773006135,
476
- "grad_norm": 0.44631812034158336,
477
- "learning_rate": 0.00023768574537277558,
478
- "loss": 1.8862,
479
  "step": 66
480
  },
481
  {
482
  "epoch": 0.20245398773006135,
483
- "eval_loss": 2.6580638885498047,
484
- "eval_runtime": 55.7526,
485
- "eval_samples_per_second": 1.794,
486
  "eval_steps_per_second": 0.126,
487
  "step": 66
488
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003067484662576687,
13
+ "grad_norm": 1.0254060683433053,
14
+ "learning_rate": 5e-06,
15
  "loss": 1.9557,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.003067484662576687,
20
  "eval_loss": 2.6437082290649414,
21
+ "eval_runtime": 55.4152,
22
+ "eval_samples_per_second": 1.805,
23
  "eval_steps_per_second": 0.126,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.006134969325153374,
28
+ "grad_norm": 0.5293660177597584,
29
+ "learning_rate": 1e-05,
30
  "loss": 1.9268,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.009202453987730062,
35
+ "grad_norm": 0.6031237810490027,
36
+ "learning_rate": 1.5e-05,
37
+ "loss": 1.9666,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.012269938650306749,
42
+ "grad_norm": 0.5216691776821837,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.9176,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.015337423312883436,
49
+ "grad_norm": 0.45736012052053565,
50
+ "learning_rate": 2.5e-05,
51
+ "loss": 1.9172,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.018404907975460124,
56
+ "grad_norm": 0.4721331330094363,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.9038,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.02147239263803681,
63
+ "grad_norm": 0.4699970169077475,
64
+ "learning_rate": 3.5e-05,
65
+ "loss": 1.972,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.024539877300613498,
70
+ "grad_norm": 0.5998147513619175,
71
+ "learning_rate": 4e-05,
72
+ "loss": 1.9115,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.027607361963190184,
77
+ "grad_norm": 0.39982194363235835,
78
+ "learning_rate": 4.5e-05,
79
+ "loss": 1.9362,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.03067484662576687,
84
+ "grad_norm": 0.41316001445589784,
85
+ "learning_rate": 5e-05,
86
+ "loss": 1.9367,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.03374233128834356,
91
+ "grad_norm": 1.978145485337434,
92
+ "learning_rate": 5.500000000000001e-05,
93
+ "loss": 1.9018,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.03680981595092025,
98
+ "grad_norm": 0.5763394527514556,
99
+ "learning_rate": 6e-05,
100
+ "loss": 1.9239,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.03987730061349693,
105
+ "grad_norm": 0.6656094180752898,
106
+ "learning_rate": 6.500000000000001e-05,
107
+ "loss": 1.8601,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.04294478527607362,
112
+ "grad_norm": 0.3779888950718134,
113
+ "learning_rate": 7e-05,
114
+ "loss": 1.9467,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.046012269938650305,
119
+ "grad_norm": 0.4210293643738542,
120
+ "learning_rate": 7.500000000000001e-05,
121
+ "loss": 1.9491,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.049079754601226995,
126
+ "grad_norm": 0.284470526924256,
127
+ "learning_rate": 8e-05,
128
+ "loss": 1.96,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.05214723926380368,
133
+ "grad_norm": 0.4511944107373649,
134
+ "learning_rate": 8.5e-05,
135
+ "loss": 1.9688,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.05521472392638037,
140
+ "grad_norm": 0.5213533339486691,
141
+ "learning_rate": 9e-05,
142
+ "loss": 1.8883,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.05828220858895705,
147
+ "grad_norm": 0.3529095514608687,
148
+ "learning_rate": 9.5e-05,
149
+ "loss": 1.9652,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.06134969325153374,
154
+ "grad_norm": 0.37388599933304034,
155
+ "learning_rate": 0.0001,
156
+ "loss": 1.9701,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.06441717791411043,
161
+ "grad_norm": 0.6715118705762056,
162
+ "learning_rate": 9.999762843192279e-05,
163
+ "loss": 1.9591,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.06748466257668712,
168
+ "grad_norm": 0.3339477252516958,
169
+ "learning_rate": 9.999051397766162e-05,
170
+ "loss": 1.8851,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.0705521472392638,
175
+ "grad_norm": 0.38292464677189253,
176
+ "learning_rate": 9.997865738710147e-05,
177
+ "loss": 1.9505,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.0736196319018405,
182
+ "grad_norm": 0.46332198422774334,
183
+ "learning_rate": 9.996205990996288e-05,
184
+ "loss": 1.8819,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.07668711656441718,
189
+ "grad_norm": 0.32033971816842144,
190
+ "learning_rate": 9.994072329567015e-05,
191
+ "loss": 1.9778,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.07975460122699386,
196
+ "grad_norm": 0.32764211011622874,
197
+ "learning_rate": 9.991464979316699e-05,
198
+ "loss": 2.0035,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.08282208588957055,
203
+ "grad_norm": 0.35749570152374016,
204
+ "learning_rate": 9.988384215067945e-05,
205
+ "loss": 1.897,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.08588957055214724,
210
+ "grad_norm": 0.47517571287279864,
211
+ "learning_rate": 9.984830361542625e-05,
212
+ "loss": 1.9916,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.08895705521472393,
217
+ "grad_norm": 0.37844919890358947,
218
+ "learning_rate": 9.980803793327656e-05,
219
+ "loss": 1.9787,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.09202453987730061,
224
+ "grad_norm": 0.3392783686369942,
225
+ "learning_rate": 9.976304934835509e-05,
226
+ "loss": 1.9915,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.0950920245398773,
231
+ "grad_norm": 0.3672803421436023,
232
+ "learning_rate": 9.97133426025948e-05,
233
+ "loss": 1.9237,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.09815950920245399,
238
+ "grad_norm": 0.3717328207326788,
239
+ "learning_rate": 9.965892293523712e-05,
240
+ "loss": 1.8755,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.10122699386503067,
245
+ "grad_norm": 0.41380648649234975,
246
+ "learning_rate": 9.959979608227961e-05,
247
+ "loss": 2.021,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.10429447852760736,
252
+ "grad_norm": 1.0263652968268477,
253
+ "learning_rate": 9.95359682758715e-05,
254
+ "loss": 1.9528,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.10736196319018405,
259
+ "grad_norm": 0.9592485389518621,
260
+ "learning_rate": 9.946744624365668e-05,
261
+ "loss": 1.9055,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.11042944785276074,
266
+ "grad_norm": 0.43725271995243464,
267
+ "learning_rate": 9.939423720806468e-05,
268
+ "loss": 1.9306,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.11349693251533742,
273
+ "grad_norm": 0.3175345165915247,
274
+ "learning_rate": 9.931634888554937e-05,
275
+ "loss": 1.9159,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.1165644171779141,
280
+ "grad_norm": 0.4731845530714391,
281
+ "learning_rate": 9.923378948577559e-05,
282
+ "loss": 1.993,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.1196319018404908,
287
+ "grad_norm": 0.3274613986874974,
288
+ "learning_rate": 9.914656771075387e-05,
289
+ "loss": 1.8971,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.12269938650306748,
294
+ "grad_norm": 0.4175774555118117,
295
+ "learning_rate": 9.90546927539232e-05,
296
+ "loss": 1.9529,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.12576687116564417,
301
+ "grad_norm": 0.4723214170983414,
302
+ "learning_rate": 9.895817429918203e-05,
303
+ "loss": 1.9775,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.12883435582822086,
308
+ "grad_norm": 0.5517874328207245,
309
+ "learning_rate": 9.885702251986753e-05,
310
+ "loss": 1.9704,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.13190184049079753,
315
+ "grad_norm": 0.7112812651734346,
316
+ "learning_rate": 9.875124807768324e-05,
317
+ "loss": 1.9396,
318
  "step": 43
319
  },
320
  {
321
  "epoch": 0.13496932515337423,
322
+ "grad_norm": 0.4122128687502141,
323
+ "learning_rate": 9.864086212157544e-05,
324
+ "loss": 1.9495,
325
  "step": 44
326
  },
327
  {
328
  "epoch": 0.13803680981595093,
329
+ "grad_norm": 0.33784719392668305,
330
+ "learning_rate": 9.852587628655787e-05,
331
+ "loss": 1.8904,
332
  "step": 45
333
  },
334
  {
335
  "epoch": 0.1411042944785276,
336
+ "grad_norm": 0.281184642101553,
337
+ "learning_rate": 9.840630269248549e-05,
338
+ "loss": 1.9156,
339
  "step": 46
340
  },
341
  {
342
  "epoch": 0.1441717791411043,
343
+ "grad_norm": 0.7601259994555819,
344
+ "learning_rate": 9.828215394277687e-05,
345
+ "loss": 1.9516,
346
  "step": 47
347
  },
348
  {
349
  "epoch": 0.147239263803681,
350
+ "grad_norm": 0.36449789385058556,
351
+ "learning_rate": 9.815344312308587e-05,
352
+ "loss": 1.9182,
353
  "step": 48
354
  },
355
  {
356
  "epoch": 0.15030674846625766,
357
+ "grad_norm": 0.32613788602651017,
358
+ "learning_rate": 9.80201837999223e-05,
359
+ "loss": 1.9367,
360
  "step": 49
361
  },
362
  {
363
  "epoch": 0.15337423312883436,
364
+ "grad_norm": 0.4437625986967123,
365
+ "learning_rate": 9.788239001922206e-05,
366
+ "loss": 1.8838,
367
  "step": 50
368
  },
369
  {
370
  "epoch": 0.15644171779141106,
371
+ "grad_norm": 0.7368917728925937,
372
+ "learning_rate": 9.774007630486651e-05,
373
+ "loss": 1.9125,
374
  "step": 51
375
  },
376
  {
377
  "epoch": 0.15950920245398773,
378
+ "grad_norm": 0.43661779665549927,
379
+ "learning_rate": 9.759325765715176e-05,
380
+ "loss": 1.9309,
381
  "step": 52
382
  },
383
  {
384
  "epoch": 0.16257668711656442,
385
+ "grad_norm": 0.27925292993087114,
386
+ "learning_rate": 9.744194955120748e-05,
387
+ "loss": 1.9374,
388
  "step": 53
389
  },
390
  {
391
  "epoch": 0.1656441717791411,
392
+ "grad_norm": 0.46390992287233235,
393
+ "learning_rate": 9.728616793536588e-05,
394
+ "loss": 1.9425,
395
  "step": 54
396
  },
397
  {
398
  "epoch": 0.1687116564417178,
399
+ "grad_norm": 0.2514992126441497,
400
+ "learning_rate": 9.712592922948057e-05,
401
+ "loss": 1.9482,
402
  "step": 55
403
  },
404
  {
405
  "epoch": 0.17177914110429449,
406
+ "grad_norm": 0.2703640459793386,
407
+ "learning_rate": 9.6961250323196e-05,
408
+ "loss": 1.8895,
409
  "step": 56
410
  },
411
  {
412
  "epoch": 0.17484662576687116,
413
+ "grad_norm": 0.561176184389631,
414
+ "learning_rate": 9.679214857416717e-05,
415
+ "loss": 1.928,
416
  "step": 57
417
  },
418
  {
419
  "epoch": 0.17791411042944785,
420
+ "grad_norm": 0.29671160399395613,
421
+ "learning_rate": 9.661864180623003e-05,
422
+ "loss": 1.9542,
423
  "step": 58
424
  },
425
  {
426
  "epoch": 0.18098159509202455,
427
+ "grad_norm": 0.28259623949277235,
428
+ "learning_rate": 9.644074830752293e-05,
429
+ "loss": 1.9519,
430
  "step": 59
431
  },
432
  {
433
  "epoch": 0.18404907975460122,
434
+ "grad_norm": 0.32102511884381013,
435
+ "learning_rate": 9.625848682855884e-05,
436
+ "loss": 1.8776,
437
  "step": 60
438
  },
439
  {
440
  "epoch": 0.18711656441717792,
441
+ "grad_norm": 1.6811025479349568,
442
+ "learning_rate": 9.607187658024912e-05,
443
+ "loss": 1.9016,
444
  "step": 61
445
  },
446
  {
447
  "epoch": 0.1901840490797546,
448
+ "grad_norm": 0.2951789033160566,
449
+ "learning_rate": 9.588093723187857e-05,
450
+ "loss": 1.9204,
451
  "step": 62
452
  },
453
  {
454
  "epoch": 0.19325153374233128,
455
+ "grad_norm": 0.35508359779387055,
456
+ "learning_rate": 9.568568890903221e-05,
457
+ "loss": 1.9144,
458
  "step": 63
459
  },
460
  {
461
  "epoch": 0.19631901840490798,
462
+ "grad_norm": 0.3620090919465414,
463
+ "learning_rate": 9.548615219147405e-05,
464
+ "loss": 1.8699,
465
  "step": 64
466
  },
467
  {
468
  "epoch": 0.19938650306748465,
469
+ "grad_norm": 0.3475528667692185,
470
+ "learning_rate": 9.528234811097782e-05,
471
+ "loss": 1.855,
472
  "step": 65
473
  },
474
  {
475
  "epoch": 0.20245398773006135,
476
+ "grad_norm": 0.2922421805064443,
477
+ "learning_rate": 9.507429814911024e-05,
478
+ "loss": 1.8648,
479
  "step": 66
480
  },
481
  {
482
  "epoch": 0.20245398773006135,
483
+ "eval_loss": 2.6012535095214844,
484
+ "eval_runtime": 55.5905,
485
+ "eval_samples_per_second": 1.799,
486
  "eval_steps_per_second": 0.126,
487
  "step": 66
488
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46d6cb0eb1e7ca6e84cff1f8ec963246766ca8b78e905f9a2825914974167129
3
  size 8120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b2ca02ad4e9caf3f4e3a14715869eddab19db1759edf6925cc0fa1214003d8
3
  size 8120