ganghyeon committed on
Commit
a081187
1 Parent(s): cc5d710

Upload fine-tuned Llama model for order analysis

Browse files
README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.12.0
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.13.2
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "q_proj",
24
  "o_proj",
25
- "down_proj",
26
- "up_proj",
27
  "v_proj",
28
- "gate_proj",
29
- "k_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "gate_proj",
24
  "q_proj",
25
  "o_proj",
 
 
26
  "v_proj",
27
+ "up_proj",
28
+ "k_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f387711e63c8ab74a59664fd7c1a08083964c4f023e8f9c181ad0376b5caf698
3
  size 22573704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fd16f2186e694fe6230262c312e61c10a3d7f9e03f38474b129ae0681d1da30
3
  size 22573704
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e545c92e8a29e73b2a708438df82a773c90ac81fb95e3d77084344d71129ce4c
3
  size 45276986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efdb0ee0bd918e11c0e37a9a99a18cbed253d9301bb54c58dcf8b9443eb5bd63
3
  size 45276986
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:134bd3caf7fa7a05a76100cdc2365343eb2f59dc0c82afde6756800f9549f1f9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9182312abdf3f5166a5d65d247c9bb66837869cb2d0f153f14a68891be22ee4
3
  size 14244
trainer_state.json CHANGED
@@ -10,492 +10,492 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.014285714285714285,
13
- "grad_norm": 2.279883623123169,
14
  "learning_rate": 1.4285714285714285e-05,
15
- "loss": 2.5982,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.02857142857142857,
20
- "grad_norm": 1.5012547969818115,
21
  "learning_rate": 2.857142857142857e-05,
22
- "loss": 2.3142,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.04285714285714286,
27
- "grad_norm": 1.7552474737167358,
28
  "learning_rate": 4.2857142857142856e-05,
29
- "loss": 1.7833,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.05714285714285714,
34
- "grad_norm": 1.8790605068206787,
35
  "learning_rate": 5.714285714285714e-05,
36
- "loss": 1.2697,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.07142857142857142,
41
- "grad_norm": 1.6815859079360962,
42
  "learning_rate": 7.142857142857143e-05,
43
- "loss": 1.049,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.08571428571428572,
48
- "grad_norm": 1.561714768409729,
49
  "learning_rate": 8.571428571428571e-05,
50
- "loss": 0.9491,
51
  "step": 120
52
  },
53
  {
54
  "epoch": 0.1,
55
- "grad_norm": 1.4625489711761475,
56
  "learning_rate": 0.0001,
57
- "loss": 0.8847,
58
  "step": 140
59
  },
60
  {
61
  "epoch": 0.11428571428571428,
62
- "grad_norm": 2.1226258277893066,
63
  "learning_rate": 9.841269841269841e-05,
64
- "loss": 0.8167,
65
  "step": 160
66
  },
67
  {
68
  "epoch": 0.12857142857142856,
69
- "grad_norm": 1.3709640502929688,
70
  "learning_rate": 9.682539682539682e-05,
71
- "loss": 0.7321,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 0.14285714285714285,
76
- "grad_norm": 1.2312499284744263,
77
  "learning_rate": 9.523809523809524e-05,
78
- "loss": 0.693,
79
  "step": 200
80
  },
81
  {
82
  "epoch": 0.15714285714285714,
83
- "grad_norm": 1.4416557550430298,
84
  "learning_rate": 9.365079365079366e-05,
85
- "loss": 0.6389,
86
  "step": 220
87
  },
88
  {
89
  "epoch": 0.17142857142857143,
90
- "grad_norm": 1.7498096227645874,
91
  "learning_rate": 9.206349206349206e-05,
92
- "loss": 0.6421,
93
  "step": 240
94
  },
95
  {
96
  "epoch": 0.18571428571428572,
97
- "grad_norm": 1.6708226203918457,
98
  "learning_rate": 9.047619047619048e-05,
99
- "loss": 0.6309,
100
  "step": 260
101
  },
102
  {
103
  "epoch": 0.2,
104
- "grad_norm": 1.7032530307769775,
105
  "learning_rate": 8.888888888888889e-05,
106
- "loss": 0.6388,
107
  "step": 280
108
  },
109
  {
110
  "epoch": 0.21428571428571427,
111
- "grad_norm": 1.1614326238632202,
112
  "learning_rate": 8.730158730158731e-05,
113
- "loss": 0.5888,
114
  "step": 300
115
  },
116
  {
117
  "epoch": 0.22857142857142856,
118
- "grad_norm": 1.7418193817138672,
119
  "learning_rate": 8.571428571428571e-05,
120
- "loss": 0.5878,
121
  "step": 320
122
  },
123
  {
124
  "epoch": 0.24285714285714285,
125
- "grad_norm": 1.3387174606323242,
126
  "learning_rate": 8.412698412698413e-05,
127
- "loss": 0.5962,
128
  "step": 340
129
  },
130
  {
131
  "epoch": 0.2571428571428571,
132
- "grad_norm": 1.1994811296463013,
133
  "learning_rate": 8.253968253968255e-05,
134
- "loss": 0.6093,
135
  "step": 360
136
  },
137
  {
138
  "epoch": 0.2714285714285714,
139
- "grad_norm": 1.5204330682754517,
140
  "learning_rate": 8.095238095238096e-05,
141
- "loss": 0.6045,
142
  "step": 380
143
  },
144
  {
145
  "epoch": 0.2857142857142857,
146
- "grad_norm": 1.2687711715698242,
147
  "learning_rate": 7.936507936507937e-05,
148
- "loss": 0.5934,
149
  "step": 400
150
  },
151
  {
152
  "epoch": 0.3,
153
- "grad_norm": 1.4332380294799805,
154
  "learning_rate": 7.777777777777778e-05,
155
- "loss": 0.5883,
156
  "step": 420
157
  },
158
  {
159
  "epoch": 0.3142857142857143,
160
- "grad_norm": 1.5756443738937378,
161
  "learning_rate": 7.619047619047618e-05,
162
- "loss": 0.5605,
163
  "step": 440
164
  },
165
  {
166
  "epoch": 0.32857142857142857,
167
- "grad_norm": 1.405213713645935,
168
  "learning_rate": 7.460317460317461e-05,
169
- "loss": 0.5993,
170
  "step": 460
171
  },
172
  {
173
  "epoch": 0.34285714285714286,
174
- "grad_norm": 1.480230450630188,
175
  "learning_rate": 7.301587301587302e-05,
176
- "loss": 0.5896,
177
  "step": 480
178
  },
179
  {
180
  "epoch": 0.35714285714285715,
181
- "grad_norm": 1.472406029701233,
182
  "learning_rate": 7.142857142857143e-05,
183
- "loss": 0.5661,
184
  "step": 500
185
  },
186
  {
187
  "epoch": 0.37142857142857144,
188
- "grad_norm": 1.408607006072998,
189
  "learning_rate": 6.984126984126984e-05,
190
- "loss": 0.545,
191
  "step": 520
192
  },
193
  {
194
  "epoch": 0.38571428571428573,
195
- "grad_norm": 1.3194152116775513,
196
  "learning_rate": 6.825396825396825e-05,
197
- "loss": 0.5366,
198
  "step": 540
199
  },
200
  {
201
  "epoch": 0.4,
202
- "grad_norm": 1.5078984498977661,
203
  "learning_rate": 6.666666666666667e-05,
204
- "loss": 0.5236,
205
  "step": 560
206
  },
207
  {
208
  "epoch": 0.4142857142857143,
209
- "grad_norm": 1.3387917280197144,
210
  "learning_rate": 6.507936507936509e-05,
211
- "loss": 0.5545,
212
  "step": 580
213
  },
214
  {
215
  "epoch": 0.42857142857142855,
216
- "grad_norm": 1.1835085153579712,
217
  "learning_rate": 6.349206349206349e-05,
218
- "loss": 0.563,
219
  "step": 600
220
  },
221
  {
222
  "epoch": 0.44285714285714284,
223
- "grad_norm": 1.424862027168274,
224
  "learning_rate": 6.19047619047619e-05,
225
- "loss": 0.5369,
226
  "step": 620
227
  },
228
  {
229
  "epoch": 0.45714285714285713,
230
- "grad_norm": 1.3369919061660767,
231
  "learning_rate": 6.0317460317460316e-05,
232
- "loss": 0.576,
233
  "step": 640
234
  },
235
  {
236
  "epoch": 0.4714285714285714,
237
- "grad_norm": 1.2523393630981445,
238
  "learning_rate": 5.873015873015873e-05,
239
- "loss": 0.5245,
240
  "step": 660
241
  },
242
  {
243
  "epoch": 0.4857142857142857,
244
- "grad_norm": 1.6725609302520752,
245
  "learning_rate": 5.714285714285714e-05,
246
- "loss": 0.5047,
247
  "step": 680
248
  },
249
  {
250
  "epoch": 0.5,
251
- "grad_norm": 1.3288273811340332,
252
  "learning_rate": 5.555555555555556e-05,
253
- "loss": 0.5396,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.5142857142857142,
258
- "grad_norm": 1.492069125175476,
259
  "learning_rate": 5.396825396825397e-05,
260
- "loss": 0.5099,
261
  "step": 720
262
  },
263
  {
264
  "epoch": 0.5285714285714286,
265
- "grad_norm": 1.508617639541626,
266
  "learning_rate": 5.2380952380952384e-05,
267
- "loss": 0.503,
268
  "step": 740
269
  },
270
  {
271
  "epoch": 0.5428571428571428,
272
- "grad_norm": 1.6115648746490479,
273
  "learning_rate": 5.0793650793650794e-05,
274
- "loss": 0.5571,
275
  "step": 760
276
  },
277
  {
278
  "epoch": 0.5571428571428572,
279
- "grad_norm": 1.4812785387039185,
280
  "learning_rate": 4.9206349206349204e-05,
281
- "loss": 0.5036,
282
  "step": 780
283
  },
284
  {
285
  "epoch": 0.5714285714285714,
286
- "grad_norm": 1.555457353591919,
287
  "learning_rate": 4.761904761904762e-05,
288
- "loss": 0.518,
289
  "step": 800
290
  },
291
  {
292
  "epoch": 0.5857142857142857,
293
- "grad_norm": 1.6743320226669312,
294
  "learning_rate": 4.603174603174603e-05,
295
- "loss": 0.523,
296
  "step": 820
297
  },
298
  {
299
  "epoch": 0.6,
300
- "grad_norm": 1.6365365982055664,
301
  "learning_rate": 4.4444444444444447e-05,
302
- "loss": 0.5112,
303
  "step": 840
304
  },
305
  {
306
  "epoch": 0.6142857142857143,
307
- "grad_norm": 1.4804445505142212,
308
  "learning_rate": 4.2857142857142856e-05,
309
- "loss": 0.5177,
310
  "step": 860
311
  },
312
  {
313
  "epoch": 0.6285714285714286,
314
- "grad_norm": 1.5929114818572998,
315
  "learning_rate": 4.126984126984127e-05,
316
- "loss": 0.4895,
317
  "step": 880
318
  },
319
  {
320
  "epoch": 0.6428571428571429,
321
- "grad_norm": 1.512065052986145,
322
  "learning_rate": 3.968253968253968e-05,
323
- "loss": 0.5158,
324
  "step": 900
325
  },
326
  {
327
  "epoch": 0.6571428571428571,
328
- "grad_norm": 1.5385123491287231,
329
  "learning_rate": 3.809523809523809e-05,
330
- "loss": 0.5188,
331
  "step": 920
332
  },
333
  {
334
  "epoch": 0.6714285714285714,
335
- "grad_norm": 1.7010993957519531,
336
  "learning_rate": 3.650793650793651e-05,
337
- "loss": 0.5039,
338
  "step": 940
339
  },
340
  {
341
  "epoch": 0.6857142857142857,
342
- "grad_norm": 1.4756510257720947,
343
  "learning_rate": 3.492063492063492e-05,
344
- "loss": 0.5004,
345
  "step": 960
346
  },
347
  {
348
  "epoch": 0.7,
349
- "grad_norm": 1.407616376876831,
350
  "learning_rate": 3.3333333333333335e-05,
351
- "loss": 0.4947,
352
  "step": 980
353
  },
354
  {
355
  "epoch": 0.7142857142857143,
356
- "grad_norm": 1.376063346862793,
357
  "learning_rate": 3.1746031746031745e-05,
358
- "loss": 0.4797,
359
  "step": 1000
360
  },
361
  {
362
  "epoch": 0.7285714285714285,
363
- "grad_norm": 1.6061830520629883,
364
  "learning_rate": 3.0158730158730158e-05,
365
- "loss": 0.4872,
366
  "step": 1020
367
  },
368
  {
369
  "epoch": 0.7428571428571429,
370
- "grad_norm": 1.4005217552185059,
371
  "learning_rate": 2.857142857142857e-05,
372
- "loss": 0.4648,
373
  "step": 1040
374
  },
375
  {
376
  "epoch": 0.7571428571428571,
377
- "grad_norm": 1.4235899448394775,
378
  "learning_rate": 2.6984126984126984e-05,
379
- "loss": 0.463,
380
  "step": 1060
381
  },
382
  {
383
  "epoch": 0.7714285714285715,
384
- "grad_norm": 1.210481882095337,
385
  "learning_rate": 2.5396825396825397e-05,
386
- "loss": 0.4728,
387
  "step": 1080
388
  },
389
  {
390
  "epoch": 0.7857142857142857,
391
- "grad_norm": 1.6099470853805542,
392
  "learning_rate": 2.380952380952381e-05,
393
- "loss": 0.485,
394
  "step": 1100
395
  },
396
  {
397
  "epoch": 0.8,
398
- "grad_norm": 1.6083734035491943,
399
  "learning_rate": 2.2222222222222223e-05,
400
- "loss": 0.4594,
401
  "step": 1120
402
  },
403
  {
404
  "epoch": 0.8142857142857143,
405
- "grad_norm": 1.350246548652649,
406
  "learning_rate": 2.0634920634920636e-05,
407
- "loss": 0.4763,
408
  "step": 1140
409
  },
410
  {
411
  "epoch": 0.8285714285714286,
412
- "grad_norm": 1.3000835180282593,
413
  "learning_rate": 1.9047619047619046e-05,
414
- "loss": 0.4705,
415
  "step": 1160
416
  },
417
  {
418
  "epoch": 0.8428571428571429,
419
- "grad_norm": 1.2059348821640015,
420
  "learning_rate": 1.746031746031746e-05,
421
- "loss": 0.4876,
422
  "step": 1180
423
  },
424
  {
425
  "epoch": 0.8571428571428571,
426
- "grad_norm": 1.3652459383010864,
427
  "learning_rate": 1.5873015873015872e-05,
428
- "loss": 0.4919,
429
  "step": 1200
430
  },
431
  {
432
  "epoch": 0.8714285714285714,
433
- "grad_norm": 1.451910376548767,
434
  "learning_rate": 1.4285714285714285e-05,
435
- "loss": 0.463,
436
  "step": 1220
437
  },
438
  {
439
  "epoch": 0.8857142857142857,
440
- "grad_norm": 1.4704546928405762,
441
  "learning_rate": 1.2698412698412699e-05,
442
- "loss": 0.4768,
443
  "step": 1240
444
  },
445
  {
446
  "epoch": 0.9,
447
- "grad_norm": 1.5009324550628662,
448
  "learning_rate": 1.1111111111111112e-05,
449
- "loss": 0.4516,
450
  "step": 1260
451
  },
452
  {
453
  "epoch": 0.9142857142857143,
454
- "grad_norm": 1.5519717931747437,
455
  "learning_rate": 9.523809523809523e-06,
456
- "loss": 0.4517,
457
  "step": 1280
458
  },
459
  {
460
  "epoch": 0.9285714285714286,
461
- "grad_norm": 1.5606343746185303,
462
  "learning_rate": 7.936507936507936e-06,
463
- "loss": 0.4732,
464
  "step": 1300
465
  },
466
  {
467
  "epoch": 0.9428571428571428,
468
- "grad_norm": 1.3639295101165771,
469
  "learning_rate": 6.349206349206349e-06,
470
- "loss": 0.4791,
471
  "step": 1320
472
  },
473
  {
474
  "epoch": 0.9571428571428572,
475
- "grad_norm": 1.6867655515670776,
476
  "learning_rate": 4.7619047619047615e-06,
477
- "loss": 0.4968,
478
  "step": 1340
479
  },
480
  {
481
  "epoch": 0.9714285714285714,
482
- "grad_norm": 1.4185600280761719,
483
  "learning_rate": 3.1746031746031746e-06,
484
- "loss": 0.4902,
485
  "step": 1360
486
  },
487
  {
488
  "epoch": 0.9857142857142858,
489
- "grad_norm": 1.7370814085006714,
490
  "learning_rate": 1.5873015873015873e-06,
491
- "loss": 0.4564,
492
  "step": 1380
493
  },
494
  {
495
  "epoch": 1.0,
496
- "grad_norm": 1.6167285442352295,
497
  "learning_rate": 0.0,
498
- "loss": 0.4652,
499
  "step": 1400
500
  }
501
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.014285714285714285,
13
+ "grad_norm": 1.8742432594299316,
14
  "learning_rate": 1.4285714285714285e-05,
15
+ "loss": 2.6888,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.02857142857142857,
20
+ "grad_norm": 1.6443878412246704,
21
  "learning_rate": 2.857142857142857e-05,
22
+ "loss": 2.289,
23
  "step": 40
24
  },
25
  {
26
  "epoch": 0.04285714285714286,
27
+ "grad_norm": 1.8094583749771118,
28
  "learning_rate": 4.2857142857142856e-05,
29
+ "loss": 1.7939,
30
  "step": 60
31
  },
32
  {
33
  "epoch": 0.05714285714285714,
34
+ "grad_norm": 1.4206470251083374,
35
  "learning_rate": 5.714285714285714e-05,
36
+ "loss": 1.312,
37
  "step": 80
38
  },
39
  {
40
  "epoch": 0.07142857142857142,
41
+ "grad_norm": 1.4811224937438965,
42
  "learning_rate": 7.142857142857143e-05,
43
+ "loss": 1.0735,
44
  "step": 100
45
  },
46
  {
47
  "epoch": 0.08571428571428572,
48
+ "grad_norm": 1.4770212173461914,
49
  "learning_rate": 8.571428571428571e-05,
50
+ "loss": 0.9261,
51
  "step": 120
52
  },
53
  {
54
  "epoch": 0.1,
55
+ "grad_norm": 1.3422306776046753,
56
  "learning_rate": 0.0001,
57
+ "loss": 0.882,
58
  "step": 140
59
  },
60
  {
61
  "epoch": 0.11428571428571428,
62
+ "grad_norm": 1.5494495630264282,
63
  "learning_rate": 9.841269841269841e-05,
64
+ "loss": 0.8599,
65
  "step": 160
66
  },
67
  {
68
  "epoch": 0.12857142857142856,
69
+ "grad_norm": 1.4366521835327148,
70
  "learning_rate": 9.682539682539682e-05,
71
+ "loss": 0.7784,
72
  "step": 180
73
  },
74
  {
75
  "epoch": 0.14285714285714285,
76
+ "grad_norm": 1.731536865234375,
77
  "learning_rate": 9.523809523809524e-05,
78
+ "loss": 0.7404,
79
  "step": 200
80
  },
81
  {
82
  "epoch": 0.15714285714285714,
83
+ "grad_norm": 1.2290751934051514,
84
  "learning_rate": 9.365079365079366e-05,
85
+ "loss": 0.6393,
86
  "step": 220
87
  },
88
  {
89
  "epoch": 0.17142857142857143,
90
+ "grad_norm": 1.5464402437210083,
91
  "learning_rate": 9.206349206349206e-05,
92
+ "loss": 0.6414,
93
  "step": 240
94
  },
95
  {
96
  "epoch": 0.18571428571428572,
97
+ "grad_norm": 1.628503680229187,
98
  "learning_rate": 9.047619047619048e-05,
99
+ "loss": 0.6233,
100
  "step": 260
101
  },
102
  {
103
  "epoch": 0.2,
104
+ "grad_norm": 1.8485362529754639,
105
  "learning_rate": 8.888888888888889e-05,
106
+ "loss": 0.6558,
107
  "step": 280
108
  },
109
  {
110
  "epoch": 0.21428571428571427,
111
+ "grad_norm": 1.4331471920013428,
112
  "learning_rate": 8.730158730158731e-05,
113
+ "loss": 0.624,
114
  "step": 300
115
  },
116
  {
117
  "epoch": 0.22857142857142856,
118
+ "grad_norm": 1.6106165647506714,
119
  "learning_rate": 8.571428571428571e-05,
120
+ "loss": 0.6276,
121
  "step": 320
122
  },
123
  {
124
  "epoch": 0.24285714285714285,
125
+ "grad_norm": 1.328774094581604,
126
  "learning_rate": 8.412698412698413e-05,
127
+ "loss": 0.5928,
128
  "step": 340
129
  },
130
  {
131
  "epoch": 0.2571428571428571,
132
+ "grad_norm": 1.4172090291976929,
133
  "learning_rate": 8.253968253968255e-05,
134
+ "loss": 0.597,
135
  "step": 360
136
  },
137
  {
138
  "epoch": 0.2714285714285714,
139
+ "grad_norm": 1.2621384859085083,
140
  "learning_rate": 8.095238095238096e-05,
141
+ "loss": 0.6336,
142
  "step": 380
143
  },
144
  {
145
  "epoch": 0.2857142857142857,
146
+ "grad_norm": 1.4633510112762451,
147
  "learning_rate": 7.936507936507937e-05,
148
+ "loss": 0.6165,
149
  "step": 400
150
  },
151
  {
152
  "epoch": 0.3,
153
+ "grad_norm": 1.5444340705871582,
154
  "learning_rate": 7.777777777777778e-05,
155
+ "loss": 0.6204,
156
  "step": 420
157
  },
158
  {
159
  "epoch": 0.3142857142857143,
160
+ "grad_norm": 1.758965253829956,
161
  "learning_rate": 7.619047619047618e-05,
162
+ "loss": 0.6122,
163
  "step": 440
164
  },
165
  {
166
  "epoch": 0.32857142857142857,
167
+ "grad_norm": 1.4885847568511963,
168
  "learning_rate": 7.460317460317461e-05,
169
+ "loss": 0.5739,
170
  "step": 460
171
  },
172
  {
173
  "epoch": 0.34285714285714286,
174
+ "grad_norm": 1.6695561408996582,
175
  "learning_rate": 7.301587301587302e-05,
176
+ "loss": 0.5807,
177
  "step": 480
178
  },
179
  {
180
  "epoch": 0.35714285714285715,
181
+ "grad_norm": 1.7690757513046265,
182
  "learning_rate": 7.142857142857143e-05,
183
+ "loss": 0.5633,
184
  "step": 500
185
  },
186
  {
187
  "epoch": 0.37142857142857144,
188
+ "grad_norm": 1.4946894645690918,
189
  "learning_rate": 6.984126984126984e-05,
190
+ "loss": 0.5602,
191
  "step": 520
192
  },
193
  {
194
  "epoch": 0.38571428571428573,
195
+ "grad_norm": 1.5215212106704712,
196
  "learning_rate": 6.825396825396825e-05,
197
+ "loss": 0.5381,
198
  "step": 540
199
  },
200
  {
201
  "epoch": 0.4,
202
+ "grad_norm": 1.1956950426101685,
203
  "learning_rate": 6.666666666666667e-05,
204
+ "loss": 0.5281,
205
  "step": 560
206
  },
207
  {
208
  "epoch": 0.4142857142857143,
209
+ "grad_norm": 1.315496563911438,
210
  "learning_rate": 6.507936507936509e-05,
211
+ "loss": 0.551,
212
  "step": 580
213
  },
214
  {
215
  "epoch": 0.42857142857142855,
216
+ "grad_norm": 1.3885098695755005,
217
  "learning_rate": 6.349206349206349e-05,
218
+ "loss": 0.5428,
219
  "step": 600
220
  },
221
  {
222
  "epoch": 0.44285714285714284,
223
+ "grad_norm": 1.4572407007217407,
224
  "learning_rate": 6.19047619047619e-05,
225
+ "loss": 0.5465,
226
  "step": 620
227
  },
228
  {
229
  "epoch": 0.45714285714285713,
230
+ "grad_norm": 1.411399006843567,
231
  "learning_rate": 6.0317460317460316e-05,
232
+ "loss": 0.5263,
233
  "step": 640
234
  },
235
  {
236
  "epoch": 0.4714285714285714,
237
+ "grad_norm": 1.3295258283615112,
238
  "learning_rate": 5.873015873015873e-05,
239
+ "loss": 0.5248,
240
  "step": 660
241
  },
242
  {
243
  "epoch": 0.4857142857142857,
244
+ "grad_norm": 1.512662410736084,
245
  "learning_rate": 5.714285714285714e-05,
246
+ "loss": 0.5213,
247
  "step": 680
248
  },
249
  {
250
  "epoch": 0.5,
251
+ "grad_norm": 1.3572126626968384,
252
  "learning_rate": 5.555555555555556e-05,
253
+ "loss": 0.5003,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.5142857142857142,
258
+ "grad_norm": 1.4415481090545654,
259
  "learning_rate": 5.396825396825397e-05,
260
+ "loss": 0.5529,
261
  "step": 720
262
  },
263
  {
264
  "epoch": 0.5285714285714286,
265
+ "grad_norm": 1.4465086460113525,
266
  "learning_rate": 5.2380952380952384e-05,
267
+ "loss": 0.5266,
268
  "step": 740
269
  },
270
  {
271
  "epoch": 0.5428571428571428,
272
+ "grad_norm": 1.3659744262695312,
273
  "learning_rate": 5.0793650793650794e-05,
274
+ "loss": 0.533,
275
  "step": 760
276
  },
277
  {
278
  "epoch": 0.5571428571428572,
279
+ "grad_norm": 1.5549241304397583,
280
  "learning_rate": 4.9206349206349204e-05,
281
+ "loss": 0.5139,
282
  "step": 780
283
  },
284
  {
285
  "epoch": 0.5714285714285714,
286
+ "grad_norm": 1.5689399242401123,
287
  "learning_rate": 4.761904761904762e-05,
288
+ "loss": 0.5188,
289
  "step": 800
290
  },
291
  {
292
  "epoch": 0.5857142857142857,
293
+ "grad_norm": 1.3908042907714844,
294
  "learning_rate": 4.603174603174603e-05,
295
+ "loss": 0.529,
296
  "step": 820
297
  },
298
  {
299
  "epoch": 0.6,
300
+ "grad_norm": 1.7993814945220947,
301
  "learning_rate": 4.4444444444444447e-05,
302
+ "loss": 0.548,
303
  "step": 840
304
  },
305
  {
306
  "epoch": 0.6142857142857143,
307
+ "grad_norm": 1.5198805332183838,
308
  "learning_rate": 4.2857142857142856e-05,
309
+ "loss": 0.5367,
310
  "step": 860
311
  },
312
  {
313
  "epoch": 0.6285714285714286,
314
+ "grad_norm": 1.4617140293121338,
315
  "learning_rate": 4.126984126984127e-05,
316
+ "loss": 0.5016,
317
  "step": 880
318
  },
319
  {
320
  "epoch": 0.6428571428571429,
321
+ "grad_norm": 1.6662240028381348,
322
  "learning_rate": 3.968253968253968e-05,
323
+ "loss": 0.5078,
324
  "step": 900
325
  },
326
  {
327
  "epoch": 0.6571428571428571,
328
+ "grad_norm": 1.617306113243103,
329
  "learning_rate": 3.809523809523809e-05,
330
+ "loss": 0.5237,
331
  "step": 920
332
  },
333
  {
334
  "epoch": 0.6714285714285714,
335
+ "grad_norm": 1.7389674186706543,
336
  "learning_rate": 3.650793650793651e-05,
337
+ "loss": 0.4726,
338
  "step": 940
339
  },
340
  {
341
  "epoch": 0.6857142857142857,
342
+ "grad_norm": 1.54181706905365,
343
  "learning_rate": 3.492063492063492e-05,
344
+ "loss": 0.5232,
345
  "step": 960
346
  },
347
  {
348
  "epoch": 0.7,
349
+ "grad_norm": 1.4722188711166382,
350
  "learning_rate": 3.3333333333333335e-05,
351
+ "loss": 0.532,
352
  "step": 980
353
  },
354
  {
355
  "epoch": 0.7142857142857143,
356
+ "grad_norm": 1.3351426124572754,
357
  "learning_rate": 3.1746031746031745e-05,
358
+ "loss": 0.5015,
359
  "step": 1000
360
  },
361
  {
362
  "epoch": 0.7285714285714285,
363
+ "grad_norm": 1.491120457649231,
364
  "learning_rate": 3.0158730158730158e-05,
365
+ "loss": 0.5201,
366
  "step": 1020
367
  },
368
  {
369
  "epoch": 0.7428571428571429,
370
+ "grad_norm": 1.24501633644104,
371
  "learning_rate": 2.857142857142857e-05,
372
+ "loss": 0.4943,
373
  "step": 1040
374
  },
375
  {
376
  "epoch": 0.7571428571428571,
377
+ "grad_norm": 1.4944851398468018,
378
  "learning_rate": 2.6984126984126984e-05,
379
+ "loss": 0.4821,
380
  "step": 1060
381
  },
382
  {
383
  "epoch": 0.7714285714285715,
384
+ "grad_norm": 1.513795256614685,
385
  "learning_rate": 2.5396825396825397e-05,
386
+ "loss": 0.481,
387
  "step": 1080
388
  },
389
  {
390
  "epoch": 0.7857142857142857,
391
+ "grad_norm": 1.8671048879623413,
392
  "learning_rate": 2.380952380952381e-05,
393
+ "loss": 0.48,
394
  "step": 1100
395
  },
396
  {
397
  "epoch": 0.8,
398
+ "grad_norm": 1.3706579208374023,
399
  "learning_rate": 2.2222222222222223e-05,
400
+ "loss": 0.4979,
401
  "step": 1120
402
  },
403
  {
404
  "epoch": 0.8142857142857143,
405
+ "grad_norm": 1.3706722259521484,
406
  "learning_rate": 2.0634920634920636e-05,
407
+ "loss": 0.4842,
408
  "step": 1140
409
  },
410
  {
411
  "epoch": 0.8285714285714286,
412
+ "grad_norm": 1.5844509601593018,
413
  "learning_rate": 1.9047619047619046e-05,
414
+ "loss": 0.4647,
415
  "step": 1160
416
  },
417
  {
418
  "epoch": 0.8428571428571429,
419
+ "grad_norm": 1.4700433015823364,
420
  "learning_rate": 1.746031746031746e-05,
421
+ "loss": 0.4861,
422
  "step": 1180
423
  },
424
  {
425
  "epoch": 0.8571428571428571,
426
+ "grad_norm": 1.5295989513397217,
427
  "learning_rate": 1.5873015873015872e-05,
428
+ "loss": 0.4719,
429
  "step": 1200
430
  },
431
  {
432
  "epoch": 0.8714285714285714,
433
+ "grad_norm": 1.4497429132461548,
434
  "learning_rate": 1.4285714285714285e-05,
435
+ "loss": 0.4761,
436
  "step": 1220
437
  },
438
  {
439
  "epoch": 0.8857142857142857,
440
+ "grad_norm": 1.5591635704040527,
441
  "learning_rate": 1.2698412698412699e-05,
442
+ "loss": 0.4787,
443
  "step": 1240
444
  },
445
  {
446
  "epoch": 0.9,
447
+ "grad_norm": 1.410678744316101,
448
  "learning_rate": 1.1111111111111112e-05,
449
+ "loss": 0.4824,
450
  "step": 1260
451
  },
452
  {
453
  "epoch": 0.9142857142857143,
454
+ "grad_norm": 1.4857158660888672,
455
  "learning_rate": 9.523809523809523e-06,
456
+ "loss": 0.4757,
457
  "step": 1280
458
  },
459
  {
460
  "epoch": 0.9285714285714286,
461
+ "grad_norm": 1.626083493232727,
462
  "learning_rate": 7.936507936507936e-06,
463
+ "loss": 0.4571,
464
  "step": 1300
465
  },
466
  {
467
  "epoch": 0.9428571428571428,
468
+ "grad_norm": 1.595832109451294,
469
  "learning_rate": 6.349206349206349e-06,
470
+ "loss": 0.4909,
471
  "step": 1320
472
  },
473
  {
474
  "epoch": 0.9571428571428572,
475
+ "grad_norm": 1.5187280178070068,
476
  "learning_rate": 4.7619047619047615e-06,
477
+ "loss": 0.4785,
478
  "step": 1340
479
  },
480
  {
481
  "epoch": 0.9714285714285714,
482
+ "grad_norm": 1.7050893306732178,
483
  "learning_rate": 3.1746031746031746e-06,
484
+ "loss": 0.4933,
485
  "step": 1360
486
  },
487
  {
488
  "epoch": 0.9857142857142858,
489
+ "grad_norm": 1.5147182941436768,
490
  "learning_rate": 1.5873015873015873e-06,
491
+ "loss": 0.4833,
492
  "step": 1380
493
  },
494
  {
495
  "epoch": 1.0,
496
+ "grad_norm": 1.7184252738952637,
497
  "learning_rate": 0.0,
498
+ "loss": 0.4739,
499
  "step": 1400
500
  }
501
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39f98817715f279a5f40c38dd70904c7137598047d2e35bac717a82d7f015fd1
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eed791846fb1238d3ca6900ad9684d9e5dc153aabd4fc98681bc70458ff8546
3
  size 5560