agentlans committed on
Commit 9f61df5
1 Parent(s): ec60add

Upload 8 files

README.md CHANGED
@@ -4,19 +4,19 @@ base_model: agentlans/multilingual-e5-small-aligned
 tags:
 - generated_from_trainer
 model-index:
-- name: multilingual-e5-small-aligned-transformed-sentiment
+- name: multilingual-e5-small-aligned-sentiment-20241214-new
   results: []
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-# multilingual-e5-small-aligned-transformed-sentiment
+# multilingual-e5-small-aligned-sentiment-20241214-new
 
 This model is a fine-tuned version of [agentlans/multilingual-e5-small-aligned](https://huggingface.co/agentlans/multilingual-e5-small-aligned) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2082
-- Mse: 0.2082
+- Loss: 0.1455
+- Mse: 0.1455
 
 ## Model description
 
@@ -36,7 +36,7 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 5e-05
-- train_batch_size: 32
+- train_batch_size: 128
 - eval_batch_size: 8
 - seed: 42
 - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
@@ -45,11 +45,11 @@ The following hyperparameters were used during training:
 
 ### Training results
 
-| Training Loss | Epoch | Step   | Validation Loss | Mse    |
-|:-------------:|:-----:|:------:|:---------------:|:------:|
-| 0.1898        | 1.0   | 54191  | 0.2322          | 0.2322 |
-| 0.1186        | 2.0   | 108382 | 0.2139          | 0.2139 |
-| 0.0861        | 3.0   | 162573 | 0.2082          | 0.2082 |
+| Training Loss | Epoch | Step  | Validation Loss | Mse    |
+|:-------------:|:-----:|:-----:|:---------------:|:------:|
+| 0.1946        | 1.0   | 7813  | 0.1647          | 0.1647 |
+| 0.1385        | 2.0   | 15626 | 0.1528          | 0.1528 |
+| 0.1121        | 3.0   | 23439 | 0.1455          | 0.1455 |
 
 
 ### Framework versions
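
The step counts in the updated results table line up with the new hyperparameters. A minimal sanity check, assuming the 1,000,000 training samples reported in the train_results.json change below and the train_batch_size of 128 above (the exact dataloader settings, e.g. drop_last, are not shown in this commit):

```python
import math

# Assumed figures from this commit: 1,000,000 train samples (train_results.json)
# and train_batch_size = 128 (README). This only checks arithmetic consistency,
# not how the Trainer actually built its dataloader.
train_samples = 1_000_000
train_batch_size = 128
num_epochs = 3

steps_per_epoch = math.ceil(train_samples / train_batch_size)
print(steps_per_epoch)               # 7813, the step count at epoch 1.0
print(steps_per_epoch * num_epochs)  # 23439, the final step at epoch 3.0
```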
all_results.json CHANGED
@@ -1,15 +1,15 @@
 {
     "epoch": 3.0,
-    "eval_loss": 0.20824576914310455,
-    "eval_mse": 0.20824573578672098,
-    "eval_runtime": 118.1347,
-    "eval_samples": 192676,
-    "eval_samples_per_second": 1630.985,
-    "eval_steps_per_second": 203.877,
-    "total_flos": 8.56700972907817e+16,
-    "train_loss": 0.16141534491764947,
-    "train_runtime": 8977.4486,
-    "train_samples": 1734084,
-    "train_samples_per_second": 579.48,
-    "train_steps_per_second": 18.109
+    "eval_loss": 0.14551572501659393,
+    "eval_mse": 0.14551572992011078,
+    "eval_runtime": 98.285,
+    "eval_samples": 182111,
+    "eval_samples_per_second": 1852.887,
+    "eval_steps_per_second": 231.612,
+    "total_flos": 4.9403660544e+16,
+    "train_loss": 0.16052449254575832,
+    "train_runtime": 3301.1135,
+    "train_samples": 1000000,
+    "train_samples_per_second": 908.784,
+    "train_steps_per_second": 7.1
 }
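
The throughput fields in the new all_results.json are the expected ratios of the sample counts, runtimes, and the eval_batch_size of 8 from the README. A small check, assuming those reported values (the Trainer's own rounding may differ slightly):

```python
import math

# Values taken from the new all_results.json / README in this commit.
eval_samples, eval_runtime, eval_batch_size = 182_111, 98.285, 8
train_samples, num_epochs, train_runtime = 1_000_000, 3, 3301.1135

print(eval_samples / eval_runtime)                               # ~1852.9 (logged 1852.887)
print(math.ceil(eval_samples / eval_batch_size) / eval_runtime)  # ~231.6  (logged 231.612)
print(train_samples * num_epochs / train_runtime)                # ~908.8  (logged 908.784)
```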
eval_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "eval_loss": 0.20824576914310455,
-    "eval_mse": 0.20824573578672098,
-    "eval_runtime": 118.1347,
-    "eval_samples": 192676,
-    "eval_samples_per_second": 1630.985,
-    "eval_steps_per_second": 203.877
+    "eval_loss": 0.14551572501659393,
+    "eval_mse": 0.14551572992011078,
+    "eval_runtime": 98.285,
+    "eval_samples": 182111,
+    "eval_samples_per_second": 1852.887,
+    "eval_steps_per_second": 231.612
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1625359b708464b43c87eea957f8f6c642c0ed136ac047d2d480e5e37858bab4
+oid sha256:ccb6389c22918ca68d07d7638d285a70d97fa83d5b864da284dbb26aaba50ba4
 size 470640124
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 3.0,
-    "total_flos": 8.56700972907817e+16,
-    "train_loss": 0.16141534491764947,
-    "train_runtime": 8977.4486,
-    "train_samples": 1734084,
-    "train_samples_per_second": 579.48,
-    "train_steps_per_second": 18.109
+    "total_flos": 4.9403660544e+16,
+    "train_loss": 0.16052449254575832,
+    "train_runtime": 3301.1135,
+    "train_samples": 1000000,
+    "train_samples_per_second": 908.784,
+    "train_steps_per_second": 7.1
 }
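
For completeness, a minimal inference sketch for the uploaded checkpoint. It assumes the model exposes a single-output sequence-classification (regression) head, which the MSE-only metric suggests; the repository id below is a placeholder, since this commit does not state the final repo path:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Placeholder id; substitute the actual repository this commit was pushed to.
model_id = "agentlans/multilingual-e5-small-aligned-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()

texts = ["I loved this film.", "This was a waste of time."]
batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    logits = model(**batch).logits  # shape (batch_size, 1) for a regression head

scores = logits.squeeze(-1).tolist()  # predicted sentiment scores
print(scores)
```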
trainer_state.json CHANGED
@@ -1,2327 +1,374 @@
 {
-    "best_metric": 0.20824576914310455,
-    "best_model_checkpoint": "multilingual-e5-small-aligned-transformed-sentiment/checkpoint-162573",
     "epoch": 3.0,
     "eval_steps": 500,
-    "global_step": 162573,
     "is_hyper_param_search": false,
     "is_local_process_zero": true,
     "is_world_process_zero": true,
     "log_history": [
         {
-            "epoch": 0.009226624347216328,
-            "grad_norm": 4.861542224884033,
-            "learning_rate": 4.98462229275464e-05,
-            "loss": 0.4705,
             "step": 500
         },
         {
-            "epoch": 0.018453248694432656,
-            "grad_norm": 4.163719177246094,
-            "learning_rate": 4.969244585509279e-05,
-            "loss": 0.3878,
             "step": 1000
         },
         {
-            "epoch": 0.027679873041648984,
-            "grad_norm": 5.356781482696533,
-            "learning_rate": 4.9538668782639185e-05,
-            "loss": 0.3727,
             "step": 1500
         },
[The removed side of this hunk continues in the same pattern for every remaining 500-step logging interval; from step 23500 onward the entire entries, including their step lines, are deleted, and the excerpt is cut off mid-hunk around step 148000. The deleted per-epoch evaluation entries were:]
-        {
-            "epoch": 1.0,
-            "eval_loss": 0.23220877349376678,
-            "eval_mse": 0.2322087733155601,
-            "eval_runtime": 114.203,
-            "eval_samples_per_second": 1687.136,
-            "eval_steps_per_second": 210.896,
-            "step": 54191
-        },
-        {
-            "epoch": 2.0,
-            "eval_loss": 0.2138589769601822,
-            "eval_mse": 0.21385898989204177,
-            "eval_runtime": 125.4291,
-            "eval_samples_per_second": 1536.134,
-            "eval_steps_per_second": 192.021,
-            "step": 108382
-        },
2106
- "step": 148500
2107
- },
2108
- {
2109
- "epoch": 2.7495340554704657,
2110
- "grad_norm": 1.7691118717193604,
2111
- "learning_rate": 4.174432408825573e-06,
2112
- "loss": 0.0859,
2113
- "step": 149000
2114
- },
2115
- {
2116
- "epoch": 2.758760679817682,
2117
- "grad_norm": 7.3577494621276855,
2118
- "learning_rate": 4.020655336371969e-06,
2119
- "loss": 0.0856,
2120
- "step": 149500
2121
- },
2122
- {
2123
- "epoch": 2.7679873041648984,
2124
- "grad_norm": 1.1883918046951294,
2125
- "learning_rate": 3.866878263918363e-06,
2126
- "loss": 0.086,
2127
- "step": 150000
2128
- },
2129
- {
2130
- "epoch": 2.7772139285121145,
2131
- "grad_norm": 2.320882797241211,
2132
- "learning_rate": 3.7131011914647575e-06,
2133
- "loss": 0.0868,
2134
- "step": 150500
2135
- },
2136
- {
2137
- "epoch": 2.786440552859331,
2138
- "grad_norm": 2.119135618209839,
2139
- "learning_rate": 3.5593241190111523e-06,
2140
- "loss": 0.0837,
2141
- "step": 151000
2142
- },
2143
- {
2144
- "epoch": 2.795667177206547,
2145
- "grad_norm": 2.0826363563537598,
2146
- "learning_rate": 3.4055470465575468e-06,
2147
- "loss": 0.0871,
2148
- "step": 151500
2149
- },
2150
- {
2151
- "epoch": 2.8048938015537637,
2152
- "grad_norm": 1.4801201820373535,
2153
- "learning_rate": 3.2517699741039408e-06,
2154
- "loss": 0.0855,
2155
- "step": 152000
2156
- },
2157
- {
2158
- "epoch": 2.8141204259009798,
2159
- "grad_norm": 3.352520227432251,
2160
- "learning_rate": 3.0979929016503356e-06,
2161
- "loss": 0.0907,
2162
- "step": 152500
2163
- },
2164
- {
2165
- "epoch": 2.8233470502481963,
2166
- "grad_norm": 3.1500301361083984,
2167
- "learning_rate": 2.94421582919673e-06,
2168
- "loss": 0.0858,
2169
- "step": 153000
2170
- },
2171
- {
2172
- "epoch": 2.8325736745954124,
2173
- "grad_norm": 1.9149506092071533,
2174
- "learning_rate": 2.790438756743125e-06,
2175
- "loss": 0.0856,
2176
- "step": 153500
2177
- },
2178
- {
2179
- "epoch": 2.841800298942629,
2180
- "grad_norm": 2.150416612625122,
2181
- "learning_rate": 2.6366616842895193e-06,
2182
- "loss": 0.0849,
2183
- "step": 154000
2184
- },
2185
- {
2186
- "epoch": 2.851026923289845,
2187
- "grad_norm": 1.613443374633789,
2188
- "learning_rate": 2.4828846118359137e-06,
2189
- "loss": 0.084,
2190
- "step": 154500
2191
- },
2192
- {
2193
- "epoch": 2.8602535476370616,
2194
- "grad_norm": 4.109127998352051,
2195
- "learning_rate": 2.3291075393823085e-06,
2196
- "loss": 0.0859,
2197
- "step": 155000
2198
- },
2199
- {
2200
- "epoch": 2.8694801719842777,
2201
- "grad_norm": 2.9541776180267334,
2202
- "learning_rate": 2.175330466928703e-06,
2203
- "loss": 0.0869,
2204
- "step": 155500
2205
- },
2206
- {
2207
- "epoch": 2.8787067963314943,
2208
- "grad_norm": 2.9944493770599365,
2209
- "learning_rate": 2.0215533944750974e-06,
2210
- "loss": 0.0861,
2211
- "step": 156000
2212
- },
2213
- {
2214
- "epoch": 2.8879334206787104,
2215
- "grad_norm": 2.072777271270752,
2216
- "learning_rate": 1.867776322021492e-06,
2217
- "loss": 0.084,
2218
- "step": 156500
2219
- },
2220
- {
2221
- "epoch": 2.897160045025927,
2222
- "grad_norm": 2.4962828159332275,
2223
- "learning_rate": 1.7139992495678866e-06,
2224
- "loss": 0.0826,
2225
- "step": 157000
2226
- },
2227
- {
2228
- "epoch": 2.906386669373143,
2229
- "grad_norm": 1.9871286153793335,
2230
- "learning_rate": 1.560222177114281e-06,
2231
- "loss": 0.086,
2232
- "step": 157500
2233
- },
2234
- {
2235
- "epoch": 2.9156132937203596,
2236
- "grad_norm": 1.9906572103500366,
2237
- "learning_rate": 1.4064451046606757e-06,
2238
- "loss": 0.0838,
2239
- "step": 158000
2240
- },
2241
- {
2242
- "epoch": 2.9248399180675757,
2243
- "grad_norm": 2.1322317123413086,
2244
- "learning_rate": 1.25266803220707e-06,
2245
- "loss": 0.0844,
2246
- "step": 158500
2247
- },
2248
- {
2249
- "epoch": 2.9340665424147923,
2250
- "grad_norm": 2.26415753364563,
2251
- "learning_rate": 1.0988909597534647e-06,
2252
- "loss": 0.0839,
2253
- "step": 159000
2254
- },
2255
- {
2256
- "epoch": 2.9432931667620084,
2257
- "grad_norm": 1.9201428890228271,
2258
- "learning_rate": 9.451138872998592e-07,
2259
- "loss": 0.0861,
2260
- "step": 159500
2261
- },
2262
- {
2263
- "epoch": 2.952519791109225,
2264
- "grad_norm": 1.4225293397903442,
2265
- "learning_rate": 7.913368148462537e-07,
2266
- "loss": 0.0867,
2267
- "step": 160000
2268
- },
2269
- {
2270
- "epoch": 2.961746415456441,
2271
- "grad_norm": 4.265661239624023,
2272
- "learning_rate": 6.375597423926483e-07,
2273
- "loss": 0.0864,
2274
- "step": 160500
2275
- },
2276
- {
2277
- "epoch": 2.9709730398036576,
2278
- "grad_norm": 3.4527368545532227,
2279
- "learning_rate": 4.837826699390428e-07,
2280
- "loss": 0.0853,
2281
- "step": 161000
2282
- },
2283
- {
2284
- "epoch": 2.9801996641508737,
2285
- "grad_norm": 1.618511438369751,
2286
- "learning_rate": 3.300055974854373e-07,
2287
- "loss": 0.0862,
2288
- "step": 161500
2289
- },
2290
- {
2291
- "epoch": 2.9894262884980902,
2292
- "grad_norm": 1.6494286060333252,
2293
- "learning_rate": 1.7622852503183185e-07,
2294
- "loss": 0.0827,
2295
- "step": 162000
2296
- },
2297
- {
2298
- "epoch": 2.9986529128453063,
2299
- "grad_norm": 1.5462067127227783,
2300
- "learning_rate": 2.24514525782264e-08,
2301
- "loss": 0.0861,
2302
- "step": 162500
2303
- },
2304
  {
2305
  "epoch": 3.0,
2306
- "eval_loss": 0.20824576914310455,
2307
- "eval_mse": 0.20824573578672098,
2308
- "eval_runtime": 124.5011,
2309
- "eval_samples_per_second": 1547.584,
2310
- "eval_steps_per_second": 193.452,
2311
- "step": 162573
2312
  },
2313
  {
2314
  "epoch": 3.0,
2315
- "step": 162573,
2316
- "total_flos": 8.56700972907817e+16,
2317
- "train_loss": 0.16141534491764947,
2318
- "train_runtime": 8977.4486,
2319
- "train_samples_per_second": 579.48,
2320
- "train_steps_per_second": 18.109
2321
  }
2322
  ],
2323
  "logging_steps": 500,
2324
- "max_steps": 162573,
2325
  "num_input_tokens_seen": 0,
2326
  "num_train_epochs": 3,
2327
  "save_steps": 500,
@@ -2337,8 +384,8 @@
2337
  "attributes": {}
2338
  }
2339
  },
2340
- "total_flos": 8.56700972907817e+16,
2341
- "train_batch_size": 32,
2342
  "trial_name": null,
2343
  "trial_params": null
2344
  }
 
1
  {
2
+ "best_metric": 0.14551572501659393,
3
+ "best_model_checkpoint": "multilingual-e5-small-aligned-sentiment-20241214-new/checkpoint-23439",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 23439,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.06399590426212723,
13
+ "grad_norm": 1.8201525211334229,
14
+ "learning_rate": 4.8933401595631215e-05,
15
+ "loss": 0.3311,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.12799180852425446,
20
+ "grad_norm": 1.933944821357727,
21
+ "learning_rate": 4.786680319126243e-05,
22
+ "loss": 0.259,
23
  "step": 1000
24
  },
25
  {
26
+ "epoch": 0.19198771278638166,
27
+ "grad_norm": 1.5761635303497314,
28
+ "learning_rate": 4.680020478689364e-05,
29
+ "loss": 0.244,
30
  "step": 1500
31
  },
32
  {
33
+ "epoch": 0.2559836170485089,
34
+ "grad_norm": 2.812056303024292,
35
+ "learning_rate": 4.573360638252485e-05,
36
+ "loss": 0.2353,
37
  "step": 2000
38
  },
39
  {
40
+ "epoch": 0.3199795213106361,
41
+ "grad_norm": 2.022963285446167,
42
+ "learning_rate": 4.4667007978156063e-05,
43
+ "loss": 0.2261,
44
  "step": 2500
45
  },
46
  {
47
+ "epoch": 0.3839754255727633,
48
+ "grad_norm": 2.1023800373077393,
49
+ "learning_rate": 4.360040957378728e-05,
50
+ "loss": 0.22,
51
  "step": 3000
52
  },
53
  {
54
+ "epoch": 0.4479713298348906,
55
+ "grad_norm": 1.8184232711791992,
56
+ "learning_rate": 4.2533811169418495e-05,
57
+ "loss": 0.2166,
58
  "step": 3500
59
  },
60
  {
61
+ "epoch": 0.5119672340970178,
62
+ "grad_norm": 1.9675606489181519,
63
+ "learning_rate": 4.146721276504971e-05,
64
+ "loss": 0.2124,
65
  "step": 4000
66
  },
67
  {
68
+ "epoch": 0.575963138359145,
69
+ "grad_norm": 1.7122745513916016,
70
+ "learning_rate": 4.040061436068092e-05,
71
+ "loss": 0.209,
72
  "step": 4500
73
  },
74
  {
75
+ "epoch": 0.6399590426212722,
76
+ "grad_norm": 2.49656081199646,
77
+ "learning_rate": 3.933401595631213e-05,
78
+ "loss": 0.205,
79
  "step": 5000
80
  },
81
  {
82
+ "epoch": 0.7039549468833994,
83
+ "grad_norm": 3.310446262359619,
84
+ "learning_rate": 3.8267417551943344e-05,
85
+ "loss": 0.203,
86
  "step": 5500
87
  },
88
  {
89
+ "epoch": 0.7679508511455266,
90
+ "grad_norm": 2.4122066497802734,
91
+ "learning_rate": 3.7200819147574556e-05,
92
+ "loss": 0.1964,
93
  "step": 6000
94
  },
95
  {
96
+ "epoch": 0.831946755407654,
97
+ "grad_norm": 1.8602417707443237,
98
+ "learning_rate": 3.613422074320577e-05,
99
+ "loss": 0.1982,
100
  "step": 6500
101
  },
102
  {
103
+ "epoch": 0.8959426596697811,
104
+ "grad_norm": 3.1396756172180176,
105
+ "learning_rate": 3.506762233883698e-05,
106
+ "loss": 0.1935,
107
  "step": 7000
108
  },
109
  {
110
+ "epoch": 0.9599385639319084,
111
+ "grad_norm": 4.411809921264648,
112
+ "learning_rate": 3.400102393446819e-05,
113
+ "loss": 0.1946,
114
  "step": 7500
115
  },
116
  {
117
+ "epoch": 1.0,
118
+ "eval_loss": 0.16471454501152039,
119
+ "eval_mse": 0.16471455099075522,
120
+ "eval_runtime": 97.1359,
121
+ "eval_samples_per_second": 1874.807,
122
+ "eval_steps_per_second": 234.352,
123
+ "step": 7813
124
+ },
125
+ {
126
+ "epoch": 1.0239344681940357,
127
+ "grad_norm": 2.101229429244995,
128
+ "learning_rate": 3.293442553009941e-05,
129
+ "loss": 0.1776,
130
  "step": 8000
131
  },
132
  {
133
+ "epoch": 1.0879303724561629,
134
+ "grad_norm": 1.918332576751709,
135
+ "learning_rate": 3.1867827125730624e-05,
136
+ "loss": 0.1548,
137
  "step": 8500
138
  },
139
  {
140
+ "epoch": 1.15192627671829,
141
+ "grad_norm": 2.1414854526519775,
142
+ "learning_rate": 3.0801228721361836e-05,
143
+ "loss": 0.1489,
144
  "step": 9000
145
  },
146
  {
147
+ "epoch": 1.2159221809804173,
148
+ "grad_norm": 1.7239971160888672,
149
+ "learning_rate": 2.9734630316993045e-05,
150
+ "loss": 0.1472,
151
  "step": 9500
152
  },
153
  {
154
+ "epoch": 1.2799180852425445,
155
+ "grad_norm": 1.5635288953781128,
156
+ "learning_rate": 2.866803191262426e-05,
157
+ "loss": 0.1491,
158
  "step": 10000
159
  },
160
  {
161
+ "epoch": 1.3439139895046717,
162
+ "grad_norm": 2.0498759746551514,
163
+ "learning_rate": 2.7601433508255476e-05,
164
+ "loss": 0.1493,
165
  "step": 10500
166
  },
167
  {
168
+ "epoch": 1.4079098937667989,
169
+ "grad_norm": 1.2982215881347656,
170
+ "learning_rate": 2.6534835103886685e-05,
171
+ "loss": 0.1458,
172
  "step": 11000
173
  },
174
  {
175
+ "epoch": 1.471905798028926,
176
+ "grad_norm": 1.9183140993118286,
177
+ "learning_rate": 2.54682366995179e-05,
178
+ "loss": 0.149,
179
  "step": 11500
180
  },
181
  {
182
+ "epoch": 1.5359017022910533,
183
+ "grad_norm": 1.4604493379592896,
184
+ "learning_rate": 2.4401638295149112e-05,
185
+ "loss": 0.1441,
186
  "step": 12000
187
  },
188
  {
189
+ "epoch": 1.5998976065531805,
190
+ "grad_norm": 1.4501017332077026,
191
+ "learning_rate": 2.3335039890780325e-05,
192
+ "loss": 0.1483,
193
  "step": 12500
194
  },
195
  {
196
+ "epoch": 1.6638935108153077,
197
+ "grad_norm": 1.7177495956420898,
198
+ "learning_rate": 2.2268441486411537e-05,
199
+ "loss": 0.1443,
200
  "step": 13000
201
  },
202
  {
203
+ "epoch": 1.727889415077435,
204
+ "grad_norm": 2.2127881050109863,
205
+ "learning_rate": 2.120184308204275e-05,
206
+ "loss": 0.1458,
207
  "step": 13500
208
  },
209
  {
210
+ "epoch": 1.7918853193395623,
211
+ "grad_norm": 1.5237836837768555,
212
+ "learning_rate": 2.0135244677673965e-05,
213
+ "loss": 0.1435,
214
  "step": 14000
215
  },
216
  {
217
+ "epoch": 1.8558812236016895,
218
+ "grad_norm": 2.217888355255127,
219
+ "learning_rate": 1.9068646273305177e-05,
220
+ "loss": 0.141,
221
  "step": 14500
222
  },
223
  {
224
+ "epoch": 1.9198771278638167,
225
+ "grad_norm": 3.9023308753967285,
226
+ "learning_rate": 1.800204786893639e-05,
227
+ "loss": 0.139,
228
  "step": 15000
229
  },
230
  {
231
+ "epoch": 1.983873032125944,
232
+ "grad_norm": 1.6923686265945435,
233
+ "learning_rate": 1.69354494645676e-05,
234
+ "loss": 0.1385,
235
  "step": 15500
236
  },
237
  {
238
+ "epoch": 2.0,
239
+ "eval_loss": 0.15275108814239502,
240
+ "eval_mse": 0.15275108203598506,
241
+ "eval_runtime": 101.0827,
242
+ "eval_samples_per_second": 1801.605,
243
+ "eval_steps_per_second": 225.202,
244
+ "step": 15626
245
+ },
246
+ {
247
+ "epoch": 2.0478689363880713,
248
+ "grad_norm": 3.1119792461395264,
249
+ "learning_rate": 1.5868851060198814e-05,
250
+ "loss": 0.1234,
251
  "step": 16000
252
  },
253
  {
254
+ "epoch": 2.1118648406501985,
255
+ "grad_norm": 1.3244566917419434,
256
+ "learning_rate": 1.480225265583003e-05,
257
+ "loss": 0.1168,
258
  "step": 16500
259
  },
260
  {
261
+ "epoch": 2.1758607449123257,
262
+ "grad_norm": 1.8541600704193115,
263
+ "learning_rate": 1.3735654251461241e-05,
264
+ "loss": 0.1129,
265
  "step": 17000
266
  },
267
  {
268
+ "epoch": 2.239856649174453,
269
+ "grad_norm": 1.291128396987915,
270
+ "learning_rate": 1.2669055847092454e-05,
271
+ "loss": 0.1161,
272
  "step": 17500
273
  },
274
  {
275
+ "epoch": 2.30385255343658,
276
+ "grad_norm": 1.4121185541152954,
277
+ "learning_rate": 1.1602457442723666e-05,
278
+ "loss": 0.1137,
279
  "step": 18000
280
  },
281
  {
282
+ "epoch": 2.3678484576987073,
283
+ "grad_norm": 1.1205600500106812,
284
+ "learning_rate": 1.053585903835488e-05,
285
+ "loss": 0.1151,
286
  "step": 18500
287
  },
288
  {
289
+ "epoch": 2.4318443619608345,
290
+ "grad_norm": 2.112236976623535,
291
+ "learning_rate": 9.469260633986092e-06,
292
+ "loss": 0.114,
293
  "step": 19000
294
  },
295
  {
296
+ "epoch": 2.4958402662229617,
297
+ "grad_norm": 1.419636845588684,
298
+ "learning_rate": 8.402662229617304e-06,
299
+ "loss": 0.1152,
300
  "step": 19500
301
  },
302
  {
303
+ "epoch": 2.559836170485089,
304
+ "grad_norm": 1.89504873752594,
305
+ "learning_rate": 7.336063825248518e-06,
306
+ "loss": 0.1153,
307
  "step": 20000
308
  },
309
  {
310
+ "epoch": 2.623832074747216,
311
+ "grad_norm": 1.8703386783599854,
312
+ "learning_rate": 6.26946542087973e-06,
313
+ "loss": 0.1131,
314
  "step": 20500
315
  },
316
  {
317
+ "epoch": 2.6878279790093433,
318
+ "grad_norm": 1.3758282661437988,
319
+ "learning_rate": 5.202867016510943e-06,
320
+ "loss": 0.1134,
321
  "step": 21000
322
  },
323
  {
324
+ "epoch": 2.7518238832714705,
325
+ "grad_norm": 0.9953869581222534,
326
+ "learning_rate": 4.1362686121421564e-06,
327
+ "loss": 0.1122,
328
  "step": 21500
329
  },
330
  {
331
+ "epoch": 2.8158197875335977,
332
+ "grad_norm": 1.4371693134307861,
333
+ "learning_rate": 3.069670207773369e-06,
334
+ "loss": 0.1117,
335
  "step": 22000
336
  },
337
  {
338
+ "epoch": 2.879815691795725,
339
+ "grad_norm": 1.3081939220428467,
340
+ "learning_rate": 2.003071803404582e-06,
341
+ "loss": 0.1121,
342
  "step": 22500
343
  },
344
  {
345
+ "epoch": 2.943811596057852,
346
+ "grad_norm": 1.5292563438415527,
347
+ "learning_rate": 9.364733990357951e-07,
348
+ "loss": 0.1121,
349
  "step": 23000
350
  },
351
  {
352
  "epoch": 3.0,
353
+ "eval_loss": 0.14551572501659393,
354
+ "eval_mse": 0.14551572992011078,
355
+ "eval_runtime": 108.2898,
356
+ "eval_samples_per_second": 1681.7,
357
+ "eval_steps_per_second": 210.214,
358
+ "step": 23439
359
  },
360
  {
361
  "epoch": 3.0,
362
+ "step": 23439,
363
+ "total_flos": 4.9403660544e+16,
364
+ "train_loss": 0.16052449254575832,
365
+ "train_runtime": 3301.1135,
366
+ "train_samples_per_second": 908.784,
367
+ "train_steps_per_second": 7.1
368
  }
369
  ],
370
  "logging_steps": 500,
371
+ "max_steps": 23439,
372
  "num_input_tokens_seen": 0,
373
  "num_train_epochs": 3,
374
  "save_steps": 500,
 
384
  "attributes": {}
385
  }
386
  },
387
+ "total_flos": 4.9403660544e+16,
388
+ "train_batch_size": 128,
389
  "trial_name": null,
390
  "trial_params": null
391
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ab0472f329dda31a741344576f9001dc9064737abfb75a6baa4c9a1bdeb39ed
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81cc339dcecd204a318a224316ad713a90babbf60a7aeb149bc6f55c8815224b
3
  size 5368