edlee123 commited on
Commit
3a0cc09
1 Parent(s): c53fd5d

End of training

Browse files
README.md CHANGED
@@ -4,7 +4,7 @@ base_model: BridgeTower/bridgetower-large-itm-mlm-itc
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
- - newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
@@ -15,7 +15,12 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # bridgetower
17
 
18
- This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the newyorker_caption_contest dataset.
 
 
 
 
 
19
 
20
  ## Model description
21
 
 
4
  tags:
5
  - generated_from_trainer
6
  datasets:
7
+ - jmhessel/newyorker_caption_contest
8
  model-index:
9
  - name: bridgetower
10
  results: []
 
15
 
16
  # bridgetower
17
 
18
+ This model is a fine-tuned version of [BridgeTower/bridgetower-large-itm-mlm-itc](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-itc) on the jmhessel/newyorker_caption_contest matching dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 0.1284
21
+ - Memory Allocated (gb): 51.27
22
+ - Max Memory Allocated (gb): 57.18
23
+ - Total Memory Available (gb): 94.62
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.13596950471401215,
4
- "eval_runtime": 5.0456,
5
- "eval_samples_per_second": 129.562,
6
- "eval_steps_per_second": 8.098,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
- "train_loss": 0.06072675045655698,
12
- "train_runtime": 1077.821,
13
- "train_samples_per_second": 52.682,
14
- "train_steps_per_second": 1.318
15
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12840959429740906,
4
+ "eval_runtime": 7.7898,
5
+ "eval_samples_per_second": 66.741,
6
+ "eval_steps_per_second": 4.171,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_flos": 3.0598946525952e+16,
10
  "total_memory_available (GB)": 94.62,
11
+ "train_loss": 0.06098026679486644,
12
+ "train_runtime": 1192.2443,
13
+ "train_samples_per_second": 46.607,
14
+ "train_steps_per_second": 1.166
15
  }
runs/Oct16_16-24-55_workload-ai-workshop/events.out.tfevents.1729097262.workload-ai-workshop.7719.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab62df61deef9141d1d2c7c640cc2d48e090a71d901494bc75f8f67a54ea32ec
3
+ size 998
test_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.13596950471401215,
4
- "eval_runtime": 5.0456,
5
- "eval_samples_per_second": 129.562,
6
- "eval_steps_per_second": 8.098,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12840959429740906,
4
+ "eval_runtime": 7.7898,
5
+ "eval_samples_per_second": 66.741,
6
+ "eval_steps_per_second": 4.171,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
train_results.json CHANGED
@@ -4,8 +4,8 @@
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
- "train_loss": 0.06072675045655698,
8
- "train_runtime": 1077.821,
9
- "train_samples_per_second": 52.682,
10
- "train_steps_per_second": 1.318
11
  }
 
4
  "memory_allocated (GB)": 50.57,
5
  "total_flos": 3.0598946525952e+16,
6
  "total_memory_available (GB)": 94.62,
7
+ "train_loss": 0.06098026679486644,
8
+ "train_runtime": 1192.2443,
9
+ "train_samples_per_second": 46.607,
10
+ "train_steps_per_second": 1.166
11
  }
trainer_state.json CHANGED
@@ -20,9 +20,9 @@
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
- "grad_norm": 18.564252853393555,
24
  "learning_rate": 9.836734693877552e-06,
25
- "loss": 0.1554,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
@@ -30,9 +30,9 @@
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
- "grad_norm": 7.802953720092773,
34
  "learning_rate": 9.755102040816327e-06,
35
- "loss": 0.126,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
@@ -40,9 +40,9 @@
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
- "grad_norm": 4.599184036254883,
44
  "learning_rate": 9.673469387755103e-06,
45
- "loss": 0.1103,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
@@ -50,9 +50,9 @@
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
- "grad_norm": 6.734092712402344,
54
  "learning_rate": 9.591836734693878e-06,
55
- "loss": 0.1217,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
@@ -60,9 +60,9 @@
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
- "grad_norm": 1.7976917028427124,
64
  "learning_rate": 9.510204081632653e-06,
65
- "loss": 0.08,
66
  "max_memory_allocated (GB)": 57.18,
67
  "memory_allocated (GB)": 50.57,
68
  "step": 60,
@@ -70,9 +70,9 @@
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
- "grad_norm": 4.0598835945129395,
74
  "learning_rate": 9.42857142857143e-06,
75
- "loss": 0.1023,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
@@ -80,9 +80,9 @@
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
- "grad_norm": 2.0350396633148193,
84
  "learning_rate": 9.346938775510204e-06,
85
- "loss": 0.1128,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
@@ -90,9 +90,9 @@
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
- "grad_norm": 3.785065174102783,
94
  "learning_rate": 9.26530612244898e-06,
95
- "loss": 0.0842,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
@@ -100,9 +100,9 @@
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
- "grad_norm": 1.9926950931549072,
104
  "learning_rate": 9.183673469387756e-06,
105
- "loss": 0.0728,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
@@ -110,9 +110,9 @@
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
- "grad_norm": 4.878537178039551,
114
  "learning_rate": 9.102040816326532e-06,
115
- "loss": 0.0692,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
@@ -120,9 +120,9 @@
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
- "grad_norm": 2.5495357513427734,
124
  "learning_rate": 9.020408163265307e-06,
125
- "loss": 0.0691,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
@@ -130,9 +130,9 @@
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
- "grad_norm": 3.0813372135162354,
134
  "learning_rate": 8.938775510204082e-06,
135
- "loss": 0.0597,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
@@ -140,9 +140,9 @@
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
- "grad_norm": 1.2760021686553955,
144
  "learning_rate": 8.857142857142858e-06,
145
- "loss": 0.0507,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
@@ -150,9 +150,9 @@
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
- "grad_norm": 2.0892932415008545,
154
  "learning_rate": 8.775510204081633e-06,
155
- "loss": 0.0634,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
@@ -160,9 +160,9 @@
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
- "grad_norm": 2.0191688537597656,
164
  "learning_rate": 8.69387755102041e-06,
165
- "loss": 0.0718,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
@@ -170,9 +170,9 @@
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
- "grad_norm": 2.4970450401306152,
174
  "learning_rate": 8.612244897959184e-06,
175
- "loss": 0.0753,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
@@ -180,9 +180,9 @@
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
- "grad_norm": 1.5428143739700317,
184
  "learning_rate": 8.530612244897961e-06,
185
- "loss": 0.0612,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
@@ -190,9 +190,9 @@
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
- "grad_norm": 3.6041452884674072,
194
  "learning_rate": 8.448979591836736e-06,
195
- "loss": 0.0573,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
@@ -200,9 +200,9 @@
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
- "grad_norm": 1.0115529298782349,
204
  "learning_rate": 8.36734693877551e-06,
205
- "loss": 0.0631,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
@@ -210,9 +210,9 @@
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
- "grad_norm": 0.8029147982597351,
214
  "learning_rate": 8.285714285714287e-06,
215
- "loss": 0.0643,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
@@ -220,9 +220,9 @@
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
- "grad_norm": 1.130996584892273,
224
  "learning_rate": 8.204081632653062e-06,
225
- "loss": 0.0608,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
@@ -230,9 +230,9 @@
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
- "grad_norm": 0.7962849140167236,
234
  "learning_rate": 8.122448979591837e-06,
235
- "loss": 0.0535,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
@@ -240,9 +240,9 @@
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
- "grad_norm": 2.3987386226654053,
244
  "learning_rate": 8.040816326530613e-06,
245
- "loss": 0.0499,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
@@ -250,9 +250,9 @@
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
- "grad_norm": 1.084067463874817,
254
  "learning_rate": 7.959183673469388e-06,
255
- "loss": 0.0612,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
@@ -260,9 +260,9 @@
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
- "grad_norm": 3.247530460357666,
264
  "learning_rate": 7.877551020408164e-06,
265
- "loss": 0.055,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
@@ -270,9 +270,9 @@
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
- "grad_norm": 1.5961194038391113,
274
  "learning_rate": 7.79591836734694e-06,
275
- "loss": 0.0632,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
@@ -280,9 +280,9 @@
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
- "grad_norm": 0.32916340231895447,
284
  "learning_rate": 7.714285714285716e-06,
285
- "loss": 0.0565,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
@@ -290,9 +290,9 @@
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
- "grad_norm": 0.6009345054626465,
294
  "learning_rate": 7.63265306122449e-06,
295
- "loss": 0.0503,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
@@ -300,9 +300,9 @@
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
- "grad_norm": 9.806236267089844,
304
  "learning_rate": 7.551020408163265e-06,
305
- "loss": 0.0635,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
@@ -310,9 +310,9 @@
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
- "grad_norm": 1.231447696685791,
314
  "learning_rate": 7.469387755102041e-06,
315
- "loss": 0.0664,
316
  "max_memory_allocated (GB)": 57.18,
317
  "memory_allocated (GB)": 50.57,
318
  "step": 310,
@@ -320,9 +320,9 @@
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
- "grad_norm": 1.016727328300476,
324
  "learning_rate": 7.387755102040817e-06,
325
- "loss": 0.0502,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
@@ -330,9 +330,9 @@
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
- "grad_norm": 0.7703081965446472,
334
  "learning_rate": 7.306122448979592e-06,
335
- "loss": 0.0599,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
@@ -340,9 +340,9 @@
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
- "grad_norm": 5.310096740722656,
344
  "learning_rate": 7.224489795918368e-06,
345
- "loss": 0.0541,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
@@ -350,9 +350,9 @@
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
- "grad_norm": 1.1444560289382935,
354
  "learning_rate": 7.1428571428571436e-06,
355
- "loss": 0.0493,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
@@ -360,9 +360,9 @@
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
- "grad_norm": 3.1823084354400635,
364
  "learning_rate": 7.061224489795919e-06,
365
- "loss": 0.0459,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
@@ -370,9 +370,9 @@
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
- "grad_norm": 1.1924108266830444,
374
  "learning_rate": 6.979591836734695e-06,
375
- "loss": 0.058,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
@@ -380,9 +380,9 @@
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
- "grad_norm": 3.677968740463257,
384
  "learning_rate": 6.8979591836734705e-06,
385
- "loss": 0.0468,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
@@ -390,9 +390,9 @@
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
- "grad_norm": 1.0082957744598389,
394
  "learning_rate": 6.816326530612245e-06,
395
- "loss": 0.0652,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
@@ -400,9 +400,9 @@
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
- "grad_norm": 0.6584922075271606,
404
  "learning_rate": 6.734693877551021e-06,
405
- "loss": 0.0605,
406
  "max_memory_allocated (GB)": 57.18,
407
  "memory_allocated (GB)": 50.57,
408
  "step": 400,
@@ -410,9 +410,9 @@
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
- "grad_norm": 1.2264763116836548,
414
  "learning_rate": 6.653061224489797e-06,
415
- "loss": 0.0599,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
@@ -420,9 +420,9 @@
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
- "grad_norm": 1.824320912361145,
424
  "learning_rate": 6.571428571428572e-06,
425
- "loss": 0.053,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
@@ -430,9 +430,9 @@
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
- "grad_norm": 0.24405838549137115,
434
  "learning_rate": 6.489795918367348e-06,
435
- "loss": 0.067,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
@@ -440,9 +440,9 @@
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
- "grad_norm": 1.0410544872283936,
444
  "learning_rate": 6.408163265306124e-06,
445
- "loss": 0.06,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
@@ -450,9 +450,9 @@
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
- "grad_norm": 0.39765527844429016,
454
  "learning_rate": 6.326530612244899e-06,
455
- "loss": 0.043,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
@@ -460,9 +460,9 @@
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
- "grad_norm": 0.29981690645217896,
464
  "learning_rate": 6.244897959183675e-06,
465
- "loss": 0.0533,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
@@ -470,9 +470,9 @@
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
- "grad_norm": 0.6634105443954468,
474
  "learning_rate": 6.163265306122449e-06,
475
- "loss": 0.0428,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
@@ -480,9 +480,9 @@
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
- "grad_norm": 0.2060549110174179,
484
  "learning_rate": 6.0816326530612245e-06,
485
- "loss": 0.0505,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
@@ -490,9 +490,9 @@
490
  },
491
  {
492
  "epoch": 2.0,
493
- "grad_norm": 1.0800402164459229,
494
  "learning_rate": 6e-06,
495
- "loss": 0.0647,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
@@ -500,7 +500,7 @@
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
- "grad_norm": 0.6078592538833618,
504
  "learning_rate": 5.918367346938776e-06,
505
  "loss": 0.0596,
506
  "max_memory_allocated (GB)": 57.18,
@@ -510,9 +510,9 @@
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
- "grad_norm": 0.6262472867965698,
514
  "learning_rate": 5.8367346938775515e-06,
515
- "loss": 0.0495,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
@@ -520,7 +520,7 @@
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
- "grad_norm": 1.4308090209960938,
524
  "learning_rate": 5.755102040816327e-06,
525
  "loss": 0.0461,
526
  "max_memory_allocated (GB)": 57.18,
@@ -530,9 +530,9 @@
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
- "grad_norm": 0.8761769533157349,
534
  "learning_rate": 5.673469387755103e-06,
535
- "loss": 0.0626,
536
  "max_memory_allocated (GB)": 57.18,
537
  "memory_allocated (GB)": 50.57,
538
  "step": 530,
@@ -540,9 +540,9 @@
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
- "grad_norm": 0.6150110363960266,
544
  "learning_rate": 5.591836734693878e-06,
545
- "loss": 0.0576,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
@@ -550,9 +550,9 @@
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
- "grad_norm": 9.280069351196289,
554
  "learning_rate": 5.510204081632653e-06,
555
- "loss": 0.0634,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
@@ -560,9 +560,9 @@
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
- "grad_norm": 2.136209726333618,
564
  "learning_rate": 5.428571428571429e-06,
565
- "loss": 0.0579,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
@@ -570,9 +570,9 @@
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
- "grad_norm": 0.9547491073608398,
574
  "learning_rate": 5.3469387755102045e-06,
575
- "loss": 0.0589,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
@@ -580,9 +580,9 @@
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
- "grad_norm": 2.1748363971710205,
584
  "learning_rate": 5.26530612244898e-06,
585
- "loss": 0.0672,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
@@ -590,9 +590,9 @@
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
- "grad_norm": 0.9653811454772949,
594
  "learning_rate": 5.183673469387756e-06,
595
- "loss": 0.0536,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
@@ -600,9 +600,9 @@
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
- "grad_norm": 0.7852123379707336,
604
  "learning_rate": 5.1020408163265315e-06,
605
- "loss": 0.0443,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
@@ -610,7 +610,7 @@
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
- "grad_norm": 0.7405697107315063,
614
  "learning_rate": 5.020408163265307e-06,
615
  "loss": 0.041,
616
  "max_memory_allocated (GB)": 57.18,
@@ -620,9 +620,9 @@
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
- "grad_norm": 0.7166327834129333,
624
  "learning_rate": 4.938775510204082e-06,
625
- "loss": 0.0431,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
@@ -630,9 +630,9 @@
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
- "grad_norm": 0.5675875544548035,
634
  "learning_rate": 4.857142857142858e-06,
635
- "loss": 0.0807,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
@@ -640,7 +640,7 @@
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
- "grad_norm": 0.954290509223938,
644
  "learning_rate": 4.775510204081633e-06,
645
  "loss": 0.0492,
646
  "max_memory_allocated (GB)": 57.18,
@@ -650,9 +650,9 @@
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
- "grad_norm": 0.9061315655708313,
654
  "learning_rate": 4.693877551020409e-06,
655
- "loss": 0.0472,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
@@ -660,9 +660,9 @@
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
- "grad_norm": 0.7333698868751526,
664
  "learning_rate": 4.612244897959184e-06,
665
- "loss": 0.0474,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
@@ -670,9 +670,9 @@
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
- "grad_norm": 1.7820810079574585,
674
  "learning_rate": 4.530612244897959e-06,
675
- "loss": 0.0869,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
@@ -680,9 +680,9 @@
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
- "grad_norm": 1.3184058666229248,
684
  "learning_rate": 4.448979591836735e-06,
685
- "loss": 0.0494,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
@@ -690,9 +690,9 @@
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
- "grad_norm": 0.4212433397769928,
694
  "learning_rate": 4.367346938775511e-06,
695
- "loss": 0.0445,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
@@ -700,9 +700,9 @@
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
- "grad_norm": 0.39547938108444214,
704
  "learning_rate": 4.2857142857142855e-06,
705
- "loss": 0.0429,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
@@ -710,9 +710,9 @@
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
- "grad_norm": 1.0822113752365112,
714
  "learning_rate": 4.204081632653061e-06,
715
- "loss": 0.0549,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
@@ -720,9 +720,9 @@
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
- "grad_norm": 1.0838605165481567,
724
  "learning_rate": 4.122448979591837e-06,
725
- "loss": 0.0582,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
@@ -730,9 +730,9 @@
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
- "grad_norm": 0.2726249098777771,
734
  "learning_rate": 4.040816326530612e-06,
735
- "loss": 0.0341,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
@@ -740,9 +740,9 @@
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
- "grad_norm": 0.836703360080719,
744
  "learning_rate": 3.959183673469388e-06,
745
- "loss": 0.053,
746
  "max_memory_allocated (GB)": 57.18,
747
  "memory_allocated (GB)": 50.57,
748
  "step": 740,
@@ -750,9 +750,9 @@
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
- "grad_norm": 0.6878814697265625,
754
  "learning_rate": 3.877551020408164e-06,
755
- "loss": 0.0618,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
@@ -760,7 +760,7 @@
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
- "grad_norm": 1.3712306022644043,
764
  "learning_rate": 3.795918367346939e-06,
765
  "loss": 0.0527,
766
  "max_memory_allocated (GB)": 57.18,
@@ -770,9 +770,9 @@
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
- "grad_norm": 2.1615536212921143,
774
  "learning_rate": 3.7142857142857146e-06,
775
- "loss": 0.069,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
@@ -780,9 +780,9 @@
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
- "grad_norm": 0.45463302731513977,
784
  "learning_rate": 3.6326530612244903e-06,
785
- "loss": 0.064,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
@@ -790,9 +790,9 @@
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
- "grad_norm": 0.40240758657455444,
794
  "learning_rate": 3.5510204081632655e-06,
795
- "loss": 0.0528,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
@@ -800,9 +800,9 @@
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
- "grad_norm": 0.5408643484115601,
804
  "learning_rate": 3.469387755102041e-06,
805
- "loss": 0.0433,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
@@ -810,9 +810,9 @@
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
- "grad_norm": 0.6606118679046631,
814
  "learning_rate": 3.3877551020408168e-06,
815
- "loss": 0.0548,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
@@ -820,9 +820,9 @@
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
- "grad_norm": 0.89394211769104,
824
  "learning_rate": 3.3061224489795924e-06,
825
- "loss": 0.0448,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
@@ -830,9 +830,9 @@
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
- "grad_norm": 0.3227229416370392,
834
  "learning_rate": 3.2244897959183672e-06,
835
- "loss": 0.0491,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
@@ -840,9 +840,9 @@
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
- "grad_norm": 0.928611159324646,
844
  "learning_rate": 3.142857142857143e-06,
845
- "loss": 0.0523,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
@@ -850,7 +850,7 @@
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
- "grad_norm": 0.6023752093315125,
854
  "learning_rate": 3.0612244897959185e-06,
855
  "loss": 0.0813,
856
  "max_memory_allocated (GB)": 57.18,
@@ -860,9 +860,9 @@
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
- "grad_norm": 1.1742885112762451,
864
  "learning_rate": 2.979591836734694e-06,
865
- "loss": 0.0451,
866
  "max_memory_allocated (GB)": 57.18,
867
  "memory_allocated (GB)": 50.57,
868
  "step": 860,
@@ -870,9 +870,9 @@
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
- "grad_norm": 0.21970601379871368,
874
  "learning_rate": 2.8979591836734694e-06,
875
- "loss": 0.0644,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
@@ -880,9 +880,9 @@
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
- "grad_norm": 1.2270339727401733,
884
  "learning_rate": 2.816326530612245e-06,
885
- "loss": 0.044,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
@@ -890,9 +890,9 @@
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
- "grad_norm": 0.15171077847480774,
894
  "learning_rate": 2.7346938775510207e-06,
895
- "loss": 0.0291,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
@@ -900,9 +900,9 @@
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
- "grad_norm": 2.9102306365966797,
904
  "learning_rate": 2.6530612244897964e-06,
905
- "loss": 0.0516,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
@@ -910,7 +910,7 @@
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
- "grad_norm": 0.6113564968109131,
914
  "learning_rate": 2.571428571428571e-06,
915
  "loss": 0.061,
916
  "max_memory_allocated (GB)": 57.18,
@@ -920,9 +920,9 @@
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
- "grad_norm": 0.19307516515254974,
924
  "learning_rate": 2.489795918367347e-06,
925
- "loss": 0.0539,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
@@ -930,9 +930,9 @@
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
- "grad_norm": 0.38657116889953613,
934
  "learning_rate": 2.4081632653061225e-06,
935
- "loss": 0.05,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
@@ -940,9 +940,9 @@
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
- "grad_norm": 0.9545940160751343,
944
  "learning_rate": 2.326530612244898e-06,
945
- "loss": 0.0481,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
@@ -950,9 +950,9 @@
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
- "grad_norm": 1.3555389642715454,
954
  "learning_rate": 2.244897959183674e-06,
955
- "loss": 0.0596,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
@@ -960,9 +960,9 @@
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
- "grad_norm": 0.3931931257247925,
964
  "learning_rate": 2.1632653061224495e-06,
965
- "loss": 0.0498,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
@@ -970,9 +970,9 @@
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
- "grad_norm": 0.6945517063140869,
974
  "learning_rate": 2.0816326530612247e-06,
975
- "loss": 0.0734,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
@@ -980,9 +980,9 @@
980
  },
981
  {
982
  "epoch": 4.0,
983
- "grad_norm": 1.0659183263778687,
984
  "learning_rate": 2.0000000000000003e-06,
985
- "loss": 0.0707,
986
  "max_memory_allocated (GB)": 57.18,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
@@ -990,9 +990,9 @@
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
- "grad_norm": 0.8017714023590088,
994
  "learning_rate": 1.9183673469387756e-06,
995
- "loss": 0.0476,
996
  "max_memory_allocated (GB)": 57.18,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
@@ -1000,9 +1000,9 @@
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
- "grad_norm": 1.1733373403549194,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
- "loss": 0.0617,
1006
  "max_memory_allocated (GB)": 57.18,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
@@ -1010,9 +1010,9 @@
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
- "grad_norm": 0.6048600077629089,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
- "loss": 0.0625,
1016
  "max_memory_allocated (GB)": 57.18,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
@@ -1020,7 +1020,7 @@
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
- "grad_norm": 1.096007227897644,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
  "loss": 0.0407,
1026
  "max_memory_allocated (GB)": 57.18,
@@ -1030,9 +1030,9 @@
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
- "grad_norm": 4.377923965454102,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
- "loss": 0.0528,
1036
  "max_memory_allocated (GB)": 57.18,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
@@ -1040,9 +1040,9 @@
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
- "grad_norm": 0.7295175194740295,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
- "loss": 0.04,
1046
  "max_memory_allocated (GB)": 57.18,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
@@ -1050,9 +1050,9 @@
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
- "grad_norm": 0.5129045248031616,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
- "loss": 0.0493,
1056
  "max_memory_allocated (GB)": 57.18,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
@@ -1060,7 +1060,7 @@
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
- "grad_norm": 0.505799412727356,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
  "loss": 0.046,
1066
  "max_memory_allocated (GB)": 57.18,
@@ -1070,9 +1070,9 @@
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
- "grad_norm": 0.8460046052932739,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
- "loss": 0.0483,
1076
  "max_memory_allocated (GB)": 57.18,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
@@ -1080,9 +1080,9 @@
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
- "grad_norm": 0.5636487603187561,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
- "loss": 0.0523,
1086
  "max_memory_allocated (GB)": 57.18,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
@@ -1090,9 +1090,9 @@
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
- "grad_norm": 0.6893309354782104,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
- "loss": 0.0505,
1096
  "max_memory_allocated (GB)": 57.18,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
@@ -1100,9 +1100,9 @@
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
- "grad_norm": 0.6645925045013428,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
- "loss": 0.0361,
1106
  "max_memory_allocated (GB)": 57.18,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
@@ -1110,9 +1110,9 @@
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
- "grad_norm": 0.14709816873073578,
1114
  "learning_rate": 9.387755102040817e-07,
1115
- "loss": 0.0458,
1116
  "max_memory_allocated (GB)": 57.18,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
@@ -1120,9 +1120,9 @@
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
- "grad_norm": 0.8794461488723755,
1124
  "learning_rate": 8.571428571428572e-07,
1125
- "loss": 0.0414,
1126
  "max_memory_allocated (GB)": 57.18,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
@@ -1130,9 +1130,9 @@
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
- "grad_norm": 1.345910906791687,
1134
  "learning_rate": 7.755102040816327e-07,
1135
- "loss": 0.0526,
1136
  "max_memory_allocated (GB)": 57.18,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
@@ -1140,9 +1140,9 @@
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
- "grad_norm": 0.5657418370246887,
1144
  "learning_rate": 6.938775510204082e-07,
1145
- "loss": 0.0721,
1146
  "max_memory_allocated (GB)": 57.18,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
@@ -1150,9 +1150,9 @@
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
- "grad_norm": 2.0943238735198975,
1154
  "learning_rate": 6.122448979591837e-07,
1155
- "loss": 0.0515,
1156
  "max_memory_allocated (GB)": 57.18,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
@@ -1160,9 +1160,9 @@
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
- "grad_norm": 3.1750009059906006,
1164
  "learning_rate": 5.306122448979592e-07,
1165
- "loss": 0.0607,
1166
  "max_memory_allocated (GB)": 57.18,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
@@ -1170,9 +1170,9 @@
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
- "grad_norm": 0.8910292387008667,
1174
  "learning_rate": 4.489795918367347e-07,
1175
- "loss": 0.054,
1176
  "max_memory_allocated (GB)": 57.18,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
@@ -1180,9 +1180,9 @@
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
- "grad_norm": 0.9796111583709717,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
- "loss": 0.0432,
1186
  "max_memory_allocated (GB)": 57.18,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
@@ -1190,9 +1190,9 @@
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
- "grad_norm": 1.7865172624588013,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
- "loss": 0.0549,
1196
  "max_memory_allocated (GB)": 57.18,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
@@ -1200,9 +1200,9 @@
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
- "grad_norm": 0.7347144484519958,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
- "loss": 0.0711,
1206
  "max_memory_allocated (GB)": 57.18,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
@@ -1210,9 +1210,9 @@
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
- "grad_norm": 1.210486888885498,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
- "loss": 0.0652,
1216
  "max_memory_allocated (GB)": 57.18,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
@@ -1220,9 +1220,9 @@
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
- "grad_norm": 1.2078640460968018,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
- "loss": 0.0414,
1226
  "max_memory_allocated (GB)": 57.18,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
@@ -1235,10 +1235,10 @@
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
- "train_loss": 0.06072675045655698,
1239
- "train_runtime": 1077.821,
1240
- "train_samples_per_second": 52.682,
1241
- "train_steps_per_second": 1.318
1242
  }
1243
  ],
1244
  "logging_steps": 10,
 
20
  },
21
  {
22
  "epoch": 0.08163265306122448,
23
+ "grad_norm": 7.052234649658203,
24
  "learning_rate": 9.836734693877552e-06,
25
+ "loss": 0.1555,
26
  "max_memory_allocated (GB)": 57.18,
27
  "memory_allocated (GB)": 50.57,
28
  "step": 20,
 
30
  },
31
  {
32
  "epoch": 0.12244897959183673,
33
+ "grad_norm": 6.5298075675964355,
34
  "learning_rate": 9.755102040816327e-06,
35
+ "loss": 0.1251,
36
  "max_memory_allocated (GB)": 57.18,
37
  "memory_allocated (GB)": 50.57,
38
  "step": 30,
 
40
  },
41
  {
42
  "epoch": 0.16326530612244897,
43
+ "grad_norm": 4.405805587768555,
44
  "learning_rate": 9.673469387755103e-06,
45
+ "loss": 0.1102,
46
  "max_memory_allocated (GB)": 57.18,
47
  "memory_allocated (GB)": 50.57,
48
  "step": 40,
 
50
  },
51
  {
52
  "epoch": 0.20408163265306123,
53
+ "grad_norm": 4.870044708251953,
54
  "learning_rate": 9.591836734693878e-06,
55
+ "loss": 0.1232,
56
  "max_memory_allocated (GB)": 57.18,
57
  "memory_allocated (GB)": 50.57,
58
  "step": 50,
 
60
  },
61
  {
62
  "epoch": 0.24489795918367346,
63
+ "grad_norm": 1.6433866024017334,
64
  "learning_rate": 9.510204081632653e-06,
65
+ "loss": 0.0797,
66
  "max_memory_allocated (GB)": 57.18,
67
  "memory_allocated (GB)": 50.57,
68
  "step": 60,
 
70
  },
71
  {
72
  "epoch": 0.2857142857142857,
73
+ "grad_norm": 4.2432074546813965,
74
  "learning_rate": 9.42857142857143e-06,
75
+ "loss": 0.1031,
76
  "max_memory_allocated (GB)": 57.18,
77
  "memory_allocated (GB)": 50.57,
78
  "step": 70,
 
80
  },
81
  {
82
  "epoch": 0.32653061224489793,
83
+ "grad_norm": 2.0352487564086914,
84
  "learning_rate": 9.346938775510204e-06,
85
+ "loss": 0.1115,
86
  "max_memory_allocated (GB)": 57.18,
87
  "memory_allocated (GB)": 50.57,
88
  "step": 80,
 
90
  },
91
  {
92
  "epoch": 0.3673469387755102,
93
+ "grad_norm": 4.201560020446777,
94
  "learning_rate": 9.26530612244898e-06,
95
+ "loss": 0.0817,
96
  "max_memory_allocated (GB)": 57.18,
97
  "memory_allocated (GB)": 50.57,
98
  "step": 90,
 
100
  },
101
  {
102
  "epoch": 0.40816326530612246,
103
+ "grad_norm": 3.618368625640869,
104
  "learning_rate": 9.183673469387756e-06,
105
+ "loss": 0.0768,
106
  "max_memory_allocated (GB)": 57.18,
107
  "memory_allocated (GB)": 50.57,
108
  "step": 100,
 
110
  },
111
  {
112
  "epoch": 0.4489795918367347,
113
+ "grad_norm": 4.793916702270508,
114
  "learning_rate": 9.102040816326532e-06,
115
+ "loss": 0.071,
116
  "max_memory_allocated (GB)": 57.18,
117
  "memory_allocated (GB)": 50.57,
118
  "step": 110,
 
120
  },
121
  {
122
  "epoch": 0.4897959183673469,
123
+ "grad_norm": 2.3223495483398438,
124
  "learning_rate": 9.020408163265307e-06,
125
+ "loss": 0.0707,
126
  "max_memory_allocated (GB)": 57.18,
127
  "memory_allocated (GB)": 50.57,
128
  "step": 120,
 
130
  },
131
  {
132
  "epoch": 0.5306122448979592,
133
+ "grad_norm": 3.5389153957366943,
134
  "learning_rate": 8.938775510204082e-06,
135
+ "loss": 0.0599,
136
  "max_memory_allocated (GB)": 57.18,
137
  "memory_allocated (GB)": 50.57,
138
  "step": 130,
 
140
  },
141
  {
142
  "epoch": 0.5714285714285714,
143
+ "grad_norm": 1.586653232574463,
144
  "learning_rate": 8.857142857142858e-06,
145
+ "loss": 0.0491,
146
  "max_memory_allocated (GB)": 57.18,
147
  "memory_allocated (GB)": 50.57,
148
  "step": 140,
 
150
  },
151
  {
152
  "epoch": 0.6122448979591837,
153
+ "grad_norm": 1.5236841440200806,
154
  "learning_rate": 8.775510204081633e-06,
155
+ "loss": 0.0632,
156
  "max_memory_allocated (GB)": 57.18,
157
  "memory_allocated (GB)": 50.57,
158
  "step": 150,
 
160
  },
161
  {
162
  "epoch": 0.6530612244897959,
163
+ "grad_norm": 2.752020835876465,
164
  "learning_rate": 8.69387755102041e-06,
165
+ "loss": 0.0722,
166
  "max_memory_allocated (GB)": 57.18,
167
  "memory_allocated (GB)": 50.57,
168
  "step": 160,
 
170
  },
171
  {
172
  "epoch": 0.6938775510204082,
173
+ "grad_norm": 7.606927394866943,
174
  "learning_rate": 8.612244897959184e-06,
175
+ "loss": 0.0756,
176
  "max_memory_allocated (GB)": 57.18,
177
  "memory_allocated (GB)": 50.57,
178
  "step": 170,
 
180
  },
181
  {
182
  "epoch": 0.7346938775510204,
183
+ "grad_norm": 1.5622702836990356,
184
  "learning_rate": 8.530612244897961e-06,
185
+ "loss": 0.0617,
186
  "max_memory_allocated (GB)": 57.18,
187
  "memory_allocated (GB)": 50.57,
188
  "step": 180,
 
190
  },
191
  {
192
  "epoch": 0.7755102040816326,
193
+ "grad_norm": 0.9614956378936768,
194
  "learning_rate": 8.448979591836736e-06,
195
+ "loss": 0.0572,
196
  "max_memory_allocated (GB)": 57.18,
197
  "memory_allocated (GB)": 50.57,
198
  "step": 190,
 
200
  },
201
  {
202
  "epoch": 0.8163265306122449,
203
+ "grad_norm": 0.7814755439758301,
204
  "learning_rate": 8.36734693877551e-06,
205
+ "loss": 0.0636,
206
  "max_memory_allocated (GB)": 57.18,
207
  "memory_allocated (GB)": 50.57,
208
  "step": 200,
 
210
  },
211
  {
212
  "epoch": 0.8571428571428571,
213
+ "grad_norm": 1.352851390838623,
214
  "learning_rate": 8.285714285714287e-06,
215
+ "loss": 0.0648,
216
  "max_memory_allocated (GB)": 57.18,
217
  "memory_allocated (GB)": 50.57,
218
  "step": 210,
 
220
  },
221
  {
222
  "epoch": 0.8979591836734694,
223
+ "grad_norm": 1.6814969778060913,
224
  "learning_rate": 8.204081632653062e-06,
225
+ "loss": 0.0604,
226
  "max_memory_allocated (GB)": 57.18,
227
  "memory_allocated (GB)": 50.57,
228
  "step": 220,
 
230
  },
231
  {
232
  "epoch": 0.9387755102040817,
233
+ "grad_norm": 0.859993040561676,
234
  "learning_rate": 8.122448979591837e-06,
235
+ "loss": 0.0549,
236
  "max_memory_allocated (GB)": 57.18,
237
  "memory_allocated (GB)": 50.57,
238
  "step": 230,
 
240
  },
241
  {
242
  "epoch": 0.9795918367346939,
243
+ "grad_norm": 0.6439819931983948,
244
  "learning_rate": 8.040816326530613e-06,
245
+ "loss": 0.0493,
246
  "max_memory_allocated (GB)": 57.18,
247
  "memory_allocated (GB)": 50.57,
248
  "step": 240,
 
250
  },
251
  {
252
  "epoch": 1.0204081632653061,
253
+ "grad_norm": 0.8465150594711304,
254
  "learning_rate": 7.959183673469388e-06,
255
+ "loss": 0.0624,
256
  "max_memory_allocated (GB)": 57.18,
257
  "memory_allocated (GB)": 50.57,
258
  "step": 250,
 
260
  },
261
  {
262
  "epoch": 1.0612244897959184,
263
+ "grad_norm": 1.0257333517074585,
264
  "learning_rate": 7.877551020408164e-06,
265
+ "loss": 0.056,
266
  "max_memory_allocated (GB)": 57.18,
267
  "memory_allocated (GB)": 50.57,
268
  "step": 260,
 
270
  },
271
  {
272
  "epoch": 1.1020408163265305,
273
+ "grad_norm": 2.619938850402832,
274
  "learning_rate": 7.79591836734694e-06,
275
+ "loss": 0.0648,
276
  "max_memory_allocated (GB)": 57.18,
277
  "memory_allocated (GB)": 50.57,
278
  "step": 270,
 
280
  },
281
  {
282
  "epoch": 1.1428571428571428,
283
+ "grad_norm": 0.4946042001247406,
284
  "learning_rate": 7.714285714285716e-06,
285
+ "loss": 0.0586,
286
  "max_memory_allocated (GB)": 57.18,
287
  "memory_allocated (GB)": 50.57,
288
  "step": 280,
 
290
  },
291
  {
292
  "epoch": 1.183673469387755,
293
+ "grad_norm": 1.0154733657836914,
294
  "learning_rate": 7.63265306122449e-06,
295
+ "loss": 0.0505,
296
  "max_memory_allocated (GB)": 57.18,
297
  "memory_allocated (GB)": 50.57,
298
  "step": 290,
 
300
  },
301
  {
302
  "epoch": 1.2244897959183674,
303
+ "grad_norm": 1.0347952842712402,
304
  "learning_rate": 7.551020408163265e-06,
305
+ "loss": 0.0646,
306
  "max_memory_allocated (GB)": 57.18,
307
  "memory_allocated (GB)": 50.57,
308
  "step": 300,
 
310
  },
311
  {
312
  "epoch": 1.2653061224489797,
313
+ "grad_norm": 0.7844366431236267,
314
  "learning_rate": 7.469387755102041e-06,
315
+ "loss": 0.0676,
316
  "max_memory_allocated (GB)": 57.18,
317
  "memory_allocated (GB)": 50.57,
318
  "step": 310,
 
320
  },
321
  {
322
  "epoch": 1.306122448979592,
323
+ "grad_norm": 1.1971337795257568,
324
  "learning_rate": 7.387755102040817e-06,
325
+ "loss": 0.0499,
326
  "max_memory_allocated (GB)": 57.18,
327
  "memory_allocated (GB)": 50.57,
328
  "step": 320,
 
330
  },
331
  {
332
  "epoch": 1.346938775510204,
333
+ "grad_norm": 0.6674404740333557,
334
  "learning_rate": 7.306122448979592e-06,
335
+ "loss": 0.0602,
336
  "max_memory_allocated (GB)": 57.18,
337
  "memory_allocated (GB)": 50.57,
338
  "step": 330,
 
340
  },
341
  {
342
  "epoch": 1.3877551020408163,
343
+ "grad_norm": 1.511208415031433,
344
  "learning_rate": 7.224489795918368e-06,
345
+ "loss": 0.0547,
346
  "max_memory_allocated (GB)": 57.18,
347
  "memory_allocated (GB)": 50.57,
348
  "step": 340,
 
350
  },
351
  {
352
  "epoch": 1.4285714285714286,
353
+ "grad_norm": 0.5328841209411621,
354
  "learning_rate": 7.1428571428571436e-06,
355
+ "loss": 0.0486,
356
  "max_memory_allocated (GB)": 57.18,
357
  "memory_allocated (GB)": 50.57,
358
  "step": 350,
 
360
  },
361
  {
362
  "epoch": 1.469387755102041,
363
+ "grad_norm": 1.464439034461975,
364
  "learning_rate": 7.061224489795919e-06,
365
+ "loss": 0.0464,
366
  "max_memory_allocated (GB)": 57.18,
367
  "memory_allocated (GB)": 50.57,
368
  "step": 360,
 
370
  },
371
  {
372
  "epoch": 1.510204081632653,
373
+ "grad_norm": 0.834863543510437,
374
  "learning_rate": 6.979591836734695e-06,
375
+ "loss": 0.0591,
376
  "max_memory_allocated (GB)": 57.18,
377
  "memory_allocated (GB)": 50.57,
378
  "step": 370,
 
380
  },
381
  {
382
  "epoch": 1.5510204081632653,
383
+ "grad_norm": 0.5399609208106995,
384
  "learning_rate": 6.8979591836734705e-06,
385
+ "loss": 0.0464,
386
  "max_memory_allocated (GB)": 57.18,
387
  "memory_allocated (GB)": 50.57,
388
  "step": 380,
 
390
  },
391
  {
392
  "epoch": 1.5918367346938775,
393
+ "grad_norm": 0.8577661514282227,
394
  "learning_rate": 6.816326530612245e-06,
395
+ "loss": 0.0654,
396
  "max_memory_allocated (GB)": 57.18,
397
  "memory_allocated (GB)": 50.57,
398
  "step": 390,
 
400
  },
401
  {
402
  "epoch": 1.6326530612244898,
403
+ "grad_norm": 0.5057955384254456,
404
  "learning_rate": 6.734693877551021e-06,
405
+ "loss": 0.0609,
406
  "max_memory_allocated (GB)": 57.18,
407
  "memory_allocated (GB)": 50.57,
408
  "step": 400,
 
410
  },
411
  {
412
  "epoch": 1.6734693877551021,
413
+ "grad_norm": 0.9135333895683289,
414
  "learning_rate": 6.653061224489797e-06,
415
+ "loss": 0.0607,
416
  "max_memory_allocated (GB)": 57.18,
417
  "memory_allocated (GB)": 50.57,
418
  "step": 410,
 
420
  },
421
  {
422
  "epoch": 1.7142857142857144,
423
+ "grad_norm": 2.9697179794311523,
424
  "learning_rate": 6.571428571428572e-06,
425
+ "loss": 0.054,
426
  "max_memory_allocated (GB)": 57.18,
427
  "memory_allocated (GB)": 50.57,
428
  "step": 420,
 
430
  },
431
  {
432
  "epoch": 1.7551020408163265,
433
+ "grad_norm": 0.3473312556743622,
434
  "learning_rate": 6.489795918367348e-06,
435
+ "loss": 0.0685,
436
  "max_memory_allocated (GB)": 57.18,
437
  "memory_allocated (GB)": 50.57,
438
  "step": 430,
 
440
  },
441
  {
442
  "epoch": 1.7959183673469388,
443
+ "grad_norm": 1.4528335332870483,
444
  "learning_rate": 6.408163265306124e-06,
445
+ "loss": 0.0611,
446
  "max_memory_allocated (GB)": 57.18,
447
  "memory_allocated (GB)": 50.57,
448
  "step": 440,
 
450
  },
451
  {
452
  "epoch": 1.836734693877551,
453
+ "grad_norm": 0.48578280210494995,
454
  "learning_rate": 6.326530612244899e-06,
455
+ "loss": 0.0438,
456
  "max_memory_allocated (GB)": 57.18,
457
  "memory_allocated (GB)": 50.57,
458
  "step": 450,
 
460
  },
461
  {
462
  "epoch": 1.8775510204081631,
463
+ "grad_norm": 0.3472760021686554,
464
  "learning_rate": 6.244897959183675e-06,
465
+ "loss": 0.0544,
466
  "max_memory_allocated (GB)": 57.18,
467
  "memory_allocated (GB)": 50.57,
468
  "step": 460,
 
470
  },
471
  {
472
  "epoch": 1.9183673469387754,
473
+ "grad_norm": 1.0984327793121338,
474
  "learning_rate": 6.163265306122449e-06,
475
+ "loss": 0.0438,
476
  "max_memory_allocated (GB)": 57.18,
477
  "memory_allocated (GB)": 50.57,
478
  "step": 470,
 
480
  },
481
  {
482
  "epoch": 1.9591836734693877,
483
+ "grad_norm": 0.20147933065891266,
484
  "learning_rate": 6.0816326530612245e-06,
485
+ "loss": 0.0518,
486
  "max_memory_allocated (GB)": 57.18,
487
  "memory_allocated (GB)": 50.57,
488
  "step": 480,
 
490
  },
491
  {
492
  "epoch": 2.0,
493
+ "grad_norm": 1.1583309173583984,
494
  "learning_rate": 6e-06,
495
+ "loss": 0.0637,
496
  "max_memory_allocated (GB)": 57.18,
497
  "memory_allocated (GB)": 50.57,
498
  "step": 490,
 
500
  },
501
  {
502
  "epoch": 2.0408163265306123,
503
+ "grad_norm": 0.6601622104644775,
504
  "learning_rate": 5.918367346938776e-06,
505
  "loss": 0.0596,
506
  "max_memory_allocated (GB)": 57.18,
 
510
  },
511
  {
512
  "epoch": 2.0816326530612246,
513
+ "grad_norm": 0.5227305293083191,
514
  "learning_rate": 5.8367346938775515e-06,
515
+ "loss": 0.0493,
516
  "max_memory_allocated (GB)": 57.18,
517
  "memory_allocated (GB)": 50.57,
518
  "step": 510,
 
520
  },
521
  {
522
  "epoch": 2.122448979591837,
523
+ "grad_norm": 0.8996191620826721,
524
  "learning_rate": 5.755102040816327e-06,
525
  "loss": 0.0461,
526
  "max_memory_allocated (GB)": 57.18,
 
530
  },
531
  {
532
  "epoch": 2.163265306122449,
533
+ "grad_norm": 1.0684189796447754,
534
  "learning_rate": 5.673469387755103e-06,
535
+ "loss": 0.0629,
536
  "max_memory_allocated (GB)": 57.18,
537
  "memory_allocated (GB)": 50.57,
538
  "step": 530,
 
540
  },
541
  {
542
  "epoch": 2.204081632653061,
543
+ "grad_norm": 0.5558530688285828,
544
  "learning_rate": 5.591836734693878e-06,
545
+ "loss": 0.0581,
546
  "max_memory_allocated (GB)": 57.18,
547
  "memory_allocated (GB)": 50.57,
548
  "step": 540,
 
550
  },
551
  {
552
  "epoch": 2.2448979591836733,
553
+ "grad_norm": 1.1996757984161377,
554
  "learning_rate": 5.510204081632653e-06,
555
+ "loss": 0.0626,
556
  "max_memory_allocated (GB)": 57.18,
557
  "memory_allocated (GB)": 50.57,
558
  "step": 550,
 
560
  },
561
  {
562
  "epoch": 2.2857142857142856,
563
+ "grad_norm": 1.2928632497787476,
564
  "learning_rate": 5.428571428571429e-06,
565
+ "loss": 0.0575,
566
  "max_memory_allocated (GB)": 57.18,
567
  "memory_allocated (GB)": 50.57,
568
  "step": 560,
 
570
  },
571
  {
572
  "epoch": 2.326530612244898,
573
+ "grad_norm": 0.7934871912002563,
574
  "learning_rate": 5.3469387755102045e-06,
575
+ "loss": 0.0577,
576
  "max_memory_allocated (GB)": 57.18,
577
  "memory_allocated (GB)": 50.57,
578
  "step": 570,
 
580
  },
581
  {
582
  "epoch": 2.36734693877551,
583
+ "grad_norm": 3.946485757827759,
584
  "learning_rate": 5.26530612244898e-06,
585
+ "loss": 0.0663,
586
  "max_memory_allocated (GB)": 57.18,
587
  "memory_allocated (GB)": 50.57,
588
  "step": 580,
 
590
  },
591
  {
592
  "epoch": 2.4081632653061225,
593
+ "grad_norm": 0.43567588925361633,
594
  "learning_rate": 5.183673469387756e-06,
595
+ "loss": 0.0539,
596
  "max_memory_allocated (GB)": 57.18,
597
  "memory_allocated (GB)": 50.57,
598
  "step": 590,
 
600
  },
601
  {
602
  "epoch": 2.4489795918367347,
603
+ "grad_norm": 0.5725533962249756,
604
  "learning_rate": 5.1020408163265315e-06,
605
+ "loss": 0.0438,
606
  "max_memory_allocated (GB)": 57.18,
607
  "memory_allocated (GB)": 50.57,
608
  "step": 600,
 
610
  },
611
  {
612
  "epoch": 2.489795918367347,
613
+ "grad_norm": 0.44328320026397705,
614
  "learning_rate": 5.020408163265307e-06,
615
  "loss": 0.041,
616
  "max_memory_allocated (GB)": 57.18,
 
620
  },
621
  {
622
  "epoch": 2.5306122448979593,
623
+ "grad_norm": 1.338100790977478,
624
  "learning_rate": 4.938775510204082e-06,
625
+ "loss": 0.0424,
626
  "max_memory_allocated (GB)": 57.18,
627
  "memory_allocated (GB)": 50.57,
628
  "step": 620,
 
630
  },
631
  {
632
  "epoch": 2.571428571428571,
633
+ "grad_norm": 0.92643803358078,
634
  "learning_rate": 4.857142857142858e-06,
635
+ "loss": 0.0811,
636
  "max_memory_allocated (GB)": 57.18,
637
  "memory_allocated (GB)": 50.57,
638
  "step": 630,
 
640
  },
641
  {
642
  "epoch": 2.612244897959184,
643
+ "grad_norm": 1.1147398948669434,
644
  "learning_rate": 4.775510204081633e-06,
645
  "loss": 0.0492,
646
  "max_memory_allocated (GB)": 57.18,
 
650
  },
651
  {
652
  "epoch": 2.6530612244897958,
653
+ "grad_norm": 0.6104307174682617,
654
  "learning_rate": 4.693877551020409e-06,
655
+ "loss": 0.0468,
656
  "max_memory_allocated (GB)": 57.18,
657
  "memory_allocated (GB)": 50.57,
658
  "step": 650,
 
660
  },
661
  {
662
  "epoch": 2.693877551020408,
663
+ "grad_norm": 0.9826134443283081,
664
  "learning_rate": 4.612244897959184e-06,
665
+ "loss": 0.0471,
666
  "max_memory_allocated (GB)": 57.18,
667
  "memory_allocated (GB)": 50.57,
668
  "step": 660,
 
670
  },
671
  {
672
  "epoch": 2.7346938775510203,
673
+ "grad_norm": 0.7680672407150269,
674
  "learning_rate": 4.530612244897959e-06,
675
+ "loss": 0.0858,
676
  "max_memory_allocated (GB)": 57.18,
677
  "memory_allocated (GB)": 50.57,
678
  "step": 670,
 
680
  },
681
  {
682
  "epoch": 2.7755102040816326,
683
+ "grad_norm": 0.9682340025901794,
684
  "learning_rate": 4.448979591836735e-06,
685
+ "loss": 0.0484,
686
  "max_memory_allocated (GB)": 57.18,
687
  "memory_allocated (GB)": 50.57,
688
  "step": 680,
 
690
  },
691
  {
692
  "epoch": 2.816326530612245,
693
+ "grad_norm": 0.37712323665618896,
694
  "learning_rate": 4.367346938775511e-06,
695
+ "loss": 0.0443,
696
  "max_memory_allocated (GB)": 57.18,
697
  "memory_allocated (GB)": 50.57,
698
  "step": 690,
 
700
  },
701
  {
702
  "epoch": 2.857142857142857,
703
+ "grad_norm": 0.34970754384994507,
704
  "learning_rate": 4.2857142857142855e-06,
705
+ "loss": 0.0434,
706
  "max_memory_allocated (GB)": 57.18,
707
  "memory_allocated (GB)": 50.57,
708
  "step": 700,
 
710
  },
711
  {
712
  "epoch": 2.8979591836734695,
713
+ "grad_norm": 0.9949877262115479,
714
  "learning_rate": 4.204081632653061e-06,
715
+ "loss": 0.0553,
716
  "max_memory_allocated (GB)": 57.18,
717
  "memory_allocated (GB)": 50.57,
718
  "step": 710,
 
720
  },
721
  {
722
  "epoch": 2.938775510204082,
723
+ "grad_norm": 1.4436949491500854,
724
  "learning_rate": 4.122448979591837e-06,
725
+ "loss": 0.0583,
726
  "max_memory_allocated (GB)": 57.18,
727
  "memory_allocated (GB)": 50.57,
728
  "step": 720,
 
730
  },
731
  {
732
  "epoch": 2.979591836734694,
733
+ "grad_norm": 0.1619979739189148,
734
  "learning_rate": 4.040816326530612e-06,
735
+ "loss": 0.0336,
736
  "max_memory_allocated (GB)": 57.18,
737
  "memory_allocated (GB)": 50.57,
738
  "step": 730,
 
740
  },
741
  {
742
  "epoch": 3.020408163265306,
743
+ "grad_norm": 1.2799049615859985,
744
  "learning_rate": 3.959183673469388e-06,
745
+ "loss": 0.0536,
746
  "max_memory_allocated (GB)": 57.18,
747
  "memory_allocated (GB)": 50.57,
748
  "step": 740,
 
750
  },
751
  {
752
  "epoch": 3.061224489795918,
753
+ "grad_norm": 0.5613189935684204,
754
  "learning_rate": 3.877551020408164e-06,
755
+ "loss": 0.062,
756
  "max_memory_allocated (GB)": 57.18,
757
  "memory_allocated (GB)": 50.57,
758
  "step": 750,
 
760
  },
761
  {
762
  "epoch": 3.1020408163265305,
763
+ "grad_norm": 0.827383279800415,
764
  "learning_rate": 3.795918367346939e-06,
765
  "loss": 0.0527,
766
  "max_memory_allocated (GB)": 57.18,
 
770
  },
771
  {
772
  "epoch": 3.142857142857143,
773
+ "grad_norm": 0.6983201503753662,
774
  "learning_rate": 3.7142857142857146e-06,
775
+ "loss": 0.0691,
776
  "max_memory_allocated (GB)": 57.18,
777
  "memory_allocated (GB)": 50.57,
778
  "step": 770,
 
780
  },
781
  {
782
  "epoch": 3.183673469387755,
783
+ "grad_norm": 1.0466923713684082,
784
  "learning_rate": 3.6326530612244903e-06,
785
+ "loss": 0.0644,
786
  "max_memory_allocated (GB)": 57.18,
787
  "memory_allocated (GB)": 50.57,
788
  "step": 780,
 
790
  },
791
  {
792
  "epoch": 3.2244897959183674,
793
+ "grad_norm": 0.3068871796131134,
794
  "learning_rate": 3.5510204081632655e-06,
795
+ "loss": 0.0524,
796
  "max_memory_allocated (GB)": 57.18,
797
  "memory_allocated (GB)": 50.57,
798
  "step": 790,
 
800
  },
801
  {
802
  "epoch": 3.2653061224489797,
803
+ "grad_norm": 0.40160393714904785,
804
  "learning_rate": 3.469387755102041e-06,
805
+ "loss": 0.0434,
806
  "max_memory_allocated (GB)": 57.18,
807
  "memory_allocated (GB)": 50.57,
808
  "step": 800,
 
810
  },
811
  {
812
  "epoch": 3.306122448979592,
813
+ "grad_norm": 0.880214512348175,
814
  "learning_rate": 3.3877551020408168e-06,
815
+ "loss": 0.056,
816
  "max_memory_allocated (GB)": 57.18,
817
  "memory_allocated (GB)": 50.57,
818
  "step": 810,
 
820
  },
821
  {
822
  "epoch": 3.3469387755102042,
823
+ "grad_norm": 0.9539953470230103,
824
  "learning_rate": 3.3061224489795924e-06,
825
+ "loss": 0.0464,
826
  "max_memory_allocated (GB)": 57.18,
827
  "memory_allocated (GB)": 50.57,
828
  "step": 820,
 
830
  },
831
  {
832
  "epoch": 3.387755102040816,
833
+ "grad_norm": 0.24522298574447632,
834
  "learning_rate": 3.2244897959183672e-06,
835
+ "loss": 0.0485,
836
  "max_memory_allocated (GB)": 57.18,
837
  "memory_allocated (GB)": 50.57,
838
  "step": 830,
 
840
  },
841
  {
842
  "epoch": 3.4285714285714284,
843
+ "grad_norm": 0.4946345388889313,
844
  "learning_rate": 3.142857142857143e-06,
845
+ "loss": 0.0527,
846
  "max_memory_allocated (GB)": 57.18,
847
  "memory_allocated (GB)": 50.57,
848
  "step": 840,
 
850
  },
851
  {
852
  "epoch": 3.4693877551020407,
853
+ "grad_norm": 0.4724675416946411,
854
  "learning_rate": 3.0612244897959185e-06,
855
  "loss": 0.0813,
856
  "max_memory_allocated (GB)": 57.18,
 
860
  },
861
  {
862
  "epoch": 3.510204081632653,
863
+ "grad_norm": 0.9907402396202087,
864
  "learning_rate": 2.979591836734694e-06,
865
+ "loss": 0.0447,
866
  "max_memory_allocated (GB)": 57.18,
867
  "memory_allocated (GB)": 50.57,
868
  "step": 860,
 
870
  },
871
  {
872
  "epoch": 3.5510204081632653,
873
+ "grad_norm": 0.19696560502052307,
874
  "learning_rate": 2.8979591836734694e-06,
875
+ "loss": 0.0635,
876
  "max_memory_allocated (GB)": 57.18,
877
  "memory_allocated (GB)": 50.57,
878
  "step": 870,
 
880
  },
881
  {
882
  "epoch": 3.5918367346938775,
883
+ "grad_norm": 0.7972800135612488,
884
  "learning_rate": 2.816326530612245e-06,
885
+ "loss": 0.0438,
886
  "max_memory_allocated (GB)": 57.18,
887
  "memory_allocated (GB)": 50.57,
888
  "step": 880,
 
890
  },
891
  {
892
  "epoch": 3.63265306122449,
893
+ "grad_norm": 0.21193134784698486,
894
  "learning_rate": 2.7346938775510207e-06,
895
+ "loss": 0.029,
896
  "max_memory_allocated (GB)": 57.18,
897
  "memory_allocated (GB)": 50.57,
898
  "step": 890,
 
900
  },
901
  {
902
  "epoch": 3.673469387755102,
903
+ "grad_norm": 0.6128103137016296,
904
  "learning_rate": 2.6530612244897964e-06,
905
+ "loss": 0.0514,
906
  "max_memory_allocated (GB)": 57.18,
907
  "memory_allocated (GB)": 50.57,
908
  "step": 900,
 
910
  },
911
  {
912
  "epoch": 3.7142857142857144,
913
+ "grad_norm": 0.8112168312072754,
914
  "learning_rate": 2.571428571428571e-06,
915
  "loss": 0.061,
916
  "max_memory_allocated (GB)": 57.18,
 
920
  },
921
  {
922
  "epoch": 3.7551020408163263,
923
+ "grad_norm": 0.18730562925338745,
924
  "learning_rate": 2.489795918367347e-06,
925
+ "loss": 0.0546,
926
  "max_memory_allocated (GB)": 57.18,
927
  "memory_allocated (GB)": 50.57,
928
  "step": 920,
 
930
  },
931
  {
932
  "epoch": 3.795918367346939,
933
+ "grad_norm": 0.3866801857948303,
934
  "learning_rate": 2.4081632653061225e-06,
935
+ "loss": 0.0501,
936
  "max_memory_allocated (GB)": 57.18,
937
  "memory_allocated (GB)": 50.57,
938
  "step": 930,
 
940
  },
941
  {
942
  "epoch": 3.836734693877551,
943
+ "grad_norm": 0.8816384077072144,
944
  "learning_rate": 2.326530612244898e-06,
945
+ "loss": 0.0489,
946
  "max_memory_allocated (GB)": 57.18,
947
  "memory_allocated (GB)": 50.57,
948
  "step": 940,
 
950
  },
951
  {
952
  "epoch": 3.877551020408163,
953
+ "grad_norm": 0.5572797656059265,
954
  "learning_rate": 2.244897959183674e-06,
955
+ "loss": 0.0599,
956
  "max_memory_allocated (GB)": 57.18,
957
  "memory_allocated (GB)": 50.57,
958
  "step": 950,
 
960
  },
961
  {
962
  "epoch": 3.9183673469387754,
963
+ "grad_norm": 0.38238489627838135,
964
  "learning_rate": 2.1632653061224495e-06,
965
+ "loss": 0.0497,
966
  "max_memory_allocated (GB)": 57.18,
967
  "memory_allocated (GB)": 50.57,
968
  "step": 960,
 
970
  },
971
  {
972
  "epoch": 3.9591836734693877,
973
+ "grad_norm": 0.6144959926605225,
974
  "learning_rate": 2.0816326530612247e-06,
975
+ "loss": 0.0741,
976
  "max_memory_allocated (GB)": 57.18,
977
  "memory_allocated (GB)": 50.57,
978
  "step": 970,
 
980
  },
981
  {
982
  "epoch": 4.0,
983
+ "grad_norm": 0.6087101697921753,
984
  "learning_rate": 2.0000000000000003e-06,
985
+ "loss": 0.0703,
986
  "max_memory_allocated (GB)": 57.18,
987
  "memory_allocated (GB)": 50.57,
988
  "step": 980,
 
990
  },
991
  {
992
  "epoch": 4.040816326530612,
993
+ "grad_norm": 0.5187469720840454,
994
  "learning_rate": 1.9183673469387756e-06,
995
+ "loss": 0.0482,
996
  "max_memory_allocated (GB)": 57.18,
997
  "memory_allocated (GB)": 50.57,
998
  "step": 990,
 
1000
  },
1001
  {
1002
  "epoch": 4.081632653061225,
1003
+ "grad_norm": 1.248850703239441,
1004
  "learning_rate": 1.8367346938775512e-06,
1005
+ "loss": 0.0631,
1006
  "max_memory_allocated (GB)": 57.18,
1007
  "memory_allocated (GB)": 50.57,
1008
  "step": 1000,
 
1010
  },
1011
  {
1012
  "epoch": 4.122448979591836,
1013
+ "grad_norm": 0.5806276798248291,
1014
  "learning_rate": 1.7551020408163267e-06,
1015
+ "loss": 0.0629,
1016
  "max_memory_allocated (GB)": 57.18,
1017
  "memory_allocated (GB)": 50.57,
1018
  "step": 1010,
 
1020
  },
1021
  {
1022
  "epoch": 4.163265306122449,
1023
+ "grad_norm": 0.3565673828125,
1024
  "learning_rate": 1.6734693877551023e-06,
1025
  "loss": 0.0407,
1026
  "max_memory_allocated (GB)": 57.18,
 
1030
  },
1031
  {
1032
  "epoch": 4.204081632653061,
1033
+ "grad_norm": 0.6948438882827759,
1034
  "learning_rate": 1.5918367346938775e-06,
1035
+ "loss": 0.053,
1036
  "max_memory_allocated (GB)": 57.18,
1037
  "memory_allocated (GB)": 50.57,
1038
  "step": 1030,
 
1040
  },
1041
  {
1042
  "epoch": 4.244897959183674,
1043
+ "grad_norm": 0.5245764851570129,
1044
  "learning_rate": 1.5102040816326532e-06,
1045
+ "loss": 0.0399,
1046
  "max_memory_allocated (GB)": 57.18,
1047
  "memory_allocated (GB)": 50.57,
1048
  "step": 1040,
 
1050
  },
1051
  {
1052
  "epoch": 4.285714285714286,
1053
+ "grad_norm": 0.7932385802268982,
1054
  "learning_rate": 1.4285714285714286e-06,
1055
+ "loss": 0.0502,
1056
  "max_memory_allocated (GB)": 57.18,
1057
  "memory_allocated (GB)": 50.57,
1058
  "step": 1050,
 
1060
  },
1061
  {
1062
  "epoch": 4.326530612244898,
1063
+ "grad_norm": 0.30140048265457153,
1064
  "learning_rate": 1.3469387755102043e-06,
1065
  "loss": 0.046,
1066
  "max_memory_allocated (GB)": 57.18,
 
1070
  },
1071
  {
1072
  "epoch": 4.36734693877551,
1073
+ "grad_norm": 0.570467472076416,
1074
  "learning_rate": 1.2653061224489795e-06,
1075
+ "loss": 0.0487,
1076
  "max_memory_allocated (GB)": 57.18,
1077
  "memory_allocated (GB)": 50.57,
1078
  "step": 1070,
 
1080
  },
1081
  {
1082
  "epoch": 4.408163265306122,
1083
+ "grad_norm": 0.43690067529678345,
1084
  "learning_rate": 1.1836734693877552e-06,
1085
+ "loss": 0.0521,
1086
  "max_memory_allocated (GB)": 57.18,
1087
  "memory_allocated (GB)": 50.57,
1088
  "step": 1080,
 
1090
  },
1091
  {
1092
  "epoch": 4.448979591836735,
1093
+ "grad_norm": 0.5298590660095215,
1094
  "learning_rate": 1.1020408163265308e-06,
1095
+ "loss": 0.0506,
1096
  "max_memory_allocated (GB)": 57.18,
1097
  "memory_allocated (GB)": 50.57,
1098
  "step": 1090,
 
1100
  },
1101
  {
1102
  "epoch": 4.489795918367347,
1103
+ "grad_norm": 0.2310735136270523,
1104
  "learning_rate": 1.0204081632653063e-06,
1105
+ "loss": 0.036,
1106
  "max_memory_allocated (GB)": 57.18,
1107
  "memory_allocated (GB)": 50.57,
1108
  "step": 1100,
 
1110
  },
1111
  {
1112
  "epoch": 4.530612244897959,
1113
+ "grad_norm": 0.13128583133220673,
1114
  "learning_rate": 9.387755102040817e-07,
1115
+ "loss": 0.0463,
1116
  "max_memory_allocated (GB)": 57.18,
1117
  "memory_allocated (GB)": 50.57,
1118
  "step": 1110,
 
1120
  },
1121
  {
1122
  "epoch": 4.571428571428571,
1123
+ "grad_norm": 0.7682464122772217,
1124
  "learning_rate": 8.571428571428572e-07,
1125
+ "loss": 0.0403,
1126
  "max_memory_allocated (GB)": 57.18,
1127
  "memory_allocated (GB)": 50.57,
1128
  "step": 1120,
 
1130
  },
1131
  {
1132
  "epoch": 4.612244897959184,
1133
+ "grad_norm": 0.6608971953392029,
1134
  "learning_rate": 7.755102040816327e-07,
1135
+ "loss": 0.0543,
1136
  "max_memory_allocated (GB)": 57.18,
1137
  "memory_allocated (GB)": 50.57,
1138
  "step": 1130,
 
1140
  },
1141
  {
1142
  "epoch": 4.653061224489796,
1143
+ "grad_norm": 0.8803687691688538,
1144
  "learning_rate": 6.938775510204082e-07,
1145
+ "loss": 0.0728,
1146
  "max_memory_allocated (GB)": 57.18,
1147
  "memory_allocated (GB)": 50.57,
1148
  "step": 1140,
 
1150
  },
1151
  {
1152
  "epoch": 4.6938775510204085,
1153
+ "grad_norm": 4.121662139892578,
1154
  "learning_rate": 6.122448979591837e-07,
1155
+ "loss": 0.0514,
1156
  "max_memory_allocated (GB)": 57.18,
1157
  "memory_allocated (GB)": 50.57,
1158
  "step": 1150,
 
1160
  },
1161
  {
1162
  "epoch": 4.73469387755102,
1163
+ "grad_norm": 0.7500938773155212,
1164
  "learning_rate": 5.306122448979592e-07,
1165
+ "loss": 0.0612,
1166
  "max_memory_allocated (GB)": 57.18,
1167
  "memory_allocated (GB)": 50.57,
1168
  "step": 1160,
 
1170
  },
1171
  {
1172
  "epoch": 4.775510204081632,
1173
+ "grad_norm": 0.6001973748207092,
1174
  "learning_rate": 4.489795918367347e-07,
1175
+ "loss": 0.0549,
1176
  "max_memory_allocated (GB)": 57.18,
1177
  "memory_allocated (GB)": 50.57,
1178
  "step": 1170,
 
1180
  },
1181
  {
1182
  "epoch": 4.816326530612245,
1183
+ "grad_norm": 0.7522645592689514,
1184
  "learning_rate": 3.6734693877551025e-07,
1185
+ "loss": 0.0445,
1186
  "max_memory_allocated (GB)": 57.18,
1187
  "memory_allocated (GB)": 50.57,
1188
  "step": 1180,
 
1190
  },
1191
  {
1192
  "epoch": 4.857142857142857,
1193
+ "grad_norm": 0.6640497446060181,
1194
  "learning_rate": 2.8571428571428575e-07,
1195
+ "loss": 0.0542,
1196
  "max_memory_allocated (GB)": 57.18,
1197
  "memory_allocated (GB)": 50.57,
1198
  "step": 1190,
 
1200
  },
1201
  {
1202
  "epoch": 4.8979591836734695,
1203
+ "grad_norm": 0.8031227588653564,
1204
  "learning_rate": 2.0408163265306121e-07,
1205
+ "loss": 0.0728,
1206
  "max_memory_allocated (GB)": 57.18,
1207
  "memory_allocated (GB)": 50.57,
1208
  "step": 1200,
 
1210
  },
1211
  {
1212
  "epoch": 4.938775510204081,
1213
+ "grad_norm": 0.39187708497047424,
1214
  "learning_rate": 1.2244897959183673e-07,
1215
+ "loss": 0.065,
1216
  "max_memory_allocated (GB)": 57.18,
1217
  "memory_allocated (GB)": 50.57,
1218
  "step": 1210,
 
1220
  },
1221
  {
1222
  "epoch": 4.979591836734694,
1223
+ "grad_norm": 3.809382915496826,
1224
  "learning_rate": 4.0816326530612253e-08,
1225
+ "loss": 0.0417,
1226
  "max_memory_allocated (GB)": 57.18,
1227
  "memory_allocated (GB)": 50.57,
1228
  "step": 1220,
 
1235
  "step": 1225,
1236
  "total_flos": 3.0598946525952e+16,
1237
  "total_memory_available (GB)": 94.62,
1238
+ "train_loss": 0.06098026679486644,
1239
+ "train_runtime": 1192.2443,
1240
+ "train_samples_per_second": 46.607,
1241
+ "train_steps_per_second": 1.166
1242
  }
1243
  ],
1244
  "logging_steps": 10,
validation_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 0.12326910346746445,
4
- "eval_runtime": 26.1444,
5
- "eval_samples_per_second": 36.653,
6
- "eval_steps_per_second": 2.352,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 0.12305960804224014,
4
+ "eval_runtime": 31.7562,
5
+ "eval_samples_per_second": 32.832,
6
+ "eval_steps_per_second": 2.107,
7
  "max_memory_allocated (GB)": 57.18,
8
  "memory_allocated (GB)": 51.27,
9
  "total_memory_available (GB)": 94.62