jdannem6 commited on
Commit
4f748ac
1 Parent(s): f2d2a49

Uploaded checkpoint-25000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e72ed832ca82b47560da4b70a43bfb85762b41d54a4b1df89cee4b8816cb6fc
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58ea76fcf16a2912a570cf295dd1757cd9562cb7f7f8e74d37938855d31dc866
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19756b617f2bf91f55b9e8c9b87ec2279b8dca12dd91f8f9e92a075a7d6745b9
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:231c6b24970ef58291d1980aeb742ace763101289d628ec3f4ac808335924d18
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da5d678c8111a8bb6e7f07c6d826c3d293cb2dc841c1a7d8cdada1cef59bd3c9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2bd031f30ceb89483d2d8b5eb187850133dcc5a689162e8975b2cc0e61b4001
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ebf87c3777a880efd4523ce05af816d67a6a12edb3e1d54f156890382c1db41
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3bdbaa37c77733a3ea9eb90a36bc290f4f5b9f56abe23cc6586cbaa459f92c6
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.3439137935638428,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-22500",
4
- "epoch": 0.5625,
5
  "eval_steps": 500,
6
- "global_step": 22500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -16117,6 +16117,1796 @@
16117
  "eval_samples_per_second": 15.104,
16118
  "eval_steps_per_second": 15.104,
16119
  "step": 22500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16120
  }
16121
  ],
16122
  "logging_steps": 10,
@@ -16124,7 +17914,7 @@
16124
  "num_input_tokens_seen": 0,
16125
  "num_train_epochs": 1,
16126
  "save_steps": 2500,
16127
- "total_flos": 3.6229783486464e+17,
16128
  "train_batch_size": 1,
16129
  "trial_name": null,
16130
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.3409814834594727,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-25000",
4
+ "epoch": 0.625,
5
  "eval_steps": 500,
6
+ "global_step": 25000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
16117
  "eval_samples_per_second": 15.104,
16118
  "eval_steps_per_second": 15.104,
16119
  "step": 22500
16120
+ },
16121
+ {
16122
+ "epoch": 0.56,
16123
+ "grad_norm": 3.232191562652588,
16124
+ "learning_rate": 5.077966101694915e-06,
16125
+ "loss": 1.3644,
16126
+ "step": 22510
16127
+ },
16128
+ {
16129
+ "epoch": 0.56,
16130
+ "grad_norm": 5.3384528160095215,
16131
+ "learning_rate": 5.071186440677967e-06,
16132
+ "loss": 1.3093,
16133
+ "step": 22520
16134
+ },
16135
+ {
16136
+ "epoch": 0.56,
16137
+ "grad_norm": 6.911621570587158,
16138
+ "learning_rate": 5.064406779661017e-06,
16139
+ "loss": 1.2571,
16140
+ "step": 22530
16141
+ },
16142
+ {
16143
+ "epoch": 0.56,
16144
+ "grad_norm": 6.84785270690918,
16145
+ "learning_rate": 5.057627118644069e-06,
16146
+ "loss": 1.324,
16147
+ "step": 22540
16148
+ },
16149
+ {
16150
+ "epoch": 0.56,
16151
+ "grad_norm": 4.634193420410156,
16152
+ "learning_rate": 5.050847457627119e-06,
16153
+ "loss": 1.4672,
16154
+ "step": 22550
16155
+ },
16156
+ {
16157
+ "epoch": 0.56,
16158
+ "grad_norm": 3.4189460277557373,
16159
+ "learning_rate": 5.0440677966101705e-06,
16160
+ "loss": 1.325,
16161
+ "step": 22560
16162
+ },
16163
+ {
16164
+ "epoch": 0.56,
16165
+ "grad_norm": 7.2140793800354,
16166
+ "learning_rate": 5.037288135593221e-06,
16167
+ "loss": 1.3923,
16168
+ "step": 22570
16169
+ },
16170
+ {
16171
+ "epoch": 0.56,
16172
+ "grad_norm": 3.916853189468384,
16173
+ "learning_rate": 5.030508474576271e-06,
16174
+ "loss": 1.3431,
16175
+ "step": 22580
16176
+ },
16177
+ {
16178
+ "epoch": 0.56,
16179
+ "grad_norm": 6.434349536895752,
16180
+ "learning_rate": 5.023728813559322e-06,
16181
+ "loss": 1.4615,
16182
+ "step": 22590
16183
+ },
16184
+ {
16185
+ "epoch": 0.56,
16186
+ "grad_norm": 6.851098537445068,
16187
+ "learning_rate": 5.016949152542373e-06,
16188
+ "loss": 1.3013,
16189
+ "step": 22600
16190
+ },
16191
+ {
16192
+ "epoch": 0.57,
16193
+ "grad_norm": 7.541562080383301,
16194
+ "learning_rate": 5.010169491525424e-06,
16195
+ "loss": 1.3696,
16196
+ "step": 22610
16197
+ },
16198
+ {
16199
+ "epoch": 0.57,
16200
+ "grad_norm": 4.100308895111084,
16201
+ "learning_rate": 5.003389830508475e-06,
16202
+ "loss": 1.3802,
16203
+ "step": 22620
16204
+ },
16205
+ {
16206
+ "epoch": 0.57,
16207
+ "grad_norm": 12.694953918457031,
16208
+ "learning_rate": 4.996610169491526e-06,
16209
+ "loss": 1.2328,
16210
+ "step": 22630
16211
+ },
16212
+ {
16213
+ "epoch": 0.57,
16214
+ "grad_norm": 12.468879699707031,
16215
+ "learning_rate": 4.989830508474577e-06,
16216
+ "loss": 1.3995,
16217
+ "step": 22640
16218
+ },
16219
+ {
16220
+ "epoch": 0.57,
16221
+ "grad_norm": 3.142810821533203,
16222
+ "learning_rate": 4.983050847457628e-06,
16223
+ "loss": 1.2403,
16224
+ "step": 22650
16225
+ },
16226
+ {
16227
+ "epoch": 0.57,
16228
+ "grad_norm": 7.71440315246582,
16229
+ "learning_rate": 4.976271186440678e-06,
16230
+ "loss": 1.2902,
16231
+ "step": 22660
16232
+ },
16233
+ {
16234
+ "epoch": 0.57,
16235
+ "grad_norm": 4.5355634689331055,
16236
+ "learning_rate": 4.969491525423729e-06,
16237
+ "loss": 1.3243,
16238
+ "step": 22670
16239
+ },
16240
+ {
16241
+ "epoch": 0.57,
16242
+ "grad_norm": 1.5578103065490723,
16243
+ "learning_rate": 4.96271186440678e-06,
16244
+ "loss": 1.2796,
16245
+ "step": 22680
16246
+ },
16247
+ {
16248
+ "epoch": 0.57,
16249
+ "grad_norm": 4.997007846832275,
16250
+ "learning_rate": 4.955932203389831e-06,
16251
+ "loss": 1.1288,
16252
+ "step": 22690
16253
+ },
16254
+ {
16255
+ "epoch": 0.57,
16256
+ "grad_norm": 4.240728855133057,
16257
+ "learning_rate": 4.949152542372882e-06,
16258
+ "loss": 1.408,
16259
+ "step": 22700
16260
+ },
16261
+ {
16262
+ "epoch": 0.57,
16263
+ "grad_norm": 7.517406940460205,
16264
+ "learning_rate": 4.942372881355932e-06,
16265
+ "loss": 1.3879,
16266
+ "step": 22710
16267
+ },
16268
+ {
16269
+ "epoch": 0.57,
16270
+ "grad_norm": 1.1581978797912598,
16271
+ "learning_rate": 4.935593220338984e-06,
16272
+ "loss": 1.3547,
16273
+ "step": 22720
16274
+ },
16275
+ {
16276
+ "epoch": 0.57,
16277
+ "grad_norm": 5.073269367218018,
16278
+ "learning_rate": 4.928813559322034e-06,
16279
+ "loss": 1.3664,
16280
+ "step": 22730
16281
+ },
16282
+ {
16283
+ "epoch": 0.57,
16284
+ "grad_norm": 3.7317769527435303,
16285
+ "learning_rate": 4.922033898305086e-06,
16286
+ "loss": 1.3942,
16287
+ "step": 22740
16288
+ },
16289
+ {
16290
+ "epoch": 0.57,
16291
+ "grad_norm": 4.843672275543213,
16292
+ "learning_rate": 4.915254237288136e-06,
16293
+ "loss": 1.3723,
16294
+ "step": 22750
16295
+ },
16296
+ {
16297
+ "epoch": 0.57,
16298
+ "grad_norm": 7.751224040985107,
16299
+ "learning_rate": 4.908474576271187e-06,
16300
+ "loss": 1.2057,
16301
+ "step": 22760
16302
+ },
16303
+ {
16304
+ "epoch": 0.57,
16305
+ "grad_norm": 2.4519495964050293,
16306
+ "learning_rate": 4.901694915254237e-06,
16307
+ "loss": 1.3836,
16308
+ "step": 22770
16309
+ },
16310
+ {
16311
+ "epoch": 0.57,
16312
+ "grad_norm": 8.7233304977417,
16313
+ "learning_rate": 4.894915254237289e-06,
16314
+ "loss": 1.3842,
16315
+ "step": 22780
16316
+ },
16317
+ {
16318
+ "epoch": 0.57,
16319
+ "grad_norm": 2.717367172241211,
16320
+ "learning_rate": 4.888135593220339e-06,
16321
+ "loss": 1.4519,
16322
+ "step": 22790
16323
+ },
16324
+ {
16325
+ "epoch": 0.57,
16326
+ "grad_norm": 4.797736167907715,
16327
+ "learning_rate": 4.881355932203391e-06,
16328
+ "loss": 1.1785,
16329
+ "step": 22800
16330
+ },
16331
+ {
16332
+ "epoch": 0.57,
16333
+ "grad_norm": 11.28987979888916,
16334
+ "learning_rate": 4.874576271186441e-06,
16335
+ "loss": 1.3058,
16336
+ "step": 22810
16337
+ },
16338
+ {
16339
+ "epoch": 0.57,
16340
+ "grad_norm": 5.097863674163818,
16341
+ "learning_rate": 4.867796610169492e-06,
16342
+ "loss": 1.232,
16343
+ "step": 22820
16344
+ },
16345
+ {
16346
+ "epoch": 0.57,
16347
+ "grad_norm": 5.479716777801514,
16348
+ "learning_rate": 4.861016949152543e-06,
16349
+ "loss": 1.4677,
16350
+ "step": 22830
16351
+ },
16352
+ {
16353
+ "epoch": 0.57,
16354
+ "grad_norm": 11.921456336975098,
16355
+ "learning_rate": 4.854237288135594e-06,
16356
+ "loss": 1.5442,
16357
+ "step": 22840
16358
+ },
16359
+ {
16360
+ "epoch": 0.57,
16361
+ "grad_norm": 9.934676170349121,
16362
+ "learning_rate": 4.847457627118645e-06,
16363
+ "loss": 1.4147,
16364
+ "step": 22850
16365
+ },
16366
+ {
16367
+ "epoch": 0.57,
16368
+ "grad_norm": 3.3140487670898438,
16369
+ "learning_rate": 4.840677966101695e-06,
16370
+ "loss": 1.2146,
16371
+ "step": 22860
16372
+ },
16373
+ {
16374
+ "epoch": 0.57,
16375
+ "grad_norm": 2.1303317546844482,
16376
+ "learning_rate": 4.833898305084746e-06,
16377
+ "loss": 1.3021,
16378
+ "step": 22870
16379
+ },
16380
+ {
16381
+ "epoch": 0.57,
16382
+ "grad_norm": 3.943474769592285,
16383
+ "learning_rate": 4.827118644067797e-06,
16384
+ "loss": 1.2028,
16385
+ "step": 22880
16386
+ },
16387
+ {
16388
+ "epoch": 0.57,
16389
+ "grad_norm": 5.444009780883789,
16390
+ "learning_rate": 4.820338983050848e-06,
16391
+ "loss": 1.3873,
16392
+ "step": 22890
16393
+ },
16394
+ {
16395
+ "epoch": 0.57,
16396
+ "grad_norm": 5.230558395385742,
16397
+ "learning_rate": 4.813559322033899e-06,
16398
+ "loss": 1.2907,
16399
+ "step": 22900
16400
+ },
16401
+ {
16402
+ "epoch": 0.57,
16403
+ "grad_norm": 2.652158498764038,
16404
+ "learning_rate": 4.80677966101695e-06,
16405
+ "loss": 1.27,
16406
+ "step": 22910
16407
+ },
16408
+ {
16409
+ "epoch": 0.57,
16410
+ "grad_norm": 5.423113822937012,
16411
+ "learning_rate": 4.800000000000001e-06,
16412
+ "loss": 1.3488,
16413
+ "step": 22920
16414
+ },
16415
+ {
16416
+ "epoch": 0.57,
16417
+ "grad_norm": 6.731577396392822,
16418
+ "learning_rate": 4.793220338983051e-06,
16419
+ "loss": 1.3411,
16420
+ "step": 22930
16421
+ },
16422
+ {
16423
+ "epoch": 0.57,
16424
+ "grad_norm": 6.142993927001953,
16425
+ "learning_rate": 4.786440677966102e-06,
16426
+ "loss": 1.4032,
16427
+ "step": 22940
16428
+ },
16429
+ {
16430
+ "epoch": 0.57,
16431
+ "grad_norm": 7.845600605010986,
16432
+ "learning_rate": 4.779661016949153e-06,
16433
+ "loss": 1.3567,
16434
+ "step": 22950
16435
+ },
16436
+ {
16437
+ "epoch": 0.57,
16438
+ "grad_norm": 3.123938798904419,
16439
+ "learning_rate": 4.772881355932204e-06,
16440
+ "loss": 1.2106,
16441
+ "step": 22960
16442
+ },
16443
+ {
16444
+ "epoch": 0.57,
16445
+ "grad_norm": 4.684544086456299,
16446
+ "learning_rate": 4.766101694915254e-06,
16447
+ "loss": 1.3407,
16448
+ "step": 22970
16449
+ },
16450
+ {
16451
+ "epoch": 0.57,
16452
+ "grad_norm": 11.232462882995605,
16453
+ "learning_rate": 4.759322033898306e-06,
16454
+ "loss": 1.136,
16455
+ "step": 22980
16456
+ },
16457
+ {
16458
+ "epoch": 0.57,
16459
+ "grad_norm": 3.3728113174438477,
16460
+ "learning_rate": 4.752542372881356e-06,
16461
+ "loss": 1.199,
16462
+ "step": 22990
16463
+ },
16464
+ {
16465
+ "epoch": 0.57,
16466
+ "grad_norm": 3.5851376056671143,
16467
+ "learning_rate": 4.745762711864408e-06,
16468
+ "loss": 1.463,
16469
+ "step": 23000
16470
+ },
16471
+ {
16472
+ "epoch": 0.57,
16473
+ "eval_loss": 1.3161565065383911,
16474
+ "eval_runtime": 66.103,
16475
+ "eval_samples_per_second": 15.128,
16476
+ "eval_steps_per_second": 15.128,
16477
+ "step": 23000
16478
+ },
16479
+ {
16480
+ "epoch": 0.58,
16481
+ "grad_norm": 5.720829010009766,
16482
+ "learning_rate": 4.738983050847458e-06,
16483
+ "loss": 1.4393,
16484
+ "step": 23010
16485
+ },
16486
+ {
16487
+ "epoch": 0.58,
16488
+ "grad_norm": 11.663517951965332,
16489
+ "learning_rate": 4.732203389830509e-06,
16490
+ "loss": 1.3158,
16491
+ "step": 23020
16492
+ },
16493
+ {
16494
+ "epoch": 0.58,
16495
+ "grad_norm": 10.158949851989746,
16496
+ "learning_rate": 4.725423728813559e-06,
16497
+ "loss": 1.4086,
16498
+ "step": 23030
16499
+ },
16500
+ {
16501
+ "epoch": 0.58,
16502
+ "grad_norm": 6.925542831420898,
16503
+ "learning_rate": 4.718644067796611e-06,
16504
+ "loss": 1.3655,
16505
+ "step": 23040
16506
+ },
16507
+ {
16508
+ "epoch": 0.58,
16509
+ "grad_norm": 1.4193698167800903,
16510
+ "learning_rate": 4.711864406779661e-06,
16511
+ "loss": 1.2348,
16512
+ "step": 23050
16513
+ },
16514
+ {
16515
+ "epoch": 0.58,
16516
+ "grad_norm": 6.884500980377197,
16517
+ "learning_rate": 4.705084745762713e-06,
16518
+ "loss": 1.4053,
16519
+ "step": 23060
16520
+ },
16521
+ {
16522
+ "epoch": 0.58,
16523
+ "grad_norm": 4.412232875823975,
16524
+ "learning_rate": 4.698305084745763e-06,
16525
+ "loss": 1.2476,
16526
+ "step": 23070
16527
+ },
16528
+ {
16529
+ "epoch": 0.58,
16530
+ "grad_norm": 2.403428077697754,
16531
+ "learning_rate": 4.691525423728814e-06,
16532
+ "loss": 1.4168,
16533
+ "step": 23080
16534
+ },
16535
+ {
16536
+ "epoch": 0.58,
16537
+ "grad_norm": 7.022388458251953,
16538
+ "learning_rate": 4.684745762711865e-06,
16539
+ "loss": 1.359,
16540
+ "step": 23090
16541
+ },
16542
+ {
16543
+ "epoch": 0.58,
16544
+ "grad_norm": 5.728121280670166,
16545
+ "learning_rate": 4.677966101694916e-06,
16546
+ "loss": 1.3916,
16547
+ "step": 23100
16548
+ },
16549
+ {
16550
+ "epoch": 0.58,
16551
+ "grad_norm": 14.995932579040527,
16552
+ "learning_rate": 4.671186440677967e-06,
16553
+ "loss": 1.4541,
16554
+ "step": 23110
16555
+ },
16556
+ {
16557
+ "epoch": 0.58,
16558
+ "grad_norm": 12.448729515075684,
16559
+ "learning_rate": 4.664406779661017e-06,
16560
+ "loss": 1.2989,
16561
+ "step": 23120
16562
+ },
16563
+ {
16564
+ "epoch": 0.58,
16565
+ "grad_norm": 2.5807507038116455,
16566
+ "learning_rate": 4.657627118644068e-06,
16567
+ "loss": 1.4881,
16568
+ "step": 23130
16569
+ },
16570
+ {
16571
+ "epoch": 0.58,
16572
+ "grad_norm": 4.670041561126709,
16573
+ "learning_rate": 4.650847457627119e-06,
16574
+ "loss": 1.4595,
16575
+ "step": 23140
16576
+ },
16577
+ {
16578
+ "epoch": 0.58,
16579
+ "grad_norm": 3.8450145721435547,
16580
+ "learning_rate": 4.64406779661017e-06,
16581
+ "loss": 1.245,
16582
+ "step": 23150
16583
+ },
16584
+ {
16585
+ "epoch": 0.58,
16586
+ "grad_norm": 15.546969413757324,
16587
+ "learning_rate": 4.637288135593221e-06,
16588
+ "loss": 1.2199,
16589
+ "step": 23160
16590
+ },
16591
+ {
16592
+ "epoch": 0.58,
16593
+ "grad_norm": 8.563859939575195,
16594
+ "learning_rate": 4.630508474576272e-06,
16595
+ "loss": 1.3752,
16596
+ "step": 23170
16597
+ },
16598
+ {
16599
+ "epoch": 0.58,
16600
+ "grad_norm": 8.742653846740723,
16601
+ "learning_rate": 4.623728813559323e-06,
16602
+ "loss": 1.2802,
16603
+ "step": 23180
16604
+ },
16605
+ {
16606
+ "epoch": 0.58,
16607
+ "grad_norm": 6.253279685974121,
16608
+ "learning_rate": 4.616949152542373e-06,
16609
+ "loss": 1.4559,
16610
+ "step": 23190
16611
+ },
16612
+ {
16613
+ "epoch": 0.58,
16614
+ "grad_norm": 5.3615827560424805,
16615
+ "learning_rate": 4.610169491525424e-06,
16616
+ "loss": 1.1649,
16617
+ "step": 23200
16618
+ },
16619
+ {
16620
+ "epoch": 0.58,
16621
+ "grad_norm": 9.165109634399414,
16622
+ "learning_rate": 4.603389830508475e-06,
16623
+ "loss": 1.4435,
16624
+ "step": 23210
16625
+ },
16626
+ {
16627
+ "epoch": 0.58,
16628
+ "grad_norm": 6.625391483306885,
16629
+ "learning_rate": 4.596610169491526e-06,
16630
+ "loss": 1.2911,
16631
+ "step": 23220
16632
+ },
16633
+ {
16634
+ "epoch": 0.58,
16635
+ "grad_norm": 3.898466110229492,
16636
+ "learning_rate": 4.589830508474576e-06,
16637
+ "loss": 1.2653,
16638
+ "step": 23230
16639
+ },
16640
+ {
16641
+ "epoch": 0.58,
16642
+ "grad_norm": 1.5483791828155518,
16643
+ "learning_rate": 4.583050847457628e-06,
16644
+ "loss": 1.1665,
16645
+ "step": 23240
16646
+ },
16647
+ {
16648
+ "epoch": 0.58,
16649
+ "grad_norm": 4.248947620391846,
16650
+ "learning_rate": 4.576271186440678e-06,
16651
+ "loss": 1.2275,
16652
+ "step": 23250
16653
+ },
16654
+ {
16655
+ "epoch": 0.58,
16656
+ "grad_norm": 3.522451639175415,
16657
+ "learning_rate": 4.56949152542373e-06,
16658
+ "loss": 1.3427,
16659
+ "step": 23260
16660
+ },
16661
+ {
16662
+ "epoch": 0.58,
16663
+ "grad_norm": 5.79518461227417,
16664
+ "learning_rate": 4.56271186440678e-06,
16665
+ "loss": 1.2545,
16666
+ "step": 23270
16667
+ },
16668
+ {
16669
+ "epoch": 0.58,
16670
+ "grad_norm": 7.211407661437988,
16671
+ "learning_rate": 4.555932203389831e-06,
16672
+ "loss": 1.3256,
16673
+ "step": 23280
16674
+ },
16675
+ {
16676
+ "epoch": 0.58,
16677
+ "grad_norm": 2.218186855316162,
16678
+ "learning_rate": 4.549152542372881e-06,
16679
+ "loss": 1.327,
16680
+ "step": 23290
16681
+ },
16682
+ {
16683
+ "epoch": 0.58,
16684
+ "grad_norm": 3.0725247859954834,
16685
+ "learning_rate": 4.542372881355933e-06,
16686
+ "loss": 1.1152,
16687
+ "step": 23300
16688
+ },
16689
+ {
16690
+ "epoch": 0.58,
16691
+ "grad_norm": 10.184927940368652,
16692
+ "learning_rate": 4.535593220338983e-06,
16693
+ "loss": 1.3153,
16694
+ "step": 23310
16695
+ },
16696
+ {
16697
+ "epoch": 0.58,
16698
+ "grad_norm": 6.575405597686768,
16699
+ "learning_rate": 4.528813559322035e-06,
16700
+ "loss": 1.2879,
16701
+ "step": 23320
16702
+ },
16703
+ {
16704
+ "epoch": 0.58,
16705
+ "grad_norm": 5.2145094871521,
16706
+ "learning_rate": 4.522033898305085e-06,
16707
+ "loss": 1.2836,
16708
+ "step": 23330
16709
+ },
16710
+ {
16711
+ "epoch": 0.58,
16712
+ "grad_norm": 16.586687088012695,
16713
+ "learning_rate": 4.515254237288136e-06,
16714
+ "loss": 1.245,
16715
+ "step": 23340
16716
+ },
16717
+ {
16718
+ "epoch": 0.58,
16719
+ "grad_norm": 5.264496803283691,
16720
+ "learning_rate": 4.508474576271187e-06,
16721
+ "loss": 1.1731,
16722
+ "step": 23350
16723
+ },
16724
+ {
16725
+ "epoch": 0.58,
16726
+ "grad_norm": 5.267670631408691,
16727
+ "learning_rate": 4.501694915254238e-06,
16728
+ "loss": 1.3662,
16729
+ "step": 23360
16730
+ },
16731
+ {
16732
+ "epoch": 0.58,
16733
+ "grad_norm": 7.335075855255127,
16734
+ "learning_rate": 4.494915254237289e-06,
16735
+ "loss": 1.3261,
16736
+ "step": 23370
16737
+ },
16738
+ {
16739
+ "epoch": 0.58,
16740
+ "grad_norm": 7.068376064300537,
16741
+ "learning_rate": 4.488135593220339e-06,
16742
+ "loss": 1.2805,
16743
+ "step": 23380
16744
+ },
16745
+ {
16746
+ "epoch": 0.58,
16747
+ "grad_norm": 15.14303207397461,
16748
+ "learning_rate": 4.48135593220339e-06,
16749
+ "loss": 1.3605,
16750
+ "step": 23390
16751
+ },
16752
+ {
16753
+ "epoch": 0.58,
16754
+ "grad_norm": 12.552229881286621,
16755
+ "learning_rate": 4.474576271186441e-06,
16756
+ "loss": 1.4804,
16757
+ "step": 23400
16758
+ },
16759
+ {
16760
+ "epoch": 0.59,
16761
+ "grad_norm": 6.760104656219482,
16762
+ "learning_rate": 4.467796610169492e-06,
16763
+ "loss": 1.2604,
16764
+ "step": 23410
16765
+ },
16766
+ {
16767
+ "epoch": 0.59,
16768
+ "grad_norm": 6.444414138793945,
16769
+ "learning_rate": 4.461016949152543e-06,
16770
+ "loss": 1.3677,
16771
+ "step": 23420
16772
+ },
16773
+ {
16774
+ "epoch": 0.59,
16775
+ "grad_norm": 2.223396062850952,
16776
+ "learning_rate": 4.454237288135594e-06,
16777
+ "loss": 1.4585,
16778
+ "step": 23430
16779
+ },
16780
+ {
16781
+ "epoch": 0.59,
16782
+ "grad_norm": 3.0469980239868164,
16783
+ "learning_rate": 4.447457627118645e-06,
16784
+ "loss": 1.2125,
16785
+ "step": 23440
16786
+ },
16787
+ {
16788
+ "epoch": 0.59,
16789
+ "grad_norm": 9.140281677246094,
16790
+ "learning_rate": 4.440677966101695e-06,
16791
+ "loss": 1.2847,
16792
+ "step": 23450
16793
+ },
16794
+ {
16795
+ "epoch": 0.59,
16796
+ "grad_norm": 10.596829414367676,
16797
+ "learning_rate": 4.433898305084746e-06,
16798
+ "loss": 1.1919,
16799
+ "step": 23460
16800
+ },
16801
+ {
16802
+ "epoch": 0.59,
16803
+ "grad_norm": 6.769688129425049,
16804
+ "learning_rate": 4.427118644067797e-06,
16805
+ "loss": 1.3911,
16806
+ "step": 23470
16807
+ },
16808
+ {
16809
+ "epoch": 0.59,
16810
+ "grad_norm": 8.3526029586792,
16811
+ "learning_rate": 4.420338983050848e-06,
16812
+ "loss": 1.3243,
16813
+ "step": 23480
16814
+ },
16815
+ {
16816
+ "epoch": 0.59,
16817
+ "grad_norm": 1.516774296760559,
16818
+ "learning_rate": 4.413559322033898e-06,
16819
+ "loss": 1.4328,
16820
+ "step": 23490
16821
+ },
16822
+ {
16823
+ "epoch": 0.59,
16824
+ "grad_norm": 5.758790493011475,
16825
+ "learning_rate": 4.40677966101695e-06,
16826
+ "loss": 1.1158,
16827
+ "step": 23500
16828
+ },
16829
+ {
16830
+ "epoch": 0.59,
16831
+ "eval_loss": 1.3372619152069092,
16832
+ "eval_runtime": 66.1468,
16833
+ "eval_samples_per_second": 15.118,
16834
+ "eval_steps_per_second": 15.118,
16835
+ "step": 23500
16836
+ },
16837
+ {
16838
+ "epoch": 0.59,
16839
+ "grad_norm": 10.471700668334961,
16840
+ "learning_rate": 4.4e-06,
16841
+ "loss": 1.1029,
16842
+ "step": 23510
16843
+ },
16844
+ {
16845
+ "epoch": 0.59,
16846
+ "grad_norm": 6.78934383392334,
16847
+ "learning_rate": 4.393220338983052e-06,
16848
+ "loss": 1.2838,
16849
+ "step": 23520
16850
+ },
16851
+ {
16852
+ "epoch": 0.59,
16853
+ "grad_norm": 4.890566825866699,
16854
+ "learning_rate": 4.386440677966102e-06,
16855
+ "loss": 1.2861,
16856
+ "step": 23530
16857
+ },
16858
+ {
16859
+ "epoch": 0.59,
16860
+ "grad_norm": 9.901065826416016,
16861
+ "learning_rate": 4.379661016949153e-06,
16862
+ "loss": 1.3043,
16863
+ "step": 23540
16864
+ },
16865
+ {
16866
+ "epoch": 0.59,
16867
+ "grad_norm": 3.332019805908203,
16868
+ "learning_rate": 4.372881355932203e-06,
16869
+ "loss": 1.5098,
16870
+ "step": 23550
16871
+ },
16872
+ {
16873
+ "epoch": 0.59,
16874
+ "grad_norm": 9.1102876663208,
16875
+ "learning_rate": 4.366101694915255e-06,
16876
+ "loss": 1.2222,
16877
+ "step": 23560
16878
+ },
16879
+ {
16880
+ "epoch": 0.59,
16881
+ "grad_norm": 2.800964832305908,
16882
+ "learning_rate": 4.359322033898305e-06,
16883
+ "loss": 1.1916,
16884
+ "step": 23570
16885
+ },
16886
+ {
16887
+ "epoch": 0.59,
16888
+ "grad_norm": 4.45274019241333,
16889
+ "learning_rate": 4.352542372881357e-06,
16890
+ "loss": 1.496,
16891
+ "step": 23580
16892
+ },
16893
+ {
16894
+ "epoch": 0.59,
16895
+ "grad_norm": 7.7979350090026855,
16896
+ "learning_rate": 4.345762711864407e-06,
16897
+ "loss": 1.3483,
16898
+ "step": 23590
16899
+ },
16900
+ {
16901
+ "epoch": 0.59,
16902
+ "grad_norm": 5.517279148101807,
16903
+ "learning_rate": 4.338983050847458e-06,
16904
+ "loss": 1.1602,
16905
+ "step": 23600
16906
+ },
16907
+ {
16908
+ "epoch": 0.59,
16909
+ "grad_norm": 8.224603652954102,
16910
+ "learning_rate": 4.332203389830509e-06,
16911
+ "loss": 1.2216,
16912
+ "step": 23610
16913
+ },
16914
+ {
16915
+ "epoch": 0.59,
16916
+ "grad_norm": 3.9079153537750244,
16917
+ "learning_rate": 4.32542372881356e-06,
16918
+ "loss": 1.4094,
16919
+ "step": 23620
16920
+ },
16921
+ {
16922
+ "epoch": 0.59,
16923
+ "grad_norm": 7.209962844848633,
16924
+ "learning_rate": 4.318644067796611e-06,
16925
+ "loss": 1.4033,
16926
+ "step": 23630
16927
+ },
16928
+ {
16929
+ "epoch": 0.59,
16930
+ "grad_norm": 6.915498733520508,
16931
+ "learning_rate": 4.311864406779661e-06,
16932
+ "loss": 1.3988,
16933
+ "step": 23640
16934
+ },
16935
+ {
16936
+ "epoch": 0.59,
16937
+ "grad_norm": 6.8702778816223145,
16938
+ "learning_rate": 4.305084745762712e-06,
16939
+ "loss": 1.3308,
16940
+ "step": 23650
16941
+ },
16942
+ {
16943
+ "epoch": 0.59,
16944
+ "grad_norm": 6.673946380615234,
16945
+ "learning_rate": 4.298305084745763e-06,
16946
+ "loss": 1.2756,
16947
+ "step": 23660
16948
+ },
16949
+ {
16950
+ "epoch": 0.59,
16951
+ "grad_norm": 2.729367971420288,
16952
+ "learning_rate": 4.291525423728814e-06,
16953
+ "loss": 1.2702,
16954
+ "step": 23670
16955
+ },
16956
+ {
16957
+ "epoch": 0.59,
16958
+ "grad_norm": 4.333055019378662,
16959
+ "learning_rate": 4.284745762711865e-06,
16960
+ "loss": 1.3365,
16961
+ "step": 23680
16962
+ },
16963
+ {
16964
+ "epoch": 0.59,
16965
+ "grad_norm": 8.36184024810791,
16966
+ "learning_rate": 4.277966101694915e-06,
16967
+ "loss": 1.2783,
16968
+ "step": 23690
16969
+ },
16970
+ {
16971
+ "epoch": 0.59,
16972
+ "grad_norm": 4.62699031829834,
16973
+ "learning_rate": 4.271186440677967e-06,
16974
+ "loss": 1.4385,
16975
+ "step": 23700
16976
+ },
16977
+ {
16978
+ "epoch": 0.59,
16979
+ "grad_norm": 3.193026304244995,
16980
+ "learning_rate": 4.264406779661017e-06,
16981
+ "loss": 1.4843,
16982
+ "step": 23710
16983
+ },
16984
+ {
16985
+ "epoch": 0.59,
16986
+ "grad_norm": 8.289533615112305,
16987
+ "learning_rate": 4.257627118644068e-06,
16988
+ "loss": 1.5263,
16989
+ "step": 23720
16990
+ },
16991
+ {
16992
+ "epoch": 0.59,
16993
+ "grad_norm": 3.887775182723999,
16994
+ "learning_rate": 4.250847457627119e-06,
16995
+ "loss": 1.228,
16996
+ "step": 23730
16997
+ },
16998
+ {
16999
+ "epoch": 0.59,
17000
+ "grad_norm": 10.728804588317871,
17001
+ "learning_rate": 4.24406779661017e-06,
17002
+ "loss": 1.4097,
17003
+ "step": 23740
17004
+ },
17005
+ {
17006
+ "epoch": 0.59,
17007
+ "grad_norm": 5.405580997467041,
17008
+ "learning_rate": 4.23728813559322e-06,
17009
+ "loss": 1.227,
17010
+ "step": 23750
17011
+ },
17012
+ {
17013
+ "epoch": 0.59,
17014
+ "grad_norm": 2.104985237121582,
17015
+ "learning_rate": 4.230508474576272e-06,
17016
+ "loss": 1.3258,
17017
+ "step": 23760
17018
+ },
17019
+ {
17020
+ "epoch": 0.59,
17021
+ "grad_norm": 11.678805351257324,
17022
+ "learning_rate": 4.223728813559322e-06,
17023
+ "loss": 1.1797,
17024
+ "step": 23770
17025
+ },
17026
+ {
17027
+ "epoch": 0.59,
17028
+ "grad_norm": 12.024051666259766,
17029
+ "learning_rate": 4.216949152542374e-06,
17030
+ "loss": 1.3278,
17031
+ "step": 23780
17032
+ },
17033
+ {
17034
+ "epoch": 0.59,
17035
+ "grad_norm": 12.879485130310059,
17036
+ "learning_rate": 4.210169491525424e-06,
17037
+ "loss": 1.2552,
17038
+ "step": 23790
17039
+ },
17040
+ {
17041
+ "epoch": 0.59,
17042
+ "grad_norm": 6.001992702484131,
17043
+ "learning_rate": 4.203389830508475e-06,
17044
+ "loss": 1.4639,
17045
+ "step": 23800
17046
+ },
17047
+ {
17048
+ "epoch": 0.6,
17049
+ "grad_norm": 7.713657855987549,
17050
+ "learning_rate": 4.196610169491525e-06,
17051
+ "loss": 1.3542,
17052
+ "step": 23810
17053
+ },
17054
+ {
17055
+ "epoch": 0.6,
17056
+ "grad_norm": 25.137435913085938,
17057
+ "learning_rate": 4.189830508474577e-06,
17058
+ "loss": 1.2694,
17059
+ "step": 23820
17060
+ },
17061
+ {
17062
+ "epoch": 0.6,
17063
+ "grad_norm": 13.080780029296875,
17064
+ "learning_rate": 4.183050847457627e-06,
17065
+ "loss": 1.5512,
17066
+ "step": 23830
17067
+ },
17068
+ {
17069
+ "epoch": 0.6,
17070
+ "grad_norm": 3.648967981338501,
17071
+ "learning_rate": 4.176271186440679e-06,
17072
+ "loss": 1.4919,
17073
+ "step": 23840
17074
+ },
17075
+ {
17076
+ "epoch": 0.6,
17077
+ "grad_norm": 2.8366498947143555,
17078
+ "learning_rate": 4.169491525423729e-06,
17079
+ "loss": 1.3528,
17080
+ "step": 23850
17081
+ },
17082
+ {
17083
+ "epoch": 0.6,
17084
+ "grad_norm": 2.3198916912078857,
17085
+ "learning_rate": 4.16271186440678e-06,
17086
+ "loss": 1.4445,
17087
+ "step": 23860
17088
+ },
17089
+ {
17090
+ "epoch": 0.6,
17091
+ "grad_norm": 9.170830726623535,
17092
+ "learning_rate": 4.155932203389831e-06,
17093
+ "loss": 1.1801,
17094
+ "step": 23870
17095
+ },
17096
+ {
17097
+ "epoch": 0.6,
17098
+ "grad_norm": 7.985089302062988,
17099
+ "learning_rate": 4.149152542372882e-06,
17100
+ "loss": 1.3195,
17101
+ "step": 23880
17102
+ },
17103
+ {
17104
+ "epoch": 0.6,
17105
+ "grad_norm": 10.688753128051758,
17106
+ "learning_rate": 4.142372881355933e-06,
17107
+ "loss": 1.4527,
17108
+ "step": 23890
17109
+ },
17110
+ {
17111
+ "epoch": 0.6,
17112
+ "grad_norm": 12.181285858154297,
17113
+ "learning_rate": 4.135593220338983e-06,
17114
+ "loss": 1.3875,
17115
+ "step": 23900
17116
+ },
17117
+ {
17118
+ "epoch": 0.6,
17119
+ "grad_norm": 10.353550910949707,
17120
+ "learning_rate": 4.128813559322034e-06,
17121
+ "loss": 1.3937,
17122
+ "step": 23910
17123
+ },
17124
+ {
17125
+ "epoch": 0.6,
17126
+ "grad_norm": 3.3962326049804688,
17127
+ "learning_rate": 4.122033898305085e-06,
17128
+ "loss": 1.2212,
17129
+ "step": 23920
17130
+ },
17131
+ {
17132
+ "epoch": 0.6,
17133
+ "grad_norm": 9.191743850708008,
17134
+ "learning_rate": 4.115254237288136e-06,
17135
+ "loss": 1.3884,
17136
+ "step": 23930
17137
+ },
17138
+ {
17139
+ "epoch": 0.6,
17140
+ "grad_norm": 8.74504566192627,
17141
+ "learning_rate": 4.108474576271187e-06,
17142
+ "loss": 1.369,
17143
+ "step": 23940
17144
+ },
17145
+ {
17146
+ "epoch": 0.6,
17147
+ "grad_norm": 15.484914779663086,
17148
+ "learning_rate": 4.101694915254237e-06,
17149
+ "loss": 1.3607,
17150
+ "step": 23950
17151
+ },
17152
+ {
17153
+ "epoch": 0.6,
17154
+ "grad_norm": 8.069631576538086,
17155
+ "learning_rate": 4.094915254237289e-06,
17156
+ "loss": 1.1674,
17157
+ "step": 23960
17158
+ },
17159
+ {
17160
+ "epoch": 0.6,
17161
+ "grad_norm": 5.688279151916504,
17162
+ "learning_rate": 4.088135593220339e-06,
17163
+ "loss": 1.2652,
17164
+ "step": 23970
17165
+ },
17166
+ {
17167
+ "epoch": 0.6,
17168
+ "grad_norm": 2.326960325241089,
17169
+ "learning_rate": 4.081355932203391e-06,
17170
+ "loss": 1.2149,
17171
+ "step": 23980
17172
+ },
17173
+ {
17174
+ "epoch": 0.6,
17175
+ "grad_norm": 9.749725341796875,
17176
+ "learning_rate": 4.074576271186441e-06,
17177
+ "loss": 1.2378,
17178
+ "step": 23990
17179
+ },
17180
+ {
17181
+ "epoch": 0.6,
17182
+ "grad_norm": 5.552289962768555,
17183
+ "learning_rate": 4.067796610169492e-06,
17184
+ "loss": 1.2699,
17185
+ "step": 24000
17186
+ },
17187
+ {
17188
+ "epoch": 0.6,
17189
+ "eval_loss": 1.3411681652069092,
17190
+ "eval_runtime": 66.1396,
17191
+ "eval_samples_per_second": 15.12,
17192
+ "eval_steps_per_second": 15.12,
17193
+ "step": 24000
17194
+ },
17195
+ {
17196
+ "epoch": 0.6,
17197
+ "grad_norm": 8.768118858337402,
17198
+ "learning_rate": 4.061016949152542e-06,
17199
+ "loss": 1.1869,
17200
+ "step": 24010
17201
+ },
17202
+ {
17203
+ "epoch": 0.6,
17204
+ "grad_norm": 1.0193852186203003,
17205
+ "learning_rate": 4.054237288135594e-06,
17206
+ "loss": 1.1477,
17207
+ "step": 24020
17208
+ },
17209
+ {
17210
+ "epoch": 0.6,
17211
+ "grad_norm": 11.04339599609375,
17212
+ "learning_rate": 4.047457627118644e-06,
17213
+ "loss": 1.2492,
17214
+ "step": 24030
17215
+ },
17216
+ {
17217
+ "epoch": 0.6,
17218
+ "grad_norm": 2.5347607135772705,
17219
+ "learning_rate": 4.040677966101696e-06,
17220
+ "loss": 1.2424,
17221
+ "step": 24040
17222
+ },
17223
+ {
17224
+ "epoch": 0.6,
17225
+ "grad_norm": 5.121871471405029,
17226
+ "learning_rate": 4.033898305084746e-06,
17227
+ "loss": 1.1773,
17228
+ "step": 24050
17229
+ },
17230
+ {
17231
+ "epoch": 0.6,
17232
+ "grad_norm": 8.53433609008789,
17233
+ "learning_rate": 4.027118644067797e-06,
17234
+ "loss": 1.4407,
17235
+ "step": 24060
17236
+ },
17237
+ {
17238
+ "epoch": 0.6,
17239
+ "grad_norm": 11.311376571655273,
17240
+ "learning_rate": 4.020338983050847e-06,
17241
+ "loss": 1.4095,
17242
+ "step": 24070
17243
+ },
17244
+ {
17245
+ "epoch": 0.6,
17246
+ "grad_norm": 2.8956375122070312,
17247
+ "learning_rate": 4.013559322033899e-06,
17248
+ "loss": 1.3076,
17249
+ "step": 24080
17250
+ },
17251
+ {
17252
+ "epoch": 0.6,
17253
+ "grad_norm": 3.6406021118164062,
17254
+ "learning_rate": 4.006779661016949e-06,
17255
+ "loss": 1.1801,
17256
+ "step": 24090
17257
+ },
17258
+ {
17259
+ "epoch": 0.6,
17260
+ "grad_norm": 4.67333459854126,
17261
+ "learning_rate": 4.000000000000001e-06,
17262
+ "loss": 1.3627,
17263
+ "step": 24100
17264
+ },
17265
+ {
17266
+ "epoch": 0.6,
17267
+ "grad_norm": 4.243159294128418,
17268
+ "learning_rate": 3.993220338983051e-06,
17269
+ "loss": 1.2065,
17270
+ "step": 24110
17271
+ },
17272
+ {
17273
+ "epoch": 0.6,
17274
+ "grad_norm": 4.570652484893799,
17275
+ "learning_rate": 3.986440677966102e-06,
17276
+ "loss": 1.4576,
17277
+ "step": 24120
17278
+ },
17279
+ {
17280
+ "epoch": 0.6,
17281
+ "grad_norm": 10.30574893951416,
17282
+ "learning_rate": 3.979661016949153e-06,
17283
+ "loss": 1.3798,
17284
+ "step": 24130
17285
+ },
17286
+ {
17287
+ "epoch": 0.6,
17288
+ "grad_norm": 1.7883845567703247,
17289
+ "learning_rate": 3.972881355932204e-06,
17290
+ "loss": 1.1474,
17291
+ "step": 24140
17292
+ },
17293
+ {
17294
+ "epoch": 0.6,
17295
+ "grad_norm": 2.429614305496216,
17296
+ "learning_rate": 3.966101694915255e-06,
17297
+ "loss": 1.3992,
17298
+ "step": 24150
17299
+ },
17300
+ {
17301
+ "epoch": 0.6,
17302
+ "grad_norm": 5.791226863861084,
17303
+ "learning_rate": 3.959322033898305e-06,
17304
+ "loss": 1.4164,
17305
+ "step": 24160
17306
+ },
17307
+ {
17308
+ "epoch": 0.6,
17309
+ "grad_norm": 6.212001800537109,
17310
+ "learning_rate": 3.952542372881356e-06,
17311
+ "loss": 1.4647,
17312
+ "step": 24170
17313
+ },
17314
+ {
17315
+ "epoch": 0.6,
17316
+ "grad_norm": 4.9569292068481445,
17317
+ "learning_rate": 3.945762711864407e-06,
17318
+ "loss": 1.2948,
17319
+ "step": 24180
17320
+ },
17321
+ {
17322
+ "epoch": 0.6,
17323
+ "grad_norm": 2.2119970321655273,
17324
+ "learning_rate": 3.938983050847458e-06,
17325
+ "loss": 1.3955,
17326
+ "step": 24190
17327
+ },
17328
+ {
17329
+ "epoch": 0.6,
17330
+ "grad_norm": 10.280770301818848,
17331
+ "learning_rate": 3.932203389830509e-06,
17332
+ "loss": 1.4461,
17333
+ "step": 24200
17334
+ },
17335
+ {
17336
+ "epoch": 0.61,
17337
+ "grad_norm": 3.701272487640381,
17338
+ "learning_rate": 3.925423728813559e-06,
17339
+ "loss": 1.348,
17340
+ "step": 24210
17341
+ },
17342
+ {
17343
+ "epoch": 0.61,
17344
+ "grad_norm": 8.827926635742188,
17345
+ "learning_rate": 3.918644067796611e-06,
17346
+ "loss": 1.3398,
17347
+ "step": 24220
17348
+ },
17349
+ {
17350
+ "epoch": 0.61,
17351
+ "grad_norm": 6.997286319732666,
17352
+ "learning_rate": 3.911864406779661e-06,
17353
+ "loss": 1.4724,
17354
+ "step": 24230
17355
+ },
17356
+ {
17357
+ "epoch": 0.61,
17358
+ "grad_norm": 5.5268449783325195,
17359
+ "learning_rate": 3.905084745762713e-06,
17360
+ "loss": 1.388,
17361
+ "step": 24240
17362
+ },
17363
+ {
17364
+ "epoch": 0.61,
17365
+ "grad_norm": 8.842992782592773,
17366
+ "learning_rate": 3.898305084745763e-06,
17367
+ "loss": 1.2382,
17368
+ "step": 24250
17369
+ },
17370
+ {
17371
+ "epoch": 0.61,
17372
+ "grad_norm": 11.24975872039795,
17373
+ "learning_rate": 3.891525423728814e-06,
17374
+ "loss": 1.2194,
17375
+ "step": 24260
17376
+ },
17377
+ {
17378
+ "epoch": 0.61,
17379
+ "grad_norm": 2.875722646713257,
17380
+ "learning_rate": 3.884745762711864e-06,
17381
+ "loss": 1.3792,
17382
+ "step": 24270
17383
+ },
17384
+ {
17385
+ "epoch": 0.61,
17386
+ "grad_norm": 8.459474563598633,
17387
+ "learning_rate": 3.877966101694916e-06,
17388
+ "loss": 1.3021,
17389
+ "step": 24280
17390
+ },
17391
+ {
17392
+ "epoch": 0.61,
17393
+ "grad_norm": 3.315873861312866,
17394
+ "learning_rate": 3.871186440677966e-06,
17395
+ "loss": 1.2976,
17396
+ "step": 24290
17397
+ },
17398
+ {
17399
+ "epoch": 0.61,
17400
+ "grad_norm": 6.280729293823242,
17401
+ "learning_rate": 3.864406779661018e-06,
17402
+ "loss": 1.3294,
17403
+ "step": 24300
17404
+ },
17405
+ {
17406
+ "epoch": 0.61,
17407
+ "grad_norm": 6.004711627960205,
17408
+ "learning_rate": 3.857627118644068e-06,
17409
+ "loss": 1.3178,
17410
+ "step": 24310
17411
+ },
17412
+ {
17413
+ "epoch": 0.61,
17414
+ "grad_norm": 8.207845687866211,
17415
+ "learning_rate": 3.850847457627119e-06,
17416
+ "loss": 1.424,
17417
+ "step": 24320
17418
+ },
17419
+ {
17420
+ "epoch": 0.61,
17421
+ "grad_norm": 8.01065444946289,
17422
+ "learning_rate": 3.844067796610169e-06,
17423
+ "loss": 1.2842,
17424
+ "step": 24330
17425
+ },
17426
+ {
17427
+ "epoch": 0.61,
17428
+ "grad_norm": 9.126721382141113,
17429
+ "learning_rate": 3.837288135593221e-06,
17430
+ "loss": 1.368,
17431
+ "step": 24340
17432
+ },
17433
+ {
17434
+ "epoch": 0.61,
17435
+ "grad_norm": 11.590188026428223,
17436
+ "learning_rate": 3.830508474576271e-06,
17437
+ "loss": 1.375,
17438
+ "step": 24350
17439
+ },
17440
+ {
17441
+ "epoch": 0.61,
17442
+ "grad_norm": 7.325139045715332,
17443
+ "learning_rate": 3.823728813559323e-06,
17444
+ "loss": 1.2593,
17445
+ "step": 24360
17446
+ },
17447
+ {
17448
+ "epoch": 0.61,
17449
+ "grad_norm": 6.3924760818481445,
17450
+ "learning_rate": 3.816949152542373e-06,
17451
+ "loss": 1.3238,
17452
+ "step": 24370
17453
+ },
17454
+ {
17455
+ "epoch": 0.61,
17456
+ "grad_norm": 5.093543529510498,
17457
+ "learning_rate": 3.8101694915254238e-06,
17458
+ "loss": 1.3398,
17459
+ "step": 24380
17460
+ },
17461
+ {
17462
+ "epoch": 0.61,
17463
+ "grad_norm": 4.488302707672119,
17464
+ "learning_rate": 3.8033898305084748e-06,
17465
+ "loss": 1.2532,
17466
+ "step": 24390
17467
+ },
17468
+ {
17469
+ "epoch": 0.61,
17470
+ "grad_norm": 3.5369062423706055,
17471
+ "learning_rate": 3.7966101694915257e-06,
17472
+ "loss": 1.1723,
17473
+ "step": 24400
17474
+ },
17475
+ {
17476
+ "epoch": 0.61,
17477
+ "grad_norm": 3.2012510299682617,
17478
+ "learning_rate": 3.7898305084745767e-06,
17479
+ "loss": 1.3348,
17480
+ "step": 24410
17481
+ },
17482
+ {
17483
+ "epoch": 0.61,
17484
+ "grad_norm": 1.5665017366409302,
17485
+ "learning_rate": 3.7830508474576273e-06,
17486
+ "loss": 1.4159,
17487
+ "step": 24420
17488
+ },
17489
+ {
17490
+ "epoch": 0.61,
17491
+ "grad_norm": 12.912787437438965,
17492
+ "learning_rate": 3.7762711864406782e-06,
17493
+ "loss": 1.2406,
17494
+ "step": 24430
17495
+ },
17496
+ {
17497
+ "epoch": 0.61,
17498
+ "grad_norm": 6.572142124176025,
17499
+ "learning_rate": 3.7694915254237292e-06,
17500
+ "loss": 1.1544,
17501
+ "step": 24440
17502
+ },
17503
+ {
17504
+ "epoch": 0.61,
17505
+ "grad_norm": 4.999161720275879,
17506
+ "learning_rate": 3.76271186440678e-06,
17507
+ "loss": 1.1744,
17508
+ "step": 24450
17509
+ },
17510
+ {
17511
+ "epoch": 0.61,
17512
+ "grad_norm": 3.322866439819336,
17513
+ "learning_rate": 3.755932203389831e-06,
17514
+ "loss": 1.3713,
17515
+ "step": 24460
17516
+ },
17517
+ {
17518
+ "epoch": 0.61,
17519
+ "grad_norm": 5.197652816772461,
17520
+ "learning_rate": 3.7491525423728813e-06,
17521
+ "loss": 1.1858,
17522
+ "step": 24470
17523
+ },
17524
+ {
17525
+ "epoch": 0.61,
17526
+ "grad_norm": 6.7361369132995605,
17527
+ "learning_rate": 3.7423728813559323e-06,
17528
+ "loss": 1.2778,
17529
+ "step": 24480
17530
+ },
17531
+ {
17532
+ "epoch": 0.61,
17533
+ "grad_norm": 1.1276848316192627,
17534
+ "learning_rate": 3.7355932203389833e-06,
17535
+ "loss": 1.3971,
17536
+ "step": 24490
17537
+ },
17538
+ {
17539
+ "epoch": 0.61,
17540
+ "grad_norm": 4.62593412399292,
17541
+ "learning_rate": 3.7288135593220342e-06,
17542
+ "loss": 1.479,
17543
+ "step": 24500
17544
+ },
17545
+ {
17546
+ "epoch": 0.61,
17547
+ "eval_loss": 1.3212531805038452,
17548
+ "eval_runtime": 66.1318,
17549
+ "eval_samples_per_second": 15.121,
17550
+ "eval_steps_per_second": 15.121,
17551
+ "step": 24500
17552
+ },
17553
+ {
17554
+ "epoch": 0.61,
17555
+ "grad_norm": 7.978855133056641,
17556
+ "learning_rate": 3.7220338983050852e-06,
17557
+ "loss": 1.1919,
17558
+ "step": 24510
17559
+ },
17560
+ {
17561
+ "epoch": 0.61,
17562
+ "grad_norm": 6.347212314605713,
17563
+ "learning_rate": 3.715254237288136e-06,
17564
+ "loss": 1.2493,
17565
+ "step": 24520
17566
+ },
17567
+ {
17568
+ "epoch": 0.61,
17569
+ "grad_norm": 6.2206573486328125,
17570
+ "learning_rate": 3.7084745762711867e-06,
17571
+ "loss": 1.2258,
17572
+ "step": 24530
17573
+ },
17574
+ {
17575
+ "epoch": 0.61,
17576
+ "grad_norm": 2.548797607421875,
17577
+ "learning_rate": 3.7016949152542377e-06,
17578
+ "loss": 1.262,
17579
+ "step": 24540
17580
+ },
17581
+ {
17582
+ "epoch": 0.61,
17583
+ "grad_norm": 9.5992431640625,
17584
+ "learning_rate": 3.6949152542372883e-06,
17585
+ "loss": 1.4298,
17586
+ "step": 24550
17587
+ },
17588
+ {
17589
+ "epoch": 0.61,
17590
+ "grad_norm": 2.236175298690796,
17591
+ "learning_rate": 3.6881355932203393e-06,
17592
+ "loss": 1.3466,
17593
+ "step": 24560
17594
+ },
17595
+ {
17596
+ "epoch": 0.61,
17597
+ "grad_norm": 7.134004592895508,
17598
+ "learning_rate": 3.6813559322033902e-06,
17599
+ "loss": 1.2642,
17600
+ "step": 24570
17601
+ },
17602
+ {
17603
+ "epoch": 0.61,
17604
+ "grad_norm": 12.453125,
17605
+ "learning_rate": 3.6745762711864408e-06,
17606
+ "loss": 1.369,
17607
+ "step": 24580
17608
+ },
17609
+ {
17610
+ "epoch": 0.61,
17611
+ "grad_norm": 16.243106842041016,
17612
+ "learning_rate": 3.6677966101694918e-06,
17613
+ "loss": 1.3269,
17614
+ "step": 24590
17615
+ },
17616
+ {
17617
+ "epoch": 0.61,
17618
+ "grad_norm": 11.406882286071777,
17619
+ "learning_rate": 3.6610169491525427e-06,
17620
+ "loss": 1.4339,
17621
+ "step": 24600
17622
+ },
17623
+ {
17624
+ "epoch": 0.62,
17625
+ "grad_norm": 6.334946632385254,
17626
+ "learning_rate": 3.6542372881355937e-06,
17627
+ "loss": 1.2445,
17628
+ "step": 24610
17629
+ },
17630
+ {
17631
+ "epoch": 0.62,
17632
+ "grad_norm": 9.451517105102539,
17633
+ "learning_rate": 3.6474576271186447e-06,
17634
+ "loss": 1.1902,
17635
+ "step": 24620
17636
+ },
17637
+ {
17638
+ "epoch": 0.62,
17639
+ "grad_norm": 1.9071747064590454,
17640
+ "learning_rate": 3.640677966101695e-06,
17641
+ "loss": 1.3273,
17642
+ "step": 24630
17643
+ },
17644
+ {
17645
+ "epoch": 0.62,
17646
+ "grad_norm": 4.054659843444824,
17647
+ "learning_rate": 3.633898305084746e-06,
17648
+ "loss": 1.2808,
17649
+ "step": 24640
17650
+ },
17651
+ {
17652
+ "epoch": 0.62,
17653
+ "grad_norm": 6.314877986907959,
17654
+ "learning_rate": 3.6271186440677968e-06,
17655
+ "loss": 1.4351,
17656
+ "step": 24650
17657
+ },
17658
+ {
17659
+ "epoch": 0.62,
17660
+ "grad_norm": 6.971933841705322,
17661
+ "learning_rate": 3.6203389830508478e-06,
17662
+ "loss": 1.2973,
17663
+ "step": 24660
17664
+ },
17665
+ {
17666
+ "epoch": 0.62,
17667
+ "grad_norm": 2.7064402103424072,
17668
+ "learning_rate": 3.6135593220338987e-06,
17669
+ "loss": 1.4335,
17670
+ "step": 24670
17671
+ },
17672
+ {
17673
+ "epoch": 0.62,
17674
+ "grad_norm": 5.698015213012695,
17675
+ "learning_rate": 3.6067796610169493e-06,
17676
+ "loss": 1.3042,
17677
+ "step": 24680
17678
+ },
17679
+ {
17680
+ "epoch": 0.62,
17681
+ "grad_norm": 5.647088050842285,
17682
+ "learning_rate": 3.6000000000000003e-06,
17683
+ "loss": 1.3832,
17684
+ "step": 24690
17685
+ },
17686
+ {
17687
+ "epoch": 0.62,
17688
+ "grad_norm": 10.945414543151855,
17689
+ "learning_rate": 3.5932203389830512e-06,
17690
+ "loss": 1.2363,
17691
+ "step": 24700
17692
+ },
17693
+ {
17694
+ "epoch": 0.62,
17695
+ "grad_norm": 7.516660213470459,
17696
+ "learning_rate": 3.5864406779661022e-06,
17697
+ "loss": 1.2615,
17698
+ "step": 24710
17699
+ },
17700
+ {
17701
+ "epoch": 0.62,
17702
+ "grad_norm": 3.6117703914642334,
17703
+ "learning_rate": 3.579661016949153e-06,
17704
+ "loss": 1.3533,
17705
+ "step": 24720
17706
+ },
17707
+ {
17708
+ "epoch": 0.62,
17709
+ "grad_norm": 10.817008972167969,
17710
+ "learning_rate": 3.5728813559322033e-06,
17711
+ "loss": 1.3425,
17712
+ "step": 24730
17713
+ },
17714
+ {
17715
+ "epoch": 0.62,
17716
+ "grad_norm": 7.7072858810424805,
17717
+ "learning_rate": 3.5661016949152543e-06,
17718
+ "loss": 1.2762,
17719
+ "step": 24740
17720
+ },
17721
+ {
17722
+ "epoch": 0.62,
17723
+ "grad_norm": 5.9887495040893555,
17724
+ "learning_rate": 3.5593220338983053e-06,
17725
+ "loss": 1.3516,
17726
+ "step": 24750
17727
+ },
17728
+ {
17729
+ "epoch": 0.62,
17730
+ "grad_norm": 3.4481067657470703,
17731
+ "learning_rate": 3.5525423728813563e-06,
17732
+ "loss": 1.0325,
17733
+ "step": 24760
17734
+ },
17735
+ {
17736
+ "epoch": 0.62,
17737
+ "grad_norm": 5.2485551834106445,
17738
+ "learning_rate": 3.5457627118644072e-06,
17739
+ "loss": 1.4088,
17740
+ "step": 24770
17741
+ },
17742
+ {
17743
+ "epoch": 0.62,
17744
+ "grad_norm": 6.970777988433838,
17745
+ "learning_rate": 3.538983050847458e-06,
17746
+ "loss": 1.4962,
17747
+ "step": 24780
17748
+ },
17749
+ {
17750
+ "epoch": 0.62,
17751
+ "grad_norm": 6.037806034088135,
17752
+ "learning_rate": 3.5322033898305088e-06,
17753
+ "loss": 1.3806,
17754
+ "step": 24790
17755
+ },
17756
+ {
17757
+ "epoch": 0.62,
17758
+ "grad_norm": 4.5726518630981445,
17759
+ "learning_rate": 3.5254237288135597e-06,
17760
+ "loss": 1.3086,
17761
+ "step": 24800
17762
+ },
17763
+ {
17764
+ "epoch": 0.62,
17765
+ "grad_norm": 4.184850692749023,
17766
+ "learning_rate": 3.5186440677966103e-06,
17767
+ "loss": 1.4004,
17768
+ "step": 24810
17769
+ },
17770
+ {
17771
+ "epoch": 0.62,
17772
+ "grad_norm": 9.181009292602539,
17773
+ "learning_rate": 3.5118644067796613e-06,
17774
+ "loss": 1.4348,
17775
+ "step": 24820
17776
+ },
17777
+ {
17778
+ "epoch": 0.62,
17779
+ "grad_norm": 5.124319076538086,
17780
+ "learning_rate": 3.5050847457627122e-06,
17781
+ "loss": 1.2272,
17782
+ "step": 24830
17783
+ },
17784
+ {
17785
+ "epoch": 0.62,
17786
+ "grad_norm": 7.30942440032959,
17787
+ "learning_rate": 3.498305084745763e-06,
17788
+ "loss": 1.3819,
17789
+ "step": 24840
17790
+ },
17791
+ {
17792
+ "epoch": 0.62,
17793
+ "grad_norm": 7.727287769317627,
17794
+ "learning_rate": 3.4915254237288138e-06,
17795
+ "loss": 1.1956,
17796
+ "step": 24850
17797
+ },
17798
+ {
17799
+ "epoch": 0.62,
17800
+ "grad_norm": 5.2934112548828125,
17801
+ "learning_rate": 3.4847457627118648e-06,
17802
+ "loss": 1.3141,
17803
+ "step": 24860
17804
+ },
17805
+ {
17806
+ "epoch": 0.62,
17807
+ "grad_norm": 7.7370500564575195,
17808
+ "learning_rate": 3.4779661016949157e-06,
17809
+ "loss": 1.2418,
17810
+ "step": 24870
17811
+ },
17812
+ {
17813
+ "epoch": 0.62,
17814
+ "grad_norm": 5.379338264465332,
17815
+ "learning_rate": 3.4711864406779667e-06,
17816
+ "loss": 1.435,
17817
+ "step": 24880
17818
+ },
17819
+ {
17820
+ "epoch": 0.62,
17821
+ "grad_norm": 5.592279434204102,
17822
+ "learning_rate": 3.464406779661017e-06,
17823
+ "loss": 1.324,
17824
+ "step": 24890
17825
+ },
17826
+ {
17827
+ "epoch": 0.62,
17828
+ "grad_norm": 4.178751468658447,
17829
+ "learning_rate": 3.457627118644068e-06,
17830
+ "loss": 1.4095,
17831
+ "step": 24900
17832
+ },
17833
+ {
17834
+ "epoch": 0.62,
17835
+ "grad_norm": 2.7852957248687744,
17836
+ "learning_rate": 3.450847457627119e-06,
17837
+ "loss": 1.1834,
17838
+ "step": 24910
17839
+ },
17840
+ {
17841
+ "epoch": 0.62,
17842
+ "grad_norm": 12.001542091369629,
17843
+ "learning_rate": 3.4440677966101698e-06,
17844
+ "loss": 1.3294,
17845
+ "step": 24920
17846
+ },
17847
+ {
17848
+ "epoch": 0.62,
17849
+ "grad_norm": 4.010140895843506,
17850
+ "learning_rate": 3.4372881355932207e-06,
17851
+ "loss": 1.4615,
17852
+ "step": 24930
17853
+ },
17854
+ {
17855
+ "epoch": 0.62,
17856
+ "grad_norm": 2.7130627632141113,
17857
+ "learning_rate": 3.4305084745762713e-06,
17858
+ "loss": 1.4789,
17859
+ "step": 24940
17860
+ },
17861
+ {
17862
+ "epoch": 0.62,
17863
+ "grad_norm": 5.209987640380859,
17864
+ "learning_rate": 3.4237288135593223e-06,
17865
+ "loss": 1.2389,
17866
+ "step": 24950
17867
+ },
17868
+ {
17869
+ "epoch": 0.62,
17870
+ "grad_norm": 4.1047515869140625,
17871
+ "learning_rate": 3.4169491525423733e-06,
17872
+ "loss": 1.3449,
17873
+ "step": 24960
17874
+ },
17875
+ {
17876
+ "epoch": 0.62,
17877
+ "grad_norm": 19.238649368286133,
17878
+ "learning_rate": 3.4101694915254242e-06,
17879
+ "loss": 1.3224,
17880
+ "step": 24970
17881
+ },
17882
+ {
17883
+ "epoch": 0.62,
17884
+ "grad_norm": 11.792010307312012,
17885
+ "learning_rate": 3.403389830508475e-06,
17886
+ "loss": 1.1061,
17887
+ "step": 24980
17888
+ },
17889
+ {
17890
+ "epoch": 0.62,
17891
+ "grad_norm": 10.934020042419434,
17892
+ "learning_rate": 3.3966101694915253e-06,
17893
+ "loss": 1.4179,
17894
+ "step": 24990
17895
+ },
17896
+ {
17897
+ "epoch": 0.62,
17898
+ "grad_norm": 1.5776662826538086,
17899
+ "learning_rate": 3.3898305084745763e-06,
17900
+ "loss": 1.3254,
17901
+ "step": 25000
17902
+ },
17903
+ {
17904
+ "epoch": 0.62,
17905
+ "eval_loss": 1.3409814834594727,
17906
+ "eval_runtime": 66.1173,
17907
+ "eval_samples_per_second": 15.125,
17908
+ "eval_steps_per_second": 15.125,
17909
+ "step": 25000
17910
  }
17911
  ],
17912
  "logging_steps": 10,
 
17914
  "num_input_tokens_seen": 0,
17915
  "num_train_epochs": 1,
17916
  "save_steps": 2500,
17917
+ "total_flos": 4.025531498496e+17,
17918
  "train_batch_size": 1,
17919
  "trial_name": null,
17920
  "trial_params": null