jdannem6 committed on
Commit
f2d2a49
1 Parent(s): ca7f9e4

Uploaded checkpoint-22500

Files changed (5)
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1795 -5
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:140c22cb100bb7fb3b9f92ae13ec5bb2bfcde7ed82d7e4434fc5a235f98cb24e
+ oid sha256:5e72ed832ca82b47560da4b70a43bfb85762b41d54a4b1df89cee4b8816cb6fc
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f49faa425cc20765775b424c81e8f5599e3725a2dc79226d42d68c4573812cfe
+ oid sha256:19756b617f2bf91f55b9e8c9b87ec2279b8dca12dd91f8f9e92a075a7d6745b9
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d70b07077c15c8bd127eaf0a24ba45e81ca7ce6ae410b7a625f50c345ec6eb1f
+ oid sha256:da5d678c8111a8bb6e7f07c6d826c3d293cb2dc841c1a7d8cdada1cef59bd3c9
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e7dc694a733ff91b79c5eaf7bcfe8aa41771c4ef8a47d325d2a9e9f6bc78f946
+ oid sha256:2ebf87c3777a880efd4523ce05af816d67a6a12edb3e1d54f156890382c1db41
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.348677158355713,
3
- "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-20000",
4
- "epoch": 0.5,
5
  "eval_steps": 500,
6
- "global_step": 20000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -14327,6 +14327,1796 @@
14327
  "eval_samples_per_second": 15.124,
14328
  "eval_steps_per_second": 15.124,
14329
  "step": 20000
14330
  }
14331
  ],
14332
  "logging_steps": 10,
@@ -14334,7 +16124,7 @@
14334
  "num_input_tokens_seen": 0,
14335
  "num_train_epochs": 1,
14336
  "save_steps": 2500,
14337
- "total_flos": 3.2204251987968e+17,
14338
  "train_batch_size": 1,
14339
  "trial_name": null,
14340
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.3439137935638428,
3
+ "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-22500",
4
+ "epoch": 0.5625,
5
  "eval_steps": 500,
6
+ "global_step": 22500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
14327
  "eval_samples_per_second": 15.124,
14328
  "eval_steps_per_second": 15.124,
14329
  "step": 20000
14330
+ },
14331
+ {
14332
+ "epoch": 0.5,
14333
+ "grad_norm": 2.159421920776367,
14334
+ "learning_rate": 6.772881355932204e-06,
14335
+ "loss": 1.4193,
14336
+ "step": 20010
14337
+ },
14338
+ {
14339
+ "epoch": 0.5,
14340
+ "grad_norm": 7.60541296005249,
14341
+ "learning_rate": 6.766101694915255e-06,
14342
+ "loss": 1.4747,
14343
+ "step": 20020
14344
+ },
14345
+ {
14346
+ "epoch": 0.5,
14347
+ "grad_norm": 2.0991897583007812,
14348
+ "learning_rate": 6.759322033898306e-06,
14349
+ "loss": 1.2775,
14350
+ "step": 20030
14351
+ },
14352
+ {
14353
+ "epoch": 0.5,
14354
+ "grad_norm": 7.756378173828125,
14355
+ "learning_rate": 6.7525423728813565e-06,
14356
+ "loss": 1.343,
14357
+ "step": 20040
14358
+ },
14359
+ {
14360
+ "epoch": 0.5,
14361
+ "grad_norm": 7.062932014465332,
14362
+ "learning_rate": 6.745762711864408e-06,
14363
+ "loss": 1.5168,
14364
+ "step": 20050
14365
+ },
14366
+ {
14367
+ "epoch": 0.5,
14368
+ "grad_norm": 6.689007759094238,
14369
+ "learning_rate": 6.7389830508474585e-06,
14370
+ "loss": 1.2894,
14371
+ "step": 20060
14372
+ },
14373
+ {
14374
+ "epoch": 0.5,
14375
+ "grad_norm": 5.31237268447876,
14376
+ "learning_rate": 6.73220338983051e-06,
14377
+ "loss": 1.3382,
14378
+ "step": 20070
14379
+ },
14380
+ {
14381
+ "epoch": 0.5,
14382
+ "grad_norm": 3.6598782539367676,
14383
+ "learning_rate": 6.7254237288135604e-06,
14384
+ "loss": 1.3839,
14385
+ "step": 20080
14386
+ },
14387
+ {
14388
+ "epoch": 0.5,
14389
+ "grad_norm": 3.3091630935668945,
14390
+ "learning_rate": 6.71864406779661e-06,
14391
+ "loss": 1.3654,
14392
+ "step": 20090
14393
+ },
14394
+ {
14395
+ "epoch": 0.5,
14396
+ "grad_norm": 2.1399312019348145,
14397
+ "learning_rate": 6.7118644067796615e-06,
14398
+ "loss": 1.5429,
14399
+ "step": 20100
14400
+ },
14401
+ {
14402
+ "epoch": 0.5,
14403
+ "grad_norm": 4.380695819854736,
14404
+ "learning_rate": 6.705084745762712e-06,
14405
+ "loss": 1.4629,
14406
+ "step": 20110
14407
+ },
14408
+ {
14409
+ "epoch": 0.5,
14410
+ "grad_norm": 9.330241203308105,
14411
+ "learning_rate": 6.6983050847457635e-06,
14412
+ "loss": 1.3547,
14413
+ "step": 20120
14414
+ },
14415
+ {
14416
+ "epoch": 0.5,
14417
+ "grad_norm": 5.803046226501465,
14418
+ "learning_rate": 6.691525423728814e-06,
14419
+ "loss": 1.3534,
14420
+ "step": 20130
14421
+ },
14422
+ {
14423
+ "epoch": 0.5,
14424
+ "grad_norm": 4.967430591583252,
14425
+ "learning_rate": 6.6847457627118655e-06,
14426
+ "loss": 1.4965,
14427
+ "step": 20140
14428
+ },
14429
+ {
14430
+ "epoch": 0.5,
14431
+ "grad_norm": 8.131784439086914,
14432
+ "learning_rate": 6.677966101694916e-06,
14433
+ "loss": 1.4012,
14434
+ "step": 20150
14435
+ },
14436
+ {
14437
+ "epoch": 0.5,
14438
+ "grad_norm": 9.220160484313965,
14439
+ "learning_rate": 6.6711864406779666e-06,
14440
+ "loss": 1.3637,
14441
+ "step": 20160
14442
+ },
14443
+ {
14444
+ "epoch": 0.5,
14445
+ "grad_norm": 6.833899974822998,
14446
+ "learning_rate": 6.664406779661018e-06,
14447
+ "loss": 1.3748,
14448
+ "step": 20170
14449
+ },
14450
+ {
14451
+ "epoch": 0.5,
14452
+ "grad_norm": 13.869956970214844,
14453
+ "learning_rate": 6.6576271186440685e-06,
14454
+ "loss": 1.4474,
14455
+ "step": 20180
14456
+ },
14457
+ {
14458
+ "epoch": 0.5,
14459
+ "grad_norm": 9.462739944458008,
14460
+ "learning_rate": 6.650847457627119e-06,
14461
+ "loss": 1.3893,
14462
+ "step": 20190
14463
+ },
14464
+ {
14465
+ "epoch": 0.51,
14466
+ "grad_norm": 2.321714162826538,
14467
+ "learning_rate": 6.64406779661017e-06,
14468
+ "loss": 1.3836,
14469
+ "step": 20200
14470
+ },
14471
+ {
14472
+ "epoch": 0.51,
14473
+ "grad_norm": 11.65925407409668,
14474
+ "learning_rate": 6.637288135593221e-06,
14475
+ "loss": 1.493,
14476
+ "step": 20210
14477
+ },
14478
+ {
14479
+ "epoch": 0.51,
14480
+ "grad_norm": 6.808838367462158,
14481
+ "learning_rate": 6.6305084745762716e-06,
14482
+ "loss": 1.3632,
14483
+ "step": 20220
14484
+ },
14485
+ {
14486
+ "epoch": 0.51,
14487
+ "grad_norm": 4.534874439239502,
14488
+ "learning_rate": 6.623728813559322e-06,
14489
+ "loss": 1.4303,
14490
+ "step": 20230
14491
+ },
14492
+ {
14493
+ "epoch": 0.51,
14494
+ "grad_norm": 1.9837802648544312,
14495
+ "learning_rate": 6.6169491525423735e-06,
14496
+ "loss": 1.446,
14497
+ "step": 20240
14498
+ },
14499
+ {
14500
+ "epoch": 0.51,
14501
+ "grad_norm": 7.779874324798584,
14502
+ "learning_rate": 6.610169491525424e-06,
14503
+ "loss": 1.4566,
14504
+ "step": 20250
14505
+ },
14506
+ {
14507
+ "epoch": 0.51,
14508
+ "grad_norm": 5.729377746582031,
14509
+ "learning_rate": 6.6033898305084755e-06,
14510
+ "loss": 1.401,
14511
+ "step": 20260
14512
+ },
14513
+ {
14514
+ "epoch": 0.51,
14515
+ "grad_norm": 12.633749008178711,
14516
+ "learning_rate": 6.596610169491526e-06,
14517
+ "loss": 1.2969,
14518
+ "step": 20270
14519
+ },
14520
+ {
14521
+ "epoch": 0.51,
14522
+ "grad_norm": 2.7961552143096924,
14523
+ "learning_rate": 6.5898305084745774e-06,
14524
+ "loss": 1.4151,
14525
+ "step": 20280
14526
+ },
14527
+ {
14528
+ "epoch": 0.51,
14529
+ "grad_norm": 8.461606979370117,
14530
+ "learning_rate": 6.583050847457627e-06,
14531
+ "loss": 1.2665,
14532
+ "step": 20290
14533
+ },
14534
+ {
14535
+ "epoch": 0.51,
14536
+ "grad_norm": 10.325592041015625,
14537
+ "learning_rate": 6.576271186440678e-06,
14538
+ "loss": 1.3666,
14539
+ "step": 20300
14540
+ },
14541
+ {
14542
+ "epoch": 0.51,
14543
+ "grad_norm": 3.0818538665771484,
14544
+ "learning_rate": 6.569491525423729e-06,
14545
+ "loss": 1.4307,
14546
+ "step": 20310
14547
+ },
14548
+ {
14549
+ "epoch": 0.51,
14550
+ "grad_norm": 7.449318885803223,
14551
+ "learning_rate": 6.56271186440678e-06,
14552
+ "loss": 1.4585,
14553
+ "step": 20320
14554
+ },
14555
+ {
14556
+ "epoch": 0.51,
14557
+ "grad_norm": 4.6587042808532715,
14558
+ "learning_rate": 6.555932203389831e-06,
14559
+ "loss": 1.4053,
14560
+ "step": 20330
14561
+ },
14562
+ {
14563
+ "epoch": 0.51,
14564
+ "grad_norm": 5.837299346923828,
14565
+ "learning_rate": 6.549152542372882e-06,
14566
+ "loss": 1.3456,
14567
+ "step": 20340
14568
+ },
14569
+ {
14570
+ "epoch": 0.51,
14571
+ "grad_norm": 7.345305442810059,
14572
+ "learning_rate": 6.542372881355933e-06,
14573
+ "loss": 1.3618,
14574
+ "step": 20350
14575
+ },
14576
+ {
14577
+ "epoch": 0.51,
14578
+ "grad_norm": 5.679592609405518,
14579
+ "learning_rate": 6.5355932203389836e-06,
14580
+ "loss": 1.3875,
14581
+ "step": 20360
14582
+ },
14583
+ {
14584
+ "epoch": 0.51,
14585
+ "grad_norm": 1.062429428100586,
14586
+ "learning_rate": 6.528813559322035e-06,
14587
+ "loss": 1.3083,
14588
+ "step": 20370
14589
+ },
14590
+ {
14591
+ "epoch": 0.51,
14592
+ "grad_norm": 5.2157769203186035,
14593
+ "learning_rate": 6.5220338983050855e-06,
14594
+ "loss": 1.3582,
14595
+ "step": 20380
14596
+ },
14597
+ {
14598
+ "epoch": 0.51,
14599
+ "grad_norm": 4.890625476837158,
14600
+ "learning_rate": 6.515254237288137e-06,
14601
+ "loss": 1.3184,
14602
+ "step": 20390
14603
+ },
14604
+ {
14605
+ "epoch": 0.51,
14606
+ "grad_norm": 1.587312936782837,
14607
+ "learning_rate": 6.508474576271187e-06,
14608
+ "loss": 1.2191,
14609
+ "step": 20400
14610
+ },
14611
+ {
14612
+ "epoch": 0.51,
14613
+ "grad_norm": 4.013046741485596,
14614
+ "learning_rate": 6.501694915254237e-06,
14615
+ "loss": 1.3331,
14616
+ "step": 20410
14617
+ },
14618
+ {
14619
+ "epoch": 0.51,
14620
+ "grad_norm": 4.076744556427002,
14621
+ "learning_rate": 6.4949152542372886e-06,
14622
+ "loss": 1.4351,
14623
+ "step": 20420
14624
+ },
14625
+ {
14626
+ "epoch": 0.51,
14627
+ "grad_norm": 6.493119239807129,
14628
+ "learning_rate": 6.488135593220339e-06,
14629
+ "loss": 1.3519,
14630
+ "step": 20430
14631
+ },
14632
+ {
14633
+ "epoch": 0.51,
14634
+ "grad_norm": 6.127922058105469,
14635
+ "learning_rate": 6.4813559322033905e-06,
14636
+ "loss": 1.4255,
14637
+ "step": 20440
14638
+ },
14639
+ {
14640
+ "epoch": 0.51,
14641
+ "grad_norm": 4.424916744232178,
14642
+ "learning_rate": 6.474576271186441e-06,
14643
+ "loss": 1.44,
14644
+ "step": 20450
14645
+ },
14646
+ {
14647
+ "epoch": 0.51,
14648
+ "grad_norm": 9.894845008850098,
14649
+ "learning_rate": 6.4677966101694925e-06,
14650
+ "loss": 1.2922,
14651
+ "step": 20460
14652
+ },
14653
+ {
14654
+ "epoch": 0.51,
14655
+ "grad_norm": 5.174190521240234,
14656
+ "learning_rate": 6.461016949152543e-06,
14657
+ "loss": 1.439,
14658
+ "step": 20470
14659
+ },
14660
+ {
14661
+ "epoch": 0.51,
14662
+ "grad_norm": 6.313706398010254,
14663
+ "learning_rate": 6.4542372881355944e-06,
14664
+ "loss": 1.3229,
14665
+ "step": 20480
14666
+ },
14667
+ {
14668
+ "epoch": 0.51,
14669
+ "grad_norm": 3.240434169769287,
14670
+ "learning_rate": 6.447457627118645e-06,
14671
+ "loss": 1.3911,
14672
+ "step": 20490
14673
+ },
14674
+ {
14675
+ "epoch": 0.51,
14676
+ "grad_norm": 7.466948986053467,
14677
+ "learning_rate": 6.440677966101695e-06,
14678
+ "loss": 1.4467,
14679
+ "step": 20500
14680
+ },
14681
+ {
14682
+ "epoch": 0.51,
14683
+ "eval_loss": 1.3339221477508545,
14684
+ "eval_runtime": 66.152,
14685
+ "eval_samples_per_second": 15.117,
14686
+ "eval_steps_per_second": 15.117,
14687
+ "step": 20500
14688
+ },
14689
+ {
14690
+ "epoch": 0.51,
14691
+ "grad_norm": 32.95411682128906,
14692
+ "learning_rate": 6.433898305084746e-06,
14693
+ "loss": 1.2821,
14694
+ "step": 20510
14695
+ },
14696
+ {
14697
+ "epoch": 0.51,
14698
+ "grad_norm": 11.001514434814453,
14699
+ "learning_rate": 6.427118644067797e-06,
14700
+ "loss": 1.3523,
14701
+ "step": 20520
14702
+ },
14703
+ {
14704
+ "epoch": 0.51,
14705
+ "grad_norm": 1.6923043727874756,
14706
+ "learning_rate": 6.420338983050848e-06,
14707
+ "loss": 1.3424,
14708
+ "step": 20530
14709
+ },
14710
+ {
14711
+ "epoch": 0.51,
14712
+ "grad_norm": 6.267253398895264,
14713
+ "learning_rate": 6.413559322033899e-06,
14714
+ "loss": 1.2163,
14715
+ "step": 20540
14716
+ },
14717
+ {
14718
+ "epoch": 0.51,
14719
+ "grad_norm": 15.797646522521973,
14720
+ "learning_rate": 6.40677966101695e-06,
14721
+ "loss": 1.2104,
14722
+ "step": 20550
14723
+ },
14724
+ {
14725
+ "epoch": 0.51,
14726
+ "grad_norm": 3.1398627758026123,
14727
+ "learning_rate": 6.4000000000000006e-06,
14728
+ "loss": 1.2264,
14729
+ "step": 20560
14730
+ },
14731
+ {
14732
+ "epoch": 0.51,
14733
+ "grad_norm": 1.7001848220825195,
14734
+ "learning_rate": 6.393220338983052e-06,
14735
+ "loss": 1.345,
14736
+ "step": 20570
14737
+ },
14738
+ {
14739
+ "epoch": 0.51,
14740
+ "grad_norm": 6.551231861114502,
14741
+ "learning_rate": 6.3864406779661025e-06,
14742
+ "loss": 1.295,
14743
+ "step": 20580
14744
+ },
14745
+ {
14746
+ "epoch": 0.51,
14747
+ "grad_norm": 2.501774787902832,
14748
+ "learning_rate": 6.379661016949154e-06,
14749
+ "loss": 1.2727,
14750
+ "step": 20590
14751
+ },
14752
+ {
14753
+ "epoch": 0.52,
14754
+ "grad_norm": 5.2971906661987305,
14755
+ "learning_rate": 6.372881355932204e-06,
14756
+ "loss": 1.1617,
14757
+ "step": 20600
14758
+ },
14759
+ {
14760
+ "epoch": 0.52,
14761
+ "grad_norm": 8.737128257751465,
14762
+ "learning_rate": 6.366101694915254e-06,
14763
+ "loss": 1.3792,
14764
+ "step": 20610
14765
+ },
14766
+ {
14767
+ "epoch": 0.52,
14768
+ "grad_norm": 1.8445500135421753,
14769
+ "learning_rate": 6.3593220338983056e-06,
14770
+ "loss": 1.2818,
14771
+ "step": 20620
14772
+ },
14773
+ {
14774
+ "epoch": 0.52,
14775
+ "grad_norm": 6.586302280426025,
14776
+ "learning_rate": 6.352542372881356e-06,
14777
+ "loss": 1.471,
14778
+ "step": 20630
14779
+ },
14780
+ {
14781
+ "epoch": 0.52,
14782
+ "grad_norm": 2.3581125736236572,
14783
+ "learning_rate": 6.3457627118644075e-06,
14784
+ "loss": 1.26,
14785
+ "step": 20640
14786
+ },
14787
+ {
14788
+ "epoch": 0.52,
14789
+ "grad_norm": 3.006883382797241,
14790
+ "learning_rate": 6.338983050847458e-06,
14791
+ "loss": 1.4312,
14792
+ "step": 20650
14793
+ },
14794
+ {
14795
+ "epoch": 0.52,
14796
+ "grad_norm": 1.8936930894851685,
14797
+ "learning_rate": 6.3322033898305095e-06,
14798
+ "loss": 1.3705,
14799
+ "step": 20660
14800
+ },
14801
+ {
14802
+ "epoch": 0.52,
14803
+ "grad_norm": 5.22953462600708,
14804
+ "learning_rate": 6.32542372881356e-06,
14805
+ "loss": 1.3252,
14806
+ "step": 20670
14807
+ },
14808
+ {
14809
+ "epoch": 0.52,
14810
+ "grad_norm": 7.669064044952393,
14811
+ "learning_rate": 6.318644067796611e-06,
14812
+ "loss": 1.4673,
14813
+ "step": 20680
14814
+ },
14815
+ {
14816
+ "epoch": 0.52,
14817
+ "grad_norm": 3.7508702278137207,
14818
+ "learning_rate": 6.311864406779662e-06,
14819
+ "loss": 1.3465,
14820
+ "step": 20690
14821
+ },
14822
+ {
14823
+ "epoch": 0.52,
14824
+ "grad_norm": 7.959824085235596,
14825
+ "learning_rate": 6.3050847457627125e-06,
14826
+ "loss": 1.4148,
14827
+ "step": 20700
14828
+ },
14829
+ {
14830
+ "epoch": 0.52,
14831
+ "grad_norm": 8.888174057006836,
14832
+ "learning_rate": 6.298305084745763e-06,
14833
+ "loss": 1.4874,
14834
+ "step": 20710
14835
+ },
14836
+ {
14837
+ "epoch": 0.52,
14838
+ "grad_norm": 11.385110855102539,
14839
+ "learning_rate": 6.291525423728814e-06,
14840
+ "loss": 1.3519,
14841
+ "step": 20720
14842
+ },
14843
+ {
14844
+ "epoch": 0.52,
14845
+ "grad_norm": 11.636566162109375,
14846
+ "learning_rate": 6.284745762711865e-06,
14847
+ "loss": 1.3237,
14848
+ "step": 20730
14849
+ },
14850
+ {
14851
+ "epoch": 0.52,
14852
+ "grad_norm": 2.6630842685699463,
14853
+ "learning_rate": 6.277966101694916e-06,
14854
+ "loss": 1.5615,
14855
+ "step": 20740
14856
+ },
14857
+ {
14858
+ "epoch": 0.52,
14859
+ "grad_norm": 15.339405059814453,
14860
+ "learning_rate": 6.271186440677966e-06,
14861
+ "loss": 1.0971,
14862
+ "step": 20750
14863
+ },
14864
+ {
14865
+ "epoch": 0.52,
14866
+ "grad_norm": 5.97685432434082,
14867
+ "learning_rate": 6.2644067796610176e-06,
14868
+ "loss": 1.2582,
14869
+ "step": 20760
14870
+ },
14871
+ {
14872
+ "epoch": 0.52,
14873
+ "grad_norm": 5.218130111694336,
14874
+ "learning_rate": 6.257627118644068e-06,
14875
+ "loss": 1.3272,
14876
+ "step": 20770
14877
+ },
14878
+ {
14879
+ "epoch": 0.52,
14880
+ "grad_norm": 8.943811416625977,
14881
+ "learning_rate": 6.2508474576271195e-06,
14882
+ "loss": 1.4001,
14883
+ "step": 20780
14884
+ },
14885
+ {
14886
+ "epoch": 0.52,
14887
+ "grad_norm": 3.288783550262451,
14888
+ "learning_rate": 6.24406779661017e-06,
14889
+ "loss": 1.3729,
14890
+ "step": 20790
14891
+ },
14892
+ {
14893
+ "epoch": 0.52,
14894
+ "grad_norm": 2.7321832180023193,
14895
+ "learning_rate": 6.2372881355932215e-06,
14896
+ "loss": 1.4365,
14897
+ "step": 20800
14898
+ },
14899
+ {
14900
+ "epoch": 0.52,
14901
+ "grad_norm": 7.656303405761719,
14902
+ "learning_rate": 6.230508474576271e-06,
14903
+ "loss": 1.3646,
14904
+ "step": 20810
14905
+ },
14906
+ {
14907
+ "epoch": 0.52,
14908
+ "grad_norm": 3.2849278450012207,
14909
+ "learning_rate": 6.223728813559322e-06,
14910
+ "loss": 1.0755,
14911
+ "step": 20820
14912
+ },
14913
+ {
14914
+ "epoch": 0.52,
14915
+ "grad_norm": 3.6145408153533936,
14916
+ "learning_rate": 6.216949152542373e-06,
14917
+ "loss": 1.2836,
14918
+ "step": 20830
14919
+ },
14920
+ {
14921
+ "epoch": 0.52,
14922
+ "grad_norm": 6.668073654174805,
14923
+ "learning_rate": 6.210169491525424e-06,
14924
+ "loss": 1.3706,
14925
+ "step": 20840
14926
+ },
14927
+ {
14928
+ "epoch": 0.52,
14929
+ "grad_norm": 10.408007621765137,
14930
+ "learning_rate": 6.203389830508475e-06,
14931
+ "loss": 1.5154,
14932
+ "step": 20850
14933
+ },
14934
+ {
14935
+ "epoch": 0.52,
14936
+ "grad_norm": 2.708711862564087,
14937
+ "learning_rate": 6.196610169491526e-06,
14938
+ "loss": 1.3841,
14939
+ "step": 20860
14940
+ },
14941
+ {
14942
+ "epoch": 0.52,
14943
+ "grad_norm": 1.7991631031036377,
14944
+ "learning_rate": 6.189830508474577e-06,
14945
+ "loss": 1.261,
14946
+ "step": 20870
14947
+ },
14948
+ {
14949
+ "epoch": 0.52,
14950
+ "grad_norm": 6.492557048797607,
14951
+ "learning_rate": 6.183050847457628e-06,
14952
+ "loss": 1.469,
14953
+ "step": 20880
14954
+ },
14955
+ {
14956
+ "epoch": 0.52,
14957
+ "grad_norm": 3.5670902729034424,
14958
+ "learning_rate": 6.176271186440679e-06,
14959
+ "loss": 1.4484,
14960
+ "step": 20890
14961
+ },
14962
+ {
14963
+ "epoch": 0.52,
14964
+ "grad_norm": 3.4511330127716064,
14965
+ "learning_rate": 6.1694915254237295e-06,
14966
+ "loss": 1.3566,
14967
+ "step": 20900
14968
+ },
14969
+ {
14970
+ "epoch": 0.52,
14971
+ "grad_norm": 7.958609580993652,
14972
+ "learning_rate": 6.162711864406781e-06,
14973
+ "loss": 1.3272,
14974
+ "step": 20910
14975
+ },
14976
+ {
14977
+ "epoch": 0.52,
14978
+ "grad_norm": 6.594333648681641,
14979
+ "learning_rate": 6.155932203389831e-06,
14980
+ "loss": 1.3926,
14981
+ "step": 20920
14982
+ },
14983
+ {
14984
+ "epoch": 0.52,
14985
+ "grad_norm": 7.329288005828857,
14986
+ "learning_rate": 6.149152542372881e-06,
14987
+ "loss": 1.4948,
14988
+ "step": 20930
14989
+ },
14990
+ {
14991
+ "epoch": 0.52,
14992
+ "grad_norm": 10.120712280273438,
14993
+ "learning_rate": 6.142372881355933e-06,
14994
+ "loss": 1.5394,
14995
+ "step": 20940
14996
+ },
14997
+ {
14998
+ "epoch": 0.52,
14999
+ "grad_norm": 4.3365983963012695,
15000
+ "learning_rate": 6.135593220338983e-06,
15001
+ "loss": 1.2156,
15002
+ "step": 20950
15003
+ },
15004
+ {
15005
+ "epoch": 0.52,
15006
+ "grad_norm": 2.0522942543029785,
15007
+ "learning_rate": 6.1288135593220346e-06,
15008
+ "loss": 1.3206,
15009
+ "step": 20960
15010
+ },
15011
+ {
15012
+ "epoch": 0.52,
15013
+ "grad_norm": 5.730597496032715,
15014
+ "learning_rate": 6.122033898305085e-06,
15015
+ "loss": 1.4396,
15016
+ "step": 20970
15017
+ },
15018
+ {
15019
+ "epoch": 0.52,
15020
+ "grad_norm": 13.125938415527344,
15021
+ "learning_rate": 6.1152542372881365e-06,
15022
+ "loss": 1.1969,
15023
+ "step": 20980
15024
+ },
15025
+ {
15026
+ "epoch": 0.52,
15027
+ "grad_norm": 3.8410260677337646,
15028
+ "learning_rate": 6.108474576271187e-06,
15029
+ "loss": 1.3842,
15030
+ "step": 20990
15031
+ },
15032
+ {
15033
+ "epoch": 0.53,
15034
+ "grad_norm": 3.415696144104004,
15035
+ "learning_rate": 6.1016949152542385e-06,
15036
+ "loss": 1.3296,
15037
+ "step": 21000
15038
+ },
15039
+ {
15040
+ "epoch": 0.53,
15041
+ "eval_loss": 1.344160556793213,
15042
+ "eval_runtime": 66.1512,
15043
+ "eval_samples_per_second": 15.117,
15044
+ "eval_steps_per_second": 15.117,
15045
+ "step": 21000
15046
+ },
15047
+ {
15048
+ "epoch": 0.53,
15049
+ "grad_norm": 9.95438003540039,
15050
+ "learning_rate": 6.094915254237289e-06,
15051
+ "loss": 1.3206,
15052
+ "step": 21010
15053
+ },
15054
+ {
15055
+ "epoch": 0.53,
15056
+ "grad_norm": 10.410721778869629,
15057
+ "learning_rate": 6.088135593220339e-06,
15058
+ "loss": 1.4014,
15059
+ "step": 21020
15060
+ },
15061
+ {
15062
+ "epoch": 0.53,
15063
+ "grad_norm": 9.159972190856934,
15064
+ "learning_rate": 6.08135593220339e-06,
15065
+ "loss": 1.3827,
15066
+ "step": 21030
15067
+ },
15068
+ {
15069
+ "epoch": 0.53,
15070
+ "grad_norm": 4.856491565704346,
15071
+ "learning_rate": 6.074576271186441e-06,
15072
+ "loss": 1.4561,
15073
+ "step": 21040
15074
+ },
15075
+ {
15076
+ "epoch": 0.53,
15077
+ "grad_norm": 5.863302707672119,
15078
+ "learning_rate": 6.067796610169492e-06,
15079
+ "loss": 1.3225,
15080
+ "step": 21050
15081
+ },
15082
+ {
15083
+ "epoch": 0.53,
15084
+ "grad_norm": 2.968809127807617,
15085
+ "learning_rate": 6.061016949152543e-06,
15086
+ "loss": 1.3713,
15087
+ "step": 21060
15088
+ },
15089
+ {
15090
+ "epoch": 0.53,
15091
+ "grad_norm": 4.19352388381958,
15092
+ "learning_rate": 6.054237288135594e-06,
15093
+ "loss": 1.3894,
15094
+ "step": 21070
15095
+ },
15096
+ {
15097
+ "epoch": 0.53,
15098
+ "grad_norm": 4.841989040374756,
15099
+ "learning_rate": 6.047457627118645e-06,
15100
+ "loss": 1.3609,
15101
+ "step": 21080
15102
+ },
15103
+ {
15104
+ "epoch": 0.53,
15105
+ "grad_norm": 8.693398475646973,
15106
+ "learning_rate": 6.040677966101696e-06,
15107
+ "loss": 1.4118,
15108
+ "step": 21090
15109
+ },
15110
+ {
15111
+ "epoch": 0.53,
15112
+ "grad_norm": 10.902780532836914,
15113
+ "learning_rate": 6.0338983050847465e-06,
15114
+ "loss": 1.4529,
15115
+ "step": 21100
15116
+ },
15117
+ {
15118
+ "epoch": 0.53,
15119
+ "grad_norm": 5.1729607582092285,
15120
+ "learning_rate": 6.027118644067798e-06,
15121
+ "loss": 1.3302,
15122
+ "step": 21110
15123
+ },
15124
+ {
15125
+ "epoch": 0.53,
15126
+ "grad_norm": 1.9906094074249268,
15127
+ "learning_rate": 6.020338983050848e-06,
15128
+ "loss": 1.2294,
15129
+ "step": 21120
15130
+ },
15131
+ {
15132
+ "epoch": 0.53,
15133
+ "grad_norm": 3.285928964614868,
15134
+ "learning_rate": 6.013559322033898e-06,
15135
+ "loss": 1.5006,
15136
+ "step": 21130
15137
+ },
15138
+ {
15139
+ "epoch": 0.53,
15140
+ "grad_norm": 13.098603248596191,
15141
+ "learning_rate": 6.00677966101695e-06,
15142
+ "loss": 1.425,
15143
+ "step": 21140
15144
+ },
15145
+ {
15146
+ "epoch": 0.53,
15147
+ "grad_norm": 4.391754150390625,
15148
+ "learning_rate": 6e-06,
15149
+ "loss": 1.2668,
15150
+ "step": 21150
15151
+ },
15152
+ {
15153
+ "epoch": 0.53,
15154
+ "grad_norm": 8.409893035888672,
15155
+ "learning_rate": 5.9932203389830516e-06,
15156
+ "loss": 1.2061,
15157
+ "step": 21160
15158
+ },
15159
+ {
15160
+ "epoch": 0.53,
15161
+ "grad_norm": 10.568397521972656,
15162
+ "learning_rate": 5.986440677966102e-06,
15163
+ "loss": 1.4191,
15164
+ "step": 21170
15165
+ },
15166
+ {
15167
+ "epoch": 0.53,
15168
+ "grad_norm": 7.371358394622803,
15169
+ "learning_rate": 5.9796610169491535e-06,
15170
+ "loss": 1.2942,
15171
+ "step": 21180
15172
+ },
15173
+ {
15174
+ "epoch": 0.53,
15175
+ "grad_norm": 2.6429450511932373,
15176
+ "learning_rate": 5.972881355932204e-06,
15177
+ "loss": 1.4216,
15178
+ "step": 21190
15179
+ },
15180
+ {
15181
+ "epoch": 0.53,
15182
+ "grad_norm": 5.85234260559082,
15183
+ "learning_rate": 5.9661016949152555e-06,
15184
+ "loss": 1.3541,
15185
+ "step": 21200
15186
+ },
15187
+ {
15188
+ "epoch": 0.53,
15189
+ "grad_norm": 3.2903425693511963,
15190
+ "learning_rate": 5.959322033898306e-06,
15191
+ "loss": 1.2969,
15192
+ "step": 21210
15193
+ },
15194
+ {
15195
+ "epoch": 0.53,
15196
+ "grad_norm": 3.9652106761932373,
15197
+ "learning_rate": 5.9525423728813566e-06,
15198
+ "loss": 1.2853,
15199
+ "step": 21220
15200
+ },
15201
+ {
15202
+ "epoch": 0.53,
15203
+ "grad_norm": 3.8703153133392334,
15204
+ "learning_rate": 5.945762711864407e-06,
15205
+ "loss": 1.3094,
15206
+ "step": 21230
15207
+ },
15208
+ {
15209
+ "epoch": 0.53,
15210
+ "grad_norm": 8.018983840942383,
15211
+ "learning_rate": 5.938983050847458e-06,
15212
+ "loss": 1.4798,
15213
+ "step": 21240
15214
+ },
15215
+ {
15216
+ "epoch": 0.53,
15217
+ "grad_norm": 2.752399206161499,
15218
+ "learning_rate": 5.932203389830509e-06,
15219
+ "loss": 1.4141,
15220
+ "step": 21250
15221
+ },
15222
+ {
15223
+ "epoch": 0.53,
15224
+ "grad_norm": 5.574487686157227,
15225
+ "learning_rate": 5.92542372881356e-06,
15226
+ "loss": 1.2121,
15227
+ "step": 21260
15228
+ },
15229
+ {
15230
+ "epoch": 0.53,
15231
+ "grad_norm": 10.352456092834473,
15232
+ "learning_rate": 5.91864406779661e-06,
15233
+ "loss": 1.3264,
15234
+ "step": 21270
15235
+ },
15236
+ {
15237
+ "epoch": 0.53,
15238
+ "grad_norm": 4.182732582092285,
15239
+ "learning_rate": 5.911864406779662e-06,
15240
+ "loss": 1.4334,
15241
+ "step": 21280
15242
+ },
15243
+ {
15244
+ "epoch": 0.53,
15245
+ "grad_norm": 5.310421943664551,
15246
+ "learning_rate": 5.905084745762712e-06,
15247
+ "loss": 1.5641,
15248
+ "step": 21290
15249
+ },
15250
+ {
15251
+ "epoch": 0.53,
15252
+ "grad_norm": 5.489622592926025,
15253
+ "learning_rate": 5.8983050847457635e-06,
15254
+ "loss": 1.4699,
15255
+ "step": 21300
15256
+ },
15257
+ {
15258
+ "epoch": 0.53,
15259
+ "grad_norm": 6.644534111022949,
15260
+ "learning_rate": 5.891525423728814e-06,
15261
+ "loss": 1.3273,
15262
+ "step": 21310
15263
+ },
15264
+ {
15265
+ "epoch": 0.53,
15266
+ "grad_norm": 13.480459213256836,
15267
+ "learning_rate": 5.8847457627118655e-06,
15268
+ "loss": 1.2957,
15269
+ "step": 21320
15270
+ },
15271
+ {
15272
+ "epoch": 0.53,
15273
+ "grad_norm": 9.34183120727539,
15274
+ "learning_rate": 5.877966101694915e-06,
15275
+ "loss": 1.4145,
15276
+ "step": 21330
15277
+ },
15278
+ {
15279
+ "epoch": 0.53,
15280
+ "grad_norm": 4.792973518371582,
15281
+ "learning_rate": 5.871186440677966e-06,
15282
+ "loss": 1.4912,
15283
+ "step": 21340
15284
+ },
15285
+ {
15286
+ "epoch": 0.53,
15287
+ "grad_norm": 4.098564147949219,
15288
+ "learning_rate": 5.864406779661017e-06,
15289
+ "loss": 1.1895,
15290
+ "step": 21350
15291
+ },
15292
+ {
15293
+ "epoch": 0.53,
15294
+ "grad_norm": 7.23917293548584,
15295
+ "learning_rate": 5.857627118644068e-06,
15296
+ "loss": 1.3875,
15297
+ "step": 21360
15298
+ },
15299
+ {
15300
+ "epoch": 0.53,
15301
+ "grad_norm": 4.86613130569458,
15302
+ "learning_rate": 5.850847457627119e-06,
15303
+ "loss": 1.4827,
15304
+ "step": 21370
15305
+ },
15306
+ {
15307
+ "epoch": 0.53,
15308
+ "grad_norm": 7.629755973815918,
15309
+ "learning_rate": 5.84406779661017e-06,
15310
+ "loss": 1.2189,
15311
+ "step": 21380
15312
+ },
15313
+ {
15314
+ "epoch": 0.53,
15315
+ "grad_norm": 3.80531907081604,
15316
+ "learning_rate": 5.837288135593221e-06,
15317
+ "loss": 1.4064,
15318
+ "step": 21390
15319
+ },
15320
+ {
15321
+ "epoch": 0.54,
15322
+ "grad_norm": 3.432089328765869,
15323
+ "learning_rate": 5.830508474576272e-06,
15324
+ "loss": 1.1929,
15325
+ "step": 21400
15326
+ },
15327
+ {
15328
+ "epoch": 0.54,
15329
+ "grad_norm": 9.766077995300293,
15330
+ "learning_rate": 5.823728813559323e-06,
15331
+ "loss": 1.4525,
15332
+ "step": 21410
15333
+ },
15334
+ {
15335
+ "epoch": 0.54,
15336
+ "grad_norm": 4.745760440826416,
15337
+ "learning_rate": 5.8169491525423736e-06,
15338
+ "loss": 1.4924,
15339
+ "step": 21420
15340
+ },
15341
+ {
15342
+ "epoch": 0.54,
15343
+ "grad_norm": 5.188168525695801,
15344
+ "learning_rate": 5.810169491525425e-06,
15345
+ "loss": 1.27,
15346
+ "step": 21430
15347
+ },
15348
+ {
15349
+ "epoch": 0.54,
15350
+ "grad_norm": 4.576213359832764,
15351
+ "learning_rate": 5.803389830508475e-06,
15352
+ "loss": 1.3165,
15353
+ "step": 21440
15354
+ },
15355
+ {
15356
+ "epoch": 0.54,
15357
+ "grad_norm": 10.540860176086426,
15358
+ "learning_rate": 5.796610169491525e-06,
15359
+ "loss": 1.2652,
15360
+ "step": 21450
15361
+ },
15362
+ {
15363
+ "epoch": 0.54,
15364
+ "grad_norm": 4.210390567779541,
15365
+ "learning_rate": 5.789830508474577e-06,
15366
+ "loss": 1.4372,
15367
+ "step": 21460
15368
+ },
15369
+ {
15370
+ "epoch": 0.54,
15371
+ "grad_norm": 8.733638763427734,
15372
+ "learning_rate": 5.783050847457627e-06,
15373
+ "loss": 1.2691,
15374
+ "step": 21470
15375
+ },
15376
+ {
15377
+ "epoch": 0.54,
15378
+ "grad_norm": 4.997326374053955,
15379
+ "learning_rate": 5.776271186440679e-06,
15380
+ "loss": 1.1454,
15381
+ "step": 21480
15382
+ },
15383
+ {
15384
+ "epoch": 0.54,
15385
+ "grad_norm": 10.108692169189453,
15386
+ "learning_rate": 5.769491525423729e-06,
15387
+ "loss": 1.2982,
15388
+ "step": 21490
15389
+ },
15390
+ {
15391
+ "epoch": 0.54,
15392
+ "grad_norm": 13.393025398254395,
15393
+ "learning_rate": 5.7627118644067805e-06,
15394
+ "loss": 1.4995,
15395
+ "step": 21500
15396
+ },
15397
+ {
15398
+ "epoch": 0.54,
15399
+ "eval_loss": 1.3565526008605957,
15400
+ "eval_runtime": 66.1537,
15401
+ "eval_samples_per_second": 15.116,
15402
+ "eval_steps_per_second": 15.116,
15403
+ "step": 21500
15404
+ },
15405
+ {
15406
+ "epoch": 0.54,
15407
+ "grad_norm": 5.675364017486572,
15408
+ "learning_rate": 5.755932203389831e-06,
15409
+ "loss": 1.2256,
15410
+ "step": 21510
15411
+ },
15412
+ {
15413
+ "epoch": 0.54,
15414
+ "grad_norm": 2.069751262664795,
15415
+ "learning_rate": 5.7491525423728825e-06,
15416
+ "loss": 1.2355,
15417
+ "step": 21520
15418
+ },
15419
+ {
15420
+ "epoch": 0.54,
15421
+ "grad_norm": 4.536093711853027,
15422
+ "learning_rate": 5.742372881355933e-06,
15423
+ "loss": 1.1957,
15424
+ "step": 21530
15425
+ },
15426
+ {
15427
+ "epoch": 0.54,
15428
+ "grad_norm": 2.5274765491485596,
15429
+ "learning_rate": 5.735593220338983e-06,
15430
+ "loss": 1.1843,
15431
+ "step": 21540
15432
+ },
15433
+ {
15434
+ "epoch": 0.54,
15435
+ "grad_norm": 4.418458461761475,
15436
+ "learning_rate": 5.728813559322034e-06,
15437
+ "loss": 1.1994,
15438
+ "step": 21550
15439
+ },
15440
+ {
15441
+ "epoch": 0.54,
15442
+ "grad_norm": 13.488496780395508,
15443
+ "learning_rate": 5.722033898305085e-06,
15444
+ "loss": 1.4173,
15445
+ "step": 21560
15446
+ },
15447
+ {
15448
+ "epoch": 0.54,
15449
+ "grad_norm": 5.223592758178711,
15450
+ "learning_rate": 5.715254237288136e-06,
15451
+ "loss": 1.3032,
15452
+ "step": 21570
15453
+ },
15454
+ {
15455
+ "epoch": 0.54,
15456
+ "grad_norm": 5.894464492797852,
15457
+ "learning_rate": 5.708474576271187e-06,
15458
+ "loss": 1.3027,
15459
+ "step": 21580
15460
+ },
15461
+ {
15462
+ "epoch": 0.54,
15463
+ "grad_norm": 6.945793151855469,
15464
+ "learning_rate": 5.701694915254238e-06,
15465
+ "loss": 1.3102,
15466
+ "step": 21590
15467
+ },
15468
+ {
15469
+ "epoch": 0.54,
15470
+ "grad_norm": 7.386875629425049,
15471
+ "learning_rate": 5.694915254237289e-06,
15472
+ "loss": 1.3643,
15473
+ "step": 21600
15474
+ },
15475
+ {
15476
+ "epoch": 0.54,
15477
+ "grad_norm": 3.5999162197113037,
15478
+ "learning_rate": 5.68813559322034e-06,
15479
+ "loss": 1.2456,
15480
+ "step": 21610
15481
+ },
15482
+ {
15483
+ "epoch": 0.54,
15484
+ "grad_norm": 5.8258490562438965,
15485
+ "learning_rate": 5.6813559322033906e-06,
15486
+ "loss": 1.1922,
15487
+ "step": 21620
15488
+ },
15489
+ {
15490
+ "epoch": 0.54,
15491
+ "grad_norm": 10.920169830322266,
15492
+ "learning_rate": 5.674576271186442e-06,
15493
+ "loss": 1.5312,
15494
+ "step": 21630
15495
+ },
15496
+ {
15497
+ "epoch": 0.54,
15498
+ "grad_norm": 3.898834705352783,
15499
+ "learning_rate": 5.667796610169492e-06,
15500
+ "loss": 1.5012,
15501
+ "step": 21640
15502
+ },
15503
+ {
15504
+ "epoch": 0.54,
15505
+ "grad_norm": 6.2130866050720215,
15506
+ "learning_rate": 5.661016949152542e-06,
15507
+ "loss": 1.543,
15508
+ "step": 21650
15509
+ },
15510
+ {
15511
+ "epoch": 0.54,
15512
+ "grad_norm": 3.604144811630249,
15513
+ "learning_rate": 5.654237288135594e-06,
15514
+ "loss": 1.4586,
15515
+ "step": 21660
15516
+ },
15517
+ {
15518
+ "epoch": 0.54,
15519
+ "grad_norm": 4.859696388244629,
15520
+ "learning_rate": 5.647457627118644e-06,
15521
+ "loss": 1.3056,
15522
+ "step": 21670
15523
+ },
15524
+ {
15525
+ "epoch": 0.54,
15526
+ "grad_norm": 2.9596614837646484,
15527
+ "learning_rate": 5.640677966101696e-06,
15528
+ "loss": 1.3888,
15529
+ "step": 21680
15530
+ },
15531
+ {
15532
+ "epoch": 0.54,
15533
+ "grad_norm": 5.489665985107422,
15534
+ "learning_rate": 5.633898305084746e-06,
15535
+ "loss": 1.2491,
15536
+ "step": 21690
15537
+ },
15538
+ {
15539
+ "epoch": 0.54,
15540
+ "grad_norm": 5.618114948272705,
15541
+ "learning_rate": 5.6271186440677975e-06,
15542
+ "loss": 1.2746,
15543
+ "step": 21700
15544
+ },
15545
+ {
15546
+ "epoch": 0.54,
15547
+ "grad_norm": 2.145024061203003,
15548
+ "learning_rate": 5.620338983050848e-06,
15549
+ "loss": 1.4036,
15550
+ "step": 21710
15551
+ },
15552
+ {
15553
+ "epoch": 0.54,
15554
+ "grad_norm": 2.4870400428771973,
15555
+ "learning_rate": 5.6135593220338995e-06,
15556
+ "loss": 1.4355,
15557
+ "step": 21720
15558
+ },
15559
+ {
15560
+ "epoch": 0.54,
15561
+ "grad_norm": 2.576144218444824,
15562
+ "learning_rate": 5.60677966101695e-06,
15563
+ "loss": 1.2988,
15564
+ "step": 21730
15565
+ },
15566
+ {
15567
+ "epoch": 0.54,
15568
+ "grad_norm": 5.971595764160156,
15569
+ "learning_rate": 5.600000000000001e-06,
15570
+ "loss": 1.3201,
15571
+ "step": 21740
15572
+ },
15573
+ {
15574
+ "epoch": 0.54,
15575
+ "grad_norm": 7.581085205078125,
15576
+ "learning_rate": 5.593220338983051e-06,
15577
+ "loss": 1.3358,
15578
+ "step": 21750
15579
+ },
15580
+ {
15581
+ "epoch": 0.54,
15582
+ "grad_norm": 4.148537635803223,
15583
+ "learning_rate": 5.586440677966102e-06,
15584
+ "loss": 1.2464,
15585
+ "step": 21760
15586
+ },
15587
+ {
15588
+ "epoch": 0.54,
15589
+ "grad_norm": 6.613537788391113,
15590
+ "learning_rate": 5.579661016949153e-06,
15591
+ "loss": 1.3156,
15592
+ "step": 21770
15593
+ },
15594
+ {
15595
+ "epoch": 0.54,
15596
+ "grad_norm": 10.526129722595215,
15597
+ "learning_rate": 5.572881355932204e-06,
15598
+ "loss": 1.2483,
15599
+ "step": 21780
15600
+ },
15601
+ {
15602
+ "epoch": 0.54,
15603
+ "grad_norm": 7.221047401428223,
15604
+ "learning_rate": 5.566101694915255e-06,
15605
+ "loss": 1.5878,
15606
+ "step": 21790
15607
+ },
15608
+ {
15609
+ "epoch": 0.55,
15610
+ "grad_norm": 6.365529537200928,
15611
+ "learning_rate": 5.559322033898306e-06,
15612
+ "loss": 1.4117,
15613
+ "step": 21800
15614
+ },
15615
+ {
15616
+ "epoch": 0.55,
15617
+ "grad_norm": 3.8305916786193848,
15618
+ "learning_rate": 5.552542372881356e-06,
15619
+ "loss": 1.1894,
15620
+ "step": 21810
15621
+ },
15622
+ {
15623
+ "epoch": 0.55,
15624
+ "grad_norm": 3.672477960586548,
15625
+ "learning_rate": 5.5457627118644076e-06,
15626
+ "loss": 1.2199,
15627
+ "step": 21820
15628
+ },
15629
+ {
15630
+ "epoch": 0.55,
15631
+ "grad_norm": 2.586512565612793,
15632
+ "learning_rate": 5.538983050847458e-06,
15633
+ "loss": 1.392,
15634
+ "step": 21830
15635
+ },
15636
+ {
15637
+ "epoch": 0.55,
15638
+ "grad_norm": 4.2184624671936035,
15639
+ "learning_rate": 5.5322033898305095e-06,
15640
+ "loss": 1.3008,
15641
+ "step": 21840
15642
+ },
15643
+ {
15644
+ "epoch": 0.55,
15645
+ "grad_norm": 7.834671974182129,
15646
+ "learning_rate": 5.525423728813559e-06,
15647
+ "loss": 1.1794,
15648
+ "step": 21850
15649
+ },
15650
+ {
15651
+ "epoch": 0.55,
15652
+ "grad_norm": 3.5877692699432373,
15653
+ "learning_rate": 5.518644067796611e-06,
15654
+ "loss": 1.4339,
15655
+ "step": 21860
15656
+ },
15657
+ {
15658
+ "epoch": 0.55,
15659
+ "grad_norm": 3.0174179077148438,
15660
+ "learning_rate": 5.511864406779661e-06,
15661
+ "loss": 1.2704,
15662
+ "step": 21870
15663
+ },
15664
+ {
15665
+ "epoch": 0.55,
15666
+ "grad_norm": 2.9889588356018066,
15667
+ "learning_rate": 5.505084745762712e-06,
15668
+ "loss": 1.3588,
15669
+ "step": 21880
15670
+ },
15671
+ {
15672
+ "epoch": 0.55,
15673
+ "grad_norm": 10.810959815979004,
15674
+ "learning_rate": 5.498305084745763e-06,
15675
+ "loss": 1.3539,
15676
+ "step": 21890
15677
+ },
15678
+ {
15679
+ "epoch": 0.55,
15680
+ "grad_norm": 5.771850109100342,
15681
+ "learning_rate": 5.491525423728814e-06,
15682
+ "loss": 1.4278,
15683
+ "step": 21900
15684
+ },
15685
+ {
15686
+ "epoch": 0.55,
15687
+ "grad_norm": 4.13969612121582,
15688
+ "learning_rate": 5.484745762711865e-06,
15689
+ "loss": 1.2798,
15690
+ "step": 21910
15691
+ },
15692
+ {
15693
+ "epoch": 0.55,
15694
+ "grad_norm": 15.295929908752441,
15695
+ "learning_rate": 5.477966101694916e-06,
15696
+ "loss": 1.5213,
15697
+ "step": 21920
15698
+ },
15699
+ {
15700
+ "epoch": 0.55,
15701
+ "grad_norm": 6.445948600769043,
15702
+ "learning_rate": 5.471186440677967e-06,
15703
+ "loss": 1.3498,
15704
+ "step": 21930
15705
+ },
15706
+ {
15707
+ "epoch": 0.55,
15708
+ "grad_norm": 9.28097152709961,
15709
+ "learning_rate": 5.464406779661018e-06,
15710
+ "loss": 1.3046,
15711
+ "step": 21940
15712
+ },
15713
+ {
15714
+ "epoch": 0.55,
15715
+ "grad_norm": 6.094447135925293,
15716
+ "learning_rate": 5.457627118644067e-06,
15717
+ "loss": 1.4336,
15718
+ "step": 21950
15719
+ },
15720
+ {
15721
+ "epoch": 0.55,
15722
+ "grad_norm": 9.818504333496094,
15723
+ "learning_rate": 5.450847457627119e-06,
15724
+ "loss": 1.516,
15725
+ "step": 21960
15726
+ },
15727
+ {
15728
+ "epoch": 0.55,
15729
+ "grad_norm": 11.956009864807129,
15730
+ "learning_rate": 5.444067796610169e-06,
15731
+ "loss": 1.3435,
15732
+ "step": 21970
15733
+ },
15734
+ {
15735
+ "epoch": 0.55,
15736
+ "grad_norm": 7.544681072235107,
15737
+ "learning_rate": 5.437288135593221e-06,
15738
+ "loss": 1.4625,
15739
+ "step": 21980
15740
+ },
15741
+ {
15742
+ "epoch": 0.55,
15743
+ "grad_norm": 3.394897222518921,
15744
+ "learning_rate": 5.430508474576271e-06,
15745
+ "loss": 1.4565,
15746
+ "step": 21990
15747
+ },
15748
+ {
15749
+ "epoch": 0.55,
15750
+ "grad_norm": 5.722468852996826,
15751
+ "learning_rate": 5.423728813559323e-06,
15752
+ "loss": 1.2133,
15753
+ "step": 22000
15754
+ },
15755
+ {
15756
+ "epoch": 0.55,
15757
+ "eval_loss": 1.3403723239898682,
15758
+ "eval_runtime": 66.1471,
15759
+ "eval_samples_per_second": 15.118,
15760
+ "eval_steps_per_second": 15.118,
15761
+ "step": 22000
15762
+ },
15763
+ {
15764
+ "epoch": 0.55,
15765
+ "grad_norm": 7.046730995178223,
15766
+ "learning_rate": 5.416949152542373e-06,
15767
+ "loss": 1.1353,
15768
+ "step": 22010
15769
+ },
15770
+ {
15771
+ "epoch": 0.55,
15772
+ "grad_norm": 7.013365268707275,
15773
+ "learning_rate": 5.4101694915254246e-06,
15774
+ "loss": 1.4333,
15775
+ "step": 22020
15776
+ },
15777
+ {
15778
+ "epoch": 0.55,
15779
+ "grad_norm": 3.9469892978668213,
15780
+ "learning_rate": 5.403389830508475e-06,
15781
+ "loss": 1.3601,
15782
+ "step": 22030
15783
+ },
15784
+ {
15785
+ "epoch": 0.55,
15786
+ "grad_norm": 3.740983486175537,
15787
+ "learning_rate": 5.3966101694915265e-06,
15788
+ "loss": 1.0232,
15789
+ "step": 22040
15790
+ },
15791
+ {
15792
+ "epoch": 0.55,
15793
+ "grad_norm": 4.604060649871826,
15794
+ "learning_rate": 5.389830508474577e-06,
15795
+ "loss": 1.4011,
15796
+ "step": 22050
15797
+ },
15798
+ {
15799
+ "epoch": 0.55,
15800
+ "grad_norm": 0.5601249933242798,
15801
+ "learning_rate": 5.383050847457627e-06,
15802
+ "loss": 1.2068,
15803
+ "step": 22060
15804
+ },
15805
+ {
15806
+ "epoch": 0.55,
15807
+ "grad_norm": 2.7781484127044678,
15808
+ "learning_rate": 5.376271186440678e-06,
15809
+ "loss": 1.3976,
15810
+ "step": 22070
15811
+ },
15812
+ {
15813
+ "epoch": 0.55,
15814
+ "grad_norm": 10.417901992797852,
15815
+ "learning_rate": 5.369491525423729e-06,
15816
+ "loss": 1.2444,
15817
+ "step": 22080
15818
+ },
15819
+ {
15820
+ "epoch": 0.55,
15821
+ "grad_norm": 8.263280868530273,
15822
+ "learning_rate": 5.36271186440678e-06,
15823
+ "loss": 1.0937,
15824
+ "step": 22090
15825
+ },
15826
+ {
15827
+ "epoch": 0.55,
15828
+ "grad_norm": 6.128343105316162,
15829
+ "learning_rate": 5.355932203389831e-06,
15830
+ "loss": 1.2891,
15831
+ "step": 22100
15832
+ },
15833
+ {
15834
+ "epoch": 0.55,
15835
+ "grad_norm": 4.099800109863281,
15836
+ "learning_rate": 5.349152542372882e-06,
15837
+ "loss": 1.117,
15838
+ "step": 22110
15839
+ },
15840
+ {
15841
+ "epoch": 0.55,
15842
+ "grad_norm": 7.756937026977539,
15843
+ "learning_rate": 5.342372881355933e-06,
15844
+ "loss": 1.3112,
15845
+ "step": 22120
15846
+ },
15847
+ {
15848
+ "epoch": 0.55,
15849
+ "grad_norm": 5.372160911560059,
15850
+ "learning_rate": 5.335593220338984e-06,
15851
+ "loss": 1.5276,
15852
+ "step": 22130
15853
+ },
15854
+ {
15855
+ "epoch": 0.55,
15856
+ "grad_norm": 5.017634391784668,
15857
+ "learning_rate": 5.328813559322035e-06,
15858
+ "loss": 1.2825,
15859
+ "step": 22140
15860
+ },
15861
+ {
15862
+ "epoch": 0.55,
15863
+ "grad_norm": 5.629271030426025,
15864
+ "learning_rate": 5.322033898305086e-06,
15865
+ "loss": 1.2428,
15866
+ "step": 22150
15867
+ },
15868
+ {
15869
+ "epoch": 0.55,
15870
+ "grad_norm": 6.938544273376465,
15871
+ "learning_rate": 5.315254237288136e-06,
15872
+ "loss": 1.4563,
15873
+ "step": 22160
15874
+ },
15875
+ {
15876
+ "epoch": 0.55,
15877
+ "grad_norm": 13.804441452026367,
15878
+ "learning_rate": 5.308474576271186e-06,
15879
+ "loss": 1.332,
15880
+ "step": 22170
15881
+ },
15882
+ {
15883
+ "epoch": 0.55,
15884
+ "grad_norm": 10.347596168518066,
15885
+ "learning_rate": 5.301694915254238e-06,
15886
+ "loss": 1.3879,
15887
+ "step": 22180
15888
+ },
15889
+ {
15890
+ "epoch": 0.55,
15891
+ "grad_norm": 2.613632917404175,
15892
+ "learning_rate": 5.294915254237288e-06,
15893
+ "loss": 1.3298,
15894
+ "step": 22190
15895
+ },
15896
+ {
15897
+ "epoch": 0.56,
15898
+ "grad_norm": 14.637787818908691,
15899
+ "learning_rate": 5.28813559322034e-06,
15900
+ "loss": 1.3301,
15901
+ "step": 22200
15902
+ },
15903
+ {
15904
+ "epoch": 0.56,
15905
+ "grad_norm": 2.5796003341674805,
15906
+ "learning_rate": 5.28135593220339e-06,
15907
+ "loss": 1.41,
15908
+ "step": 22210
15909
+ },
15910
+ {
15911
+ "epoch": 0.56,
15912
+ "grad_norm": 5.326439380645752,
15913
+ "learning_rate": 5.2745762711864416e-06,
15914
+ "loss": 1.2483,
15915
+ "step": 22220
15916
+ },
15917
+ {
15918
+ "epoch": 0.56,
15919
+ "grad_norm": 8.928110122680664,
15920
+ "learning_rate": 5.267796610169492e-06,
15921
+ "loss": 1.2566,
15922
+ "step": 22230
15923
+ },
15924
+ {
15925
+ "epoch": 0.56,
15926
+ "grad_norm": 9.285192489624023,
15927
+ "learning_rate": 5.2610169491525435e-06,
15928
+ "loss": 1.2982,
15929
+ "step": 22240
15930
+ },
15931
+ {
15932
+ "epoch": 0.56,
15933
+ "grad_norm": 2.101649045944214,
15934
+ "learning_rate": 5.254237288135594e-06,
15935
+ "loss": 1.3181,
15936
+ "step": 22250
15937
+ },
15938
+ {
15939
+ "epoch": 0.56,
15940
+ "grad_norm": 5.57994270324707,
15941
+ "learning_rate": 5.247457627118645e-06,
15942
+ "loss": 1.3745,
15943
+ "step": 22260
15944
+ },
15945
+ {
15946
+ "epoch": 0.56,
15947
+ "grad_norm": 12.0460844039917,
15948
+ "learning_rate": 5.240677966101695e-06,
15949
+ "loss": 1.202,
15950
+ "step": 22270
15951
+ },
15952
+ {
15953
+ "epoch": 0.56,
15954
+ "grad_norm": 5.367649078369141,
15955
+ "learning_rate": 5.233898305084746e-06,
15956
+ "loss": 1.2969,
15957
+ "step": 22280
15958
+ },
15959
+ {
15960
+ "epoch": 0.56,
15961
+ "grad_norm": 4.784932613372803,
15962
+ "learning_rate": 5.227118644067797e-06,
15963
+ "loss": 1.363,
15964
+ "step": 22290
15965
+ },
15966
+ {
15967
+ "epoch": 0.56,
15968
+ "grad_norm": 8.103744506835938,
15969
+ "learning_rate": 5.220338983050848e-06,
15970
+ "loss": 1.4144,
15971
+ "step": 22300
15972
+ },
15973
+ {
15974
+ "epoch": 0.56,
15975
+ "grad_norm": 5.561450958251953,
15976
+ "learning_rate": 5.213559322033899e-06,
15977
+ "loss": 1.3018,
15978
+ "step": 22310
15979
+ },
15980
+ {
15981
+ "epoch": 0.56,
15982
+ "grad_norm": 8.294450759887695,
15983
+ "learning_rate": 5.20677966101695e-06,
15984
+ "loss": 1.3509,
15985
+ "step": 22320
15986
+ },
15987
+ {
15988
+ "epoch": 0.56,
15989
+ "grad_norm": 8.498016357421875,
15990
+ "learning_rate": 5.2e-06,
15991
+ "loss": 1.4932,
15992
+ "step": 22330
15993
+ },
15994
+ {
15995
+ "epoch": 0.56,
15996
+ "grad_norm": 3.3640925884246826,
15997
+ "learning_rate": 5.193220338983052e-06,
15998
+ "loss": 1.4413,
15999
+ "step": 22340
16000
+ },
16001
+ {
16002
+ "epoch": 0.56,
16003
+ "grad_norm": 3.236924886703491,
16004
+ "learning_rate": 5.186440677966102e-06,
16005
+ "loss": 1.2535,
16006
+ "step": 22350
16007
+ },
16008
+ {
16009
+ "epoch": 0.56,
16010
+ "grad_norm": 6.234226703643799,
16011
+ "learning_rate": 5.1796610169491535e-06,
16012
+ "loss": 1.1816,
16013
+ "step": 22360
16014
+ },
16015
+ {
16016
+ "epoch": 0.56,
16017
+ "grad_norm": 6.9503703117370605,
16018
+ "learning_rate": 5.172881355932203e-06,
16019
+ "loss": 1.2022,
16020
+ "step": 22370
16021
+ },
16022
+ {
16023
+ "epoch": 0.56,
16024
+ "grad_norm": 11.745012283325195,
16025
+ "learning_rate": 5.166101694915255e-06,
16026
+ "loss": 1.543,
16027
+ "step": 22380
16028
+ },
16029
+ {
16030
+ "epoch": 0.56,
16031
+ "grad_norm": 8.977341651916504,
16032
+ "learning_rate": 5.159322033898305e-06,
16033
+ "loss": 1.4185,
16034
+ "step": 22390
16035
+ },
16036
+ {
16037
+ "epoch": 0.56,
16038
+ "grad_norm": 2.515448570251465,
16039
+ "learning_rate": 5.152542372881356e-06,
16040
+ "loss": 1.2665,
16041
+ "step": 22400
16042
+ },
16043
+ {
16044
+ "epoch": 0.56,
16045
+ "grad_norm": 9.47354507446289,
16046
+ "learning_rate": 5.145762711864407e-06,
16047
+ "loss": 1.3086,
16048
+ "step": 22410
16049
+ },
16050
+ {
16051
+ "epoch": 0.56,
16052
+ "grad_norm": 6.550108432769775,
16053
+ "learning_rate": 5.138983050847458e-06,
16054
+ "loss": 1.3129,
16055
+ "step": 22420
16056
+ },
16057
+ {
16058
+ "epoch": 0.56,
16059
+ "grad_norm": 3.860886335372925,
16060
+ "learning_rate": 5.132203389830509e-06,
16061
+ "loss": 1.4046,
16062
+ "step": 22430
16063
+ },
16064
+ {
16065
+ "epoch": 0.56,
16066
+ "grad_norm": 8.062068939208984,
16067
+ "learning_rate": 5.12542372881356e-06,
16068
+ "loss": 1.207,
16069
+ "step": 22440
16070
+ },
16071
+ {
16072
+ "epoch": 0.56,
16073
+ "grad_norm": 5.823556423187256,
16074
+ "learning_rate": 5.118644067796611e-06,
16075
+ "loss": 1.2547,
16076
+ "step": 22450
16077
+ },
16078
+ {
16079
+ "epoch": 0.56,
16080
+ "grad_norm": 7.168029308319092,
16081
+ "learning_rate": 5.111864406779662e-06,
16082
+ "loss": 1.2772,
16083
+ "step": 22460
16084
+ },
16085
+ {
16086
+ "epoch": 0.56,
16087
+ "grad_norm": 9.224081039428711,
16088
+ "learning_rate": 5.105084745762711e-06,
16089
+ "loss": 1.3066,
16090
+ "step": 22470
16091
+ },
16092
+ {
16093
+ "epoch": 0.56,
16094
+ "grad_norm": 3.878537178039551,
16095
+ "learning_rate": 5.098305084745763e-06,
16096
+ "loss": 1.2413,
16097
+ "step": 22480
16098
+ },
16099
+ {
16100
+ "epoch": 0.56,
16101
+ "grad_norm": 5.821982383728027,
16102
+ "learning_rate": 5.091525423728813e-06,
16103
+ "loss": 1.3609,
16104
+ "step": 22490
16105
+ },
16106
+ {
16107
+ "epoch": 0.56,
16108
+ "grad_norm": 9.057456970214844,
16109
+ "learning_rate": 5.084745762711865e-06,
16110
+ "loss": 1.2255,
16111
+ "step": 22500
16112
+ },
16113
+ {
16114
+ "epoch": 0.56,
16115
+ "eval_loss": 1.3439137935638428,
16116
+ "eval_runtime": 66.2088,
16117
+ "eval_samples_per_second": 15.104,
16118
+ "eval_steps_per_second": 15.104,
16119
+ "step": 22500
16120
  }
16121
  ],
16122
  "logging_steps": 10,
 
16124
  "num_input_tokens_seen": 0,
16125
  "num_train_epochs": 1,
16126
  "save_steps": 2500,
16127
+ "total_flos": 3.6229783486464e+17,
16128
  "train_batch_size": 1,
16129
  "trial_name": null,
16130
  "trial_params": null