cgihlstorf's picture
Upload 8 files
6c59853 verified
raw
history blame
97.8 kB
{
"best_metric": NaN,
"best_model_checkpoint": "/scratch/czm5kz/finetuned_pythia70M_deduped_cp_14300016_1_0.0001_sequential/checkpoint-20",
"epoch": 0.9975062344139651,
"eval_steps": 20,
"global_step": 2800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 20
},
{
"epoch": 0.01,
"eval_loss": NaN,
"eval_runtime": 16.2646,
"eval_samples_per_second": 690.396,
"eval_steps_per_second": 86.323,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 40
},
{
"epoch": 0.01,
"eval_loss": NaN,
"eval_runtime": 16.2319,
"eval_samples_per_second": 691.785,
"eval_steps_per_second": 86.496,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 60
},
{
"epoch": 0.02,
"eval_loss": NaN,
"eval_runtime": 16.2349,
"eval_samples_per_second": 691.658,
"eval_steps_per_second": 86.48,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 75
},
{
"epoch": 0.03,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 80
},
{
"epoch": 0.03,
"eval_loss": NaN,
"eval_runtime": 16.1717,
"eval_samples_per_second": 694.359,
"eval_steps_per_second": 86.818,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 95
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.04,
"eval_loss": NaN,
"eval_runtime": 16.2119,
"eval_samples_per_second": 692.638,
"eval_steps_per_second": 86.603,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 105
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 120
},
{
"epoch": 0.04,
"eval_loss": NaN,
"eval_runtime": 16.1587,
"eval_samples_per_second": 694.922,
"eval_steps_per_second": 86.888,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 125
},
{
"epoch": 0.05,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 135
},
{
"epoch": 0.05,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.05,
"eval_loss": NaN,
"eval_runtime": 16.1453,
"eval_samples_per_second": 695.498,
"eval_steps_per_second": 86.961,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 145
},
{
"epoch": 0.05,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 150
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 155
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 160
},
{
"epoch": 0.06,
"eval_loss": NaN,
"eval_runtime": 16.1155,
"eval_samples_per_second": 696.781,
"eval_steps_per_second": 87.121,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 165
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 175
},
{
"epoch": 0.06,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 180
},
{
"epoch": 0.06,
"eval_loss": NaN,
"eval_runtime": 16.197,
"eval_samples_per_second": 693.274,
"eval_steps_per_second": 86.682,
"step": 180
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 185
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 190
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 195
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 200
},
{
"epoch": 0.07,
"eval_loss": NaN,
"eval_runtime": 16.1959,
"eval_samples_per_second": 693.325,
"eval_steps_per_second": 86.689,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 205
},
{
"epoch": 0.07,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 210
},
{
"epoch": 0.08,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 215
},
{
"epoch": 0.08,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 220
},
{
"epoch": 0.08,
"eval_loss": NaN,
"eval_runtime": 16.2076,
"eval_samples_per_second": 692.822,
"eval_steps_per_second": 86.626,
"step": 220
},
{
"epoch": 0.08,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 225
},
{
"epoch": 0.08,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 235
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 240
},
{
"epoch": 0.09,
"eval_loss": NaN,
"eval_runtime": 16.194,
"eval_samples_per_second": 693.405,
"eval_steps_per_second": 86.699,
"step": 240
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 245
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 250
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 255
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 260
},
{
"epoch": 0.09,
"eval_loss": NaN,
"eval_runtime": 16.2126,
"eval_samples_per_second": 692.608,
"eval_steps_per_second": 86.599,
"step": 260
},
{
"epoch": 0.09,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 265
},
{
"epoch": 0.1,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 270
},
{
"epoch": 0.1,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 275
},
{
"epoch": 0.1,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 280
},
{
"epoch": 0.1,
"eval_loss": NaN,
"eval_runtime": 16.1502,
"eval_samples_per_second": 695.286,
"eval_steps_per_second": 86.934,
"step": 280
},
{
"epoch": 0.1,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 285
},
{
"epoch": 0.1,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 290
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 295
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 300
},
{
"epoch": 0.11,
"eval_loss": NaN,
"eval_runtime": 16.0935,
"eval_samples_per_second": 697.736,
"eval_steps_per_second": 87.24,
"step": 300
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 305
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 310
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 315
},
{
"epoch": 0.11,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 320
},
{
"epoch": 0.11,
"eval_loss": NaN,
"eval_runtime": 16.1237,
"eval_samples_per_second": 696.428,
"eval_steps_per_second": 87.077,
"step": 320
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 325
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 330
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 335
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 340
},
{
"epoch": 0.12,
"eval_loss": NaN,
"eval_runtime": 16.1237,
"eval_samples_per_second": 696.427,
"eval_steps_per_second": 87.077,
"step": 340
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 345
},
{
"epoch": 0.12,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 350
},
{
"epoch": 0.13,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 355
},
{
"epoch": 0.13,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 360
},
{
"epoch": 0.13,
"eval_loss": NaN,
"eval_runtime": 16.1376,
"eval_samples_per_second": 695.826,
"eval_steps_per_second": 87.002,
"step": 360
},
{
"epoch": 0.13,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 365
},
{
"epoch": 0.13,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 370
},
{
"epoch": 0.13,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 375
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 380
},
{
"epoch": 0.14,
"eval_loss": NaN,
"eval_runtime": 16.1667,
"eval_samples_per_second": 694.575,
"eval_steps_per_second": 86.845,
"step": 380
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 385
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 390
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 395
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 400
},
{
"epoch": 0.14,
"eval_loss": NaN,
"eval_runtime": 16.2383,
"eval_samples_per_second": 691.514,
"eval_steps_per_second": 86.462,
"step": 400
},
{
"epoch": 0.14,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 405
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 410
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 415
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 420
},
{
"epoch": 0.15,
"eval_loss": NaN,
"eval_runtime": 16.1316,
"eval_samples_per_second": 696.086,
"eval_steps_per_second": 87.034,
"step": 420
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 425
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 430
},
{
"epoch": 0.15,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 435
},
{
"epoch": 0.16,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 440
},
{
"epoch": 0.16,
"eval_loss": NaN,
"eval_runtime": 16.1279,
"eval_samples_per_second": 696.246,
"eval_steps_per_second": 87.054,
"step": 440
},
{
"epoch": 0.16,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 445
},
{
"epoch": 0.16,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 450
},
{
"epoch": 0.16,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 455
},
{
"epoch": 0.16,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 460
},
{
"epoch": 0.16,
"eval_loss": NaN,
"eval_runtime": 16.1569,
"eval_samples_per_second": 694.996,
"eval_steps_per_second": 86.898,
"step": 460
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 465
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 470
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 475
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 480
},
{
"epoch": 0.17,
"eval_loss": NaN,
"eval_runtime": 16.1815,
"eval_samples_per_second": 693.941,
"eval_steps_per_second": 86.766,
"step": 480
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 485
},
{
"epoch": 0.17,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 490
},
{
"epoch": 0.18,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 495
},
{
"epoch": 0.18,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 500
},
{
"epoch": 0.18,
"eval_loss": NaN,
"eval_runtime": 16.167,
"eval_samples_per_second": 694.563,
"eval_steps_per_second": 86.844,
"step": 500
},
{
"epoch": 0.18,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 505
},
{
"epoch": 0.18,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 510
},
{
"epoch": 0.18,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 515
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 520
},
{
"epoch": 0.19,
"eval_loss": NaN,
"eval_runtime": 16.2232,
"eval_samples_per_second": 692.157,
"eval_steps_per_second": 86.543,
"step": 520
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 525
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 530
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 535
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 540
},
{
"epoch": 0.19,
"eval_loss": NaN,
"eval_runtime": 16.1294,
"eval_samples_per_second": 696.182,
"eval_steps_per_second": 87.046,
"step": 540
},
{
"epoch": 0.19,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 545
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 550
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 555
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 560
},
{
"epoch": 0.2,
"eval_loss": NaN,
"eval_runtime": 16.1562,
"eval_samples_per_second": 695.026,
"eval_steps_per_second": 86.902,
"step": 560
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 565
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 570
},
{
"epoch": 0.2,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 575
},
{
"epoch": 0.21,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 580
},
{
"epoch": 0.21,
"eval_loss": NaN,
"eval_runtime": 16.148,
"eval_samples_per_second": 695.381,
"eval_steps_per_second": 86.946,
"step": 580
},
{
"epoch": 0.21,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 585
},
{
"epoch": 0.21,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 590
},
{
"epoch": 0.21,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 595
},
{
"epoch": 0.21,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 600
},
{
"epoch": 0.21,
"eval_loss": NaN,
"eval_runtime": 16.1902,
"eval_samples_per_second": 693.566,
"eval_steps_per_second": 86.719,
"step": 600
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 605
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 610
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 615
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 620
},
{
"epoch": 0.22,
"eval_loss": NaN,
"eval_runtime": 16.1725,
"eval_samples_per_second": 694.329,
"eval_steps_per_second": 86.814,
"step": 620
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 625
},
{
"epoch": 0.22,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 630
},
{
"epoch": 0.23,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 635
},
{
"epoch": 0.23,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 640
},
{
"epoch": 0.23,
"eval_loss": NaN,
"eval_runtime": 16.1777,
"eval_samples_per_second": 694.105,
"eval_steps_per_second": 86.786,
"step": 640
},
{
"epoch": 0.23,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 645
},
{
"epoch": 0.23,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 650
},
{
"epoch": 0.23,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 655
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 660
},
{
"epoch": 0.24,
"eval_loss": NaN,
"eval_runtime": 16.1979,
"eval_samples_per_second": 693.236,
"eval_steps_per_second": 86.678,
"step": 660
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 665
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 670
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 675
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 680
},
{
"epoch": 0.24,
"eval_loss": NaN,
"eval_runtime": 16.1488,
"eval_samples_per_second": 695.348,
"eval_steps_per_second": 86.942,
"step": 680
},
{
"epoch": 0.24,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 685
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 690
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 695
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 700
},
{
"epoch": 0.25,
"eval_loss": NaN,
"eval_runtime": 16.1234,
"eval_samples_per_second": 696.442,
"eval_steps_per_second": 87.079,
"step": 700
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 705
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 710
},
{
"epoch": 0.25,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 715
},
{
"epoch": 0.26,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 720
},
{
"epoch": 0.26,
"eval_loss": NaN,
"eval_runtime": 16.0945,
"eval_samples_per_second": 697.69,
"eval_steps_per_second": 87.235,
"step": 720
},
{
"epoch": 0.26,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 725
},
{
"epoch": 0.26,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 730
},
{
"epoch": 0.26,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 735
},
{
"epoch": 0.26,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 740
},
{
"epoch": 0.26,
"eval_loss": NaN,
"eval_runtime": 16.0903,
"eval_samples_per_second": 697.876,
"eval_steps_per_second": 87.258,
"step": 740
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 745
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 750
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 755
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 760
},
{
"epoch": 0.27,
"eval_loss": NaN,
"eval_runtime": 16.1093,
"eval_samples_per_second": 697.05,
"eval_steps_per_second": 87.154,
"step": 760
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 765
},
{
"epoch": 0.27,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 770
},
{
"epoch": 0.28,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 1.2589,
"step": 775
},
{
"epoch": 0.28,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 780
},
{
"epoch": 0.28,
"eval_loss": NaN,
"eval_runtime": 16.1207,
"eval_samples_per_second": 696.558,
"eval_steps_per_second": 87.093,
"step": 780
},
{
"epoch": 0.28,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 785
},
{
"epoch": 0.28,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 790
},
{
"epoch": 0.28,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 795
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 800
},
{
"epoch": 0.29,
"eval_loss": NaN,
"eval_runtime": 16.0988,
"eval_samples_per_second": 697.507,
"eval_steps_per_second": 87.212,
"step": 800
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 805
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 810
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 815
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 820
},
{
"epoch": 0.29,
"eval_loss": NaN,
"eval_runtime": 16.1116,
"eval_samples_per_second": 696.952,
"eval_steps_per_second": 87.142,
"step": 820
},
{
"epoch": 0.29,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 825
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 830
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 835
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 840
},
{
"epoch": 0.3,
"eval_loss": NaN,
"eval_runtime": 16.1157,
"eval_samples_per_second": 696.772,
"eval_steps_per_second": 87.12,
"step": 840
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 845
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 850
},
{
"epoch": 0.3,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 855
},
{
"epoch": 0.31,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 860
},
{
"epoch": 0.31,
"eval_loss": NaN,
"eval_runtime": 16.1211,
"eval_samples_per_second": 696.539,
"eval_steps_per_second": 87.091,
"step": 860
},
{
"epoch": 0.31,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 865
},
{
"epoch": 0.31,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 870
},
{
"epoch": 0.31,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 875
},
{
"epoch": 0.31,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 880
},
{
"epoch": 0.31,
"eval_loss": NaN,
"eval_runtime": 16.1913,
"eval_samples_per_second": 693.522,
"eval_steps_per_second": 86.713,
"step": 880
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 885
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 890
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 895
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 900
},
{
"epoch": 0.32,
"eval_loss": NaN,
"eval_runtime": 16.1202,
"eval_samples_per_second": 696.579,
"eval_steps_per_second": 87.096,
"step": 900
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 905
},
{
"epoch": 0.32,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 910
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 915
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 920
},
{
"epoch": 0.33,
"eval_loss": NaN,
"eval_runtime": 16.1717,
"eval_samples_per_second": 694.361,
"eval_steps_per_second": 86.818,
"step": 920
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 925
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 930
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 935
},
{
"epoch": 0.33,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 940
},
{
"epoch": 0.33,
"eval_loss": NaN,
"eval_runtime": 16.1822,
"eval_samples_per_second": 693.91,
"eval_steps_per_second": 86.762,
"step": 940
},
{
"epoch": 0.34,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 945
},
{
"epoch": 0.34,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 950
},
{
"epoch": 0.34,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 955
},
{
"epoch": 0.34,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 960
},
{
"epoch": 0.34,
"eval_loss": NaN,
"eval_runtime": 16.1731,
"eval_samples_per_second": 694.3,
"eval_steps_per_second": 86.811,
"step": 960
},
{
"epoch": 0.34,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 965
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 970
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 975
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 980
},
{
"epoch": 0.35,
"eval_loss": NaN,
"eval_runtime": 16.1675,
"eval_samples_per_second": 694.542,
"eval_steps_per_second": 86.841,
"step": 980
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 985
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 990
},
{
"epoch": 0.35,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 995
},
{
"epoch": 0.36,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1000
},
{
"epoch": 0.36,
"eval_loss": NaN,
"eval_runtime": 16.1725,
"eval_samples_per_second": 694.327,
"eval_steps_per_second": 86.814,
"step": 1000
},
{
"epoch": 0.36,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1005
},
{
"epoch": 0.36,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1010
},
{
"epoch": 0.36,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1015
},
{
"epoch": 0.36,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1020
},
{
"epoch": 0.36,
"eval_loss": NaN,
"eval_runtime": 16.2071,
"eval_samples_per_second": 692.844,
"eval_steps_per_second": 86.629,
"step": 1020
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1025
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1030
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1035
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1040
},
{
"epoch": 0.37,
"eval_loss": NaN,
"eval_runtime": 16.1622,
"eval_samples_per_second": 694.768,
"eval_steps_per_second": 86.869,
"step": 1040
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1045
},
{
"epoch": 0.37,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1050
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1055
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1060
},
{
"epoch": 0.38,
"eval_loss": NaN,
"eval_runtime": 16.1992,
"eval_samples_per_second": 693.182,
"eval_steps_per_second": 86.671,
"step": 1060
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1065
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1070
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1075
},
{
"epoch": 0.38,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1080
},
{
"epoch": 0.38,
"eval_loss": NaN,
"eval_runtime": 16.1864,
"eval_samples_per_second": 693.729,
"eval_steps_per_second": 86.739,
"step": 1080
},
{
"epoch": 0.39,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1085
},
{
"epoch": 0.39,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1090
},
{
"epoch": 0.39,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1095
},
{
"epoch": 0.39,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1100
},
{
"epoch": 0.39,
"eval_loss": NaN,
"eval_runtime": 16.1551,
"eval_samples_per_second": 695.076,
"eval_steps_per_second": 86.908,
"step": 1100
},
{
"epoch": 0.39,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1105
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1110
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1115
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1120
},
{
"epoch": 0.4,
"eval_loss": NaN,
"eval_runtime": 16.1439,
"eval_samples_per_second": 695.556,
"eval_steps_per_second": 86.968,
"step": 1120
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1125
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1130
},
{
"epoch": 0.4,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1135
},
{
"epoch": 0.41,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1140
},
{
"epoch": 0.41,
"eval_loss": NaN,
"eval_runtime": 16.1942,
"eval_samples_per_second": 693.397,
"eval_steps_per_second": 86.698,
"step": 1140
},
{
"epoch": 0.41,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1145
},
{
"epoch": 0.41,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1150
},
{
"epoch": 0.41,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1155
},
{
"epoch": 0.41,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1160
},
{
"epoch": 0.41,
"eval_loss": NaN,
"eval_runtime": 16.1761,
"eval_samples_per_second": 694.173,
"eval_steps_per_second": 86.795,
"step": 1160
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1165
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1170
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1175
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1180
},
{
"epoch": 0.42,
"eval_loss": NaN,
"eval_runtime": 16.1256,
"eval_samples_per_second": 696.348,
"eval_steps_per_second": 87.067,
"step": 1180
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1185
},
{
"epoch": 0.42,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1190
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1195
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1200
},
{
"epoch": 0.43,
"eval_loss": NaN,
"eval_runtime": 16.2228,
"eval_samples_per_second": 692.172,
"eval_steps_per_second": 86.545,
"step": 1200
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1205
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1210
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1215
},
{
"epoch": 0.43,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1220
},
{
"epoch": 0.43,
"eval_loss": NaN,
"eval_runtime": 16.1816,
"eval_samples_per_second": 693.936,
"eval_steps_per_second": 86.765,
"step": 1220
},
{
"epoch": 0.44,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1225
},
{
"epoch": 0.44,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1230
},
{
"epoch": 0.44,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1235
},
{
"epoch": 0.44,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1240
},
{
"epoch": 0.44,
"eval_loss": NaN,
"eval_runtime": 16.1431,
"eval_samples_per_second": 695.592,
"eval_steps_per_second": 86.972,
"step": 1240
},
{
"epoch": 0.44,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1245
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1250
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1255
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1260
},
{
"epoch": 0.45,
"eval_loss": NaN,
"eval_runtime": 16.1159,
"eval_samples_per_second": 696.764,
"eval_steps_per_second": 87.119,
"step": 1260
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1265
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1270
},
{
"epoch": 0.45,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1275
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1280
},
{
"epoch": 0.46,
"eval_loss": NaN,
"eval_runtime": 16.1487,
"eval_samples_per_second": 695.348,
"eval_steps_per_second": 86.942,
"step": 1280
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1285
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1290
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1295
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1300
},
{
"epoch": 0.46,
"eval_loss": NaN,
"eval_runtime": 16.1799,
"eval_samples_per_second": 694.011,
"eval_steps_per_second": 86.775,
"step": 1300
},
{
"epoch": 0.46,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1305
},
{
"epoch": 0.47,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1310
},
{
"epoch": 0.47,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1315
},
{
"epoch": 0.47,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1320
},
{
"epoch": 0.47,
"eval_loss": NaN,
"eval_runtime": 16.1229,
"eval_samples_per_second": 696.463,
"eval_steps_per_second": 87.081,
"step": 1320
},
{
"epoch": 0.47,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1325
},
{
"epoch": 0.47,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1330
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1335
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1340
},
{
"epoch": 0.48,
"eval_loss": NaN,
"eval_runtime": 16.1407,
"eval_samples_per_second": 695.696,
"eval_steps_per_second": 86.985,
"step": 1340
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1345
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1350
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1355
},
{
"epoch": 0.48,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1360
},
{
"epoch": 0.48,
"eval_loss": NaN,
"eval_runtime": 16.1523,
"eval_samples_per_second": 695.196,
"eval_steps_per_second": 86.923,
"step": 1360
},
{
"epoch": 0.49,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1365
},
{
"epoch": 0.49,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1370
},
{
"epoch": 0.49,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1375
},
{
"epoch": 0.49,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1380
},
{
"epoch": 0.49,
"eval_loss": NaN,
"eval_runtime": 16.14,
"eval_samples_per_second": 695.723,
"eval_steps_per_second": 86.989,
"step": 1380
},
{
"epoch": 0.49,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1385
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1390
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1395
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1400
},
{
"epoch": 0.5,
"eval_loss": NaN,
"eval_runtime": 16.1664,
"eval_samples_per_second": 694.59,
"eval_steps_per_second": 86.847,
"step": 1400
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1405
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1410
},
{
"epoch": 0.5,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1415
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1420
},
{
"epoch": 0.51,
"eval_loss": NaN,
"eval_runtime": 16.2224,
"eval_samples_per_second": 692.191,
"eval_steps_per_second": 86.547,
"step": 1420
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1425
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1430
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1435
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1440
},
{
"epoch": 0.51,
"eval_loss": NaN,
"eval_runtime": 16.1801,
"eval_samples_per_second": 694.0,
"eval_steps_per_second": 86.773,
"step": 1440
},
{
"epoch": 0.51,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1445
},
{
"epoch": 0.52,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1450
},
{
"epoch": 0.52,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1455
},
{
"epoch": 0.52,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1460
},
{
"epoch": 0.52,
"eval_loss": NaN,
"eval_runtime": 16.2045,
"eval_samples_per_second": 692.955,
"eval_steps_per_second": 86.642,
"step": 1460
},
{
"epoch": 0.52,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1465
},
{
"epoch": 0.52,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1470
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 20.2422,
"step": 1475
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 117.5013,
"step": 1480
},
{
"epoch": 0.53,
"eval_loss": NaN,
"eval_runtime": 16.1953,
"eval_samples_per_second": 693.35,
"eval_steps_per_second": 86.692,
"step": 1480
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1485
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1490
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1495
},
{
"epoch": 0.53,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1500
},
{
"epoch": 0.53,
"eval_loss": NaN,
"eval_runtime": 16.1941,
"eval_samples_per_second": 693.401,
"eval_steps_per_second": 86.698,
"step": 1500
},
{
"epoch": 0.54,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1505
},
{
"epoch": 0.54,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1510
},
{
"epoch": 0.54,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1515
},
{
"epoch": 0.54,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1520
},
{
"epoch": 0.54,
"eval_loss": NaN,
"eval_runtime": 16.2896,
"eval_samples_per_second": 689.334,
"eval_steps_per_second": 86.19,
"step": 1520
},
{
"epoch": 0.54,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1525
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1530
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1535
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1540
},
{
"epoch": 0.55,
"eval_loss": NaN,
"eval_runtime": 16.1612,
"eval_samples_per_second": 694.814,
"eval_steps_per_second": 86.875,
"step": 1540
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1545
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1550
},
{
"epoch": 0.55,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1555
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1560
},
{
"epoch": 0.56,
"eval_loss": NaN,
"eval_runtime": 16.194,
"eval_samples_per_second": 693.405,
"eval_steps_per_second": 86.699,
"step": 1560
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1565
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1570
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1575
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1580
},
{
"epoch": 0.56,
"eval_loss": NaN,
"eval_runtime": 16.164,
"eval_samples_per_second": 694.691,
"eval_steps_per_second": 86.86,
"step": 1580
},
{
"epoch": 0.56,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1585
},
{
"epoch": 0.57,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1590
},
{
"epoch": 0.57,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1595
},
{
"epoch": 0.57,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1600
},
{
"epoch": 0.57,
"eval_loss": NaN,
"eval_runtime": 16.1661,
"eval_samples_per_second": 694.603,
"eval_steps_per_second": 86.849,
"step": 1600
},
{
"epoch": 0.57,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1605
},
{
"epoch": 0.57,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1610
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1615
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1620
},
{
"epoch": 0.58,
"eval_loss": NaN,
"eval_runtime": 16.2063,
"eval_samples_per_second": 692.879,
"eval_steps_per_second": 86.633,
"step": 1620
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1625
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1630
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1635
},
{
"epoch": 0.58,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1640
},
{
"epoch": 0.58,
"eval_loss": NaN,
"eval_runtime": 16.2074,
"eval_samples_per_second": 692.832,
"eval_steps_per_second": 86.627,
"step": 1640
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1645
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 4.0531,
"step": 1650
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1655
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1660
},
{
"epoch": 0.59,
"eval_loss": NaN,
"eval_runtime": 16.1356,
"eval_samples_per_second": 695.916,
"eval_steps_per_second": 87.013,
"step": 1660
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1665
},
{
"epoch": 0.59,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1670
},
{
"epoch": 0.6,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1675
},
{
"epoch": 0.6,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1680
},
{
"epoch": 0.6,
"eval_loss": NaN,
"eval_runtime": 16.1839,
"eval_samples_per_second": 693.838,
"eval_steps_per_second": 86.753,
"step": 1680
},
{
"epoch": 0.6,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1685
},
{
"epoch": 0.6,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1690
},
{
"epoch": 0.6,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1695
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1700
},
{
"epoch": 0.61,
"eval_loss": NaN,
"eval_runtime": 16.1942,
"eval_samples_per_second": 693.398,
"eval_steps_per_second": 86.698,
"step": 1700
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1705
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1710
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1715
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1720
},
{
"epoch": 0.61,
"eval_loss": NaN,
"eval_runtime": 16.1515,
"eval_samples_per_second": 695.23,
"eval_steps_per_second": 86.927,
"step": 1720
},
{
"epoch": 0.61,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1725
},
{
"epoch": 0.62,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1730
},
{
"epoch": 0.62,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1735
},
{
"epoch": 0.62,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1740
},
{
"epoch": 0.62,
"eval_loss": NaN,
"eval_runtime": 16.244,
"eval_samples_per_second": 691.269,
"eval_steps_per_second": 86.432,
"step": 1740
},
{
"epoch": 0.62,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1745
},
{
"epoch": 0.62,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1750
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1755
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1760
},
{
"epoch": 0.63,
"eval_loss": NaN,
"eval_runtime": 16.2126,
"eval_samples_per_second": 692.611,
"eval_steps_per_second": 86.6,
"step": 1760
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1765
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1770
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1775
},
{
"epoch": 0.63,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1780
},
{
"epoch": 0.63,
"eval_loss": NaN,
"eval_runtime": 16.1706,
"eval_samples_per_second": 694.41,
"eval_steps_per_second": 86.824,
"step": 1780
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1785
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1790
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1795
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1800
},
{
"epoch": 0.64,
"eval_loss": NaN,
"eval_runtime": 16.1746,
"eval_samples_per_second": 694.236,
"eval_steps_per_second": 86.803,
"step": 1800
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1805
},
{
"epoch": 0.64,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1810
},
{
"epoch": 0.65,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1815
},
{
"epoch": 0.65,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1820
},
{
"epoch": 0.65,
"eval_loss": NaN,
"eval_runtime": 16.2212,
"eval_samples_per_second": 692.24,
"eval_steps_per_second": 86.553,
"step": 1820
},
{
"epoch": 0.65,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1825
},
{
"epoch": 0.65,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1830
},
{
"epoch": 0.65,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1835
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1840
},
{
"epoch": 0.66,
"eval_loss": NaN,
"eval_runtime": 16.2052,
"eval_samples_per_second": 692.924,
"eval_steps_per_second": 86.639,
"step": 1840
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1845
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1850
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1855
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1860
},
{
"epoch": 0.66,
"eval_loss": NaN,
"eval_runtime": 16.2254,
"eval_samples_per_second": 692.065,
"eval_steps_per_second": 86.531,
"step": 1860
},
{
"epoch": 0.66,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1865
},
{
"epoch": 0.67,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1870
},
{
"epoch": 0.67,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1875
},
{
"epoch": 0.67,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1880
},
{
"epoch": 0.67,
"eval_loss": NaN,
"eval_runtime": 16.2606,
"eval_samples_per_second": 690.564,
"eval_steps_per_second": 86.344,
"step": 1880
},
{
"epoch": 0.67,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1885
},
{
"epoch": 0.67,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 6.9059,
"step": 1890
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1895
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1900
},
{
"epoch": 0.68,
"eval_loss": NaN,
"eval_runtime": 16.2856,
"eval_samples_per_second": 689.504,
"eval_steps_per_second": 86.211,
"step": 1900
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1905
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1910
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1915
},
{
"epoch": 0.68,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1920
},
{
"epoch": 0.68,
"eval_loss": NaN,
"eval_runtime": 16.2044,
"eval_samples_per_second": 692.959,
"eval_steps_per_second": 86.643,
"step": 1920
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1925
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1930
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1935
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1940
},
{
"epoch": 0.69,
"eval_loss": NaN,
"eval_runtime": 16.1584,
"eval_samples_per_second": 694.931,
"eval_steps_per_second": 86.89,
"step": 1940
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1945
},
{
"epoch": 0.69,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1950
},
{
"epoch": 0.7,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1955
},
{
"epoch": 0.7,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1960
},
{
"epoch": 0.7,
"eval_loss": NaN,
"eval_runtime": 16.269,
"eval_samples_per_second": 690.208,
"eval_steps_per_second": 86.299,
"step": 1960
},
{
"epoch": 0.7,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1965
},
{
"epoch": 0.7,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1970
},
{
"epoch": 0.7,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1975
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1980
},
{
"epoch": 0.71,
"eval_loss": NaN,
"eval_runtime": 16.1943,
"eval_samples_per_second": 693.393,
"eval_steps_per_second": 86.697,
"step": 1980
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1985
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1990
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 1995
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2000
},
{
"epoch": 0.71,
"eval_loss": NaN,
"eval_runtime": 16.1939,
"eval_samples_per_second": 693.411,
"eval_steps_per_second": 86.7,
"step": 2000
},
{
"epoch": 0.71,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2005
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2010
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2015
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2020
},
{
"epoch": 0.72,
"eval_loss": NaN,
"eval_runtime": 16.3019,
"eval_samples_per_second": 688.814,
"eval_steps_per_second": 86.125,
"step": 2020
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2025
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2030
},
{
"epoch": 0.72,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2035
},
{
"epoch": 0.73,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2040
},
{
"epoch": 0.73,
"eval_loss": NaN,
"eval_runtime": 16.2124,
"eval_samples_per_second": 692.617,
"eval_steps_per_second": 86.6,
"step": 2040
},
{
"epoch": 0.73,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2045
},
{
"epoch": 0.73,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 34.0299,
"step": 2050
},
{
"epoch": 0.73,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2055
},
{
"epoch": 0.73,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2060
},
{
"epoch": 0.73,
"eval_loss": NaN,
"eval_runtime": 16.1907,
"eval_samples_per_second": 693.546,
"eval_steps_per_second": 86.716,
"step": 2060
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2065
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2070
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2075
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2080
},
{
"epoch": 0.74,
"eval_loss": NaN,
"eval_runtime": 16.1888,
"eval_samples_per_second": 693.628,
"eval_steps_per_second": 86.727,
"step": 2080
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2085
},
{
"epoch": 0.74,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2090
},
{
"epoch": 0.75,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2095
},
{
"epoch": 0.75,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2100
},
{
"epoch": 0.75,
"eval_loss": NaN,
"eval_runtime": 16.1955,
"eval_samples_per_second": 693.341,
"eval_steps_per_second": 86.691,
"step": 2100
},
{
"epoch": 0.75,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2105
},
{
"epoch": 0.75,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2110
},
{
"epoch": 0.75,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2115
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2120
},
{
"epoch": 0.76,
"eval_loss": NaN,
"eval_runtime": 16.1528,
"eval_samples_per_second": 695.173,
"eval_steps_per_second": 86.92,
"step": 2120
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2125
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2130
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2135
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2140
},
{
"epoch": 0.76,
"eval_loss": NaN,
"eval_runtime": 16.1943,
"eval_samples_per_second": 693.392,
"eval_steps_per_second": 86.697,
"step": 2140
},
{
"epoch": 0.76,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2145
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2150
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2155
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2160
},
{
"epoch": 0.77,
"eval_loss": NaN,
"eval_runtime": 16.1942,
"eval_samples_per_second": 693.398,
"eval_steps_per_second": 86.698,
"step": 2160
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2165
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2170
},
{
"epoch": 0.77,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2175
},
{
"epoch": 0.78,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2180
},
{
"epoch": 0.78,
"eval_loss": NaN,
"eval_runtime": 16.1922,
"eval_samples_per_second": 693.481,
"eval_steps_per_second": 86.708,
"step": 2180
},
{
"epoch": 0.78,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2185
},
{
"epoch": 0.78,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2190
},
{
"epoch": 0.78,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2195
},
{
"epoch": 0.78,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2200
},
{
"epoch": 0.78,
"eval_loss": NaN,
"eval_runtime": 16.2072,
"eval_samples_per_second": 692.842,
"eval_steps_per_second": 86.628,
"step": 2200
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2205
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2210
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2215
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2220
},
{
"epoch": 0.79,
"eval_loss": NaN,
"eval_runtime": 16.1933,
"eval_samples_per_second": 693.433,
"eval_steps_per_second": 86.702,
"step": 2220
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2225
},
{
"epoch": 0.79,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2230
},
{
"epoch": 0.8,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2235
},
{
"epoch": 0.8,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2240
},
{
"epoch": 0.8,
"eval_loss": NaN,
"eval_runtime": 16.2672,
"eval_samples_per_second": 690.287,
"eval_steps_per_second": 86.309,
"step": 2240
},
{
"epoch": 0.8,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2245
},
{
"epoch": 0.8,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2250
},
{
"epoch": 0.8,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2255
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2260
},
{
"epoch": 0.81,
"eval_loss": NaN,
"eval_runtime": 16.147,
"eval_samples_per_second": 695.421,
"eval_steps_per_second": 86.951,
"step": 2260
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2265
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2270
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2275
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2280
},
{
"epoch": 0.81,
"eval_loss": NaN,
"eval_runtime": 16.2004,
"eval_samples_per_second": 693.132,
"eval_steps_per_second": 86.665,
"step": 2280
},
{
"epoch": 0.81,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2285
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2290
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2295
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2300
},
{
"epoch": 0.82,
"eval_loss": NaN,
"eval_runtime": 16.2039,
"eval_samples_per_second": 692.981,
"eval_steps_per_second": 86.646,
"step": 2300
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2305
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2310
},
{
"epoch": 0.82,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2315
},
{
"epoch": 0.83,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2320
},
{
"epoch": 0.83,
"eval_loss": NaN,
"eval_runtime": 16.2038,
"eval_samples_per_second": 692.985,
"eval_steps_per_second": 86.646,
"step": 2320
},
{
"epoch": 0.83,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2325
},
{
"epoch": 0.83,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2330
},
{
"epoch": 0.83,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2335
},
{
"epoch": 0.83,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2340
},
{
"epoch": 0.83,
"eval_loss": NaN,
"eval_runtime": 16.1966,
"eval_samples_per_second": 693.295,
"eval_steps_per_second": 86.685,
"step": 2340
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2345
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2350
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2355
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2360
},
{
"epoch": 0.84,
"eval_loss": NaN,
"eval_runtime": 16.2513,
"eval_samples_per_second": 690.962,
"eval_steps_per_second": 86.393,
"step": 2360
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2365
},
{
"epoch": 0.84,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2370
},
{
"epoch": 0.85,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2375
},
{
"epoch": 0.85,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2380
},
{
"epoch": 0.85,
"eval_loss": NaN,
"eval_runtime": 16.1975,
"eval_samples_per_second": 693.256,
"eval_steps_per_second": 86.68,
"step": 2380
},
{
"epoch": 0.85,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2385
},
{
"epoch": 0.85,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2390
},
{
"epoch": 0.85,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2395
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2400
},
{
"epoch": 0.86,
"eval_loss": NaN,
"eval_runtime": 16.1908,
"eval_samples_per_second": 693.541,
"eval_steps_per_second": 86.716,
"step": 2400
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2405
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2410
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2415
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2420
},
{
"epoch": 0.86,
"eval_loss": NaN,
"eval_runtime": 16.203,
"eval_samples_per_second": 693.021,
"eval_steps_per_second": 86.651,
"step": 2420
},
{
"epoch": 0.86,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2425
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2430
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2435
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2440
},
{
"epoch": 0.87,
"eval_loss": NaN,
"eval_runtime": 16.1902,
"eval_samples_per_second": 693.567,
"eval_steps_per_second": 86.719,
"step": 2440
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2445
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2450
},
{
"epoch": 0.87,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2455
},
{
"epoch": 0.88,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2460
},
{
"epoch": 0.88,
"eval_loss": NaN,
"eval_runtime": 16.249,
"eval_samples_per_second": 691.058,
"eval_steps_per_second": 86.405,
"step": 2460
},
{
"epoch": 0.88,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2465
},
{
"epoch": 0.88,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2470
},
{
"epoch": 0.88,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2475
},
{
"epoch": 0.88,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2480
},
{
"epoch": 0.88,
"eval_loss": NaN,
"eval_runtime": 16.2094,
"eval_samples_per_second": 692.748,
"eval_steps_per_second": 86.617,
"step": 2480
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2485
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2490
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2495
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2500
},
{
"epoch": 0.89,
"eval_loss": NaN,
"eval_runtime": 16.2068,
"eval_samples_per_second": 692.859,
"eval_steps_per_second": 86.631,
"step": 2500
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2505
},
{
"epoch": 0.89,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2510
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2515
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2520
},
{
"epoch": 0.9,
"eval_loss": NaN,
"eval_runtime": 16.1974,
"eval_samples_per_second": 693.258,
"eval_steps_per_second": 86.68,
"step": 2520
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2525
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2530
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2535
},
{
"epoch": 0.9,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2540
},
{
"epoch": 0.9,
"eval_loss": NaN,
"eval_runtime": 16.1773,
"eval_samples_per_second": 694.119,
"eval_steps_per_second": 86.788,
"step": 2540
},
{
"epoch": 0.91,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2545
},
{
"epoch": 0.91,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2550
},
{
"epoch": 0.91,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2555
},
{
"epoch": 0.91,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2560
},
{
"epoch": 0.91,
"eval_loss": NaN,
"eval_runtime": 16.1948,
"eval_samples_per_second": 693.37,
"eval_steps_per_second": 86.694,
"step": 2560
},
{
"epoch": 0.91,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2565
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2570
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2575
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2580
},
{
"epoch": 0.92,
"eval_loss": NaN,
"eval_runtime": 16.1716,
"eval_samples_per_second": 694.365,
"eval_steps_per_second": 86.819,
"step": 2580
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2585
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2590
},
{
"epoch": 0.92,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2595
},
{
"epoch": 0.93,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2600
},
{
"epoch": 0.93,
"eval_loss": NaN,
"eval_runtime": 16.1676,
"eval_samples_per_second": 694.538,
"eval_steps_per_second": 86.841,
"step": 2600
},
{
"epoch": 0.93,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2605
},
{
"epoch": 0.93,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2610
},
{
"epoch": 0.93,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2615
},
{
"epoch": 0.93,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2620
},
{
"epoch": 0.93,
"eval_loss": NaN,
"eval_runtime": 16.1591,
"eval_samples_per_second": 694.901,
"eval_steps_per_second": 86.886,
"step": 2620
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2625
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2630
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2635
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2640
},
{
"epoch": 0.94,
"eval_loss": NaN,
"eval_runtime": 16.2088,
"eval_samples_per_second": 692.774,
"eval_steps_per_second": 86.62,
"step": 2640
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2645
},
{
"epoch": 0.94,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2650
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2655
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2660
},
{
"epoch": 0.95,
"eval_loss": NaN,
"eval_runtime": 16.1786,
"eval_samples_per_second": 694.063,
"eval_steps_per_second": 86.781,
"step": 2660
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2665
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2670
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2675
},
{
"epoch": 0.95,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2680
},
{
"epoch": 0.95,
"eval_loss": NaN,
"eval_runtime": 16.2273,
"eval_samples_per_second": 691.982,
"eval_steps_per_second": 86.521,
"step": 2680
},
{
"epoch": 0.96,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2685
},
{
"epoch": 0.96,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2690
},
{
"epoch": 0.96,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2695
},
{
"epoch": 0.96,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2700
},
{
"epoch": 0.96,
"eval_loss": NaN,
"eval_runtime": 16.1623,
"eval_samples_per_second": 694.765,
"eval_steps_per_second": 86.869,
"step": 2700
},
{
"epoch": 0.96,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2705
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2710
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2715
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2720
},
{
"epoch": 0.97,
"eval_loss": NaN,
"eval_runtime": 16.2151,
"eval_samples_per_second": 692.504,
"eval_steps_per_second": 86.586,
"step": 2720
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2725
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2730
},
{
"epoch": 0.97,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2735
},
{
"epoch": 0.98,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2740
},
{
"epoch": 0.98,
"eval_loss": NaN,
"eval_runtime": 16.1991,
"eval_samples_per_second": 693.186,
"eval_steps_per_second": 86.671,
"step": 2740
},
{
"epoch": 0.98,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2745
},
{
"epoch": 0.98,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2750
},
{
"epoch": 0.98,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2755
},
{
"epoch": 0.98,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2760
},
{
"epoch": 0.98,
"eval_loss": NaN,
"eval_runtime": 16.1704,
"eval_samples_per_second": 694.419,
"eval_steps_per_second": 86.826,
"step": 2760
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2765
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2770
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2775
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2780
},
{
"epoch": 0.99,
"eval_loss": NaN,
"eval_runtime": 16.2677,
"eval_samples_per_second": 690.265,
"eval_steps_per_second": 86.306,
"step": 2780
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2785
},
{
"epoch": 0.99,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2790
},
{
"epoch": 1.0,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2795
},
{
"epoch": 1.0,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 2800
},
{
"epoch": 1.0,
"eval_loss": NaN,
"eval_runtime": 16.2191,
"eval_samples_per_second": 692.331,
"eval_steps_per_second": 86.565,
"step": 2800
}
],
"logging_steps": 5,
"max_steps": 2807,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 1587243693637632.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}