retnet-tinystories / trainer_state.json
Azamorn's picture
Upload folder using huggingface_hub
1e49201
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 0.0004975,
"loss": 1.8757,
"step": 5
},
{
"epoch": 0.01,
"learning_rate": 0.000495,
"loss": 1.8758,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 0.0004925,
"loss": 1.9307,
"step": 15
},
{
"epoch": 0.02,
"learning_rate": 0.00049,
"loss": 1.9338,
"step": 20
},
{
"epoch": 0.03,
"learning_rate": 0.0004875,
"loss": 1.8599,
"step": 25
},
{
"epoch": 0.03,
"learning_rate": 0.00048499999999999997,
"loss": 1.9875,
"step": 30
},
{
"epoch": 0.04,
"learning_rate": 0.0004825,
"loss": 1.9947,
"step": 35
},
{
"epoch": 0.04,
"learning_rate": 0.00048,
"loss": 1.9015,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 0.0004775,
"loss": 1.8941,
"step": 45
},
{
"epoch": 0.05,
"learning_rate": 0.000475,
"loss": 1.8592,
"step": 50
},
{
"epoch": 0.06,
"learning_rate": 0.0004725,
"loss": 1.8977,
"step": 55
},
{
"epoch": 0.06,
"learning_rate": 0.00047,
"loss": 1.886,
"step": 60
},
{
"epoch": 0.07,
"learning_rate": 0.00046750000000000003,
"loss": 1.9486,
"step": 65
},
{
"epoch": 0.07,
"learning_rate": 0.000465,
"loss": 1.8669,
"step": 70
},
{
"epoch": 0.07,
"learning_rate": 0.0004625,
"loss": 1.936,
"step": 75
},
{
"epoch": 0.08,
"learning_rate": 0.00046,
"loss": 1.8385,
"step": 80
},
{
"epoch": 0.09,
"learning_rate": 0.0004575,
"loss": 1.8045,
"step": 85
},
{
"epoch": 0.09,
"learning_rate": 0.000455,
"loss": 1.9058,
"step": 90
},
{
"epoch": 0.1,
"learning_rate": 0.00045250000000000005,
"loss": 1.868,
"step": 95
},
{
"epoch": 0.1,
"learning_rate": 0.00045000000000000004,
"loss": 1.8055,
"step": 100
},
{
"epoch": 0.1,
"learning_rate": 0.00044750000000000004,
"loss": 1.849,
"step": 105
},
{
"epoch": 0.11,
"learning_rate": 0.00044500000000000003,
"loss": 1.869,
"step": 110
},
{
"epoch": 0.12,
"learning_rate": 0.0004425,
"loss": 1.8587,
"step": 115
},
{
"epoch": 0.12,
"learning_rate": 0.00044,
"loss": 1.9206,
"step": 120
},
{
"epoch": 0.12,
"learning_rate": 0.0004375,
"loss": 1.8406,
"step": 125
},
{
"epoch": 0.13,
"learning_rate": 0.000435,
"loss": 1.8721,
"step": 130
},
{
"epoch": 0.14,
"learning_rate": 0.0004325,
"loss": 1.9409,
"step": 135
},
{
"epoch": 0.14,
"learning_rate": 0.00043,
"loss": 1.9222,
"step": 140
},
{
"epoch": 0.14,
"learning_rate": 0.0004275,
"loss": 1.8705,
"step": 145
},
{
"epoch": 0.15,
"learning_rate": 0.000425,
"loss": 1.9348,
"step": 150
},
{
"epoch": 0.15,
"learning_rate": 0.00042249999999999997,
"loss": 1.8167,
"step": 155
},
{
"epoch": 0.16,
"learning_rate": 0.00042,
"loss": 1.8904,
"step": 160
},
{
"epoch": 0.17,
"learning_rate": 0.0004175,
"loss": 1.8545,
"step": 165
},
{
"epoch": 0.17,
"learning_rate": 0.000415,
"loss": 1.8448,
"step": 170
},
{
"epoch": 0.17,
"learning_rate": 0.0004125,
"loss": 1.8898,
"step": 175
},
{
"epoch": 0.18,
"learning_rate": 0.00041,
"loss": 1.8338,
"step": 180
},
{
"epoch": 0.18,
"learning_rate": 0.0004075,
"loss": 1.8246,
"step": 185
},
{
"epoch": 0.19,
"learning_rate": 0.00040500000000000003,
"loss": 1.8754,
"step": 190
},
{
"epoch": 0.2,
"learning_rate": 0.0004025,
"loss": 1.8603,
"step": 195
},
{
"epoch": 0.2,
"learning_rate": 0.0004,
"loss": 1.799,
"step": 200
},
{
"epoch": 0.2,
"learning_rate": 0.0003975,
"loss": 1.8652,
"step": 205
},
{
"epoch": 0.21,
"learning_rate": 0.000395,
"loss": 1.8406,
"step": 210
},
{
"epoch": 0.21,
"learning_rate": 0.0003925,
"loss": 1.8341,
"step": 215
},
{
"epoch": 0.22,
"learning_rate": 0.00039000000000000005,
"loss": 1.9399,
"step": 220
},
{
"epoch": 0.23,
"learning_rate": 0.00038750000000000004,
"loss": 1.8095,
"step": 225
},
{
"epoch": 0.23,
"learning_rate": 0.00038500000000000003,
"loss": 1.8286,
"step": 230
},
{
"epoch": 0.23,
"learning_rate": 0.00038250000000000003,
"loss": 1.8846,
"step": 235
},
{
"epoch": 0.24,
"learning_rate": 0.00038,
"loss": 1.8101,
"step": 240
},
{
"epoch": 0.24,
"learning_rate": 0.0003775,
"loss": 1.8791,
"step": 245
},
{
"epoch": 0.25,
"learning_rate": 0.000375,
"loss": 1.8181,
"step": 250
},
{
"epoch": 0.26,
"learning_rate": 0.0003725,
"loss": 1.8555,
"step": 255
},
{
"epoch": 0.26,
"learning_rate": 0.00037,
"loss": 1.8328,
"step": 260
},
{
"epoch": 0.27,
"learning_rate": 0.0003675,
"loss": 1.814,
"step": 265
},
{
"epoch": 0.27,
"learning_rate": 0.000365,
"loss": 1.8647,
"step": 270
},
{
"epoch": 0.28,
"learning_rate": 0.0003625,
"loss": 1.8754,
"step": 275
},
{
"epoch": 0.28,
"learning_rate": 0.00035999999999999997,
"loss": 1.8184,
"step": 280
},
{
"epoch": 0.28,
"learning_rate": 0.0003575,
"loss": 1.8879,
"step": 285
},
{
"epoch": 0.29,
"learning_rate": 0.000355,
"loss": 1.8329,
"step": 290
},
{
"epoch": 0.29,
"learning_rate": 0.0003525,
"loss": 1.7787,
"step": 295
},
{
"epoch": 0.3,
"learning_rate": 0.00035,
"loss": 1.7543,
"step": 300
},
{
"epoch": 0.3,
"learning_rate": 0.0003475,
"loss": 1.7782,
"step": 305
},
{
"epoch": 0.31,
"learning_rate": 0.000345,
"loss": 1.8857,
"step": 310
},
{
"epoch": 0.32,
"learning_rate": 0.00034250000000000003,
"loss": 1.7608,
"step": 315
},
{
"epoch": 0.32,
"learning_rate": 0.00034,
"loss": 1.8622,
"step": 320
},
{
"epoch": 0.33,
"learning_rate": 0.0003375,
"loss": 1.7055,
"step": 325
},
{
"epoch": 0.33,
"learning_rate": 0.000335,
"loss": 1.7356,
"step": 330
},
{
"epoch": 0.34,
"learning_rate": 0.0003325,
"loss": 1.8353,
"step": 335
},
{
"epoch": 0.34,
"learning_rate": 0.00033,
"loss": 1.7389,
"step": 340
},
{
"epoch": 0.34,
"learning_rate": 0.00032750000000000005,
"loss": 1.8115,
"step": 345
},
{
"epoch": 0.35,
"learning_rate": 0.00032500000000000004,
"loss": 1.7303,
"step": 350
},
{
"epoch": 0.35,
"learning_rate": 0.00032250000000000003,
"loss": 1.7603,
"step": 355
},
{
"epoch": 0.36,
"learning_rate": 0.00032,
"loss": 1.7925,
"step": 360
},
{
"epoch": 0.36,
"learning_rate": 0.0003175,
"loss": 1.806,
"step": 365
},
{
"epoch": 0.37,
"learning_rate": 0.000315,
"loss": 1.8047,
"step": 370
},
{
"epoch": 0.38,
"learning_rate": 0.0003125,
"loss": 1.7939,
"step": 375
},
{
"epoch": 0.38,
"learning_rate": 0.00031,
"loss": 1.7539,
"step": 380
},
{
"epoch": 0.39,
"learning_rate": 0.0003075,
"loss": 1.7817,
"step": 385
},
{
"epoch": 0.39,
"learning_rate": 0.000305,
"loss": 1.7652,
"step": 390
},
{
"epoch": 0.4,
"learning_rate": 0.0003025,
"loss": 1.757,
"step": 395
},
{
"epoch": 0.4,
"learning_rate": 0.0003,
"loss": 1.7845,
"step": 400
},
{
"epoch": 0.41,
"learning_rate": 0.00029749999999999997,
"loss": 1.7701,
"step": 405
},
{
"epoch": 0.41,
"learning_rate": 0.000295,
"loss": 1.7759,
"step": 410
},
{
"epoch": 0.41,
"learning_rate": 0.0002925,
"loss": 1.697,
"step": 415
},
{
"epoch": 0.42,
"learning_rate": 0.00029,
"loss": 1.7623,
"step": 420
},
{
"epoch": 0.42,
"learning_rate": 0.0002875,
"loss": 1.7926,
"step": 425
},
{
"epoch": 0.43,
"learning_rate": 0.000285,
"loss": 1.8367,
"step": 430
},
{
"epoch": 0.43,
"learning_rate": 0.0002825,
"loss": 1.764,
"step": 435
},
{
"epoch": 0.44,
"learning_rate": 0.00028000000000000003,
"loss": 1.7322,
"step": 440
},
{
"epoch": 0.45,
"learning_rate": 0.0002775,
"loss": 1.7723,
"step": 445
},
{
"epoch": 0.45,
"learning_rate": 0.000275,
"loss": 1.7971,
"step": 450
},
{
"epoch": 0.46,
"learning_rate": 0.0002725,
"loss": 1.7938,
"step": 455
},
{
"epoch": 0.46,
"learning_rate": 0.00027,
"loss": 1.8143,
"step": 460
},
{
"epoch": 0.47,
"learning_rate": 0.0002675,
"loss": 1.735,
"step": 465
},
{
"epoch": 0.47,
"learning_rate": 0.00026500000000000004,
"loss": 1.7571,
"step": 470
},
{
"epoch": 0.47,
"learning_rate": 0.00026250000000000004,
"loss": 1.7636,
"step": 475
},
{
"epoch": 0.48,
"learning_rate": 0.00026000000000000003,
"loss": 1.7344,
"step": 480
},
{
"epoch": 0.48,
"learning_rate": 0.0002575,
"loss": 1.7156,
"step": 485
},
{
"epoch": 0.49,
"learning_rate": 0.000255,
"loss": 1.6996,
"step": 490
},
{
"epoch": 0.49,
"learning_rate": 0.0002525,
"loss": 1.7917,
"step": 495
},
{
"epoch": 0.5,
"learning_rate": 0.00025,
"loss": 1.7578,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 1.31426122150656e+16,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}