|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004975, |
|
"loss": 1.8757, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.000495, |
|
"loss": 1.8758, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004925, |
|
"loss": 1.9307, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00049, |
|
"loss": 1.9338, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004875, |
|
"loss": 1.8599, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00048499999999999997, |
|
"loss": 1.9875, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004825, |
|
"loss": 1.9947, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048, |
|
"loss": 1.9015, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004775, |
|
"loss": 1.8941, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.000475, |
|
"loss": 1.8592, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004725, |
|
"loss": 1.8977, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047, |
|
"loss": 1.886, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046750000000000003, |
|
"loss": 1.9486, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.000465, |
|
"loss": 1.8669, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004625, |
|
"loss": 1.936, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046, |
|
"loss": 1.8385, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004575, |
|
"loss": 1.8045, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.000455, |
|
"loss": 1.9058, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045250000000000005, |
|
"loss": 1.868, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 1.8055, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00044750000000000004, |
|
"loss": 1.849, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044500000000000003, |
|
"loss": 1.869, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004425, |
|
"loss": 1.8587, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044, |
|
"loss": 1.9206, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004375, |
|
"loss": 1.8406, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.000435, |
|
"loss": 1.8721, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004325, |
|
"loss": 1.9409, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043, |
|
"loss": 1.9222, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004275, |
|
"loss": 1.8705, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.000425, |
|
"loss": 1.9348, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042249999999999997, |
|
"loss": 1.8167, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042, |
|
"loss": 1.8904, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004175, |
|
"loss": 1.8545, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.000415, |
|
"loss": 1.8448, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004125, |
|
"loss": 1.8898, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041, |
|
"loss": 1.8338, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004075, |
|
"loss": 1.8246, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040500000000000003, |
|
"loss": 1.8754, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004025, |
|
"loss": 1.8603, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004, |
|
"loss": 1.799, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0003975, |
|
"loss": 1.8652, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.000395, |
|
"loss": 1.8406, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003925, |
|
"loss": 1.8341, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.00039000000000000005, |
|
"loss": 1.9399, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00038750000000000004, |
|
"loss": 1.8095, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00038500000000000003, |
|
"loss": 1.8286, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00038250000000000003, |
|
"loss": 1.8846, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.00038, |
|
"loss": 1.8101, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.0003775, |
|
"loss": 1.8791, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.000375, |
|
"loss": 1.8181, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.0003725, |
|
"loss": 1.8555, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.00037, |
|
"loss": 1.8328, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.0003675, |
|
"loss": 1.814, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.000365, |
|
"loss": 1.8647, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.0003625, |
|
"loss": 1.8754, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00035999999999999997, |
|
"loss": 1.8184, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.0003575, |
|
"loss": 1.8879, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.000355, |
|
"loss": 1.8329, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.0003525, |
|
"loss": 1.7787, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00035, |
|
"loss": 1.7543, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.0003475, |
|
"loss": 1.7782, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.000345, |
|
"loss": 1.8857, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00034250000000000003, |
|
"loss": 1.7608, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00034, |
|
"loss": 1.8622, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.0003375, |
|
"loss": 1.7055, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.000335, |
|
"loss": 1.7356, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.0003325, |
|
"loss": 1.8353, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00033, |
|
"loss": 1.7389, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.00032750000000000005, |
|
"loss": 1.8115, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.00032500000000000004, |
|
"loss": 1.7303, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.00032250000000000003, |
|
"loss": 1.7603, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.00032, |
|
"loss": 1.7925, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.0003175, |
|
"loss": 1.806, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.000315, |
|
"loss": 1.8047, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.0003125, |
|
"loss": 1.7939, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00031, |
|
"loss": 1.7539, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.0003075, |
|
"loss": 1.7817, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.000305, |
|
"loss": 1.7652, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0003025, |
|
"loss": 1.757, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.0003, |
|
"loss": 1.7845, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.00029749999999999997, |
|
"loss": 1.7701, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.000295, |
|
"loss": 1.7759, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.0002925, |
|
"loss": 1.697, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.00029, |
|
"loss": 1.7623, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.0002875, |
|
"loss": 1.7926, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.000285, |
|
"loss": 1.8367, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.0002825, |
|
"loss": 1.764, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.00028000000000000003, |
|
"loss": 1.7322, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.0002775, |
|
"loss": 1.7723, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.000275, |
|
"loss": 1.7971, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.0002725, |
|
"loss": 1.7938, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.00027, |
|
"loss": 1.8143, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.0002675, |
|
"loss": 1.735, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.00026500000000000004, |
|
"loss": 1.7571, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.00026250000000000004, |
|
"loss": 1.7636, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00026000000000000003, |
|
"loss": 1.7344, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.0002575, |
|
"loss": 1.7156, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.000255, |
|
"loss": 1.6996, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"learning_rate": 0.0002525, |
|
"loss": 1.7917, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.00025, |
|
"loss": 1.7578, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 1.31426122150656e+16, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|