|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.008965138752331828,
  "eval_steps": 100000000,
  "global_step": 1100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 8.15012613848348e-06,
      "grad_norm": 3.4710192680358887,
      "learning_rate": 1.0000000000000001e-07,
      "loss": 11.2605,
      "step": 1
    },
    {
      "epoch": 0.0002445037841545044,
      "grad_norm": 5.464046955108643,
      "learning_rate": 3e-06,
      "loss": 10.9977,
      "step": 30
    },
    {
      "epoch": 0.0004890075683090088,
      "grad_norm": 1.1376756429672241,
      "learning_rate": 6e-06,
      "loss": 9.4082,
      "step": 60
    },
    {
      "epoch": 0.0007335113524635132,
      "grad_norm": 1.2878344058990479,
      "learning_rate": 9e-06,
      "loss": 8.549,
      "step": 90
    },
    {
      "epoch": 0.0009780151366180175,
      "grad_norm": 1.2800588607788086,
      "learning_rate": 1.2e-05,
      "loss": 8.0361,
      "step": 120
    },
    {
      "epoch": 0.001222518920772522,
      "grad_norm": 1.4315314292907715,
      "learning_rate": 1.5e-05,
      "loss": 7.653,
      "step": 150
    },
    {
      "epoch": 0.0014670227049270264,
      "grad_norm": 1.381317377090454,
      "learning_rate": 1.8e-05,
      "loss": 7.4179,
      "step": 180
    },
    {
      "epoch": 0.0017115264890815308,
      "grad_norm": 1.7989460229873657,
      "learning_rate": 2.1e-05,
      "loss": 7.2012,
      "step": 210
    },
    {
      "epoch": 0.001956030273236035,
      "grad_norm": 1.3431414365768433,
      "learning_rate": 2.4e-05,
      "loss": 7.0383,
      "step": 240
    },
    {
      "epoch": 0.0022005340573905395,
      "grad_norm": 1.028826117515564,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 6.866,
      "step": 270
    },
    {
      "epoch": 0.002445037841545044,
      "grad_norm": 1.201025128364563,
      "learning_rate": 3e-05,
      "loss": 6.717,
      "step": 300
    },
    {
      "epoch": 0.0026895416256995483,
      "grad_norm": 1.1023098230361938,
      "learning_rate": 3.3e-05,
      "loss": 6.5535,
      "step": 330
    },
    {
      "epoch": 0.0029340454098540527,
      "grad_norm": 1.2839674949645996,
      "learning_rate": 3.6e-05,
      "loss": 6.4022,
      "step": 360
    },
    {
      "epoch": 0.003178549194008557,
      "grad_norm": 2.267265796661377,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 6.2858,
      "step": 390
    },
    {
      "epoch": 0.0034230529781630616,
      "grad_norm": 1.0635628700256348,
      "learning_rate": 4.2e-05,
      "loss": 6.1681,
      "step": 420
    },
    {
      "epoch": 0.0036675567623175656,
      "grad_norm": 1.263838768005371,
      "learning_rate": 4.5e-05,
      "loss": 6.0728,
      "step": 450
    },
    {
      "epoch": 0.00391206054647207,
      "grad_norm": 1.4611454010009766,
      "learning_rate": 4.8e-05,
      "loss": 5.972,
      "step": 480
    },
    {
      "epoch": 0.0041565643306265745,
      "grad_norm": 1.0120561122894287,
      "learning_rate": 4.999999990869806e-05,
      "loss": 5.8619,
      "step": 510
    },
    {
      "epoch": 0.004401068114781079,
      "grad_norm": 1.1349974870681763,
      "learning_rate": 4.999999853916893e-05,
      "loss": 5.785,
      "step": 540
    },
    {
      "epoch": 0.004645571898935583,
      "grad_norm": 1.0840613842010498,
      "learning_rate": 4.9999995526204936e-05,
      "loss": 5.7071,
      "step": 570
    },
    {
      "epoch": 0.004890075683090088,
      "grad_norm": 1.258074402809143,
      "learning_rate": 4.999999086980628e-05,
      "loss": 5.6199,
      "step": 600
    },
    {
      "epoch": 0.005134579467244592,
      "grad_norm": 1.284726858139038,
      "learning_rate": 4.999998456997326e-05,
      "loss": 5.5465,
      "step": 630
    },
    {
      "epoch": 0.005379083251399097,
      "grad_norm": 1.2079874277114868,
      "learning_rate": 4.999997662670628e-05,
      "loss": 5.4816,
      "step": 660
    },
    {
      "epoch": 0.005623587035553601,
      "grad_norm": 1.3364052772521973,
      "learning_rate": 4.999996704000589e-05,
      "loss": 5.4079,
      "step": 690
    },
    {
      "epoch": 0.0058680908197081055,
      "grad_norm": 0.9860705137252808,
      "learning_rate": 4.99999558098727e-05,
      "loss": 5.3598,
      "step": 720
    },
    {
      "epoch": 0.00611259460386261,
      "grad_norm": 1.2071930170059204,
      "learning_rate": 4.9999942936307445e-05,
      "loss": 5.2884,
      "step": 750
    },
    {
      "epoch": 0.006357098388017114,
      "grad_norm": 0.8959563970565796,
      "learning_rate": 4.9999928419310994e-05,
      "loss": 5.2391,
      "step": 780
    },
    {
      "epoch": 0.006601602172171619,
      "grad_norm": 1.2356096506118774,
      "learning_rate": 4.999991225888427e-05,
      "loss": 5.1879,
      "step": 810
    },
    {
      "epoch": 0.006846105956326123,
      "grad_norm": 0.9705113172531128,
      "learning_rate": 4.999989445502837e-05,
      "loss": 5.1424,
      "step": 840
    },
    {
      "epoch": 0.007090609740480627,
      "grad_norm": 0.9504437446594238,
      "learning_rate": 4.9999875007744436e-05,
      "loss": 5.0966,
      "step": 870
    },
    {
      "epoch": 0.007335113524635131,
      "grad_norm": 0.9488673806190491,
      "learning_rate": 4.9999853917033756e-05,
      "loss": 5.0424,
      "step": 900
    },
    {
      "epoch": 0.007579617308789636,
      "grad_norm": 0.959373950958252,
      "learning_rate": 4.999983118289773e-05,
      "loss": 5.0387,
      "step": 930
    },
    {
      "epoch": 0.00782412109294414,
      "grad_norm": 0.8465414643287659,
      "learning_rate": 4.999980680533782e-05,
      "loss": 4.9769,
      "step": 960
    },
    {
      "epoch": 0.008068624877098645,
      "grad_norm": 0.8328993916511536,
      "learning_rate": 4.999978078435567e-05,
      "loss": 4.9335,
      "step": 990
    },
    {
      "epoch": 0.008313128661253149,
      "grad_norm": 0.8107655644416809,
      "learning_rate": 4.999975311995295e-05,
      "loss": 4.9214,
      "step": 1020
    },
    {
      "epoch": 0.008557632445407654,
      "grad_norm": 0.8149654865264893,
      "learning_rate": 4.99997238121315e-05,
      "loss": 4.8651,
      "step": 1050
    },
    {
      "epoch": 0.008802136229562158,
      "grad_norm": 0.8837414979934692,
      "learning_rate": 4.999969286089325e-05,
      "loss": 4.8327,
      "step": 1080
    }
  ],
  "logging_steps": 30,
  "max_steps": 368091,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 1.052232627781632e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
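This layout matches the trainer_state.json that the Hugging Face transformers Trainer writes into each checkpoint directory (here apparently checkpoint-1100: global_step 1100 with save_steps 100, and one log_history record every logging_steps = 30 optimizer steps). The records show a linear warmup of 1e-7 per step up to the 5e-05 peak around step 500, then a decay so gradual it is consistent with a cosine (or similar) schedule stretched over max_steps = 368091, while the training loss falls from 11.26 at step 1 to 4.83 at step 1080.

A minimal sketch of how such a file can be inspected offline, using only the Python standard library; the path checkpoint-1100/trainer_state.json is an assumption about where this file sits, not something stated in the file itself:

import json

# Assumed location: trainer_state.json inside a Trainer checkpoint directory.
STATE_PATH = "checkpoint-1100/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

print(f"progress: step {state['global_step']}/{state['max_steps']} "
      f"(epoch {state['epoch']:.6f} of {state['num_train_epochs']})")

# One record per logging interval; eval records (none here, since eval_steps
# is effectively disabled at 100000000) would carry eval_* keys instead.
for rec in state["log_history"]:
    if "loss" in rec:
        print(f"step {rec['step']:>5}  loss {rec['loss']:7.4f}  "
              f"lr {rec['learning_rate']:.3e}  grad_norm {rec['grad_norm']:.3f}")

Reading log_history this way, rather than parsing console output, is handy because the file survives crashes and restarts: it is rewritten at every save_steps checkpoint, so the most recent checkpoint always carries the full loss curve up to that point.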
|
|