{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.012697337791509713,
  "eval_steps": 6,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002539467558301943,
      "grad_norm": 2.3970961570739746,
      "learning_rate": 1e-05,
      "loss": 66.6603,
      "step": 1
    },
    {
      "epoch": 0.0002539467558301943,
      "eval_loss": 11.107478141784668,
      "eval_runtime": 18.8123,
      "eval_samples_per_second": 264.454,
      "eval_steps_per_second": 66.127,
      "step": 1
    },
    {
      "epoch": 0.0005078935116603886,
      "grad_norm": 2.4752585887908936,
      "learning_rate": 2e-05,
      "loss": 66.6705,
      "step": 2
    },
    {
      "epoch": 0.0007618402674905828,
      "grad_norm": 2.5148186683654785,
      "learning_rate": 3e-05,
      "loss": 66.6682,
      "step": 3
    },
    {
      "epoch": 0.0010157870233207772,
      "grad_norm": 2.3080971240997314,
      "learning_rate": 4e-05,
      "loss": 66.5837,
      "step": 4
    },
    {
      "epoch": 0.0012697337791509713,
      "grad_norm": 2.354560613632202,
      "learning_rate": 5e-05,
      "loss": 66.6436,
      "step": 5
    },
    {
      "epoch": 0.0015236805349811656,
      "grad_norm": 2.5473434925079346,
      "learning_rate": 6e-05,
      "loss": 66.6068,
      "step": 6
    },
    {
      "epoch": 0.0015236805349811656,
      "eval_loss": 11.101515769958496,
      "eval_runtime": 18.1722,
      "eval_samples_per_second": 273.77,
      "eval_steps_per_second": 68.456,
      "step": 6
    },
    {
      "epoch": 0.00177762729081136,
      "grad_norm": 2.5556399822235107,
      "learning_rate": 7e-05,
      "loss": 66.6045,
      "step": 7
    },
    {
      "epoch": 0.0020315740466415543,
      "grad_norm": 2.6539461612701416,
      "learning_rate": 8e-05,
      "loss": 66.6228,
      "step": 8
    },
    {
      "epoch": 0.0022855208024717484,
      "grad_norm": 2.602025032043457,
      "learning_rate": 9e-05,
      "loss": 66.4995,
      "step": 9
    },
    {
      "epoch": 0.0025394675583019426,
      "grad_norm": 2.4404804706573486,
      "learning_rate": 0.0001,
      "loss": 66.5529,
      "step": 10
    },
    {
      "epoch": 0.002793414314132137,
      "grad_norm": 2.2321112155914307,
      "learning_rate": 9.98458666866564e-05,
      "loss": 66.4916,
      "step": 11
    },
    {
      "epoch": 0.0030473610699623312,
      "grad_norm": 2.796868085861206,
      "learning_rate": 9.938441702975689e-05,
      "loss": 66.4966,
      "step": 12
    },
    {
      "epoch": 0.0030473610699623312,
      "eval_loss": 11.079522132873535,
      "eval_runtime": 18.1003,
      "eval_samples_per_second": 274.858,
      "eval_steps_per_second": 68.728,
      "step": 12
    },
    {
      "epoch": 0.0033013078257925254,
      "grad_norm": 2.591143846511841,
      "learning_rate": 9.861849601988383e-05,
      "loss": 66.4451,
      "step": 13
    },
    {
      "epoch": 0.00355525458162272,
      "grad_norm": 2.323777675628662,
      "learning_rate": 9.755282581475769e-05,
      "loss": 66.4647,
      "step": 14
    },
    {
      "epoch": 0.003809201337452914,
      "grad_norm": 2.9256598949432373,
      "learning_rate": 9.619397662556435e-05,
      "loss": 66.3829,
      "step": 15
    },
    {
      "epoch": 0.004063148093283109,
      "grad_norm": 2.4250593185424805,
      "learning_rate": 9.45503262094184e-05,
      "loss": 66.4283,
      "step": 16
    },
    {
      "epoch": 0.004317094849113303,
      "grad_norm": 2.607290029525757,
      "learning_rate": 9.263200821770461e-05,
      "loss": 66.2636,
      "step": 17
    },
    {
      "epoch": 0.004571041604943497,
      "grad_norm": 2.8356802463531494,
      "learning_rate": 9.045084971874738e-05,
      "loss": 66.2697,
      "step": 18
    },
    {
      "epoch": 0.004571041604943497,
      "eval_loss": 11.051901817321777,
      "eval_runtime": 18.1634,
      "eval_samples_per_second": 273.902,
      "eval_steps_per_second": 68.489,
      "step": 18
    },
    {
      "epoch": 0.004824988360773691,
      "grad_norm": 2.4801783561706543,
      "learning_rate": 8.802029828000156e-05,
      "loss": 66.3638,
      "step": 19
    },
    {
      "epoch": 0.005078935116603885,
      "grad_norm": 2.5052285194396973,
      "learning_rate": 8.535533905932738e-05,
      "loss": 66.2984,
      "step": 20
    },
    {
      "epoch": 0.005332881872434079,
      "grad_norm": 2.5191738605499268,
      "learning_rate": 8.247240241650918e-05,
      "loss": 66.3049,
      "step": 21
    },
    {
      "epoch": 0.005586828628264274,
      "grad_norm": 2.384291887283325,
      "learning_rate": 7.938926261462366e-05,
      "loss": 66.2243,
      "step": 22
    },
    {
      "epoch": 0.005840775384094468,
      "grad_norm": 2.3815808296203613,
      "learning_rate": 7.612492823579745e-05,
      "loss": 66.3124,
      "step": 23
    },
    {
      "epoch": 0.0060947221399246625,
      "grad_norm": 2.601182222366333,
      "learning_rate": 7.269952498697734e-05,
      "loss": 66.2157,
      "step": 24
    },
    {
      "epoch": 0.0060947221399246625,
      "eval_loss": 11.026150703430176,
      "eval_runtime": 18.1547,
      "eval_samples_per_second": 274.034,
      "eval_steps_per_second": 68.522,
      "step": 24
    },
    {
      "epoch": 0.006348668895754857,
      "grad_norm": 2.959272861480713,
      "learning_rate": 6.91341716182545e-05,
      "loss": 66.0543,
      "step": 25
    },
    {
      "epoch": 0.006602615651585051,
      "grad_norm": 2.5673084259033203,
      "learning_rate": 6.545084971874738e-05,
      "loss": 66.0834,
      "step": 26
    },
    {
      "epoch": 0.006856562407415245,
      "grad_norm": 2.866518259048462,
      "learning_rate": 6.167226819279528e-05,
      "loss": 66.044,
      "step": 27
    },
    {
      "epoch": 0.00711050916324544,
      "grad_norm": 2.417178153991699,
      "learning_rate": 5.782172325201155e-05,
      "loss": 66.1386,
      "step": 28
    },
    {
      "epoch": 0.007364455919075634,
      "grad_norm": 2.504648447036743,
      "learning_rate": 5.392295478639225e-05,
      "loss": 66.046,
      "step": 29
    },
    {
      "epoch": 0.007618402674905828,
      "grad_norm": 2.539602279663086,
      "learning_rate": 5e-05,
      "loss": 66.0429,
      "step": 30
    },
    {
      "epoch": 0.007618402674905828,
      "eval_loss": 11.006031036376953,
      "eval_runtime": 18.2105,
      "eval_samples_per_second": 273.194,
      "eval_steps_per_second": 68.312,
      "step": 30
    },
    {
      "epoch": 0.007872349430736022,
      "grad_norm": 2.809628963470459,
      "learning_rate": 4.607704521360776e-05,
      "loss": 65.9401,
      "step": 31
    },
    {
      "epoch": 0.008126296186566217,
      "grad_norm": 2.533236026763916,
      "learning_rate": 4.2178276747988446e-05,
      "loss": 66.0943,
      "step": 32
    },
    {
      "epoch": 0.00838024294239641,
      "grad_norm": 2.7511136531829834,
      "learning_rate": 3.832773180720475e-05,
      "loss": 65.9569,
      "step": 33
    },
    {
      "epoch": 0.008634189698226605,
      "grad_norm": 2.529318332672119,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 65.9841,
      "step": 34
    },
    {
      "epoch": 0.008888136454056799,
      "grad_norm": 2.60366153717041,
      "learning_rate": 3.086582838174551e-05,
      "loss": 65.9559,
      "step": 35
    },
    {
      "epoch": 0.009142083209886994,
      "grad_norm": 2.6384119987487793,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 65.9892,
      "step": 36
    },
    {
      "epoch": 0.009142083209886994,
      "eval_loss": 10.992709159851074,
      "eval_runtime": 18.2216,
      "eval_samples_per_second": 273.028,
      "eval_steps_per_second": 68.271,
      "step": 36
    },
    {
      "epoch": 0.009396029965717189,
      "grad_norm": 2.68752121925354,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 65.8811,
      "step": 37
    },
    {
      "epoch": 0.009649976721547382,
      "grad_norm": 2.529193162918091,
      "learning_rate": 2.061073738537635e-05,
      "loss": 65.97,
      "step": 38
    },
    {
      "epoch": 0.009903923477377577,
      "grad_norm": 2.684138059616089,
      "learning_rate": 1.7527597583490822e-05,
      "loss": 65.8963,
      "step": 39
    },
    {
      "epoch": 0.01015787023320777,
      "grad_norm": 2.5320022106170654,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 65.9401,
      "step": 40
    },
    {
      "epoch": 0.010411816989037965,
      "grad_norm": 2.516324281692505,
      "learning_rate": 1.1979701719998453e-05,
      "loss": 65.9055,
      "step": 41
    },
    {
      "epoch": 0.010665763744868158,
      "grad_norm": 2.543747901916504,
      "learning_rate": 9.549150281252633e-06,
      "loss": 65.8852,
      "step": 42
    },
    {
      "epoch": 0.010665763744868158,
      "eval_loss": 10.98641300201416,
      "eval_runtime": 18.1929,
      "eval_samples_per_second": 273.459,
      "eval_steps_per_second": 68.378,
      "step": 42
    },
    {
      "epoch": 0.010919710500698353,
      "grad_norm": 2.3728582859039307,
      "learning_rate": 7.367991782295391e-06,
      "loss": 66.1291,
      "step": 43
    },
    {
      "epoch": 0.011173657256528548,
      "grad_norm": 2.4906511306762695,
      "learning_rate": 5.449673790581611e-06,
      "loss": 65.9421,
      "step": 44
    },
    {
      "epoch": 0.011427604012358742,
      "grad_norm": 2.665292978286743,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 65.9262,
      "step": 45
    },
    {
      "epoch": 0.011681550768188937,
      "grad_norm": 2.6886563301086426,
      "learning_rate": 2.4471741852423237e-06,
      "loss": 65.881,
      "step": 46
    },
    {
      "epoch": 0.01193549752401913,
      "grad_norm": 2.69474196434021,
      "learning_rate": 1.3815039801161721e-06,
      "loss": 65.8305,
      "step": 47
    },
    {
      "epoch": 0.012189444279849325,
      "grad_norm": 2.3172616958618164,
      "learning_rate": 6.15582970243117e-07,
      "loss": 65.9682,
      "step": 48
    },
    {
      "epoch": 0.012189444279849325,
      "eval_loss": 10.984763145446777,
      "eval_runtime": 18.4652,
      "eval_samples_per_second": 269.425,
      "eval_steps_per_second": 67.37,
      "step": 48
    },
    {
      "epoch": 0.01244339103567952,
      "grad_norm": 2.542539119720459,
      "learning_rate": 1.5413331334360182e-07,
      "loss": 66.0153,
      "step": 49
    },
    {
      "epoch": 0.012697337791509713,
      "grad_norm": 2.630923271179199,
      "learning_rate": 0.0,
      "loss": 65.7818,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7145835724800.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}