{
  "best_metric": 1.349927544593811,
  "best_model_checkpoint": "checkpoints/sft_2_1_1/checkpoint-2555",
  "epoch": 7.0,
  "eval_steps": 500,
  "global_step": 2555,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1506849315068493,
      "grad_norm": 18.4865665435791,
      "learning_rate": 5.018248175182482e-07,
      "loss": 2.5927,
      "step": 55
    },
    {
      "epoch": 0.3013698630136986,
      "grad_norm": 16.606660842895508,
      "learning_rate": 1.0036496350364965e-06,
      "loss": 2.3833,
      "step": 110
    },
    {
      "epoch": 0.4520547945205479,
      "grad_norm": 6.788235187530518,
      "learning_rate": 1.5054744525547446e-06,
      "loss": 1.8868,
      "step": 165
    },
    {
      "epoch": 0.6027397260273972,
      "grad_norm": 3.3164093494415283,
      "learning_rate": 2.007299270072993e-06,
      "loss": 1.5665,
      "step": 220
    },
    {
      "epoch": 0.7534246575342466,
      "grad_norm": 3.4226760864257812,
      "learning_rate": 2.509124087591241e-06,
      "loss": 1.4994,
      "step": 275
    },
    {
      "epoch": 0.9041095890410958,
      "grad_norm": 3.687007427215576,
      "learning_rate": 3.0109489051094893e-06,
      "loss": 1.4708,
      "step": 330
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.4502822160720825,
      "eval_runtime": 41.7137,
      "eval_samples_per_second": 23.973,
      "eval_steps_per_second": 2.997,
      "step": 365
    },
    {
      "epoch": 1.0547945205479452,
      "grad_norm": 3.667193651199341,
      "learning_rate": 3.5127737226277376e-06,
      "loss": 1.4589,
      "step": 385
    },
    {
      "epoch": 1.2054794520547945,
      "grad_norm": 3.444368362426758,
      "learning_rate": 4.014598540145986e-06,
      "loss": 1.4383,
      "step": 440
    },
    {
      "epoch": 1.356164383561644,
      "grad_norm": 3.4761803150177,
      "learning_rate": 4.516423357664234e-06,
      "loss": 1.4421,
      "step": 495
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 3.8773984909057617,
      "learning_rate": 4.9999979671535945e-06,
      "loss": 1.4388,
      "step": 550
    },
    {
      "epoch": 1.6575342465753424,
      "grad_norm": 3.5462825298309326,
      "learning_rate": 4.998349002034396e-06,
      "loss": 1.4198,
      "step": 605
    },
    {
      "epoch": 1.808219178082192,
      "grad_norm": 3.9237027168273926,
      "learning_rate": 4.993627701726671e-06,
      "loss": 1.4052,
      "step": 660
    },
    {
      "epoch": 1.958904109589041,
      "grad_norm": 3.995187997817993,
      "learning_rate": 4.9858398722315225e-06,
      "loss": 1.4121,
      "step": 715
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.4027259349822998,
      "eval_runtime": 41.7142,
      "eval_samples_per_second": 23.973,
      "eval_steps_per_second": 2.997,
      "step": 730
    },
    {
      "epoch": 2.1095890410958904,
      "grad_norm": 3.973104238510132,
      "learning_rate": 4.974995090602673e-06,
      "loss": 1.4018,
      "step": 770
    },
    {
      "epoch": 2.26027397260274,
      "grad_norm": 4.114542484283447,
      "learning_rate": 4.9611066931691045e-06,
      "loss": 1.3977,
      "step": 825
    },
    {
      "epoch": 2.410958904109589,
      "grad_norm": 4.350598335266113,
      "learning_rate": 4.94419175913477e-06,
      "loss": 1.3778,
      "step": 880
    },
    {
      "epoch": 2.5616438356164384,
      "grad_norm": 3.951005697250366,
      "learning_rate": 4.9242710895755e-06,
      "loss": 1.372,
      "step": 935
    },
    {
      "epoch": 2.712328767123288,
      "grad_norm": 4.071479797363281,
      "learning_rate": 4.9013691818589635e-06,
      "loss": 1.3826,
      "step": 990
    },
    {
      "epoch": 2.863013698630137,
      "grad_norm": 3.968268632888794,
      "learning_rate": 4.87551419951912e-06,
      "loss": 1.3845,
      "step": 1045
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.3834009170532227,
      "eval_runtime": 41.778,
      "eval_samples_per_second": 23.936,
      "eval_steps_per_second": 2.992,
      "step": 1095
    },
    {
      "epoch": 3.0136986301369864,
      "grad_norm": 4.093992233276367,
      "learning_rate": 4.8467379376222215e-06,
      "loss": 1.3736,
      "step": 1100
    },
    {
      "epoch": 3.1643835616438354,
      "grad_norm": 4.021303176879883,
      "learning_rate": 4.815075783666952e-06,
      "loss": 1.3547,
      "step": 1155
    },
    {
      "epoch": 3.315068493150685,
      "grad_norm": 4.797937393188477,
      "learning_rate": 4.780566674066782e-06,
      "loss": 1.3671,
      "step": 1210
    },
    {
      "epoch": 3.4657534246575343,
      "grad_norm": 4.535392761230469,
      "learning_rate": 4.743253046268069e-06,
      "loss": 1.3545,
      "step": 1265
    },
    {
      "epoch": 3.616438356164384,
      "grad_norm": 4.504812717437744,
      "learning_rate": 4.703180786562761e-06,
      "loss": 1.3623,
      "step": 1320
    },
    {
      "epoch": 3.767123287671233,
      "grad_norm": 4.607705116271973,
      "learning_rate": 4.660399173659908e-06,
      "loss": 1.3487,
      "step": 1375
    },
    {
      "epoch": 3.9178082191780823,
      "grad_norm": 4.659298896789551,
      "learning_rate": 4.6149608180853545e-06,
      "loss": 1.3502,
      "step": 1430
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.3703773021697998,
      "eval_runtime": 41.7996,
      "eval_samples_per_second": 23.924,
      "eval_steps_per_second": 2.99,
      "step": 1460
    },
    {
      "epoch": 4.068493150684931,
      "grad_norm": 4.691000461578369,
      "learning_rate": 4.566921597484149e-06,
      "loss": 1.3453,
      "step": 1485
    },
    {
      "epoch": 4.219178082191781,
      "grad_norm": 4.80633020401001,
      "learning_rate": 4.51634058790522e-06,
      "loss": 1.3329,
      "step": 1540
    },
    {
      "epoch": 4.36986301369863,
      "grad_norm": 5.040696144104004,
      "learning_rate": 4.463279991152828e-06,
      "loss": 1.3329,
      "step": 1595
    },
    {
      "epoch": 4.52054794520548,
      "grad_norm": 5.084527015686035,
      "learning_rate": 4.407805058294135e-06,
      "loss": 1.3453,
      "step": 1650
    },
    {
      "epoch": 4.671232876712329,
      "grad_norm": 5.078038692474365,
      "learning_rate": 4.349984009416952e-06,
      "loss": 1.3266,
      "step": 1705
    },
    {
      "epoch": 4.821917808219178,
      "grad_norm": 5.201215744018555,
      "learning_rate": 4.289887949736347e-06,
      "loss": 1.3281,
      "step": 1760
    },
    {
      "epoch": 4.972602739726027,
      "grad_norm": 4.974658966064453,
      "learning_rate": 4.227590782153277e-06,
      "loss": 1.3168,
      "step": 1815
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.3636702299118042,
      "eval_runtime": 41.8147,
      "eval_samples_per_second": 23.915,
      "eval_steps_per_second": 2.989,
      "step": 1825
    },
    {
      "epoch": 5.123287671232877,
      "grad_norm": 5.115445137023926,
      "learning_rate": 4.16316911637277e-06,
      "loss": 1.3135,
      "step": 1870
    },
    {
      "epoch": 5.273972602739726,
      "grad_norm": 5.82274055480957,
      "learning_rate": 4.0967021746934436e-06,
      "loss": 1.3107,
      "step": 1925
    },
    {
      "epoch": 5.424657534246576,
      "grad_norm": 5.606359481811523,
      "learning_rate": 4.02827169458417e-06,
      "loss": 1.301,
      "step": 1980
    },
    {
      "epoch": 5.575342465753424,
      "grad_norm": 5.442434787750244,
      "learning_rate": 3.957961828167748e-06,
      "loss": 1.3171,
      "step": 2035
    },
    {
      "epoch": 5.726027397260274,
      "grad_norm": 5.444327354431152,
      "learning_rate": 3.885859038735141e-06,
      "loss": 1.3045,
      "step": 2090
    },
    {
      "epoch": 5.876712328767123,
      "grad_norm": 5.671774864196777,
      "learning_rate": 3.8120519944175767e-06,
      "loss": 1.3036,
      "step": 2145
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.353081464767456,
      "eval_runtime": 41.6872,
      "eval_samples_per_second": 23.988,
      "eval_steps_per_second": 2.999,
      "step": 2190
    },
    {
      "epoch": 6.027397260273973,
      "grad_norm": 5.856392860412598,
      "learning_rate": 3.7366314591472484e-06,
      "loss": 1.2882,
      "step": 2200
    },
    {
      "epoch": 6.178082191780822,
      "grad_norm": 6.328695774078369,
      "learning_rate": 3.659690181040717e-06,
      "loss": 1.2881,
      "step": 2255
    },
    {
      "epoch": 6.328767123287671,
      "grad_norm": 6.592623710632324,
      "learning_rate": 3.5813227783422654e-06,
      "loss": 1.278,
      "step": 2310
    },
    {
      "epoch": 6.47945205479452,
      "grad_norm": 6.272197723388672,
      "learning_rate": 3.5016256230674704e-06,
      "loss": 1.2799,
      "step": 2365
    },
    {
      "epoch": 6.63013698630137,
      "grad_norm": 6.509876251220703,
      "learning_rate": 3.4206967224900885e-06,
      "loss": 1.2866,
      "step": 2420
    },
    {
      "epoch": 6.780821917808219,
      "grad_norm": 6.4894304275512695,
      "learning_rate": 3.338635598617975e-06,
      "loss": 1.2952,
      "step": 2475
    },
    {
      "epoch": 6.931506849315069,
      "grad_norm": 6.477168560028076,
      "learning_rate": 3.2555431658062837e-06,
      "loss": 1.2752,
      "step": 2530
    },
    {
      "epoch": 7.0,
      "eval_loss": 1.349927544593811,
      "eval_runtime": 41.6959,
      "eval_samples_per_second": 23.983,
      "eval_steps_per_second": 2.998,
      "step": 2555
    }
  ],
  "logging_steps": 55,
  "max_steps": 5475,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.363484660255949e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}