{
"best_metric": 1.349927544593811,
"best_model_checkpoint": "checkpoints/sft_2_1_1/checkpoint-2555",
"epoch": 7.0,
"eval_steps": 500,
"global_step": 2555,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1506849315068493,
"grad_norm": 18.4865665435791,
"learning_rate": 5.018248175182482e-07,
"loss": 2.5927,
"step": 55
},
{
"epoch": 0.3013698630136986,
"grad_norm": 16.606660842895508,
"learning_rate": 1.0036496350364965e-06,
"loss": 2.3833,
"step": 110
},
{
"epoch": 0.4520547945205479,
"grad_norm": 6.788235187530518,
"learning_rate": 1.5054744525547446e-06,
"loss": 1.8868,
"step": 165
},
{
"epoch": 0.6027397260273972,
"grad_norm": 3.3164093494415283,
"learning_rate": 2.007299270072993e-06,
"loss": 1.5665,
"step": 220
},
{
"epoch": 0.7534246575342466,
"grad_norm": 3.4226760864257812,
"learning_rate": 2.509124087591241e-06,
"loss": 1.4994,
"step": 275
},
{
"epoch": 0.9041095890410958,
"grad_norm": 3.687007427215576,
"learning_rate": 3.0109489051094893e-06,
"loss": 1.4708,
"step": 330
},
{
"epoch": 1.0,
"eval_loss": 1.4502822160720825,
"eval_runtime": 41.7137,
"eval_samples_per_second": 23.973,
"eval_steps_per_second": 2.997,
"step": 365
},
{
"epoch": 1.0547945205479452,
"grad_norm": 3.667193651199341,
"learning_rate": 3.5127737226277376e-06,
"loss": 1.4589,
"step": 385
},
{
"epoch": 1.2054794520547945,
"grad_norm": 3.444368362426758,
"learning_rate": 4.014598540145986e-06,
"loss": 1.4383,
"step": 440
},
{
"epoch": 1.356164383561644,
"grad_norm": 3.4761803150177,
"learning_rate": 4.516423357664234e-06,
"loss": 1.4421,
"step": 495
},
{
"epoch": 1.5068493150684932,
"grad_norm": 3.8773984909057617,
"learning_rate": 4.9999979671535945e-06,
"loss": 1.4388,
"step": 550
},
{
"epoch": 1.6575342465753424,
"grad_norm": 3.5462825298309326,
"learning_rate": 4.998349002034396e-06,
"loss": 1.4198,
"step": 605
},
{
"epoch": 1.808219178082192,
"grad_norm": 3.9237027168273926,
"learning_rate": 4.993627701726671e-06,
"loss": 1.4052,
"step": 660
},
{
"epoch": 1.958904109589041,
"grad_norm": 3.995187997817993,
"learning_rate": 4.9858398722315225e-06,
"loss": 1.4121,
"step": 715
},
{
"epoch": 2.0,
"eval_loss": 1.4027259349822998,
"eval_runtime": 41.7142,
"eval_samples_per_second": 23.973,
"eval_steps_per_second": 2.997,
"step": 730
},
{
"epoch": 2.1095890410958904,
"grad_norm": 3.973104238510132,
"learning_rate": 4.974995090602673e-06,
"loss": 1.4018,
"step": 770
},
{
"epoch": 2.26027397260274,
"grad_norm": 4.114542484283447,
"learning_rate": 4.9611066931691045e-06,
"loss": 1.3977,
"step": 825
},
{
"epoch": 2.410958904109589,
"grad_norm": 4.350598335266113,
"learning_rate": 4.94419175913477e-06,
"loss": 1.3778,
"step": 880
},
{
"epoch": 2.5616438356164384,
"grad_norm": 3.951005697250366,
"learning_rate": 4.9242710895755e-06,
"loss": 1.372,
"step": 935
},
{
"epoch": 2.712328767123288,
"grad_norm": 4.071479797363281,
"learning_rate": 4.9013691818589635e-06,
"loss": 1.3826,
"step": 990
},
{
"epoch": 2.863013698630137,
"grad_norm": 3.968268632888794,
"learning_rate": 4.87551419951912e-06,
"loss": 1.3845,
"step": 1045
},
{
"epoch": 3.0,
"eval_loss": 1.3834009170532227,
"eval_runtime": 41.778,
"eval_samples_per_second": 23.936,
"eval_steps_per_second": 2.992,
"step": 1095
},
{
"epoch": 3.0136986301369864,
"grad_norm": 4.093992233276367,
"learning_rate": 4.8467379376222215e-06,
"loss": 1.3736,
"step": 1100
},
{
"epoch": 3.1643835616438354,
"grad_norm": 4.021303176879883,
"learning_rate": 4.815075783666952e-06,
"loss": 1.3547,
"step": 1155
},
{
"epoch": 3.315068493150685,
"grad_norm": 4.797937393188477,
"learning_rate": 4.780566674066782e-06,
"loss": 1.3671,
"step": 1210
},
{
"epoch": 3.4657534246575343,
"grad_norm": 4.535392761230469,
"learning_rate": 4.743253046268069e-06,
"loss": 1.3545,
"step": 1265
},
{
"epoch": 3.616438356164384,
"grad_norm": 4.504812717437744,
"learning_rate": 4.703180786562761e-06,
"loss": 1.3623,
"step": 1320
},
{
"epoch": 3.767123287671233,
"grad_norm": 4.607705116271973,
"learning_rate": 4.660399173659908e-06,
"loss": 1.3487,
"step": 1375
},
{
"epoch": 3.9178082191780823,
"grad_norm": 4.659298896789551,
"learning_rate": 4.6149608180853545e-06,
"loss": 1.3502,
"step": 1430
},
{
"epoch": 4.0,
"eval_loss": 1.3703773021697998,
"eval_runtime": 41.7996,
"eval_samples_per_second": 23.924,
"eval_steps_per_second": 2.99,
"step": 1460
},
{
"epoch": 4.068493150684931,
"grad_norm": 4.691000461578369,
"learning_rate": 4.566921597484149e-06,
"loss": 1.3453,
"step": 1485
},
{
"epoch": 4.219178082191781,
"grad_norm": 4.80633020401001,
"learning_rate": 4.51634058790522e-06,
"loss": 1.3329,
"step": 1540
},
{
"epoch": 4.36986301369863,
"grad_norm": 5.040696144104004,
"learning_rate": 4.463279991152828e-06,
"loss": 1.3329,
"step": 1595
},
{
"epoch": 4.52054794520548,
"grad_norm": 5.084527015686035,
"learning_rate": 4.407805058294135e-06,
"loss": 1.3453,
"step": 1650
},
{
"epoch": 4.671232876712329,
"grad_norm": 5.078038692474365,
"learning_rate": 4.349984009416952e-06,
"loss": 1.3266,
"step": 1705
},
{
"epoch": 4.821917808219178,
"grad_norm": 5.201215744018555,
"learning_rate": 4.289887949736347e-06,
"loss": 1.3281,
"step": 1760
},
{
"epoch": 4.972602739726027,
"grad_norm": 4.974658966064453,
"learning_rate": 4.227590782153277e-06,
"loss": 1.3168,
"step": 1815
},
{
"epoch": 5.0,
"eval_loss": 1.3636702299118042,
"eval_runtime": 41.8147,
"eval_samples_per_second": 23.915,
"eval_steps_per_second": 2.989,
"step": 1825
},
{
"epoch": 5.123287671232877,
"grad_norm": 5.115445137023926,
"learning_rate": 4.16316911637277e-06,
"loss": 1.3135,
"step": 1870
},
{
"epoch": 5.273972602739726,
"grad_norm": 5.82274055480957,
"learning_rate": 4.0967021746934436e-06,
"loss": 1.3107,
"step": 1925
},
{
"epoch": 5.424657534246576,
"grad_norm": 5.606359481811523,
"learning_rate": 4.02827169458417e-06,
"loss": 1.301,
"step": 1980
},
{
"epoch": 5.575342465753424,
"grad_norm": 5.442434787750244,
"learning_rate": 3.957961828167748e-06,
"loss": 1.3171,
"step": 2035
},
{
"epoch": 5.726027397260274,
"grad_norm": 5.444327354431152,
"learning_rate": 3.885859038735141e-06,
"loss": 1.3045,
"step": 2090
},
{
"epoch": 5.876712328767123,
"grad_norm": 5.671774864196777,
"learning_rate": 3.8120519944175767e-06,
"loss": 1.3036,
"step": 2145
},
{
"epoch": 6.0,
"eval_loss": 1.353081464767456,
"eval_runtime": 41.6872,
"eval_samples_per_second": 23.988,
"eval_steps_per_second": 2.999,
"step": 2190
},
{
"epoch": 6.027397260273973,
"grad_norm": 5.856392860412598,
"learning_rate": 3.7366314591472484e-06,
"loss": 1.2882,
"step": 2200
},
{
"epoch": 6.178082191780822,
"grad_norm": 6.328695774078369,
"learning_rate": 3.659690181040717e-06,
"loss": 1.2881,
"step": 2255
},
{
"epoch": 6.328767123287671,
"grad_norm": 6.592623710632324,
"learning_rate": 3.5813227783422654e-06,
"loss": 1.278,
"step": 2310
},
{
"epoch": 6.47945205479452,
"grad_norm": 6.272197723388672,
"learning_rate": 3.5016256230674704e-06,
"loss": 1.2799,
"step": 2365
},
{
"epoch": 6.63013698630137,
"grad_norm": 6.509876251220703,
"learning_rate": 3.4206967224900885e-06,
"loss": 1.2866,
"step": 2420
},
{
"epoch": 6.780821917808219,
"grad_norm": 6.4894304275512695,
"learning_rate": 3.338635598617975e-06,
"loss": 1.2952,
"step": 2475
},
{
"epoch": 6.931506849315069,
"grad_norm": 6.477168560028076,
"learning_rate": 3.2555431658062837e-06,
"loss": 1.2752,
"step": 2530
},
{
"epoch": 7.0,
"eval_loss": 1.349927544593811,
"eval_runtime": 41.6959,
"eval_samples_per_second": 23.983,
"eval_steps_per_second": 2.998,
"step": 2555
}
],
"logging_steps": 55,
"max_steps": 5475,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.363484660255949e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}