BilelDJ's picture
Training in progress, step 141
565dee2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 3,
"global_step": 141,
"is_hyper_param_search": true,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06382978723404255,
"grad_norm": 134.82960510253906,
"learning_rate": 3.4524225370179614e-06,
"loss": 2.0515,
"step": 3
},
{
"epoch": 0.06382978723404255,
"eval_loss": 1.1583281755447388,
"eval_runtime": 97.0528,
"eval_samples_per_second": 3.091,
"eval_steps_per_second": 0.103,
"step": 3
},
{
"epoch": 0.1276595744680851,
"grad_norm": 141.0943145751953,
"learning_rate": 3.377369873169745e-06,
"loss": 1.5004,
"step": 6
},
{
"epoch": 0.1276595744680851,
"eval_loss": 1.1162742376327515,
"eval_runtime": 60.2202,
"eval_samples_per_second": 4.982,
"eval_steps_per_second": 0.166,
"step": 6
},
{
"epoch": 0.19148936170212766,
"grad_norm": 134.9931640625,
"learning_rate": 3.3023172093215284e-06,
"loss": 1.5627,
"step": 9
},
{
"epoch": 0.19148936170212766,
"eval_loss": 1.0802254676818848,
"eval_runtime": 41.4969,
"eval_samples_per_second": 7.229,
"eval_steps_per_second": 0.241,
"step": 9
},
{
"epoch": 0.2553191489361702,
"grad_norm": 113.67478942871094,
"learning_rate": 3.227264545473312e-06,
"loss": 1.3625,
"step": 12
},
{
"epoch": 0.2553191489361702,
"eval_loss": 1.0606831312179565,
"eval_runtime": 41.3217,
"eval_samples_per_second": 7.26,
"eval_steps_per_second": 0.242,
"step": 12
},
{
"epoch": 0.3191489361702128,
"grad_norm": 105.70118713378906,
"learning_rate": 3.152211881625095e-06,
"loss": 1.6648,
"step": 15
},
{
"epoch": 0.3191489361702128,
"eval_loss": 1.0375689268112183,
"eval_runtime": 48.7659,
"eval_samples_per_second": 6.152,
"eval_steps_per_second": 0.205,
"step": 15
},
{
"epoch": 0.3829787234042553,
"grad_norm": 96.33596801757812,
"learning_rate": 3.077159217776879e-06,
"loss": 1.3305,
"step": 18
},
{
"epoch": 0.3829787234042553,
"eval_loss": 1.0117995738983154,
"eval_runtime": 41.5605,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 0.241,
"step": 18
},
{
"epoch": 0.44680851063829785,
"grad_norm": 105.89424133300781,
"learning_rate": 3.002106553928662e-06,
"loss": 1.2195,
"step": 21
},
{
"epoch": 0.44680851063829785,
"eval_loss": 0.9878906011581421,
"eval_runtime": 44.9672,
"eval_samples_per_second": 6.672,
"eval_steps_per_second": 0.222,
"step": 21
},
{
"epoch": 0.5106382978723404,
"grad_norm": 96.90633392333984,
"learning_rate": 2.9270538900804454e-06,
"loss": 1.4076,
"step": 24
},
{
"epoch": 0.5106382978723404,
"eval_loss": 0.9666847586631775,
"eval_runtime": 52.2922,
"eval_samples_per_second": 5.737,
"eval_steps_per_second": 0.191,
"step": 24
},
{
"epoch": 0.574468085106383,
"grad_norm": 101.37423706054688,
"learning_rate": 2.852001226232229e-06,
"loss": 1.5353,
"step": 27
},
{
"epoch": 0.574468085106383,
"eval_loss": 0.9519509077072144,
"eval_runtime": 41.7527,
"eval_samples_per_second": 7.185,
"eval_steps_per_second": 0.24,
"step": 27
},
{
"epoch": 0.6382978723404256,
"grad_norm": 92.97295379638672,
"learning_rate": 2.7769485623840124e-06,
"loss": 1.2954,
"step": 30
},
{
"epoch": 0.6382978723404256,
"eval_loss": 0.937315046787262,
"eval_runtime": 41.2373,
"eval_samples_per_second": 7.275,
"eval_steps_per_second": 0.242,
"step": 30
},
{
"epoch": 0.7021276595744681,
"grad_norm": 110.28973388671875,
"learning_rate": 2.701895898535796e-06,
"loss": 1.489,
"step": 33
},
{
"epoch": 0.7021276595744681,
"eval_loss": 0.9187784790992737,
"eval_runtime": 42.0237,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 0.238,
"step": 33
},
{
"epoch": 0.7659574468085106,
"grad_norm": 93.82958984375,
"learning_rate": 2.6268432346875793e-06,
"loss": 1.5987,
"step": 36
},
{
"epoch": 0.7659574468085106,
"eval_loss": 0.9012949466705322,
"eval_runtime": 41.6757,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 0.24,
"step": 36
},
{
"epoch": 0.8297872340425532,
"grad_norm": 74.33155822753906,
"learning_rate": 2.551790570839363e-06,
"loss": 1.3207,
"step": 39
},
{
"epoch": 0.8297872340425532,
"eval_loss": 0.8868340849876404,
"eval_runtime": 41.5644,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 0.241,
"step": 39
},
{
"epoch": 0.8936170212765957,
"grad_norm": 100.4666519165039,
"learning_rate": 2.4767379069911463e-06,
"loss": 1.3121,
"step": 42
},
{
"epoch": 0.8936170212765957,
"eval_loss": 0.8769957423210144,
"eval_runtime": 51.3924,
"eval_samples_per_second": 5.837,
"eval_steps_per_second": 0.195,
"step": 42
},
{
"epoch": 0.9574468085106383,
"grad_norm": 78.03990173339844,
"learning_rate": 2.4016852431429298e-06,
"loss": 1.198,
"step": 45
},
{
"epoch": 0.9574468085106383,
"eval_loss": 0.8720031976699829,
"eval_runtime": 41.262,
"eval_samples_per_second": 7.271,
"eval_steps_per_second": 0.242,
"step": 45
},
{
"epoch": 1.0212765957446808,
"grad_norm": 48.33544158935547,
"learning_rate": 2.326632579294713e-06,
"loss": 1.0786,
"step": 48
},
{
"epoch": 1.0212765957446808,
"eval_loss": 0.8655586838722229,
"eval_runtime": 41.5498,
"eval_samples_per_second": 7.22,
"eval_steps_per_second": 0.241,
"step": 48
},
{
"epoch": 1.0851063829787233,
"grad_norm": 35.39493179321289,
"learning_rate": 2.2515799154464967e-06,
"loss": 0.7367,
"step": 51
},
{
"epoch": 1.0851063829787233,
"eval_loss": 0.8586752414703369,
"eval_runtime": 47.8744,
"eval_samples_per_second": 6.266,
"eval_steps_per_second": 0.209,
"step": 51
},
{
"epoch": 1.148936170212766,
"grad_norm": 38.954620361328125,
"learning_rate": 2.17652725159828e-06,
"loss": 0.5906,
"step": 54
},
{
"epoch": 1.148936170212766,
"eval_loss": 0.854831874370575,
"eval_runtime": 41.7553,
"eval_samples_per_second": 7.185,
"eval_steps_per_second": 0.239,
"step": 54
},
{
"epoch": 1.2127659574468086,
"grad_norm": 61.59217071533203,
"learning_rate": 2.1014745877500633e-06,
"loss": 0.794,
"step": 57
},
{
"epoch": 1.2127659574468086,
"eval_loss": 0.8518243432044983,
"eval_runtime": 41.3107,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.242,
"step": 57
},
{
"epoch": 1.2765957446808511,
"grad_norm": 43.460899353027344,
"learning_rate": 2.026421923901847e-06,
"loss": 0.6699,
"step": 60
},
{
"epoch": 1.2765957446808511,
"eval_loss": 0.849520742893219,
"eval_runtime": 41.3487,
"eval_samples_per_second": 7.255,
"eval_steps_per_second": 0.242,
"step": 60
},
{
"epoch": 1.3404255319148937,
"grad_norm": 40.51012420654297,
"learning_rate": 1.9513692600536303e-06,
"loss": 0.5351,
"step": 63
},
{
"epoch": 1.3404255319148937,
"eval_loss": 0.8519204258918762,
"eval_runtime": 41.2821,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.242,
"step": 63
},
{
"epoch": 1.4042553191489362,
"grad_norm": 55.731903076171875,
"learning_rate": 1.8763165962054137e-06,
"loss": 0.6526,
"step": 66
},
{
"epoch": 1.4042553191489362,
"eval_loss": 0.85467129945755,
"eval_runtime": 41.1797,
"eval_samples_per_second": 7.285,
"eval_steps_per_second": 0.243,
"step": 66
},
{
"epoch": 1.4680851063829787,
"grad_norm": 55.70009231567383,
"learning_rate": 1.801263932357197e-06,
"loss": 0.7753,
"step": 69
},
{
"epoch": 1.4680851063829787,
"eval_loss": 0.8556016087532043,
"eval_runtime": 41.2215,
"eval_samples_per_second": 7.278,
"eval_steps_per_second": 0.243,
"step": 69
},
{
"epoch": 1.5319148936170213,
"grad_norm": 51.552974700927734,
"learning_rate": 1.7262112685089807e-06,
"loss": 0.6996,
"step": 72
},
{
"epoch": 1.5319148936170213,
"eval_loss": 0.8541069626808167,
"eval_runtime": 41.8446,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.239,
"step": 72
},
{
"epoch": 1.5957446808510638,
"grad_norm": 61.95724105834961,
"learning_rate": 1.6511586046607642e-06,
"loss": 0.8078,
"step": 75
},
{
"epoch": 1.5957446808510638,
"eval_loss": 0.8522682785987854,
"eval_runtime": 45.1234,
"eval_samples_per_second": 6.648,
"eval_steps_per_second": 0.222,
"step": 75
},
{
"epoch": 1.6595744680851063,
"grad_norm": 51.09059524536133,
"learning_rate": 1.5761059408125475e-06,
"loss": 0.6704,
"step": 78
},
{
"epoch": 1.6595744680851063,
"eval_loss": 0.8499072194099426,
"eval_runtime": 41.6072,
"eval_samples_per_second": 7.21,
"eval_steps_per_second": 0.24,
"step": 78
},
{
"epoch": 1.7234042553191489,
"grad_norm": 57.01670455932617,
"learning_rate": 1.501053276964331e-06,
"loss": 0.5976,
"step": 81
},
{
"epoch": 1.7234042553191489,
"eval_loss": 0.8474313616752625,
"eval_runtime": 41.405,
"eval_samples_per_second": 7.246,
"eval_steps_per_second": 0.242,
"step": 81
},
{
"epoch": 1.7872340425531914,
"grad_norm": 58.575130462646484,
"learning_rate": 1.4260006131161144e-06,
"loss": 0.6588,
"step": 84
},
{
"epoch": 1.7872340425531914,
"eval_loss": 0.8449164032936096,
"eval_runtime": 41.6188,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.24,
"step": 84
},
{
"epoch": 1.851063829787234,
"grad_norm": 60.291622161865234,
"learning_rate": 1.350947949267898e-06,
"loss": 0.7081,
"step": 87
},
{
"epoch": 1.851063829787234,
"eval_loss": 0.8420330882072449,
"eval_runtime": 41.9537,
"eval_samples_per_second": 7.151,
"eval_steps_per_second": 0.238,
"step": 87
},
{
"epoch": 1.9148936170212765,
"grad_norm": 50.23271942138672,
"learning_rate": 1.2758952854196814e-06,
"loss": 0.5632,
"step": 90
},
{
"epoch": 1.9148936170212765,
"eval_loss": 0.8388514518737793,
"eval_runtime": 41.6015,
"eval_samples_per_second": 7.211,
"eval_steps_per_second": 0.24,
"step": 90
},
{
"epoch": 1.978723404255319,
"grad_norm": 52.8415412902832,
"learning_rate": 1.2008426215714649e-06,
"loss": 0.6249,
"step": 93
},
{
"epoch": 1.978723404255319,
"eval_loss": 0.8353903293609619,
"eval_runtime": 41.6241,
"eval_samples_per_second": 7.207,
"eval_steps_per_second": 0.24,
"step": 93
},
{
"epoch": 2.0425531914893615,
"grad_norm": 55.50529098510742,
"learning_rate": 1.1257899577232484e-06,
"loss": 0.612,
"step": 96
},
{
"epoch": 2.0425531914893615,
"eval_loss": 0.8322966694831848,
"eval_runtime": 41.6827,
"eval_samples_per_second": 7.197,
"eval_steps_per_second": 0.24,
"step": 96
},
{
"epoch": 2.106382978723404,
"grad_norm": 33.535972595214844,
"learning_rate": 1.0507372938750316e-06,
"loss": 0.4171,
"step": 99
},
{
"epoch": 2.106382978723404,
"eval_loss": 0.8299477100372314,
"eval_runtime": 41.6454,
"eval_samples_per_second": 7.204,
"eval_steps_per_second": 0.24,
"step": 99
},
{
"epoch": 2.1702127659574466,
"grad_norm": 33.902915954589844,
"learning_rate": 9.756846300268151e-07,
"loss": 0.3473,
"step": 102
},
{
"epoch": 2.1702127659574466,
"eval_loss": 0.8301065564155579,
"eval_runtime": 41.3799,
"eval_samples_per_second": 7.25,
"eval_steps_per_second": 0.242,
"step": 102
},
{
"epoch": 2.2340425531914896,
"grad_norm": 54.96790313720703,
"learning_rate": 9.006319661785985e-07,
"loss": 0.4751,
"step": 105
},
{
"epoch": 2.2340425531914896,
"eval_loss": 0.8314597606658936,
"eval_runtime": 41.4767,
"eval_samples_per_second": 7.233,
"eval_steps_per_second": 0.241,
"step": 105
},
{
"epoch": 2.297872340425532,
"grad_norm": 43.61612319946289,
"learning_rate": 8.255793023303821e-07,
"loss": 0.4088,
"step": 108
},
{
"epoch": 2.297872340425532,
"eval_loss": 0.8334099054336548,
"eval_runtime": 60.1786,
"eval_samples_per_second": 4.985,
"eval_steps_per_second": 0.166,
"step": 108
},
{
"epoch": 2.3617021276595747,
"grad_norm": 41.3808708190918,
"learning_rate": 7.505266384821655e-07,
"loss": 0.3174,
"step": 111
},
{
"epoch": 2.3617021276595747,
"eval_loss": 0.8363153338432312,
"eval_runtime": 41.6562,
"eval_samples_per_second": 7.202,
"eval_steps_per_second": 0.24,
"step": 111
},
{
"epoch": 2.425531914893617,
"grad_norm": 39.414100646972656,
"learning_rate": 6.75473974633949e-07,
"loss": 0.3267,
"step": 114
},
{
"epoch": 2.425531914893617,
"eval_loss": 0.8386973142623901,
"eval_runtime": 41.5441,
"eval_samples_per_second": 7.221,
"eval_steps_per_second": 0.241,
"step": 114
},
{
"epoch": 2.4893617021276597,
"grad_norm": 41.55443572998047,
"learning_rate": 6.004213107857324e-07,
"loss": 0.3142,
"step": 117
},
{
"epoch": 2.4893617021276597,
"eval_loss": 0.8412825465202332,
"eval_runtime": 41.7129,
"eval_samples_per_second": 7.192,
"eval_steps_per_second": 0.24,
"step": 117
},
{
"epoch": 2.5531914893617023,
"grad_norm": 45.54193115234375,
"learning_rate": 5.253686469375158e-07,
"loss": 0.4253,
"step": 120
},
{
"epoch": 2.5531914893617023,
"eval_loss": 0.8432453274726868,
"eval_runtime": 41.5872,
"eval_samples_per_second": 7.214,
"eval_steps_per_second": 0.24,
"step": 120
},
{
"epoch": 2.617021276595745,
"grad_norm": 58.93795394897461,
"learning_rate": 4.5031598308929925e-07,
"loss": 0.4269,
"step": 123
},
{
"epoch": 2.617021276595745,
"eval_loss": 0.8454075455665588,
"eval_runtime": 41.7045,
"eval_samples_per_second": 7.193,
"eval_steps_per_second": 0.24,
"step": 123
},
{
"epoch": 2.6808510638297873,
"grad_norm": 30.54688835144043,
"learning_rate": 3.7526331924108274e-07,
"loss": 0.2765,
"step": 126
},
{
"epoch": 2.6808510638297873,
"eval_loss": 0.8469324707984924,
"eval_runtime": 41.6321,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.24,
"step": 126
},
{
"epoch": 2.74468085106383,
"grad_norm": 46.656986236572266,
"learning_rate": 3.002106553928662e-07,
"loss": 0.3071,
"step": 129
},
{
"epoch": 2.74468085106383,
"eval_loss": 0.848118782043457,
"eval_runtime": 41.3133,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.242,
"step": 129
},
{
"epoch": 2.8085106382978724,
"grad_norm": 41.179161071777344,
"learning_rate": 2.2515799154464963e-07,
"loss": 0.3277,
"step": 132
},
{
"epoch": 2.8085106382978724,
"eval_loss": 0.8486995100975037,
"eval_runtime": 41.4208,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.241,
"step": 132
},
{
"epoch": 2.872340425531915,
"grad_norm": 35.64809036254883,
"learning_rate": 1.501053276964331e-07,
"loss": 0.3206,
"step": 135
},
{
"epoch": 2.872340425531915,
"eval_loss": 0.8487841486930847,
"eval_runtime": 41.4922,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 0.241,
"step": 135
},
{
"epoch": 2.9361702127659575,
"grad_norm": 39.29479217529297,
"learning_rate": 7.505266384821656e-08,
"loss": 0.4095,
"step": 138
},
{
"epoch": 2.9361702127659575,
"eval_loss": 0.848578929901123,
"eval_runtime": 41.4984,
"eval_samples_per_second": 7.229,
"eval_steps_per_second": 0.241,
"step": 138
},
{
"epoch": 3.0,
"grad_norm": 37.54413604736328,
"learning_rate": 0.0,
"loss": 0.2987,
"step": 141
},
{
"epoch": 3.0,
"eval_loss": 0.8485209941864014,
"eval_runtime": 41.5472,
"eval_samples_per_second": 7.221,
"eval_steps_per_second": 0.241,
"step": 141
}
],
"logging_steps": 3,
"max_steps": 141,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 523328480700102.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": {
"_wandb": {},
"assignments": {},
"decay": 0.1,
"learning_rate": 3.527475200866178e-06,
"metric": "eval/loss",
"per_device_train_batch_size": 64
}
}