Training in progress, step 249, checkpoint (commit 1c75e2c)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0181268882175227,
"eval_steps": 21,
"global_step": 249,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012084592145015106,
"eval_loss": 10.387871742248535,
"eval_runtime": 0.2615,
"eval_samples_per_second": 535.336,
"eval_steps_per_second": 68.829,
"step": 1
},
{
"epoch": 0.03625377643504532,
"grad_norm": 0.13119755685329437,
"learning_rate": 3e-05,
"loss": 10.3886,
"step": 3
},
{
"epoch": 0.07250755287009064,
"grad_norm": 0.10730081051588058,
"learning_rate": 6e-05,
"loss": 10.3847,
"step": 6
},
{
"epoch": 0.10876132930513595,
"grad_norm": 0.09705545753240585,
"learning_rate": 9e-05,
"loss": 10.3871,
"step": 9
},
{
"epoch": 0.14501510574018128,
"grad_norm": 0.12139848619699478,
"learning_rate": 9.998272257842641e-05,
"loss": 10.3874,
"step": 12
},
{
"epoch": 0.18126888217522658,
"grad_norm": 0.11973798274993896,
"learning_rate": 9.989204876292688e-05,
"loss": 10.3781,
"step": 15
},
{
"epoch": 0.2175226586102719,
"grad_norm": 0.13775140047073364,
"learning_rate": 9.972379999624936e-05,
"loss": 10.3777,
"step": 18
},
{
"epoch": 0.2537764350453172,
"grad_norm": 0.12977235019207,
"learning_rate": 9.947823788099753e-05,
"loss": 10.3796,
"step": 21
},
{
"epoch": 0.2537764350453172,
"eval_loss": 10.373146057128906,
"eval_runtime": 0.2637,
"eval_samples_per_second": 530.958,
"eval_steps_per_second": 68.266,
"step": 21
},
{
"epoch": 0.29003021148036257,
"grad_norm": 0.16504716873168945,
"learning_rate": 9.91557442308987e-05,
"loss": 10.3703,
"step": 24
},
{
"epoch": 0.32628398791540786,
"grad_norm": 0.16905713081359863,
"learning_rate": 9.875682047713846e-05,
"loss": 10.3718,
"step": 27
},
{
"epoch": 0.36253776435045315,
"grad_norm": 0.18219968676567078,
"learning_rate": 9.828208688870735e-05,
"loss": 10.3611,
"step": 30
},
{
"epoch": 0.3987915407854985,
"grad_norm": 0.2741997241973877,
"learning_rate": 9.773228160797188e-05,
"loss": 10.3599,
"step": 33
},
{
"epoch": 0.4350453172205438,
"grad_norm": 0.20199109613895416,
"learning_rate": 9.71082595029695e-05,
"loss": 10.3536,
"step": 36
},
{
"epoch": 0.47129909365558914,
"grad_norm": 0.16347676515579224,
"learning_rate": 9.64109908382119e-05,
"loss": 10.3424,
"step": 39
},
{
"epoch": 0.5075528700906344,
"grad_norm": 0.15472479164600372,
"learning_rate": 9.564155976606339e-05,
"loss": 10.3365,
"step": 42
},
{
"epoch": 0.5075528700906344,
"eval_loss": 10.338454246520996,
"eval_runtime": 0.2615,
"eval_samples_per_second": 535.377,
"eval_steps_per_second": 68.834,
"step": 42
},
{
"epoch": 0.5438066465256798,
"grad_norm": 0.15116359293460846,
"learning_rate": 9.480116264104011e-05,
"loss": 10.3382,
"step": 45
},
{
"epoch": 0.5800604229607251,
"grad_norm": 0.20655445754528046,
"learning_rate": 9.389110615965102e-05,
"loss": 10.3347,
"step": 48
},
{
"epoch": 0.6163141993957704,
"grad_norm": 0.15545906126499176,
"learning_rate": 9.291280532867302e-05,
"loss": 10.3275,
"step": 51
},
{
"epoch": 0.6525679758308157,
"grad_norm": 0.189162015914917,
"learning_rate": 9.186778126501916e-05,
"loss": 10.3294,
"step": 54
},
{
"epoch": 0.6888217522658611,
"grad_norm": 0.21338708698749542,
"learning_rate": 9.075765883062093e-05,
"loss": 10.3236,
"step": 57
},
{
"epoch": 0.7250755287009063,
"grad_norm": 0.23534299433231354,
"learning_rate": 8.958416410600187e-05,
"loss": 10.3183,
"step": 60
},
{
"epoch": 0.7613293051359517,
"grad_norm": 0.2692500054836273,
"learning_rate": 8.834912170647101e-05,
"loss": 10.3116,
"step": 63
},
{
"epoch": 0.7613293051359517,
"eval_loss": 10.310102462768555,
"eval_runtime": 0.2628,
"eval_samples_per_second": 532.785,
"eval_steps_per_second": 68.501,
"step": 63
},
{
"epoch": 0.797583081570997,
"grad_norm": 0.2844769358634949,
"learning_rate": 8.705445194510868e-05,
"loss": 10.3075,
"step": 66
},
{
"epoch": 0.8338368580060423,
"grad_norm": 0.2514300048351288,
"learning_rate": 8.570216784695637e-05,
"loss": 10.3049,
"step": 69
},
{
"epoch": 0.8700906344410876,
"grad_norm": 0.24744316935539246,
"learning_rate": 8.429437201905254e-05,
"loss": 10.295,
"step": 72
},
{
"epoch": 0.9063444108761329,
"grad_norm": 0.21623125672340393,
"learning_rate": 8.283325338118153e-05,
"loss": 10.2903,
"step": 75
},
{
"epoch": 0.9425981873111783,
"grad_norm": 0.21527834236621857,
"learning_rate": 8.132108376241849e-05,
"loss": 10.2817,
"step": 78
},
{
"epoch": 0.9788519637462235,
"grad_norm": 0.2678958475589752,
"learning_rate": 7.97602143687623e-05,
"loss": 10.2804,
"step": 81
},
{
"epoch": 1.0181268882175227,
"grad_norm": 0.20537346601486206,
"learning_rate": 7.815307212734888e-05,
"loss": 11.7642,
"step": 84
},
{
"epoch": 1.0181268882175227,
"eval_loss": 10.264846801757812,
"eval_runtime": 0.2605,
"eval_samples_per_second": 537.448,
"eval_steps_per_second": 69.1,
"step": 84
},
{
"epoch": 1.054380664652568,
"grad_norm": 0.24083495140075684,
"learning_rate": 7.650215591292888e-05,
"loss": 10.8142,
"step": 87
},
{
"epoch": 1.0906344410876132,
"grad_norm": 0.20766647160053253,
"learning_rate": 7.481003266247744e-05,
"loss": 10.0981,
"step": 90
},
{
"epoch": 1.1268882175226587,
"grad_norm": 0.19924764335155487,
"learning_rate": 7.307933338397667e-05,
"loss": 10.1149,
"step": 93
},
{
"epoch": 1.163141993957704,
"grad_norm": 0.2571873664855957,
"learning_rate": 7.131274906557725e-05,
"loss": 10.134,
"step": 96
},
{
"epoch": 1.1993957703927491,
"grad_norm": 0.20171616971492767,
"learning_rate": 6.95130264914993e-05,
"loss": 10.2961,
"step": 99
},
{
"epoch": 1.2356495468277946,
"grad_norm": 0.2096317708492279,
"learning_rate": 6.768296397117848e-05,
"loss": 10.2312,
"step": 102
},
{
"epoch": 1.2719033232628398,
"grad_norm": 0.28320643305778503,
"learning_rate": 6.582540698829781e-05,
"loss": 10.2853,
"step": 105
},
{
"epoch": 1.2719033232628398,
"eval_loss": 10.228970527648926,
"eval_runtime": 0.2716,
"eval_samples_per_second": 515.557,
"eval_steps_per_second": 66.286,
"step": 105
},
{
"epoch": 1.308157099697885,
"grad_norm": 0.21600359678268433,
"learning_rate": 6.394324377647028e-05,
"loss": 10.1603,
"step": 108
},
{
"epoch": 1.3444108761329305,
"grad_norm": 0.24075965583324432,
"learning_rate": 6.203940082845144e-05,
"loss": 10.0864,
"step": 111
},
{
"epoch": 1.3806646525679758,
"grad_norm": 0.25287488102912903,
"learning_rate": 6.011683834586473e-05,
"loss": 10.6661,
"step": 114
},
{
"epoch": 1.4169184290030212,
"grad_norm": 0.2387695461511612,
"learning_rate": 5.8178545636514145e-05,
"loss": 9.6976,
"step": 117
},
{
"epoch": 1.4531722054380665,
"grad_norm": 0.21192365884780884,
"learning_rate": 5.622753646644102e-05,
"loss": 10.451,
"step": 120
},
{
"epoch": 1.4894259818731117,
"grad_norm": 0.18546977639198303,
"learning_rate": 5.426684437395196e-05,
"loss": 10.2875,
"step": 123
},
{
"epoch": 1.525679758308157,
"grad_norm": 0.2497938573360443,
"learning_rate": 5.229951795290353e-05,
"loss": 10.3627,
"step": 126
},
{
"epoch": 1.525679758308157,
"eval_loss": 10.205331802368164,
"eval_runtime": 0.2653,
"eval_samples_per_second": 527.718,
"eval_steps_per_second": 67.849,
"step": 126
},
{
"epoch": 1.5619335347432024,
"grad_norm": 0.2541723847389221,
"learning_rate": 5.032861611257783e-05,
"loss": 10.2813,
"step": 129
},
{
"epoch": 1.5981873111782479,
"grad_norm": 0.18722322583198547,
"learning_rate": 4.835720332151907e-05,
"loss": 10.0301,
"step": 132
},
{
"epoch": 1.634441087613293,
"grad_norm": 0.2005719244480133,
"learning_rate": 4.6388344842726264e-05,
"loss": 9.9704,
"step": 135
},
{
"epoch": 1.6706948640483383,
"grad_norm": 0.22246921062469482,
"learning_rate": 4.4425101967610674e-05,
"loss": 10.3317,
"step": 138
},
{
"epoch": 1.7069486404833838,
"grad_norm": 0.16641516983509064,
"learning_rate": 4.247052725612852e-05,
"loss": 10.1891,
"step": 141
},
{
"epoch": 1.743202416918429,
"grad_norm": 0.19296815991401672,
"learning_rate": 4.052765979048986e-05,
"loss": 10.3081,
"step": 144
},
{
"epoch": 1.7794561933534743,
"grad_norm": 0.30453190207481384,
"learning_rate": 3.859952044982329e-05,
"loss": 10.2634,
"step": 147
},
{
"epoch": 1.7794561933534743,
"eval_loss": 10.188948631286621,
"eval_runtime": 0.2641,
"eval_samples_per_second": 530.026,
"eval_steps_per_second": 68.146,
"step": 147
},
{
"epoch": 1.8157099697885197,
"grad_norm": 0.21797674894332886,
"learning_rate": 3.668910721314402e-05,
"loss": 10.4318,
"step": 150
},
{
"epoch": 1.851963746223565,
"grad_norm": 0.196367546916008,
"learning_rate": 3.479939049792817e-05,
"loss": 9.8743,
"step": 153
},
{
"epoch": 1.8882175226586102,
"grad_norm": 0.1838054358959198,
"learning_rate": 3.293330854154136e-05,
"loss": 10.1771,
"step": 156
},
{
"epoch": 1.9244712990936557,
"grad_norm": 0.2736557722091675,
"learning_rate": 3.109376283270277e-05,
"loss": 9.9654,
"step": 159
},
{
"epoch": 1.960725075528701,
"grad_norm": 0.19549483060836792,
"learning_rate": 2.9283613600087933e-05,
"loss": 10.4694,
"step": 162
},
{
"epoch": 1.9969788519637461,
"grad_norm": 0.28228676319122314,
"learning_rate": 2.750567536508504e-05,
"loss": 11.781,
"step": 165
},
{
"epoch": 2.0362537764350455,
"grad_norm": 0.18733060359954834,
"learning_rate": 2.5762712565619528e-05,
"loss": 10.1856,
"step": 168
},
{
"epoch": 2.0362537764350455,
"eval_loss": 10.178958892822266,
"eval_runtime": 0.2596,
"eval_samples_per_second": 539.382,
"eval_steps_per_second": 69.349,
"step": 168
},
{
"epoch": 2.0725075528700905,
"grad_norm": 0.19772112369537354,
"learning_rate": 2.4057435257851175e-05,
"loss": 10.1846,
"step": 171
},
{
"epoch": 2.108761329305136,
"grad_norm": 0.29851359128952026,
"learning_rate": 2.2392494902427025e-05,
"loss": 10.1801,
"step": 174
},
{
"epoch": 2.1450151057401814,
"grad_norm": 0.21380534768104553,
"learning_rate": 2.07704802418419e-05,
"loss": 10.1843,
"step": 177
},
{
"epoch": 2.1812688821752264,
"grad_norm": 0.1674821972846985,
"learning_rate": 1.9193913275316626e-05,
"loss": 10.1844,
"step": 180
},
{
"epoch": 2.217522658610272,
"grad_norm": 0.1863589584827423,
"learning_rate": 1.7665245337452368e-05,
"loss": 10.18,
"step": 183
},
{
"epoch": 2.2537764350453173,
"grad_norm": 0.22038479149341583,
"learning_rate": 1.6186853286758397e-05,
"loss": 10.1813,
"step": 186
},
{
"epoch": 2.290030211480363,
"grad_norm": 0.17609906196594238,
"learning_rate": 1.4761035809979395e-05,
"loss": 10.1798,
"step": 189
},
{
"epoch": 2.290030211480363,
"eval_loss": 10.172968864440918,
"eval_runtime": 0.2591,
"eval_samples_per_second": 540.238,
"eval_steps_per_second": 69.459,
"step": 189
},
{
"epoch": 2.326283987915408,
"grad_norm": 0.17823714017868042,
"learning_rate": 1.3390009847968504e-05,
"loss": 10.1773,
"step": 192
},
{
"epoch": 2.3625377643504533,
"grad_norm": 0.21689902245998383,
"learning_rate": 1.2075907148663579e-05,
"loss": 10.1772,
"step": 195
},
{
"epoch": 2.3987915407854983,
"grad_norm": 0.3612368106842041,
"learning_rate": 1.0820770952526155e-05,
"loss": 10.1826,
"step": 198
},
{
"epoch": 2.4350453172205437,
"grad_norm": 0.19127142429351807,
"learning_rate": 9.62655281559679e-06,
"loss": 10.1821,
"step": 201
},
{
"epoch": 2.471299093655589,
"grad_norm": 0.21965357661247253,
"learning_rate": 8.49510957510633e-06,
"loss": 10.1765,
"step": 204
},
{
"epoch": 2.5075528700906347,
"grad_norm": 0.1769980639219284,
"learning_rate": 7.4282004623615396e-06,
"loss": 10.1756,
"step": 207
},
{
"epoch": 2.5438066465256797,
"grad_norm": 0.20193351805210114,
"learning_rate": 6.427484367393699e-06,
"loss": 10.178,
"step": 210
},
{
"epoch": 2.5438066465256797,
"eval_loss": 10.170087814331055,
"eval_runtime": 0.2628,
"eval_samples_per_second": 532.826,
"eval_steps_per_second": 68.506,
"step": 210
},
{
"epoch": 2.580060422960725,
"grad_norm": 0.17420655488967896,
"learning_rate": 5.494517259623477e-06,
"loss": 10.1712,
"step": 213
},
{
"epoch": 2.61631419939577,
"grad_norm": 0.23611021041870117,
"learning_rate": 4.630749768552589e-06,
"loss": 10.1776,
"step": 216
},
{
"epoch": 2.6525679758308156,
"grad_norm": 0.21432390809059143,
"learning_rate": 3.837524928243774e-06,
"loss": 10.1729,
"step": 219
},
{
"epoch": 2.688821752265861,
"grad_norm": 0.27384114265441895,
"learning_rate": 3.116076089096265e-06,
"loss": 10.1782,
"step": 222
},
{
"epoch": 2.7250755287009065,
"grad_norm": 0.18094521760940552,
"learning_rate": 2.4675250001635232e-06,
"loss": 10.1835,
"step": 225
},
{
"epoch": 2.7613293051359515,
"grad_norm": 0.25660476088523865,
"learning_rate": 1.892880064994934e-06,
"loss": 10.179,
"step": 228
},
{
"epoch": 2.797583081570997,
"grad_norm": 0.23092766106128693,
"learning_rate": 1.3930347737136196e-06,
"loss": 10.1815,
"step": 231
},
{
"epoch": 2.797583081570997,
"eval_loss": 10.169166564941406,
"eval_runtime": 0.2602,
"eval_samples_per_second": 538.147,
"eval_steps_per_second": 69.19,
"step": 231
},
{
"epoch": 2.8338368580060425,
"grad_norm": 0.20488251745700836,
"learning_rate": 9.687663137678604e-07,
"loss": 10.175,
"step": 234
},
{
"epoch": 2.8700906344410875,
"grad_norm": 0.1691775619983673,
"learning_rate": 6.207343615165561e-07,
"loss": 10.1781,
"step": 237
},
{
"epoch": 2.906344410876133,
"grad_norm": 0.19150525331497192,
"learning_rate": 3.494800565275125e-07,
"loss": 10.184,
"step": 240
},
{
"epoch": 2.9425981873111784,
"grad_norm": 0.16350044310092926,
"learning_rate": 1.554251601833201e-07,
"loss": 10.1778,
"step": 243
},
{
"epoch": 2.9788519637462234,
"grad_norm": 0.16869449615478516,
"learning_rate": 3.8871399903134265e-08,
"loss": 10.1763,
"step": 246
},
{
"epoch": 3.0181268882175227,
"grad_norm": 0.19018259644508362,
"learning_rate": 0.0,
"loss": 12.1418,
"step": 249
}
],
"logging_steps": 3,
"max_steps": 249,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 21,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 27776114491392.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
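
The JSON above is the standard trainer_state.json that the Hugging Face Transformers Trainer writes with each checkpoint: `log_history` interleaves training entries (carrying `loss`, `grad_norm`, `learning_rate`) with evaluation entries (carrying `eval_loss` and runtime stats), logged every 3 and 21 steps respectively. As a minimal sketch of how one might inspect it, the snippet below loads the file and prints the eval-loss trajectory; the local filename `trainer_state.json` is an assumption about where the file has been saved, not something defined by the checkpoint itself.

```python
# Minimal sketch: parse a Trainer checkpoint state file and separate its
# training and evaluation log entries. Assumes the file shown above has been
# downloaded locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"trained for {state['epoch']:.2f} epochs, {state['global_step']} of {state['max_steps']} steps")
for step, loss in eval_points:
    print(f"step {step:>4}: eval_loss = {loss:.4f}")
```

Run against this checkpoint, the eval losses decrease monotonically from 10.3879 at step 1 to 10.1692 at step 231, which is the kind of quick sanity check this file is typically used for.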