|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0181268882175227, |
|
"eval_steps": 21, |
|
"global_step": 249, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.012084592145015106, |
|
"eval_loss": 10.387871742248535, |
|
"eval_runtime": 0.2615, |
|
"eval_samples_per_second": 535.336, |
|
"eval_steps_per_second": 68.829, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03625377643504532, |
|
"grad_norm": 0.13119755685329437, |
|
"learning_rate": 3e-05, |
|
"loss": 10.3886, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.07250755287009064, |
|
"grad_norm": 0.10730081051588058, |
|
"learning_rate": 6e-05, |
|
"loss": 10.3847, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.10876132930513595, |
|
"grad_norm": 0.09705545753240585, |
|
"learning_rate": 9e-05, |
|
"loss": 10.3871, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.14501510574018128, |
|
"grad_norm": 0.12139848619699478, |
|
"learning_rate": 9.998272257842641e-05, |
|
"loss": 10.3874, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.18126888217522658, |
|
"grad_norm": 0.11973798274993896, |
|
"learning_rate": 9.989204876292688e-05, |
|
"loss": 10.3781, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2175226586102719, |
|
"grad_norm": 0.13775140047073364, |
|
"learning_rate": 9.972379999624936e-05, |
|
"loss": 10.3777, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2537764350453172, |
|
"grad_norm": 0.12977235019207, |
|
"learning_rate": 9.947823788099753e-05, |
|
"loss": 10.3796, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2537764350453172, |
|
"eval_loss": 10.373146057128906, |
|
"eval_runtime": 0.2637, |
|
"eval_samples_per_second": 530.958, |
|
"eval_steps_per_second": 68.266, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.29003021148036257, |
|
"grad_norm": 0.16504716873168945, |
|
"learning_rate": 9.91557442308987e-05, |
|
"loss": 10.3703, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.32628398791540786, |
|
"grad_norm": 0.16905713081359863, |
|
"learning_rate": 9.875682047713846e-05, |
|
"loss": 10.3718, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.36253776435045315, |
|
"grad_norm": 0.18219968676567078, |
|
"learning_rate": 9.828208688870735e-05, |
|
"loss": 10.3611, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3987915407854985, |
|
"grad_norm": 0.2741997241973877, |
|
"learning_rate": 9.773228160797188e-05, |
|
"loss": 10.3599, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.4350453172205438, |
|
"grad_norm": 0.20199109613895416, |
|
"learning_rate": 9.71082595029695e-05, |
|
"loss": 10.3536, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.47129909365558914, |
|
"grad_norm": 0.16347676515579224, |
|
"learning_rate": 9.64109908382119e-05, |
|
"loss": 10.3424, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5075528700906344, |
|
"grad_norm": 0.15472479164600372, |
|
"learning_rate": 9.564155976606339e-05, |
|
"loss": 10.3365, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5075528700906344, |
|
"eval_loss": 10.338454246520996, |
|
"eval_runtime": 0.2615, |
|
"eval_samples_per_second": 535.377, |
|
"eval_steps_per_second": 68.834, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5438066465256798, |
|
"grad_norm": 0.15116359293460846, |
|
"learning_rate": 9.480116264104011e-05, |
|
"loss": 10.3382, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5800604229607251, |
|
"grad_norm": 0.20655445754528046, |
|
"learning_rate": 9.389110615965102e-05, |
|
"loss": 10.3347, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6163141993957704, |
|
"grad_norm": 0.15545906126499176, |
|
"learning_rate": 9.291280532867302e-05, |
|
"loss": 10.3275, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.6525679758308157, |
|
"grad_norm": 0.189162015914917, |
|
"learning_rate": 9.186778126501916e-05, |
|
"loss": 10.3294, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.6888217522658611, |
|
"grad_norm": 0.21338708698749542, |
|
"learning_rate": 9.075765883062093e-05, |
|
"loss": 10.3236, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.7250755287009063, |
|
"grad_norm": 0.23534299433231354, |
|
"learning_rate": 8.958416410600187e-05, |
|
"loss": 10.3183, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.7613293051359517, |
|
"grad_norm": 0.2692500054836273, |
|
"learning_rate": 8.834912170647101e-05, |
|
"loss": 10.3116, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.7613293051359517, |
|
"eval_loss": 10.310102462768555, |
|
"eval_runtime": 0.2628, |
|
"eval_samples_per_second": 532.785, |
|
"eval_steps_per_second": 68.501, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.797583081570997, |
|
"grad_norm": 0.2844769358634949, |
|
"learning_rate": 8.705445194510868e-05, |
|
"loss": 10.3075, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.8338368580060423, |
|
"grad_norm": 0.2514300048351288, |
|
"learning_rate": 8.570216784695637e-05, |
|
"loss": 10.3049, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.8700906344410876, |
|
"grad_norm": 0.24744316935539246, |
|
"learning_rate": 8.429437201905254e-05, |
|
"loss": 10.295, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.9063444108761329, |
|
"grad_norm": 0.21623125672340393, |
|
"learning_rate": 8.283325338118153e-05, |
|
"loss": 10.2903, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.9425981873111783, |
|
"grad_norm": 0.21527834236621857, |
|
"learning_rate": 8.132108376241849e-05, |
|
"loss": 10.2817, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.9788519637462235, |
|
"grad_norm": 0.2678958475589752, |
|
"learning_rate": 7.97602143687623e-05, |
|
"loss": 10.2804, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.0181268882175227, |
|
"grad_norm": 0.20537346601486206, |
|
"learning_rate": 7.815307212734888e-05, |
|
"loss": 11.7642, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.0181268882175227, |
|
"eval_loss": 10.264846801757812, |
|
"eval_runtime": 0.2605, |
|
"eval_samples_per_second": 537.448, |
|
"eval_steps_per_second": 69.1, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.054380664652568, |
|
"grad_norm": 0.24083495140075684, |
|
"learning_rate": 7.650215591292888e-05, |
|
"loss": 10.8142, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.0906344410876132, |
|
"grad_norm": 0.20766647160053253, |
|
"learning_rate": 7.481003266247744e-05, |
|
"loss": 10.0981, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1268882175226587, |
|
"grad_norm": 0.19924764335155487, |
|
"learning_rate": 7.307933338397667e-05, |
|
"loss": 10.1149, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.163141993957704, |
|
"grad_norm": 0.2571873664855957, |
|
"learning_rate": 7.131274906557725e-05, |
|
"loss": 10.134, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.1993957703927491, |
|
"grad_norm": 0.20171616971492767, |
|
"learning_rate": 6.95130264914993e-05, |
|
"loss": 10.2961, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.2356495468277946, |
|
"grad_norm": 0.2096317708492279, |
|
"learning_rate": 6.768296397117848e-05, |
|
"loss": 10.2312, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.2719033232628398, |
|
"grad_norm": 0.28320643305778503, |
|
"learning_rate": 6.582540698829781e-05, |
|
"loss": 10.2853, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.2719033232628398, |
|
"eval_loss": 10.228970527648926, |
|
"eval_runtime": 0.2716, |
|
"eval_samples_per_second": 515.557, |
|
"eval_steps_per_second": 66.286, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.308157099697885, |
|
"grad_norm": 0.21600359678268433, |
|
"learning_rate": 6.394324377647028e-05, |
|
"loss": 10.1603, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.3444108761329305, |
|
"grad_norm": 0.24075965583324432, |
|
"learning_rate": 6.203940082845144e-05, |
|
"loss": 10.0864, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.3806646525679758, |
|
"grad_norm": 0.25287488102912903, |
|
"learning_rate": 6.011683834586473e-05, |
|
"loss": 10.6661, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.4169184290030212, |
|
"grad_norm": 0.2387695461511612, |
|
"learning_rate": 5.8178545636514145e-05, |
|
"loss": 9.6976, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.4531722054380665, |
|
"grad_norm": 0.21192365884780884, |
|
"learning_rate": 5.622753646644102e-05, |
|
"loss": 10.451, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.4894259818731117, |
|
"grad_norm": 0.18546977639198303, |
|
"learning_rate": 5.426684437395196e-05, |
|
"loss": 10.2875, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.525679758308157, |
|
"grad_norm": 0.2497938573360443, |
|
"learning_rate": 5.229951795290353e-05, |
|
"loss": 10.3627, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.525679758308157, |
|
"eval_loss": 10.205331802368164, |
|
"eval_runtime": 0.2653, |
|
"eval_samples_per_second": 527.718, |
|
"eval_steps_per_second": 67.849, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.5619335347432024, |
|
"grad_norm": 0.2541723847389221, |
|
"learning_rate": 5.032861611257783e-05, |
|
"loss": 10.2813, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.5981873111782479, |
|
"grad_norm": 0.18722322583198547, |
|
"learning_rate": 4.835720332151907e-05, |
|
"loss": 10.0301, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.634441087613293, |
|
"grad_norm": 0.2005719244480133, |
|
"learning_rate": 4.6388344842726264e-05, |
|
"loss": 9.9704, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.6706948640483383, |
|
"grad_norm": 0.22246921062469482, |
|
"learning_rate": 4.4425101967610674e-05, |
|
"loss": 10.3317, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.7069486404833838, |
|
"grad_norm": 0.16641516983509064, |
|
"learning_rate": 4.247052725612852e-05, |
|
"loss": 10.1891, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.743202416918429, |
|
"grad_norm": 0.19296815991401672, |
|
"learning_rate": 4.052765979048986e-05, |
|
"loss": 10.3081, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.7794561933534743, |
|
"grad_norm": 0.30453190207481384, |
|
"learning_rate": 3.859952044982329e-05, |
|
"loss": 10.2634, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.7794561933534743, |
|
"eval_loss": 10.188948631286621, |
|
"eval_runtime": 0.2641, |
|
"eval_samples_per_second": 530.026, |
|
"eval_steps_per_second": 68.146, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.8157099697885197, |
|
"grad_norm": 0.21797674894332886, |
|
"learning_rate": 3.668910721314402e-05, |
|
"loss": 10.4318, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.851963746223565, |
|
"grad_norm": 0.196367546916008, |
|
"learning_rate": 3.479939049792817e-05, |
|
"loss": 9.8743, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.8882175226586102, |
|
"grad_norm": 0.1838054358959198, |
|
"learning_rate": 3.293330854154136e-05, |
|
"loss": 10.1771, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.9244712990936557, |
|
"grad_norm": 0.2736557722091675, |
|
"learning_rate": 3.109376283270277e-05, |
|
"loss": 9.9654, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.960725075528701, |
|
"grad_norm": 0.19549483060836792, |
|
"learning_rate": 2.9283613600087933e-05, |
|
"loss": 10.4694, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.9969788519637461, |
|
"grad_norm": 0.28228676319122314, |
|
"learning_rate": 2.750567536508504e-05, |
|
"loss": 11.781, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.0362537764350455, |
|
"grad_norm": 0.18733060359954834, |
|
"learning_rate": 2.5762712565619528e-05, |
|
"loss": 10.1856, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.0362537764350455, |
|
"eval_loss": 10.178958892822266, |
|
"eval_runtime": 0.2596, |
|
"eval_samples_per_second": 539.382, |
|
"eval_steps_per_second": 69.349, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.0725075528700905, |
|
"grad_norm": 0.19772112369537354, |
|
"learning_rate": 2.4057435257851175e-05, |
|
"loss": 10.1846, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.108761329305136, |
|
"grad_norm": 0.29851359128952026, |
|
"learning_rate": 2.2392494902427025e-05, |
|
"loss": 10.1801, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.1450151057401814, |
|
"grad_norm": 0.21380534768104553, |
|
"learning_rate": 2.07704802418419e-05, |
|
"loss": 10.1843, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.1812688821752264, |
|
"grad_norm": 0.1674821972846985, |
|
"learning_rate": 1.9193913275316626e-05, |
|
"loss": 10.1844, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.217522658610272, |
|
"grad_norm": 0.1863589584827423, |
|
"learning_rate": 1.7665245337452368e-05, |
|
"loss": 10.18, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.2537764350453173, |
|
"grad_norm": 0.22038479149341583, |
|
"learning_rate": 1.6186853286758397e-05, |
|
"loss": 10.1813, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.290030211480363, |
|
"grad_norm": 0.17609906196594238, |
|
"learning_rate": 1.4761035809979395e-05, |
|
"loss": 10.1798, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.290030211480363, |
|
"eval_loss": 10.172968864440918, |
|
"eval_runtime": 0.2591, |
|
"eval_samples_per_second": 540.238, |
|
"eval_steps_per_second": 69.459, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.326283987915408, |
|
"grad_norm": 0.17823714017868042, |
|
"learning_rate": 1.3390009847968504e-05, |
|
"loss": 10.1773, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.3625377643504533, |
|
"grad_norm": 0.21689902245998383, |
|
"learning_rate": 1.2075907148663579e-05, |
|
"loss": 10.1772, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.3987915407854983, |
|
"grad_norm": 0.3612368106842041, |
|
"learning_rate": 1.0820770952526155e-05, |
|
"loss": 10.1826, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.4350453172205437, |
|
"grad_norm": 0.19127142429351807, |
|
"learning_rate": 9.62655281559679e-06, |
|
"loss": 10.1821, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.471299093655589, |
|
"grad_norm": 0.21965357661247253, |
|
"learning_rate": 8.49510957510633e-06, |
|
"loss": 10.1765, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.5075528700906347, |
|
"grad_norm": 0.1769980639219284, |
|
"learning_rate": 7.4282004623615396e-06, |
|
"loss": 10.1756, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.5438066465256797, |
|
"grad_norm": 0.20193351805210114, |
|
"learning_rate": 6.427484367393699e-06, |
|
"loss": 10.178, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.5438066465256797, |
|
"eval_loss": 10.170087814331055, |
|
"eval_runtime": 0.2628, |
|
"eval_samples_per_second": 532.826, |
|
"eval_steps_per_second": 68.506, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.580060422960725, |
|
"grad_norm": 0.17420655488967896, |
|
"learning_rate": 5.494517259623477e-06, |
|
"loss": 10.1712, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.61631419939577, |
|
"grad_norm": 0.23611021041870117, |
|
"learning_rate": 4.630749768552589e-06, |
|
"loss": 10.1776, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.6525679758308156, |
|
"grad_norm": 0.21432390809059143, |
|
"learning_rate": 3.837524928243774e-06, |
|
"loss": 10.1729, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.688821752265861, |
|
"grad_norm": 0.27384114265441895, |
|
"learning_rate": 3.116076089096265e-06, |
|
"loss": 10.1782, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.7250755287009065, |
|
"grad_norm": 0.18094521760940552, |
|
"learning_rate": 2.4675250001635232e-06, |
|
"loss": 10.1835, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.7613293051359515, |
|
"grad_norm": 0.25660476088523865, |
|
"learning_rate": 1.892880064994934e-06, |
|
"loss": 10.179, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.797583081570997, |
|
"grad_norm": 0.23092766106128693, |
|
"learning_rate": 1.3930347737136196e-06, |
|
"loss": 10.1815, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.797583081570997, |
|
"eval_loss": 10.169166564941406, |
|
"eval_runtime": 0.2602, |
|
"eval_samples_per_second": 538.147, |
|
"eval_steps_per_second": 69.19, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.8338368580060425, |
|
"grad_norm": 0.20488251745700836, |
|
"learning_rate": 9.687663137678604e-07, |
|
"loss": 10.175, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.8700906344410875, |
|
"grad_norm": 0.1691775619983673, |
|
"learning_rate": 6.207343615165561e-07, |
|
"loss": 10.1781, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.906344410876133, |
|
"grad_norm": 0.19150525331497192, |
|
"learning_rate": 3.494800565275125e-07, |
|
"loss": 10.184, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.9425981873111784, |
|
"grad_norm": 0.16350044310092926, |
|
"learning_rate": 1.554251601833201e-07, |
|
"loss": 10.1778, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.9788519637462234, |
|
"grad_norm": 0.16869449615478516, |
|
"learning_rate": 3.8871399903134265e-08, |
|
"loss": 10.1763, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 3.0181268882175227, |
|
"grad_norm": 0.19018259644508362, |
|
"learning_rate": 0.0, |
|
"loss": 12.1418, |
|
"step": 249 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 249, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 21, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 27776114491392.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|