{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.4144271570014144,
  "eval_steps": 42,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002828854314002829,
      "eval_loss": 11.916790962219238,
      "eval_runtime": 6.6784,
      "eval_samples_per_second": 89.243,
      "eval_steps_per_second": 11.23,
      "step": 1
    },
    {
      "epoch": 0.008486562942008486,
      "grad_norm": 0.5188692212104797,
      "learning_rate": 3e-05,
      "loss": 11.92,
      "step": 3
    },
    {
      "epoch": 0.016973125884016973,
      "grad_norm": 0.48136115074157715,
      "learning_rate": 6e-05,
      "loss": 11.9198,
      "step": 6
    },
    {
      "epoch": 0.02545968882602546,
      "grad_norm": 0.5018041729927063,
      "learning_rate": 9e-05,
      "loss": 11.9077,
      "step": 9
    },
    {
      "epoch": 0.033946251768033946,
      "grad_norm": 0.6158789992332458,
      "learning_rate": 9.999588943391597e-05,
      "loss": 11.9074,
      "step": 12
    },
    {
      "epoch": 0.042432814710042434,
      "grad_norm": 0.6823853850364685,
      "learning_rate": 9.99743108100344e-05,
      "loss": 11.9042,
      "step": 15
    },
    {
      "epoch": 0.05091937765205092,
      "grad_norm": 0.8695465922355652,
      "learning_rate": 9.993424445916923e-05,
      "loss": 11.8991,
      "step": 18
    },
    {
      "epoch": 0.0594059405940594,
      "grad_norm": 0.9028195142745972,
      "learning_rate": 9.987570520365104e-05,
      "loss": 11.8853,
      "step": 21
    },
    {
      "epoch": 0.06789250353606789,
      "grad_norm": 0.9360626935958862,
      "learning_rate": 9.979871469976196e-05,
      "loss": 11.8781,
      "step": 24
    },
    {
      "epoch": 0.07637906647807638,
      "grad_norm": 0.8196120858192444,
      "learning_rate": 9.970330142972401e-05,
      "loss": 11.8677,
      "step": 27
    },
    {
      "epoch": 0.08486562942008487,
      "grad_norm": 0.7162870168685913,
      "learning_rate": 9.95895006911623e-05,
      "loss": 11.8614,
      "step": 30
    },
    {
      "epoch": 0.09335219236209336,
      "grad_norm": 0.5132609605789185,
      "learning_rate": 9.945735458404681e-05,
      "loss": 11.8498,
      "step": 33
    },
    {
      "epoch": 0.10183875530410184,
      "grad_norm": 0.4584226608276367,
      "learning_rate": 9.930691199511775e-05,
      "loss": 11.8467,
      "step": 36
    },
    {
      "epoch": 0.11032531824611033,
      "grad_norm": 0.29786884784698486,
      "learning_rate": 9.91382285798002e-05,
      "loss": 11.8511,
      "step": 39
    },
    {
      "epoch": 0.1188118811881188,
      "grad_norm": 0.2889010012149811,
      "learning_rate": 9.895136674161465e-05,
      "loss": 11.8408,
      "step": 42
    },
    {
      "epoch": 0.1188118811881188,
      "eval_loss": 11.839421272277832,
      "eval_runtime": 6.332,
      "eval_samples_per_second": 94.125,
      "eval_steps_per_second": 11.845,
      "step": 42
    },
    {
      "epoch": 0.1272984441301273,
      "grad_norm": 0.20692946016788483,
      "learning_rate": 9.874639560909117e-05,
      "loss": 11.8396,
      "step": 45
    },
    {
      "epoch": 0.13578500707213578,
      "grad_norm": 0.23685254156589508,
      "learning_rate": 9.852339101019574e-05,
      "loss": 11.8316,
      "step": 48
    },
    {
      "epoch": 0.14427157001414428,
      "grad_norm": 0.2432631552219391,
      "learning_rate": 9.828243544427796e-05,
      "loss": 11.8341,
      "step": 51
    },
    {
      "epoch": 0.15275813295615276,
      "grad_norm": 0.11745542287826538,
      "learning_rate": 9.802361805155097e-05,
      "loss": 11.8301,
      "step": 54
    },
    {
      "epoch": 0.16124469589816123,
      "grad_norm": 0.134097158908844,
      "learning_rate": 9.774703458011453e-05,
      "loss": 11.8359,
      "step": 57
    },
    {
      "epoch": 0.16973125884016974,
      "grad_norm": 0.19680051505565643,
      "learning_rate": 9.745278735053343e-05,
      "loss": 11.8378,
      "step": 60
    },
    {
      "epoch": 0.1782178217821782,
      "grad_norm": 0.10566498339176178,
      "learning_rate": 9.714098521798465e-05,
      "loss": 11.832,
      "step": 63
    },
    {
      "epoch": 0.1867043847241867,
      "grad_norm": 0.1530551165342331,
      "learning_rate": 9.681174353198687e-05,
      "loss": 11.8363,
      "step": 66
    },
    {
      "epoch": 0.19519094766619519,
      "grad_norm": 0.1929464042186737,
      "learning_rate": 9.64651840937276e-05,
      "loss": 11.8284,
      "step": 69
    },
    {
      "epoch": 0.2036775106082037,
      "grad_norm": 0.17411480844020844,
      "learning_rate": 9.610143511100354e-05,
      "loss": 11.8314,
      "step": 72
    },
    {
      "epoch": 0.21216407355021216,
      "grad_norm": 0.14971987903118134,
      "learning_rate": 9.572063115079063e-05,
      "loss": 11.832,
      "step": 75
    },
    {
      "epoch": 0.22065063649222066,
      "grad_norm": 0.18370923399925232,
      "learning_rate": 9.53229130894619e-05,
      "loss": 11.8275,
      "step": 78
    },
    {
      "epoch": 0.22913719943422914,
      "grad_norm": 0.26103201508522034,
      "learning_rate": 9.490842806067095e-05,
      "loss": 11.8278,
      "step": 81
    },
    {
      "epoch": 0.2376237623762376,
      "grad_norm": 0.21483545005321503,
      "learning_rate": 9.44773294009206e-05,
      "loss": 11.825,
      "step": 84
    },
    {
      "epoch": 0.2376237623762376,
      "eval_loss": 11.825268745422363,
      "eval_runtime": 6.402,
      "eval_samples_per_second": 93.097,
      "eval_steps_per_second": 11.715,
      "step": 84
    },
    {
      "epoch": 0.24611032531824611,
      "grad_norm": 0.28861185908317566,
      "learning_rate": 9.40297765928369e-05,
      "loss": 11.8219,
      "step": 87
    },
    {
      "epoch": 0.2545968882602546,
      "grad_norm": 0.13039568066596985,
      "learning_rate": 9.356593520616948e-05,
      "loss": 11.8245,
      "step": 90
    },
    {
      "epoch": 0.26308345120226306,
      "grad_norm": 0.1711033284664154,
      "learning_rate": 9.308597683653975e-05,
      "loss": 11.8246,
      "step": 93
    },
    {
      "epoch": 0.27157001414427157,
      "grad_norm": 0.2108013778924942,
      "learning_rate": 9.259007904196023e-05,
      "loss": 11.8228,
      "step": 96
    },
    {
      "epoch": 0.28005657708628007,
      "grad_norm": 0.1204076036810875,
      "learning_rate": 9.207842527714767e-05,
      "loss": 11.82,
      "step": 99
    },
    {
      "epoch": 0.28854314002828857,
      "grad_norm": 0.1477670669555664,
      "learning_rate": 9.155120482565521e-05,
      "loss": 11.8189,
      "step": 102
    },
    {
      "epoch": 0.297029702970297,
      "grad_norm": 0.10705884546041489,
      "learning_rate": 9.10086127298478e-05,
      "loss": 11.8169,
      "step": 105
    },
    {
      "epoch": 0.3055162659123055,
      "grad_norm": 0.29956066608428955,
      "learning_rate": 9.045084971874738e-05,
      "loss": 11.8205,
      "step": 108
    },
    {
      "epoch": 0.314002828854314,
      "grad_norm": 0.13027921319007874,
      "learning_rate": 8.987812213377424e-05,
      "loss": 11.8168,
      "step": 111
    },
    {
      "epoch": 0.32248939179632247,
      "grad_norm": 0.13590934872627258,
      "learning_rate": 8.929064185241213e-05,
      "loss": 11.819,
      "step": 114
    },
    {
      "epoch": 0.33097595473833097,
      "grad_norm": 0.09333682060241699,
      "learning_rate": 8.868862620982534e-05,
      "loss": 11.8267,
      "step": 117
    },
    {
      "epoch": 0.33946251768033947,
      "grad_norm": 0.12400602549314499,
      "learning_rate": 8.807229791845673e-05,
      "loss": 11.818,
      "step": 120
    },
    {
      "epoch": 0.347949080622348,
      "grad_norm": 0.12071343511343002,
      "learning_rate": 8.744188498563641e-05,
      "loss": 11.8166,
      "step": 123
    },
    {
      "epoch": 0.3564356435643564,
      "grad_norm": 0.1693616658449173,
      "learning_rate": 8.679762062923175e-05,
      "loss": 11.8183,
      "step": 126
    },
    {
      "epoch": 0.3564356435643564,
      "eval_loss": 11.819117546081543,
      "eval_runtime": 6.7651,
      "eval_samples_per_second": 88.1,
      "eval_steps_per_second": 11.086,
      "step": 126
    },
    {
      "epoch": 0.3649222065063649,
      "grad_norm": 0.17696824669837952,
      "learning_rate": 8.613974319136958e-05,
      "loss": 11.8171,
      "step": 129
    },
    {
      "epoch": 0.3734087694483734,
      "grad_norm": 0.11144156008958817,
      "learning_rate": 8.54684960502629e-05,
      "loss": 11.815,
      "step": 132
    },
    {
      "epoch": 0.38189533239038187,
      "grad_norm": 0.16119325160980225,
      "learning_rate": 8.478412753017433e-05,
      "loss": 11.8134,
      "step": 135
    },
    {
      "epoch": 0.39038189533239037,
      "grad_norm": 0.10234789550304413,
      "learning_rate": 8.408689080954998e-05,
      "loss": 11.8138,
      "step": 138
    },
    {
      "epoch": 0.3988684582743989,
      "grad_norm": 0.12111084908246994,
      "learning_rate": 8.33770438273574e-05,
      "loss": 11.8152,
      "step": 141
    },
    {
      "epoch": 0.4073550212164074,
      "grad_norm": 0.07516127824783325,
      "learning_rate": 8.265484918766243e-05,
      "loss": 11.8128,
      "step": 144
    },
    {
      "epoch": 0.4158415841584158,
      "grad_norm": 0.23707596957683563,
      "learning_rate": 8.192057406248028e-05,
      "loss": 11.8202,
      "step": 147
    },
    {
      "epoch": 0.4243281471004243,
      "grad_norm": 0.1228451356291771,
      "learning_rate": 8.117449009293668e-05,
      "loss": 11.8201,
      "step": 150
    },
    {
      "epoch": 0.4328147100424328,
      "grad_norm": 0.16358840465545654,
      "learning_rate": 8.041687328877567e-05,
      "loss": 11.8141,
      "step": 153
    },
    {
      "epoch": 0.44130127298444133,
      "grad_norm": 0.10192089527845383,
      "learning_rate": 7.964800392625129e-05,
      "loss": 11.8128,
      "step": 156
    },
    {
      "epoch": 0.4497878359264498,
      "grad_norm": 0.07679455727338791,
      "learning_rate": 7.886816644444098e-05,
      "loss": 11.8124,
      "step": 159
    },
    {
      "epoch": 0.4582743988684583,
      "grad_norm": 0.10075189918279648,
      "learning_rate": 7.807764934001874e-05,
      "loss": 11.8119,
      "step": 162
    },
    {
      "epoch": 0.4667609618104668,
      "grad_norm": 0.1872919499874115,
      "learning_rate": 7.727674506052743e-05,
      "loss": 11.8203,
      "step": 165
    },
    {
      "epoch": 0.4752475247524752,
      "grad_norm": 0.12166598439216614,
      "learning_rate": 7.646574989618938e-05,
      "loss": 11.8202,
      "step": 168
    },
    {
      "epoch": 0.4752475247524752,
      "eval_loss": 11.818514823913574,
      "eval_runtime": 6.4158,
      "eval_samples_per_second": 92.895,
      "eval_steps_per_second": 11.69,
      "step": 168
    },
    {
      "epoch": 0.4837340876944837,
      "grad_norm": 0.13749000430107117,
      "learning_rate": 7.564496387029532e-05,
      "loss": 11.8156,
      "step": 171
    },
    {
      "epoch": 0.49222065063649223,
      "grad_norm": 0.07802052795886993,
      "learning_rate": 7.481469062821252e-05,
      "loss": 11.8182,
      "step": 174
    },
    {
      "epoch": 0.5007072135785007,
      "grad_norm": 0.150814950466156,
      "learning_rate": 7.39752373250527e-05,
      "loss": 11.8179,
      "step": 177
    },
    {
      "epoch": 0.5091937765205092,
      "grad_norm": 0.1514790952205658,
      "learning_rate": 7.312691451204178e-05,
      "loss": 11.8099,
      "step": 180
    },
    {
      "epoch": 0.5176803394625177,
      "grad_norm": 0.13362684845924377,
      "learning_rate": 7.227003602163295e-05,
      "loss": 11.8172,
      "step": 183
    },
    {
      "epoch": 0.5261669024045261,
      "grad_norm": 0.09337490051984787,
      "learning_rate": 7.14049188514063e-05,
      "loss": 11.8184,
      "step": 186
    },
    {
      "epoch": 0.5346534653465347,
      "grad_norm": 0.08015663921833038,
      "learning_rate": 7.05318830467969e-05,
      "loss": 11.8158,
      "step": 189
    },
    {
      "epoch": 0.5431400282885431,
      "grad_norm": 0.16405069828033447,
      "learning_rate": 6.965125158269619e-05,
      "loss": 11.816,
      "step": 192
    },
    {
      "epoch": 0.5516265912305516,
      "grad_norm": 0.14057497680187225,
      "learning_rate": 6.876335024396872e-05,
      "loss": 11.8147,
      "step": 195
    },
    {
      "epoch": 0.5601131541725601,
      "grad_norm": 0.1409187614917755,
      "learning_rate": 6.786850750493006e-05,
      "loss": 11.8157,
      "step": 198
    },
    {
      "epoch": 0.5685997171145686,
      "grad_norm": 0.1987845003604889,
      "learning_rate": 6.696705440782938e-05,
      "loss": 11.8185,
      "step": 201
    },
    {
      "epoch": 0.5770862800565771,
      "grad_norm": 0.10339465737342834,
      "learning_rate": 6.605932444038229e-05,
      "loss": 11.815,
      "step": 204
    },
    {
      "epoch": 0.5855728429985856,
      "grad_norm": 0.12926329672336578,
      "learning_rate": 6.514565341239861e-05,
      "loss": 11.818,
      "step": 207
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 0.07949727028608322,
      "learning_rate": 6.422637933155162e-05,
      "loss": 11.8151,
      "step": 210
    },
    {
      "epoch": 0.594059405940594,
      "eval_loss": 11.818094253540039,
      "eval_runtime": 6.6444,
      "eval_samples_per_second": 89.7,
      "eval_steps_per_second": 11.288,
      "step": 210
    },
    {
      "epoch": 0.6025459688826026,
      "grad_norm": 0.18179267644882202,
      "learning_rate": 6.330184227833376e-05,
      "loss": 11.8178,
      "step": 213
    },
    {
      "epoch": 0.611032531824611,
      "grad_norm": 0.12221992760896683,
      "learning_rate": 6.237238428024572e-05,
      "loss": 11.8145,
      "step": 216
    },
    {
      "epoch": 0.6195190947666195,
      "grad_norm": 0.07228324562311172,
      "learning_rate": 6.143834918526527e-05,
      "loss": 11.8137,
      "step": 219
    },
    {
      "epoch": 0.628005657708628,
      "grad_norm": 0.11851081997156143,
      "learning_rate": 6.0500082534642464e-05,
      "loss": 11.8156,
      "step": 222
    },
    {
      "epoch": 0.6364922206506365,
      "grad_norm": 0.08459550887346268,
      "learning_rate": 5.955793143506863e-05,
      "loss": 11.8108,
      "step": 225
    },
    {
      "epoch": 0.6449787835926449,
      "grad_norm": 0.07968215644359589,
      "learning_rate": 5.861224443026595e-05,
      "loss": 11.8181,
      "step": 228
    },
    {
      "epoch": 0.6534653465346535,
      "grad_norm": 0.1503295600414276,
      "learning_rate": 5.766337137204579e-05,
      "loss": 11.8167,
      "step": 231
    },
    {
      "epoch": 0.6619519094766619,
      "grad_norm": 0.08652139455080032,
      "learning_rate": 5.6711663290882776e-05,
      "loss": 11.8122,
      "step": 234
    },
    {
      "epoch": 0.6704384724186704,
      "grad_norm": 0.08518210798501968,
      "learning_rate": 5.575747226605298e-05,
      "loss": 11.8164,
      "step": 237
    },
    {
      "epoch": 0.6789250353606789,
      "grad_norm": 0.12115988880395889,
      "learning_rate": 5.480115129538409e-05,
      "loss": 11.8114,
      "step": 240
    },
    {
      "epoch": 0.6874115983026874,
      "grad_norm": 0.13773323595523834,
      "learning_rate": 5.384305416466584e-05,
      "loss": 11.815,
      "step": 243
    },
    {
      "epoch": 0.695898161244696,
      "grad_norm": 0.10262436419725418,
      "learning_rate": 5.288353531676873e-05,
      "loss": 11.8163,
      "step": 246
    },
    {
      "epoch": 0.7043847241867044,
      "grad_norm": 0.12103616446256638,
      "learning_rate": 5.192294972051992e-05,
      "loss": 11.8176,
      "step": 249
    },
    {
      "epoch": 0.7128712871287128,
      "grad_norm": 0.09017164260149002,
      "learning_rate": 5.0961652739384356e-05,
      "loss": 11.8221,
      "step": 252
    },
    {
      "epoch": 0.7128712871287128,
      "eval_loss": 11.818002700805664,
      "eval_runtime": 6.3716,
      "eval_samples_per_second": 93.539,
      "eval_steps_per_second": 11.771,
      "step": 252
    },
    {
      "epoch": 0.7213578500707214,
      "grad_norm": 0.050791915506124496,
      "learning_rate": 5e-05,
      "loss": 11.8143,
      "step": 255
    },
    {
      "epoch": 0.7298444130127298,
      "grad_norm": 0.1059621125459671,
      "learning_rate": 4.903834726061565e-05,
      "loss": 11.8175,
      "step": 258
    },
    {
      "epoch": 0.7383309759547383,
      "grad_norm": 0.07910209894180298,
      "learning_rate": 4.807705027948008e-05,
      "loss": 11.8104,
      "step": 261
    },
    {
      "epoch": 0.7468175388967468,
      "grad_norm": 0.11326657980680466,
      "learning_rate": 4.711646468323129e-05,
      "loss": 11.8178,
      "step": 264
    },
    {
      "epoch": 0.7553041018387553,
      "grad_norm": 0.15818099677562714,
      "learning_rate": 4.6156945835334184e-05,
      "loss": 11.8152,
      "step": 267
    },
    {
      "epoch": 0.7637906647807637,
      "grad_norm": 0.08803381770849228,
      "learning_rate": 4.5198848704615914e-05,
      "loss": 11.8165,
      "step": 270
    },
    {
      "epoch": 0.7722772277227723,
      "grad_norm": 0.13214413821697235,
      "learning_rate": 4.424252773394704e-05,
      "loss": 11.8183,
      "step": 273
    },
    {
      "epoch": 0.7807637906647807,
      "grad_norm": 0.0714588537812233,
      "learning_rate": 4.328833670911724e-05,
      "loss": 11.8207,
      "step": 276
    },
    {
      "epoch": 0.7892503536067893,
      "grad_norm": 0.3612169325351715,
      "learning_rate": 4.23366286279542e-05,
      "loss": 11.81,
      "step": 279
    },
    {
      "epoch": 0.7977369165487977,
      "grad_norm": 0.10489798337221146,
      "learning_rate": 4.138775556973406e-05,
      "loss": 11.8155,
      "step": 282
    },
    {
      "epoch": 0.8062234794908062,
      "grad_norm": 0.07869977504014969,
      "learning_rate": 4.04420685649314e-05,
      "loss": 11.8183,
      "step": 285
    },
    {
      "epoch": 0.8147100424328148,
      "grad_norm": 0.1785990297794342,
      "learning_rate": 3.9499917465357534e-05,
      "loss": 11.8151,
      "step": 288
    },
    {
      "epoch": 0.8231966053748232,
      "grad_norm": 0.08096058666706085,
      "learning_rate": 3.856165081473474e-05,
      "loss": 11.8138,
      "step": 291
    },
    {
      "epoch": 0.8316831683168316,
      "grad_norm": 0.11269628256559372,
      "learning_rate": 3.762761571975429e-05,
      "loss": 11.8173,
      "step": 294
    },
    {
      "epoch": 0.8316831683168316,
      "eval_loss": 11.817937850952148,
      "eval_runtime": 6.604,
      "eval_samples_per_second": 90.248,
      "eval_steps_per_second": 11.357,
      "step": 294
    },
    {
      "epoch": 0.8401697312588402,
      "grad_norm": 0.11685353517532349,
      "learning_rate": 3.6698157721666246e-05,
      "loss": 11.8211,
      "step": 297
    },
    {
      "epoch": 0.8486562942008486,
      "grad_norm": 0.10467521101236343,
      "learning_rate": 3.5773620668448384e-05,
      "loss": 11.8131,
      "step": 300
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.08696059882640839,
      "learning_rate": 3.48543465876014e-05,
      "loss": 11.8151,
      "step": 303
    },
    {
      "epoch": 0.8656294200848657,
      "grad_norm": 0.21422286331653595,
      "learning_rate": 3.3940675559617724e-05,
      "loss": 11.817,
      "step": 306
    },
    {
      "epoch": 0.8741159830268741,
      "grad_norm": 0.09098684042692184,
      "learning_rate": 3.303294559217063e-05,
      "loss": 11.8177,
      "step": 309
    },
    {
      "epoch": 0.8826025459688827,
      "grad_norm": 0.11003749072551727,
      "learning_rate": 3.213149249506997e-05,
      "loss": 11.8127,
      "step": 312
    },
    {
      "epoch": 0.8910891089108911,
      "grad_norm": 0.10718018561601639,
      "learning_rate": 3.12366497560313e-05,
      "loss": 11.8152,
      "step": 315
    },
    {
      "epoch": 0.8995756718528995,
      "grad_norm": 0.058545950800180435,
      "learning_rate": 3.0348748417303823e-05,
      "loss": 11.8117,
      "step": 318
    },
    {
      "epoch": 0.9080622347949081,
      "grad_norm": 0.15917275846004486,
      "learning_rate": 2.9468116953203107e-05,
      "loss": 11.816,
      "step": 321
    },
    {
      "epoch": 0.9165487977369166,
      "grad_norm": 0.08903782814741135,
      "learning_rate": 2.8595081148593738e-05,
      "loss": 11.8119,
      "step": 324
    },
    {
      "epoch": 0.925035360678925,
      "grad_norm": 0.10488823056221008,
      "learning_rate": 2.772996397836704e-05,
      "loss": 11.8185,
      "step": 327
    },
    {
      "epoch": 0.9335219236209336,
      "grad_norm": 0.09897799789905548,
      "learning_rate": 2.687308548795825e-05,
      "loss": 11.8194,
      "step": 330
    },
    {
      "epoch": 0.942008486562942,
      "grad_norm": 0.10232014954090118,
      "learning_rate": 2.6024762674947313e-05,
      "loss": 11.8139,
      "step": 333
    },
    {
      "epoch": 0.9504950495049505,
      "grad_norm": 0.07098989188671112,
      "learning_rate": 2.5185309371787513e-05,
      "loss": 11.8159,
      "step": 336
    },
    {
      "epoch": 0.9504950495049505,
      "eval_loss": 11.817768096923828,
      "eval_runtime": 6.6321,
      "eval_samples_per_second": 89.866,
      "eval_steps_per_second": 11.309,
      "step": 336
    },
    {
      "epoch": 0.958981612446959,
      "grad_norm": 0.1141035333275795,
      "learning_rate": 2.43550361297047e-05,
      "loss": 11.8135,
      "step": 339
    },
    {
      "epoch": 0.9674681753889675,
      "grad_norm": 0.15841831266880035,
      "learning_rate": 2.353425010381063e-05,
      "loss": 11.8191,
      "step": 342
    },
    {
      "epoch": 0.9759547383309759,
      "grad_norm": 0.1034071147441864,
      "learning_rate": 2.272325493947257e-05,
      "loss": 11.8196,
      "step": 345
    },
    {
      "epoch": 0.9844413012729845,
      "grad_norm": 0.1345403790473938,
      "learning_rate": 2.192235065998126e-05,
      "loss": 11.8162,
      "step": 348
    },
    {
      "epoch": 0.9929278642149929,
      "grad_norm": 0.06921133399009705,
      "learning_rate": 2.1131833555559037e-05,
      "loss": 11.8178,
      "step": 351
    },
    {
      "epoch": 1.0014144271570014,
      "grad_norm": 0.22766686975955963,
      "learning_rate": 2.0351996073748713e-05,
      "loss": 13.7864,
      "step": 354
    },
    {
      "epoch": 1.00990099009901,
      "grad_norm": 0.15477143228054047,
      "learning_rate": 1.9583126711224343e-05,
      "loss": 11.817,
      "step": 357
    },
    {
      "epoch": 1.0183875530410185,
      "grad_norm": 0.23170539736747742,
      "learning_rate": 1.8825509907063327e-05,
      "loss": 11.8147,
      "step": 360
    },
    {
      "epoch": 1.0268741159830268,
      "grad_norm": 0.06929183006286621,
      "learning_rate": 1.807942593751973e-05,
      "loss": 11.8198,
      "step": 363
    },
    {
      "epoch": 1.0353606789250354,
      "grad_norm": 0.15376947820186615,
      "learning_rate": 1.7345150812337564e-05,
      "loss": 11.8151,
      "step": 366
    },
    {
      "epoch": 1.043847241867044,
      "grad_norm": 0.2093440443277359,
      "learning_rate": 1.66229561726426e-05,
      "loss": 11.8205,
      "step": 369
    },
    {
      "epoch": 1.0523338048090523,
      "grad_norm": 0.1123834103345871,
      "learning_rate": 1.5913109190450032e-05,
      "loss": 11.8159,
      "step": 372
    },
    {
      "epoch": 1.0608203677510608,
      "grad_norm": 0.22159625589847565,
      "learning_rate": 1.5215872469825682e-05,
      "loss": 11.8144,
      "step": 375
    },
    {
      "epoch": 1.0693069306930694,
      "grad_norm": 0.12495719641447067,
      "learning_rate": 1.4531503949737108e-05,
      "loss": 11.8175,
      "step": 378
    },
    {
      "epoch": 1.0693069306930694,
      "eval_loss": 11.817733764648438,
      "eval_runtime": 6.6332,
      "eval_samples_per_second": 89.851,
      "eval_steps_per_second": 11.307,
      "step": 378
    },
    {
      "epoch": 1.0777934936350777,
      "grad_norm": 0.12858428061008453,
      "learning_rate": 1.3860256808630428e-05,
      "loss": 11.8168,
      "step": 381
    },
    {
      "epoch": 1.0862800565770863,
      "grad_norm": 0.1034870445728302,
      "learning_rate": 1.3202379370768252e-05,
      "loss": 11.8184,
      "step": 384
    },
    {
      "epoch": 1.0947666195190948,
      "grad_norm": 0.19289083778858185,
      "learning_rate": 1.2558115014363592e-05,
      "loss": 11.8211,
      "step": 387
    },
    {
      "epoch": 1.1032531824611032,
      "grad_norm": 0.15419146418571472,
      "learning_rate": 1.1927702081543279e-05,
      "loss": 11.8142,
      "step": 390
    },
    {
      "epoch": 1.1117397454031117,
      "grad_norm": 0.15567056834697723,
      "learning_rate": 1.1311373790174657e-05,
      "loss": 11.8155,
      "step": 393
    },
    {
      "epoch": 1.1202263083451203,
      "grad_norm": 0.13290712237358093,
      "learning_rate": 1.0709358147587884e-05,
      "loss": 11.8208,
      "step": 396
    },
    {
      "epoch": 1.1287128712871288,
      "grad_norm": 0.09749293327331543,
      "learning_rate": 1.0121877866225781e-05,
      "loss": 11.8177,
      "step": 399
    },
    {
      "epoch": 1.1371994342291372,
      "grad_norm": 0.120842345058918,
      "learning_rate": 9.549150281252633e-06,
      "loss": 11.8156,
      "step": 402
    },
    {
      "epoch": 1.1456859971711457,
      "grad_norm": 0.09248703718185425,
      "learning_rate": 8.991387270152201e-06,
      "loss": 11.8186,
      "step": 405
    },
    {
      "epoch": 1.154172560113154,
      "grad_norm": 0.12557213008403778,
      "learning_rate": 8.448795174344804e-06,
      "loss": 11.8199,
      "step": 408
    },
    {
      "epoch": 1.1626591230551626,
      "grad_norm": 0.17817071080207825,
      "learning_rate": 7.921574722852343e-06,
      "loss": 11.8154,
      "step": 411
    },
    {
      "epoch": 1.1711456859971712,
      "grad_norm": 0.10258757323026657,
      "learning_rate": 7.409920958039795e-06,
      "loss": 11.8124,
      "step": 414
    },
    {
      "epoch": 1.1796322489391797,
      "grad_norm": 0.16313178837299347,
      "learning_rate": 6.9140231634602485e-06,
      "loss": 11.815,
      "step": 417
    },
    {
      "epoch": 1.188118811881188,
      "grad_norm": 0.10050709545612335,
      "learning_rate": 6.43406479383053e-06,
      "loss": 11.8149,
      "step": 420
    },
    {
      "epoch": 1.188118811881188,
      "eval_loss": 11.817734718322754,
      "eval_runtime": 6.4421,
      "eval_samples_per_second": 92.516,
      "eval_steps_per_second": 11.642,
      "step": 420
    },
    {
      "epoch": 1.1966053748231966,
      "grad_norm": 0.09405164420604706,
      "learning_rate": 5.9702234071631e-06,
      "loss": 11.8103,
      "step": 423
    },
    {
      "epoch": 1.2050919377652052,
      "grad_norm": 0.07892228662967682,
      "learning_rate": 5.5226705990794155e-06,
      "loss": 11.8166,
      "step": 426
    },
    {
      "epoch": 1.2135785007072135,
      "grad_norm": 0.09944994747638702,
      "learning_rate": 5.091571939329048e-06,
      "loss": 11.8157,
      "step": 429
    },
    {
      "epoch": 1.222065063649222,
      "grad_norm": 0.09870926290750504,
      "learning_rate": 4.677086910538092e-06,
      "loss": 11.8141,
      "step": 432
    },
    {
      "epoch": 1.2305516265912306,
      "grad_norm": 0.16009728610515594,
      "learning_rate": 4.279368849209381e-06,
      "loss": 11.8126,
      "step": 435
    },
    {
      "epoch": 1.239038189533239,
      "grad_norm": 0.10148247331380844,
      "learning_rate": 3.898564888996476e-06,
      "loss": 11.8173,
      "step": 438
    },
    {
      "epoch": 1.2475247524752475,
      "grad_norm": 0.057802699506282806,
      "learning_rate": 3.534815906272404e-06,
      "loss": 11.8136,
      "step": 441
    },
    {
      "epoch": 1.256011315417256,
      "grad_norm": 0.059853699058294296,
      "learning_rate": 3.18825646801314e-06,
      "loss": 11.8178,
      "step": 444
    },
    {
      "epoch": 1.2644978783592644,
      "grad_norm": 0.11142271012067795,
      "learning_rate": 2.8590147820153513e-06,
      "loss": 11.8121,
      "step": 447
    },
    {
      "epoch": 1.272984441301273,
      "grad_norm": 0.12580835819244385,
      "learning_rate": 2.547212649466568e-06,
      "loss": 11.8092,
      "step": 450
    },
    {
      "epoch": 1.2814710042432815,
      "grad_norm": 0.09984395653009415,
      "learning_rate": 2.2529654198854835e-06,
      "loss": 11.8177,
      "step": 453
    },
    {
      "epoch": 1.28995756718529,
      "grad_norm": 0.06916932761669159,
      "learning_rate": 1.9763819484490355e-06,
      "loss": 11.8155,
      "step": 456
    },
    {
      "epoch": 1.2984441301272984,
      "grad_norm": 0.15921108424663544,
      "learning_rate": 1.7175645557220566e-06,
      "loss": 11.813,
      "step": 459
    },
    {
      "epoch": 1.306930693069307,
      "grad_norm": 0.08516174554824829,
      "learning_rate": 1.4766089898042678e-06,
      "loss": 11.81,
      "step": 462
    },
    {
      "epoch": 1.306930693069307,
      "eval_loss": 11.817733764648438,
      "eval_runtime": 6.4426,
      "eval_samples_per_second": 92.509,
      "eval_steps_per_second": 11.641,
      "step": 462
    },
    {
      "epoch": 1.3154172560113153,
      "grad_norm": 0.09070340543985367,
      "learning_rate": 1.2536043909088191e-06,
      "loss": 11.8145,
      "step": 465
    },
    {
      "epoch": 1.3239038189533239,
      "grad_norm": 0.0519736111164093,
      "learning_rate": 1.0486332583853563e-06,
      "loss": 11.8139,
      "step": 468
    },
    {
      "epoch": 1.3323903818953324,
      "grad_norm": 0.09727457165718079,
      "learning_rate": 8.617714201998084e-07,
      "loss": 11.8145,
      "step": 471
    },
    {
      "epoch": 1.340876944837341,
      "grad_norm": 0.0795770063996315,
      "learning_rate": 6.93088004882253e-07,
      "loss": 11.8154,
      "step": 474
    },
    {
      "epoch": 1.3493635077793493,
      "grad_norm": 0.09474173188209534,
      "learning_rate": 5.426454159531913e-07,
      "loss": 11.8169,
      "step": 477
    },
    {
      "epoch": 1.3578500707213579,
      "grad_norm": 0.13217829167842865,
      "learning_rate": 4.104993088376974e-07,
      "loss": 11.8165,
      "step": 480
    },
    {
      "epoch": 1.3663366336633662,
      "grad_norm": 0.11465727537870407,
      "learning_rate": 2.966985702759828e-07,
      "loss": 11.8173,
      "step": 483
    },
    {
      "epoch": 1.3748231966053748,
      "grad_norm": 0.12922833859920502,
      "learning_rate": 2.012853002380466e-07,
      "loss": 11.8154,
      "step": 486
    },
    {
      "epoch": 1.3833097595473833,
      "grad_norm": 0.05712132155895233,
      "learning_rate": 1.2429479634897267e-07,
      "loss": 11.814,
      "step": 489
    },
    {
      "epoch": 1.391796322489392,
      "grad_norm": 0.09231238812208176,
      "learning_rate": 6.575554083078084e-08,
      "loss": 11.8135,
      "step": 492
    },
    {
      "epoch": 1.4002828854314002,
      "grad_norm": 0.08613143861293793,
      "learning_rate": 2.568918996560532e-08,
      "loss": 11.819,
      "step": 495
    },
    {
      "epoch": 1.4087694483734088,
      "grad_norm": 0.11459596455097198,
      "learning_rate": 4.110566084036816e-09,
      "loss": 11.8119,
      "step": 498
    }
  ],
  "logging_steps": 3,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 42,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3067553218560.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}