{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4144271570014144, "eval_steps": 42, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002828854314002829, "eval_loss": 11.916790962219238, "eval_runtime": 6.6784, "eval_samples_per_second": 89.243, "eval_steps_per_second": 11.23, "step": 1 }, { "epoch": 0.008486562942008486, "grad_norm": 0.5188692212104797, "learning_rate": 3e-05, "loss": 11.92, "step": 3 }, { "epoch": 0.016973125884016973, "grad_norm": 0.48136115074157715, "learning_rate": 6e-05, "loss": 11.9198, "step": 6 }, { "epoch": 0.02545968882602546, "grad_norm": 0.5018041729927063, "learning_rate": 9e-05, "loss": 11.9077, "step": 9 }, { "epoch": 0.033946251768033946, "grad_norm": 0.6158789992332458, "learning_rate": 9.999588943391597e-05, "loss": 11.9074, "step": 12 }, { "epoch": 0.042432814710042434, "grad_norm": 0.6823853850364685, "learning_rate": 9.99743108100344e-05, "loss": 11.9042, "step": 15 }, { "epoch": 0.05091937765205092, "grad_norm": 0.8695465922355652, "learning_rate": 9.993424445916923e-05, "loss": 11.8991, "step": 18 }, { "epoch": 0.0594059405940594, "grad_norm": 0.9028195142745972, "learning_rate": 9.987570520365104e-05, "loss": 11.8853, "step": 21 }, { "epoch": 0.06789250353606789, "grad_norm": 0.9360626935958862, "learning_rate": 9.979871469976196e-05, "loss": 11.8781, "step": 24 }, { "epoch": 0.07637906647807638, "grad_norm": 0.8196120858192444, "learning_rate": 9.970330142972401e-05, "loss": 11.8677, "step": 27 }, { "epoch": 0.08486562942008487, "grad_norm": 0.7162870168685913, "learning_rate": 9.95895006911623e-05, "loss": 11.8614, "step": 30 }, { "epoch": 0.09335219236209336, "grad_norm": 0.5132609605789185, "learning_rate": 9.945735458404681e-05, "loss": 11.8498, "step": 33 }, { "epoch": 0.10183875530410184, "grad_norm": 0.4584226608276367, "learning_rate": 9.930691199511775e-05, "loss": 11.8467, "step": 36 }, { "epoch": 0.11032531824611033, "grad_norm": 0.29786884784698486, "learning_rate": 9.91382285798002e-05, "loss": 11.8511, "step": 39 }, { "epoch": 0.1188118811881188, "grad_norm": 0.2889010012149811, "learning_rate": 9.895136674161465e-05, "loss": 11.8408, "step": 42 }, { "epoch": 0.1188118811881188, "eval_loss": 11.839421272277832, "eval_runtime": 6.332, "eval_samples_per_second": 94.125, "eval_steps_per_second": 11.845, "step": 42 }, { "epoch": 0.1272984441301273, "grad_norm": 0.20692946016788483, "learning_rate": 9.874639560909117e-05, "loss": 11.8396, "step": 45 }, { "epoch": 0.13578500707213578, "grad_norm": 0.23685254156589508, "learning_rate": 9.852339101019574e-05, "loss": 11.8316, "step": 48 }, { "epoch": 0.14427157001414428, "grad_norm": 0.2432631552219391, "learning_rate": 9.828243544427796e-05, "loss": 11.8341, "step": 51 }, { "epoch": 0.15275813295615276, "grad_norm": 0.11745542287826538, "learning_rate": 9.802361805155097e-05, "loss": 11.8301, "step": 54 }, { "epoch": 0.16124469589816123, "grad_norm": 0.134097158908844, "learning_rate": 9.774703458011453e-05, "loss": 11.8359, "step": 57 }, { "epoch": 0.16973125884016974, "grad_norm": 0.19680051505565643, "learning_rate": 9.745278735053343e-05, "loss": 11.8378, "step": 60 }, { "epoch": 0.1782178217821782, "grad_norm": 0.10566498339176178, "learning_rate": 9.714098521798465e-05, "loss": 11.832, "step": 63 }, { "epoch": 0.1867043847241867, "grad_norm": 0.1530551165342331, "learning_rate": 9.681174353198687e-05, "loss": 11.8363, "step": 66 }, { "epoch": 0.19519094766619519, "grad_norm": 0.1929464042186737, "learning_rate": 9.64651840937276e-05, "loss": 11.8284, "step": 69 }, { "epoch": 0.2036775106082037, "grad_norm": 0.17411480844020844, "learning_rate": 9.610143511100354e-05, "loss": 11.8314, "step": 72 }, { "epoch": 0.21216407355021216, "grad_norm": 0.14971987903118134, "learning_rate": 9.572063115079063e-05, "loss": 11.832, "step": 75 }, { "epoch": 0.22065063649222066, "grad_norm": 0.18370923399925232, "learning_rate": 9.53229130894619e-05, "loss": 11.8275, "step": 78 }, { "epoch": 0.22913719943422914, "grad_norm": 0.26103201508522034, "learning_rate": 9.490842806067095e-05, "loss": 11.8278, "step": 81 }, { "epoch": 0.2376237623762376, "grad_norm": 0.21483545005321503, "learning_rate": 9.44773294009206e-05, "loss": 11.825, "step": 84 }, { "epoch": 0.2376237623762376, "eval_loss": 11.825268745422363, "eval_runtime": 6.402, "eval_samples_per_second": 93.097, "eval_steps_per_second": 11.715, "step": 84 }, { "epoch": 0.24611032531824611, "grad_norm": 0.28861185908317566, "learning_rate": 9.40297765928369e-05, "loss": 11.8219, "step": 87 }, { "epoch": 0.2545968882602546, "grad_norm": 0.13039568066596985, "learning_rate": 9.356593520616948e-05, "loss": 11.8245, "step": 90 }, { "epoch": 0.26308345120226306, "grad_norm": 0.1711033284664154, "learning_rate": 9.308597683653975e-05, "loss": 11.8246, "step": 93 }, { "epoch": 0.27157001414427157, "grad_norm": 0.2108013778924942, "learning_rate": 9.259007904196023e-05, "loss": 11.8228, "step": 96 }, { "epoch": 0.28005657708628007, "grad_norm": 0.1204076036810875, "learning_rate": 9.207842527714767e-05, "loss": 11.82, "step": 99 }, { "epoch": 0.28854314002828857, "grad_norm": 0.1477670669555664, "learning_rate": 9.155120482565521e-05, "loss": 11.8189, "step": 102 }, { "epoch": 0.297029702970297, "grad_norm": 0.10705884546041489, "learning_rate": 9.10086127298478e-05, "loss": 11.8169, "step": 105 }, { "epoch": 0.3055162659123055, "grad_norm": 0.29956066608428955, "learning_rate": 9.045084971874738e-05, "loss": 11.8205, "step": 108 }, { "epoch": 0.314002828854314, "grad_norm": 0.13027921319007874, "learning_rate": 8.987812213377424e-05, "loss": 11.8168, "step": 111 }, { "epoch": 0.32248939179632247, "grad_norm": 0.13590934872627258, "learning_rate": 8.929064185241213e-05, "loss": 11.819, "step": 114 }, { "epoch": 0.33097595473833097, "grad_norm": 0.09333682060241699, "learning_rate": 8.868862620982534e-05, "loss": 11.8267, "step": 117 }, { "epoch": 0.33946251768033947, "grad_norm": 0.12400602549314499, "learning_rate": 8.807229791845673e-05, "loss": 11.818, "step": 120 }, { "epoch": 0.347949080622348, "grad_norm": 0.12071343511343002, "learning_rate": 8.744188498563641e-05, "loss": 11.8166, "step": 123 }, { "epoch": 0.3564356435643564, "grad_norm": 0.1693616658449173, "learning_rate": 8.679762062923175e-05, "loss": 11.8183, "step": 126 }, { "epoch": 0.3564356435643564, "eval_loss": 11.819117546081543, "eval_runtime": 6.7651, "eval_samples_per_second": 88.1, "eval_steps_per_second": 11.086, "step": 126 }, { "epoch": 0.3649222065063649, "grad_norm": 0.17696824669837952, "learning_rate": 8.613974319136958e-05, "loss": 11.8171, "step": 129 }, { "epoch": 0.3734087694483734, "grad_norm": 0.11144156008958817, "learning_rate": 8.54684960502629e-05, "loss": 11.815, "step": 132 }, { "epoch": 0.38189533239038187, "grad_norm": 0.16119325160980225, "learning_rate": 8.478412753017433e-05, "loss": 11.8134, "step": 135 }, { "epoch": 0.39038189533239037, "grad_norm": 0.10234789550304413, "learning_rate": 8.408689080954998e-05, "loss": 11.8138, "step": 138 }, { "epoch": 0.3988684582743989, "grad_norm": 0.12111084908246994, "learning_rate": 8.33770438273574e-05, "loss": 11.8152, "step": 141 }, { "epoch": 0.4073550212164074, "grad_norm": 0.07516127824783325, "learning_rate": 8.265484918766243e-05, "loss": 11.8128, "step": 144 }, { "epoch": 0.4158415841584158, "grad_norm": 0.23707596957683563, "learning_rate": 8.192057406248028e-05, "loss": 11.8202, "step": 147 }, { "epoch": 0.4243281471004243, "grad_norm": 0.1228451356291771, "learning_rate": 8.117449009293668e-05, "loss": 11.8201, "step": 150 }, { "epoch": 0.4328147100424328, "grad_norm": 0.16358840465545654, "learning_rate": 8.041687328877567e-05, "loss": 11.8141, "step": 153 }, { "epoch": 0.44130127298444133, "grad_norm": 0.10192089527845383, "learning_rate": 7.964800392625129e-05, "loss": 11.8128, "step": 156 }, { "epoch": 0.4497878359264498, "grad_norm": 0.07679455727338791, "learning_rate": 7.886816644444098e-05, "loss": 11.8124, "step": 159 }, { "epoch": 0.4582743988684583, "grad_norm": 0.10075189918279648, "learning_rate": 7.807764934001874e-05, "loss": 11.8119, "step": 162 }, { "epoch": 0.4667609618104668, "grad_norm": 0.1872919499874115, "learning_rate": 7.727674506052743e-05, "loss": 11.8203, "step": 165 }, { "epoch": 0.4752475247524752, "grad_norm": 0.12166598439216614, "learning_rate": 7.646574989618938e-05, "loss": 11.8202, "step": 168 }, { "epoch": 0.4752475247524752, "eval_loss": 11.818514823913574, "eval_runtime": 6.4158, "eval_samples_per_second": 92.895, "eval_steps_per_second": 11.69, "step": 168 }, { "epoch": 0.4837340876944837, "grad_norm": 0.13749000430107117, "learning_rate": 7.564496387029532e-05, "loss": 11.8156, "step": 171 }, { "epoch": 0.49222065063649223, "grad_norm": 0.07802052795886993, "learning_rate": 7.481469062821252e-05, "loss": 11.8182, "step": 174 }, { "epoch": 0.5007072135785007, "grad_norm": 0.150814950466156, "learning_rate": 7.39752373250527e-05, "loss": 11.8179, "step": 177 }, { "epoch": 0.5091937765205092, "grad_norm": 0.1514790952205658, "learning_rate": 7.312691451204178e-05, "loss": 11.8099, "step": 180 }, { "epoch": 0.5176803394625177, "grad_norm": 0.13362684845924377, "learning_rate": 7.227003602163295e-05, "loss": 11.8172, "step": 183 }, { "epoch": 0.5261669024045261, "grad_norm": 0.09337490051984787, "learning_rate": 7.14049188514063e-05, "loss": 11.8184, "step": 186 }, { "epoch": 0.5346534653465347, "grad_norm": 0.08015663921833038, "learning_rate": 7.05318830467969e-05, "loss": 11.8158, "step": 189 }, { "epoch": 0.5431400282885431, "grad_norm": 0.16405069828033447, "learning_rate": 6.965125158269619e-05, "loss": 11.816, "step": 192 }, { "epoch": 0.5516265912305516, "grad_norm": 0.14057497680187225, "learning_rate": 6.876335024396872e-05, "loss": 11.8147, "step": 195 }, { "epoch": 0.5601131541725601, "grad_norm": 0.1409187614917755, "learning_rate": 6.786850750493006e-05, "loss": 11.8157, "step": 198 }, { "epoch": 0.5685997171145686, "grad_norm": 0.1987845003604889, "learning_rate": 6.696705440782938e-05, "loss": 11.8185, "step": 201 }, { "epoch": 0.5770862800565771, "grad_norm": 0.10339465737342834, "learning_rate": 6.605932444038229e-05, "loss": 11.815, "step": 204 }, { "epoch": 0.5855728429985856, "grad_norm": 0.12926329672336578, "learning_rate": 6.514565341239861e-05, "loss": 11.818, "step": 207 }, { "epoch": 0.594059405940594, "grad_norm": 0.07949727028608322, "learning_rate": 6.422637933155162e-05, "loss": 11.8151, "step": 210 }, { "epoch": 0.594059405940594, "eval_loss": 11.818094253540039, "eval_runtime": 6.6444, "eval_samples_per_second": 89.7, "eval_steps_per_second": 11.288, "step": 210 }, { "epoch": 0.6025459688826026, "grad_norm": 0.18179267644882202, "learning_rate": 6.330184227833376e-05, "loss": 11.8178, "step": 213 }, { "epoch": 0.611032531824611, "grad_norm": 0.12221992760896683, "learning_rate": 6.237238428024572e-05, "loss": 11.8145, "step": 216 }, { "epoch": 0.6195190947666195, "grad_norm": 0.07228324562311172, "learning_rate": 6.143834918526527e-05, "loss": 11.8137, "step": 219 }, { "epoch": 0.628005657708628, "grad_norm": 0.11851081997156143, "learning_rate": 6.0500082534642464e-05, "loss": 11.8156, "step": 222 }, { "epoch": 0.6364922206506365, "grad_norm": 0.08459550887346268, "learning_rate": 5.955793143506863e-05, "loss": 11.8108, "step": 225 }, { "epoch": 0.6449787835926449, "grad_norm": 0.07968215644359589, "learning_rate": 5.861224443026595e-05, "loss": 11.8181, "step": 228 }, { "epoch": 0.6534653465346535, "grad_norm": 0.1503295600414276, "learning_rate": 5.766337137204579e-05, "loss": 11.8167, "step": 231 }, { "epoch": 0.6619519094766619, "grad_norm": 0.08652139455080032, "learning_rate": 5.6711663290882776e-05, "loss": 11.8122, "step": 234 }, { "epoch": 0.6704384724186704, "grad_norm": 0.08518210798501968, "learning_rate": 5.575747226605298e-05, "loss": 11.8164, "step": 237 }, { "epoch": 0.6789250353606789, "grad_norm": 0.12115988880395889, "learning_rate": 5.480115129538409e-05, "loss": 11.8114, "step": 240 }, { "epoch": 0.6874115983026874, "grad_norm": 0.13773323595523834, "learning_rate": 5.384305416466584e-05, "loss": 11.815, "step": 243 }, { "epoch": 0.695898161244696, "grad_norm": 0.10262436419725418, "learning_rate": 5.288353531676873e-05, "loss": 11.8163, "step": 246 }, { "epoch": 0.7043847241867044, "grad_norm": 0.12103616446256638, "learning_rate": 5.192294972051992e-05, "loss": 11.8176, "step": 249 }, { "epoch": 0.7128712871287128, "grad_norm": 0.09017164260149002, "learning_rate": 5.0961652739384356e-05, "loss": 11.8221, "step": 252 }, { "epoch": 0.7128712871287128, "eval_loss": 11.818002700805664, "eval_runtime": 6.3716, "eval_samples_per_second": 93.539, "eval_steps_per_second": 11.771, "step": 252 }, { "epoch": 0.7213578500707214, "grad_norm": 0.050791915506124496, "learning_rate": 5e-05, "loss": 11.8143, "step": 255 }, { "epoch": 0.7298444130127298, "grad_norm": 0.1059621125459671, "learning_rate": 4.903834726061565e-05, "loss": 11.8175, "step": 258 }, { "epoch": 0.7383309759547383, "grad_norm": 0.07910209894180298, "learning_rate": 4.807705027948008e-05, "loss": 11.8104, "step": 261 }, { "epoch": 0.7468175388967468, "grad_norm": 0.11326657980680466, "learning_rate": 4.711646468323129e-05, "loss": 11.8178, "step": 264 }, { "epoch": 0.7553041018387553, "grad_norm": 0.15818099677562714, "learning_rate": 4.6156945835334184e-05, "loss": 11.8152, "step": 267 }, { "epoch": 0.7637906647807637, "grad_norm": 0.08803381770849228, "learning_rate": 4.5198848704615914e-05, "loss": 11.8165, "step": 270 }, { "epoch": 0.7722772277227723, "grad_norm": 0.13214413821697235, "learning_rate": 4.424252773394704e-05, "loss": 11.8183, "step": 273 }, { "epoch": 0.7807637906647807, "grad_norm": 0.0714588537812233, "learning_rate": 4.328833670911724e-05, "loss": 11.8207, "step": 276 }, { "epoch": 0.7892503536067893, "grad_norm": 0.3612169325351715, "learning_rate": 4.23366286279542e-05, "loss": 11.81, "step": 279 }, { "epoch": 0.7977369165487977, "grad_norm": 0.10489798337221146, "learning_rate": 4.138775556973406e-05, "loss": 11.8155, "step": 282 }, { "epoch": 0.8062234794908062, "grad_norm": 0.07869977504014969, "learning_rate": 4.04420685649314e-05, "loss": 11.8183, "step": 285 }, { "epoch": 0.8147100424328148, "grad_norm": 0.1785990297794342, "learning_rate": 3.9499917465357534e-05, "loss": 11.8151, "step": 288 }, { "epoch": 0.8231966053748232, "grad_norm": 0.08096058666706085, "learning_rate": 3.856165081473474e-05, "loss": 11.8138, "step": 291 }, { "epoch": 0.8316831683168316, "grad_norm": 0.11269628256559372, "learning_rate": 3.762761571975429e-05, "loss": 11.8173, "step": 294 }, { "epoch": 0.8316831683168316, "eval_loss": 11.817937850952148, "eval_runtime": 6.604, "eval_samples_per_second": 90.248, "eval_steps_per_second": 11.357, "step": 294 }, { "epoch": 0.8401697312588402, "grad_norm": 0.11685353517532349, "learning_rate": 3.6698157721666246e-05, "loss": 11.8211, "step": 297 }, { "epoch": 0.8486562942008486, "grad_norm": 0.10467521101236343, "learning_rate": 3.5773620668448384e-05, "loss": 11.8131, "step": 300 }, { "epoch": 0.8571428571428571, "grad_norm": 0.08696059882640839, "learning_rate": 3.48543465876014e-05, "loss": 11.8151, "step": 303 }, { "epoch": 0.8656294200848657, "grad_norm": 0.21422286331653595, "learning_rate": 3.3940675559617724e-05, "loss": 11.817, "step": 306 }, { "epoch": 0.8741159830268741, "grad_norm": 0.09098684042692184, "learning_rate": 3.303294559217063e-05, "loss": 11.8177, "step": 309 }, { "epoch": 0.8826025459688827, "grad_norm": 0.11003749072551727, "learning_rate": 3.213149249506997e-05, "loss": 11.8127, "step": 312 }, { "epoch": 0.8910891089108911, "grad_norm": 0.10718018561601639, "learning_rate": 3.12366497560313e-05, "loss": 11.8152, "step": 315 }, { "epoch": 0.8995756718528995, "grad_norm": 0.058545950800180435, "learning_rate": 3.0348748417303823e-05, "loss": 11.8117, "step": 318 }, { "epoch": 0.9080622347949081, "grad_norm": 0.15917275846004486, "learning_rate": 2.9468116953203107e-05, "loss": 11.816, "step": 321 }, { "epoch": 0.9165487977369166, "grad_norm": 0.08903782814741135, "learning_rate": 2.8595081148593738e-05, "loss": 11.8119, "step": 324 }, { "epoch": 0.925035360678925, "grad_norm": 0.10488823056221008, "learning_rate": 2.772996397836704e-05, "loss": 11.8185, "step": 327 }, { "epoch": 0.9335219236209336, "grad_norm": 0.09897799789905548, "learning_rate": 2.687308548795825e-05, "loss": 11.8194, "step": 330 }, { "epoch": 0.942008486562942, "grad_norm": 0.10232014954090118, "learning_rate": 2.6024762674947313e-05, "loss": 11.8139, "step": 333 }, { "epoch": 0.9504950495049505, "grad_norm": 0.07098989188671112, "learning_rate": 2.5185309371787513e-05, "loss": 11.8159, "step": 336 }, { "epoch": 0.9504950495049505, "eval_loss": 11.817768096923828, "eval_runtime": 6.6321, "eval_samples_per_second": 89.866, "eval_steps_per_second": 11.309, "step": 336 }, { "epoch": 0.958981612446959, "grad_norm": 0.1141035333275795, "learning_rate": 2.43550361297047e-05, "loss": 11.8135, "step": 339 }, { "epoch": 0.9674681753889675, "grad_norm": 0.15841831266880035, "learning_rate": 2.353425010381063e-05, "loss": 11.8191, "step": 342 }, { "epoch": 0.9759547383309759, "grad_norm": 0.1034071147441864, "learning_rate": 2.272325493947257e-05, "loss": 11.8196, "step": 345 }, { "epoch": 0.9844413012729845, "grad_norm": 0.1345403790473938, "learning_rate": 2.192235065998126e-05, "loss": 11.8162, "step": 348 }, { "epoch": 0.9929278642149929, "grad_norm": 0.06921133399009705, "learning_rate": 2.1131833555559037e-05, "loss": 11.8178, "step": 351 }, { "epoch": 1.0014144271570014, "grad_norm": 0.22766686975955963, "learning_rate": 2.0351996073748713e-05, "loss": 13.7864, "step": 354 }, { "epoch": 1.00990099009901, "grad_norm": 0.15477143228054047, "learning_rate": 1.9583126711224343e-05, "loss": 11.817, "step": 357 }, { "epoch": 1.0183875530410185, "grad_norm": 0.23170539736747742, "learning_rate": 1.8825509907063327e-05, "loss": 11.8147, "step": 360 }, { "epoch": 1.0268741159830268, "grad_norm": 0.06929183006286621, "learning_rate": 1.807942593751973e-05, "loss": 11.8198, "step": 363 }, { "epoch": 1.0353606789250354, "grad_norm": 0.15376947820186615, "learning_rate": 1.7345150812337564e-05, "loss": 11.8151, "step": 366 }, { "epoch": 1.043847241867044, "grad_norm": 0.2093440443277359, "learning_rate": 1.66229561726426e-05, "loss": 11.8205, "step": 369 }, { "epoch": 1.0523338048090523, "grad_norm": 0.1123834103345871, "learning_rate": 1.5913109190450032e-05, "loss": 11.8159, "step": 372 }, { "epoch": 1.0608203677510608, "grad_norm": 0.22159625589847565, "learning_rate": 1.5215872469825682e-05, "loss": 11.8144, "step": 375 }, { "epoch": 1.0693069306930694, "grad_norm": 0.12495719641447067, "learning_rate": 1.4531503949737108e-05, "loss": 11.8175, "step": 378 }, { "epoch": 1.0693069306930694, "eval_loss": 11.817733764648438, "eval_runtime": 6.6332, "eval_samples_per_second": 89.851, "eval_steps_per_second": 11.307, "step": 378 }, { "epoch": 1.0777934936350777, "grad_norm": 0.12858428061008453, "learning_rate": 1.3860256808630428e-05, "loss": 11.8168, "step": 381 }, { "epoch": 1.0862800565770863, "grad_norm": 0.1034870445728302, "learning_rate": 1.3202379370768252e-05, "loss": 11.8184, "step": 384 }, { "epoch": 1.0947666195190948, "grad_norm": 0.19289083778858185, "learning_rate": 1.2558115014363592e-05, "loss": 11.8211, "step": 387 }, { "epoch": 1.1032531824611032, "grad_norm": 0.15419146418571472, "learning_rate": 1.1927702081543279e-05, "loss": 11.8142, "step": 390 }, { "epoch": 1.1117397454031117, "grad_norm": 0.15567056834697723, "learning_rate": 1.1311373790174657e-05, "loss": 11.8155, "step": 393 }, { "epoch": 1.1202263083451203, "grad_norm": 0.13290712237358093, "learning_rate": 1.0709358147587884e-05, "loss": 11.8208, "step": 396 }, { "epoch": 1.1287128712871288, "grad_norm": 0.09749293327331543, "learning_rate": 1.0121877866225781e-05, "loss": 11.8177, "step": 399 }, { "epoch": 1.1371994342291372, "grad_norm": 0.120842345058918, "learning_rate": 9.549150281252633e-06, "loss": 11.8156, "step": 402 }, { "epoch": 1.1456859971711457, "grad_norm": 0.09248703718185425, "learning_rate": 8.991387270152201e-06, "loss": 11.8186, "step": 405 }, { "epoch": 1.154172560113154, "grad_norm": 0.12557213008403778, "learning_rate": 8.448795174344804e-06, "loss": 11.8199, "step": 408 }, { "epoch": 1.1626591230551626, "grad_norm": 0.17817071080207825, "learning_rate": 7.921574722852343e-06, "loss": 11.8154, "step": 411 }, { "epoch": 1.1711456859971712, "grad_norm": 0.10258757323026657, "learning_rate": 7.409920958039795e-06, "loss": 11.8124, "step": 414 }, { "epoch": 1.1796322489391797, "grad_norm": 0.16313178837299347, "learning_rate": 6.9140231634602485e-06, "loss": 11.815, "step": 417 }, { "epoch": 1.188118811881188, "grad_norm": 0.10050709545612335, "learning_rate": 6.43406479383053e-06, "loss": 11.8149, "step": 420 }, { "epoch": 1.188118811881188, "eval_loss": 11.817734718322754, "eval_runtime": 6.4421, "eval_samples_per_second": 92.516, "eval_steps_per_second": 11.642, "step": 420 }, { "epoch": 1.1966053748231966, "grad_norm": 0.09405164420604706, "learning_rate": 5.9702234071631e-06, "loss": 11.8103, "step": 423 }, { "epoch": 1.2050919377652052, "grad_norm": 0.07892228662967682, "learning_rate": 5.5226705990794155e-06, "loss": 11.8166, "step": 426 }, { "epoch": 1.2135785007072135, "grad_norm": 0.09944994747638702, "learning_rate": 5.091571939329048e-06, "loss": 11.8157, "step": 429 }, { "epoch": 1.222065063649222, "grad_norm": 0.09870926290750504, "learning_rate": 4.677086910538092e-06, "loss": 11.8141, "step": 432 }, { "epoch": 1.2305516265912306, "grad_norm": 0.16009728610515594, "learning_rate": 4.279368849209381e-06, "loss": 11.8126, "step": 435 }, { "epoch": 1.239038189533239, "grad_norm": 0.10148247331380844, "learning_rate": 3.898564888996476e-06, "loss": 11.8173, "step": 438 }, { "epoch": 1.2475247524752475, "grad_norm": 0.057802699506282806, "learning_rate": 3.534815906272404e-06, "loss": 11.8136, "step": 441 }, { "epoch": 1.256011315417256, "grad_norm": 0.059853699058294296, "learning_rate": 3.18825646801314e-06, "loss": 11.8178, "step": 444 }, { "epoch": 1.2644978783592644, "grad_norm": 0.11142271012067795, "learning_rate": 2.8590147820153513e-06, "loss": 11.8121, "step": 447 }, { "epoch": 1.272984441301273, "grad_norm": 0.12580835819244385, "learning_rate": 2.547212649466568e-06, "loss": 11.8092, "step": 450 }, { "epoch": 1.2814710042432815, "grad_norm": 0.09984395653009415, "learning_rate": 2.2529654198854835e-06, "loss": 11.8177, "step": 453 }, { "epoch": 1.28995756718529, "grad_norm": 0.06916932761669159, "learning_rate": 1.9763819484490355e-06, "loss": 11.8155, "step": 456 }, { "epoch": 1.2984441301272984, "grad_norm": 0.15921108424663544, "learning_rate": 1.7175645557220566e-06, "loss": 11.813, "step": 459 }, { "epoch": 1.306930693069307, "grad_norm": 0.08516174554824829, "learning_rate": 1.4766089898042678e-06, "loss": 11.81, "step": 462 }, { "epoch": 1.306930693069307, "eval_loss": 11.817733764648438, "eval_runtime": 6.4426, "eval_samples_per_second": 92.509, "eval_steps_per_second": 11.641, "step": 462 }, { "epoch": 1.3154172560113153, "grad_norm": 0.09070340543985367, "learning_rate": 1.2536043909088191e-06, "loss": 11.8145, "step": 465 }, { "epoch": 1.3239038189533239, "grad_norm": 0.0519736111164093, "learning_rate": 1.0486332583853563e-06, "loss": 11.8139, "step": 468 }, { "epoch": 1.3323903818953324, "grad_norm": 0.09727457165718079, "learning_rate": 8.617714201998084e-07, "loss": 11.8145, "step": 471 }, { "epoch": 1.340876944837341, "grad_norm": 0.0795770063996315, "learning_rate": 6.93088004882253e-07, "loss": 11.8154, "step": 474 }, { "epoch": 1.3493635077793493, "grad_norm": 0.09474173188209534, "learning_rate": 5.426454159531913e-07, "loss": 11.8169, "step": 477 }, { "epoch": 1.3578500707213579, "grad_norm": 0.13217829167842865, "learning_rate": 4.104993088376974e-07, "loss": 11.8165, "step": 480 }, { "epoch": 1.3663366336633662, "grad_norm": 0.11465727537870407, "learning_rate": 2.966985702759828e-07, "loss": 11.8173, "step": 483 }, { "epoch": 1.3748231966053748, "grad_norm": 0.12922833859920502, "learning_rate": 2.012853002380466e-07, "loss": 11.8154, "step": 486 }, { "epoch": 1.3833097595473833, "grad_norm": 0.05712132155895233, "learning_rate": 1.2429479634897267e-07, "loss": 11.814, "step": 489 }, { "epoch": 1.391796322489392, "grad_norm": 0.09231238812208176, "learning_rate": 6.575554083078084e-08, "loss": 11.8135, "step": 492 }, { "epoch": 1.4002828854314002, "grad_norm": 0.08613143861293793, "learning_rate": 2.568918996560532e-08, "loss": 11.819, "step": 495 }, { "epoch": 1.4087694483734088, "grad_norm": 0.11459596455097198, "learning_rate": 4.110566084036816e-09, "loss": 11.8119, "step": 498 } ], "logging_steps": 3, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 42, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3067553218560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }