|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 2408, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004152823920265781, |
|
"grad_norm": 25.422981813437236, |
|
"learning_rate": 4.1493775933609963e-08, |
|
"loss": 1.3975, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0020764119601328905, |
|
"grad_norm": 23.65282908395334, |
|
"learning_rate": 2.074688796680498e-07, |
|
"loss": 1.4281, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.004152823920265781, |
|
"grad_norm": 16.38973942245371, |
|
"learning_rate": 4.149377593360996e-07, |
|
"loss": 1.3933, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006229235880398671, |
|
"grad_norm": 8.620332321861904, |
|
"learning_rate": 6.224066390041494e-07, |
|
"loss": 1.2986, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.008305647840531562, |
|
"grad_norm": 10.289897317705874, |
|
"learning_rate": 8.298755186721992e-07, |
|
"loss": 1.1565, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010382059800664452, |
|
"grad_norm": 4.429779856244459, |
|
"learning_rate": 1.037344398340249e-06, |
|
"loss": 1.051, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.012458471760797342, |
|
"grad_norm": 3.3098208738585213, |
|
"learning_rate": 1.2448132780082988e-06, |
|
"loss": 0.9902, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014534883720930232, |
|
"grad_norm": 3.4349888460346687, |
|
"learning_rate": 1.4522821576763488e-06, |
|
"loss": 0.9652, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.016611295681063124, |
|
"grad_norm": 3.1515624301454133, |
|
"learning_rate": 1.6597510373443984e-06, |
|
"loss": 0.9415, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.018687707641196014, |
|
"grad_norm": 3.1235312209606505, |
|
"learning_rate": 1.8672199170124482e-06, |
|
"loss": 0.93, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.020764119601328904, |
|
"grad_norm": 3.1741829648141926, |
|
"learning_rate": 2.074688796680498e-06, |
|
"loss": 0.9238, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.022840531561461794, |
|
"grad_norm": 3.232116295196654, |
|
"learning_rate": 2.282157676348548e-06, |
|
"loss": 0.9123, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.024916943521594685, |
|
"grad_norm": 3.1515595029223396, |
|
"learning_rate": 2.4896265560165977e-06, |
|
"loss": 0.9031, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.026993355481727575, |
|
"grad_norm": 3.1003061370301617, |
|
"learning_rate": 2.6970954356846475e-06, |
|
"loss": 0.8947, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.029069767441860465, |
|
"grad_norm": 2.9767060692194844, |
|
"learning_rate": 2.9045643153526977e-06, |
|
"loss": 0.8919, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.031146179401993355, |
|
"grad_norm": 3.0759553041103205, |
|
"learning_rate": 3.112033195020747e-06, |
|
"loss": 0.8702, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.03322259136212625, |
|
"grad_norm": 3.285827319776166, |
|
"learning_rate": 3.319502074688797e-06, |
|
"loss": 0.8727, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03529900332225914, |
|
"grad_norm": 3.3462993523967186, |
|
"learning_rate": 3.526970954356847e-06, |
|
"loss": 0.8736, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.03737541528239203, |
|
"grad_norm": 3.256004424550593, |
|
"learning_rate": 3.7344398340248965e-06, |
|
"loss": 0.8858, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03945182724252492, |
|
"grad_norm": 3.159488005717498, |
|
"learning_rate": 3.941908713692946e-06, |
|
"loss": 0.8559, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04152823920265781, |
|
"grad_norm": 2.9634363451500114, |
|
"learning_rate": 4.149377593360996e-06, |
|
"loss": 0.8586, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0436046511627907, |
|
"grad_norm": 3.159728031235876, |
|
"learning_rate": 4.356846473029046e-06, |
|
"loss": 0.8674, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.04568106312292359, |
|
"grad_norm": 2.970452415217835, |
|
"learning_rate": 4.564315352697096e-06, |
|
"loss": 0.8542, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04775747508305648, |
|
"grad_norm": 3.1788047641427513, |
|
"learning_rate": 4.771784232365146e-06, |
|
"loss": 0.8701, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.04983388704318937, |
|
"grad_norm": 3.451301944053267, |
|
"learning_rate": 4.979253112033195e-06, |
|
"loss": 0.8714, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05191029900332226, |
|
"grad_norm": 3.388326009403783, |
|
"learning_rate": 5.1867219917012455e-06, |
|
"loss": 0.8483, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.05398671096345515, |
|
"grad_norm": 2.9959540593135645, |
|
"learning_rate": 5.394190871369295e-06, |
|
"loss": 0.8481, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05606312292358804, |
|
"grad_norm": 3.1213953476841856, |
|
"learning_rate": 5.601659751037345e-06, |
|
"loss": 0.8387, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.05813953488372093, |
|
"grad_norm": 3.403963416369247, |
|
"learning_rate": 5.809128630705395e-06, |
|
"loss": 0.8399, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06021594684385382, |
|
"grad_norm": 3.0893053330914695, |
|
"learning_rate": 6.016597510373444e-06, |
|
"loss": 0.8386, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.06229235880398671, |
|
"grad_norm": 3.166492177328262, |
|
"learning_rate": 6.224066390041494e-06, |
|
"loss": 0.8457, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0643687707641196, |
|
"grad_norm": 3.2851556967703117, |
|
"learning_rate": 6.431535269709544e-06, |
|
"loss": 0.8421, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0664451827242525, |
|
"grad_norm": 2.9899317822541454, |
|
"learning_rate": 6.639004149377594e-06, |
|
"loss": 0.8373, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06852159468438539, |
|
"grad_norm": 3.0509892785590456, |
|
"learning_rate": 6.846473029045644e-06, |
|
"loss": 0.8334, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.07059800664451828, |
|
"grad_norm": 3.00742757115455, |
|
"learning_rate": 7.053941908713694e-06, |
|
"loss": 0.8233, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07267441860465117, |
|
"grad_norm": 3.0518393701751485, |
|
"learning_rate": 7.261410788381743e-06, |
|
"loss": 0.8296, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.07475083056478406, |
|
"grad_norm": 3.1984146233667263, |
|
"learning_rate": 7.468879668049793e-06, |
|
"loss": 0.8155, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07682724252491695, |
|
"grad_norm": 3.05629449726749, |
|
"learning_rate": 7.676348547717844e-06, |
|
"loss": 0.8377, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.07890365448504984, |
|
"grad_norm": 3.249088059891964, |
|
"learning_rate": 7.883817427385892e-06, |
|
"loss": 0.8432, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08098006644518273, |
|
"grad_norm": 3.0028481508425515, |
|
"learning_rate": 8.091286307053943e-06, |
|
"loss": 0.8173, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.08305647840531562, |
|
"grad_norm": 3.059733445916786, |
|
"learning_rate": 8.298755186721992e-06, |
|
"loss": 0.8227, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08513289036544851, |
|
"grad_norm": 3.0867633236533365, |
|
"learning_rate": 8.506224066390042e-06, |
|
"loss": 0.8181, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.0872093023255814, |
|
"grad_norm": 2.997953986592159, |
|
"learning_rate": 8.713692946058093e-06, |
|
"loss": 0.821, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 3.2351659520743072, |
|
"learning_rate": 8.921161825726142e-06, |
|
"loss": 0.8294, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.09136212624584718, |
|
"grad_norm": 3.1494481731597586, |
|
"learning_rate": 9.128630705394191e-06, |
|
"loss": 0.8261, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09343853820598007, |
|
"grad_norm": 3.105511823234228, |
|
"learning_rate": 9.33609958506224e-06, |
|
"loss": 0.8165, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.09551495016611296, |
|
"grad_norm": 3.023901781664328, |
|
"learning_rate": 9.543568464730292e-06, |
|
"loss": 0.8123, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09759136212624585, |
|
"grad_norm": 3.4303556589177187, |
|
"learning_rate": 9.751037344398341e-06, |
|
"loss": 0.8093, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.09966777408637874, |
|
"grad_norm": 3.6054989714255408, |
|
"learning_rate": 9.95850622406639e-06, |
|
"loss": 0.8201, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.10174418604651163, |
|
"grad_norm": 2.990225009601177, |
|
"learning_rate": 9.999915930067828e-06, |
|
"loss": 0.8208, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.10382059800664452, |
|
"grad_norm": 2.9957103647324264, |
|
"learning_rate": 9.999574400813641e-06, |
|
"loss": 0.816, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.10589700996677741, |
|
"grad_norm": 2.8988415018010287, |
|
"learning_rate": 9.998970175798065e-06, |
|
"loss": 0.8044, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1079734219269103, |
|
"grad_norm": 2.893907971746992, |
|
"learning_rate": 9.998103286769267e-06, |
|
"loss": 0.799, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11004983388704319, |
|
"grad_norm": 2.898946354458808, |
|
"learning_rate": 9.996973779276743e-06, |
|
"loss": 0.8113, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.11212624584717608, |
|
"grad_norm": 3.0485697591450998, |
|
"learning_rate": 9.99558171266891e-06, |
|
"loss": 0.8194, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11420265780730897, |
|
"grad_norm": 2.933613250090363, |
|
"learning_rate": 9.993927160089991e-06, |
|
"loss": 0.7981, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.11627906976744186, |
|
"grad_norm": 2.900283777987733, |
|
"learning_rate": 9.992010208476178e-06, |
|
"loss": 0.8114, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11835548172757475, |
|
"grad_norm": 2.869639926652705, |
|
"learning_rate": 9.989830958551058e-06, |
|
"loss": 0.8026, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.12043189368770764, |
|
"grad_norm": 3.0764284732072236, |
|
"learning_rate": 9.98738952482032e-06, |
|
"loss": 0.7816, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12250830564784053, |
|
"grad_norm": 2.872848930860205, |
|
"learning_rate": 9.984686035565742e-06, |
|
"loss": 0.7851, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.12458471760797342, |
|
"grad_norm": 2.7170384439590367, |
|
"learning_rate": 9.98172063283845e-06, |
|
"loss": 0.8054, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12666112956810632, |
|
"grad_norm": 2.785739578421159, |
|
"learning_rate": 9.978493472451451e-06, |
|
"loss": 0.7824, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.1287375415282392, |
|
"grad_norm": 2.955753943035507, |
|
"learning_rate": 9.975004723971452e-06, |
|
"loss": 0.7788, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1308139534883721, |
|
"grad_norm": 2.7566534229071378, |
|
"learning_rate": 9.971254570709939e-06, |
|
"loss": 0.7804, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.132890365448505, |
|
"grad_norm": 3.0399050026271945, |
|
"learning_rate": 9.967243209713563e-06, |
|
"loss": 0.7712, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13496677740863788, |
|
"grad_norm": 3.227011718605211, |
|
"learning_rate": 9.962970851753767e-06, |
|
"loss": 0.7852, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.13704318936877077, |
|
"grad_norm": 2.894940556652265, |
|
"learning_rate": 9.95843772131573e-06, |
|
"loss": 0.767, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13911960132890366, |
|
"grad_norm": 3.137972193410393, |
|
"learning_rate": 9.95364405658655e-06, |
|
"loss": 0.77, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.14119601328903655, |
|
"grad_norm": 2.7913612546678426, |
|
"learning_rate": 9.948590109442755e-06, |
|
"loss": 0.7768, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14327242524916944, |
|
"grad_norm": 2.893979747266515, |
|
"learning_rate": 9.94327614543704e-06, |
|
"loss": 0.7827, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.14534883720930233, |
|
"grad_norm": 2.665071280290936, |
|
"learning_rate": 9.937702443784343e-06, |
|
"loss": 0.7474, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14742524916943522, |
|
"grad_norm": 2.741350083908129, |
|
"learning_rate": 9.931869297347146e-06, |
|
"loss": 0.7638, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.14950166112956811, |
|
"grad_norm": 2.9878149207237357, |
|
"learning_rate": 9.925777012620111e-06, |
|
"loss": 0.7419, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.151578073089701, |
|
"grad_norm": 2.801227928713699, |
|
"learning_rate": 9.919425909713958e-06, |
|
"loss": 0.769, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.1536544850498339, |
|
"grad_norm": 3.023770968839729, |
|
"learning_rate": 9.912816322338659e-06, |
|
"loss": 0.7447, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15573089700996678, |
|
"grad_norm": 2.9927287523796715, |
|
"learning_rate": 9.905948597785888e-06, |
|
"loss": 0.754, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.15780730897009967, |
|
"grad_norm": 11.785492453222856, |
|
"learning_rate": 9.89882309691079e-06, |
|
"loss": 0.7497, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.15988372093023256, |
|
"grad_norm": 2.8752234411604682, |
|
"learning_rate": 9.891440194113008e-06, |
|
"loss": 0.7427, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.16196013289036545, |
|
"grad_norm": 3.097207390376622, |
|
"learning_rate": 9.88380027731702e-06, |
|
"loss": 0.7542, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.16403654485049834, |
|
"grad_norm": 2.921991118334764, |
|
"learning_rate": 9.875903747951742e-06, |
|
"loss": 0.7621, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.16611295681063123, |
|
"grad_norm": 2.8395297947865963, |
|
"learning_rate": 9.867751020929454e-06, |
|
"loss": 0.735, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.16818936877076412, |
|
"grad_norm": 2.726116425089643, |
|
"learning_rate": 9.859342524623985e-06, |
|
"loss": 0.7124, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.17026578073089702, |
|
"grad_norm": 3.2173444091652943, |
|
"learning_rate": 9.850678700848208e-06, |
|
"loss": 0.7374, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.1723421926910299, |
|
"grad_norm": 2.716930762983964, |
|
"learning_rate": 9.84176000483083e-06, |
|
"loss": 0.7138, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.1744186046511628, |
|
"grad_norm": 2.985441779621083, |
|
"learning_rate": 9.832586905192469e-06, |
|
"loss": 0.731, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.17649501661129569, |
|
"grad_norm": 3.032790315651323, |
|
"learning_rate": 9.823159883921028e-06, |
|
"loss": 0.7215, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 2.6988344818168155, |
|
"learning_rate": 9.813479436346378e-06, |
|
"loss": 0.7183, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18064784053156147, |
|
"grad_norm": 2.973146607192177, |
|
"learning_rate": 9.803546071114323e-06, |
|
"loss": 0.7311, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.18272425249169436, |
|
"grad_norm": 2.9093506646801344, |
|
"learning_rate": 9.793360310159878e-06, |
|
"loss": 0.7049, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18480066445182725, |
|
"grad_norm": 3.01100096145872, |
|
"learning_rate": 9.782922688679847e-06, |
|
"loss": 0.7118, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.18687707641196014, |
|
"grad_norm": 2.716470652939527, |
|
"learning_rate": 9.772233755104695e-06, |
|
"loss": 0.7277, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18895348837209303, |
|
"grad_norm": 2.7134248053870165, |
|
"learning_rate": 9.761294071069736e-06, |
|
"loss": 0.7205, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.19102990033222592, |
|
"grad_norm": 2.6251507638777163, |
|
"learning_rate": 9.750104211385625e-06, |
|
"loss": 0.7152, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1931063122923588, |
|
"grad_norm": 2.8023948010803483, |
|
"learning_rate": 9.738664764008149e-06, |
|
"loss": 0.7233, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.1951827242524917, |
|
"grad_norm": 3.714290449563204, |
|
"learning_rate": 9.726976330007341e-06, |
|
"loss": 0.6998, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1972591362126246, |
|
"grad_norm": 2.8670419197216512, |
|
"learning_rate": 9.71503952353589e-06, |
|
"loss": 0.6985, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.19933554817275748, |
|
"grad_norm": 3.1683988394439107, |
|
"learning_rate": 9.702854971796876e-06, |
|
"loss": 0.7089, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20141196013289037, |
|
"grad_norm": 3.2223078839261166, |
|
"learning_rate": 9.690423315010814e-06, |
|
"loss": 0.7053, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.20348837209302326, |
|
"grad_norm": 2.77875488832717, |
|
"learning_rate": 9.677745206382014e-06, |
|
"loss": 0.7271, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.20556478405315615, |
|
"grad_norm": 2.888271933836237, |
|
"learning_rate": 9.664821312064258e-06, |
|
"loss": 0.7018, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.20764119601328904, |
|
"grad_norm": 3.2746008040723815, |
|
"learning_rate": 9.651652311125803e-06, |
|
"loss": 0.6991, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.20971760797342193, |
|
"grad_norm": 2.76622547311742, |
|
"learning_rate": 9.638238895513687e-06, |
|
"loss": 0.7075, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.21179401993355482, |
|
"grad_norm": 2.9972446036957114, |
|
"learning_rate": 9.624581770017392e-06, |
|
"loss": 0.6857, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2138704318936877, |
|
"grad_norm": 2.869516499460042, |
|
"learning_rate": 9.610681652231794e-06, |
|
"loss": 0.6916, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.2159468438538206, |
|
"grad_norm": 2.742923434452921, |
|
"learning_rate": 9.596539272519468e-06, |
|
"loss": 0.6811, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2180232558139535, |
|
"grad_norm": 2.8482023108565677, |
|
"learning_rate": 9.582155373972303e-06, |
|
"loss": 0.6744, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.22009966777408638, |
|
"grad_norm": 2.9348099403663124, |
|
"learning_rate": 9.56753071237247e-06, |
|
"loss": 0.6776, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22217607973421927, |
|
"grad_norm": 2.786772996017183, |
|
"learning_rate": 9.552666056152704e-06, |
|
"loss": 0.6798, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.22425249169435216, |
|
"grad_norm": 2.92722689041533, |
|
"learning_rate": 9.537562186355918e-06, |
|
"loss": 0.6843, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22632890365448505, |
|
"grad_norm": 2.7694998172195207, |
|
"learning_rate": 9.52221989659418e-06, |
|
"loss": 0.6938, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.22840531561461794, |
|
"grad_norm": 2.9300442858036244, |
|
"learning_rate": 9.506639993007012e-06, |
|
"loss": 0.6944, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.23048172757475083, |
|
"grad_norm": 3.1035204783454993, |
|
"learning_rate": 9.490823294219015e-06, |
|
"loss": 0.672, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 2.6193387690961245, |
|
"learning_rate": 9.474770631296882e-06, |
|
"loss": 0.6561, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2346345514950166, |
|
"grad_norm": 2.61646550507026, |
|
"learning_rate": 9.458482847705705e-06, |
|
"loss": 0.6576, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.2367109634551495, |
|
"grad_norm": 2.756473668019519, |
|
"learning_rate": 9.441960799264678e-06, |
|
"loss": 0.6851, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2387873754152824, |
|
"grad_norm": 2.6995089678231614, |
|
"learning_rate": 9.425205354102111e-06, |
|
"loss": 0.6648, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.24086378737541528, |
|
"grad_norm": 2.7140254791209677, |
|
"learning_rate": 9.408217392609831e-06, |
|
"loss": 0.6451, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.24294019933554817, |
|
"grad_norm": 2.607599787114018, |
|
"learning_rate": 9.390997807396912e-06, |
|
"loss": 0.67, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.24501661129568106, |
|
"grad_norm": 2.8420050898692764, |
|
"learning_rate": 9.373547503242775e-06, |
|
"loss": 0.6657, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.24709302325581395, |
|
"grad_norm": 2.9228965685399095, |
|
"learning_rate": 9.355867397049658e-06, |
|
"loss": 0.6566, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.24916943521594684, |
|
"grad_norm": 2.8048600929777403, |
|
"learning_rate": 9.337958417794425e-06, |
|
"loss": 0.6457, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25124584717607973, |
|
"grad_norm": 2.6983485281997415, |
|
"learning_rate": 9.319821506479762e-06, |
|
"loss": 0.6376, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.25332225913621265, |
|
"grad_norm": 2.801805288954333, |
|
"learning_rate": 9.301457616084733e-06, |
|
"loss": 0.6523, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2553986710963455, |
|
"grad_norm": 2.820864396273499, |
|
"learning_rate": 9.282867711514703e-06, |
|
"loss": 0.6365, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.2574750830564784, |
|
"grad_norm": 2.9932167823643043, |
|
"learning_rate": 9.264052769550643e-06, |
|
"loss": 0.6425, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2595514950166113, |
|
"grad_norm": 2.6556108045628544, |
|
"learning_rate": 9.245013778797802e-06, |
|
"loss": 0.6562, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.2616279069767442, |
|
"grad_norm": 2.676416816690246, |
|
"learning_rate": 9.225751739633772e-06, |
|
"loss": 0.6387, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.26370431893687707, |
|
"grad_norm": 2.702226526508375, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.6348, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.26578073089701, |
|
"grad_norm": 2.637563222880754, |
|
"learning_rate": 9.186562576128159e-06, |
|
"loss": 0.6263, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 2.7815352111724603, |
|
"learning_rate": 9.16663751092728e-06, |
|
"loss": 0.6362, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.26993355481727577, |
|
"grad_norm": 2.8822755136904528, |
|
"learning_rate": 9.146493515488418e-06, |
|
"loss": 0.6164, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.27200996677740863, |
|
"grad_norm": 2.5755107274498146, |
|
"learning_rate": 9.126131648250112e-06, |
|
"loss": 0.6342, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.27408637873754155, |
|
"grad_norm": 2.584492766117294, |
|
"learning_rate": 9.105552979098675e-06, |
|
"loss": 0.6329, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2761627906976744, |
|
"grad_norm": 2.6805388863449036, |
|
"learning_rate": 9.084758589311977e-06, |
|
"loss": 0.6307, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.2782392026578073, |
|
"grad_norm": 2.7584115266730693, |
|
"learning_rate": 9.063749571502633e-06, |
|
"loss": 0.6374, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.2803156146179402, |
|
"grad_norm": 2.8092430217085145, |
|
"learning_rate": 9.04252702956059e-06, |
|
"loss": 0.6282, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.2823920265780731, |
|
"grad_norm": 2.6353604501522168, |
|
"learning_rate": 9.021092078595132e-06, |
|
"loss": 0.6332, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.28446843853820597, |
|
"grad_norm": 2.7859177417571486, |
|
"learning_rate": 8.999445844876276e-06, |
|
"loss": 0.6381, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.2865448504983389, |
|
"grad_norm": 2.6603634875986457, |
|
"learning_rate": 8.977589465775607e-06, |
|
"loss": 0.6312, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.28862126245847175, |
|
"grad_norm": 2.6293766795824354, |
|
"learning_rate": 8.955524089706506e-06, |
|
"loss": 0.5999, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.29069767441860467, |
|
"grad_norm": 2.8986723382239967, |
|
"learning_rate": 8.933250876063815e-06, |
|
"loss": 0.6297, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29277408637873753, |
|
"grad_norm": 2.6589365161649834, |
|
"learning_rate": 8.910770995162913e-06, |
|
"loss": 0.6303, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.29485049833887045, |
|
"grad_norm": 2.64992234535583, |
|
"learning_rate": 8.88808562817823e-06, |
|
"loss": 0.6114, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2969269102990033, |
|
"grad_norm": 2.7322760412568776, |
|
"learning_rate": 8.865195967081174e-06, |
|
"loss": 0.6215, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.29900332225913623, |
|
"grad_norm": 2.576473302210113, |
|
"learning_rate": 8.842103214577511e-06, |
|
"loss": 0.6147, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3010797342192691, |
|
"grad_norm": 2.507546434543662, |
|
"learning_rate": 8.818808584044163e-06, |
|
"loss": 0.6089, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.303156146179402, |
|
"grad_norm": 2.953501799132662, |
|
"learning_rate": 8.795313299465455e-06, |
|
"loss": 0.6147, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.30523255813953487, |
|
"grad_norm": 2.58266860044093, |
|
"learning_rate": 8.771618595368806e-06, |
|
"loss": 0.6024, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.3073089700996678, |
|
"grad_norm": 2.7291039422306613, |
|
"learning_rate": 8.747725716759859e-06, |
|
"loss": 0.6152, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.30938538205980065, |
|
"grad_norm": 2.696653736904745, |
|
"learning_rate": 8.723635919057058e-06, |
|
"loss": 0.6082, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.31146179401993357, |
|
"grad_norm": 2.639188973608746, |
|
"learning_rate": 8.699350468025699e-06, |
|
"loss": 0.5924, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.31353820598006643, |
|
"grad_norm": 2.5960120065556294, |
|
"learning_rate": 8.674870639711403e-06, |
|
"loss": 0.5871, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.31561461794019935, |
|
"grad_norm": 2.691098687645451, |
|
"learning_rate": 8.650197720373091e-06, |
|
"loss": 0.5937, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3176910299003322, |
|
"grad_norm": 2.7922815680081947, |
|
"learning_rate": 8.625333006415372e-06, |
|
"loss": 0.5806, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.31976744186046513, |
|
"grad_norm": 2.5989983221444635, |
|
"learning_rate": 8.600277804320452e-06, |
|
"loss": 0.5889, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.321843853820598, |
|
"grad_norm": 2.7500580415708553, |
|
"learning_rate": 8.575033430579465e-06, |
|
"loss": 0.5929, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.3239202657807309, |
|
"grad_norm": 2.9863748696055485, |
|
"learning_rate": 8.549601211623316e-06, |
|
"loss": 0.5905, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.32599667774086377, |
|
"grad_norm": 2.7128601524461966, |
|
"learning_rate": 8.523982483752973e-06, |
|
"loss": 0.5838, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.3280730897009967, |
|
"grad_norm": 2.6273588590853727, |
|
"learning_rate": 8.498178593069262e-06, |
|
"loss": 0.579, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.33014950166112955, |
|
"grad_norm": 2.6424251208940714, |
|
"learning_rate": 8.472190895402131e-06, |
|
"loss": 0.568, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.33222591362126247, |
|
"grad_norm": 2.774060760650428, |
|
"learning_rate": 8.446020756239418e-06, |
|
"loss": 0.5881, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.33430232558139533, |
|
"grad_norm": 2.7429673227633193, |
|
"learning_rate": 8.419669550655093e-06, |
|
"loss": 0.5807, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.33637873754152825, |
|
"grad_norm": 2.4588138685140164, |
|
"learning_rate": 8.393138663237015e-06, |
|
"loss": 0.5699, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.3384551495016611, |
|
"grad_norm": 2.8894345220890845, |
|
"learning_rate": 8.366429488014178e-06, |
|
"loss": 0.5644, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.34053156146179403, |
|
"grad_norm": 2.6417969175920253, |
|
"learning_rate": 8.339543428383467e-06, |
|
"loss": 0.577, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3426079734219269, |
|
"grad_norm": 2.639049529021501, |
|
"learning_rate": 8.312481897035906e-06, |
|
"loss": 0.5835, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.3446843853820598, |
|
"grad_norm": 2.791601353912272, |
|
"learning_rate": 8.285246315882448e-06, |
|
"loss": 0.5873, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3467607973421927, |
|
"grad_norm": 2.760486538247162, |
|
"learning_rate": 8.257838115979244e-06, |
|
"loss": 0.5743, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.3488372093023256, |
|
"grad_norm": 2.6084506349864114, |
|
"learning_rate": 8.230258737452473e-06, |
|
"loss": 0.5835, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.35091362126245845, |
|
"grad_norm": 2.568077365967415, |
|
"learning_rate": 8.202509629422647e-06, |
|
"loss": 0.5663, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.35299003322259137, |
|
"grad_norm": 3.338586543406698, |
|
"learning_rate": 8.17459224992849e-06, |
|
"loss": 0.561, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.35506644518272423, |
|
"grad_norm": 2.550936924190995, |
|
"learning_rate": 8.14650806585031e-06, |
|
"loss": 0.5748, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 2.730568567607308, |
|
"learning_rate": 8.118258552832945e-06, |
|
"loss": 0.5526, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.35921926910299, |
|
"grad_norm": 2.7922640713365765, |
|
"learning_rate": 8.0898451952082e-06, |
|
"loss": 0.5636, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.36129568106312293, |
|
"grad_norm": 2.4817520439108782, |
|
"learning_rate": 8.061269485916881e-06, |
|
"loss": 0.565, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.3633720930232558, |
|
"grad_norm": 2.5897869437416814, |
|
"learning_rate": 8.032532926430335e-06, |
|
"loss": 0.5718, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3654485049833887, |
|
"grad_norm": 2.6233407361081196, |
|
"learning_rate": 8.003637026671558e-06, |
|
"loss": 0.5495, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3675249169435216, |
|
"grad_norm": 2.590608968830393, |
|
"learning_rate": 7.974583304935867e-06, |
|
"loss": 0.5701, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.3696013289036545, |
|
"grad_norm": 2.676185626796156, |
|
"learning_rate": 7.945373287811116e-06, |
|
"loss": 0.5476, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.37167774086378735, |
|
"grad_norm": 2.588749653152642, |
|
"learning_rate": 7.916008510097483e-06, |
|
"loss": 0.5363, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.37375415282392027, |
|
"grad_norm": 2.648109565452331, |
|
"learning_rate": 7.88649051472683e-06, |
|
"loss": 0.5566, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.37583056478405313, |
|
"grad_norm": 2.6123078212762567, |
|
"learning_rate": 7.856820852681634e-06, |
|
"loss": 0.5481, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.37790697674418605, |
|
"grad_norm": 2.5715025577779107, |
|
"learning_rate": 7.82700108291348e-06, |
|
"loss": 0.5554, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3799833887043189, |
|
"grad_norm": 2.6810117688521333, |
|
"learning_rate": 7.797032772261164e-06, |
|
"loss": 0.5396, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.38205980066445183, |
|
"grad_norm": 2.828001329589521, |
|
"learning_rate": 7.766917495368356e-06, |
|
"loss": 0.549, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.3841362126245847, |
|
"grad_norm": 2.6073301891312455, |
|
"learning_rate": 7.736656834600866e-06, |
|
"loss": 0.5403, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3862126245847176, |
|
"grad_norm": 2.7467154847057107, |
|
"learning_rate": 7.706252379963498e-06, |
|
"loss": 0.5395, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3882890365448505, |
|
"grad_norm": 2.6418072073420067, |
|
"learning_rate": 7.675705729016508e-06, |
|
"loss": 0.5363, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.3903654485049834, |
|
"grad_norm": 2.632007372607857, |
|
"learning_rate": 7.645018486791664e-06, |
|
"loss": 0.5377, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.39244186046511625, |
|
"grad_norm": 2.4652302347093364, |
|
"learning_rate": 7.6141922657079045e-06, |
|
"loss": 0.5321, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.3945182724252492, |
|
"grad_norm": 2.5492866422631764, |
|
"learning_rate": 7.583228685486623e-06, |
|
"loss": 0.5433, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.39659468438538203, |
|
"grad_norm": 2.4794671881341936, |
|
"learning_rate": 7.552129373066565e-06, |
|
"loss": 0.5423, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.39867109634551495, |
|
"grad_norm": 2.565377450639672, |
|
"learning_rate": 7.520895962518329e-06, |
|
"loss": 0.5357, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.4007475083056478, |
|
"grad_norm": 2.7376349329000504, |
|
"learning_rate": 7.489530094958521e-06, |
|
"loss": 0.5529, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.40282392026578073, |
|
"grad_norm": 2.5470062145134778, |
|
"learning_rate": 7.458033418463517e-06, |
|
"loss": 0.5167, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.4049003322259136, |
|
"grad_norm": 2.5915393940286724, |
|
"learning_rate": 7.426407587982869e-06, |
|
"loss": 0.5359, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.4069767441860465, |
|
"grad_norm": 2.5521473612501118, |
|
"learning_rate": 7.394654265252348e-06, |
|
"loss": 0.5448, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4090531561461794, |
|
"grad_norm": 2.540390049884069, |
|
"learning_rate": 7.362775118706627e-06, |
|
"loss": 0.5224, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.4111295681063123, |
|
"grad_norm": 2.67106563437947, |
|
"learning_rate": 7.330771823391622e-06, |
|
"loss": 0.547, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.41320598006644516, |
|
"grad_norm": 2.5844286453504752, |
|
"learning_rate": 7.298646060876473e-06, |
|
"loss": 0.5245, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.4152823920265781, |
|
"grad_norm": 2.489462893225223, |
|
"learning_rate": 7.266399519165193e-06, |
|
"loss": 0.5177, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.417358803986711, |
|
"grad_norm": 2.548885028848683, |
|
"learning_rate": 7.234033892607969e-06, |
|
"loss": 0.5285, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.41943521594684385, |
|
"grad_norm": 2.515732979636329, |
|
"learning_rate": 7.201550881812138e-06, |
|
"loss": 0.5295, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.42151162790697677, |
|
"grad_norm": 2.580813201220608, |
|
"learning_rate": 7.168952193552831e-06, |
|
"loss": 0.5144, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.42358803986710963, |
|
"grad_norm": 2.8605769340325544, |
|
"learning_rate": 7.136239540683297e-06, |
|
"loss": 0.5189, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.42566445182724255, |
|
"grad_norm": 2.7042921962644773, |
|
"learning_rate": 7.103414642044888e-06, |
|
"loss": 0.516, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.4277408637873754, |
|
"grad_norm": 2.5935305392513475, |
|
"learning_rate": 7.070479222376765e-06, |
|
"loss": 0.5273, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.42981727574750833, |
|
"grad_norm": 2.521806447567166, |
|
"learning_rate": 7.037435012225259e-06, |
|
"loss": 0.514, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.4318936877076412, |
|
"grad_norm": 2.4922095571026808, |
|
"learning_rate": 7.00428374785295e-06, |
|
"loss": 0.5191, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.4339700996677741, |
|
"grad_norm": 2.53445755137843, |
|
"learning_rate": 6.971027171147436e-06, |
|
"loss": 0.5175, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.436046511627907, |
|
"grad_norm": 2.5854663493896815, |
|
"learning_rate": 6.937667029529803e-06, |
|
"loss": 0.5052, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.4381229235880399, |
|
"grad_norm": 2.6149256231235767, |
|
"learning_rate": 6.904205075862816e-06, |
|
"loss": 0.5155, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.44019933554817275, |
|
"grad_norm": 2.5728069972099643, |
|
"learning_rate": 6.870643068358813e-06, |
|
"loss": 0.5164, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.44227574750830567, |
|
"grad_norm": 2.610034601385569, |
|
"learning_rate": 6.8369827704873225e-06, |
|
"loss": 0.515, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.44435215946843853, |
|
"grad_norm": 2.559653943614866, |
|
"learning_rate": 6.803225950882407e-06, |
|
"loss": 0.5103, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 2.744659999074845, |
|
"learning_rate": 6.769374383249728e-06, |
|
"loss": 0.5144, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.4485049833887043, |
|
"grad_norm": 2.500834722382555, |
|
"learning_rate": 6.735429846273356e-06, |
|
"loss": 0.509, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.45058139534883723, |
|
"grad_norm": 2.571303478772175, |
|
"learning_rate": 6.701394123522303e-06, |
|
"loss": 0.5061, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.4526578073089701, |
|
"grad_norm": 2.6726371126474042, |
|
"learning_rate": 6.667269003356815e-06, |
|
"loss": 0.4872, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.454734219269103, |
|
"grad_norm": 2.314624945694432, |
|
"learning_rate": 6.633056278834403e-06, |
|
"loss": 0.4978, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.4568106312292359, |
|
"grad_norm": 2.5660125412801986, |
|
"learning_rate": 6.598757747615625e-06, |
|
"loss": 0.4873, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4588870431893688, |
|
"grad_norm": 2.5055302944005655, |
|
"learning_rate": 6.564375211869638e-06, |
|
"loss": 0.4955, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.46096345514950166, |
|
"grad_norm": 2.3161654964295963, |
|
"learning_rate": 6.529910478179499e-06, |
|
"loss": 0.4996, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.4630398671096346, |
|
"grad_norm": 2.713583584390501, |
|
"learning_rate": 6.495365357447242e-06, |
|
"loss": 0.4837, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 2.6986080979156597, |
|
"learning_rate": 6.4607416647987285e-06, |
|
"loss": 0.503, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.46719269102990035, |
|
"grad_norm": 2.3758745672703614, |
|
"learning_rate": 6.426041219488275e-06, |
|
"loss": 0.4917, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.4692691029900332, |
|
"grad_norm": 2.468317610874025, |
|
"learning_rate": 6.39126584480306e-06, |
|
"loss": 0.4947, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.47134551495016613, |
|
"grad_norm": 2.672466601805675, |
|
"learning_rate": 6.3564173679673225e-06, |
|
"loss": 0.4956, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.473421926910299, |
|
"grad_norm": 2.686387722109422, |
|
"learning_rate": 6.321497620046353e-06, |
|
"loss": 0.4958, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.4754983388704319, |
|
"grad_norm": 2.4115883144762105, |
|
"learning_rate": 6.286508435850282e-06, |
|
"loss": 0.4884, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.4775747508305648, |
|
"grad_norm": 2.473062095275494, |
|
"learning_rate": 6.251451653837679e-06, |
|
"loss": 0.4873, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4796511627906977, |
|
"grad_norm": 2.4611172122096034, |
|
"learning_rate": 6.216329116018943e-06, |
|
"loss": 0.4828, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.48172757475083056, |
|
"grad_norm": 2.438501558434762, |
|
"learning_rate": 6.181142667859521e-06, |
|
"loss": 0.4743, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.4838039867109635, |
|
"grad_norm": 2.4623748153401586, |
|
"learning_rate": 6.145894158182945e-06, |
|
"loss": 0.4813, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.48588039867109634, |
|
"grad_norm": 2.5841330806095093, |
|
"learning_rate": 6.11058543907368e-06, |
|
"loss": 0.4757, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.48795681063122925, |
|
"grad_norm": 2.420645551171905, |
|
"learning_rate": 6.075218365779814e-06, |
|
"loss": 0.4717, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.4900332225913621, |
|
"grad_norm": 2.41753538282735, |
|
"learning_rate": 6.039794796615575e-06, |
|
"loss": 0.4683, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.49210963455149503, |
|
"grad_norm": 2.6345922483315993, |
|
"learning_rate": 6.004316592863693e-06, |
|
"loss": 0.4758, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.4941860465116279, |
|
"grad_norm": 2.580357854248359, |
|
"learning_rate": 5.96878561867759e-06, |
|
"loss": 0.4923, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.4962624584717608, |
|
"grad_norm": 2.3693846881679463, |
|
"learning_rate": 5.9332037409834466e-06, |
|
"loss": 0.4732, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.4983388704318937, |
|
"grad_norm": 2.769567429139866, |
|
"learning_rate": 5.89757282938209e-06, |
|
"loss": 0.4713, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5004152823920266, |
|
"grad_norm": 2.41622785319668, |
|
"learning_rate": 5.86189475605077e-06, |
|
"loss": 0.476, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.5024916943521595, |
|
"grad_norm": 2.499791289384567, |
|
"learning_rate": 5.826171395644786e-06, |
|
"loss": 0.4749, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5045681063122923, |
|
"grad_norm": 2.417525944289692, |
|
"learning_rate": 5.790404625198982e-06, |
|
"loss": 0.4726, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.5066445182724253, |
|
"grad_norm": 2.5878334687029114, |
|
"learning_rate": 5.754596324029125e-06, |
|
"loss": 0.4761, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5087209302325582, |
|
"grad_norm": 2.4962102663667043, |
|
"learning_rate": 5.7187483736331554e-06, |
|
"loss": 0.4578, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.510797342192691, |
|
"grad_norm": 2.6263564446414636, |
|
"learning_rate": 5.682862657592327e-06, |
|
"loss": 0.4825, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5128737541528239, |
|
"grad_norm": 2.880797119411763, |
|
"learning_rate": 5.646941061472242e-06, |
|
"loss": 0.469, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.5149501661129569, |
|
"grad_norm": 2.555965100494747, |
|
"learning_rate": 5.610985472723764e-06, |
|
"loss": 0.4712, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5170265780730897, |
|
"grad_norm": 2.502236357284136, |
|
"learning_rate": 5.5749977805838615e-06, |
|
"loss": 0.4681, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.5191029900332226, |
|
"grad_norm": 2.4360635002482347, |
|
"learning_rate": 5.538979875976324e-06, |
|
"loss": 0.4636, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5211794019933554, |
|
"grad_norm": 2.488011716508302, |
|
"learning_rate": 5.502933651412417e-06, |
|
"loss": 0.4699, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.5232558139534884, |
|
"grad_norm": 2.3770436189443696, |
|
"learning_rate": 5.466861000891439e-06, |
|
"loss": 0.4592, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5253322259136213, |
|
"grad_norm": 2.7541846157024876, |
|
"learning_rate": 5.430763819801205e-06, |
|
"loss": 0.4692, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.5274086378737541, |
|
"grad_norm": 2.7287082031019745, |
|
"learning_rate": 5.394644004818452e-06, |
|
"loss": 0.4745, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.529485049833887, |
|
"grad_norm": 2.5164954994115094, |
|
"learning_rate": 5.3585034538091885e-06, |
|
"loss": 0.4525, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.53156146179402, |
|
"grad_norm": 2.347205777105881, |
|
"learning_rate": 5.322344065728964e-06, |
|
"loss": 0.4689, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5336378737541528, |
|
"grad_norm": 2.582827989286747, |
|
"learning_rate": 5.286167740523099e-06, |
|
"loss": 0.4691, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 2.5061090934097843, |
|
"learning_rate": 5.249976379026851e-06, |
|
"loss": 0.4436, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5377906976744186, |
|
"grad_norm": 2.4524559965169748, |
|
"learning_rate": 5.213771882865538e-06, |
|
"loss": 0.4643, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.5398671096345515, |
|
"grad_norm": 2.560097527019471, |
|
"learning_rate": 5.177556154354622e-06, |
|
"loss": 0.4464, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5419435215946844, |
|
"grad_norm": 2.397260026201424, |
|
"learning_rate": 5.141331096399755e-06, |
|
"loss": 0.4501, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.5440199335548173, |
|
"grad_norm": 2.351541148312247, |
|
"learning_rate": 5.1050986123967884e-06, |
|
"loss": 0.4398, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5460963455149501, |
|
"grad_norm": 2.452194040455103, |
|
"learning_rate": 5.068860606131766e-06, |
|
"loss": 0.4516, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.5481727574750831, |
|
"grad_norm": 2.593569889967618, |
|
"learning_rate": 5.032618981680893e-06, |
|
"loss": 0.4534, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.550249169435216, |
|
"grad_norm": 2.491194365967403, |
|
"learning_rate": 4.9963756433104875e-06, |
|
"loss": 0.4561, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.5523255813953488, |
|
"grad_norm": 2.5315048028501432, |
|
"learning_rate": 4.960132495376919e-06, |
|
"loss": 0.4387, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.5544019933554817, |
|
"grad_norm": 2.4221610492026566, |
|
"learning_rate": 4.923891442226554e-06, |
|
"loss": 0.4526, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.5564784053156147, |
|
"grad_norm": 2.4574741459986043, |
|
"learning_rate": 4.887654388095691e-06, |
|
"loss": 0.4388, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.5585548172757475, |
|
"grad_norm": 2.5581004359073565, |
|
"learning_rate": 4.851423237010504e-06, |
|
"loss": 0.4512, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.5606312292358804, |
|
"grad_norm": 2.5084567945271634, |
|
"learning_rate": 4.815199892687006e-06, |
|
"loss": 0.464, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.5627076411960132, |
|
"grad_norm": 2.4656070255557294, |
|
"learning_rate": 4.778986258431005e-06, |
|
"loss": 0.4471, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.5647840531561462, |
|
"grad_norm": 2.494517722129321, |
|
"learning_rate": 4.742784237038113e-06, |
|
"loss": 0.4352, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5668604651162791, |
|
"grad_norm": 2.5383042319953995, |
|
"learning_rate": 4.70659573069376e-06, |
|
"loss": 0.421, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.5689368770764119, |
|
"grad_norm": 2.3933135171603936, |
|
"learning_rate": 4.670422640873242e-06, |
|
"loss": 0.4379, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.5710132890365448, |
|
"grad_norm": 2.4020680375977133, |
|
"learning_rate": 4.63426686824182e-06, |
|
"loss": 0.4323, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.5730897009966778, |
|
"grad_norm": 2.43581294994139, |
|
"learning_rate": 4.598130312554843e-06, |
|
"loss": 0.4397, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5751661129568106, |
|
"grad_norm": 2.5772706634163027, |
|
"learning_rate": 4.562014872557936e-06, |
|
"loss": 0.4362, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.5772425249169435, |
|
"grad_norm": 2.448863408768738, |
|
"learning_rate": 4.525922445887224e-06, |
|
"loss": 0.4349, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5793189368770764, |
|
"grad_norm": 2.535308434878213, |
|
"learning_rate": 4.489854928969635e-06, |
|
"loss": 0.4516, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.5813953488372093, |
|
"grad_norm": 2.3973615256768768, |
|
"learning_rate": 4.453814216923242e-06, |
|
"loss": 0.4336, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5834717607973422, |
|
"grad_norm": 2.3119199540164965, |
|
"learning_rate": 4.4178022034576976e-06, |
|
"loss": 0.4226, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.5855481727574751, |
|
"grad_norm": 2.3014825037296633, |
|
"learning_rate": 4.381820780774724e-06, |
|
"loss": 0.4322, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5876245847176079, |
|
"grad_norm": 2.5351337278959556, |
|
"learning_rate": 4.345871839468694e-06, |
|
"loss": 0.4055, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.5897009966777409, |
|
"grad_norm": 2.611286820208639, |
|
"learning_rate": 4.309957268427292e-06, |
|
"loss": 0.4216, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5917774086378738, |
|
"grad_norm": 2.3889570520642684, |
|
"learning_rate": 4.274078954732262e-06, |
|
"loss": 0.4427, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.5938538205980066, |
|
"grad_norm": 2.384724624598042, |
|
"learning_rate": 4.2382387835602565e-06, |
|
"loss": 0.4246, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.5959302325581395, |
|
"grad_norm": 2.3536762842777126, |
|
"learning_rate": 4.20243863808378e-06, |
|
"loss": 0.4352, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.5980066445182725, |
|
"grad_norm": 2.367560729519929, |
|
"learning_rate": 4.166680399372248e-06, |
|
"loss": 0.4226, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6000830564784053, |
|
"grad_norm": 2.401186140827422, |
|
"learning_rate": 4.130965946293135e-06, |
|
"loss": 0.4529, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.6021594684385382, |
|
"grad_norm": 2.3503805374006457, |
|
"learning_rate": 4.095297155413264e-06, |
|
"loss": 0.4213, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.604235880398671, |
|
"grad_norm": 2.404199762232402, |
|
"learning_rate": 4.059675900900199e-06, |
|
"loss": 0.4309, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.606312292358804, |
|
"grad_norm": 2.5304024582625053, |
|
"learning_rate": 4.024104054423772e-06, |
|
"loss": 0.4215, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6083887043189369, |
|
"grad_norm": 2.4035116235125473, |
|
"learning_rate": 3.9885834850577375e-06, |
|
"loss": 0.4282, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.6104651162790697, |
|
"grad_norm": 2.3499844076305156, |
|
"learning_rate": 3.953116059181563e-06, |
|
"loss": 0.422, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6125415282392026, |
|
"grad_norm": 2.5288170114153585, |
|
"learning_rate": 3.9177036403823645e-06, |
|
"loss": 0.4329, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.6146179401993356, |
|
"grad_norm": 2.3290974062316057, |
|
"learning_rate": 3.882348089356992e-06, |
|
"loss": 0.4137, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6166943521594684, |
|
"grad_norm": 2.4328677326588894, |
|
"learning_rate": 3.84705126381425e-06, |
|
"loss": 0.4297, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.6187707641196013, |
|
"grad_norm": 2.3908310630477954, |
|
"learning_rate": 3.8118150183772974e-06, |
|
"loss": 0.4293, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6208471760797342, |
|
"grad_norm": 2.4893827738846808, |
|
"learning_rate": 3.776641204486191e-06, |
|
"loss": 0.4214, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.6229235880398671, |
|
"grad_norm": 2.3486377563484133, |
|
"learning_rate": 3.7415316703006116e-06, |
|
"loss": 0.405, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2.466506888817687, |
|
"learning_rate": 3.7064882606027497e-06, |
|
"loss": 0.426, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.6270764119601329, |
|
"grad_norm": 2.496662130115367, |
|
"learning_rate": 3.671512816700375e-06, |
|
"loss": 0.4201, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6291528239202658, |
|
"grad_norm": 2.265163717312505, |
|
"learning_rate": 3.636607176330088e-06, |
|
"loss": 0.4205, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.6312292358803987, |
|
"grad_norm": 2.2703878574783163, |
|
"learning_rate": 3.60177317356076e-06, |
|
"loss": 0.4101, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6333056478405316, |
|
"grad_norm": 2.423443407995488, |
|
"learning_rate": 3.5670126386971625e-06, |
|
"loss": 0.4171, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.6353820598006644, |
|
"grad_norm": 2.44608682526587, |
|
"learning_rate": 3.5323273981837965e-06, |
|
"loss": 0.416, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.6374584717607974, |
|
"grad_norm": 2.2051417207338173, |
|
"learning_rate": 3.497719274508925e-06, |
|
"loss": 0.4019, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.6395348837209303, |
|
"grad_norm": 2.4800578989548034, |
|
"learning_rate": 3.4631900861088132e-06, |
|
"loss": 0.4029, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.6416112956810631, |
|
"grad_norm": 2.3268282845100035, |
|
"learning_rate": 3.4287416472721795e-06, |
|
"loss": 0.4111, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.643687707641196, |
|
"grad_norm": 2.3872453059218532, |
|
"learning_rate": 3.3943757680448697e-06, |
|
"loss": 0.4061, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.645764119601329, |
|
"grad_norm": 2.42558490404232, |
|
"learning_rate": 3.360094254134746e-06, |
|
"loss": 0.403, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.6478405315614618, |
|
"grad_norm": 2.441847356983534, |
|
"learning_rate": 3.3258989068168123e-06, |
|
"loss": 0.417, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.6499169435215947, |
|
"grad_norm": 2.356616246546388, |
|
"learning_rate": 3.2917915228385676e-06, |
|
"loss": 0.4008, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.6519933554817275, |
|
"grad_norm": 2.457529466848808, |
|
"learning_rate": 3.257773894325599e-06, |
|
"loss": 0.4166, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.6540697674418605, |
|
"grad_norm": 2.5688010790796154, |
|
"learning_rate": 3.223847808687415e-06, |
|
"loss": 0.3982, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.6561461794019934, |
|
"grad_norm": 2.2695295812005836, |
|
"learning_rate": 3.190015048523528e-06, |
|
"loss": 0.3912, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6582225913621262, |
|
"grad_norm": 2.5664307243505227, |
|
"learning_rate": 3.156277391529796e-06, |
|
"loss": 0.4044, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.6602990033222591, |
|
"grad_norm": 2.421377162101449, |
|
"learning_rate": 3.1226366104050067e-06, |
|
"loss": 0.4061, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6623754152823921, |
|
"grad_norm": 2.50702313044333, |
|
"learning_rate": 3.089094472757742e-06, |
|
"loss": 0.3986, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.6644518272425249, |
|
"grad_norm": 2.2015982709846122, |
|
"learning_rate": 3.055652741013497e-06, |
|
"loss": 0.3773, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6665282392026578, |
|
"grad_norm": 2.484025604844624, |
|
"learning_rate": 3.0223131723220756e-06, |
|
"loss": 0.4043, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.6686046511627907, |
|
"grad_norm": 2.2673450694224426, |
|
"learning_rate": 2.9890775184652666e-06, |
|
"loss": 0.3975, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6706810631229236, |
|
"grad_norm": 2.411243052140437, |
|
"learning_rate": 2.955947525764796e-06, |
|
"loss": 0.4162, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.6727574750830565, |
|
"grad_norm": 2.467788088547966, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.3905, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.6748338870431894, |
|
"grad_norm": 2.441034044229084, |
|
"learning_rate": 2.890011481269204e-06, |
|
"loss": 0.404, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.6769102990033222, |
|
"grad_norm": 2.4310426686498507, |
|
"learning_rate": 2.8572088939928623e-06, |
|
"loss": 0.3985, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6789867109634552, |
|
"grad_norm": 2.5154739727394397, |
|
"learning_rate": 2.824518896728386e-06, |
|
"loss": 0.3972, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.6810631229235881, |
|
"grad_norm": 2.4239374759188066, |
|
"learning_rate": 2.7919432071267212e-06, |
|
"loss": 0.3986, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6831395348837209, |
|
"grad_norm": 2.401230714452262, |
|
"learning_rate": 2.759483536832682e-06, |
|
"loss": 0.3961, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.6852159468438538, |
|
"grad_norm": 2.3945770626194425, |
|
"learning_rate": 2.7271415913950027e-06, |
|
"loss": 0.3987, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.6872923588039868, |
|
"grad_norm": 2.5083750676716248, |
|
"learning_rate": 2.6949190701767323e-06, |
|
"loss": 0.3987, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.6893687707641196, |
|
"grad_norm": 2.359597868105036, |
|
"learning_rate": 2.662817666265932e-06, |
|
"loss": 0.3992, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6914451827242525, |
|
"grad_norm": 2.3950900870588305, |
|
"learning_rate": 2.6308390663867247e-06, |
|
"loss": 0.3755, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.6935215946843853, |
|
"grad_norm": 2.2726643843793783, |
|
"learning_rate": 2.5989849508106663e-06, |
|
"loss": 0.3788, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6955980066445183, |
|
"grad_norm": 2.3688642141053644, |
|
"learning_rate": 2.5672569932684486e-06, |
|
"loss": 0.3923, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 2.4674555381530543, |
|
"learning_rate": 2.5356568608619737e-06, |
|
"loss": 0.3784, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.699750830564784, |
|
"grad_norm": 2.348080957902949, |
|
"learning_rate": 2.504186213976736e-06, |
|
"loss": 0.3888, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.7018272425249169, |
|
"grad_norm": 2.2245908133987506, |
|
"learning_rate": 2.4728467061946017e-06, |
|
"loss": 0.383, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7039036544850499, |
|
"grad_norm": 2.308262964854599, |
|
"learning_rate": 2.441639984206903e-06, |
|
"loss": 0.3873, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.7059800664451827, |
|
"grad_norm": 2.3316191201720726, |
|
"learning_rate": 2.4105676877279376e-06, |
|
"loss": 0.3764, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7080564784053156, |
|
"grad_norm": 2.2575654898253363, |
|
"learning_rate": 2.379631449408788e-06, |
|
"loss": 0.3857, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.7101328903654485, |
|
"grad_norm": 2.295434521334263, |
|
"learning_rate": 2.3488328947515566e-06, |
|
"loss": 0.3825, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7122093023255814, |
|
"grad_norm": 2.3045365012329704, |
|
"learning_rate": 2.318173642023939e-06, |
|
"loss": 0.3851, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 2.3117392889776665, |
|
"learning_rate": 2.287655302174208e-06, |
|
"loss": 0.3897, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7163621262458472, |
|
"grad_norm": 2.422532892044474, |
|
"learning_rate": 2.257279478746564e-06, |
|
"loss": 0.3799, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.71843853820598, |
|
"grad_norm": 2.2839185079742514, |
|
"learning_rate": 2.2270477677968727e-06, |
|
"loss": 0.3703, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.720514950166113, |
|
"grad_norm": 2.7279247585921786, |
|
"learning_rate": 2.196961757808813e-06, |
|
"loss": 0.3794, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.7225913621262459, |
|
"grad_norm": 2.396361579385602, |
|
"learning_rate": 2.167023029610402e-06, |
|
"loss": 0.3642, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7246677740863787, |
|
"grad_norm": 2.340856081292544, |
|
"learning_rate": 2.1372331562909453e-06, |
|
"loss": 0.372, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.7267441860465116, |
|
"grad_norm": 2.413915292833693, |
|
"learning_rate": 2.1075937031183636e-06, |
|
"loss": 0.3767, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7288205980066446, |
|
"grad_norm": 2.2094868525489386, |
|
"learning_rate": 2.0781062274569657e-06, |
|
"loss": 0.3713, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.7308970099667774, |
|
"grad_norm": 2.2242377702402663, |
|
"learning_rate": 2.0487722786856107e-06, |
|
"loss": 0.3808, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7329734219269103, |
|
"grad_norm": 2.451226818715509, |
|
"learning_rate": 2.019593398116292e-06, |
|
"loss": 0.3752, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.7350498338870431, |
|
"grad_norm": 2.5070300923436006, |
|
"learning_rate": 1.990571118913166e-06, |
|
"loss": 0.3754, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.7371262458471761, |
|
"grad_norm": 2.4891905395473963, |
|
"learning_rate": 1.961706966011978e-06, |
|
"loss": 0.3877, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.739202657807309, |
|
"grad_norm": 2.4842650358701905, |
|
"learning_rate": 1.9330024560399507e-06, |
|
"loss": 0.3836, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.7412790697674418, |
|
"grad_norm": 2.250133568783516, |
|
"learning_rate": 1.9044590972360822e-06, |
|
"loss": 0.3725, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.7433554817275747, |
|
"grad_norm": 2.341904795212687, |
|
"learning_rate": 1.876078389371911e-06, |
|
"loss": 0.3679, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.7454318936877077, |
|
"grad_norm": 2.3068998565270746, |
|
"learning_rate": 1.8478618236726992e-06, |
|
"loss": 0.3757, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.7475083056478405, |
|
"grad_norm": 2.2619310866276203, |
|
"learning_rate": 1.8198108827390892e-06, |
|
"loss": 0.3742, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7495847176079734, |
|
"grad_norm": 2.406091048606607, |
|
"learning_rate": 1.791927040469198e-06, |
|
"loss": 0.3805, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.7516611295681063, |
|
"grad_norm": 2.3430777426784077, |
|
"learning_rate": 1.7642117619811672e-06, |
|
"loss": 0.3744, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.7537375415282392, |
|
"grad_norm": 2.309496934162411, |
|
"learning_rate": 1.7366665035361947e-06, |
|
"loss": 0.3856, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.7558139534883721, |
|
"grad_norm": 2.3680236136606085, |
|
"learning_rate": 1.7092927124620007e-06, |
|
"loss": 0.3747, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.757890365448505, |
|
"grad_norm": 2.3303370070854066, |
|
"learning_rate": 1.682091827076796e-06, |
|
"loss": 0.3724, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.7599667774086378, |
|
"grad_norm": 2.308665058379731, |
|
"learning_rate": 1.6550652766136932e-06, |
|
"loss": 0.3701, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7620431893687708, |
|
"grad_norm": 2.423141151726278, |
|
"learning_rate": 1.6282144811456196e-06, |
|
"loss": 0.3749, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.7641196013289037, |
|
"grad_norm": 2.310790310097539, |
|
"learning_rate": 1.6015408515107e-06, |
|
"loss": 0.3649, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7661960132890365, |
|
"grad_norm": 2.350953218186428, |
|
"learning_rate": 1.5750457892381183e-06, |
|
"loss": 0.3766, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.7682724252491694, |
|
"grad_norm": 2.3685044215677826, |
|
"learning_rate": 1.5487306864744878e-06, |
|
"loss": 0.3626, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7703488372093024, |
|
"grad_norm": 2.4283396349263384, |
|
"learning_rate": 1.5225969259106909e-06, |
|
"loss": 0.358, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.7724252491694352, |
|
"grad_norm": 2.515904865078178, |
|
"learning_rate": 1.4966458807092404e-06, |
|
"loss": 0.3703, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7745016611295681, |
|
"grad_norm": 2.369156818267499, |
|
"learning_rate": 1.470878914432115e-06, |
|
"loss": 0.3628, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.776578073089701, |
|
"grad_norm": 2.3898762463795302, |
|
"learning_rate": 1.4452973809691245e-06, |
|
"loss": 0.3491, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.7786544850498339, |
|
"grad_norm": 2.307405290268551, |
|
"learning_rate": 1.4199026244667636e-06, |
|
"loss": 0.3715, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.7807308970099668, |
|
"grad_norm": 2.3679557325362808, |
|
"learning_rate": 1.3946959792575915e-06, |
|
"loss": 0.3716, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7828073089700996, |
|
"grad_norm": 2.368304219604154, |
|
"learning_rate": 1.3696787697901131e-06, |
|
"loss": 0.3661, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.7848837209302325, |
|
"grad_norm": 2.337789695422565, |
|
"learning_rate": 1.3448523105591976e-06, |
|
"loss": 0.3605, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7869601328903655, |
|
"grad_norm": 2.611258973451232, |
|
"learning_rate": 1.3202179060370041e-06, |
|
"loss": 0.3699, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.7890365448504983, |
|
"grad_norm": 2.437657572808606, |
|
"learning_rate": 1.2957768506044383e-06, |
|
"loss": 0.3651, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7911129568106312, |
|
"grad_norm": 2.388228690853508, |
|
"learning_rate": 1.2715304284831492e-06, |
|
"loss": 0.3664, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.7931893687707641, |
|
"grad_norm": 2.2860587085497235, |
|
"learning_rate": 1.2474799136680394e-06, |
|
"loss": 0.3577, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.795265780730897, |
|
"grad_norm": 2.2178621526275077, |
|
"learning_rate": 1.223626569860339e-06, |
|
"loss": 0.3441, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.7973421926910299, |
|
"grad_norm": 2.518415787103085, |
|
"learning_rate": 1.1999716504011917e-06, |
|
"loss": 0.3673, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7994186046511628, |
|
"grad_norm": 2.359475880122496, |
|
"learning_rate": 1.1765163982058109e-06, |
|
"loss": 0.3567, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.8014950166112956, |
|
"grad_norm": 2.407404285602653, |
|
"learning_rate": 1.1532620456981685e-06, |
|
"loss": 0.3476, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8035714285714286, |
|
"grad_norm": 2.562334088122669, |
|
"learning_rate": 1.1302098147462348e-06, |
|
"loss": 0.3658, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.8056478405315615, |
|
"grad_norm": 2.4467720130350163, |
|
"learning_rate": 1.1073609165977866e-06, |
|
"loss": 0.348, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8077242524916943, |
|
"grad_norm": 2.3514873698583574, |
|
"learning_rate": 1.0847165518167513e-06, |
|
"loss": 0.3601, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.8098006644518272, |
|
"grad_norm": 2.258063143891622, |
|
"learning_rate": 1.062277910220138e-06, |
|
"loss": 0.3548, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8118770764119602, |
|
"grad_norm": 2.3377988411022246, |
|
"learning_rate": 1.0400461708155095e-06, |
|
"loss": 0.3591, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.813953488372093, |
|
"grad_norm": 2.4485426821221004, |
|
"learning_rate": 1.0180225017390416e-06, |
|
"loss": 0.3583, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8160299003322259, |
|
"grad_norm": 2.3726559534317797, |
|
"learning_rate": 9.962080601941365e-07, |
|
"loss": 0.3426, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.8181063122923588, |
|
"grad_norm": 2.2417751776494543, |
|
"learning_rate": 9.746039923906258e-07, |
|
"loss": 0.343, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8201827242524917, |
|
"grad_norm": 2.5294843157217906, |
|
"learning_rate": 9.532114334845444e-07, |
|
"loss": 0.3664, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.8222591362126246, |
|
"grad_norm": 2.5572851406694235, |
|
"learning_rate": 9.320315075184771e-07, |
|
"loss": 0.3483, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8243355481727574, |
|
"grad_norm": 2.4014306355585973, |
|
"learning_rate": 9.110653273625103e-07, |
|
"loss": 0.3454, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.8264119601328903, |
|
"grad_norm": 2.3699223457500715, |
|
"learning_rate": 8.903139946557437e-07, |
|
"loss": 0.3527, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.8284883720930233, |
|
"grad_norm": 2.4489197804834197, |
|
"learning_rate": 8.697785997484198e-07, |
|
"loss": 0.3535, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.8305647840531561, |
|
"grad_norm": 2.4381698669696044, |
|
"learning_rate": 8.494602216446213e-07, |
|
"loss": 0.3522, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.832641196013289, |
|
"grad_norm": 2.373612659548005, |
|
"learning_rate": 8.293599279455838e-07, |
|
"loss": 0.352, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.834717607973422, |
|
"grad_norm": 2.5001126967401763, |
|
"learning_rate": 8.094787747935995e-07, |
|
"loss": 0.3533, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.8367940199335548, |
|
"grad_norm": 2.4033229472375637, |
|
"learning_rate": 7.898178068165175e-07, |
|
"loss": 0.3569, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.8388704318936877, |
|
"grad_norm": 2.34177766700727, |
|
"learning_rate": 7.703780570728637e-07, |
|
"loss": 0.3485, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8409468438538206, |
|
"grad_norm": 2.345211689975521, |
|
"learning_rate": 7.511605469975524e-07, |
|
"loss": 0.3541, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.8430232558139535, |
|
"grad_norm": 2.491346976334481, |
|
"learning_rate": 7.321662863482248e-07, |
|
"loss": 0.357, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.8450996677740864, |
|
"grad_norm": 2.4991193300068515, |
|
"learning_rate": 7.133962731521837e-07, |
|
"loss": 0.3504, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.8471760797342193, |
|
"grad_norm": 2.4131651786978376, |
|
"learning_rate": 6.948514936539596e-07, |
|
"loss": 0.3413, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8492524916943521, |
|
"grad_norm": 2.4158508388648046, |
|
"learning_rate": 6.765329222634892e-07, |
|
"loss": 0.3368, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.8513289036544851, |
|
"grad_norm": 2.444048773418729, |
|
"learning_rate": 6.584415215049145e-07, |
|
"loss": 0.3478, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.853405315614618, |
|
"grad_norm": 2.3067727734077854, |
|
"learning_rate": 6.405782419660073e-07, |
|
"loss": 0.3539, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.8554817275747508, |
|
"grad_norm": 2.389540542776719, |
|
"learning_rate": 6.229440222482258e-07, |
|
"loss": 0.3568, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.8575581395348837, |
|
"grad_norm": 2.490728442827626, |
|
"learning_rate": 6.055397889173947e-07, |
|
"loss": 0.3425, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.8596345514950167, |
|
"grad_norm": 2.4309142506564116, |
|
"learning_rate": 5.88366456455019e-07, |
|
"loss": 0.3556, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.8617109634551495, |
|
"grad_norm": 2.577695548294538, |
|
"learning_rate": 5.714249272102368e-07, |
|
"loss": 0.3479, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.8637873754152824, |
|
"grad_norm": 2.3780994980865513, |
|
"learning_rate": 5.547160913524024e-07, |
|
"loss": 0.3407, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8658637873754153, |
|
"grad_norm": 2.3471940728385645, |
|
"learning_rate": 5.382408268243194e-07, |
|
"loss": 0.327, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.8679401993355482, |
|
"grad_norm": 2.5308209588235964, |
|
"learning_rate": 5.219999992961044e-07, |
|
"loss": 0.3486, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8700166112956811, |
|
"grad_norm": 2.347529844497377, |
|
"learning_rate": 5.05994462119705e-07, |
|
"loss": 0.3507, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.872093023255814, |
|
"grad_norm": 2.4490768218202428, |
|
"learning_rate": 4.902250562840622e-07, |
|
"loss": 0.3484, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8741694352159468, |
|
"grad_norm": 2.4607053819399227, |
|
"learning_rate": 4.7469261037091765e-07, |
|
"loss": 0.355, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.8762458471760798, |
|
"grad_norm": 2.37905091425431, |
|
"learning_rate": 4.5939794051128363e-07, |
|
"loss": 0.3544, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8783222591362126, |
|
"grad_norm": 2.3898177002048397, |
|
"learning_rate": 4.443418503425517e-07, |
|
"loss": 0.3459, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.8803986710963455, |
|
"grad_norm": 2.3945638825763336, |
|
"learning_rate": 4.295251309662768e-07, |
|
"loss": 0.3475, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8824750830564784, |
|
"grad_norm": 2.376437633901908, |
|
"learning_rate": 4.149485609066001e-07, |
|
"loss": 0.3448, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.8845514950166113, |
|
"grad_norm": 2.4682795986451884, |
|
"learning_rate": 4.0061290606935145e-07, |
|
"loss": 0.3501, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.8866279069767442, |
|
"grad_norm": 2.307696986215917, |
|
"learning_rate": 3.8651891970179876e-07, |
|
"loss": 0.3509, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.8887043189368771, |
|
"grad_norm": 2.2638655900879323, |
|
"learning_rate": 3.7266734235307357e-07, |
|
"loss": 0.3494, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.8907807308970099, |
|
"grad_norm": 2.4074516319355865, |
|
"learning_rate": 3.5905890183525916e-07, |
|
"loss": 0.3381, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 2.4580735039851263, |
|
"learning_rate": 3.4569431318514647e-07, |
|
"loss": 0.3506, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.8949335548172758, |
|
"grad_norm": 2.223651003352099, |
|
"learning_rate": 3.3257427862666894e-07, |
|
"loss": 0.3426, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.8970099667774086, |
|
"grad_norm": 2.5240054200803925, |
|
"learning_rate": 3.196994875339976e-07, |
|
"loss": 0.3394, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.8990863787375415, |
|
"grad_norm": 2.650418412385108, |
|
"learning_rate": 3.0707061639532687e-07, |
|
"loss": 0.3469, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.9011627906976745, |
|
"grad_norm": 2.5283079967315256, |
|
"learning_rate": 2.946883287773211e-07, |
|
"loss": 0.3572, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9032392026578073, |
|
"grad_norm": 2.482824449172331, |
|
"learning_rate": 2.82553275290256e-07, |
|
"loss": 0.3469, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.9053156146179402, |
|
"grad_norm": 2.42162117653704, |
|
"learning_rate": 2.706660935538297e-07, |
|
"loss": 0.3522, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.907392026578073, |
|
"grad_norm": 2.610628055343181, |
|
"learning_rate": 2.590274081636568e-07, |
|
"loss": 0.3326, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.909468438538206, |
|
"grad_norm": 2.337754822501405, |
|
"learning_rate": 2.476378306584576e-07, |
|
"loss": 0.3472, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9115448504983389, |
|
"grad_norm": 2.422013772805342, |
|
"learning_rate": 2.3649795948791744e-07, |
|
"loss": 0.3291, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.9136212624584718, |
|
"grad_norm": 2.5260012444754865, |
|
"learning_rate": 2.2560837998124862e-07, |
|
"loss": 0.3443, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9156976744186046, |
|
"grad_norm": 2.5167784300702203, |
|
"learning_rate": 2.1496966431642895e-07, |
|
"loss": 0.344, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.9177740863787376, |
|
"grad_norm": 2.5184080924547976, |
|
"learning_rate": 2.0458237149014347e-07, |
|
"loss": 0.3431, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9198504983388704, |
|
"grad_norm": 2.6121850478268915, |
|
"learning_rate": 1.944470472884097e-07, |
|
"loss": 0.3469, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.9219269102990033, |
|
"grad_norm": 2.4250182138955987, |
|
"learning_rate": 1.8456422425789822e-07, |
|
"loss": 0.346, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.9240033222591362, |
|
"grad_norm": 2.4126854578567056, |
|
"learning_rate": 1.7493442167795526e-07, |
|
"loss": 0.3394, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.9260797342192691, |
|
"grad_norm": 2.2732400743037546, |
|
"learning_rate": 1.6555814553331328e-07, |
|
"loss": 0.3474, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.928156146179402, |
|
"grad_norm": 2.4576436036196867, |
|
"learning_rate": 1.5643588848750944e-07, |
|
"loss": 0.3455, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 2.417373647969096, |
|
"learning_rate": 1.4756812985699364e-07, |
|
"loss": 0.3389, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.9323089700996677, |
|
"grad_norm": 2.314864797926019, |
|
"learning_rate": 1.3895533558594853e-07, |
|
"loss": 0.3307, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.9343853820598007, |
|
"grad_norm": 2.4942438872944375, |
|
"learning_rate": 1.305979582218042e-07, |
|
"loss": 0.3413, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.9364617940199336, |
|
"grad_norm": 2.4271492623044733, |
|
"learning_rate": 1.224964368914622e-07, |
|
"loss": 0.3533, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.9385382059800664, |
|
"grad_norm": 2.404072393255019, |
|
"learning_rate": 1.1465119727821828e-07, |
|
"loss": 0.3388, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9406146179401993, |
|
"grad_norm": 2.4291366569357233, |
|
"learning_rate": 1.0706265159939944e-07, |
|
"loss": 0.329, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.9426910299003323, |
|
"grad_norm": 2.370319609790916, |
|
"learning_rate": 9.973119858470326e-08, |
|
"loss": 0.3435, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.9447674418604651, |
|
"grad_norm": 2.612518036597659, |
|
"learning_rate": 9.265722345524475e-08, |
|
"loss": 0.3544, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.946843853820598, |
|
"grad_norm": 2.325383175606347, |
|
"learning_rate": 8.584109790331918e-08, |
|
"loss": 0.334, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9489202657807309, |
|
"grad_norm": 2.483650038896797, |
|
"learning_rate": 7.92831800728705e-08, |
|
"loss": 0.3495, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.9509966777408638, |
|
"grad_norm": 2.3917415303858323, |
|
"learning_rate": 7.29838145406725e-08, |
|
"loss": 0.3525, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.9530730897009967, |
|
"grad_norm": 2.3491361755345297, |
|
"learning_rate": 6.69433322982238e-08, |
|
"loss": 0.3261, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.9551495016611296, |
|
"grad_norm": 2.395546616132027, |
|
"learning_rate": 6.116205073435632e-08, |
|
"loss": 0.3572, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.9572259136212624, |
|
"grad_norm": 2.38127790933904, |
|
"learning_rate": 5.5640273618560724e-08, |
|
"loss": 0.3477, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.9593023255813954, |
|
"grad_norm": 2.437345577309693, |
|
"learning_rate": 5.0378291085020905e-08, |
|
"loss": 0.3498, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.9613787375415282, |
|
"grad_norm": 2.3743576771009125, |
|
"learning_rate": 4.537637961737285e-08, |
|
"loss": 0.3537, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.9634551495016611, |
|
"grad_norm": 2.4186159686143816, |
|
"learning_rate": 4.063480203417625e-08, |
|
"loss": 0.3491, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.965531561461794, |
|
"grad_norm": 2.43658746364112, |
|
"learning_rate": 3.6153807475103886e-08, |
|
"loss": 0.3372, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.967607973421927, |
|
"grad_norm": 2.433100952556644, |
|
"learning_rate": 3.1933631387853215e-08, |
|
"loss": 0.34, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9696843853820598, |
|
"grad_norm": 2.3533082714101288, |
|
"learning_rate": 2.7974495515772915e-08, |
|
"loss": 0.3478, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.9717607973421927, |
|
"grad_norm": 2.3042817476032296, |
|
"learning_rate": 2.427660788621222e-08, |
|
"loss": 0.3522, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9738372093023255, |
|
"grad_norm": 2.5758246509298184, |
|
"learning_rate": 2.0840162799591335e-08, |
|
"loss": 0.3518, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.9759136212624585, |
|
"grad_norm": 2.2839484862848254, |
|
"learning_rate": 1.7665340819192356e-08, |
|
"loss": 0.3412, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.9779900332225914, |
|
"grad_norm": 2.3825796160738184, |
|
"learning_rate": 1.475230876166911e-08, |
|
"loss": 0.3484, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.9800664451827242, |
|
"grad_norm": 2.436946151591597, |
|
"learning_rate": 1.2101219688285815e-08, |
|
"loss": 0.3406, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.9821428571428571, |
|
"grad_norm": 2.3004422772721385, |
|
"learning_rate": 9.712212896871854e-09, |
|
"loss": 0.3483, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.9842192691029901, |
|
"grad_norm": 2.243191260776767, |
|
"learning_rate": 7.585413914503182e-09, |
|
"loss": 0.3279, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.9862956810631229, |
|
"grad_norm": 2.4579139874339213, |
|
"learning_rate": 5.720934490907604e-09, |
|
"loss": 0.3539, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.9883720930232558, |
|
"grad_norm": 2.2560463637497885, |
|
"learning_rate": 4.118872592592804e-09, |
|
"loss": 0.3376, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9904485049833887, |
|
"grad_norm": 2.4374426699588327, |
|
"learning_rate": 2.7793123976976866e-09, |
|
"loss": 0.337, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.9925249169435216, |
|
"grad_norm": 2.4021295260594466, |
|
"learning_rate": 1.7023242915703563e-09, |
|
"loss": 0.3422, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.9946013289036545, |
|
"grad_norm": 2.3647029145641847, |
|
"learning_rate": 8.879648630705229e-10, |
|
"loss": 0.3402, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.9966777408637874, |
|
"grad_norm": 2.377691718973852, |
|
"learning_rate": 3.362769015941014e-10, |
|
"loss": 0.3437, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9987541528239202, |
|
"grad_norm": 2.277353937646912, |
|
"learning_rate": 4.7289394825567046e-11, |
|
"loss": 0.3486, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_runtime": 3.4135, |
|
"eval_samples_per_second": 2.93, |
|
"eval_steps_per_second": 0.879, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2408, |
|
"total_flos": 252093105438720.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.0085, |
|
"train_samples_per_second": 4527521.09, |
|
"train_steps_per_second": 283028.837 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2408, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 252093105438720.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|