{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004152823920265781,
"grad_norm": 25.422981813437236,
"learning_rate": 4.1493775933609963e-08,
"loss": 1.3975,
"step": 1
},
{
"epoch": 0.0020764119601328905,
"grad_norm": 23.65282908395334,
"learning_rate": 2.074688796680498e-07,
"loss": 1.4281,
"step": 5
},
{
"epoch": 0.004152823920265781,
"grad_norm": 16.38973942245371,
"learning_rate": 4.149377593360996e-07,
"loss": 1.3933,
"step": 10
},
{
"epoch": 0.006229235880398671,
"grad_norm": 8.620332321861904,
"learning_rate": 6.224066390041494e-07,
"loss": 1.2986,
"step": 15
},
{
"epoch": 0.008305647840531562,
"grad_norm": 10.289897317705874,
"learning_rate": 8.298755186721992e-07,
"loss": 1.1565,
"step": 20
},
{
"epoch": 0.010382059800664452,
"grad_norm": 4.429779856244459,
"learning_rate": 1.037344398340249e-06,
"loss": 1.051,
"step": 25
},
{
"epoch": 0.012458471760797342,
"grad_norm": 3.3098208738585213,
"learning_rate": 1.2448132780082988e-06,
"loss": 0.9902,
"step": 30
},
{
"epoch": 0.014534883720930232,
"grad_norm": 3.4349888460346687,
"learning_rate": 1.4522821576763488e-06,
"loss": 0.9652,
"step": 35
},
{
"epoch": 0.016611295681063124,
"grad_norm": 3.1515624301454133,
"learning_rate": 1.6597510373443984e-06,
"loss": 0.9415,
"step": 40
},
{
"epoch": 0.018687707641196014,
"grad_norm": 3.1235312209606505,
"learning_rate": 1.8672199170124482e-06,
"loss": 0.93,
"step": 45
},
{
"epoch": 0.020764119601328904,
"grad_norm": 3.1741829648141926,
"learning_rate": 2.074688796680498e-06,
"loss": 0.9238,
"step": 50
},
{
"epoch": 0.022840531561461794,
"grad_norm": 3.232116295196654,
"learning_rate": 2.282157676348548e-06,
"loss": 0.9123,
"step": 55
},
{
"epoch": 0.024916943521594685,
"grad_norm": 3.1515595029223396,
"learning_rate": 2.4896265560165977e-06,
"loss": 0.9031,
"step": 60
},
{
"epoch": 0.026993355481727575,
"grad_norm": 3.1003061370301617,
"learning_rate": 2.6970954356846475e-06,
"loss": 0.8947,
"step": 65
},
{
"epoch": 0.029069767441860465,
"grad_norm": 2.9767060692194844,
"learning_rate": 2.9045643153526977e-06,
"loss": 0.8919,
"step": 70
},
{
"epoch": 0.031146179401993355,
"grad_norm": 3.0759553041103205,
"learning_rate": 3.112033195020747e-06,
"loss": 0.8702,
"step": 75
},
{
"epoch": 0.03322259136212625,
"grad_norm": 3.285827319776166,
"learning_rate": 3.319502074688797e-06,
"loss": 0.8727,
"step": 80
},
{
"epoch": 0.03529900332225914,
"grad_norm": 3.3462993523967186,
"learning_rate": 3.526970954356847e-06,
"loss": 0.8736,
"step": 85
},
{
"epoch": 0.03737541528239203,
"grad_norm": 3.256004424550593,
"learning_rate": 3.7344398340248965e-06,
"loss": 0.8858,
"step": 90
},
{
"epoch": 0.03945182724252492,
"grad_norm": 3.159488005717498,
"learning_rate": 3.941908713692946e-06,
"loss": 0.8559,
"step": 95
},
{
"epoch": 0.04152823920265781,
"grad_norm": 2.9634363451500114,
"learning_rate": 4.149377593360996e-06,
"loss": 0.8586,
"step": 100
},
{
"epoch": 0.0436046511627907,
"grad_norm": 3.159728031235876,
"learning_rate": 4.356846473029046e-06,
"loss": 0.8674,
"step": 105
},
{
"epoch": 0.04568106312292359,
"grad_norm": 2.970452415217835,
"learning_rate": 4.564315352697096e-06,
"loss": 0.8542,
"step": 110
},
{
"epoch": 0.04775747508305648,
"grad_norm": 3.1788047641427513,
"learning_rate": 4.771784232365146e-06,
"loss": 0.8701,
"step": 115
},
{
"epoch": 0.04983388704318937,
"grad_norm": 3.451301944053267,
"learning_rate": 4.979253112033195e-06,
"loss": 0.8714,
"step": 120
},
{
"epoch": 0.05191029900332226,
"grad_norm": 3.388326009403783,
"learning_rate": 5.1867219917012455e-06,
"loss": 0.8483,
"step": 125
},
{
"epoch": 0.05398671096345515,
"grad_norm": 2.9959540593135645,
"learning_rate": 5.394190871369295e-06,
"loss": 0.8481,
"step": 130
},
{
"epoch": 0.05606312292358804,
"grad_norm": 3.1213953476841856,
"learning_rate": 5.601659751037345e-06,
"loss": 0.8387,
"step": 135
},
{
"epoch": 0.05813953488372093,
"grad_norm": 3.403963416369247,
"learning_rate": 5.809128630705395e-06,
"loss": 0.8399,
"step": 140
},
{
"epoch": 0.06021594684385382,
"grad_norm": 3.0893053330914695,
"learning_rate": 6.016597510373444e-06,
"loss": 0.8386,
"step": 145
},
{
"epoch": 0.06229235880398671,
"grad_norm": 3.166492177328262,
"learning_rate": 6.224066390041494e-06,
"loss": 0.8457,
"step": 150
},
{
"epoch": 0.0643687707641196,
"grad_norm": 3.2851556967703117,
"learning_rate": 6.431535269709544e-06,
"loss": 0.8421,
"step": 155
},
{
"epoch": 0.0664451827242525,
"grad_norm": 2.9899317822541454,
"learning_rate": 6.639004149377594e-06,
"loss": 0.8373,
"step": 160
},
{
"epoch": 0.06852159468438539,
"grad_norm": 3.0509892785590456,
"learning_rate": 6.846473029045644e-06,
"loss": 0.8334,
"step": 165
},
{
"epoch": 0.07059800664451828,
"grad_norm": 3.00742757115455,
"learning_rate": 7.053941908713694e-06,
"loss": 0.8233,
"step": 170
},
{
"epoch": 0.07267441860465117,
"grad_norm": 3.0518393701751485,
"learning_rate": 7.261410788381743e-06,
"loss": 0.8296,
"step": 175
},
{
"epoch": 0.07475083056478406,
"grad_norm": 3.1984146233667263,
"learning_rate": 7.468879668049793e-06,
"loss": 0.8155,
"step": 180
},
{
"epoch": 0.07682724252491695,
"grad_norm": 3.05629449726749,
"learning_rate": 7.676348547717844e-06,
"loss": 0.8377,
"step": 185
},
{
"epoch": 0.07890365448504984,
"grad_norm": 3.249088059891964,
"learning_rate": 7.883817427385892e-06,
"loss": 0.8432,
"step": 190
},
{
"epoch": 0.08098006644518273,
"grad_norm": 3.0028481508425515,
"learning_rate": 8.091286307053943e-06,
"loss": 0.8173,
"step": 195
},
{
"epoch": 0.08305647840531562,
"grad_norm": 3.059733445916786,
"learning_rate": 8.298755186721992e-06,
"loss": 0.8227,
"step": 200
},
{
"epoch": 0.08513289036544851,
"grad_norm": 3.0867633236533365,
"learning_rate": 8.506224066390042e-06,
"loss": 0.8181,
"step": 205
},
{
"epoch": 0.0872093023255814,
"grad_norm": 2.997953986592159,
"learning_rate": 8.713692946058093e-06,
"loss": 0.821,
"step": 210
},
{
"epoch": 0.08928571428571429,
"grad_norm": 3.2351659520743072,
"learning_rate": 8.921161825726142e-06,
"loss": 0.8294,
"step": 215
},
{
"epoch": 0.09136212624584718,
"grad_norm": 3.1494481731597586,
"learning_rate": 9.128630705394191e-06,
"loss": 0.8261,
"step": 220
},
{
"epoch": 0.09343853820598007,
"grad_norm": 3.105511823234228,
"learning_rate": 9.33609958506224e-06,
"loss": 0.8165,
"step": 225
},
{
"epoch": 0.09551495016611296,
"grad_norm": 3.023901781664328,
"learning_rate": 9.543568464730292e-06,
"loss": 0.8123,
"step": 230
},
{
"epoch": 0.09759136212624585,
"grad_norm": 3.4303556589177187,
"learning_rate": 9.751037344398341e-06,
"loss": 0.8093,
"step": 235
},
{
"epoch": 0.09966777408637874,
"grad_norm": 3.6054989714255408,
"learning_rate": 9.95850622406639e-06,
"loss": 0.8201,
"step": 240
},
{
"epoch": 0.10174418604651163,
"grad_norm": 2.990225009601177,
"learning_rate": 9.999915930067828e-06,
"loss": 0.8208,
"step": 245
},
{
"epoch": 0.10382059800664452,
"grad_norm": 2.9957103647324264,
"learning_rate": 9.999574400813641e-06,
"loss": 0.816,
"step": 250
},
{
"epoch": 0.10589700996677741,
"grad_norm": 2.8988415018010287,
"learning_rate": 9.998970175798065e-06,
"loss": 0.8044,
"step": 255
},
{
"epoch": 0.1079734219269103,
"grad_norm": 2.893907971746992,
"learning_rate": 9.998103286769267e-06,
"loss": 0.799,
"step": 260
},
{
"epoch": 0.11004983388704319,
"grad_norm": 2.898946354458808,
"learning_rate": 9.996973779276743e-06,
"loss": 0.8113,
"step": 265
},
{
"epoch": 0.11212624584717608,
"grad_norm": 3.0485697591450998,
"learning_rate": 9.99558171266891e-06,
"loss": 0.8194,
"step": 270
},
{
"epoch": 0.11420265780730897,
"grad_norm": 2.933613250090363,
"learning_rate": 9.993927160089991e-06,
"loss": 0.7981,
"step": 275
},
{
"epoch": 0.11627906976744186,
"grad_norm": 2.900283777987733,
"learning_rate": 9.992010208476178e-06,
"loss": 0.8114,
"step": 280
},
{
"epoch": 0.11835548172757475,
"grad_norm": 2.869639926652705,
"learning_rate": 9.989830958551058e-06,
"loss": 0.8026,
"step": 285
},
{
"epoch": 0.12043189368770764,
"grad_norm": 3.0764284732072236,
"learning_rate": 9.98738952482032e-06,
"loss": 0.7816,
"step": 290
},
{
"epoch": 0.12250830564784053,
"grad_norm": 2.872848930860205,
"learning_rate": 9.984686035565742e-06,
"loss": 0.7851,
"step": 295
},
{
"epoch": 0.12458471760797342,
"grad_norm": 2.7170384439590367,
"learning_rate": 9.98172063283845e-06,
"loss": 0.8054,
"step": 300
},
{
"epoch": 0.12666112956810632,
"grad_norm": 2.785739578421159,
"learning_rate": 9.978493472451451e-06,
"loss": 0.7824,
"step": 305
},
{
"epoch": 0.1287375415282392,
"grad_norm": 2.955753943035507,
"learning_rate": 9.975004723971452e-06,
"loss": 0.7788,
"step": 310
},
{
"epoch": 0.1308139534883721,
"grad_norm": 2.7566534229071378,
"learning_rate": 9.971254570709939e-06,
"loss": 0.7804,
"step": 315
},
{
"epoch": 0.132890365448505,
"grad_norm": 3.0399050026271945,
"learning_rate": 9.967243209713563e-06,
"loss": 0.7712,
"step": 320
},
{
"epoch": 0.13496677740863788,
"grad_norm": 3.227011718605211,
"learning_rate": 9.962970851753767e-06,
"loss": 0.7852,
"step": 325
},
{
"epoch": 0.13704318936877077,
"grad_norm": 2.894940556652265,
"learning_rate": 9.95843772131573e-06,
"loss": 0.767,
"step": 330
},
{
"epoch": 0.13911960132890366,
"grad_norm": 3.137972193410393,
"learning_rate": 9.95364405658655e-06,
"loss": 0.77,
"step": 335
},
{
"epoch": 0.14119601328903655,
"grad_norm": 2.7913612546678426,
"learning_rate": 9.948590109442755e-06,
"loss": 0.7768,
"step": 340
},
{
"epoch": 0.14327242524916944,
"grad_norm": 2.893979747266515,
"learning_rate": 9.94327614543704e-06,
"loss": 0.7827,
"step": 345
},
{
"epoch": 0.14534883720930233,
"grad_norm": 2.665071280290936,
"learning_rate": 9.937702443784343e-06,
"loss": 0.7474,
"step": 350
},
{
"epoch": 0.14742524916943522,
"grad_norm": 2.741350083908129,
"learning_rate": 9.931869297347146e-06,
"loss": 0.7638,
"step": 355
},
{
"epoch": 0.14950166112956811,
"grad_norm": 2.9878149207237357,
"learning_rate": 9.925777012620111e-06,
"loss": 0.7419,
"step": 360
},
{
"epoch": 0.151578073089701,
"grad_norm": 2.801227928713699,
"learning_rate": 9.919425909713958e-06,
"loss": 0.769,
"step": 365
},
{
"epoch": 0.1536544850498339,
"grad_norm": 3.023770968839729,
"learning_rate": 9.912816322338659e-06,
"loss": 0.7447,
"step": 370
},
{
"epoch": 0.15573089700996678,
"grad_norm": 2.9927287523796715,
"learning_rate": 9.905948597785888e-06,
"loss": 0.754,
"step": 375
},
{
"epoch": 0.15780730897009967,
"grad_norm": 11.785492453222856,
"learning_rate": 9.89882309691079e-06,
"loss": 0.7497,
"step": 380
},
{
"epoch": 0.15988372093023256,
"grad_norm": 2.8752234411604682,
"learning_rate": 9.891440194113008e-06,
"loss": 0.7427,
"step": 385
},
{
"epoch": 0.16196013289036545,
"grad_norm": 3.097207390376622,
"learning_rate": 9.88380027731702e-06,
"loss": 0.7542,
"step": 390
},
{
"epoch": 0.16403654485049834,
"grad_norm": 2.921991118334764,
"learning_rate": 9.875903747951742e-06,
"loss": 0.7621,
"step": 395
},
{
"epoch": 0.16611295681063123,
"grad_norm": 2.8395297947865963,
"learning_rate": 9.867751020929454e-06,
"loss": 0.735,
"step": 400
},
{
"epoch": 0.16818936877076412,
"grad_norm": 2.726116425089643,
"learning_rate": 9.859342524623985e-06,
"loss": 0.7124,
"step": 405
},
{
"epoch": 0.17026578073089702,
"grad_norm": 3.2173444091652943,
"learning_rate": 9.850678700848208e-06,
"loss": 0.7374,
"step": 410
},
{
"epoch": 0.1723421926910299,
"grad_norm": 2.716930762983964,
"learning_rate": 9.84176000483083e-06,
"loss": 0.7138,
"step": 415
},
{
"epoch": 0.1744186046511628,
"grad_norm": 2.985441779621083,
"learning_rate": 9.832586905192469e-06,
"loss": 0.731,
"step": 420
},
{
"epoch": 0.17649501661129569,
"grad_norm": 3.032790315651323,
"learning_rate": 9.823159883921028e-06,
"loss": 0.7215,
"step": 425
},
{
"epoch": 0.17857142857142858,
"grad_norm": 2.6988344818168155,
"learning_rate": 9.813479436346378e-06,
"loss": 0.7183,
"step": 430
},
{
"epoch": 0.18064784053156147,
"grad_norm": 2.973146607192177,
"learning_rate": 9.803546071114323e-06,
"loss": 0.7311,
"step": 435
},
{
"epoch": 0.18272425249169436,
"grad_norm": 2.9093506646801344,
"learning_rate": 9.793360310159878e-06,
"loss": 0.7049,
"step": 440
},
{
"epoch": 0.18480066445182725,
"grad_norm": 3.01100096145872,
"learning_rate": 9.782922688679847e-06,
"loss": 0.7118,
"step": 445
},
{
"epoch": 0.18687707641196014,
"grad_norm": 2.716470652939527,
"learning_rate": 9.772233755104695e-06,
"loss": 0.7277,
"step": 450
},
{
"epoch": 0.18895348837209303,
"grad_norm": 2.7134248053870165,
"learning_rate": 9.761294071069736e-06,
"loss": 0.7205,
"step": 455
},
{
"epoch": 0.19102990033222592,
"grad_norm": 2.6251507638777163,
"learning_rate": 9.750104211385625e-06,
"loss": 0.7152,
"step": 460
},
{
"epoch": 0.1931063122923588,
"grad_norm": 2.8023948010803483,
"learning_rate": 9.738664764008149e-06,
"loss": 0.7233,
"step": 465
},
{
"epoch": 0.1951827242524917,
"grad_norm": 3.714290449563204,
"learning_rate": 9.726976330007341e-06,
"loss": 0.6998,
"step": 470
},
{
"epoch": 0.1972591362126246,
"grad_norm": 2.8670419197216512,
"learning_rate": 9.71503952353589e-06,
"loss": 0.6985,
"step": 475
},
{
"epoch": 0.19933554817275748,
"grad_norm": 3.1683988394439107,
"learning_rate": 9.702854971796876e-06,
"loss": 0.7089,
"step": 480
},
{
"epoch": 0.20141196013289037,
"grad_norm": 3.2223078839261166,
"learning_rate": 9.690423315010814e-06,
"loss": 0.7053,
"step": 485
},
{
"epoch": 0.20348837209302326,
"grad_norm": 2.77875488832717,
"learning_rate": 9.677745206382014e-06,
"loss": 0.7271,
"step": 490
},
{
"epoch": 0.20556478405315615,
"grad_norm": 2.888271933836237,
"learning_rate": 9.664821312064258e-06,
"loss": 0.7018,
"step": 495
},
{
"epoch": 0.20764119601328904,
"grad_norm": 3.2746008040723815,
"learning_rate": 9.651652311125803e-06,
"loss": 0.6991,
"step": 500
},
{
"epoch": 0.20971760797342193,
"grad_norm": 2.76622547311742,
"learning_rate": 9.638238895513687e-06,
"loss": 0.7075,
"step": 505
},
{
"epoch": 0.21179401993355482,
"grad_norm": 2.9972446036957114,
"learning_rate": 9.624581770017392e-06,
"loss": 0.6857,
"step": 510
},
{
"epoch": 0.2138704318936877,
"grad_norm": 2.869516499460042,
"learning_rate": 9.610681652231794e-06,
"loss": 0.6916,
"step": 515
},
{
"epoch": 0.2159468438538206,
"grad_norm": 2.742923434452921,
"learning_rate": 9.596539272519468e-06,
"loss": 0.6811,
"step": 520
},
{
"epoch": 0.2180232558139535,
"grad_norm": 2.8482023108565677,
"learning_rate": 9.582155373972303e-06,
"loss": 0.6744,
"step": 525
},
{
"epoch": 0.22009966777408638,
"grad_norm": 2.9348099403663124,
"learning_rate": 9.56753071237247e-06,
"loss": 0.6776,
"step": 530
},
{
"epoch": 0.22217607973421927,
"grad_norm": 2.786772996017183,
"learning_rate": 9.552666056152704e-06,
"loss": 0.6798,
"step": 535
},
{
"epoch": 0.22425249169435216,
"grad_norm": 2.92722689041533,
"learning_rate": 9.537562186355918e-06,
"loss": 0.6843,
"step": 540
},
{
"epoch": 0.22632890365448505,
"grad_norm": 2.7694998172195207,
"learning_rate": 9.52221989659418e-06,
"loss": 0.6938,
"step": 545
},
{
"epoch": 0.22840531561461794,
"grad_norm": 2.9300442858036244,
"learning_rate": 9.506639993007012e-06,
"loss": 0.6944,
"step": 550
},
{
"epoch": 0.23048172757475083,
"grad_norm": 3.1035204783454993,
"learning_rate": 9.490823294219015e-06,
"loss": 0.672,
"step": 555
},
{
"epoch": 0.23255813953488372,
"grad_norm": 2.6193387690961245,
"learning_rate": 9.474770631296882e-06,
"loss": 0.6561,
"step": 560
},
{
"epoch": 0.2346345514950166,
"grad_norm": 2.61646550507026,
"learning_rate": 9.458482847705705e-06,
"loss": 0.6576,
"step": 565
},
{
"epoch": 0.2367109634551495,
"grad_norm": 2.756473668019519,
"learning_rate": 9.441960799264678e-06,
"loss": 0.6851,
"step": 570
},
{
"epoch": 0.2387873754152824,
"grad_norm": 2.6995089678231614,
"learning_rate": 9.425205354102111e-06,
"loss": 0.6648,
"step": 575
},
{
"epoch": 0.24086378737541528,
"grad_norm": 2.7140254791209677,
"learning_rate": 9.408217392609831e-06,
"loss": 0.6451,
"step": 580
},
{
"epoch": 0.24294019933554817,
"grad_norm": 2.607599787114018,
"learning_rate": 9.390997807396912e-06,
"loss": 0.67,
"step": 585
},
{
"epoch": 0.24501661129568106,
"grad_norm": 2.8420050898692764,
"learning_rate": 9.373547503242775e-06,
"loss": 0.6657,
"step": 590
},
{
"epoch": 0.24709302325581395,
"grad_norm": 2.9228965685399095,
"learning_rate": 9.355867397049658e-06,
"loss": 0.6566,
"step": 595
},
{
"epoch": 0.24916943521594684,
"grad_norm": 2.8048600929777403,
"learning_rate": 9.337958417794425e-06,
"loss": 0.6457,
"step": 600
},
{
"epoch": 0.25124584717607973,
"grad_norm": 2.6983485281997415,
"learning_rate": 9.319821506479762e-06,
"loss": 0.6376,
"step": 605
},
{
"epoch": 0.25332225913621265,
"grad_norm": 2.801805288954333,
"learning_rate": 9.301457616084733e-06,
"loss": 0.6523,
"step": 610
},
{
"epoch": 0.2553986710963455,
"grad_norm": 2.820864396273499,
"learning_rate": 9.282867711514703e-06,
"loss": 0.6365,
"step": 615
},
{
"epoch": 0.2574750830564784,
"grad_norm": 2.9932167823643043,
"learning_rate": 9.264052769550643e-06,
"loss": 0.6425,
"step": 620
},
{
"epoch": 0.2595514950166113,
"grad_norm": 2.6556108045628544,
"learning_rate": 9.245013778797802e-06,
"loss": 0.6562,
"step": 625
},
{
"epoch": 0.2616279069767442,
"grad_norm": 2.676416816690246,
"learning_rate": 9.225751739633772e-06,
"loss": 0.6387,
"step": 630
},
{
"epoch": 0.26370431893687707,
"grad_norm": 2.702226526508375,
"learning_rate": 9.206267664155906e-06,
"loss": 0.6348,
"step": 635
},
{
"epoch": 0.26578073089701,
"grad_norm": 2.637563222880754,
"learning_rate": 9.186562576128159e-06,
"loss": 0.6263,
"step": 640
},
{
"epoch": 0.26785714285714285,
"grad_norm": 2.7815352111724603,
"learning_rate": 9.16663751092728e-06,
"loss": 0.6362,
"step": 645
},
{
"epoch": 0.26993355481727577,
"grad_norm": 2.8822755136904528,
"learning_rate": 9.146493515488418e-06,
"loss": 0.6164,
"step": 650
},
{
"epoch": 0.27200996677740863,
"grad_norm": 2.5755107274498146,
"learning_rate": 9.126131648250112e-06,
"loss": 0.6342,
"step": 655
},
{
"epoch": 0.27408637873754155,
"grad_norm": 2.584492766117294,
"learning_rate": 9.105552979098675e-06,
"loss": 0.6329,
"step": 660
},
{
"epoch": 0.2761627906976744,
"grad_norm": 2.6805388863449036,
"learning_rate": 9.084758589311977e-06,
"loss": 0.6307,
"step": 665
},
{
"epoch": 0.2782392026578073,
"grad_norm": 2.7584115266730693,
"learning_rate": 9.063749571502633e-06,
"loss": 0.6374,
"step": 670
},
{
"epoch": 0.2803156146179402,
"grad_norm": 2.8092430217085145,
"learning_rate": 9.04252702956059e-06,
"loss": 0.6282,
"step": 675
},
{
"epoch": 0.2823920265780731,
"grad_norm": 2.6353604501522168,
"learning_rate": 9.021092078595132e-06,
"loss": 0.6332,
"step": 680
},
{
"epoch": 0.28446843853820597,
"grad_norm": 2.7859177417571486,
"learning_rate": 8.999445844876276e-06,
"loss": 0.6381,
"step": 685
},
{
"epoch": 0.2865448504983389,
"grad_norm": 2.6603634875986457,
"learning_rate": 8.977589465775607e-06,
"loss": 0.6312,
"step": 690
},
{
"epoch": 0.28862126245847175,
"grad_norm": 2.6293766795824354,
"learning_rate": 8.955524089706506e-06,
"loss": 0.5999,
"step": 695
},
{
"epoch": 0.29069767441860467,
"grad_norm": 2.8986723382239967,
"learning_rate": 8.933250876063815e-06,
"loss": 0.6297,
"step": 700
},
{
"epoch": 0.29277408637873753,
"grad_norm": 2.6589365161649834,
"learning_rate": 8.910770995162913e-06,
"loss": 0.6303,
"step": 705
},
{
"epoch": 0.29485049833887045,
"grad_norm": 2.64992234535583,
"learning_rate": 8.88808562817823e-06,
"loss": 0.6114,
"step": 710
},
{
"epoch": 0.2969269102990033,
"grad_norm": 2.7322760412568776,
"learning_rate": 8.865195967081174e-06,
"loss": 0.6215,
"step": 715
},
{
"epoch": 0.29900332225913623,
"grad_norm": 2.576473302210113,
"learning_rate": 8.842103214577511e-06,
"loss": 0.6147,
"step": 720
},
{
"epoch": 0.3010797342192691,
"grad_norm": 2.507546434543662,
"learning_rate": 8.818808584044163e-06,
"loss": 0.6089,
"step": 725
},
{
"epoch": 0.303156146179402,
"grad_norm": 2.953501799132662,
"learning_rate": 8.795313299465455e-06,
"loss": 0.6147,
"step": 730
},
{
"epoch": 0.30523255813953487,
"grad_norm": 2.58266860044093,
"learning_rate": 8.771618595368806e-06,
"loss": 0.6024,
"step": 735
},
{
"epoch": 0.3073089700996678,
"grad_norm": 2.7291039422306613,
"learning_rate": 8.747725716759859e-06,
"loss": 0.6152,
"step": 740
},
{
"epoch": 0.30938538205980065,
"grad_norm": 2.696653736904745,
"learning_rate": 8.723635919057058e-06,
"loss": 0.6082,
"step": 745
},
{
"epoch": 0.31146179401993357,
"grad_norm": 2.639188973608746,
"learning_rate": 8.699350468025699e-06,
"loss": 0.5924,
"step": 750
},
{
"epoch": 0.31353820598006643,
"grad_norm": 2.5960120065556294,
"learning_rate": 8.674870639711403e-06,
"loss": 0.5871,
"step": 755
},
{
"epoch": 0.31561461794019935,
"grad_norm": 2.691098687645451,
"learning_rate": 8.650197720373091e-06,
"loss": 0.5937,
"step": 760
},
{
"epoch": 0.3176910299003322,
"grad_norm": 2.7922815680081947,
"learning_rate": 8.625333006415372e-06,
"loss": 0.5806,
"step": 765
},
{
"epoch": 0.31976744186046513,
"grad_norm": 2.5989983221444635,
"learning_rate": 8.600277804320452e-06,
"loss": 0.5889,
"step": 770
},
{
"epoch": 0.321843853820598,
"grad_norm": 2.7500580415708553,
"learning_rate": 8.575033430579465e-06,
"loss": 0.5929,
"step": 775
},
{
"epoch": 0.3239202657807309,
"grad_norm": 2.9863748696055485,
"learning_rate": 8.549601211623316e-06,
"loss": 0.5905,
"step": 780
},
{
"epoch": 0.32599667774086377,
"grad_norm": 2.7128601524461966,
"learning_rate": 8.523982483752973e-06,
"loss": 0.5838,
"step": 785
},
{
"epoch": 0.3280730897009967,
"grad_norm": 2.6273588590853727,
"learning_rate": 8.498178593069262e-06,
"loss": 0.579,
"step": 790
},
{
"epoch": 0.33014950166112955,
"grad_norm": 2.6424251208940714,
"learning_rate": 8.472190895402131e-06,
"loss": 0.568,
"step": 795
},
{
"epoch": 0.33222591362126247,
"grad_norm": 2.774060760650428,
"learning_rate": 8.446020756239418e-06,
"loss": 0.5881,
"step": 800
},
{
"epoch": 0.33430232558139533,
"grad_norm": 2.7429673227633193,
"learning_rate": 8.419669550655093e-06,
"loss": 0.5807,
"step": 805
},
{
"epoch": 0.33637873754152825,
"grad_norm": 2.4588138685140164,
"learning_rate": 8.393138663237015e-06,
"loss": 0.5699,
"step": 810
},
{
"epoch": 0.3384551495016611,
"grad_norm": 2.8894345220890845,
"learning_rate": 8.366429488014178e-06,
"loss": 0.5644,
"step": 815
},
{
"epoch": 0.34053156146179403,
"grad_norm": 2.6417969175920253,
"learning_rate": 8.339543428383467e-06,
"loss": 0.577,
"step": 820
},
{
"epoch": 0.3426079734219269,
"grad_norm": 2.639049529021501,
"learning_rate": 8.312481897035906e-06,
"loss": 0.5835,
"step": 825
},
{
"epoch": 0.3446843853820598,
"grad_norm": 2.791601353912272,
"learning_rate": 8.285246315882448e-06,
"loss": 0.5873,
"step": 830
},
{
"epoch": 0.3467607973421927,
"grad_norm": 2.760486538247162,
"learning_rate": 8.257838115979244e-06,
"loss": 0.5743,
"step": 835
},
{
"epoch": 0.3488372093023256,
"grad_norm": 2.6084506349864114,
"learning_rate": 8.230258737452473e-06,
"loss": 0.5835,
"step": 840
},
{
"epoch": 0.35091362126245845,
"grad_norm": 2.568077365967415,
"learning_rate": 8.202509629422647e-06,
"loss": 0.5663,
"step": 845
},
{
"epoch": 0.35299003322259137,
"grad_norm": 3.338586543406698,
"learning_rate": 8.17459224992849e-06,
"loss": 0.561,
"step": 850
},
{
"epoch": 0.35506644518272423,
"grad_norm": 2.550936924190995,
"learning_rate": 8.14650806585031e-06,
"loss": 0.5748,
"step": 855
},
{
"epoch": 0.35714285714285715,
"grad_norm": 2.730568567607308,
"learning_rate": 8.118258552832945e-06,
"loss": 0.5526,
"step": 860
},
{
"epoch": 0.35921926910299,
"grad_norm": 2.7922640713365765,
"learning_rate": 8.0898451952082e-06,
"loss": 0.5636,
"step": 865
},
{
"epoch": 0.36129568106312293,
"grad_norm": 2.4817520439108782,
"learning_rate": 8.061269485916881e-06,
"loss": 0.565,
"step": 870
},
{
"epoch": 0.3633720930232558,
"grad_norm": 2.5897869437416814,
"learning_rate": 8.032532926430335e-06,
"loss": 0.5718,
"step": 875
},
{
"epoch": 0.3654485049833887,
"grad_norm": 2.6233407361081196,
"learning_rate": 8.003637026671558e-06,
"loss": 0.5495,
"step": 880
},
{
"epoch": 0.3675249169435216,
"grad_norm": 2.590608968830393,
"learning_rate": 7.974583304935867e-06,
"loss": 0.5701,
"step": 885
},
{
"epoch": 0.3696013289036545,
"grad_norm": 2.676185626796156,
"learning_rate": 7.945373287811116e-06,
"loss": 0.5476,
"step": 890
},
{
"epoch": 0.37167774086378735,
"grad_norm": 2.588749653152642,
"learning_rate": 7.916008510097483e-06,
"loss": 0.5363,
"step": 895
},
{
"epoch": 0.37375415282392027,
"grad_norm": 2.648109565452331,
"learning_rate": 7.88649051472683e-06,
"loss": 0.5566,
"step": 900
},
{
"epoch": 0.37583056478405313,
"grad_norm": 2.6123078212762567,
"learning_rate": 7.856820852681634e-06,
"loss": 0.5481,
"step": 905
},
{
"epoch": 0.37790697674418605,
"grad_norm": 2.5715025577779107,
"learning_rate": 7.82700108291348e-06,
"loss": 0.5554,
"step": 910
},
{
"epoch": 0.3799833887043189,
"grad_norm": 2.6810117688521333,
"learning_rate": 7.797032772261164e-06,
"loss": 0.5396,
"step": 915
},
{
"epoch": 0.38205980066445183,
"grad_norm": 2.828001329589521,
"learning_rate": 7.766917495368356e-06,
"loss": 0.549,
"step": 920
},
{
"epoch": 0.3841362126245847,
"grad_norm": 2.6073301891312455,
"learning_rate": 7.736656834600866e-06,
"loss": 0.5403,
"step": 925
},
{
"epoch": 0.3862126245847176,
"grad_norm": 2.7467154847057107,
"learning_rate": 7.706252379963498e-06,
"loss": 0.5395,
"step": 930
},
{
"epoch": 0.3882890365448505,
"grad_norm": 2.6418072073420067,
"learning_rate": 7.675705729016508e-06,
"loss": 0.5363,
"step": 935
},
{
"epoch": 0.3903654485049834,
"grad_norm": 2.632007372607857,
"learning_rate": 7.645018486791664e-06,
"loss": 0.5377,
"step": 940
},
{
"epoch": 0.39244186046511625,
"grad_norm": 2.4652302347093364,
"learning_rate": 7.6141922657079045e-06,
"loss": 0.5321,
"step": 945
},
{
"epoch": 0.3945182724252492,
"grad_norm": 2.5492866422631764,
"learning_rate": 7.583228685486623e-06,
"loss": 0.5433,
"step": 950
},
{
"epoch": 0.39659468438538203,
"grad_norm": 2.4794671881341936,
"learning_rate": 7.552129373066565e-06,
"loss": 0.5423,
"step": 955
},
{
"epoch": 0.39867109634551495,
"grad_norm": 2.565377450639672,
"learning_rate": 7.520895962518329e-06,
"loss": 0.5357,
"step": 960
},
{
"epoch": 0.4007475083056478,
"grad_norm": 2.7376349329000504,
"learning_rate": 7.489530094958521e-06,
"loss": 0.5529,
"step": 965
},
{
"epoch": 0.40282392026578073,
"grad_norm": 2.5470062145134778,
"learning_rate": 7.458033418463517e-06,
"loss": 0.5167,
"step": 970
},
{
"epoch": 0.4049003322259136,
"grad_norm": 2.5915393940286724,
"learning_rate": 7.426407587982869e-06,
"loss": 0.5359,
"step": 975
},
{
"epoch": 0.4069767441860465,
"grad_norm": 2.5521473612501118,
"learning_rate": 7.394654265252348e-06,
"loss": 0.5448,
"step": 980
},
{
"epoch": 0.4090531561461794,
"grad_norm": 2.540390049884069,
"learning_rate": 7.362775118706627e-06,
"loss": 0.5224,
"step": 985
},
{
"epoch": 0.4111295681063123,
"grad_norm": 2.67106563437947,
"learning_rate": 7.330771823391622e-06,
"loss": 0.547,
"step": 990
},
{
"epoch": 0.41320598006644516,
"grad_norm": 2.5844286453504752,
"learning_rate": 7.298646060876473e-06,
"loss": 0.5245,
"step": 995
},
{
"epoch": 0.4152823920265781,
"grad_norm": 2.489462893225223,
"learning_rate": 7.266399519165193e-06,
"loss": 0.5177,
"step": 1000
},
{
"epoch": 0.417358803986711,
"grad_norm": 2.548885028848683,
"learning_rate": 7.234033892607969e-06,
"loss": 0.5285,
"step": 1005
},
{
"epoch": 0.41943521594684385,
"grad_norm": 2.515732979636329,
"learning_rate": 7.201550881812138e-06,
"loss": 0.5295,
"step": 1010
},
{
"epoch": 0.42151162790697677,
"grad_norm": 2.580813201220608,
"learning_rate": 7.168952193552831e-06,
"loss": 0.5144,
"step": 1015
},
{
"epoch": 0.42358803986710963,
"grad_norm": 2.8605769340325544,
"learning_rate": 7.136239540683297e-06,
"loss": 0.5189,
"step": 1020
},
{
"epoch": 0.42566445182724255,
"grad_norm": 2.7042921962644773,
"learning_rate": 7.103414642044888e-06,
"loss": 0.516,
"step": 1025
},
{
"epoch": 0.4277408637873754,
"grad_norm": 2.5935305392513475,
"learning_rate": 7.070479222376765e-06,
"loss": 0.5273,
"step": 1030
},
{
"epoch": 0.42981727574750833,
"grad_norm": 2.521806447567166,
"learning_rate": 7.037435012225259e-06,
"loss": 0.514,
"step": 1035
},
{
"epoch": 0.4318936877076412,
"grad_norm": 2.4922095571026808,
"learning_rate": 7.00428374785295e-06,
"loss": 0.5191,
"step": 1040
},
{
"epoch": 0.4339700996677741,
"grad_norm": 2.53445755137843,
"learning_rate": 6.971027171147436e-06,
"loss": 0.5175,
"step": 1045
},
{
"epoch": 0.436046511627907,
"grad_norm": 2.5854663493896815,
"learning_rate": 6.937667029529803e-06,
"loss": 0.5052,
"step": 1050
},
{
"epoch": 0.4381229235880399,
"grad_norm": 2.6149256231235767,
"learning_rate": 6.904205075862816e-06,
"loss": 0.5155,
"step": 1055
},
{
"epoch": 0.44019933554817275,
"grad_norm": 2.5728069972099643,
"learning_rate": 6.870643068358813e-06,
"loss": 0.5164,
"step": 1060
},
{
"epoch": 0.44227574750830567,
"grad_norm": 2.610034601385569,
"learning_rate": 6.8369827704873225e-06,
"loss": 0.515,
"step": 1065
},
{
"epoch": 0.44435215946843853,
"grad_norm": 2.559653943614866,
"learning_rate": 6.803225950882407e-06,
"loss": 0.5103,
"step": 1070
},
{
"epoch": 0.44642857142857145,
"grad_norm": 2.744659999074845,
"learning_rate": 6.769374383249728e-06,
"loss": 0.5144,
"step": 1075
},
{
"epoch": 0.4485049833887043,
"grad_norm": 2.500834722382555,
"learning_rate": 6.735429846273356e-06,
"loss": 0.509,
"step": 1080
},
{
"epoch": 0.45058139534883723,
"grad_norm": 2.571303478772175,
"learning_rate": 6.701394123522303e-06,
"loss": 0.5061,
"step": 1085
},
{
"epoch": 0.4526578073089701,
"grad_norm": 2.6726371126474042,
"learning_rate": 6.667269003356815e-06,
"loss": 0.4872,
"step": 1090
},
{
"epoch": 0.454734219269103,
"grad_norm": 2.314624945694432,
"learning_rate": 6.633056278834403e-06,
"loss": 0.4978,
"step": 1095
},
{
"epoch": 0.4568106312292359,
"grad_norm": 2.5660125412801986,
"learning_rate": 6.598757747615625e-06,
"loss": 0.4873,
"step": 1100
},
{
"epoch": 0.4588870431893688,
"grad_norm": 2.5055302944005655,
"learning_rate": 6.564375211869638e-06,
"loss": 0.4955,
"step": 1105
},
{
"epoch": 0.46096345514950166,
"grad_norm": 2.3161654964295963,
"learning_rate": 6.529910478179499e-06,
"loss": 0.4996,
"step": 1110
},
{
"epoch": 0.4630398671096346,
"grad_norm": 2.713583584390501,
"learning_rate": 6.495365357447242e-06,
"loss": 0.4837,
"step": 1115
},
{
"epoch": 0.46511627906976744,
"grad_norm": 2.6986080979156597,
"learning_rate": 6.4607416647987285e-06,
"loss": 0.503,
"step": 1120
},
{
"epoch": 0.46719269102990035,
"grad_norm": 2.3758745672703614,
"learning_rate": 6.426041219488275e-06,
"loss": 0.4917,
"step": 1125
},
{
"epoch": 0.4692691029900332,
"grad_norm": 2.468317610874025,
"learning_rate": 6.39126584480306e-06,
"loss": 0.4947,
"step": 1130
},
{
"epoch": 0.47134551495016613,
"grad_norm": 2.672466601805675,
"learning_rate": 6.3564173679673225e-06,
"loss": 0.4956,
"step": 1135
},
{
"epoch": 0.473421926910299,
"grad_norm": 2.686387722109422,
"learning_rate": 6.321497620046353e-06,
"loss": 0.4958,
"step": 1140
},
{
"epoch": 0.4754983388704319,
"grad_norm": 2.4115883144762105,
"learning_rate": 6.286508435850282e-06,
"loss": 0.4884,
"step": 1145
},
{
"epoch": 0.4775747508305648,
"grad_norm": 2.473062095275494,
"learning_rate": 6.251451653837679e-06,
"loss": 0.4873,
"step": 1150
},
{
"epoch": 0.4796511627906977,
"grad_norm": 2.4611172122096034,
"learning_rate": 6.216329116018943e-06,
"loss": 0.4828,
"step": 1155
},
{
"epoch": 0.48172757475083056,
"grad_norm": 2.438501558434762,
"learning_rate": 6.181142667859521e-06,
"loss": 0.4743,
"step": 1160
},
{
"epoch": 0.4838039867109635,
"grad_norm": 2.4623748153401586,
"learning_rate": 6.145894158182945e-06,
"loss": 0.4813,
"step": 1165
},
{
"epoch": 0.48588039867109634,
"grad_norm": 2.5841330806095093,
"learning_rate": 6.11058543907368e-06,
"loss": 0.4757,
"step": 1170
},
{
"epoch": 0.48795681063122925,
"grad_norm": 2.420645551171905,
"learning_rate": 6.075218365779814e-06,
"loss": 0.4717,
"step": 1175
},
{
"epoch": 0.4900332225913621,
"grad_norm": 2.41753538282735,
"learning_rate": 6.039794796615575e-06,
"loss": 0.4683,
"step": 1180
},
{
"epoch": 0.49210963455149503,
"grad_norm": 2.6345922483315993,
"learning_rate": 6.004316592863693e-06,
"loss": 0.4758,
"step": 1185
},
{
"epoch": 0.4941860465116279,
"grad_norm": 2.580357854248359,
"learning_rate": 5.96878561867759e-06,
"loss": 0.4923,
"step": 1190
},
{
"epoch": 0.4962624584717608,
"grad_norm": 2.3693846881679463,
"learning_rate": 5.9332037409834466e-06,
"loss": 0.4732,
"step": 1195
},
{
"epoch": 0.4983388704318937,
"grad_norm": 2.769567429139866,
"learning_rate": 5.89757282938209e-06,
"loss": 0.4713,
"step": 1200
},
{
"epoch": 0.5004152823920266,
"grad_norm": 2.41622785319668,
"learning_rate": 5.86189475605077e-06,
"loss": 0.476,
"step": 1205
},
{
"epoch": 0.5024916943521595,
"grad_norm": 2.499791289384567,
"learning_rate": 5.826171395644786e-06,
"loss": 0.4749,
"step": 1210
},
{
"epoch": 0.5045681063122923,
"grad_norm": 2.417525944289692,
"learning_rate": 5.790404625198982e-06,
"loss": 0.4726,
"step": 1215
},
{
"epoch": 0.5066445182724253,
"grad_norm": 2.5878334687029114,
"learning_rate": 5.754596324029125e-06,
"loss": 0.4761,
"step": 1220
},
{
"epoch": 0.5087209302325582,
"grad_norm": 2.4962102663667043,
"learning_rate": 5.7187483736331554e-06,
"loss": 0.4578,
"step": 1225
},
{
"epoch": 0.510797342192691,
"grad_norm": 2.6263564446414636,
"learning_rate": 5.682862657592327e-06,
"loss": 0.4825,
"step": 1230
},
{
"epoch": 0.5128737541528239,
"grad_norm": 2.880797119411763,
"learning_rate": 5.646941061472242e-06,
"loss": 0.469,
"step": 1235
},
{
"epoch": 0.5149501661129569,
"grad_norm": 2.555965100494747,
"learning_rate": 5.610985472723764e-06,
"loss": 0.4712,
"step": 1240
},
{
"epoch": 0.5170265780730897,
"grad_norm": 2.502236357284136,
"learning_rate": 5.5749977805838615e-06,
"loss": 0.4681,
"step": 1245
},
{
"epoch": 0.5191029900332226,
"grad_norm": 2.4360635002482347,
"learning_rate": 5.538979875976324e-06,
"loss": 0.4636,
"step": 1250
},
{
"epoch": 0.5211794019933554,
"grad_norm": 2.488011716508302,
"learning_rate": 5.502933651412417e-06,
"loss": 0.4699,
"step": 1255
},
{
"epoch": 0.5232558139534884,
"grad_norm": 2.3770436189443696,
"learning_rate": 5.466861000891439e-06,
"loss": 0.4592,
"step": 1260
},
{
"epoch": 0.5253322259136213,
"grad_norm": 2.7541846157024876,
"learning_rate": 5.430763819801205e-06,
"loss": 0.4692,
"step": 1265
},
{
"epoch": 0.5274086378737541,
"grad_norm": 2.7287082031019745,
"learning_rate": 5.394644004818452e-06,
"loss": 0.4745,
"step": 1270
},
{
"epoch": 0.529485049833887,
"grad_norm": 2.5164954994115094,
"learning_rate": 5.3585034538091885e-06,
"loss": 0.4525,
"step": 1275
},
{
"epoch": 0.53156146179402,
"grad_norm": 2.347205777105881,
"learning_rate": 5.322344065728964e-06,
"loss": 0.4689,
"step": 1280
},
{
"epoch": 0.5336378737541528,
"grad_norm": 2.582827989286747,
"learning_rate": 5.286167740523099e-06,
"loss": 0.4691,
"step": 1285
},
{
"epoch": 0.5357142857142857,
"grad_norm": 2.5061090934097843,
"learning_rate": 5.249976379026851e-06,
"loss": 0.4436,
"step": 1290
},
{
"epoch": 0.5377906976744186,
"grad_norm": 2.4524559965169748,
"learning_rate": 5.213771882865538e-06,
"loss": 0.4643,
"step": 1295
},
{
"epoch": 0.5398671096345515,
"grad_norm": 2.560097527019471,
"learning_rate": 5.177556154354622e-06,
"loss": 0.4464,
"step": 1300
},
{
"epoch": 0.5419435215946844,
"grad_norm": 2.397260026201424,
"learning_rate": 5.141331096399755e-06,
"loss": 0.4501,
"step": 1305
},
{
"epoch": 0.5440199335548173,
"grad_norm": 2.351541148312247,
"learning_rate": 5.1050986123967884e-06,
"loss": 0.4398,
"step": 1310
},
{
"epoch": 0.5460963455149501,
"grad_norm": 2.452194040455103,
"learning_rate": 5.068860606131766e-06,
"loss": 0.4516,
"step": 1315
},
{
"epoch": 0.5481727574750831,
"grad_norm": 2.593569889967618,
"learning_rate": 5.032618981680893e-06,
"loss": 0.4534,
"step": 1320
},
{
"epoch": 0.550249169435216,
"grad_norm": 2.491194365967403,
"learning_rate": 4.9963756433104875e-06,
"loss": 0.4561,
"step": 1325
},
{
"epoch": 0.5523255813953488,
"grad_norm": 2.5315048028501432,
"learning_rate": 4.960132495376919e-06,
"loss": 0.4387,
"step": 1330
},
{
"epoch": 0.5544019933554817,
"grad_norm": 2.4221610492026566,
"learning_rate": 4.923891442226554e-06,
"loss": 0.4526,
"step": 1335
},
{
"epoch": 0.5564784053156147,
"grad_norm": 2.4574741459986043,
"learning_rate": 4.887654388095691e-06,
"loss": 0.4388,
"step": 1340
},
{
"epoch": 0.5585548172757475,
"grad_norm": 2.5581004359073565,
"learning_rate": 4.851423237010504e-06,
"loss": 0.4512,
"step": 1345
},
{
"epoch": 0.5606312292358804,
"grad_norm": 2.5084567945271634,
"learning_rate": 4.815199892687006e-06,
"loss": 0.464,
"step": 1350
},
{
"epoch": 0.5627076411960132,
"grad_norm": 2.4656070255557294,
"learning_rate": 4.778986258431005e-06,
"loss": 0.4471,
"step": 1355
},
{
"epoch": 0.5647840531561462,
"grad_norm": 2.494517722129321,
"learning_rate": 4.742784237038113e-06,
"loss": 0.4352,
"step": 1360
},
{
"epoch": 0.5668604651162791,
"grad_norm": 2.5383042319953995,
"learning_rate": 4.70659573069376e-06,
"loss": 0.421,
"step": 1365
},
{
"epoch": 0.5689368770764119,
"grad_norm": 2.3933135171603936,
"learning_rate": 4.670422640873242e-06,
"loss": 0.4379,
"step": 1370
},
{
"epoch": 0.5710132890365448,
"grad_norm": 2.4020680375977133,
"learning_rate": 4.63426686824182e-06,
"loss": 0.4323,
"step": 1375
},
{
"epoch": 0.5730897009966778,
"grad_norm": 2.43581294994139,
"learning_rate": 4.598130312554843e-06,
"loss": 0.4397,
"step": 1380
},
{
"epoch": 0.5751661129568106,
"grad_norm": 2.5772706634163027,
"learning_rate": 4.562014872557936e-06,
"loss": 0.4362,
"step": 1385
},
{
"epoch": 0.5772425249169435,
"grad_norm": 2.448863408768738,
"learning_rate": 4.525922445887224e-06,
"loss": 0.4349,
"step": 1390
},
{
"epoch": 0.5793189368770764,
"grad_norm": 2.535308434878213,
"learning_rate": 4.489854928969635e-06,
"loss": 0.4516,
"step": 1395
},
{
"epoch": 0.5813953488372093,
"grad_norm": 2.3973615256768768,
"learning_rate": 4.453814216923242e-06,
"loss": 0.4336,
"step": 1400
},
{
"epoch": 0.5834717607973422,
"grad_norm": 2.3119199540164965,
"learning_rate": 4.4178022034576976e-06,
"loss": 0.4226,
"step": 1405
},
{
"epoch": 0.5855481727574751,
"grad_norm": 2.3014825037296633,
"learning_rate": 4.381820780774724e-06,
"loss": 0.4322,
"step": 1410
},
{
"epoch": 0.5876245847176079,
"grad_norm": 2.5351337278959556,
"learning_rate": 4.345871839468694e-06,
"loss": 0.4055,
"step": 1415
},
{
"epoch": 0.5897009966777409,
"grad_norm": 2.611286820208639,
"learning_rate": 4.309957268427292e-06,
"loss": 0.4216,
"step": 1420
},
{
"epoch": 0.5917774086378738,
"grad_norm": 2.3889570520642684,
"learning_rate": 4.274078954732262e-06,
"loss": 0.4427,
"step": 1425
},
{
"epoch": 0.5938538205980066,
"grad_norm": 2.384724624598042,
"learning_rate": 4.2382387835602565e-06,
"loss": 0.4246,
"step": 1430
},
{
"epoch": 0.5959302325581395,
"grad_norm": 2.3536762842777126,
"learning_rate": 4.20243863808378e-06,
"loss": 0.4352,
"step": 1435
},
{
"epoch": 0.5980066445182725,
"grad_norm": 2.367560729519929,
"learning_rate": 4.166680399372248e-06,
"loss": 0.4226,
"step": 1440
},
{
"epoch": 0.6000830564784053,
"grad_norm": 2.401186140827422,
"learning_rate": 4.130965946293135e-06,
"loss": 0.4529,
"step": 1445
},
{
"epoch": 0.6021594684385382,
"grad_norm": 2.3503805374006457,
"learning_rate": 4.095297155413264e-06,
"loss": 0.4213,
"step": 1450
},
{
"epoch": 0.604235880398671,
"grad_norm": 2.404199762232402,
"learning_rate": 4.059675900900199e-06,
"loss": 0.4309,
"step": 1455
},
{
"epoch": 0.606312292358804,
"grad_norm": 2.5304024582625053,
"learning_rate": 4.024104054423772e-06,
"loss": 0.4215,
"step": 1460
},
{
"epoch": 0.6083887043189369,
"grad_norm": 2.4035116235125473,
"learning_rate": 3.9885834850577375e-06,
"loss": 0.4282,
"step": 1465
},
{
"epoch": 0.6104651162790697,
"grad_norm": 2.3499844076305156,
"learning_rate": 3.953116059181563e-06,
"loss": 0.422,
"step": 1470
},
{
"epoch": 0.6125415282392026,
"grad_norm": 2.5288170114153585,
"learning_rate": 3.9177036403823645e-06,
"loss": 0.4329,
"step": 1475
},
{
"epoch": 0.6146179401993356,
"grad_norm": 2.3290974062316057,
"learning_rate": 3.882348089356992e-06,
"loss": 0.4137,
"step": 1480
},
{
"epoch": 0.6166943521594684,
"grad_norm": 2.4328677326588894,
"learning_rate": 3.84705126381425e-06,
"loss": 0.4297,
"step": 1485
},
{
"epoch": 0.6187707641196013,
"grad_norm": 2.3908310630477954,
"learning_rate": 3.8118150183772974e-06,
"loss": 0.4293,
"step": 1490
},
{
"epoch": 0.6208471760797342,
"grad_norm": 2.4893827738846808,
"learning_rate": 3.776641204486191e-06,
"loss": 0.4214,
"step": 1495
},
{
"epoch": 0.6229235880398671,
"grad_norm": 2.3486377563484133,
"learning_rate": 3.7415316703006116e-06,
"loss": 0.405,
"step": 1500
},
{
"epoch": 0.625,
"grad_norm": 2.466506888817687,
"learning_rate": 3.7064882606027497e-06,
"loss": 0.426,
"step": 1505
},
{
"epoch": 0.6270764119601329,
"grad_norm": 2.496662130115367,
"learning_rate": 3.671512816700375e-06,
"loss": 0.4201,
"step": 1510
},
{
"epoch": 0.6291528239202658,
"grad_norm": 2.265163717312505,
"learning_rate": 3.636607176330088e-06,
"loss": 0.4205,
"step": 1515
},
{
"epoch": 0.6312292358803987,
"grad_norm": 2.2703878574783163,
"learning_rate": 3.60177317356076e-06,
"loss": 0.4101,
"step": 1520
},
{
"epoch": 0.6333056478405316,
"grad_norm": 2.423443407995488,
"learning_rate": 3.5670126386971625e-06,
"loss": 0.4171,
"step": 1525
},
{
"epoch": 0.6353820598006644,
"grad_norm": 2.44608682526587,
"learning_rate": 3.5323273981837965e-06,
"loss": 0.416,
"step": 1530
},
{
"epoch": 0.6374584717607974,
"grad_norm": 2.2051417207338173,
"learning_rate": 3.497719274508925e-06,
"loss": 0.4019,
"step": 1535
},
{
"epoch": 0.6395348837209303,
"grad_norm": 2.4800578989548034,
"learning_rate": 3.4631900861088132e-06,
"loss": 0.4029,
"step": 1540
},
{
"epoch": 0.6416112956810631,
"grad_norm": 2.3268282845100035,
"learning_rate": 3.4287416472721795e-06,
"loss": 0.4111,
"step": 1545
},
{
"epoch": 0.643687707641196,
"grad_norm": 2.3872453059218532,
"learning_rate": 3.3943757680448697e-06,
"loss": 0.4061,
"step": 1550
},
{
"epoch": 0.645764119601329,
"grad_norm": 2.42558490404232,
"learning_rate": 3.360094254134746e-06,
"loss": 0.403,
"step": 1555
},
{
"epoch": 0.6478405315614618,
"grad_norm": 2.441847356983534,
"learning_rate": 3.3258989068168123e-06,
"loss": 0.417,
"step": 1560
},
{
"epoch": 0.6499169435215947,
"grad_norm": 2.356616246546388,
"learning_rate": 3.2917915228385676e-06,
"loss": 0.4008,
"step": 1565
},
{
"epoch": 0.6519933554817275,
"grad_norm": 2.457529466848808,
"learning_rate": 3.257773894325599e-06,
"loss": 0.4166,
"step": 1570
},
{
"epoch": 0.6540697674418605,
"grad_norm": 2.5688010790796154,
"learning_rate": 3.223847808687415e-06,
"loss": 0.3982,
"step": 1575
},
{
"epoch": 0.6561461794019934,
"grad_norm": 2.2695295812005836,
"learning_rate": 3.190015048523528e-06,
"loss": 0.3912,
"step": 1580
},
{
"epoch": 0.6582225913621262,
"grad_norm": 2.5664307243505227,
"learning_rate": 3.156277391529796e-06,
"loss": 0.4044,
"step": 1585
},
{
"epoch": 0.6602990033222591,
"grad_norm": 2.421377162101449,
"learning_rate": 3.1226366104050067e-06,
"loss": 0.4061,
"step": 1590
},
{
"epoch": 0.6623754152823921,
"grad_norm": 2.50702313044333,
"learning_rate": 3.089094472757742e-06,
"loss": 0.3986,
"step": 1595
},
{
"epoch": 0.6644518272425249,
"grad_norm": 2.2015982709846122,
"learning_rate": 3.055652741013497e-06,
"loss": 0.3773,
"step": 1600
},
{
"epoch": 0.6665282392026578,
"grad_norm": 2.484025604844624,
"learning_rate": 3.0223131723220756e-06,
"loss": 0.4043,
"step": 1605
},
{
"epoch": 0.6686046511627907,
"grad_norm": 2.2673450694224426,
"learning_rate": 2.9890775184652666e-06,
"loss": 0.3975,
"step": 1610
},
{
"epoch": 0.6706810631229236,
"grad_norm": 2.411243052140437,
"learning_rate": 2.955947525764796e-06,
"loss": 0.4162,
"step": 1615
},
{
"epoch": 0.6727574750830565,
"grad_norm": 2.467788088547966,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.3905,
"step": 1620
},
{
"epoch": 0.6748338870431894,
"grad_norm": 2.441034044229084,
"learning_rate": 2.890011481269204e-06,
"loss": 0.404,
"step": 1625
},
{
"epoch": 0.6769102990033222,
"grad_norm": 2.4310426686498507,
"learning_rate": 2.8572088939928623e-06,
"loss": 0.3985,
"step": 1630
},
{
"epoch": 0.6789867109634552,
"grad_norm": 2.5154739727394397,
"learning_rate": 2.824518896728386e-06,
"loss": 0.3972,
"step": 1635
},
{
"epoch": 0.6810631229235881,
"grad_norm": 2.4239374759188066,
"learning_rate": 2.7919432071267212e-06,
"loss": 0.3986,
"step": 1640
},
{
"epoch": 0.6831395348837209,
"grad_norm": 2.401230714452262,
"learning_rate": 2.759483536832682e-06,
"loss": 0.3961,
"step": 1645
},
{
"epoch": 0.6852159468438538,
"grad_norm": 2.3945770626194425,
"learning_rate": 2.7271415913950027e-06,
"loss": 0.3987,
"step": 1650
},
{
"epoch": 0.6872923588039868,
"grad_norm": 2.5083750676716248,
"learning_rate": 2.6949190701767323e-06,
"loss": 0.3987,
"step": 1655
},
{
"epoch": 0.6893687707641196,
"grad_norm": 2.359597868105036,
"learning_rate": 2.662817666265932e-06,
"loss": 0.3992,
"step": 1660
},
{
"epoch": 0.6914451827242525,
"grad_norm": 2.3950900870588305,
"learning_rate": 2.6308390663867247e-06,
"loss": 0.3755,
"step": 1665
},
{
"epoch": 0.6935215946843853,
"grad_norm": 2.2726643843793783,
"learning_rate": 2.5989849508106663e-06,
"loss": 0.3788,
"step": 1670
},
{
"epoch": 0.6955980066445183,
"grad_norm": 2.3688642141053644,
"learning_rate": 2.5672569932684486e-06,
"loss": 0.3923,
"step": 1675
},
{
"epoch": 0.6976744186046512,
"grad_norm": 2.4674555381530543,
"learning_rate": 2.5356568608619737e-06,
"loss": 0.3784,
"step": 1680
},
{
"epoch": 0.699750830564784,
"grad_norm": 2.348080957902949,
"learning_rate": 2.504186213976736e-06,
"loss": 0.3888,
"step": 1685
},
{
"epoch": 0.7018272425249169,
"grad_norm": 2.2245908133987506,
"learning_rate": 2.4728467061946017e-06,
"loss": 0.383,
"step": 1690
},
{
"epoch": 0.7039036544850499,
"grad_norm": 2.308262964854599,
"learning_rate": 2.441639984206903e-06,
"loss": 0.3873,
"step": 1695
},
{
"epoch": 0.7059800664451827,
"grad_norm": 2.3316191201720726,
"learning_rate": 2.4105676877279376e-06,
"loss": 0.3764,
"step": 1700
},
{
"epoch": 0.7080564784053156,
"grad_norm": 2.2575654898253363,
"learning_rate": 2.379631449408788e-06,
"loss": 0.3857,
"step": 1705
},
{
"epoch": 0.7101328903654485,
"grad_norm": 2.295434521334263,
"learning_rate": 2.3488328947515566e-06,
"loss": 0.3825,
"step": 1710
},
{
"epoch": 0.7122093023255814,
"grad_norm": 2.3045365012329704,
"learning_rate": 2.318173642023939e-06,
"loss": 0.3851,
"step": 1715
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.3117392889776665,
"learning_rate": 2.287655302174208e-06,
"loss": 0.3897,
"step": 1720
},
{
"epoch": 0.7163621262458472,
"grad_norm": 2.422532892044474,
"learning_rate": 2.257279478746564e-06,
"loss": 0.3799,
"step": 1725
},
{
"epoch": 0.71843853820598,
"grad_norm": 2.2839185079742514,
"learning_rate": 2.2270477677968727e-06,
"loss": 0.3703,
"step": 1730
},
{
"epoch": 0.720514950166113,
"grad_norm": 2.7279247585921786,
"learning_rate": 2.196961757808813e-06,
"loss": 0.3794,
"step": 1735
},
{
"epoch": 0.7225913621262459,
"grad_norm": 2.396361579385602,
"learning_rate": 2.167023029610402e-06,
"loss": 0.3642,
"step": 1740
},
{
"epoch": 0.7246677740863787,
"grad_norm": 2.340856081292544,
"learning_rate": 2.1372331562909453e-06,
"loss": 0.372,
"step": 1745
},
{
"epoch": 0.7267441860465116,
"grad_norm": 2.413915292833693,
"learning_rate": 2.1075937031183636e-06,
"loss": 0.3767,
"step": 1750
},
{
"epoch": 0.7288205980066446,
"grad_norm": 2.2094868525489386,
"learning_rate": 2.0781062274569657e-06,
"loss": 0.3713,
"step": 1755
},
{
"epoch": 0.7308970099667774,
"grad_norm": 2.2242377702402663,
"learning_rate": 2.0487722786856107e-06,
"loss": 0.3808,
"step": 1760
},
{
"epoch": 0.7329734219269103,
"grad_norm": 2.451226818715509,
"learning_rate": 2.019593398116292e-06,
"loss": 0.3752,
"step": 1765
},
{
"epoch": 0.7350498338870431,
"grad_norm": 2.5070300923436006,
"learning_rate": 1.990571118913166e-06,
"loss": 0.3754,
"step": 1770
},
{
"epoch": 0.7371262458471761,
"grad_norm": 2.4891905395473963,
"learning_rate": 1.961706966011978e-06,
"loss": 0.3877,
"step": 1775
},
{
"epoch": 0.739202657807309,
"grad_norm": 2.4842650358701905,
"learning_rate": 1.9330024560399507e-06,
"loss": 0.3836,
"step": 1780
},
{
"epoch": 0.7412790697674418,
"grad_norm": 2.250133568783516,
"learning_rate": 1.9044590972360822e-06,
"loss": 0.3725,
"step": 1785
},
{
"epoch": 0.7433554817275747,
"grad_norm": 2.341904795212687,
"learning_rate": 1.876078389371911e-06,
"loss": 0.3679,
"step": 1790
},
{
"epoch": 0.7454318936877077,
"grad_norm": 2.3068998565270746,
"learning_rate": 1.8478618236726992e-06,
"loss": 0.3757,
"step": 1795
},
{
"epoch": 0.7475083056478405,
"grad_norm": 2.2619310866276203,
"learning_rate": 1.8198108827390892e-06,
"loss": 0.3742,
"step": 1800
},
{
"epoch": 0.7495847176079734,
"grad_norm": 2.406091048606607,
"learning_rate": 1.791927040469198e-06,
"loss": 0.3805,
"step": 1805
},
{
"epoch": 0.7516611295681063,
"grad_norm": 2.3430777426784077,
"learning_rate": 1.7642117619811672e-06,
"loss": 0.3744,
"step": 1810
},
{
"epoch": 0.7537375415282392,
"grad_norm": 2.309496934162411,
"learning_rate": 1.7366665035361947e-06,
"loss": 0.3856,
"step": 1815
},
{
"epoch": 0.7558139534883721,
"grad_norm": 2.3680236136606085,
"learning_rate": 1.7092927124620007e-06,
"loss": 0.3747,
"step": 1820
},
{
"epoch": 0.757890365448505,
"grad_norm": 2.3303370070854066,
"learning_rate": 1.682091827076796e-06,
"loss": 0.3724,
"step": 1825
},
{
"epoch": 0.7599667774086378,
"grad_norm": 2.308665058379731,
"learning_rate": 1.6550652766136932e-06,
"loss": 0.3701,
"step": 1830
},
{
"epoch": 0.7620431893687708,
"grad_norm": 2.423141151726278,
"learning_rate": 1.6282144811456196e-06,
"loss": 0.3749,
"step": 1835
},
{
"epoch": 0.7641196013289037,
"grad_norm": 2.310790310097539,
"learning_rate": 1.6015408515107e-06,
"loss": 0.3649,
"step": 1840
},
{
"epoch": 0.7661960132890365,
"grad_norm": 2.350953218186428,
"learning_rate": 1.5750457892381183e-06,
"loss": 0.3766,
"step": 1845
},
{
"epoch": 0.7682724252491694,
"grad_norm": 2.3685044215677826,
"learning_rate": 1.5487306864744878e-06,
"loss": 0.3626,
"step": 1850
},
{
"epoch": 0.7703488372093024,
"grad_norm": 2.4283396349263384,
"learning_rate": 1.5225969259106909e-06,
"loss": 0.358,
"step": 1855
},
{
"epoch": 0.7724252491694352,
"grad_norm": 2.515904865078178,
"learning_rate": 1.4966458807092404e-06,
"loss": 0.3703,
"step": 1860
},
{
"epoch": 0.7745016611295681,
"grad_norm": 2.369156818267499,
"learning_rate": 1.470878914432115e-06,
"loss": 0.3628,
"step": 1865
},
{
"epoch": 0.776578073089701,
"grad_norm": 2.3898762463795302,
"learning_rate": 1.4452973809691245e-06,
"loss": 0.3491,
"step": 1870
},
{
"epoch": 0.7786544850498339,
"grad_norm": 2.307405290268551,
"learning_rate": 1.4199026244667636e-06,
"loss": 0.3715,
"step": 1875
},
{
"epoch": 0.7807308970099668,
"grad_norm": 2.3679557325362808,
"learning_rate": 1.3946959792575915e-06,
"loss": 0.3716,
"step": 1880
},
{
"epoch": 0.7828073089700996,
"grad_norm": 2.368304219604154,
"learning_rate": 1.3696787697901131e-06,
"loss": 0.3661,
"step": 1885
},
{
"epoch": 0.7848837209302325,
"grad_norm": 2.337789695422565,
"learning_rate": 1.3448523105591976e-06,
"loss": 0.3605,
"step": 1890
},
{
"epoch": 0.7869601328903655,
"grad_norm": 2.611258973451232,
"learning_rate": 1.3202179060370041e-06,
"loss": 0.3699,
"step": 1895
},
{
"epoch": 0.7890365448504983,
"grad_norm": 2.437657572808606,
"learning_rate": 1.2957768506044383e-06,
"loss": 0.3651,
"step": 1900
},
{
"epoch": 0.7911129568106312,
"grad_norm": 2.388228690853508,
"learning_rate": 1.2715304284831492e-06,
"loss": 0.3664,
"step": 1905
},
{
"epoch": 0.7931893687707641,
"grad_norm": 2.2860587085497235,
"learning_rate": 1.2474799136680394e-06,
"loss": 0.3577,
"step": 1910
},
{
"epoch": 0.795265780730897,
"grad_norm": 2.2178621526275077,
"learning_rate": 1.223626569860339e-06,
"loss": 0.3441,
"step": 1915
},
{
"epoch": 0.7973421926910299,
"grad_norm": 2.518415787103085,
"learning_rate": 1.1999716504011917e-06,
"loss": 0.3673,
"step": 1920
},
{
"epoch": 0.7994186046511628,
"grad_norm": 2.359475880122496,
"learning_rate": 1.1765163982058109e-06,
"loss": 0.3567,
"step": 1925
},
{
"epoch": 0.8014950166112956,
"grad_norm": 2.407404285602653,
"learning_rate": 1.1532620456981685e-06,
"loss": 0.3476,
"step": 1930
},
{
"epoch": 0.8035714285714286,
"grad_norm": 2.562334088122669,
"learning_rate": 1.1302098147462348e-06,
"loss": 0.3658,
"step": 1935
},
{
"epoch": 0.8056478405315615,
"grad_norm": 2.4467720130350163,
"learning_rate": 1.1073609165977866e-06,
"loss": 0.348,
"step": 1940
},
{
"epoch": 0.8077242524916943,
"grad_norm": 2.3514873698583574,
"learning_rate": 1.0847165518167513e-06,
"loss": 0.3601,
"step": 1945
},
{
"epoch": 0.8098006644518272,
"grad_norm": 2.258063143891622,
"learning_rate": 1.062277910220138e-06,
"loss": 0.3548,
"step": 1950
},
{
"epoch": 0.8118770764119602,
"grad_norm": 2.3377988411022246,
"learning_rate": 1.0400461708155095e-06,
"loss": 0.3591,
"step": 1955
},
{
"epoch": 0.813953488372093,
"grad_norm": 2.4485426821221004,
"learning_rate": 1.0180225017390416e-06,
"loss": 0.3583,
"step": 1960
},
{
"epoch": 0.8160299003322259,
"grad_norm": 2.3726559534317797,
"learning_rate": 9.962080601941365e-07,
"loss": 0.3426,
"step": 1965
},
{
"epoch": 0.8181063122923588,
"grad_norm": 2.2417751776494543,
"learning_rate": 9.746039923906258e-07,
"loss": 0.343,
"step": 1970
},
{
"epoch": 0.8201827242524917,
"grad_norm": 2.5294843157217906,
"learning_rate": 9.532114334845444e-07,
"loss": 0.3664,
"step": 1975
},
{
"epoch": 0.8222591362126246,
"grad_norm": 2.5572851406694235,
"learning_rate": 9.320315075184771e-07,
"loss": 0.3483,
"step": 1980
},
{
"epoch": 0.8243355481727574,
"grad_norm": 2.4014306355585973,
"learning_rate": 9.110653273625103e-07,
"loss": 0.3454,
"step": 1985
},
{
"epoch": 0.8264119601328903,
"grad_norm": 2.3699223457500715,
"learning_rate": 8.903139946557437e-07,
"loss": 0.3527,
"step": 1990
},
{
"epoch": 0.8284883720930233,
"grad_norm": 2.4489197804834197,
"learning_rate": 8.697785997484198e-07,
"loss": 0.3535,
"step": 1995
},
{
"epoch": 0.8305647840531561,
"grad_norm": 2.4381698669696044,
"learning_rate": 8.494602216446213e-07,
"loss": 0.3522,
"step": 2000
},
{
"epoch": 0.832641196013289,
"grad_norm": 2.373612659548005,
"learning_rate": 8.293599279455838e-07,
"loss": 0.352,
"step": 2005
},
{
"epoch": 0.834717607973422,
"grad_norm": 2.5001126967401763,
"learning_rate": 8.094787747935995e-07,
"loss": 0.3533,
"step": 2010
},
{
"epoch": 0.8367940199335548,
"grad_norm": 2.4033229472375637,
"learning_rate": 7.898178068165175e-07,
"loss": 0.3569,
"step": 2015
},
{
"epoch": 0.8388704318936877,
"grad_norm": 2.34177766700727,
"learning_rate": 7.703780570728637e-07,
"loss": 0.3485,
"step": 2020
},
{
"epoch": 0.8409468438538206,
"grad_norm": 2.345211689975521,
"learning_rate": 7.511605469975524e-07,
"loss": 0.3541,
"step": 2025
},
{
"epoch": 0.8430232558139535,
"grad_norm": 2.491346976334481,
"learning_rate": 7.321662863482248e-07,
"loss": 0.357,
"step": 2030
},
{
"epoch": 0.8450996677740864,
"grad_norm": 2.4991193300068515,
"learning_rate": 7.133962731521837e-07,
"loss": 0.3504,
"step": 2035
},
{
"epoch": 0.8471760797342193,
"grad_norm": 2.4131651786978376,
"learning_rate": 6.948514936539596e-07,
"loss": 0.3413,
"step": 2040
},
{
"epoch": 0.8492524916943521,
"grad_norm": 2.4158508388648046,
"learning_rate": 6.765329222634892e-07,
"loss": 0.3368,
"step": 2045
},
{
"epoch": 0.8513289036544851,
"grad_norm": 2.444048773418729,
"learning_rate": 6.584415215049145e-07,
"loss": 0.3478,
"step": 2050
},
{
"epoch": 0.853405315614618,
"grad_norm": 2.3067727734077854,
"learning_rate": 6.405782419660073e-07,
"loss": 0.3539,
"step": 2055
},
{
"epoch": 0.8554817275747508,
"grad_norm": 2.389540542776719,
"learning_rate": 6.229440222482258e-07,
"loss": 0.3568,
"step": 2060
},
{
"epoch": 0.8575581395348837,
"grad_norm": 2.490728442827626,
"learning_rate": 6.055397889173947e-07,
"loss": 0.3425,
"step": 2065
},
{
"epoch": 0.8596345514950167,
"grad_norm": 2.4309142506564116,
"learning_rate": 5.88366456455019e-07,
"loss": 0.3556,
"step": 2070
},
{
"epoch": 0.8617109634551495,
"grad_norm": 2.577695548294538,
"learning_rate": 5.714249272102368e-07,
"loss": 0.3479,
"step": 2075
},
{
"epoch": 0.8637873754152824,
"grad_norm": 2.3780994980865513,
"learning_rate": 5.547160913524024e-07,
"loss": 0.3407,
"step": 2080
},
{
"epoch": 0.8658637873754153,
"grad_norm": 2.3471940728385645,
"learning_rate": 5.382408268243194e-07,
"loss": 0.327,
"step": 2085
},
{
"epoch": 0.8679401993355482,
"grad_norm": 2.5308209588235964,
"learning_rate": 5.219999992961044e-07,
"loss": 0.3486,
"step": 2090
},
{
"epoch": 0.8700166112956811,
"grad_norm": 2.347529844497377,
"learning_rate": 5.05994462119705e-07,
"loss": 0.3507,
"step": 2095
},
{
"epoch": 0.872093023255814,
"grad_norm": 2.4490768218202428,
"learning_rate": 4.902250562840622e-07,
"loss": 0.3484,
"step": 2100
},
{
"epoch": 0.8741694352159468,
"grad_norm": 2.4607053819399227,
"learning_rate": 4.7469261037091765e-07,
"loss": 0.355,
"step": 2105
},
{
"epoch": 0.8762458471760798,
"grad_norm": 2.37905091425431,
"learning_rate": 4.5939794051128363e-07,
"loss": 0.3544,
"step": 2110
},
{
"epoch": 0.8783222591362126,
"grad_norm": 2.3898177002048397,
"learning_rate": 4.443418503425517e-07,
"loss": 0.3459,
"step": 2115
},
{
"epoch": 0.8803986710963455,
"grad_norm": 2.3945638825763336,
"learning_rate": 4.295251309662768e-07,
"loss": 0.3475,
"step": 2120
},
{
"epoch": 0.8824750830564784,
"grad_norm": 2.376437633901908,
"learning_rate": 4.149485609066001e-07,
"loss": 0.3448,
"step": 2125
},
{
"epoch": 0.8845514950166113,
"grad_norm": 2.4682795986451884,
"learning_rate": 4.0061290606935145e-07,
"loss": 0.3501,
"step": 2130
},
{
"epoch": 0.8866279069767442,
"grad_norm": 2.307696986215917,
"learning_rate": 3.8651891970179876e-07,
"loss": 0.3509,
"step": 2135
},
{
"epoch": 0.8887043189368771,
"grad_norm": 2.2638655900879323,
"learning_rate": 3.7266734235307357e-07,
"loss": 0.3494,
"step": 2140
},
{
"epoch": 0.8907807308970099,
"grad_norm": 2.4074516319355865,
"learning_rate": 3.5905890183525916e-07,
"loss": 0.3381,
"step": 2145
},
{
"epoch": 0.8928571428571429,
"grad_norm": 2.4580735039851263,
"learning_rate": 3.4569431318514647e-07,
"loss": 0.3506,
"step": 2150
},
{
"epoch": 0.8949335548172758,
"grad_norm": 2.223651003352099,
"learning_rate": 3.3257427862666894e-07,
"loss": 0.3426,
"step": 2155
},
{
"epoch": 0.8970099667774086,
"grad_norm": 2.5240054200803925,
"learning_rate": 3.196994875339976e-07,
"loss": 0.3394,
"step": 2160
},
{
"epoch": 0.8990863787375415,
"grad_norm": 2.650418412385108,
"learning_rate": 3.0707061639532687e-07,
"loss": 0.3469,
"step": 2165
},
{
"epoch": 0.9011627906976745,
"grad_norm": 2.5283079967315256,
"learning_rate": 2.946883287773211e-07,
"loss": 0.3572,
"step": 2170
},
{
"epoch": 0.9032392026578073,
"grad_norm": 2.482824449172331,
"learning_rate": 2.82553275290256e-07,
"loss": 0.3469,
"step": 2175
},
{
"epoch": 0.9053156146179402,
"grad_norm": 2.42162117653704,
"learning_rate": 2.706660935538297e-07,
"loss": 0.3522,
"step": 2180
},
{
"epoch": 0.907392026578073,
"grad_norm": 2.610628055343181,
"learning_rate": 2.590274081636568e-07,
"loss": 0.3326,
"step": 2185
},
{
"epoch": 0.909468438538206,
"grad_norm": 2.337754822501405,
"learning_rate": 2.476378306584576e-07,
"loss": 0.3472,
"step": 2190
},
{
"epoch": 0.9115448504983389,
"grad_norm": 2.422013772805342,
"learning_rate": 2.3649795948791744e-07,
"loss": 0.3291,
"step": 2195
},
{
"epoch": 0.9136212624584718,
"grad_norm": 2.5260012444754865,
"learning_rate": 2.2560837998124862e-07,
"loss": 0.3443,
"step": 2200
},
{
"epoch": 0.9156976744186046,
"grad_norm": 2.5167784300702203,
"learning_rate": 2.1496966431642895e-07,
"loss": 0.344,
"step": 2205
},
{
"epoch": 0.9177740863787376,
"grad_norm": 2.5184080924547976,
"learning_rate": 2.0458237149014347e-07,
"loss": 0.3431,
"step": 2210
},
{
"epoch": 0.9198504983388704,
"grad_norm": 2.6121850478268915,
"learning_rate": 1.944470472884097e-07,
"loss": 0.3469,
"step": 2215
},
{
"epoch": 0.9219269102990033,
"grad_norm": 2.4250182138955987,
"learning_rate": 1.8456422425789822e-07,
"loss": 0.346,
"step": 2220
},
{
"epoch": 0.9240033222591362,
"grad_norm": 2.4126854578567056,
"learning_rate": 1.7493442167795526e-07,
"loss": 0.3394,
"step": 2225
},
{
"epoch": 0.9260797342192691,
"grad_norm": 2.2732400743037546,
"learning_rate": 1.6555814553331328e-07,
"loss": 0.3474,
"step": 2230
},
{
"epoch": 0.928156146179402,
"grad_norm": 2.4576436036196867,
"learning_rate": 1.5643588848750944e-07,
"loss": 0.3455,
"step": 2235
},
{
"epoch": 0.9302325581395349,
"grad_norm": 2.417373647969096,
"learning_rate": 1.4756812985699364e-07,
"loss": 0.3389,
"step": 2240
},
{
"epoch": 0.9323089700996677,
"grad_norm": 2.314864797926019,
"learning_rate": 1.3895533558594853e-07,
"loss": 0.3307,
"step": 2245
},
{
"epoch": 0.9343853820598007,
"grad_norm": 2.4942438872944375,
"learning_rate": 1.305979582218042e-07,
"loss": 0.3413,
"step": 2250
},
{
"epoch": 0.9364617940199336,
"grad_norm": 2.4271492623044733,
"learning_rate": 1.224964368914622e-07,
"loss": 0.3533,
"step": 2255
},
{
"epoch": 0.9385382059800664,
"grad_norm": 2.404072393255019,
"learning_rate": 1.1465119727821828e-07,
"loss": 0.3388,
"step": 2260
},
{
"epoch": 0.9406146179401993,
"grad_norm": 2.4291366569357233,
"learning_rate": 1.0706265159939944e-07,
"loss": 0.329,
"step": 2265
},
{
"epoch": 0.9426910299003323,
"grad_norm": 2.370319609790916,
"learning_rate": 9.973119858470326e-08,
"loss": 0.3435,
"step": 2270
},
{
"epoch": 0.9447674418604651,
"grad_norm": 2.612518036597659,
"learning_rate": 9.265722345524475e-08,
"loss": 0.3544,
"step": 2275
},
{
"epoch": 0.946843853820598,
"grad_norm": 2.325383175606347,
"learning_rate": 8.584109790331918e-08,
"loss": 0.334,
"step": 2280
},
{
"epoch": 0.9489202657807309,
"grad_norm": 2.483650038896797,
"learning_rate": 7.92831800728705e-08,
"loss": 0.3495,
"step": 2285
},
{
"epoch": 0.9509966777408638,
"grad_norm": 2.3917415303858323,
"learning_rate": 7.29838145406725e-08,
"loss": 0.3525,
"step": 2290
},
{
"epoch": 0.9530730897009967,
"grad_norm": 2.3491361755345297,
"learning_rate": 6.69433322982238e-08,
"loss": 0.3261,
"step": 2295
},
{
"epoch": 0.9551495016611296,
"grad_norm": 2.395546616132027,
"learning_rate": 6.116205073435632e-08,
"loss": 0.3572,
"step": 2300
},
{
"epoch": 0.9572259136212624,
"grad_norm": 2.38127790933904,
"learning_rate": 5.5640273618560724e-08,
"loss": 0.3477,
"step": 2305
},
{
"epoch": 0.9593023255813954,
"grad_norm": 2.437345577309693,
"learning_rate": 5.0378291085020905e-08,
"loss": 0.3498,
"step": 2310
},
{
"epoch": 0.9613787375415282,
"grad_norm": 2.3743576771009125,
"learning_rate": 4.537637961737285e-08,
"loss": 0.3537,
"step": 2315
},
{
"epoch": 0.9634551495016611,
"grad_norm": 2.4186159686143816,
"learning_rate": 4.063480203417625e-08,
"loss": 0.3491,
"step": 2320
},
{
"epoch": 0.965531561461794,
"grad_norm": 2.43658746364112,
"learning_rate": 3.6153807475103886e-08,
"loss": 0.3372,
"step": 2325
},
{
"epoch": 0.967607973421927,
"grad_norm": 2.433100952556644,
"learning_rate": 3.1933631387853215e-08,
"loss": 0.34,
"step": 2330
},
{
"epoch": 0.9696843853820598,
"grad_norm": 2.3533082714101288,
"learning_rate": 2.7974495515772915e-08,
"loss": 0.3478,
"step": 2335
},
{
"epoch": 0.9717607973421927,
"grad_norm": 2.3042817476032296,
"learning_rate": 2.427660788621222e-08,
"loss": 0.3522,
"step": 2340
},
{
"epoch": 0.9738372093023255,
"grad_norm": 2.5758246509298184,
"learning_rate": 2.0840162799591335e-08,
"loss": 0.3518,
"step": 2345
},
{
"epoch": 0.9759136212624585,
"grad_norm": 2.2839484862848254,
"learning_rate": 1.7665340819192356e-08,
"loss": 0.3412,
"step": 2350
},
{
"epoch": 0.9779900332225914,
"grad_norm": 2.3825796160738184,
"learning_rate": 1.475230876166911e-08,
"loss": 0.3484,
"step": 2355
},
{
"epoch": 0.9800664451827242,
"grad_norm": 2.436946151591597,
"learning_rate": 1.2101219688285815e-08,
"loss": 0.3406,
"step": 2360
},
{
"epoch": 0.9821428571428571,
"grad_norm": 2.3004422772721385,
"learning_rate": 9.712212896871854e-09,
"loss": 0.3483,
"step": 2365
},
{
"epoch": 0.9842192691029901,
"grad_norm": 2.243191260776767,
"learning_rate": 7.585413914503182e-09,
"loss": 0.3279,
"step": 2370
},
{
"epoch": 0.9862956810631229,
"grad_norm": 2.4579139874339213,
"learning_rate": 5.720934490907604e-09,
"loss": 0.3539,
"step": 2375
},
{
"epoch": 0.9883720930232558,
"grad_norm": 2.2560463637497885,
"learning_rate": 4.118872592592804e-09,
"loss": 0.3376,
"step": 2380
},
{
"epoch": 0.9904485049833887,
"grad_norm": 2.4374426699588327,
"learning_rate": 2.7793123976976866e-09,
"loss": 0.337,
"step": 2385
},
{
"epoch": 0.9925249169435216,
"grad_norm": 2.4021295260594466,
"learning_rate": 1.7023242915703563e-09,
"loss": 0.3422,
"step": 2390
},
{
"epoch": 0.9946013289036545,
"grad_norm": 2.3647029145641847,
"learning_rate": 8.879648630705229e-10,
"loss": 0.3402,
"step": 2395
},
{
"epoch": 0.9966777408637874,
"grad_norm": 2.377691718973852,
"learning_rate": 3.362769015941014e-10,
"loss": 0.3437,
"step": 2400
},
{
"epoch": 0.9987541528239202,
"grad_norm": 2.277353937646912,
"learning_rate": 4.7289394825567046e-11,
"loss": 0.3486,
"step": 2405
},
{
"epoch": 1.0,
"eval_runtime": 3.4135,
"eval_samples_per_second": 2.93,
"eval_steps_per_second": 0.879,
"step": 2408
},
{
"epoch": 1.0,
"step": 2408,
"total_flos": 252093105438720.0,
"train_loss": 0.0,
"train_runtime": 0.0085,
"train_samples_per_second": 4527521.09,
"train_steps_per_second": 283028.837
}
],
"logging_steps": 5,
"max_steps": 2408,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 252093105438720.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}