{
"best_metric": 6.436838150024414,
"best_model_checkpoint": "./results/models/checkpoint-434265",
"epoch": 17.0,
"eval_steps": 500,
"global_step": 434265,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019573302016050106,
"grad_norm": 0.59765625,
"learning_rate": 0.000999608533959679,
"loss": 7.0053,
"step": 500
},
{
"epoch": 0.03914660403210021,
"grad_norm": 0.625,
"learning_rate": 0.000999217067919358,
"loss": 6.8806,
"step": 1000
},
{
"epoch": 0.058719906048150326,
"grad_norm": 0.79296875,
"learning_rate": 0.0009988256018790371,
"loss": 6.8512,
"step": 1500
},
{
"epoch": 0.07829320806420043,
"grad_norm": 4.4375,
"learning_rate": 0.000998434135838716,
"loss": 6.8494,
"step": 2000
},
{
"epoch": 0.09786651008025053,
"grad_norm": 1.3203125,
"learning_rate": 0.000998042669798395,
"loss": 6.8296,
"step": 2500
},
{
"epoch": 0.11743981209630065,
"grad_norm": 1.7265625,
"learning_rate": 0.000997651203758074,
"loss": 6.8209,
"step": 3000
},
{
"epoch": 0.13701311411235076,
"grad_norm": 0.93359375,
"learning_rate": 0.0009972597377177531,
"loss": 6.8119,
"step": 3500
},
{
"epoch": 0.15658641612840085,
"grad_norm": 0.95703125,
"learning_rate": 0.000996868271677432,
"loss": 6.8096,
"step": 4000
},
{
"epoch": 0.17615971814445097,
"grad_norm": 1.0546875,
"learning_rate": 0.0009964768056371109,
"loss": 6.7987,
"step": 4500
},
{
"epoch": 0.19573302016050106,
"grad_norm": 0.94921875,
"learning_rate": 0.00099608533959679,
"loss": 6.7946,
"step": 5000
},
{
"epoch": 0.21530632217655118,
"grad_norm": 2.34375,
"learning_rate": 0.000995693873556469,
"loss": 6.7825,
"step": 5500
},
{
"epoch": 0.2348796241926013,
"grad_norm": 0.94140625,
"learning_rate": 0.000995302407516148,
"loss": 6.7724,
"step": 6000
},
{
"epoch": 0.2544529262086514,
"grad_norm": 0.8671875,
"learning_rate": 0.0009949109414758269,
"loss": 6.7732,
"step": 6500
},
{
"epoch": 0.2740262282247015,
"grad_norm": 1.3671875,
"learning_rate": 0.000994519475435506,
"loss": 6.7652,
"step": 7000
},
{
"epoch": 0.29359953024075164,
"grad_norm": 1.046875,
"learning_rate": 0.000994128009395185,
"loss": 6.7589,
"step": 7500
},
{
"epoch": 0.3131728322568017,
"grad_norm": 0.96484375,
"learning_rate": 0.000993736543354864,
"loss": 6.7583,
"step": 8000
},
{
"epoch": 0.3327461342728518,
"grad_norm": 1.5234375,
"learning_rate": 0.0009933450773145429,
"loss": 6.7515,
"step": 8500
},
{
"epoch": 0.35231943628890194,
"grad_norm": 2.234375,
"learning_rate": 0.000992953611274222,
"loss": 6.7467,
"step": 9000
},
{
"epoch": 0.37189273830495206,
"grad_norm": 1.46875,
"learning_rate": 0.000992562145233901,
"loss": 6.7366,
"step": 9500
},
{
"epoch": 0.39146604032100213,
"grad_norm": 1.1484375,
"learning_rate": 0.00099217067919358,
"loss": 6.7365,
"step": 10000
},
{
"epoch": 0.41103934233705225,
"grad_norm": 1.4375,
"learning_rate": 0.000991779213153259,
"loss": 6.7253,
"step": 10500
},
{
"epoch": 0.43061264435310237,
"grad_norm": 0.87109375,
"learning_rate": 0.000991387747112938,
"loss": 6.7356,
"step": 11000
},
{
"epoch": 0.4501859463691525,
"grad_norm": 1.484375,
"learning_rate": 0.000990996281072617,
"loss": 6.7177,
"step": 11500
},
{
"epoch": 0.4697592483852026,
"grad_norm": 3.15625,
"learning_rate": 0.000990604815032296,
"loss": 6.7195,
"step": 12000
},
{
"epoch": 0.4893325504012527,
"grad_norm": 0.9140625,
"learning_rate": 0.000990213348991975,
"loss": 6.7202,
"step": 12500
},
{
"epoch": 0.5089058524173028,
"grad_norm": 1.046875,
"learning_rate": 0.000989821882951654,
"loss": 6.7183,
"step": 13000
},
{
"epoch": 0.5284791544333529,
"grad_norm": 0.91015625,
"learning_rate": 0.000989430416911333,
"loss": 6.7186,
"step": 13500
},
{
"epoch": 0.548052456449403,
"grad_norm": 1.4765625,
"learning_rate": 0.000989038950871012,
"loss": 6.7221,
"step": 14000
},
{
"epoch": 0.5676257584654532,
"grad_norm": 1.3515625,
"learning_rate": 0.000988647484830691,
"loss": 6.7046,
"step": 14500
},
{
"epoch": 0.5871990604815033,
"grad_norm": 1.1640625,
"learning_rate": 0.00098825601879037,
"loss": 6.7078,
"step": 15000
},
{
"epoch": 0.6067723624975533,
"grad_norm": 1.109375,
"learning_rate": 0.000987864552750049,
"loss": 6.7063,
"step": 15500
},
{
"epoch": 0.6263456645136034,
"grad_norm": 4.8125,
"learning_rate": 0.000987473086709728,
"loss": 6.6989,
"step": 16000
},
{
"epoch": 0.6459189665296535,
"grad_norm": 1.3125,
"learning_rate": 0.000987081620669407,
"loss": 6.6951,
"step": 16500
},
{
"epoch": 0.6654922685457036,
"grad_norm": 0.890625,
"learning_rate": 0.000986690154629086,
"loss": 6.6921,
"step": 17000
},
{
"epoch": 0.6850655705617538,
"grad_norm": 1.7890625,
"learning_rate": 0.0009862986885887648,
"loss": 6.6921,
"step": 17500
},
{
"epoch": 0.7046388725778039,
"grad_norm": 7.75,
"learning_rate": 0.000985907222548444,
"loss": 6.6884,
"step": 18000
},
{
"epoch": 0.724212174593854,
"grad_norm": 1.203125,
"learning_rate": 0.000985515756508123,
"loss": 6.6841,
"step": 18500
},
{
"epoch": 0.7437854766099041,
"grad_norm": 7.46875,
"learning_rate": 0.000985124290467802,
"loss": 6.6904,
"step": 19000
},
{
"epoch": 0.7633587786259542,
"grad_norm": 1.6171875,
"learning_rate": 0.0009847328244274808,
"loss": 6.6847,
"step": 19500
},
{
"epoch": 0.7829320806420043,
"grad_norm": 1.125,
"learning_rate": 0.00098434135838716,
"loss": 6.6799,
"step": 20000
},
{
"epoch": 0.8025053826580544,
"grad_norm": 1.734375,
"learning_rate": 0.000983949892346839,
"loss": 6.6814,
"step": 20500
},
{
"epoch": 0.8220786846741045,
"grad_norm": 1.4921875,
"learning_rate": 0.000983558426306518,
"loss": 6.6767,
"step": 21000
},
{
"epoch": 0.8416519866901546,
"grad_norm": 4.5,
"learning_rate": 0.0009831669602661968,
"loss": 6.6714,
"step": 21500
},
{
"epoch": 0.8612252887062047,
"grad_norm": 1.75,
"learning_rate": 0.000982775494225876,
"loss": 6.6725,
"step": 22000
},
{
"epoch": 0.8807985907222549,
"grad_norm": 1.7265625,
"learning_rate": 0.000982384028185555,
"loss": 6.6646,
"step": 22500
},
{
"epoch": 0.900371892738305,
"grad_norm": 2.25,
"learning_rate": 0.000981992562145234,
"loss": 6.6636,
"step": 23000
},
{
"epoch": 0.9199451947543551,
"grad_norm": 2.203125,
"learning_rate": 0.0009816010961049128,
"loss": 6.6506,
"step": 23500
},
{
"epoch": 0.9395184967704052,
"grad_norm": 2.96875,
"learning_rate": 0.000981209630064592,
"loss": 6.6546,
"step": 24000
},
{
"epoch": 0.9590917987864552,
"grad_norm": 1.0546875,
"learning_rate": 0.000980818164024271,
"loss": 6.6504,
"step": 24500
},
{
"epoch": 0.9786651008025053,
"grad_norm": 3.421875,
"learning_rate": 0.0009804266979839499,
"loss": 6.6499,
"step": 25000
},
{
"epoch": 0.9982384028185555,
"grad_norm": 3.0625,
"learning_rate": 0.0009800352319436288,
"loss": 6.6422,
"step": 25500
},
{
"epoch": 1.0,
"eval_loss": 6.643181800842285,
"eval_runtime": 23.6647,
"eval_samples_per_second": 84.514,
"eval_steps_per_second": 5.282,
"step": 25545
},
{
"epoch": 1.0178117048346056,
"grad_norm": 1.7578125,
"learning_rate": 0.0009796437659033079,
"loss": 6.6453,
"step": 26000
},
{
"epoch": 1.0373850068506556,
"grad_norm": 0.84375,
"learning_rate": 0.000979252299862987,
"loss": 6.6465,
"step": 26500
},
{
"epoch": 1.0569583088667058,
"grad_norm": 2.421875,
"learning_rate": 0.0009788608338226659,
"loss": 6.6497,
"step": 27000
},
{
"epoch": 1.0765316108827558,
"grad_norm": 1.3515625,
"learning_rate": 0.0009784693677823448,
"loss": 6.642,
"step": 27500
},
{
"epoch": 1.096104912898806,
"grad_norm": 0.89453125,
"learning_rate": 0.0009780779017420239,
"loss": 6.6384,
"step": 28000
},
{
"epoch": 1.115678214914856,
"grad_norm": 0.88671875,
"learning_rate": 0.000977686435701703,
"loss": 6.6389,
"step": 28500
},
{
"epoch": 1.1352515169309063,
"grad_norm": 1.203125,
"learning_rate": 0.0009772949696613819,
"loss": 6.6361,
"step": 29000
},
{
"epoch": 1.1548248189469563,
"grad_norm": 1.140625,
"learning_rate": 0.0009769035036210608,
"loss": 6.6393,
"step": 29500
},
{
"epoch": 1.1743981209630066,
"grad_norm": 1.421875,
"learning_rate": 0.00097651203758074,
"loss": 6.6413,
"step": 30000
},
{
"epoch": 1.1939714229790566,
"grad_norm": 1.421875,
"learning_rate": 0.0009761205715404189,
"loss": 6.6334,
"step": 30500
},
{
"epoch": 1.2135447249951068,
"grad_norm": 1.390625,
"learning_rate": 0.0009757291055000979,
"loss": 6.6316,
"step": 31000
},
{
"epoch": 1.2331180270111568,
"grad_norm": 2.015625,
"learning_rate": 0.000975337639459777,
"loss": 6.6257,
"step": 31500
},
{
"epoch": 1.2526913290272068,
"grad_norm": 2.0,
"learning_rate": 0.000974946173419456,
"loss": 6.6277,
"step": 32000
},
{
"epoch": 1.272264631043257,
"grad_norm": 1.4765625,
"learning_rate": 0.0009745547073791348,
"loss": 6.6333,
"step": 32500
},
{
"epoch": 1.291837933059307,
"grad_norm": 1.28125,
"learning_rate": 0.0009741632413388138,
"loss": 6.6241,
"step": 33000
},
{
"epoch": 1.3114112350753573,
"grad_norm": 1.078125,
"learning_rate": 0.000973771775298493,
"loss": 6.6336,
"step": 33500
},
{
"epoch": 1.3309845370914073,
"grad_norm": 3.359375,
"learning_rate": 0.0009733803092581718,
"loss": 6.6298,
"step": 34000
},
{
"epoch": 1.3505578391074575,
"grad_norm": 1.5,
"learning_rate": 0.0009729888432178508,
"loss": 6.6248,
"step": 34500
},
{
"epoch": 1.3701311411235075,
"grad_norm": 1.2109375,
"learning_rate": 0.0009725973771775298,
"loss": 6.6291,
"step": 35000
},
{
"epoch": 1.3897044431395575,
"grad_norm": 2.125,
"learning_rate": 0.0009722059111372089,
"loss": 6.6224,
"step": 35500
},
{
"epoch": 1.4092777451556078,
"grad_norm": 1.375,
"learning_rate": 0.0009718144450968878,
"loss": 6.6194,
"step": 36000
},
{
"epoch": 1.4288510471716578,
"grad_norm": 1.859375,
"learning_rate": 0.0009714229790565668,
"loss": 6.6127,
"step": 36500
},
{
"epoch": 1.448424349187708,
"grad_norm": 1.234375,
"learning_rate": 0.0009710315130162458,
"loss": 6.6062,
"step": 37000
},
{
"epoch": 1.467997651203758,
"grad_norm": 1.078125,
"learning_rate": 0.0009706400469759249,
"loss": 6.6188,
"step": 37500
},
{
"epoch": 1.4875709532198083,
"grad_norm": 2.125,
"learning_rate": 0.0009702485809356038,
"loss": 6.6132,
"step": 38000
},
{
"epoch": 1.5071442552358583,
"grad_norm": 3.34375,
"learning_rate": 0.0009698571148952828,
"loss": 6.6034,
"step": 38500
},
{
"epoch": 1.5267175572519083,
"grad_norm": 2.359375,
"learning_rate": 0.0009694656488549618,
"loss": 6.6049,
"step": 39000
},
{
"epoch": 1.5462908592679585,
"grad_norm": 3.65625,
"learning_rate": 0.0009690741828146409,
"loss": 6.6077,
"step": 39500
},
{
"epoch": 1.5658641612840087,
"grad_norm": 1.3046875,
"learning_rate": 0.0009686827167743198,
"loss": 6.6054,
"step": 40000
},
{
"epoch": 1.5854374633000587,
"grad_norm": 3.140625,
"learning_rate": 0.0009682912507339988,
"loss": 6.6109,
"step": 40500
},
{
"epoch": 1.6050107653161088,
"grad_norm": 1.21875,
"learning_rate": 0.0009678997846936779,
"loss": 6.6122,
"step": 41000
},
{
"epoch": 1.624584067332159,
"grad_norm": 1.3984375,
"learning_rate": 0.0009675083186533569,
"loss": 6.6049,
"step": 41500
},
{
"epoch": 1.644157369348209,
"grad_norm": 2.5625,
"learning_rate": 0.0009671168526130358,
"loss": 6.6004,
"step": 42000
},
{
"epoch": 1.663730671364259,
"grad_norm": 6.8125,
"learning_rate": 0.0009667253865727148,
"loss": 6.608,
"step": 42500
},
{
"epoch": 1.6833039733803092,
"grad_norm": 1.34375,
"learning_rate": 0.0009663339205323939,
"loss": 6.5973,
"step": 43000
},
{
"epoch": 1.7028772753963595,
"grad_norm": 2.328125,
"learning_rate": 0.0009659424544920729,
"loss": 6.5984,
"step": 43500
},
{
"epoch": 1.7224505774124095,
"grad_norm": 2.90625,
"learning_rate": 0.0009655509884517518,
"loss": 6.6001,
"step": 44000
},
{
"epoch": 1.7420238794284595,
"grad_norm": 1.1171875,
"learning_rate": 0.0009651595224114308,
"loss": 6.6016,
"step": 44500
},
{
"epoch": 1.7615971814445097,
"grad_norm": 2.84375,
"learning_rate": 0.0009647680563711099,
"loss": 6.598,
"step": 45000
},
{
"epoch": 1.78117048346056,
"grad_norm": 2.03125,
"learning_rate": 0.0009643765903307889,
"loss": 6.6052,
"step": 45500
},
{
"epoch": 1.80074378547661,
"grad_norm": 1.421875,
"learning_rate": 0.0009639851242904678,
"loss": 6.5902,
"step": 46000
},
{
"epoch": 1.82031708749266,
"grad_norm": 1.8671875,
"learning_rate": 0.0009635936582501468,
"loss": 6.5847,
"step": 46500
},
{
"epoch": 1.8398903895087102,
"grad_norm": 6.15625,
"learning_rate": 0.0009632021922098259,
"loss": 6.5948,
"step": 47000
},
{
"epoch": 1.8594636915247602,
"grad_norm": 9.8125,
"learning_rate": 0.0009628107261695049,
"loss": 6.6071,
"step": 47500
},
{
"epoch": 1.8790369935408102,
"grad_norm": 3.515625,
"learning_rate": 0.0009624192601291838,
"loss": 6.5973,
"step": 48000
},
{
"epoch": 1.8986102955568605,
"grad_norm": 4.5625,
"learning_rate": 0.0009620277940888628,
"loss": 6.5917,
"step": 48500
},
{
"epoch": 1.9181835975729107,
"grad_norm": 0.83984375,
"learning_rate": 0.0009616363280485419,
"loss": 6.5937,
"step": 49000
},
{
"epoch": 1.9377568995889607,
"grad_norm": 2.984375,
"learning_rate": 0.0009612448620082208,
"loss": 6.5919,
"step": 49500
},
{
"epoch": 1.9573302016050107,
"grad_norm": 1.90625,
"learning_rate": 0.0009608533959678998,
"loss": 6.5885,
"step": 50000
},
{
"epoch": 1.976903503621061,
"grad_norm": 3.203125,
"learning_rate": 0.0009604619299275788,
"loss": 6.6004,
"step": 50500
},
{
"epoch": 1.996476805637111,
"grad_norm": 7.4375,
"learning_rate": 0.0009600704638872579,
"loss": 6.5952,
"step": 51000
},
{
"epoch": 2.0,
"eval_loss": 6.583548545837402,
"eval_runtime": 20.5988,
"eval_samples_per_second": 97.093,
"eval_steps_per_second": 6.068,
"step": 51090
},
{
"epoch": 2.016050107653161,
"grad_norm": 1.4296875,
"learning_rate": 0.0009596789978469367,
"loss": 6.5932,
"step": 51500
},
{
"epoch": 2.035623409669211,
"grad_norm": 1.1796875,
"learning_rate": 0.0009592875318066157,
"loss": 6.5895,
"step": 52000
},
{
"epoch": 2.0551967116852614,
"grad_norm": 9.0625,
"learning_rate": 0.0009588960657662949,
"loss": 6.5814,
"step": 52500
},
{
"epoch": 2.074770013701311,
"grad_norm": 5.78125,
"learning_rate": 0.0009585045997259738,
"loss": 6.5826,
"step": 53000
},
{
"epoch": 2.0943433157173614,
"grad_norm": 1.5078125,
"learning_rate": 0.0009581131336856527,
"loss": 6.591,
"step": 53500
},
{
"epoch": 2.1139166177334117,
"grad_norm": 3.921875,
"learning_rate": 0.0009577216676453317,
"loss": 6.5807,
"step": 54000
},
{
"epoch": 2.133489919749462,
"grad_norm": 3.09375,
"learning_rate": 0.0009573302016050108,
"loss": 6.5793,
"step": 54500
},
{
"epoch": 2.1530632217655117,
"grad_norm": 2.953125,
"learning_rate": 0.0009569387355646898,
"loss": 6.5854,
"step": 55000
},
{
"epoch": 2.172636523781562,
"grad_norm": 5.53125,
"learning_rate": 0.0009565472695243687,
"loss": 6.5796,
"step": 55500
},
{
"epoch": 2.192209825797612,
"grad_norm": 1.3359375,
"learning_rate": 0.0009561558034840477,
"loss": 6.5693,
"step": 56000
},
{
"epoch": 2.2117831278136624,
"grad_norm": 2.21875,
"learning_rate": 0.0009557643374437268,
"loss": 6.5837,
"step": 56500
},
{
"epoch": 2.231356429829712,
"grad_norm": 2.609375,
"learning_rate": 0.0009553728714034058,
"loss": 6.5791,
"step": 57000
},
{
"epoch": 2.2509297318457624,
"grad_norm": 1.9765625,
"learning_rate": 0.0009549814053630847,
"loss": 6.5746,
"step": 57500
},
{
"epoch": 2.2705030338618126,
"grad_norm": 1.6171875,
"learning_rate": 0.0009545899393227637,
"loss": 6.5766,
"step": 58000
},
{
"epoch": 2.2900763358778624,
"grad_norm": 2.9375,
"learning_rate": 0.0009541984732824428,
"loss": 6.5832,
"step": 58500
},
{
"epoch": 2.3096496378939126,
"grad_norm": 3.28125,
"learning_rate": 0.0009538070072421218,
"loss": 6.5732,
"step": 59000
},
{
"epoch": 2.329222939909963,
"grad_norm": 1.0625,
"learning_rate": 0.0009534155412018007,
"loss": 6.5774,
"step": 59500
},
{
"epoch": 2.348796241926013,
"grad_norm": 2.921875,
"learning_rate": 0.0009530240751614797,
"loss": 6.5692,
"step": 60000
},
{
"epoch": 2.368369543942063,
"grad_norm": 1.8046875,
"learning_rate": 0.0009526326091211588,
"loss": 6.5789,
"step": 60500
},
{
"epoch": 2.387942845958113,
"grad_norm": 1.4921875,
"learning_rate": 0.0009522411430808378,
"loss": 6.576,
"step": 61000
},
{
"epoch": 2.4075161479741634,
"grad_norm": 8.6875,
"learning_rate": 0.0009518496770405167,
"loss": 6.5807,
"step": 61500
},
{
"epoch": 2.4270894499902136,
"grad_norm": 2.71875,
"learning_rate": 0.0009514582110001958,
"loss": 6.5753,
"step": 62000
},
{
"epoch": 2.4466627520062634,
"grad_norm": 1.9765625,
"learning_rate": 0.0009510667449598748,
"loss": 6.5799,
"step": 62500
},
{
"epoch": 2.4662360540223136,
"grad_norm": 1.890625,
"learning_rate": 0.0009506752789195538,
"loss": 6.5678,
"step": 63000
},
{
"epoch": 2.485809356038364,
"grad_norm": 1.421875,
"learning_rate": 0.0009502838128792327,
"loss": 6.5683,
"step": 63500
},
{
"epoch": 2.5053826580544136,
"grad_norm": 3.03125,
"learning_rate": 0.0009498923468389118,
"loss": 6.5674,
"step": 64000
},
{
"epoch": 2.524955960070464,
"grad_norm": 0.828125,
"learning_rate": 0.0009495008807985908,
"loss": 6.5606,
"step": 64500
},
{
"epoch": 2.544529262086514,
"grad_norm": 0.8359375,
"learning_rate": 0.0009491094147582697,
"loss": 6.5669,
"step": 65000
},
{
"epoch": 2.564102564102564,
"grad_norm": 1.8671875,
"learning_rate": 0.0009487179487179487,
"loss": 6.5519,
"step": 65500
},
{
"epoch": 2.583675866118614,
"grad_norm": 1.3515625,
"learning_rate": 0.0009483264826776278,
"loss": 6.5573,
"step": 66000
},
{
"epoch": 2.6032491681346643,
"grad_norm": 3.125,
"learning_rate": 0.0009479350166373068,
"loss": 6.5511,
"step": 66500
},
{
"epoch": 2.6228224701507146,
"grad_norm": 1.4453125,
"learning_rate": 0.0009475435505969857,
"loss": 6.5593,
"step": 67000
},
{
"epoch": 2.642395772166765,
"grad_norm": 7.375,
"learning_rate": 0.0009471520845566647,
"loss": 6.5606,
"step": 67500
},
{
"epoch": 2.6619690741828146,
"grad_norm": 1.4765625,
"learning_rate": 0.0009467606185163438,
"loss": 6.5601,
"step": 68000
},
{
"epoch": 2.681542376198865,
"grad_norm": 3.546875,
"learning_rate": 0.0009463691524760228,
"loss": 6.5607,
"step": 68500
},
{
"epoch": 2.701115678214915,
"grad_norm": 3.046875,
"learning_rate": 0.0009459776864357017,
"loss": 6.5462,
"step": 69000
},
{
"epoch": 2.720688980230965,
"grad_norm": 1.59375,
"learning_rate": 0.0009455862203953807,
"loss": 6.567,
"step": 69500
},
{
"epoch": 2.740262282247015,
"grad_norm": 1.5625,
"learning_rate": 0.0009451947543550598,
"loss": 6.5566,
"step": 70000
},
{
"epoch": 2.7598355842630653,
"grad_norm": 4.40625,
"learning_rate": 0.0009448032883147388,
"loss": 6.5543,
"step": 70500
},
{
"epoch": 2.779408886279115,
"grad_norm": 2.640625,
"learning_rate": 0.0009444118222744176,
"loss": 6.5527,
"step": 71000
},
{
"epoch": 2.7989821882951653,
"grad_norm": 1.09375,
"learning_rate": 0.0009440203562340968,
"loss": 6.557,
"step": 71500
},
{
"epoch": 2.8185554903112156,
"grad_norm": 1.0234375,
"learning_rate": 0.0009436288901937757,
"loss": 6.5603,
"step": 72000
},
{
"epoch": 2.8381287923272653,
"grad_norm": 4.46875,
"learning_rate": 0.0009432374241534547,
"loss": 6.5515,
"step": 72500
},
{
"epoch": 2.8577020943433156,
"grad_norm": 3.796875,
"learning_rate": 0.0009428459581131336,
"loss": 6.5506,
"step": 73000
},
{
"epoch": 2.877275396359366,
"grad_norm": 1.1640625,
"learning_rate": 0.0009424544920728127,
"loss": 6.5583,
"step": 73500
},
{
"epoch": 2.896848698375416,
"grad_norm": 1.8671875,
"learning_rate": 0.0009420630260324917,
"loss": 6.5545,
"step": 74000
},
{
"epoch": 2.9164220003914663,
"grad_norm": 8.6875,
"learning_rate": 0.0009416715599921707,
"loss": 6.5564,
"step": 74500
},
{
"epoch": 2.935995302407516,
"grad_norm": 1.28125,
"learning_rate": 0.0009412800939518496,
"loss": 6.5502,
"step": 75000
},
{
"epoch": 2.9555686044235663,
"grad_norm": 6.375,
"learning_rate": 0.0009408886279115287,
"loss": 6.5434,
"step": 75500
},
{
"epoch": 2.9751419064396165,
"grad_norm": 1.8125,
"learning_rate": 0.0009404971618712077,
"loss": 6.5457,
"step": 76000
},
{
"epoch": 2.9947152084556663,
"grad_norm": 2.34375,
"learning_rate": 0.0009401056958308867,
"loss": 6.5616,
"step": 76500
},
{
"epoch": 3.0,
"eval_loss": 6.546030044555664,
"eval_runtime": 20.7765,
"eval_samples_per_second": 96.262,
"eval_steps_per_second": 6.016,
"step": 76635
},
{
"epoch": 3.0142885104717165,
"grad_norm": 2.34375,
"learning_rate": 0.0009397142297905656,
"loss": 6.545,
"step": 77000
},
{
"epoch": 3.0338618124877668,
"grad_norm": 1.6328125,
"learning_rate": 0.0009393227637502447,
"loss": 6.5504,
"step": 77500
},
{
"epoch": 3.053435114503817,
"grad_norm": 1.453125,
"learning_rate": 0.0009389312977099237,
"loss": 6.5508,
"step": 78000
},
{
"epoch": 3.073008416519867,
"grad_norm": 3.328125,
"learning_rate": 0.0009385398316696027,
"loss": 6.5495,
"step": 78500
},
{
"epoch": 3.092581718535917,
"grad_norm": 1.9609375,
"learning_rate": 0.0009381483656292816,
"loss": 6.5518,
"step": 79000
},
{
"epoch": 3.1121550205519672,
"grad_norm": 2.578125,
"learning_rate": 0.0009377568995889607,
"loss": 6.5535,
"step": 79500
},
{
"epoch": 3.131728322568017,
"grad_norm": 1.65625,
"learning_rate": 0.0009373654335486397,
"loss": 6.5451,
"step": 80000
},
{
"epoch": 3.1513016245840673,
"grad_norm": 7.125,
"learning_rate": 0.0009369739675083186,
"loss": 6.545,
"step": 80500
},
{
"epoch": 3.1708749266001175,
"grad_norm": 2.671875,
"learning_rate": 0.0009365825014679976,
"loss": 6.5379,
"step": 81000
},
{
"epoch": 3.1904482286161677,
"grad_norm": 2.125,
"learning_rate": 0.0009361910354276767,
"loss": 6.5455,
"step": 81500
},
{
"epoch": 3.2100215306322175,
"grad_norm": 1.171875,
"learning_rate": 0.0009357995693873557,
"loss": 6.5449,
"step": 82000
},
{
"epoch": 3.2295948326482677,
"grad_norm": 2.375,
"learning_rate": 0.0009354081033470346,
"loss": 6.5413,
"step": 82500
},
{
"epoch": 3.249168134664318,
"grad_norm": 3.578125,
"learning_rate": 0.0009350166373067137,
"loss": 6.5442,
"step": 83000
},
{
"epoch": 3.2687414366803678,
"grad_norm": 1.3125,
"learning_rate": 0.0009346251712663927,
"loss": 6.5454,
"step": 83500
},
{
"epoch": 3.288314738696418,
"grad_norm": 1.265625,
"learning_rate": 0.0009342337052260717,
"loss": 6.5383,
"step": 84000
},
{
"epoch": 3.3078880407124682,
"grad_norm": 2.28125,
"learning_rate": 0.0009338422391857506,
"loss": 6.5521,
"step": 84500
},
{
"epoch": 3.3274613427285185,
"grad_norm": 3.40625,
"learning_rate": 0.0009334507731454297,
"loss": 6.5454,
"step": 85000
},
{
"epoch": 3.3470346447445682,
"grad_norm": 1.546875,
"learning_rate": 0.0009330593071051087,
"loss": 6.5394,
"step": 85500
},
{
"epoch": 3.3666079467606185,
"grad_norm": 1.453125,
"learning_rate": 0.0009326678410647877,
"loss": 6.5398,
"step": 86000
},
{
"epoch": 3.3861812487766687,
"grad_norm": 1.703125,
"learning_rate": 0.0009322763750244666,
"loss": 6.5434,
"step": 86500
},
{
"epoch": 3.405754550792719,
"grad_norm": 1.75,
"learning_rate": 0.0009318849089841457,
"loss": 6.5355,
"step": 87000
},
{
"epoch": 3.4253278528087687,
"grad_norm": 1.0390625,
"learning_rate": 0.0009314934429438247,
"loss": 6.5417,
"step": 87500
},
{
"epoch": 3.444901154824819,
"grad_norm": 1.234375,
"learning_rate": 0.0009311019769035037,
"loss": 6.5454,
"step": 88000
},
{
"epoch": 3.464474456840869,
"grad_norm": 4.15625,
"learning_rate": 0.0009307105108631826,
"loss": 6.5316,
"step": 88500
},
{
"epoch": 3.484047758856919,
"grad_norm": 3.296875,
"learning_rate": 0.0009303190448228617,
"loss": 6.5429,
"step": 89000
},
{
"epoch": 3.503621060872969,
"grad_norm": 2.65625,
"learning_rate": 0.0009299275787825407,
"loss": 6.5424,
"step": 89500
},
{
"epoch": 3.5231943628890194,
"grad_norm": 2.8125,
"learning_rate": 0.0009295361127422197,
"loss": 6.5447,
"step": 90000
},
{
"epoch": 3.5427676649050692,
"grad_norm": 1.2421875,
"learning_rate": 0.0009291446467018985,
"loss": 6.5418,
"step": 90500
},
{
"epoch": 3.5623409669211195,
"grad_norm": 1.1484375,
"learning_rate": 0.0009287531806615776,
"loss": 6.5405,
"step": 91000
},
{
"epoch": 3.5819142689371697,
"grad_norm": 1.328125,
"learning_rate": 0.0009283617146212566,
"loss": 6.5403,
"step": 91500
},
{
"epoch": 3.60148757095322,
"grad_norm": 1.2734375,
"learning_rate": 0.0009279702485809356,
"loss": 6.5384,
"step": 92000
},
{
"epoch": 3.62106087296927,
"grad_norm": 1.203125,
"learning_rate": 0.0009275787825406146,
"loss": 6.542,
"step": 92500
},
{
"epoch": 3.64063417498532,
"grad_norm": 3.75,
"learning_rate": 0.0009271873165002936,
"loss": 6.538,
"step": 93000
},
{
"epoch": 3.66020747700137,
"grad_norm": 2.09375,
"learning_rate": 0.0009267958504599726,
"loss": 6.5282,
"step": 93500
},
{
"epoch": 3.6797807790174204,
"grad_norm": 1.4609375,
"learning_rate": 0.0009264043844196516,
"loss": 6.5442,
"step": 94000
},
{
"epoch": 3.69935408103347,
"grad_norm": 3.015625,
"learning_rate": 0.0009260129183793306,
"loss": 6.5425,
"step": 94500
},
{
"epoch": 3.7189273830495204,
"grad_norm": 0.953125,
"learning_rate": 0.0009256214523390096,
"loss": 6.5375,
"step": 95000
},
{
"epoch": 3.7385006850655707,
"grad_norm": 1.734375,
"learning_rate": 0.0009252299862986886,
"loss": 6.5488,
"step": 95500
},
{
"epoch": 3.7580739870816204,
"grad_norm": 1.4453125,
"learning_rate": 0.0009248385202583675,
"loss": 6.5497,
"step": 96000
},
{
"epoch": 3.7776472890976707,
"grad_norm": 1.3359375,
"learning_rate": 0.0009244470542180466,
"loss": 6.5467,
"step": 96500
},
{
"epoch": 3.797220591113721,
"grad_norm": 1.7578125,
"learning_rate": 0.0009240555881777256,
"loss": 6.5442,
"step": 97000
},
{
"epoch": 3.816793893129771,
"grad_norm": 1.40625,
"learning_rate": 0.0009236641221374046,
"loss": 6.5393,
"step": 97500
},
{
"epoch": 3.8363671951458214,
"grad_norm": 1.34375,
"learning_rate": 0.0009232726560970835,
"loss": 6.5378,
"step": 98000
},
{
"epoch": 3.855940497161871,
"grad_norm": 1.6484375,
"learning_rate": 0.0009228811900567626,
"loss": 6.5469,
"step": 98500
},
{
"epoch": 3.8755137991779214,
"grad_norm": 2.203125,
"learning_rate": 0.0009224897240164416,
"loss": 6.5322,
"step": 99000
},
{
"epoch": 3.8950871011939716,
"grad_norm": 62.75,
"learning_rate": 0.0009220982579761206,
"loss": 6.5472,
"step": 99500
},
{
"epoch": 3.9146604032100214,
"grad_norm": 8.0,
"learning_rate": 0.0009217067919357995,
"loss": 6.5364,
"step": 100000
},
{
"epoch": 3.9342337052260716,
"grad_norm": 2.796875,
"learning_rate": 0.0009213153258954786,
"loss": 6.5356,
"step": 100500
},
{
"epoch": 3.953807007242122,
"grad_norm": 1.1484375,
"learning_rate": 0.0009209238598551576,
"loss": 6.5267,
"step": 101000
},
{
"epoch": 3.9733803092581716,
"grad_norm": 4.53125,
"learning_rate": 0.0009205323938148366,
"loss": 6.5332,
"step": 101500
},
{
"epoch": 3.992953611274222,
"grad_norm": 2.21875,
"learning_rate": 0.0009201409277745156,
"loss": 6.5363,
"step": 102000
},
{
"epoch": 4.0,
"eval_loss": 6.529191493988037,
"eval_runtime": 20.9814,
"eval_samples_per_second": 95.322,
"eval_steps_per_second": 5.958,
"step": 102180
},
{
"epoch": 4.012526913290272,
"grad_norm": 1.3046875,
"learning_rate": 0.0009197494617341946,
"loss": 6.541,
"step": 102500
},
{
"epoch": 4.032100215306322,
"grad_norm": 1.453125,
"learning_rate": 0.0009193579956938736,
"loss": 6.5328,
"step": 103000
},
{
"epoch": 4.051673517322373,
"grad_norm": 1.109375,
"learning_rate": 0.0009189665296535526,
"loss": 6.5279,
"step": 103500
},
{
"epoch": 4.071246819338422,
"grad_norm": 1.7421875,
"learning_rate": 0.0009185750636132316,
"loss": 6.5302,
"step": 104000
},
{
"epoch": 4.090820121354472,
"grad_norm": 1.03125,
"learning_rate": 0.0009181835975729106,
"loss": 6.5393,
"step": 104500
},
{
"epoch": 4.110393423370523,
"grad_norm": 1.6015625,
"learning_rate": 0.0009177921315325896,
"loss": 6.5323,
"step": 105000
},
{
"epoch": 4.129966725386573,
"grad_norm": 0.91015625,
"learning_rate": 0.0009174006654922686,
"loss": 6.5308,
"step": 105500
},
{
"epoch": 4.149540027402622,
"grad_norm": 8.6875,
"learning_rate": 0.0009170091994519476,
"loss": 6.5302,
"step": 106000
},
{
"epoch": 4.169113329418673,
"grad_norm": 1.609375,
"learning_rate": 0.0009166177334116266,
"loss": 6.5328,
"step": 106500
},
{
"epoch": 4.188686631434723,
"grad_norm": 5.90625,
"learning_rate": 0.0009162262673713056,
"loss": 6.5346,
"step": 107000
},
{
"epoch": 4.2082599334507735,
"grad_norm": 5.1875,
"learning_rate": 0.0009158348013309846,
"loss": 6.5306,
"step": 107500
},
{
"epoch": 4.227833235466823,
"grad_norm": 1.5703125,
"learning_rate": 0.0009154433352906636,
"loss": 6.5278,
"step": 108000
},
{
"epoch": 4.247406537482873,
"grad_norm": 3.09375,
"learning_rate": 0.0009150518692503426,
"loss": 6.5298,
"step": 108500
},
{
"epoch": 4.266979839498924,
"grad_norm": 7.59375,
"learning_rate": 0.0009146604032100216,
"loss": 6.5243,
"step": 109000
},
{
"epoch": 4.286553141514974,
"grad_norm": 1.6484375,
"learning_rate": 0.0009142689371697005,
"loss": 6.5258,
"step": 109500
},
{
"epoch": 4.306126443531023,
"grad_norm": 3.390625,
"learning_rate": 0.0009138774711293795,
"loss": 6.5237,
"step": 110000
},
{
"epoch": 4.325699745547074,
"grad_norm": 3.171875,
"learning_rate": 0.0009134860050890585,
"loss": 6.5307,
"step": 110500
},
{
"epoch": 4.345273047563124,
"grad_norm": 2.3125,
"learning_rate": 0.0009130945390487375,
"loss": 6.5236,
"step": 111000
},
{
"epoch": 4.364846349579174,
"grad_norm": 5.625,
"learning_rate": 0.0009127030730084165,
"loss": 6.5264,
"step": 111500
},
{
"epoch": 4.384419651595224,
"grad_norm": 1.9375,
"learning_rate": 0.0009123116069680955,
"loss": 6.5254,
"step": 112000
},
{
"epoch": 4.403992953611274,
"grad_norm": 2.65625,
"learning_rate": 0.0009119201409277745,
"loss": 6.5323,
"step": 112500
},
{
"epoch": 4.423566255627325,
"grad_norm": 1.703125,
"learning_rate": 0.0009115286748874535,
"loss": 6.5362,
"step": 113000
},
{
"epoch": 4.4431395576433745,
"grad_norm": 3.21875,
"learning_rate": 0.0009111372088471325,
"loss": 6.5292,
"step": 113500
},
{
"epoch": 4.462712859659424,
"grad_norm": 1.1640625,
"learning_rate": 0.0009107457428068115,
"loss": 6.5248,
"step": 114000
},
{
"epoch": 4.482286161675475,
"grad_norm": 1.4453125,
"learning_rate": 0.0009103542767664905,
"loss": 6.5216,
"step": 114500
},
{
"epoch": 4.501859463691525,
"grad_norm": 1.21875,
"learning_rate": 0.0009099628107261695,
"loss": 6.5157,
"step": 115000
},
{
"epoch": 4.521432765707575,
"grad_norm": 3.609375,
"learning_rate": 0.0009095713446858485,
"loss": 6.5172,
"step": 115500
},
{
"epoch": 4.541006067723625,
"grad_norm": 1.0546875,
"learning_rate": 0.0009091798786455275,
"loss": 6.524,
"step": 116000
},
{
"epoch": 4.560579369739675,
"grad_norm": 1.421875,
"learning_rate": 0.0009087884126052065,
"loss": 6.5227,
"step": 116500
},
{
"epoch": 4.580152671755725,
"grad_norm": 1.8359375,
"learning_rate": 0.0009083969465648855,
"loss": 6.5148,
"step": 117000
},
{
"epoch": 4.5997259737717755,
"grad_norm": 3.546875,
"learning_rate": 0.0009080054805245645,
"loss": 6.5208,
"step": 117500
},
{
"epoch": 4.619299275787825,
"grad_norm": 5.90625,
"learning_rate": 0.0009076140144842435,
"loss": 6.5183,
"step": 118000
},
{
"epoch": 4.638872577803875,
"grad_norm": 4.0625,
"learning_rate": 0.0009072225484439225,
"loss": 6.5221,
"step": 118500
},
{
"epoch": 4.658445879819926,
"grad_norm": 3.5625,
"learning_rate": 0.0009068310824036015,
"loss": 6.5201,
"step": 119000
},
{
"epoch": 4.6780191818359755,
"grad_norm": 1.1640625,
"learning_rate": 0.0009064396163632805,
"loss": 6.5176,
"step": 119500
},
{
"epoch": 4.697592483852026,
"grad_norm": 1.28125,
"learning_rate": 0.0009060481503229595,
"loss": 6.5137,
"step": 120000
},
{
"epoch": 4.717165785868076,
"grad_norm": 1.8046875,
"learning_rate": 0.0009056566842826385,
"loss": 6.5178,
"step": 120500
},
{
"epoch": 4.736739087884126,
"grad_norm": 1.171875,
"learning_rate": 0.0009052652182423175,
"loss": 6.5129,
"step": 121000
},
{
"epoch": 4.7563123899001765,
"grad_norm": 11.375,
"learning_rate": 0.0009048737522019965,
"loss": 6.5149,
"step": 121500
},
{
"epoch": 4.775885691916226,
"grad_norm": 2.375,
"learning_rate": 0.0009044822861616755,
"loss": 6.5124,
"step": 122000
},
{
"epoch": 4.795458993932276,
"grad_norm": 6.375,
"learning_rate": 0.0009040908201213545,
"loss": 6.5112,
"step": 122500
},
{
"epoch": 4.815032295948327,
"grad_norm": 0.87109375,
"learning_rate": 0.0009036993540810336,
"loss": 6.5083,
"step": 123000
},
{
"epoch": 4.8346055979643765,
"grad_norm": 8.0,
"learning_rate": 0.0009033078880407125,
"loss": 6.5072,
"step": 123500
},
{
"epoch": 4.854178899980427,
"grad_norm": 1.203125,
"learning_rate": 0.0009029164220003915,
"loss": 6.5118,
"step": 124000
},
{
"epoch": 4.873752201996477,
"grad_norm": 1.15625,
"learning_rate": 0.0009025249559600705,
"loss": 6.5117,
"step": 124500
},
{
"epoch": 4.893325504012527,
"grad_norm": 1.9296875,
"learning_rate": 0.0009021334899197496,
"loss": 6.5099,
"step": 125000
},
{
"epoch": 4.912898806028577,
"grad_norm": 2.453125,
"learning_rate": 0.0009017420238794285,
"loss": 6.5095,
"step": 125500
},
{
"epoch": 4.932472108044627,
"grad_norm": 1.3359375,
"learning_rate": 0.0009013505578391075,
"loss": 6.5088,
"step": 126000
},
{
"epoch": 4.952045410060677,
"grad_norm": 1.28125,
"learning_rate": 0.0009009590917987865,
"loss": 6.5085,
"step": 126500
},
{
"epoch": 4.971618712076728,
"grad_norm": 3.75,
"learning_rate": 0.0009005676257584656,
"loss": 6.5163,
"step": 127000
},
{
"epoch": 4.9911920140927775,
"grad_norm": 3.015625,
"learning_rate": 0.0009001761597181445,
"loss": 6.513,
"step": 127500
},
{
"epoch": 5.0,
"eval_loss": 6.507379055023193,
"eval_runtime": 22.1406,
"eval_samples_per_second": 90.332,
"eval_steps_per_second": 5.646,
"step": 127725
},
{
"epoch": 5.010765316108827,
"grad_norm": 1.6171875,
"learning_rate": 0.0008997846936778235,
"loss": 6.5096,
"step": 128000
},
{
"epoch": 5.030338618124878,
"grad_norm": 2.203125,
"learning_rate": 0.0008993932276375024,
"loss": 6.5089,
"step": 128500
},
{
"epoch": 5.049911920140928,
"grad_norm": 2.90625,
"learning_rate": 0.0008990017615971816,
"loss": 6.5087,
"step": 129000
},
{
"epoch": 5.0694852221569775,
"grad_norm": 2.265625,
"learning_rate": 0.0008986102955568604,
"loss": 6.5135,
"step": 129500
},
{
"epoch": 5.089058524173028,
"grad_norm": 1.8828125,
"learning_rate": 0.0008982188295165394,
"loss": 6.5038,
"step": 130000
},
{
"epoch": 5.108631826189078,
"grad_norm": 2.359375,
"learning_rate": 0.0008978273634762184,
"loss": 6.5035,
"step": 130500
},
{
"epoch": 5.128205128205128,
"grad_norm": 2.15625,
"learning_rate": 0.0008974358974358974,
"loss": 6.5167,
"step": 131000
},
{
"epoch": 5.147778430221178,
"grad_norm": 2.140625,
"learning_rate": 0.0008970444313955764,
"loss": 6.5109,
"step": 131500
},
{
"epoch": 5.167351732237228,
"grad_norm": 2.015625,
"learning_rate": 0.0008966529653552554,
"loss": 6.5009,
"step": 132000
},
{
"epoch": 5.186925034253279,
"grad_norm": 0.75,
"learning_rate": 0.0008962614993149345,
"loss": 6.5078,
"step": 132500
},
{
"epoch": 5.206498336269329,
"grad_norm": 5.78125,
"learning_rate": 0.0008958700332746134,
"loss": 6.5062,
"step": 133000
},
{
"epoch": 5.2260716382853785,
"grad_norm": 3.390625,
"learning_rate": 0.0008954785672342924,
"loss": 6.5045,
"step": 133500
},
{
"epoch": 5.245644940301429,
"grad_norm": 4.09375,
"learning_rate": 0.0008950871011939714,
"loss": 6.4964,
"step": 134000
},
{
"epoch": 5.265218242317479,
"grad_norm": 1.84375,
"learning_rate": 0.0008946956351536505,
"loss": 6.5082,
"step": 134500
},
{
"epoch": 5.284791544333529,
"grad_norm": 1.953125,
"learning_rate": 0.0008943041691133294,
"loss": 6.5047,
"step": 135000
},
{
"epoch": 5.304364846349579,
"grad_norm": 1.09375,
"learning_rate": 0.0008939127030730084,
"loss": 6.5025,
"step": 135500
},
{
"epoch": 5.323938148365629,
"grad_norm": 2.046875,
"learning_rate": 0.0008935212370326874,
"loss": 6.4966,
"step": 136000
},
{
"epoch": 5.34351145038168,
"grad_norm": 1.078125,
"learning_rate": 0.0008931297709923665,
"loss": 6.503,
"step": 136500
},
{
"epoch": 5.36308475239773,
"grad_norm": 5.0625,
"learning_rate": 0.0008927383049520454,
"loss": 6.5022,
"step": 137000
},
{
"epoch": 5.382658054413779,
"grad_norm": 1.140625,
"learning_rate": 0.0008923468389117244,
"loss": 6.5025,
"step": 137500
},
{
"epoch": 5.40223135642983,
"grad_norm": 2.4375,
"learning_rate": 0.0008919553728714034,
"loss": 6.5053,
"step": 138000
},
{
"epoch": 5.42180465844588,
"grad_norm": 1.78125,
"learning_rate": 0.0008915639068310825,
"loss": 6.5127,
"step": 138500
},
{
"epoch": 5.44137796046193,
"grad_norm": 14.375,
"learning_rate": 0.0008911724407907614,
"loss": 6.5019,
"step": 139000
},
{
"epoch": 5.46095126247798,
"grad_norm": 4.59375,
"learning_rate": 0.0008907809747504404,
"loss": 6.5024,
"step": 139500
},
{
"epoch": 5.48052456449403,
"grad_norm": 2.28125,
"learning_rate": 0.0008903895087101194,
"loss": 6.5059,
"step": 140000
},
{
"epoch": 5.50009786651008,
"grad_norm": 3.84375,
"learning_rate": 0.0008899980426697985,
"loss": 6.5027,
"step": 140500
},
{
"epoch": 5.519671168526131,
"grad_norm": 6.1875,
"learning_rate": 0.0008896065766294774,
"loss": 6.4974,
"step": 141000
},
{
"epoch": 5.53924447054218,
"grad_norm": 1.921875,
"learning_rate": 0.0008892151105891564,
"loss": 6.4957,
"step": 141500
},
{
"epoch": 5.55881777255823,
"grad_norm": 1.78125,
"learning_rate": 0.0008888236445488354,
"loss": 6.5043,
"step": 142000
},
{
"epoch": 5.578391074574281,
"grad_norm": 1.796875,
"learning_rate": 0.0008884321785085145,
"loss": 6.4968,
"step": 142500
},
{
"epoch": 5.597964376590331,
"grad_norm": 2.4375,
"learning_rate": 0.0008880407124681934,
"loss": 6.5016,
"step": 143000
},
{
"epoch": 5.61753767860638,
"grad_norm": 1.078125,
"learning_rate": 0.0008876492464278724,
"loss": 6.5012,
"step": 143500
},
{
"epoch": 5.637110980622431,
"grad_norm": 3.921875,
"learning_rate": 0.0008872577803875515,
"loss": 6.5061,
"step": 144000
},
{
"epoch": 5.656684282638481,
"grad_norm": 1.015625,
"learning_rate": 0.0008868663143472305,
"loss": 6.5026,
"step": 144500
},
{
"epoch": 5.676257584654532,
"grad_norm": 1.484375,
"learning_rate": 0.0008864748483069094,
"loss": 6.4981,
"step": 145000
},
{
"epoch": 5.695830886670581,
"grad_norm": 1.8359375,
"learning_rate": 0.0008860833822665884,
"loss": 6.5063,
"step": 145500
},
{
"epoch": 5.715404188686631,
"grad_norm": 1.59375,
"learning_rate": 0.0008856919162262675,
"loss": 6.5071,
"step": 146000
},
{
"epoch": 5.734977490702682,
"grad_norm": 3.703125,
"learning_rate": 0.0008853004501859464,
"loss": 6.5037,
"step": 146500
},
{
"epoch": 5.754550792718732,
"grad_norm": 4.1875,
"learning_rate": 0.0008849089841456254,
"loss": 6.504,
"step": 147000
},
{
"epoch": 5.774124094734781,
"grad_norm": 0.98828125,
"learning_rate": 0.0008845175181053043,
"loss": 6.4992,
"step": 147500
},
{
"epoch": 5.793697396750832,
"grad_norm": 25.25,
"learning_rate": 0.0008841260520649835,
"loss": 6.5052,
"step": 148000
},
{
"epoch": 5.813270698766882,
"grad_norm": 1.9765625,
"learning_rate": 0.0008837345860246623,
"loss": 6.4979,
"step": 148500
},
{
"epoch": 5.8328440007829325,
"grad_norm": 2.03125,
"learning_rate": 0.0008833431199843413,
"loss": 6.5023,
"step": 149000
},
{
"epoch": 5.852417302798982,
"grad_norm": 1.2890625,
"learning_rate": 0.0008829516539440203,
"loss": 6.4971,
"step": 149500
},
{
"epoch": 5.871990604815032,
"grad_norm": 1.3125,
"learning_rate": 0.0008825601879036994,
"loss": 6.4968,
"step": 150000
},
{
"epoch": 5.891563906831083,
"grad_norm": 1.7734375,
"learning_rate": 0.0008821687218633783,
"loss": 6.5053,
"step": 150500
},
{
"epoch": 5.911137208847133,
"grad_norm": 2.40625,
"learning_rate": 0.0008817772558230573,
"loss": 6.4988,
"step": 151000
},
{
"epoch": 5.930710510863182,
"grad_norm": 4.25,
"learning_rate": 0.0008813857897827363,
"loss": 6.5071,
"step": 151500
},
{
"epoch": 5.950283812879233,
"grad_norm": 3.125,
"learning_rate": 0.0008809943237424154,
"loss": 6.5014,
"step": 152000
},
{
"epoch": 5.969857114895283,
"grad_norm": 1.515625,
"learning_rate": 0.0008806028577020943,
"loss": 6.5002,
"step": 152500
},
{
"epoch": 5.989430416911333,
"grad_norm": 2.671875,
"learning_rate": 0.0008802113916617733,
"loss": 6.4993,
"step": 153000
},
{
"epoch": 6.0,
"eval_loss": 6.495845794677734,
"eval_runtime": 21.9172,
"eval_samples_per_second": 91.253,
"eval_steps_per_second": 5.703,
"step": 153270
},
{
"epoch": 6.009003718927383,
"grad_norm": 8.25,
"learning_rate": 0.0008798199256214524,
"loss": 6.4952,
"step": 153500
},
{
"epoch": 6.028577020943433,
"grad_norm": 2.40625,
"learning_rate": 0.0008794284595811314,
"loss": 6.4967,
"step": 154000
},
{
"epoch": 6.048150322959483,
"grad_norm": 2.25,
"learning_rate": 0.0008790369935408103,
"loss": 6.5073,
"step": 154500
},
{
"epoch": 6.0677236249755335,
"grad_norm": 1.3515625,
"learning_rate": 0.0008786455275004893,
"loss": 6.4969,
"step": 155000
},
{
"epoch": 6.087296926991583,
"grad_norm": 0.9921875,
"learning_rate": 0.0008782540614601684,
"loss": 6.487,
"step": 155500
},
{
"epoch": 6.106870229007634,
"grad_norm": 1.875,
"learning_rate": 0.0008778625954198474,
"loss": 6.49,
"step": 156000
},
{
"epoch": 6.126443531023684,
"grad_norm": 5.25,
"learning_rate": 0.0008774711293795263,
"loss": 6.4948,
"step": 156500
},
{
"epoch": 6.146016833039734,
"grad_norm": 14.25,
"learning_rate": 0.0008770796633392053,
"loss": 6.4921,
"step": 157000
},
{
"epoch": 6.165590135055784,
"grad_norm": 1.1171875,
"learning_rate": 0.0008766881972988844,
"loss": 6.4909,
"step": 157500
},
{
"epoch": 6.185163437071834,
"grad_norm": 3.375,
"learning_rate": 0.0008762967312585634,
"loss": 6.4917,
"step": 158000
},
{
"epoch": 6.204736739087884,
"grad_norm": 10.6875,
"learning_rate": 0.0008759052652182423,
"loss": 6.494,
"step": 158500
},
{
"epoch": 6.2243100411039345,
"grad_norm": 5.0,
"learning_rate": 0.0008755137991779213,
"loss": 6.4909,
"step": 159000
},
{
"epoch": 6.243883343119984,
"grad_norm": 1.7421875,
"learning_rate": 0.0008751223331376004,
"loss": 6.498,
"step": 159500
},
{
"epoch": 6.263456645136034,
"grad_norm": 1.7734375,
"learning_rate": 0.0008747308670972794,
"loss": 6.4899,
"step": 160000
},
{
"epoch": 6.283029947152085,
"grad_norm": 1.5390625,
"learning_rate": 0.0008743394010569583,
"loss": 6.4991,
"step": 160500
},
{
"epoch": 6.3026032491681345,
"grad_norm": 5.71875,
"learning_rate": 0.0008739479350166373,
"loss": 6.4939,
"step": 161000
},
{
"epoch": 6.322176551184185,
"grad_norm": 3.6875,
"learning_rate": 0.0008735564689763164,
"loss": 6.4888,
"step": 161500
},
{
"epoch": 6.341749853200235,
"grad_norm": 3.359375,
"learning_rate": 0.0008731650029359953,
"loss": 6.4918,
"step": 162000
},
{
"epoch": 6.361323155216285,
"grad_norm": 1.125,
"learning_rate": 0.0008727735368956743,
"loss": 6.4992,
"step": 162500
},
{
"epoch": 6.3808964572323355,
"grad_norm": 2.625,
"learning_rate": 0.0008723820708553534,
"loss": 6.4958,
"step": 163000
},
{
"epoch": 6.400469759248385,
"grad_norm": 1.2421875,
"learning_rate": 0.0008719906048150324,
"loss": 6.4894,
"step": 163500
},
{
"epoch": 6.420043061264435,
"grad_norm": 2.03125,
"learning_rate": 0.0008715991387747113,
"loss": 6.4953,
"step": 164000
},
{
"epoch": 6.439616363280486,
"grad_norm": 2.03125,
"learning_rate": 0.0008712076727343903,
"loss": 6.4929,
"step": 164500
},
{
"epoch": 6.4591896652965355,
"grad_norm": 1.1875,
"learning_rate": 0.0008708162066940694,
"loss": 6.4848,
"step": 165000
},
{
"epoch": 6.478762967312585,
"grad_norm": 1.1015625,
"learning_rate": 0.0008704247406537484,
"loss": 6.4871,
"step": 165500
},
{
"epoch": 6.498336269328636,
"grad_norm": 3.75,
"learning_rate": 0.0008700332746134272,
"loss": 6.4914,
"step": 166000
},
{
"epoch": 6.517909571344686,
"grad_norm": 2.1875,
"learning_rate": 0.0008696418085731062,
"loss": 6.4888,
"step": 166500
},
{
"epoch": 6.5374828733607355,
"grad_norm": 2.84375,
"learning_rate": 0.0008692503425327854,
"loss": 6.4932,
"step": 167000
},
{
"epoch": 6.557056175376786,
"grad_norm": 2.828125,
"learning_rate": 0.0008688588764924644,
"loss": 6.4992,
"step": 167500
},
{
"epoch": 6.576629477392836,
"grad_norm": 4.9375,
"learning_rate": 0.0008684674104521432,
"loss": 6.4881,
"step": 168000
},
{
"epoch": 6.596202779408887,
"grad_norm": 2.796875,
"learning_rate": 0.0008680759444118222,
"loss": 6.4944,
"step": 168500
},
{
"epoch": 6.6157760814249365,
"grad_norm": 19.0,
"learning_rate": 0.0008676844783715013,
"loss": 6.4874,
"step": 169000
},
{
"epoch": 6.635349383440986,
"grad_norm": 8.25,
"learning_rate": 0.0008672930123311803,
"loss": 6.4871,
"step": 169500
},
{
"epoch": 6.654922685457037,
"grad_norm": 1.484375,
"learning_rate": 0.0008669015462908592,
"loss": 6.4978,
"step": 170000
},
{
"epoch": 6.674495987473087,
"grad_norm": 3.140625,
"learning_rate": 0.0008665100802505382,
"loss": 6.4946,
"step": 170500
},
{
"epoch": 6.6940692894891365,
"grad_norm": 8.0625,
"learning_rate": 0.0008661186142102173,
"loss": 6.4868,
"step": 171000
},
{
"epoch": 6.713642591505187,
"grad_norm": 3.265625,
"learning_rate": 0.0008657271481698963,
"loss": 6.4876,
"step": 171500
},
{
"epoch": 6.733215893521237,
"grad_norm": 1.6015625,
"learning_rate": 0.0008653356821295752,
"loss": 6.4887,
"step": 172000
},
{
"epoch": 6.752789195537288,
"grad_norm": 3.828125,
"learning_rate": 0.0008649442160892542,
"loss": 6.4897,
"step": 172500
},
{
"epoch": 6.772362497553337,
"grad_norm": 0.8515625,
"learning_rate": 0.0008645527500489333,
"loss": 6.4857,
"step": 173000
},
{
"epoch": 6.791935799569387,
"grad_norm": 1.2734375,
"learning_rate": 0.0008641612840086123,
"loss": 6.4867,
"step": 173500
},
{
"epoch": 6.811509101585438,
"grad_norm": 1.578125,
"learning_rate": 0.0008637698179682912,
"loss": 6.4883,
"step": 174000
},
{
"epoch": 6.831082403601488,
"grad_norm": 1.875,
"learning_rate": 0.0008633783519279703,
"loss": 6.4783,
"step": 174500
},
{
"epoch": 6.8506557056175374,
"grad_norm": 0.84765625,
"learning_rate": 0.0008629868858876493,
"loss": 6.4862,
"step": 175000
},
{
"epoch": 6.870229007633588,
"grad_norm": 20.875,
"learning_rate": 0.0008625954198473283,
"loss": 6.486,
"step": 175500
},
{
"epoch": 6.889802309649638,
"grad_norm": 1.75,
"learning_rate": 0.0008622039538070072,
"loss": 6.4941,
"step": 176000
},
{
"epoch": 6.909375611665688,
"grad_norm": 1.46875,
"learning_rate": 0.0008618124877666863,
"loss": 6.4904,
"step": 176500
},
{
"epoch": 6.928948913681738,
"grad_norm": 2.171875,
"learning_rate": 0.0008614210217263653,
"loss": 6.4864,
"step": 177000
},
{
"epoch": 6.948522215697788,
"grad_norm": 1.6875,
"learning_rate": 0.0008610295556860442,
"loss": 6.4876,
"step": 177500
},
{
"epoch": 6.968095517713838,
"grad_norm": 0.94921875,
"learning_rate": 0.0008606380896457232,
"loss": 6.4811,
"step": 178000
},
{
"epoch": 6.987668819729889,
"grad_norm": 1.46875,
"learning_rate": 0.0008602466236054023,
"loss": 6.4881,
"step": 178500
},
{
"epoch": 7.0,
"eval_loss": 6.482935905456543,
"eval_runtime": 22.9737,
"eval_samples_per_second": 87.056,
"eval_steps_per_second": 5.441,
"step": 178815
},
{
"epoch": 7.007242121745938,
"grad_norm": 1.3515625,
"learning_rate": 0.0008598551575650813,
"loss": 6.4776,
"step": 179000
},
{
"epoch": 7.026815423761989,
"grad_norm": 1.5078125,
"learning_rate": 0.0008594636915247602,
"loss": 6.4777,
"step": 179500
},
{
"epoch": 7.046388725778039,
"grad_norm": 2.78125,
"learning_rate": 0.0008590722254844392,
"loss": 6.4789,
"step": 180000
},
{
"epoch": 7.065962027794089,
"grad_norm": 1.5625,
"learning_rate": 0.0008586807594441183,
"loss": 6.4884,
"step": 180500
},
{
"epoch": 7.085535329810139,
"grad_norm": 1.671875,
"learning_rate": 0.0008582892934037973,
"loss": 6.4847,
"step": 181000
},
{
"epoch": 7.105108631826189,
"grad_norm": 1.9140625,
"learning_rate": 0.0008578978273634762,
"loss": 6.4811,
"step": 181500
},
{
"epoch": 7.124681933842239,
"grad_norm": 1.375,
"learning_rate": 0.0008575063613231552,
"loss": 6.4872,
"step": 182000
},
{
"epoch": 7.14425523585829,
"grad_norm": 1.375,
"learning_rate": 0.0008571148952828343,
"loss": 6.486,
"step": 182500
},
{
"epoch": 7.163828537874339,
"grad_norm": 1.921875,
"learning_rate": 0.0008567234292425133,
"loss": 6.4848,
"step": 183000
},
{
"epoch": 7.183401839890389,
"grad_norm": 2.125,
"learning_rate": 0.0008563319632021922,
"loss": 6.4808,
"step": 183500
},
{
"epoch": 7.20297514190644,
"grad_norm": 1.0859375,
"learning_rate": 0.0008559404971618713,
"loss": 6.4858,
"step": 184000
},
{
"epoch": 7.22254844392249,
"grad_norm": 1.7578125,
"learning_rate": 0.0008555490311215503,
"loss": 6.4875,
"step": 184500
},
{
"epoch": 7.242121745938539,
"grad_norm": 10.375,
"learning_rate": 0.0008551575650812293,
"loss": 6.483,
"step": 185000
},
{
"epoch": 7.26169504795459,
"grad_norm": 2.0,
"learning_rate": 0.0008547660990409081,
"loss": 6.4803,
"step": 185500
},
{
"epoch": 7.28126834997064,
"grad_norm": 2.734375,
"learning_rate": 0.0008543746330005873,
"loss": 6.4823,
"step": 186000
},
{
"epoch": 7.3008416519866906,
"grad_norm": 1.5390625,
"learning_rate": 0.0008539831669602662,
"loss": 6.4725,
"step": 186500
},
{
"epoch": 7.32041495400274,
"grad_norm": 1.53125,
"learning_rate": 0.0008535917009199452,
"loss": 6.4783,
"step": 187000
},
{
"epoch": 7.33998825601879,
"grad_norm": 2.25,
"learning_rate": 0.0008532002348796241,
"loss": 6.4714,
"step": 187500
},
{
"epoch": 7.359561558034841,
"grad_norm": 1.3671875,
"learning_rate": 0.0008528087688393032,
"loss": 6.4807,
"step": 188000
},
{
"epoch": 7.379134860050891,
"grad_norm": 1.4140625,
"learning_rate": 0.0008524173027989822,
"loss": 6.471,
"step": 188500
},
{
"epoch": 7.39870816206694,
"grad_norm": 1.234375,
"learning_rate": 0.0008520258367586612,
"loss": 6.4799,
"step": 189000
},
{
"epoch": 7.418281464082991,
"grad_norm": 1.53125,
"learning_rate": 0.0008516343707183401,
"loss": 6.48,
"step": 189500
},
{
"epoch": 7.437854766099041,
"grad_norm": 1.1328125,
"learning_rate": 0.0008512429046780192,
"loss": 6.4835,
"step": 190000
},
{
"epoch": 7.457428068115091,
"grad_norm": 3.90625,
"learning_rate": 0.0008508514386376982,
"loss": 6.4747,
"step": 190500
},
{
"epoch": 7.477001370131141,
"grad_norm": 1.609375,
"learning_rate": 0.0008504599725973772,
"loss": 6.4759,
"step": 191000
},
{
"epoch": 7.496574672147191,
"grad_norm": 1.6171875,
"learning_rate": 0.0008500685065570561,
"loss": 6.4765,
"step": 191500
},
{
"epoch": 7.516147974163241,
"grad_norm": 1.0234375,
"learning_rate": 0.0008496770405167352,
"loss": 6.4753,
"step": 192000
},
{
"epoch": 7.5357212761792916,
"grad_norm": 1.046875,
"learning_rate": 0.0008492855744764142,
"loss": 6.4782,
"step": 192500
},
{
"epoch": 7.555294578195341,
"grad_norm": 3.140625,
"learning_rate": 0.0008488941084360931,
"loss": 6.4725,
"step": 193000
},
{
"epoch": 7.574867880211392,
"grad_norm": 1.703125,
"learning_rate": 0.0008485026423957722,
"loss": 6.4825,
"step": 193500
},
{
"epoch": 7.594441182227442,
"grad_norm": 2.265625,
"learning_rate": 0.0008481111763554512,
"loss": 6.4758,
"step": 194000
},
{
"epoch": 7.614014484243492,
"grad_norm": 5.9375,
"learning_rate": 0.0008477197103151302,
"loss": 6.4732,
"step": 194500
},
{
"epoch": 7.633587786259542,
"grad_norm": 1.125,
"learning_rate": 0.0008473282442748091,
"loss": 6.4812,
"step": 195000
},
{
"epoch": 7.653161088275592,
"grad_norm": 1.8125,
"learning_rate": 0.0008469367782344882,
"loss": 6.4711,
"step": 195500
},
{
"epoch": 7.672734390291642,
"grad_norm": 3.421875,
"learning_rate": 0.0008465453121941672,
"loss": 6.4718,
"step": 196000
},
{
"epoch": 7.6923076923076925,
"grad_norm": 9.625,
"learning_rate": 0.0008461538461538462,
"loss": 6.4793,
"step": 196500
},
{
"epoch": 7.711880994323742,
"grad_norm": 2.390625,
"learning_rate": 0.0008457623801135251,
"loss": 6.4758,
"step": 197000
},
{
"epoch": 7.731454296339793,
"grad_norm": 2.125,
"learning_rate": 0.0008453709140732042,
"loss": 6.4706,
"step": 197500
},
{
"epoch": 7.751027598355843,
"grad_norm": 1.296875,
"learning_rate": 0.0008449794480328832,
"loss": 6.4838,
"step": 198000
},
{
"epoch": 7.7706009003718925,
"grad_norm": 2.53125,
"learning_rate": 0.0008445879819925622,
"loss": 6.475,
"step": 198500
},
{
"epoch": 7.790174202387943,
"grad_norm": 2.75,
"learning_rate": 0.0008441965159522411,
"loss": 6.4766,
"step": 199000
},
{
"epoch": 7.809747504403993,
"grad_norm": 1.84375,
"learning_rate": 0.0008438050499119202,
"loss": 6.4833,
"step": 199500
},
{
"epoch": 7.829320806420043,
"grad_norm": 1.40625,
"learning_rate": 0.0008434135838715992,
"loss": 6.4786,
"step": 200000
},
{
"epoch": 7.8488941084360935,
"grad_norm": 1.5546875,
"learning_rate": 0.0008430221178312782,
"loss": 6.4747,
"step": 200500
},
{
"epoch": 7.868467410452143,
"grad_norm": 2.59375,
"learning_rate": 0.0008426306517909571,
"loss": 6.4839,
"step": 201000
},
{
"epoch": 7.888040712468193,
"grad_norm": 0.92578125,
"learning_rate": 0.0008422391857506362,
"loss": 6.477,
"step": 201500
},
{
"epoch": 7.907614014484244,
"grad_norm": 2.40625,
"learning_rate": 0.0008418477197103152,
"loss": 6.4823,
"step": 202000
},
{
"epoch": 7.9271873165002935,
"grad_norm": 4.9375,
"learning_rate": 0.0008414562536699942,
"loss": 6.4666,
"step": 202500
},
{
"epoch": 7.946760618516343,
"grad_norm": 1.2421875,
"learning_rate": 0.000841064787629673,
"loss": 6.4716,
"step": 203000
},
{
"epoch": 7.966333920532394,
"grad_norm": 1.6875,
"learning_rate": 0.0008406733215893522,
"loss": 6.4678,
"step": 203500
},
{
"epoch": 7.985907222548444,
"grad_norm": 1.109375,
"learning_rate": 0.0008402818555490312,
"loss": 6.471,
"step": 204000
},
{
"epoch": 8.0,
"eval_loss": 6.472127914428711,
"eval_runtime": 24.0419,
"eval_samples_per_second": 83.188,
"eval_steps_per_second": 5.199,
"step": 204360
},
{
"epoch": 8.005480524564494,
"grad_norm": 3.421875,
"learning_rate": 0.0008398903895087102,
"loss": 6.4769,
"step": 204500
},
{
"epoch": 8.025053826580544,
"grad_norm": 1.3046875,
"learning_rate": 0.0008394989234683892,
"loss": 6.4651,
"step": 205000
},
{
"epoch": 8.044627128596595,
"grad_norm": 2.359375,
"learning_rate": 0.0008391074574280681,
"loss": 6.4746,
"step": 205500
},
{
"epoch": 8.064200430612644,
"grad_norm": 1.40625,
"learning_rate": 0.0008387159913877471,
"loss": 6.4704,
"step": 206000
},
{
"epoch": 8.083773732628694,
"grad_norm": 1.21875,
"learning_rate": 0.0008383245253474261,
"loss": 6.4648,
"step": 206500
},
{
"epoch": 8.103347034644745,
"grad_norm": 1.21875,
"learning_rate": 0.0008379330593071051,
"loss": 6.4735,
"step": 207000
},
{
"epoch": 8.122920336660794,
"grad_norm": 17.0,
"learning_rate": 0.0008375415932667841,
"loss": 6.474,
"step": 207500
},
{
"epoch": 8.142493638676845,
"grad_norm": 6.5,
"learning_rate": 0.0008371501272264631,
"loss": 6.4643,
"step": 208000
},
{
"epoch": 8.162066940692895,
"grad_norm": 3.0,
"learning_rate": 0.000836758661186142,
"loss": 6.4722,
"step": 208500
},
{
"epoch": 8.181640242708944,
"grad_norm": 1.3359375,
"learning_rate": 0.0008363671951458211,
"loss": 6.4671,
"step": 209000
},
{
"epoch": 8.201213544724995,
"grad_norm": 5.0,
"learning_rate": 0.0008359757291055001,
"loss": 6.4788,
"step": 209500
},
{
"epoch": 8.220786846741046,
"grad_norm": 3.609375,
"learning_rate": 0.0008355842630651791,
"loss": 6.4751,
"step": 210000
},
{
"epoch": 8.240360148757095,
"grad_norm": 0.93359375,
"learning_rate": 0.000835192797024858,
"loss": 6.4759,
"step": 210500
},
{
"epoch": 8.259933450773145,
"grad_norm": 12.6875,
"learning_rate": 0.0008348013309845371,
"loss": 6.4707,
"step": 211000
},
{
"epoch": 8.279506752789196,
"grad_norm": 1.828125,
"learning_rate": 0.0008344098649442161,
"loss": 6.4734,
"step": 211500
},
{
"epoch": 8.299080054805245,
"grad_norm": 1.625,
"learning_rate": 0.0008340183989038951,
"loss": 6.4632,
"step": 212000
},
{
"epoch": 8.318653356821295,
"grad_norm": 1.3125,
"learning_rate": 0.000833626932863574,
"loss": 6.4708,
"step": 212500
},
{
"epoch": 8.338226658837346,
"grad_norm": 1.5390625,
"learning_rate": 0.0008332354668232531,
"loss": 6.4614,
"step": 213000
},
{
"epoch": 8.357799960853397,
"grad_norm": 1.234375,
"learning_rate": 0.0008328440007829321,
"loss": 6.4625,
"step": 213500
},
{
"epoch": 8.377373262869446,
"grad_norm": 2.5,
"learning_rate": 0.0008324525347426111,
"loss": 6.4605,
"step": 214000
},
{
"epoch": 8.396946564885496,
"grad_norm": 8.0,
"learning_rate": 0.0008320610687022901,
"loss": 6.4681,
"step": 214500
},
{
"epoch": 8.416519866901547,
"grad_norm": 1.6875,
"learning_rate": 0.0008316696026619691,
"loss": 6.46,
"step": 215000
},
{
"epoch": 8.436093168917596,
"grad_norm": 1.5,
"learning_rate": 0.0008312781366216481,
"loss": 6.4718,
"step": 215500
},
{
"epoch": 8.455666470933647,
"grad_norm": 1.6015625,
"learning_rate": 0.0008308866705813271,
"loss": 6.4647,
"step": 216000
},
{
"epoch": 8.475239772949697,
"grad_norm": 1.0859375,
"learning_rate": 0.0008304952045410061,
"loss": 6.4723,
"step": 216500
},
{
"epoch": 8.494813074965746,
"grad_norm": 1.234375,
"learning_rate": 0.0008301037385006851,
"loss": 6.4798,
"step": 217000
},
{
"epoch": 8.514386376981797,
"grad_norm": 2.09375,
"learning_rate": 0.0008297122724603641,
"loss": 6.468,
"step": 217500
},
{
"epoch": 8.533959678997848,
"grad_norm": 1.234375,
"learning_rate": 0.0008293208064200431,
"loss": 6.4728,
"step": 218000
},
{
"epoch": 8.553532981013896,
"grad_norm": 0.98828125,
"learning_rate": 0.0008289293403797221,
"loss": 6.4703,
"step": 218500
},
{
"epoch": 8.573106283029947,
"grad_norm": 1.3828125,
"learning_rate": 0.0008285378743394011,
"loss": 6.4645,
"step": 219000
},
{
"epoch": 8.592679585045998,
"grad_norm": 0.9140625,
"learning_rate": 0.0008281464082990801,
"loss": 6.4694,
"step": 219500
},
{
"epoch": 8.612252887062047,
"grad_norm": 1.1328125,
"learning_rate": 0.0008277549422587591,
"loss": 6.4632,
"step": 220000
},
{
"epoch": 8.631826189078097,
"grad_norm": 3.328125,
"learning_rate": 0.0008273634762184381,
"loss": 6.4733,
"step": 220500
},
{
"epoch": 8.651399491094148,
"grad_norm": 3.25,
"learning_rate": 0.0008269720101781171,
"loss": 6.4682,
"step": 221000
},
{
"epoch": 8.670972793110197,
"grad_norm": 1.0859375,
"learning_rate": 0.0008265805441377961,
"loss": 6.4654,
"step": 221500
},
{
"epoch": 8.690546095126248,
"grad_norm": 2.890625,
"learning_rate": 0.0008261890780974751,
"loss": 6.4569,
"step": 222000
},
{
"epoch": 8.710119397142298,
"grad_norm": 0.9921875,
"learning_rate": 0.0008257976120571541,
"loss": 6.4643,
"step": 222500
},
{
"epoch": 8.729692699158347,
"grad_norm": 0.890625,
"learning_rate": 0.0008254061460168331,
"loss": 6.4544,
"step": 223000
},
{
"epoch": 8.749266001174398,
"grad_norm": 1.234375,
"learning_rate": 0.000825014679976512,
"loss": 6.4667,
"step": 223500
},
{
"epoch": 8.768839303190449,
"grad_norm": 0.98046875,
"learning_rate": 0.0008246232139361912,
"loss": 6.458,
"step": 224000
},
{
"epoch": 8.7884126052065,
"grad_norm": 1.6796875,
"learning_rate": 0.00082423174789587,
"loss": 6.4643,
"step": 224500
},
{
"epoch": 8.807985907222548,
"grad_norm": 0.94140625,
"learning_rate": 0.000823840281855549,
"loss": 6.4635,
"step": 225000
},
{
"epoch": 8.827559209238599,
"grad_norm": 1.5703125,
"learning_rate": 0.000823448815815228,
"loss": 6.458,
"step": 225500
},
{
"epoch": 8.84713251125465,
"grad_norm": 1.0625,
"learning_rate": 0.000823057349774907,
"loss": 6.4666,
"step": 226000
},
{
"epoch": 8.866705813270698,
"grad_norm": 1.859375,
"learning_rate": 0.000822665883734586,
"loss": 6.4582,
"step": 226500
},
{
"epoch": 8.886279115286749,
"grad_norm": 2.515625,
"learning_rate": 0.000822274417694265,
"loss": 6.4728,
"step": 227000
},
{
"epoch": 8.9058524173028,
"grad_norm": 1.7734375,
"learning_rate": 0.000821882951653944,
"loss": 6.4735,
"step": 227500
},
{
"epoch": 8.925425719318849,
"grad_norm": 1.3125,
"learning_rate": 0.000821491485613623,
"loss": 6.4586,
"step": 228000
},
{
"epoch": 8.9449990213349,
"grad_norm": 0.93359375,
"learning_rate": 0.000821100019573302,
"loss": 6.4698,
"step": 228500
},
{
"epoch": 8.96457232335095,
"grad_norm": 1.1484375,
"learning_rate": 0.000820708553532981,
"loss": 6.469,
"step": 229000
},
{
"epoch": 8.984145625366999,
"grad_norm": 2.046875,
"learning_rate": 0.00082031708749266,
"loss": 6.4725,
"step": 229500
},
{
"epoch": 9.0,
"eval_loss": 6.4647536277771,
"eval_runtime": 21.362,
"eval_samples_per_second": 93.624,
"eval_steps_per_second": 5.852,
"step": 229905
},
{
"epoch": 9.00371892738305,
"grad_norm": 1.5859375,
"learning_rate": 0.000819925621452339,
"loss": 6.4643,
"step": 230000
},
{
"epoch": 9.0232922293991,
"grad_norm": 2.828125,
"learning_rate": 0.000819534155412018,
"loss": 6.4632,
"step": 230500
},
{
"epoch": 9.04286553141515,
"grad_norm": 1.390625,
"learning_rate": 0.000819142689371697,
"loss": 6.4633,
"step": 231000
},
{
"epoch": 9.0624388334312,
"grad_norm": 2.296875,
"learning_rate": 0.000818751223331376,
"loss": 6.4673,
"step": 231500
},
{
"epoch": 9.08201213544725,
"grad_norm": 3.40625,
"learning_rate": 0.000818359757291055,
"loss": 6.469,
"step": 232000
},
{
"epoch": 9.1015854374633,
"grad_norm": 1.28125,
"learning_rate": 0.000817968291250734,
"loss": 6.4532,
"step": 232500
},
{
"epoch": 9.12115873947935,
"grad_norm": 1.1953125,
"learning_rate": 0.000817576825210413,
"loss": 6.4608,
"step": 233000
},
{
"epoch": 9.1407320414954,
"grad_norm": 1.046875,
"learning_rate": 0.000817185359170092,
"loss": 6.4537,
"step": 233500
},
{
"epoch": 9.16030534351145,
"grad_norm": 2.90625,
"learning_rate": 0.000816793893129771,
"loss": 6.444,
"step": 234000
},
{
"epoch": 9.1798786455275,
"grad_norm": 2.40625,
"learning_rate": 0.00081640242708945,
"loss": 6.4606,
"step": 234500
},
{
"epoch": 9.199451947543551,
"grad_norm": 1.3046875,
"learning_rate": 0.000816010961049129,
"loss": 6.4578,
"step": 235000
},
{
"epoch": 9.2190252495596,
"grad_norm": 3.359375,
"learning_rate": 0.0008156194950088081,
"loss": 6.4648,
"step": 235500
},
{
"epoch": 9.23859855157565,
"grad_norm": 1.3359375,
"learning_rate": 0.000815228028968487,
"loss": 6.4649,
"step": 236000
},
{
"epoch": 9.258171853591701,
"grad_norm": 1.6796875,
"learning_rate": 0.000814836562928166,
"loss": 6.4617,
"step": 236500
},
{
"epoch": 9.27774515560775,
"grad_norm": 5.34375,
"learning_rate": 0.000814445096887845,
"loss": 6.4598,
"step": 237000
},
{
"epoch": 9.2973184576238,
"grad_norm": 7.15625,
"learning_rate": 0.0008140536308475241,
"loss": 6.4565,
"step": 237500
},
{
"epoch": 9.316891759639852,
"grad_norm": 2.546875,
"learning_rate": 0.000813662164807203,
"loss": 6.4612,
"step": 238000
},
{
"epoch": 9.336465061655902,
"grad_norm": 1.375,
"learning_rate": 0.000813270698766882,
"loss": 6.4583,
"step": 238500
},
{
"epoch": 9.356038363671951,
"grad_norm": 18.5,
"learning_rate": 0.000812879232726561,
"loss": 6.4549,
"step": 239000
},
{
"epoch": 9.375611665688002,
"grad_norm": 5.25,
"learning_rate": 0.0008124877666862401,
"loss": 6.4607,
"step": 239500
},
{
"epoch": 9.395184967704052,
"grad_norm": 1.2890625,
"learning_rate": 0.000812096300645919,
"loss": 6.4573,
"step": 240000
},
{
"epoch": 9.414758269720101,
"grad_norm": 1.03125,
"learning_rate": 0.000811704834605598,
"loss": 6.4706,
"step": 240500
},
{
"epoch": 9.434331571736152,
"grad_norm": 7.3125,
"learning_rate": 0.000811313368565277,
"loss": 6.4632,
"step": 241000
},
{
"epoch": 9.453904873752203,
"grad_norm": 5.28125,
"learning_rate": 0.000810921902524956,
"loss": 6.4549,
"step": 241500
},
{
"epoch": 9.473478175768252,
"grad_norm": 3.546875,
"learning_rate": 0.000810530436484635,
"loss": 6.4705,
"step": 242000
},
{
"epoch": 9.493051477784302,
"grad_norm": 2.125,
"learning_rate": 0.000810138970444314,
"loss": 6.4479,
"step": 242500
},
{
"epoch": 9.512624779800353,
"grad_norm": 1.5390625,
"learning_rate": 0.000809747504403993,
"loss": 6.4602,
"step": 243000
},
{
"epoch": 9.532198081816402,
"grad_norm": 1.1484375,
"learning_rate": 0.000809356038363672,
"loss": 6.469,
"step": 243500
},
{
"epoch": 9.551771383832452,
"grad_norm": 5.71875,
"learning_rate": 0.0008089645723233509,
"loss": 6.464,
"step": 244000
},
{
"epoch": 9.571344685848503,
"grad_norm": 2.609375,
"learning_rate": 0.0008085731062830299,
"loss": 6.4604,
"step": 244500
},
{
"epoch": 9.590917987864552,
"grad_norm": 1.7265625,
"learning_rate": 0.000808181640242709,
"loss": 6.4589,
"step": 245000
},
{
"epoch": 9.610491289880603,
"grad_norm": 1.15625,
"learning_rate": 0.0008077901742023879,
"loss": 6.4551,
"step": 245500
},
{
"epoch": 9.630064591896653,
"grad_norm": 1.6796875,
"learning_rate": 0.0008073987081620669,
"loss": 6.4583,
"step": 246000
},
{
"epoch": 9.649637893912702,
"grad_norm": 2.265625,
"learning_rate": 0.0008070072421217459,
"loss": 6.4529,
"step": 246500
},
{
"epoch": 9.669211195928753,
"grad_norm": 1.2734375,
"learning_rate": 0.000806615776081425,
"loss": 6.457,
"step": 247000
},
{
"epoch": 9.688784497944804,
"grad_norm": 1.5390625,
"learning_rate": 0.0008062243100411039,
"loss": 6.4537,
"step": 247500
},
{
"epoch": 9.708357799960853,
"grad_norm": 1.3671875,
"learning_rate": 0.0008058328440007829,
"loss": 6.4652,
"step": 248000
},
{
"epoch": 9.727931101976903,
"grad_norm": 1.390625,
"learning_rate": 0.0008054413779604619,
"loss": 6.4547,
"step": 248500
},
{
"epoch": 9.747504403992954,
"grad_norm": 1.5625,
"learning_rate": 0.000805049911920141,
"loss": 6.467,
"step": 249000
},
{
"epoch": 9.767077706009005,
"grad_norm": 1.4453125,
"learning_rate": 0.0008046584458798199,
"loss": 6.4568,
"step": 249500
},
{
"epoch": 9.786651008025053,
"grad_norm": 1.75,
"learning_rate": 0.0008042669798394989,
"loss": 6.4616,
"step": 250000
},
{
"epoch": 9.806224310041104,
"grad_norm": 1.046875,
"learning_rate": 0.0008038755137991779,
"loss": 6.455,
"step": 250500
},
{
"epoch": 9.825797612057155,
"grad_norm": 1.625,
"learning_rate": 0.000803484047758857,
"loss": 6.4547,
"step": 251000
},
{
"epoch": 9.845370914073204,
"grad_norm": 0.95703125,
"learning_rate": 0.0008030925817185359,
"loss": 6.4668,
"step": 251500
},
{
"epoch": 9.864944216089254,
"grad_norm": 1.515625,
"learning_rate": 0.0008027011156782149,
"loss": 6.4458,
"step": 252000
},
{
"epoch": 9.884517518105305,
"grad_norm": 2.171875,
"learning_rate": 0.0008023096496378939,
"loss": 6.4636,
"step": 252500
},
{
"epoch": 9.904090820121354,
"grad_norm": 1.9140625,
"learning_rate": 0.000801918183597573,
"loss": 6.4622,
"step": 253000
},
{
"epoch": 9.923664122137405,
"grad_norm": 1.7734375,
"learning_rate": 0.0008015267175572519,
"loss": 6.4557,
"step": 253500
},
{
"epoch": 9.943237424153455,
"grad_norm": 2.015625,
"learning_rate": 0.0008011352515169309,
"loss": 6.4513,
"step": 254000
},
{
"epoch": 9.962810726169504,
"grad_norm": 1.0859375,
"learning_rate": 0.00080074378547661,
"loss": 6.4529,
"step": 254500
},
{
"epoch": 9.982384028185555,
"grad_norm": 1.3046875,
"learning_rate": 0.000800352319436289,
"loss": 6.4559,
"step": 255000
},
{
"epoch": 10.0,
"eval_loss": 6.455629825592041,
"eval_runtime": 21.377,
"eval_samples_per_second": 93.559,
"eval_steps_per_second": 5.847,
"step": 255450
},
{
"epoch": 10.001957330201606,
"grad_norm": 0.984375,
"learning_rate": 0.0007999608533959679,
"loss": 6.454,
"step": 255500
},
{
"epoch": 10.021530632217654,
"grad_norm": 1.78125,
"learning_rate": 0.0007995693873556469,
"loss": 6.4444,
"step": 256000
},
{
"epoch": 10.041103934233705,
"grad_norm": 3.0625,
"learning_rate": 0.000799177921315326,
"loss": 6.4606,
"step": 256500
},
{
"epoch": 10.060677236249756,
"grad_norm": 1.09375,
"learning_rate": 0.000798786455275005,
"loss": 6.4541,
"step": 257000
},
{
"epoch": 10.080250538265805,
"grad_norm": 2.59375,
"learning_rate": 0.0007983949892346839,
"loss": 6.4583,
"step": 257500
},
{
"epoch": 10.099823840281855,
"grad_norm": 8.6875,
"learning_rate": 0.0007980035231943629,
"loss": 6.4542,
"step": 258000
},
{
"epoch": 10.119397142297906,
"grad_norm": 1.0390625,
"learning_rate": 0.000797612057154042,
"loss": 6.4503,
"step": 258500
},
{
"epoch": 10.138970444313955,
"grad_norm": 1.2890625,
"learning_rate": 0.0007972205911137209,
"loss": 6.4531,
"step": 259000
},
{
"epoch": 10.158543746330006,
"grad_norm": 2.03125,
"learning_rate": 0.0007968291250733999,
"loss": 6.4552,
"step": 259500
},
{
"epoch": 10.178117048346056,
"grad_norm": 2.71875,
"learning_rate": 0.0007964376590330789,
"loss": 6.4557,
"step": 260000
},
{
"epoch": 10.197690350362105,
"grad_norm": 2.359375,
"learning_rate": 0.000796046192992758,
"loss": 6.4519,
"step": 260500
},
{
"epoch": 10.217263652378156,
"grad_norm": 1.546875,
"learning_rate": 0.0007956547269524369,
"loss": 6.4584,
"step": 261000
},
{
"epoch": 10.236836954394207,
"grad_norm": 1.2421875,
"learning_rate": 0.0007952632609121159,
"loss": 6.4519,
"step": 261500
},
{
"epoch": 10.256410256410255,
"grad_norm": 2.46875,
"learning_rate": 0.0007948717948717948,
"loss": 6.4576,
"step": 262000
},
{
"epoch": 10.275983558426306,
"grad_norm": 1.1015625,
"learning_rate": 0.000794480328831474,
"loss": 6.4455,
"step": 262500
},
{
"epoch": 10.295556860442357,
"grad_norm": 2.671875,
"learning_rate": 0.0007940888627911528,
"loss": 6.4499,
"step": 263000
},
{
"epoch": 10.315130162458408,
"grad_norm": 1.4453125,
"learning_rate": 0.0007936973967508318,
"loss": 6.4572,
"step": 263500
},
{
"epoch": 10.334703464474456,
"grad_norm": 1.0390625,
"learning_rate": 0.0007933059307105108,
"loss": 6.4513,
"step": 264000
},
{
"epoch": 10.354276766490507,
"grad_norm": 1.140625,
"learning_rate": 0.0007929144646701899,
"loss": 6.4472,
"step": 264500
},
{
"epoch": 10.373850068506558,
"grad_norm": 1.953125,
"learning_rate": 0.0007925229986298688,
"loss": 6.4385,
"step": 265000
},
{
"epoch": 10.393423370522607,
"grad_norm": 1.1484375,
"learning_rate": 0.0007921315325895478,
"loss": 6.4558,
"step": 265500
},
{
"epoch": 10.412996672538657,
"grad_norm": 2.46875,
"learning_rate": 0.0007917400665492269,
"loss": 6.4503,
"step": 266000
},
{
"epoch": 10.432569974554708,
"grad_norm": 1.359375,
"learning_rate": 0.0007913486005089059,
"loss": 6.4402,
"step": 266500
},
{
"epoch": 10.452143276570757,
"grad_norm": 1.046875,
"learning_rate": 0.0007909571344685848,
"loss": 6.4552,
"step": 267000
},
{
"epoch": 10.471716578586808,
"grad_norm": 1.25,
"learning_rate": 0.0007905656684282638,
"loss": 6.4488,
"step": 267500
},
{
"epoch": 10.491289880602858,
"grad_norm": 1.3515625,
"learning_rate": 0.0007901742023879429,
"loss": 6.4577,
"step": 268000
},
{
"epoch": 10.510863182618907,
"grad_norm": 2.546875,
"learning_rate": 0.0007897827363476219,
"loss": 6.4526,
"step": 268500
},
{
"epoch": 10.530436484634958,
"grad_norm": 3.65625,
"learning_rate": 0.0007893912703073008,
"loss": 6.4473,
"step": 269000
},
{
"epoch": 10.550009786651009,
"grad_norm": 1.15625,
"learning_rate": 0.0007889998042669798,
"loss": 6.4555,
"step": 269500
},
{
"epoch": 10.569583088667057,
"grad_norm": 1.609375,
"learning_rate": 0.0007886083382266589,
"loss": 6.4522,
"step": 270000
},
{
"epoch": 10.589156390683108,
"grad_norm": 2.171875,
"learning_rate": 0.0007882168721863379,
"loss": 6.4481,
"step": 270500
},
{
"epoch": 10.608729692699159,
"grad_norm": 1.234375,
"learning_rate": 0.0007878254061460168,
"loss": 6.4498,
"step": 271000
},
{
"epoch": 10.628302994715208,
"grad_norm": 1.015625,
"learning_rate": 0.0007874339401056958,
"loss": 6.4581,
"step": 271500
},
{
"epoch": 10.647876296731258,
"grad_norm": 4.0,
"learning_rate": 0.0007870424740653749,
"loss": 6.4554,
"step": 272000
},
{
"epoch": 10.667449598747309,
"grad_norm": 1.625,
"learning_rate": 0.0007866510080250539,
"loss": 6.4506,
"step": 272500
},
{
"epoch": 10.68702290076336,
"grad_norm": 1.1484375,
"learning_rate": 0.0007862595419847328,
"loss": 6.4586,
"step": 273000
},
{
"epoch": 10.706596202779409,
"grad_norm": 2.453125,
"learning_rate": 0.0007858680759444118,
"loss": 6.4467,
"step": 273500
},
{
"epoch": 10.72616950479546,
"grad_norm": 1.5703125,
"learning_rate": 0.0007854766099040909,
"loss": 6.4449,
"step": 274000
},
{
"epoch": 10.74574280681151,
"grad_norm": 2.71875,
"learning_rate": 0.0007850851438637698,
"loss": 6.4576,
"step": 274500
},
{
"epoch": 10.765316108827559,
"grad_norm": 2.515625,
"learning_rate": 0.0007846936778234488,
"loss": 6.4427,
"step": 275000
},
{
"epoch": 10.78488941084361,
"grad_norm": 1.328125,
"learning_rate": 0.0007843022117831279,
"loss": 6.4571,
"step": 275500
},
{
"epoch": 10.80446271285966,
"grad_norm": 1.5703125,
"learning_rate": 0.0007839107457428069,
"loss": 6.4507,
"step": 276000
},
{
"epoch": 10.824036014875709,
"grad_norm": 4.875,
"learning_rate": 0.0007835192797024858,
"loss": 6.4542,
"step": 276500
},
{
"epoch": 10.84360931689176,
"grad_norm": 18.125,
"learning_rate": 0.0007831278136621648,
"loss": 6.4592,
"step": 277000
},
{
"epoch": 10.86318261890781,
"grad_norm": 1.046875,
"learning_rate": 0.0007827363476218439,
"loss": 6.4503,
"step": 277500
},
{
"epoch": 10.88275592092386,
"grad_norm": 1.265625,
"learning_rate": 0.0007823448815815229,
"loss": 6.461,
"step": 278000
},
{
"epoch": 10.90232922293991,
"grad_norm": 4.1875,
"learning_rate": 0.0007819534155412018,
"loss": 6.4549,
"step": 278500
},
{
"epoch": 10.92190252495596,
"grad_norm": 26.375,
"learning_rate": 0.0007815619495008808,
"loss": 6.4469,
"step": 279000
},
{
"epoch": 10.94147582697201,
"grad_norm": 0.98828125,
"learning_rate": 0.0007811704834605599,
"loss": 6.453,
"step": 279500
},
{
"epoch": 10.96104912898806,
"grad_norm": 25.5,
"learning_rate": 0.0007807790174202389,
"loss": 6.4485,
"step": 280000
},
{
"epoch": 10.980622431004111,
"grad_norm": 2.90625,
"learning_rate": 0.0007803875513799178,
"loss": 6.4544,
"step": 280500
},
{
"epoch": 11.0,
"eval_loss": 6.449069499969482,
"eval_runtime": 22.9095,
"eval_samples_per_second": 87.3,
"eval_steps_per_second": 5.456,
"step": 280995
},
{
"epoch": 11.00019573302016,
"grad_norm": 10.125,
"learning_rate": 0.0007799960853395967,
"loss": 6.4525,
"step": 281000
},
{
"epoch": 11.01976903503621,
"grad_norm": 1.3984375,
"learning_rate": 0.0007796046192992759,
"loss": 6.4456,
"step": 281500
},
{
"epoch": 11.039342337052261,
"grad_norm": 7.28125,
"learning_rate": 0.0007792131532589549,
"loss": 6.4479,
"step": 282000
},
{
"epoch": 11.05891563906831,
"grad_norm": 1.234375,
"learning_rate": 0.0007788216872186337,
"loss": 6.4415,
"step": 282500
},
{
"epoch": 11.07848894108436,
"grad_norm": 1.234375,
"learning_rate": 0.0007784302211783127,
"loss": 6.4438,
"step": 283000
},
{
"epoch": 11.098062243100411,
"grad_norm": 1.5703125,
"learning_rate": 0.0007780387551379918,
"loss": 6.4485,
"step": 283500
},
{
"epoch": 11.11763554511646,
"grad_norm": 2.171875,
"learning_rate": 0.0007776472890976708,
"loss": 6.4491,
"step": 284000
},
{
"epoch": 11.137208847132511,
"grad_norm": 1.2890625,
"learning_rate": 0.0007772558230573497,
"loss": 6.449,
"step": 284500
},
{
"epoch": 11.156782149148562,
"grad_norm": 3.046875,
"learning_rate": 0.0007768643570170288,
"loss": 6.4526,
"step": 285000
},
{
"epoch": 11.17635545116461,
"grad_norm": 3.859375,
"learning_rate": 0.0007764728909767078,
"loss": 6.4408,
"step": 285500
},
{
"epoch": 11.195928753180661,
"grad_norm": 1.21875,
"learning_rate": 0.0007760814249363868,
"loss": 6.4384,
"step": 286000
},
{
"epoch": 11.215502055196712,
"grad_norm": 1.4453125,
"learning_rate": 0.0007756899588960657,
"loss": 6.4557,
"step": 286500
},
{
"epoch": 11.235075357212763,
"grad_norm": 0.9375,
"learning_rate": 0.0007752984928557448,
"loss": 6.4432,
"step": 287000
},
{
"epoch": 11.254648659228812,
"grad_norm": 1.5859375,
"learning_rate": 0.0007749070268154238,
"loss": 6.4451,
"step": 287500
},
{
"epoch": 11.274221961244862,
"grad_norm": 2.734375,
"learning_rate": 0.0007745155607751028,
"loss": 6.4532,
"step": 288000
},
{
"epoch": 11.293795263260913,
"grad_norm": 2.234375,
"learning_rate": 0.0007741240947347817,
"loss": 6.4489,
"step": 288500
},
{
"epoch": 11.313368565276962,
"grad_norm": 1.2734375,
"learning_rate": 0.0007737326286944608,
"loss": 6.4448,
"step": 289000
},
{
"epoch": 11.332941867293012,
"grad_norm": 1.5859375,
"learning_rate": 0.0007733411626541398,
"loss": 6.4421,
"step": 289500
},
{
"epoch": 11.352515169309063,
"grad_norm": 1.2109375,
"learning_rate": 0.0007729496966138187,
"loss": 6.4408,
"step": 290000
},
{
"epoch": 11.372088471325112,
"grad_norm": 1.125,
"learning_rate": 0.0007725582305734977,
"loss": 6.4487,
"step": 290500
},
{
"epoch": 11.391661773341163,
"grad_norm": 1.171875,
"learning_rate": 0.0007721667645331768,
"loss": 6.4488,
"step": 291000
},
{
"epoch": 11.411235075357213,
"grad_norm": 1.5625,
"learning_rate": 0.0007717752984928558,
"loss": 6.4394,
"step": 291500
},
{
"epoch": 11.430808377373262,
"grad_norm": 1.40625,
"learning_rate": 0.0007713838324525347,
"loss": 6.4548,
"step": 292000
},
{
"epoch": 11.450381679389313,
"grad_norm": 1.4375,
"learning_rate": 0.0007709923664122137,
"loss": 6.454,
"step": 292500
},
{
"epoch": 11.469954981405364,
"grad_norm": 1.9375,
"learning_rate": 0.0007706009003718928,
"loss": 6.4516,
"step": 293000
},
{
"epoch": 11.489528283421413,
"grad_norm": 1.2109375,
"learning_rate": 0.0007702094343315718,
"loss": 6.4482,
"step": 293500
},
{
"epoch": 11.509101585437463,
"grad_norm": 2.40625,
"learning_rate": 0.0007698179682912507,
"loss": 6.4602,
"step": 294000
},
{
"epoch": 11.528674887453514,
"grad_norm": 1.453125,
"learning_rate": 0.0007694265022509297,
"loss": 6.4459,
"step": 294500
},
{
"epoch": 11.548248189469563,
"grad_norm": 1.4375,
"learning_rate": 0.0007690350362106088,
"loss": 6.4508,
"step": 295000
},
{
"epoch": 11.567821491485613,
"grad_norm": 1.8203125,
"learning_rate": 0.0007686435701702878,
"loss": 6.4459,
"step": 295500
},
{
"epoch": 11.587394793501664,
"grad_norm": 4.78125,
"learning_rate": 0.0007682521041299667,
"loss": 6.4517,
"step": 296000
},
{
"epoch": 11.606968095517713,
"grad_norm": 1.3125,
"learning_rate": 0.0007678606380896458,
"loss": 6.4519,
"step": 296500
},
{
"epoch": 11.626541397533764,
"grad_norm": 1.3515625,
"learning_rate": 0.0007674691720493248,
"loss": 6.4474,
"step": 297000
},
{
"epoch": 11.646114699549814,
"grad_norm": 1.078125,
"learning_rate": 0.0007670777060090038,
"loss": 6.4506,
"step": 297500
},
{
"epoch": 11.665688001565865,
"grad_norm": 0.9296875,
"learning_rate": 0.0007666862399686827,
"loss": 6.4396,
"step": 298000
},
{
"epoch": 11.685261303581914,
"grad_norm": 1.0703125,
"learning_rate": 0.0007662947739283618,
"loss": 6.4453,
"step": 298500
},
{
"epoch": 11.704834605597965,
"grad_norm": 1.9296875,
"learning_rate": 0.0007659033078880408,
"loss": 6.4414,
"step": 299000
},
{
"epoch": 11.724407907614015,
"grad_norm": 1.46875,
"learning_rate": 0.0007655118418477198,
"loss": 6.4513,
"step": 299500
},
{
"epoch": 11.743981209630064,
"grad_norm": 3.53125,
"learning_rate": 0.0007651203758073986,
"loss": 6.4501,
"step": 300000
},
{
"epoch": 11.763554511646115,
"grad_norm": 1.4765625,
"learning_rate": 0.0007647289097670778,
"loss": 6.4471,
"step": 300500
},
{
"epoch": 11.783127813662166,
"grad_norm": 2.3125,
"learning_rate": 0.0007643374437267568,
"loss": 6.4567,
"step": 301000
},
{
"epoch": 11.802701115678214,
"grad_norm": 1.59375,
"learning_rate": 0.0007639459776864357,
"loss": 6.4509,
"step": 301500
},
{
"epoch": 11.822274417694265,
"grad_norm": 1.140625,
"learning_rate": 0.0007635545116461146,
"loss": 6.4451,
"step": 302000
},
{
"epoch": 11.841847719710316,
"grad_norm": 1.7421875,
"learning_rate": 0.0007631630456057937,
"loss": 6.4524,
"step": 302500
},
{
"epoch": 11.861421021726365,
"grad_norm": 12.8125,
"learning_rate": 0.0007627715795654727,
"loss": 6.4502,
"step": 303000
},
{
"epoch": 11.880994323742415,
"grad_norm": 1.8203125,
"learning_rate": 0.0007623801135251517,
"loss": 6.4476,
"step": 303500
},
{
"epoch": 11.900567625758466,
"grad_norm": 1.421875,
"learning_rate": 0.0007619886474848306,
"loss": 6.4338,
"step": 304000
},
{
"epoch": 11.920140927774515,
"grad_norm": 1.9296875,
"learning_rate": 0.0007615971814445097,
"loss": 6.4472,
"step": 304500
},
{
"epoch": 11.939714229790566,
"grad_norm": 2.78125,
"learning_rate": 0.0007612057154041887,
"loss": 6.4465,
"step": 305000
},
{
"epoch": 11.959287531806616,
"grad_norm": 1.2734375,
"learning_rate": 0.0007608142493638676,
"loss": 6.4529,
"step": 305500
},
{
"epoch": 11.978860833822665,
"grad_norm": 1.375,
"learning_rate": 0.0007604227833235467,
"loss": 6.4456,
"step": 306000
},
{
"epoch": 11.998434135838716,
"grad_norm": 2.015625,
"learning_rate": 0.0007600313172832257,
"loss": 6.4525,
"step": 306500
},
{
"epoch": 12.0,
"eval_loss": 6.446938514709473,
"eval_runtime": 23.9647,
"eval_samples_per_second": 83.456,
"eval_steps_per_second": 5.216,
"step": 306540
},
{
"epoch": 12.018007437854767,
"grad_norm": 1.4453125,
"learning_rate": 0.0007596398512429047,
"loss": 6.4458,
"step": 307000
},
{
"epoch": 12.037580739870815,
"grad_norm": 1.734375,
"learning_rate": 0.0007592483852025836,
"loss": 6.4423,
"step": 307500
},
{
"epoch": 12.057154041886866,
"grad_norm": 11.3125,
"learning_rate": 0.0007588569191622627,
"loss": 6.4446,
"step": 308000
},
{
"epoch": 12.076727343902917,
"grad_norm": 1.3828125,
"learning_rate": 0.0007584654531219417,
"loss": 6.4461,
"step": 308500
},
{
"epoch": 12.096300645918966,
"grad_norm": 1.84375,
"learning_rate": 0.0007580739870816207,
"loss": 6.4431,
"step": 309000
},
{
"epoch": 12.115873947935016,
"grad_norm": 2.421875,
"learning_rate": 0.0007576825210412996,
"loss": 6.4439,
"step": 309500
},
{
"epoch": 12.135447249951067,
"grad_norm": 3.03125,
"learning_rate": 0.0007572910550009787,
"loss": 6.4428,
"step": 310000
},
{
"epoch": 12.155020551967118,
"grad_norm": 1.046875,
"learning_rate": 0.0007568995889606577,
"loss": 6.4456,
"step": 310500
},
{
"epoch": 12.174593853983167,
"grad_norm": 1.0390625,
"learning_rate": 0.0007565081229203367,
"loss": 6.4479,
"step": 311000
},
{
"epoch": 12.194167155999217,
"grad_norm": 2.640625,
"learning_rate": 0.0007561166568800156,
"loss": 6.4457,
"step": 311500
},
{
"epoch": 12.213740458015268,
"grad_norm": 2.015625,
"learning_rate": 0.0007557251908396947,
"loss": 6.4405,
"step": 312000
},
{
"epoch": 12.233313760031317,
"grad_norm": 2.21875,
"learning_rate": 0.0007553337247993737,
"loss": 6.4557,
"step": 312500
},
{
"epoch": 12.252887062047368,
"grad_norm": 1.28125,
"learning_rate": 0.0007549422587590527,
"loss": 6.4407,
"step": 313000
},
{
"epoch": 12.272460364063418,
"grad_norm": 1.421875,
"learning_rate": 0.0007545507927187316,
"loss": 6.4457,
"step": 313500
},
{
"epoch": 12.292033666079467,
"grad_norm": 1.1484375,
"learning_rate": 0.0007541593266784107,
"loss": 6.4471,
"step": 314000
},
{
"epoch": 12.311606968095518,
"grad_norm": 1.4765625,
"learning_rate": 0.0007537678606380897,
"loss": 6.4473,
"step": 314500
},
{
"epoch": 12.331180270111568,
"grad_norm": 3.03125,
"learning_rate": 0.0007533763945977687,
"loss": 6.4451,
"step": 315000
},
{
"epoch": 12.350753572127617,
"grad_norm": 1.1171875,
"learning_rate": 0.0007529849285574477,
"loss": 6.4336,
"step": 315500
},
{
"epoch": 12.370326874143668,
"grad_norm": 1.1875,
"learning_rate": 0.0007525934625171267,
"loss": 6.4454,
"step": 316000
},
{
"epoch": 12.389900176159719,
"grad_norm": 1.9609375,
"learning_rate": 0.0007522019964768057,
"loss": 6.4481,
"step": 316500
},
{
"epoch": 12.409473478175768,
"grad_norm": 1.796875,
"learning_rate": 0.0007518105304364847,
"loss": 6.4485,
"step": 317000
},
{
"epoch": 12.429046780191818,
"grad_norm": 1.5390625,
"learning_rate": 0.0007514190643961637,
"loss": 6.4553,
"step": 317500
},
{
"epoch": 12.448620082207869,
"grad_norm": 1.421875,
"learning_rate": 0.0007510275983558427,
"loss": 6.4435,
"step": 318000
},
{
"epoch": 12.468193384223918,
"grad_norm": 1.8046875,
"learning_rate": 0.0007506361323155217,
"loss": 6.4435,
"step": 318500
},
{
"epoch": 12.487766686239969,
"grad_norm": 2.578125,
"learning_rate": 0.0007502446662752007,
"loss": 6.4451,
"step": 319000
},
{
"epoch": 12.50733998825602,
"grad_norm": 1.6953125,
"learning_rate": 0.0007498532002348797,
"loss": 6.4511,
"step": 319500
},
{
"epoch": 12.526913290272068,
"grad_norm": 1.671875,
"learning_rate": 0.0007494617341945587,
"loss": 6.4384,
"step": 320000
},
{
"epoch": 12.546486592288119,
"grad_norm": 2.09375,
"learning_rate": 0.0007490702681542376,
"loss": 6.4448,
"step": 320500
},
{
"epoch": 12.56605989430417,
"grad_norm": 2.03125,
"learning_rate": 0.0007486788021139165,
"loss": 6.4514,
"step": 321000
},
{
"epoch": 12.58563319632022,
"grad_norm": 1.0390625,
"learning_rate": 0.0007482873360735956,
"loss": 6.4437,
"step": 321500
},
{
"epoch": 12.605206498336269,
"grad_norm": 1.3671875,
"learning_rate": 0.0007478958700332746,
"loss": 6.4439,
"step": 322000
},
{
"epoch": 12.62477980035232,
"grad_norm": 1.1953125,
"learning_rate": 0.0007475044039929536,
"loss": 6.4455,
"step": 322500
},
{
"epoch": 12.64435310236837,
"grad_norm": 0.875,
"learning_rate": 0.0007471129379526325,
"loss": 6.4413,
"step": 323000
},
{
"epoch": 12.66392640438442,
"grad_norm": 1.8125,
"learning_rate": 0.0007467214719123116,
"loss": 6.4366,
"step": 323500
},
{
"epoch": 12.68349970640047,
"grad_norm": 1.0234375,
"learning_rate": 0.0007463300058719906,
"loss": 6.4511,
"step": 324000
},
{
"epoch": 12.70307300841652,
"grad_norm": 1.625,
"learning_rate": 0.0007459385398316696,
"loss": 6.4384,
"step": 324500
},
{
"epoch": 12.72264631043257,
"grad_norm": 0.8203125,
"learning_rate": 0.0007455470737913485,
"loss": 6.4402,
"step": 325000
},
{
"epoch": 12.74221961244862,
"grad_norm": 4.0625,
"learning_rate": 0.0007451556077510276,
"loss": 6.4542,
"step": 325500
},
{
"epoch": 12.761792914464671,
"grad_norm": 2.328125,
"learning_rate": 0.0007447641417107066,
"loss": 6.4417,
"step": 326000
},
{
"epoch": 12.78136621648072,
"grad_norm": 5.4375,
"learning_rate": 0.0007443726756703856,
"loss": 6.4429,
"step": 326500
},
{
"epoch": 12.80093951849677,
"grad_norm": 1.375,
"learning_rate": 0.0007439812096300646,
"loss": 6.45,
"step": 327000
},
{
"epoch": 12.820512820512821,
"grad_norm": 1.703125,
"learning_rate": 0.0007435897435897436,
"loss": 6.4472,
"step": 327500
},
{
"epoch": 12.84008612252887,
"grad_norm": 1.9453125,
"learning_rate": 0.0007431982775494226,
"loss": 6.438,
"step": 328000
},
{
"epoch": 12.85965942454492,
"grad_norm": 2.234375,
"learning_rate": 0.0007428068115091016,
"loss": 6.451,
"step": 328500
},
{
"epoch": 12.879232726560971,
"grad_norm": 1.53125,
"learning_rate": 0.0007424153454687806,
"loss": 6.4495,
"step": 329000
},
{
"epoch": 12.89880602857702,
"grad_norm": 8.375,
"learning_rate": 0.0007420238794284596,
"loss": 6.4412,
"step": 329500
},
{
"epoch": 12.918379330593071,
"grad_norm": 1.1640625,
"learning_rate": 0.0007416324133881386,
"loss": 6.4584,
"step": 330000
},
{
"epoch": 12.937952632609122,
"grad_norm": 1.71875,
"learning_rate": 0.0007412409473478176,
"loss": 6.4446,
"step": 330500
},
{
"epoch": 12.95752593462517,
"grad_norm": 1.8828125,
"learning_rate": 0.0007408494813074966,
"loss": 6.4465,
"step": 331000
},
{
"epoch": 12.977099236641221,
"grad_norm": 1.890625,
"learning_rate": 0.0007404580152671756,
"loss": 6.4441,
"step": 331500
},
{
"epoch": 12.996672538657272,
"grad_norm": 2.453125,
"learning_rate": 0.0007400665492268546,
"loss": 6.4464,
"step": 332000
},
{
"epoch": 13.0,
"eval_loss": 6.443148612976074,
"eval_runtime": 21.9848,
"eval_samples_per_second": 90.972,
"eval_steps_per_second": 5.686,
"step": 332085
},
{
"epoch": 13.01624584067332,
"grad_norm": 3.203125,
"learning_rate": 0.0007396750831865336,
"loss": 6.4378,
"step": 332500
},
{
"epoch": 13.035819142689371,
"grad_norm": 1.34375,
"learning_rate": 0.0007392836171462126,
"loss": 6.4393,
"step": 333000
},
{
"epoch": 13.055392444705422,
"grad_norm": 1.1328125,
"learning_rate": 0.0007388921511058916,
"loss": 6.441,
"step": 333500
},
{
"epoch": 13.074965746721471,
"grad_norm": 4.875,
"learning_rate": 0.0007385006850655706,
"loss": 6.4441,
"step": 334000
},
{
"epoch": 13.094539048737522,
"grad_norm": 1.0,
"learning_rate": 0.0007381092190252496,
"loss": 6.4412,
"step": 334500
},
{
"epoch": 13.114112350753572,
"grad_norm": 1.4296875,
"learning_rate": 0.0007377177529849286,
"loss": 6.4431,
"step": 335000
},
{
"epoch": 13.133685652769623,
"grad_norm": 1.9921875,
"learning_rate": 0.0007373262869446076,
"loss": 6.4336,
"step": 335500
},
{
"epoch": 13.153258954785672,
"grad_norm": 1.015625,
"learning_rate": 0.0007369348209042866,
"loss": 6.4395,
"step": 336000
},
{
"epoch": 13.172832256801723,
"grad_norm": 2.046875,
"learning_rate": 0.0007365433548639657,
"loss": 6.4369,
"step": 336500
},
{
"epoch": 13.192405558817773,
"grad_norm": 1.1875,
"learning_rate": 0.0007361518888236446,
"loss": 6.4482,
"step": 337000
},
{
"epoch": 13.211978860833822,
"grad_norm": 4.46875,
"learning_rate": 0.0007357604227833236,
"loss": 6.45,
"step": 337500
},
{
"epoch": 13.231552162849873,
"grad_norm": 1.6484375,
"learning_rate": 0.0007353689567430026,
"loss": 6.4423,
"step": 338000
},
{
"epoch": 13.251125464865924,
"grad_norm": 1.59375,
"learning_rate": 0.0007349774907026816,
"loss": 6.4455,
"step": 338500
},
{
"epoch": 13.270698766881972,
"grad_norm": 1.6640625,
"learning_rate": 0.0007345860246623605,
"loss": 6.4393,
"step": 339000
},
{
"epoch": 13.290272068898023,
"grad_norm": 1.0,
"learning_rate": 0.0007341945586220395,
"loss": 6.4441,
"step": 339500
},
{
"epoch": 13.309845370914074,
"grad_norm": 1.4375,
"learning_rate": 0.0007338030925817185,
"loss": 6.4395,
"step": 340000
},
{
"epoch": 13.329418672930123,
"grad_norm": 1.5703125,
"learning_rate": 0.0007334116265413975,
"loss": 6.4375,
"step": 340500
},
{
"epoch": 13.348991974946173,
"grad_norm": 1.46875,
"learning_rate": 0.0007330201605010765,
"loss": 6.438,
"step": 341000
},
{
"epoch": 13.368565276962224,
"grad_norm": 1.6015625,
"learning_rate": 0.0007326286944607555,
"loss": 6.4392,
"step": 341500
},
{
"epoch": 13.388138578978273,
"grad_norm": 1.3359375,
"learning_rate": 0.0007322372284204345,
"loss": 6.4433,
"step": 342000
},
{
"epoch": 13.407711880994324,
"grad_norm": 1.46875,
"learning_rate": 0.0007318457623801135,
"loss": 6.4467,
"step": 342500
},
{
"epoch": 13.427285183010374,
"grad_norm": 0.87890625,
"learning_rate": 0.0007314542963397925,
"loss": 6.4401,
"step": 343000
},
{
"epoch": 13.446858485026423,
"grad_norm": 0.87109375,
"learning_rate": 0.0007310628302994715,
"loss": 6.4414,
"step": 343500
},
{
"epoch": 13.466431787042474,
"grad_norm": 0.98046875,
"learning_rate": 0.0007306713642591505,
"loss": 6.4451,
"step": 344000
},
{
"epoch": 13.486005089058525,
"grad_norm": 2.921875,
"learning_rate": 0.0007302798982188295,
"loss": 6.4419,
"step": 344500
},
{
"epoch": 13.505578391074573,
"grad_norm": 2.296875,
"learning_rate": 0.0007298884321785085,
"loss": 6.439,
"step": 345000
},
{
"epoch": 13.525151693090624,
"grad_norm": 14.0625,
"learning_rate": 0.0007294969661381875,
"loss": 6.4437,
"step": 345500
},
{
"epoch": 13.544724995106675,
"grad_norm": 1.2578125,
"learning_rate": 0.0007291055000978665,
"loss": 6.4399,
"step": 346000
},
{
"epoch": 13.564298297122726,
"grad_norm": 11.5625,
"learning_rate": 0.0007287140340575455,
"loss": 6.4447,
"step": 346500
},
{
"epoch": 13.583871599138774,
"grad_norm": 0.91796875,
"learning_rate": 0.0007283225680172245,
"loss": 6.4373,
"step": 347000
},
{
"epoch": 13.603444901154825,
"grad_norm": 1.9375,
"learning_rate": 0.0007279311019769035,
"loss": 6.4484,
"step": 347500
},
{
"epoch": 13.623018203170876,
"grad_norm": 1.734375,
"learning_rate": 0.0007275396359365826,
"loss": 6.4486,
"step": 348000
},
{
"epoch": 13.642591505186925,
"grad_norm": 1.4765625,
"learning_rate": 0.0007271481698962615,
"loss": 6.4417,
"step": 348500
},
{
"epoch": 13.662164807202975,
"grad_norm": 0.88671875,
"learning_rate": 0.0007267567038559405,
"loss": 6.4397,
"step": 349000
},
{
"epoch": 13.681738109219026,
"grad_norm": 1.84375,
"learning_rate": 0.0007263652378156195,
"loss": 6.4328,
"step": 349500
},
{
"epoch": 13.701311411235075,
"grad_norm": 1.1171875,
"learning_rate": 0.0007259737717752986,
"loss": 6.4462,
"step": 350000
},
{
"epoch": 13.720884713251126,
"grad_norm": 4.15625,
"learning_rate": 0.0007255823057349775,
"loss": 6.4394,
"step": 350500
},
{
"epoch": 13.740458015267176,
"grad_norm": 1.40625,
"learning_rate": 0.0007251908396946565,
"loss": 6.4434,
"step": 351000
},
{
"epoch": 13.760031317283225,
"grad_norm": 1.6328125,
"learning_rate": 0.0007247993736543355,
"loss": 6.4437,
"step": 351500
},
{
"epoch": 13.779604619299276,
"grad_norm": 1.5859375,
"learning_rate": 0.0007244079076140146,
"loss": 6.4386,
"step": 352000
},
{
"epoch": 13.799177921315327,
"grad_norm": 1.4609375,
"learning_rate": 0.0007240164415736935,
"loss": 6.4476,
"step": 352500
},
{
"epoch": 13.818751223331375,
"grad_norm": 2.515625,
"learning_rate": 0.0007236249755333725,
"loss": 6.4442,
"step": 353000
},
{
"epoch": 13.838324525347426,
"grad_norm": 2.203125,
"learning_rate": 0.0007232335094930515,
"loss": 6.44,
"step": 353500
},
{
"epoch": 13.857897827363477,
"grad_norm": 0.99609375,
"learning_rate": 0.0007228420434527305,
"loss": 6.4395,
"step": 354000
},
{
"epoch": 13.877471129379526,
"grad_norm": 2.75,
"learning_rate": 0.0007224505774124095,
"loss": 6.4392,
"step": 354500
},
{
"epoch": 13.897044431395576,
"grad_norm": 0.9140625,
"learning_rate": 0.0007220591113720885,
"loss": 6.4484,
"step": 355000
},
{
"epoch": 13.916617733411627,
"grad_norm": 1.3203125,
"learning_rate": 0.0007216676453317675,
"loss": 6.4449,
"step": 355500
},
{
"epoch": 13.936191035427676,
"grad_norm": 1.390625,
"learning_rate": 0.0007212761792914465,
"loss": 6.4444,
"step": 356000
},
{
"epoch": 13.955764337443727,
"grad_norm": 3.921875,
"learning_rate": 0.0007208847132511255,
"loss": 6.4438,
"step": 356500
},
{
"epoch": 13.975337639459777,
"grad_norm": 4.71875,
"learning_rate": 0.0007204932472108045,
"loss": 6.4417,
"step": 357000
},
{
"epoch": 13.994910941475826,
"grad_norm": 3.65625,
"learning_rate": 0.0007201017811704836,
"loss": 6.4479,
"step": 357500
},
{
"epoch": 14.0,
"eval_loss": 6.441241264343262,
"eval_runtime": 23.2496,
"eval_samples_per_second": 86.023,
"eval_steps_per_second": 5.376,
"step": 357630
},
{
"epoch": 14.014484243491877,
"grad_norm": 1.4296875,
"learning_rate": 0.0007197103151301624,
"loss": 6.4372,
"step": 358000
},
{
"epoch": 14.034057545507928,
"grad_norm": 0.9375,
"learning_rate": 0.0007193188490898414,
"loss": 6.4488,
"step": 358500
},
{
"epoch": 14.053630847523978,
"grad_norm": 1.234375,
"learning_rate": 0.0007189273830495204,
"loss": 6.441,
"step": 359000
},
{
"epoch": 14.073204149540027,
"grad_norm": 0.890625,
"learning_rate": 0.0007185359170091995,
"loss": 6.4405,
"step": 359500
},
{
"epoch": 14.092777451556078,
"grad_norm": 1.46875,
"learning_rate": 0.0007181444509688784,
"loss": 6.4363,
"step": 360000
},
{
"epoch": 14.112350753572128,
"grad_norm": 1.421875,
"learning_rate": 0.0007177529849285574,
"loss": 6.4307,
"step": 360500
},
{
"epoch": 14.131924055588177,
"grad_norm": 1.0078125,
"learning_rate": 0.0007173615188882364,
"loss": 6.4421,
"step": 361000
},
{
"epoch": 14.151497357604228,
"grad_norm": 2.703125,
"learning_rate": 0.0007169700528479155,
"loss": 6.4308,
"step": 361500
},
{
"epoch": 14.171070659620279,
"grad_norm": 1.59375,
"learning_rate": 0.0007165785868075944,
"loss": 6.445,
"step": 362000
},
{
"epoch": 14.190643961636328,
"grad_norm": 1.015625,
"learning_rate": 0.0007161871207672734,
"loss": 6.4366,
"step": 362500
},
{
"epoch": 14.210217263652378,
"grad_norm": 8.3125,
"learning_rate": 0.0007157956547269524,
"loss": 6.4377,
"step": 363000
},
{
"epoch": 14.229790565668429,
"grad_norm": 1.578125,
"learning_rate": 0.0007154041886866315,
"loss": 6.4332,
"step": 363500
},
{
"epoch": 14.249363867684478,
"grad_norm": 20.75,
"learning_rate": 0.0007150127226463104,
"loss": 6.4339,
"step": 364000
},
{
"epoch": 14.268937169700529,
"grad_norm": 1.5703125,
"learning_rate": 0.0007146212566059894,
"loss": 6.4443,
"step": 364500
},
{
"epoch": 14.28851047171658,
"grad_norm": 1.265625,
"learning_rate": 0.0007142297905656684,
"loss": 6.4381,
"step": 365000
},
{
"epoch": 14.308083773732628,
"grad_norm": 0.9765625,
"learning_rate": 0.0007138383245253475,
"loss": 6.4398,
"step": 365500
},
{
"epoch": 14.327657075748679,
"grad_norm": 1.3125,
"learning_rate": 0.0007134468584850264,
"loss": 6.4347,
"step": 366000
},
{
"epoch": 14.34723037776473,
"grad_norm": 1.2578125,
"learning_rate": 0.0007130553924447054,
"loss": 6.4376,
"step": 366500
},
{
"epoch": 14.366803679780778,
"grad_norm": 2.3125,
"learning_rate": 0.0007126639264043845,
"loss": 6.4392,
"step": 367000
},
{
"epoch": 14.386376981796829,
"grad_norm": 2.5,
"learning_rate": 0.0007122724603640635,
"loss": 6.4388,
"step": 367500
},
{
"epoch": 14.40595028381288,
"grad_norm": 0.93359375,
"learning_rate": 0.0007118809943237424,
"loss": 6.4346,
"step": 368000
},
{
"epoch": 14.425523585828929,
"grad_norm": 1.34375,
"learning_rate": 0.0007114895282834214,
"loss": 6.4427,
"step": 368500
},
{
"epoch": 14.44509688784498,
"grad_norm": 1.015625,
"learning_rate": 0.0007110980622431005,
"loss": 6.4425,
"step": 369000
},
{
"epoch": 14.46467018986103,
"grad_norm": 1.9921875,
"learning_rate": 0.0007107065962027795,
"loss": 6.4497,
"step": 369500
},
{
"epoch": 14.484243491877079,
"grad_norm": 1.4140625,
"learning_rate": 0.0007103151301624584,
"loss": 6.4378,
"step": 370000
},
{
"epoch": 14.50381679389313,
"grad_norm": 1.109375,
"learning_rate": 0.0007099236641221374,
"loss": 6.4414,
"step": 370500
},
{
"epoch": 14.52339009590918,
"grad_norm": 1.1015625,
"learning_rate": 0.0007095321980818165,
"loss": 6.4258,
"step": 371000
},
{
"epoch": 14.54296339792523,
"grad_norm": 4.8125,
"learning_rate": 0.0007091407320414954,
"loss": 6.4274,
"step": 371500
},
{
"epoch": 14.56253669994128,
"grad_norm": 4.5625,
"learning_rate": 0.0007087492660011744,
"loss": 6.4458,
"step": 372000
},
{
"epoch": 14.58211000195733,
"grad_norm": 1.8671875,
"learning_rate": 0.0007083577999608534,
"loss": 6.4381,
"step": 372500
},
{
"epoch": 14.601683303973381,
"grad_norm": 4.125,
"learning_rate": 0.0007079663339205325,
"loss": 6.441,
"step": 373000
},
{
"epoch": 14.62125660598943,
"grad_norm": 1.09375,
"learning_rate": 0.0007075748678802114,
"loss": 6.4338,
"step": 373500
},
{
"epoch": 14.64082990800548,
"grad_norm": 1.953125,
"learning_rate": 0.0007071834018398904,
"loss": 6.4328,
"step": 374000
},
{
"epoch": 14.660403210021531,
"grad_norm": 3.375,
"learning_rate": 0.0007067919357995694,
"loss": 6.4386,
"step": 374500
},
{
"epoch": 14.67997651203758,
"grad_norm": 4.6875,
"learning_rate": 0.0007064004697592485,
"loss": 6.4317,
"step": 375000
},
{
"epoch": 14.699549814053631,
"grad_norm": 1.0234375,
"learning_rate": 0.0007060090037189274,
"loss": 6.4334,
"step": 375500
},
{
"epoch": 14.719123116069682,
"grad_norm": 2.296875,
"learning_rate": 0.0007056175376786064,
"loss": 6.4464,
"step": 376000
},
{
"epoch": 14.73869641808573,
"grad_norm": 0.90625,
"learning_rate": 0.0007052260716382854,
"loss": 6.4376,
"step": 376500
},
{
"epoch": 14.758269720101781,
"grad_norm": 3.546875,
"learning_rate": 0.0007048346055979645,
"loss": 6.4439,
"step": 377000
},
{
"epoch": 14.777843022117832,
"grad_norm": 1.6171875,
"learning_rate": 0.0007044431395576433,
"loss": 6.4408,
"step": 377500
},
{
"epoch": 14.79741632413388,
"grad_norm": 2.015625,
"learning_rate": 0.0007040516735173223,
"loss": 6.434,
"step": 378000
},
{
"epoch": 14.816989626149931,
"grad_norm": 1.8203125,
"learning_rate": 0.0007036602074770014,
"loss": 6.4404,
"step": 378500
},
{
"epoch": 14.836562928165982,
"grad_norm": 3.484375,
"learning_rate": 0.0007032687414366804,
"loss": 6.4377,
"step": 379000
},
{
"epoch": 14.856136230182031,
"grad_norm": 1.890625,
"learning_rate": 0.0007028772753963593,
"loss": 6.4332,
"step": 379500
},
{
"epoch": 14.875709532198082,
"grad_norm": 3.625,
"learning_rate": 0.0007024858093560383,
"loss": 6.4384,
"step": 380000
},
{
"epoch": 14.895282834214132,
"grad_norm": 1.328125,
"learning_rate": 0.0007020943433157174,
"loss": 6.4439,
"step": 380500
},
{
"epoch": 14.914856136230181,
"grad_norm": 6.625,
"learning_rate": 0.0007017028772753964,
"loss": 6.439,
"step": 381000
},
{
"epoch": 14.934429438246232,
"grad_norm": 3.28125,
"learning_rate": 0.0007013114112350753,
"loss": 6.4396,
"step": 381500
},
{
"epoch": 14.954002740262283,
"grad_norm": 0.828125,
"learning_rate": 0.0007009199451947543,
"loss": 6.4498,
"step": 382000
},
{
"epoch": 14.973576042278331,
"grad_norm": 1.4140625,
"learning_rate": 0.0007005284791544334,
"loss": 6.4371,
"step": 382500
},
{
"epoch": 14.993149344294382,
"grad_norm": 24.375,
"learning_rate": 0.0007001370131141124,
"loss": 6.4382,
"step": 383000
},
{
"epoch": 15.0,
"eval_loss": 6.4391045570373535,
"eval_runtime": 20.2509,
"eval_samples_per_second": 98.761,
"eval_steps_per_second": 6.173,
"step": 383175
},
{
"epoch": 15.012722646310433,
"grad_norm": 4.21875,
"learning_rate": 0.0006997455470737913,
"loss": 6.4283,
"step": 383500
},
{
"epoch": 15.032295948326484,
"grad_norm": 1.484375,
"learning_rate": 0.0006993540810334703,
"loss": 6.4405,
"step": 384000
},
{
"epoch": 15.051869250342532,
"grad_norm": 1.1328125,
"learning_rate": 0.0006989626149931494,
"loss": 6.4427,
"step": 384500
},
{
"epoch": 15.071442552358583,
"grad_norm": 4.625,
"learning_rate": 0.0006985711489528284,
"loss": 6.4322,
"step": 385000
},
{
"epoch": 15.091015854374634,
"grad_norm": 1.125,
"learning_rate": 0.0006981796829125073,
"loss": 6.4296,
"step": 385500
},
{
"epoch": 15.110589156390683,
"grad_norm": 2.296875,
"learning_rate": 0.0006977882168721863,
"loss": 6.437,
"step": 386000
},
{
"epoch": 15.130162458406733,
"grad_norm": 2.078125,
"learning_rate": 0.0006973967508318654,
"loss": 6.4392,
"step": 386500
},
{
"epoch": 15.149735760422784,
"grad_norm": 4.5625,
"learning_rate": 0.0006970052847915443,
"loss": 6.4383,
"step": 387000
},
{
"epoch": 15.169309062438833,
"grad_norm": 1.453125,
"learning_rate": 0.0006966138187512233,
"loss": 6.4385,
"step": 387500
},
{
"epoch": 15.188882364454884,
"grad_norm": 6.0,
"learning_rate": 0.0006962223527109024,
"loss": 6.4358,
"step": 388000
},
{
"epoch": 15.208455666470934,
"grad_norm": 1.5234375,
"learning_rate": 0.0006958308866705814,
"loss": 6.4298,
"step": 388500
},
{
"epoch": 15.228028968486983,
"grad_norm": 2.796875,
"learning_rate": 0.0006954394206302603,
"loss": 6.4386,
"step": 389000
},
{
"epoch": 15.247602270503034,
"grad_norm": 1.546875,
"learning_rate": 0.0006950479545899393,
"loss": 6.4419,
"step": 389500
},
{
"epoch": 15.267175572519085,
"grad_norm": 0.80859375,
"learning_rate": 0.0006946564885496184,
"loss": 6.4326,
"step": 390000
},
{
"epoch": 15.286748874535133,
"grad_norm": 1.3046875,
"learning_rate": 0.0006942650225092974,
"loss": 6.4367,
"step": 390500
},
{
"epoch": 15.306322176551184,
"grad_norm": 1.40625,
"learning_rate": 0.0006938735564689763,
"loss": 6.4375,
"step": 391000
},
{
"epoch": 15.325895478567235,
"grad_norm": 1.4921875,
"learning_rate": 0.0006934820904286553,
"loss": 6.4316,
"step": 391500
},
{
"epoch": 15.345468780583284,
"grad_norm": 9.875,
"learning_rate": 0.0006930906243883344,
"loss": 6.4386,
"step": 392000
},
{
"epoch": 15.365042082599334,
"grad_norm": 1.421875,
"learning_rate": 0.0006926991583480134,
"loss": 6.4268,
"step": 392500
},
{
"epoch": 15.384615384615385,
"grad_norm": 0.92578125,
"learning_rate": 0.0006923076923076923,
"loss": 6.4301,
"step": 393000
},
{
"epoch": 15.404188686631434,
"grad_norm": 0.92578125,
"learning_rate": 0.0006919162262673713,
"loss": 6.4374,
"step": 393500
},
{
"epoch": 15.423761988647485,
"grad_norm": 2.0625,
"learning_rate": 0.0006915247602270504,
"loss": 6.4423,
"step": 394000
},
{
"epoch": 15.443335290663535,
"grad_norm": 1.5390625,
"learning_rate": 0.0006911332941867294,
"loss": 6.4376,
"step": 394500
},
{
"epoch": 15.462908592679586,
"grad_norm": 4.34375,
"learning_rate": 0.0006907418281464083,
"loss": 6.4384,
"step": 395000
},
{
"epoch": 15.482481894695635,
"grad_norm": 5.375,
"learning_rate": 0.0006903503621060873,
"loss": 6.4293,
"step": 395500
},
{
"epoch": 15.502055196711686,
"grad_norm": 2.203125,
"learning_rate": 0.0006899588960657664,
"loss": 6.4331,
"step": 396000
},
{
"epoch": 15.521628498727736,
"grad_norm": 1.7734375,
"learning_rate": 0.0006895674300254454,
"loss": 6.4326,
"step": 396500
},
{
"epoch": 15.541201800743785,
"grad_norm": 1.890625,
"learning_rate": 0.0006891759639851242,
"loss": 6.4296,
"step": 397000
},
{
"epoch": 15.560775102759836,
"grad_norm": 1.015625,
"learning_rate": 0.0006887844979448033,
"loss": 6.4363,
"step": 397500
},
{
"epoch": 15.580348404775886,
"grad_norm": 1.4453125,
"learning_rate": 0.0006883930319044823,
"loss": 6.4393,
"step": 398000
},
{
"epoch": 15.599921706791935,
"grad_norm": 9.6875,
"learning_rate": 0.0006880015658641613,
"loss": 6.4373,
"step": 398500
},
{
"epoch": 15.619495008807986,
"grad_norm": 2.953125,
"learning_rate": 0.0006876100998238402,
"loss": 6.4404,
"step": 399000
},
{
"epoch": 15.639068310824037,
"grad_norm": 1.421875,
"learning_rate": 0.0006872186337835193,
"loss": 6.4434,
"step": 399500
},
{
"epoch": 15.658641612840086,
"grad_norm": 1.40625,
"learning_rate": 0.0006868271677431983,
"loss": 6.4329,
"step": 400000
},
{
"epoch": 15.678214914856136,
"grad_norm": 1.046875,
"learning_rate": 0.0006864357017028773,
"loss": 6.4316,
"step": 400500
},
{
"epoch": 15.697788216872187,
"grad_norm": 3.109375,
"learning_rate": 0.0006860442356625562,
"loss": 6.4376,
"step": 401000
},
{
"epoch": 15.717361518888236,
"grad_norm": 0.875,
"learning_rate": 0.0006856527696222353,
"loss": 6.4376,
"step": 401500
},
{
"epoch": 15.736934820904287,
"grad_norm": 1.421875,
"learning_rate": 0.0006852613035819143,
"loss": 6.434,
"step": 402000
},
{
"epoch": 15.756508122920337,
"grad_norm": 1.8359375,
"learning_rate": 0.0006848698375415932,
"loss": 6.441,
"step": 402500
},
{
"epoch": 15.776081424936386,
"grad_norm": 1.5546875,
"learning_rate": 0.0006844783715012722,
"loss": 6.4375,
"step": 403000
},
{
"epoch": 15.795654726952437,
"grad_norm": 15.9375,
"learning_rate": 0.0006840869054609513,
"loss": 6.431,
"step": 403500
},
{
"epoch": 15.815228028968487,
"grad_norm": 1.453125,
"learning_rate": 0.0006836954394206303,
"loss": 6.4348,
"step": 404000
},
{
"epoch": 15.834801330984536,
"grad_norm": 1.9140625,
"learning_rate": 0.0006833039733803092,
"loss": 6.4251,
"step": 404500
},
{
"epoch": 15.854374633000587,
"grad_norm": 1.453125,
"learning_rate": 0.0006829125073399882,
"loss": 6.4409,
"step": 405000
},
{
"epoch": 15.873947935016638,
"grad_norm": 1.3125,
"learning_rate": 0.0006825210412996673,
"loss": 6.4516,
"step": 405500
},
{
"epoch": 15.893521237032687,
"grad_norm": 2.65625,
"learning_rate": 0.0006821295752593463,
"loss": 6.4334,
"step": 406000
},
{
"epoch": 15.913094539048737,
"grad_norm": 2.203125,
"learning_rate": 0.0006817381092190252,
"loss": 6.4318,
"step": 406500
},
{
"epoch": 15.932667841064788,
"grad_norm": 1.3828125,
"learning_rate": 0.0006813466431787042,
"loss": 6.4306,
"step": 407000
},
{
"epoch": 15.952241143080837,
"grad_norm": 1.1015625,
"learning_rate": 0.0006809551771383833,
"loss": 6.437,
"step": 407500
},
{
"epoch": 15.971814445096888,
"grad_norm": 0.9375,
"learning_rate": 0.0006805637110980623,
"loss": 6.4377,
"step": 408000
},
{
"epoch": 15.991387747112938,
"grad_norm": 4.25,
"learning_rate": 0.0006801722450577412,
"loss": 6.4369,
"step": 408500
},
{
"epoch": 16.0,
"eval_loss": 6.43704080581665,
"eval_runtime": 20.4936,
"eval_samples_per_second": 97.592,
"eval_steps_per_second": 6.099,
"step": 408720
},
{
"epoch": 16.010961049128987,
"grad_norm": 2.109375,
"learning_rate": 0.0006797807790174203,
"loss": 6.427,
"step": 409000
},
{
"epoch": 16.03053435114504,
"grad_norm": 1.1953125,
"learning_rate": 0.0006793893129770993,
"loss": 6.4358,
"step": 409500
},
{
"epoch": 16.05010765316109,
"grad_norm": 1.1796875,
"learning_rate": 0.0006789978469367783,
"loss": 6.4409,
"step": 410000
},
{
"epoch": 16.069680955177137,
"grad_norm": 1.3828125,
"learning_rate": 0.0006786063808964572,
"loss": 6.439,
"step": 410500
},
{
"epoch": 16.08925425719319,
"grad_norm": 2.203125,
"learning_rate": 0.0006782149148561363,
"loss": 6.4372,
"step": 411000
},
{
"epoch": 16.10882755920924,
"grad_norm": 3.09375,
"learning_rate": 0.0006778234488158153,
"loss": 6.4322,
"step": 411500
},
{
"epoch": 16.128400861225288,
"grad_norm": 1.046875,
"learning_rate": 0.0006774319827754943,
"loss": 6.4346,
"step": 412000
},
{
"epoch": 16.14797416324134,
"grad_norm": 1.0390625,
"learning_rate": 0.0006770405167351732,
"loss": 6.433,
"step": 412500
},
{
"epoch": 16.16754746525739,
"grad_norm": 1.5390625,
"learning_rate": 0.0006766490506948523,
"loss": 6.4358,
"step": 413000
},
{
"epoch": 16.187120767273438,
"grad_norm": 1.9921875,
"learning_rate": 0.0006762575846545313,
"loss": 6.4389,
"step": 413500
},
{
"epoch": 16.20669406928949,
"grad_norm": 1.03125,
"learning_rate": 0.0006758661186142103,
"loss": 6.4403,
"step": 414000
},
{
"epoch": 16.22626737130554,
"grad_norm": 5.75,
"learning_rate": 0.0006754746525738891,
"loss": 6.4404,
"step": 414500
},
{
"epoch": 16.245840673321588,
"grad_norm": 1.5546875,
"learning_rate": 0.0006750831865335683,
"loss": 6.439,
"step": 415000
},
{
"epoch": 16.26541397533764,
"grad_norm": 3.125,
"learning_rate": 0.0006746917204932473,
"loss": 6.4303,
"step": 415500
},
{
"epoch": 16.28498727735369,
"grad_norm": 4.40625,
"learning_rate": 0.0006743002544529263,
"loss": 6.4322,
"step": 416000
},
{
"epoch": 16.30456057936974,
"grad_norm": 2.0625,
"learning_rate": 0.0006739087884126051,
"loss": 6.4349,
"step": 416500
},
{
"epoch": 16.32413388138579,
"grad_norm": 1.1796875,
"learning_rate": 0.0006735173223722842,
"loss": 6.4351,
"step": 417000
},
{
"epoch": 16.34370718340184,
"grad_norm": 2.0625,
"learning_rate": 0.0006731258563319632,
"loss": 6.4387,
"step": 417500
},
{
"epoch": 16.36328048541789,
"grad_norm": 1.46875,
"learning_rate": 0.0006727343902916421,
"loss": 6.4356,
"step": 418000
},
{
"epoch": 16.38285378743394,
"grad_norm": 1.59375,
"learning_rate": 0.0006723429242513212,
"loss": 6.4408,
"step": 418500
},
{
"epoch": 16.40242708944999,
"grad_norm": 1.953125,
"learning_rate": 0.0006719514582110002,
"loss": 6.4244,
"step": 419000
},
{
"epoch": 16.42200039146604,
"grad_norm": 1.1640625,
"learning_rate": 0.0006715599921706792,
"loss": 6.4338,
"step": 419500
},
{
"epoch": 16.44157369348209,
"grad_norm": 2.25,
"learning_rate": 0.0006711685261303581,
"loss": 6.435,
"step": 420000
},
{
"epoch": 16.46114699549814,
"grad_norm": 1.125,
"learning_rate": 0.0006707770600900372,
"loss": 6.4414,
"step": 420500
},
{
"epoch": 16.48072029751419,
"grad_norm": 2.234375,
"learning_rate": 0.0006703855940497162,
"loss": 6.4342,
"step": 421000
},
{
"epoch": 16.50029359953024,
"grad_norm": 2.5,
"learning_rate": 0.0006699941280093952,
"loss": 6.4382,
"step": 421500
},
{
"epoch": 16.51986690154629,
"grad_norm": 1.2734375,
"learning_rate": 0.0006696026619690741,
"loss": 6.433,
"step": 422000
},
{
"epoch": 16.53944020356234,
"grad_norm": 1.4609375,
"learning_rate": 0.0006692111959287532,
"loss": 6.4282,
"step": 422500
},
{
"epoch": 16.559013505578392,
"grad_norm": 1.0546875,
"learning_rate": 0.0006688197298884322,
"loss": 6.4268,
"step": 423000
},
{
"epoch": 16.57858680759444,
"grad_norm": 2.5,
"learning_rate": 0.0006684282638481112,
"loss": 6.429,
"step": 423500
},
{
"epoch": 16.59816010961049,
"grad_norm": 0.9609375,
"learning_rate": 0.0006680367978077901,
"loss": 6.4419,
"step": 424000
},
{
"epoch": 16.617733411626542,
"grad_norm": 1.234375,
"learning_rate": 0.0006676453317674692,
"loss": 6.4278,
"step": 424500
},
{
"epoch": 16.63730671364259,
"grad_norm": 1.2265625,
"learning_rate": 0.0006672538657271482,
"loss": 6.4343,
"step": 425000
},
{
"epoch": 16.656880015658643,
"grad_norm": 1.015625,
"learning_rate": 0.0006668623996868272,
"loss": 6.4376,
"step": 425500
},
{
"epoch": 16.676453317674692,
"grad_norm": 10.6875,
"learning_rate": 0.0006664709336465061,
"loss": 6.4388,
"step": 426000
},
{
"epoch": 16.69602661969074,
"grad_norm": 1.015625,
"learning_rate": 0.0006660794676061852,
"loss": 6.434,
"step": 426500
},
{
"epoch": 16.715599921706794,
"grad_norm": 1.4140625,
"learning_rate": 0.0006656880015658642,
"loss": 6.4432,
"step": 427000
},
{
"epoch": 16.735173223722843,
"grad_norm": 1.09375,
"learning_rate": 0.0006652965355255432,
"loss": 6.4351,
"step": 427500
},
{
"epoch": 16.75474652573889,
"grad_norm": 1.484375,
"learning_rate": 0.0006649050694852222,
"loss": 6.4306,
"step": 428000
},
{
"epoch": 16.774319827754944,
"grad_norm": 1.2421875,
"learning_rate": 0.0006645136034449012,
"loss": 6.4337,
"step": 428500
},
{
"epoch": 16.793893129770993,
"grad_norm": 1.6640625,
"learning_rate": 0.0006641221374045802,
"loss": 6.4353,
"step": 429000
},
{
"epoch": 16.81346643178704,
"grad_norm": 1.421875,
"learning_rate": 0.0006637306713642592,
"loss": 6.4256,
"step": 429500
},
{
"epoch": 16.833039733803094,
"grad_norm": 1.6796875,
"learning_rate": 0.0006633392053239382,
"loss": 6.4321,
"step": 430000
},
{
"epoch": 16.852613035819143,
"grad_norm": 1.8671875,
"learning_rate": 0.0006629477392836172,
"loss": 6.4416,
"step": 430500
},
{
"epoch": 16.872186337835192,
"grad_norm": 17.625,
"learning_rate": 0.0006625562732432962,
"loss": 6.4376,
"step": 431000
},
{
"epoch": 16.891759639851244,
"grad_norm": 1.2109375,
"learning_rate": 0.0006621648072029752,
"loss": 6.4386,
"step": 431500
},
{
"epoch": 16.911332941867293,
"grad_norm": 2.578125,
"learning_rate": 0.0006617733411626542,
"loss": 6.4389,
"step": 432000
},
{
"epoch": 16.930906243883342,
"grad_norm": 2.59375,
"learning_rate": 0.0006613818751223332,
"loss": 6.4396,
"step": 432500
},
{
"epoch": 16.950479545899395,
"grad_norm": 4.40625,
"learning_rate": 0.0006609904090820122,
"loss": 6.4354,
"step": 433000
},
{
"epoch": 16.970052847915444,
"grad_norm": 1.359375,
"learning_rate": 0.000660598943041691,
"loss": 6.4352,
"step": 433500
},
{
"epoch": 16.989626149931492,
"grad_norm": 1.109375,
"learning_rate": 0.0006602074770013702,
"loss": 6.4347,
"step": 434000
},
{
"epoch": 17.0,
"eval_loss": 6.436838150024414,
"eval_runtime": 20.9495,
"eval_samples_per_second": 95.467,
"eval_steps_per_second": 5.967,
"step": 434265
}
],
"logging_steps": 500,
"max_steps": 1277250,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3275457063057981e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}