{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.735632183908046, "eval_steps": 224, "global_step": 448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016420361247947454, "grad_norm": 3.2402076721191406, "learning_rate": 2e-05, "loss": 1.3625, "step": 1 }, { "epoch": 0.003284072249589491, "grad_norm": 3.661665201187134, "learning_rate": 4e-05, "loss": 1.381, "step": 2 }, { "epoch": 0.0049261083743842365, "grad_norm": 1.9840654134750366, "learning_rate": 6e-05, "loss": 1.3547, "step": 3 }, { "epoch": 0.006568144499178982, "grad_norm": 2.6273880004882812, "learning_rate": 8e-05, "loss": 1.3121, "step": 4 }, { "epoch": 0.008210180623973728, "grad_norm": 3.8101563453674316, "learning_rate": 0.0001, "loss": 1.1715, "step": 5 }, { "epoch": 0.009852216748768473, "grad_norm": 3.0695738792419434, "learning_rate": 9.99451152579583e-05, "loss": 1.0495, "step": 6 }, { "epoch": 0.011494252873563218, "grad_norm": 5.384305000305176, "learning_rate": 9.989023051591658e-05, "loss": 0.9456, "step": 7 }, { "epoch": 0.013136288998357963, "grad_norm": 2.041106939315796, "learning_rate": 9.983534577387486e-05, "loss": 0.8497, "step": 8 }, { "epoch": 0.014778325123152709, "grad_norm": 1.2920405864715576, "learning_rate": 9.978046103183315e-05, "loss": 0.7883, "step": 9 }, { "epoch": 0.016420361247947456, "grad_norm": 1.1431885957717896, "learning_rate": 9.972557628979144e-05, "loss": 0.7066, "step": 10 }, { "epoch": 0.0180623973727422, "grad_norm": 0.9917076826095581, "learning_rate": 9.967069154774973e-05, "loss": 0.6515, "step": 11 }, { "epoch": 0.019704433497536946, "grad_norm": 2.9505903720855713, "learning_rate": 9.961580680570802e-05, "loss": 0.6377, "step": 12 }, { "epoch": 0.021346469622331693, "grad_norm": 0.9103951454162598, "learning_rate": 9.956092206366631e-05, "loss": 0.6228, "step": 13 }, { "epoch": 0.022988505747126436, "grad_norm": 0.7334574460983276, "learning_rate": 9.950603732162458e-05, "loss": 0.6089, "step": 14 }, { "epoch": 0.024630541871921183, "grad_norm": 0.712534487247467, "learning_rate": 9.945115257958289e-05, "loss": 0.5819, "step": 15 }, { "epoch": 0.026272577996715927, "grad_norm": 0.7342341542243958, "learning_rate": 9.939626783754116e-05, "loss": 0.5829, "step": 16 }, { "epoch": 0.027914614121510674, "grad_norm": 0.7734478712081909, "learning_rate": 9.934138309549945e-05, "loss": 0.5564, "step": 17 }, { "epoch": 0.029556650246305417, "grad_norm": 0.6626582145690918, "learning_rate": 9.928649835345774e-05, "loss": 0.5999, "step": 18 }, { "epoch": 0.031198686371100164, "grad_norm": 0.5777038335800171, "learning_rate": 9.923161361141603e-05, "loss": 0.5228, "step": 19 }, { "epoch": 0.03284072249589491, "grad_norm": 0.574607253074646, "learning_rate": 9.917672886937431e-05, "loss": 0.5721, "step": 20 }, { "epoch": 0.034482758620689655, "grad_norm": 0.586651623249054, "learning_rate": 9.912184412733261e-05, "loss": 0.5195, "step": 21 }, { "epoch": 0.0361247947454844, "grad_norm": 0.5510492920875549, "learning_rate": 9.906695938529089e-05, "loss": 0.548, "step": 22 }, { "epoch": 0.03776683087027915, "grad_norm": 0.4726048707962036, "learning_rate": 9.901207464324919e-05, "loss": 0.4979, "step": 23 }, { "epoch": 0.03940886699507389, "grad_norm": 0.4883709251880646, "learning_rate": 9.895718990120747e-05, "loss": 0.5387, "step": 24 }, { "epoch": 0.041050903119868636, "grad_norm": 0.4817533791065216, "learning_rate": 9.890230515916575e-05, "loss": 0.5231, "step": 25 }, { "epoch": 0.042692939244663386, "grad_norm": 0.4868399500846863, "learning_rate": 9.884742041712404e-05, "loss": 0.5441, "step": 26 }, { "epoch": 0.04433497536945813, "grad_norm": 0.4424205422401428, "learning_rate": 9.879253567508233e-05, "loss": 0.4925, "step": 27 }, { "epoch": 0.04597701149425287, "grad_norm": 0.48101040720939636, "learning_rate": 9.873765093304061e-05, "loss": 0.5553, "step": 28 }, { "epoch": 0.047619047619047616, "grad_norm": 0.48247408866882324, "learning_rate": 9.868276619099891e-05, "loss": 0.5657, "step": 29 }, { "epoch": 0.04926108374384237, "grad_norm": 0.47568339109420776, "learning_rate": 9.862788144895719e-05, "loss": 0.5287, "step": 30 }, { "epoch": 0.05090311986863711, "grad_norm": 0.4449344575405121, "learning_rate": 9.857299670691548e-05, "loss": 0.5514, "step": 31 }, { "epoch": 0.052545155993431854, "grad_norm": 0.46201637387275696, "learning_rate": 9.851811196487377e-05, "loss": 0.5521, "step": 32 }, { "epoch": 0.054187192118226604, "grad_norm": 0.44108930230140686, "learning_rate": 9.846322722283206e-05, "loss": 0.499, "step": 33 }, { "epoch": 0.05582922824302135, "grad_norm": 0.4384128451347351, "learning_rate": 9.840834248079035e-05, "loss": 0.5382, "step": 34 }, { "epoch": 0.05747126436781609, "grad_norm": 0.4624490439891815, "learning_rate": 9.835345773874864e-05, "loss": 0.4857, "step": 35 }, { "epoch": 0.059113300492610835, "grad_norm": 0.46828410029411316, "learning_rate": 9.829857299670691e-05, "loss": 0.4699, "step": 36 }, { "epoch": 0.060755336617405585, "grad_norm": 0.44205111265182495, "learning_rate": 9.824368825466522e-05, "loss": 0.4995, "step": 37 }, { "epoch": 0.06239737274220033, "grad_norm": 0.42418837547302246, "learning_rate": 9.818880351262349e-05, "loss": 0.4734, "step": 38 }, { "epoch": 0.06403940886699508, "grad_norm": 0.45498019456863403, "learning_rate": 9.813391877058178e-05, "loss": 0.4974, "step": 39 }, { "epoch": 0.06568144499178982, "grad_norm": 0.45234233140945435, "learning_rate": 9.807903402854007e-05, "loss": 0.5151, "step": 40 }, { "epoch": 0.06732348111658457, "grad_norm": 0.44345131516456604, "learning_rate": 9.802414928649836e-05, "loss": 0.5377, "step": 41 }, { "epoch": 0.06896551724137931, "grad_norm": 0.4864523112773895, "learning_rate": 9.796926454445664e-05, "loss": 0.5425, "step": 42 }, { "epoch": 0.07060755336617405, "grad_norm": 0.444519579410553, "learning_rate": 9.791437980241494e-05, "loss": 0.5229, "step": 43 }, { "epoch": 0.0722495894909688, "grad_norm": 0.4146328866481781, "learning_rate": 9.785949506037322e-05, "loss": 0.5054, "step": 44 }, { "epoch": 0.07389162561576355, "grad_norm": 0.43424683809280396, "learning_rate": 9.780461031833152e-05, "loss": 0.5206, "step": 45 }, { "epoch": 0.0755336617405583, "grad_norm": 0.4203151762485504, "learning_rate": 9.77497255762898e-05, "loss": 0.5027, "step": 46 }, { "epoch": 0.07717569786535304, "grad_norm": 0.4115687608718872, "learning_rate": 9.769484083424808e-05, "loss": 0.4806, "step": 47 }, { "epoch": 0.07881773399014778, "grad_norm": 0.42968273162841797, "learning_rate": 9.763995609220637e-05, "loss": 0.5159, "step": 48 }, { "epoch": 0.08045977011494253, "grad_norm": 0.4272300601005554, "learning_rate": 9.758507135016466e-05, "loss": 0.494, "step": 49 }, { "epoch": 0.08210180623973727, "grad_norm": 0.41626477241516113, "learning_rate": 9.753018660812294e-05, "loss": 0.4808, "step": 50 }, { "epoch": 0.08374384236453201, "grad_norm": 0.41353294253349304, "learning_rate": 9.747530186608124e-05, "loss": 0.4812, "step": 51 }, { "epoch": 0.08538587848932677, "grad_norm": 0.4321066737174988, "learning_rate": 9.742041712403952e-05, "loss": 0.541, "step": 52 }, { "epoch": 0.08702791461412152, "grad_norm": 0.4143648147583008, "learning_rate": 9.736553238199781e-05, "loss": 0.5039, "step": 53 }, { "epoch": 0.08866995073891626, "grad_norm": 0.43289196491241455, "learning_rate": 9.73106476399561e-05, "loss": 0.4891, "step": 54 }, { "epoch": 0.090311986863711, "grad_norm": 0.4179899990558624, "learning_rate": 9.725576289791439e-05, "loss": 0.4691, "step": 55 }, { "epoch": 0.09195402298850575, "grad_norm": 0.4259417951107025, "learning_rate": 9.720087815587268e-05, "loss": 0.4724, "step": 56 }, { "epoch": 0.09359605911330049, "grad_norm": 0.4181675612926483, "learning_rate": 9.714599341383097e-05, "loss": 0.5095, "step": 57 }, { "epoch": 0.09523809523809523, "grad_norm": 0.4020802080631256, "learning_rate": 9.709110867178924e-05, "loss": 0.4525, "step": 58 }, { "epoch": 0.09688013136288999, "grad_norm": 0.4126514792442322, "learning_rate": 9.703622392974754e-05, "loss": 0.5026, "step": 59 }, { "epoch": 0.09852216748768473, "grad_norm": 0.404905766248703, "learning_rate": 9.698133918770582e-05, "loss": 0.4866, "step": 60 }, { "epoch": 0.10016420361247948, "grad_norm": 0.4093894958496094, "learning_rate": 9.692645444566411e-05, "loss": 0.4953, "step": 61 }, { "epoch": 0.10180623973727422, "grad_norm": 0.4369639754295349, "learning_rate": 9.68715697036224e-05, "loss": 0.5074, "step": 62 }, { "epoch": 0.10344827586206896, "grad_norm": 0.4104146361351013, "learning_rate": 9.681668496158069e-05, "loss": 0.4935, "step": 63 }, { "epoch": 0.10509031198686371, "grad_norm": 0.4376799762248993, "learning_rate": 9.676180021953898e-05, "loss": 0.4867, "step": 64 }, { "epoch": 0.10673234811165845, "grad_norm": 0.42365819215774536, "learning_rate": 9.670691547749727e-05, "loss": 0.4542, "step": 65 }, { "epoch": 0.10837438423645321, "grad_norm": 0.4276992082595825, "learning_rate": 9.665203073545554e-05, "loss": 0.4907, "step": 66 }, { "epoch": 0.11001642036124795, "grad_norm": 0.40482285618782043, "learning_rate": 9.659714599341383e-05, "loss": 0.4828, "step": 67 }, { "epoch": 0.1116584564860427, "grad_norm": 0.4181552827358246, "learning_rate": 9.654226125137212e-05, "loss": 0.4774, "step": 68 }, { "epoch": 0.11330049261083744, "grad_norm": 0.40938231348991394, "learning_rate": 9.648737650933041e-05, "loss": 0.4846, "step": 69 }, { "epoch": 0.11494252873563218, "grad_norm": 0.4163057506084442, "learning_rate": 9.64324917672887e-05, "loss": 0.4702, "step": 70 }, { "epoch": 0.11658456486042693, "grad_norm": 0.4713898003101349, "learning_rate": 9.637760702524699e-05, "loss": 0.5263, "step": 71 }, { "epoch": 0.11822660098522167, "grad_norm": 0.43967533111572266, "learning_rate": 9.632272228320527e-05, "loss": 0.502, "step": 72 }, { "epoch": 0.11986863711001643, "grad_norm": 0.407479852437973, "learning_rate": 9.626783754116356e-05, "loss": 0.4609, "step": 73 }, { "epoch": 0.12151067323481117, "grad_norm": 0.43391069769859314, "learning_rate": 9.621295279912185e-05, "loss": 0.4642, "step": 74 }, { "epoch": 0.12315270935960591, "grad_norm": 0.5849558711051941, "learning_rate": 9.615806805708014e-05, "loss": 0.4901, "step": 75 }, { "epoch": 0.12479474548440066, "grad_norm": 0.4107006788253784, "learning_rate": 9.610318331503843e-05, "loss": 0.4964, "step": 76 }, { "epoch": 0.12643678160919541, "grad_norm": 0.46569788455963135, "learning_rate": 9.60482985729967e-05, "loss": 0.5034, "step": 77 }, { "epoch": 0.12807881773399016, "grad_norm": 0.41953814029693604, "learning_rate": 9.5993413830955e-05, "loss": 0.4662, "step": 78 }, { "epoch": 0.1297208538587849, "grad_norm": 0.4492577314376831, "learning_rate": 9.593852908891328e-05, "loss": 0.5575, "step": 79 }, { "epoch": 0.13136288998357964, "grad_norm": 0.43190428614616394, "learning_rate": 9.588364434687157e-05, "loss": 0.4832, "step": 80 }, { "epoch": 0.1330049261083744, "grad_norm": 0.4246384799480438, "learning_rate": 9.582875960482986e-05, "loss": 0.4552, "step": 81 }, { "epoch": 0.13464696223316913, "grad_norm": 0.4160482585430145, "learning_rate": 9.577387486278815e-05, "loss": 0.4594, "step": 82 }, { "epoch": 0.13628899835796388, "grad_norm": 0.3999533951282501, "learning_rate": 9.571899012074643e-05, "loss": 0.5232, "step": 83 }, { "epoch": 0.13793103448275862, "grad_norm": 0.4481320083141327, "learning_rate": 9.566410537870473e-05, "loss": 0.4913, "step": 84 }, { "epoch": 0.13957307060755336, "grad_norm": 0.410489559173584, "learning_rate": 9.5609220636663e-05, "loss": 0.4498, "step": 85 }, { "epoch": 0.1412151067323481, "grad_norm": 0.43711376190185547, "learning_rate": 9.555433589462131e-05, "loss": 0.4746, "step": 86 }, { "epoch": 0.14285714285714285, "grad_norm": 0.38364744186401367, "learning_rate": 9.549945115257958e-05, "loss": 0.4585, "step": 87 }, { "epoch": 0.1444991789819376, "grad_norm": 0.41426995396614075, "learning_rate": 9.544456641053787e-05, "loss": 0.4674, "step": 88 }, { "epoch": 0.14614121510673234, "grad_norm": 0.44232216477394104, "learning_rate": 9.538968166849616e-05, "loss": 0.5327, "step": 89 }, { "epoch": 0.1477832512315271, "grad_norm": 0.3992812931537628, "learning_rate": 9.533479692645445e-05, "loss": 0.4596, "step": 90 }, { "epoch": 0.14942528735632185, "grad_norm": 0.38763749599456787, "learning_rate": 9.527991218441273e-05, "loss": 0.4892, "step": 91 }, { "epoch": 0.1510673234811166, "grad_norm": 0.4203758239746094, "learning_rate": 9.522502744237103e-05, "loss": 0.4432, "step": 92 }, { "epoch": 0.15270935960591134, "grad_norm": 0.4376131594181061, "learning_rate": 9.517014270032931e-05, "loss": 0.5005, "step": 93 }, { "epoch": 0.15435139573070608, "grad_norm": 0.43990641832351685, "learning_rate": 9.51152579582876e-05, "loss": 0.4804, "step": 94 }, { "epoch": 0.15599343185550082, "grad_norm": 0.42096298933029175, "learning_rate": 9.506037321624589e-05, "loss": 0.4732, "step": 95 }, { "epoch": 0.15763546798029557, "grad_norm": 0.4243437945842743, "learning_rate": 9.500548847420418e-05, "loss": 0.4617, "step": 96 }, { "epoch": 0.1592775041050903, "grad_norm": 0.41374969482421875, "learning_rate": 9.495060373216247e-05, "loss": 0.501, "step": 97 }, { "epoch": 0.16091954022988506, "grad_norm": 0.4183409810066223, "learning_rate": 9.489571899012076e-05, "loss": 0.473, "step": 98 }, { "epoch": 0.1625615763546798, "grad_norm": 0.42857658863067627, "learning_rate": 9.484083424807903e-05, "loss": 0.45, "step": 99 }, { "epoch": 0.16420361247947454, "grad_norm": 0.3999362885951996, "learning_rate": 9.478594950603733e-05, "loss": 0.4536, "step": 100 }, { "epoch": 0.16584564860426929, "grad_norm": 0.3916584849357605, "learning_rate": 9.473106476399561e-05, "loss": 0.4917, "step": 101 }, { "epoch": 0.16748768472906403, "grad_norm": 0.4203507900238037, "learning_rate": 9.46761800219539e-05, "loss": 0.4775, "step": 102 }, { "epoch": 0.16912972085385877, "grad_norm": 0.40429848432540894, "learning_rate": 9.462129527991219e-05, "loss": 0.4634, "step": 103 }, { "epoch": 0.17077175697865354, "grad_norm": 0.41266727447509766, "learning_rate": 9.456641053787048e-05, "loss": 0.4222, "step": 104 }, { "epoch": 0.1724137931034483, "grad_norm": 0.41441938281059265, "learning_rate": 9.451152579582875e-05, "loss": 0.4798, "step": 105 }, { "epoch": 0.17405582922824303, "grad_norm": 0.38529106974601746, "learning_rate": 9.445664105378706e-05, "loss": 0.4639, "step": 106 }, { "epoch": 0.17569786535303777, "grad_norm": 0.4886303246021271, "learning_rate": 9.440175631174533e-05, "loss": 0.4757, "step": 107 }, { "epoch": 0.17733990147783252, "grad_norm": 0.43273624777793884, "learning_rate": 9.434687156970364e-05, "loss": 0.4808, "step": 108 }, { "epoch": 0.17898193760262726, "grad_norm": 0.4380013346672058, "learning_rate": 9.429198682766191e-05, "loss": 0.4548, "step": 109 }, { "epoch": 0.180623973727422, "grad_norm": 0.3988710641860962, "learning_rate": 9.42371020856202e-05, "loss": 0.4494, "step": 110 }, { "epoch": 0.18226600985221675, "grad_norm": 0.48673516511917114, "learning_rate": 9.418221734357849e-05, "loss": 0.5064, "step": 111 }, { "epoch": 0.1839080459770115, "grad_norm": 0.41171500086784363, "learning_rate": 9.412733260153678e-05, "loss": 0.4827, "step": 112 }, { "epoch": 0.18555008210180624, "grad_norm": 0.4917714595794678, "learning_rate": 9.407244785949506e-05, "loss": 0.4662, "step": 113 }, { "epoch": 0.18719211822660098, "grad_norm": 0.39501285552978516, "learning_rate": 9.401756311745336e-05, "loss": 0.4639, "step": 114 }, { "epoch": 0.18883415435139572, "grad_norm": 0.4013563394546509, "learning_rate": 9.396267837541164e-05, "loss": 0.4492, "step": 115 }, { "epoch": 0.19047619047619047, "grad_norm": 0.3916052579879761, "learning_rate": 9.390779363336993e-05, "loss": 0.4707, "step": 116 }, { "epoch": 0.1921182266009852, "grad_norm": 0.4220471680164337, "learning_rate": 9.385290889132822e-05, "loss": 0.4996, "step": 117 }, { "epoch": 0.19376026272577998, "grad_norm": 0.4175564646720886, "learning_rate": 9.37980241492865e-05, "loss": 0.4495, "step": 118 }, { "epoch": 0.19540229885057472, "grad_norm": 0.41558921337127686, "learning_rate": 9.37431394072448e-05, "loss": 0.4679, "step": 119 }, { "epoch": 0.19704433497536947, "grad_norm": 0.4115501940250397, "learning_rate": 9.368825466520308e-05, "loss": 0.4942, "step": 120 }, { "epoch": 0.1986863711001642, "grad_norm": 0.39015233516693115, "learning_rate": 9.363336992316136e-05, "loss": 0.4783, "step": 121 }, { "epoch": 0.20032840722495895, "grad_norm": 0.4632764756679535, "learning_rate": 9.357848518111966e-05, "loss": 0.4724, "step": 122 }, { "epoch": 0.2019704433497537, "grad_norm": 0.5011590123176575, "learning_rate": 9.352360043907794e-05, "loss": 0.4747, "step": 123 }, { "epoch": 0.20361247947454844, "grad_norm": 0.40431123971939087, "learning_rate": 9.346871569703623e-05, "loss": 0.463, "step": 124 }, { "epoch": 0.20525451559934318, "grad_norm": 0.4476254880428314, "learning_rate": 9.341383095499452e-05, "loss": 0.467, "step": 125 }, { "epoch": 0.20689655172413793, "grad_norm": 0.4251940846443176, "learning_rate": 9.335894621295281e-05, "loss": 0.4718, "step": 126 }, { "epoch": 0.20853858784893267, "grad_norm": 0.41991207003593445, "learning_rate": 9.330406147091108e-05, "loss": 0.4531, "step": 127 }, { "epoch": 0.21018062397372742, "grad_norm": 0.4370240271091461, "learning_rate": 9.324917672886939e-05, "loss": 0.5004, "step": 128 }, { "epoch": 0.21182266009852216, "grad_norm": 0.40302619338035583, "learning_rate": 9.319429198682766e-05, "loss": 0.4788, "step": 129 }, { "epoch": 0.2134646962233169, "grad_norm": 0.4022216200828552, "learning_rate": 9.313940724478595e-05, "loss": 0.4672, "step": 130 }, { "epoch": 0.21510673234811165, "grad_norm": 0.3977872431278229, "learning_rate": 9.308452250274424e-05, "loss": 0.4799, "step": 131 }, { "epoch": 0.21674876847290642, "grad_norm": 0.4125378727912903, "learning_rate": 9.302963776070253e-05, "loss": 0.4307, "step": 132 }, { "epoch": 0.21839080459770116, "grad_norm": 0.4000415802001953, "learning_rate": 9.297475301866082e-05, "loss": 0.4561, "step": 133 }, { "epoch": 0.2200328407224959, "grad_norm": 0.38186874985694885, "learning_rate": 9.291986827661911e-05, "loss": 0.5015, "step": 134 }, { "epoch": 0.22167487684729065, "grad_norm": 0.3982597589492798, "learning_rate": 9.286498353457739e-05, "loss": 0.4264, "step": 135 }, { "epoch": 0.2233169129720854, "grad_norm": 0.3948535621166229, "learning_rate": 9.281009879253568e-05, "loss": 0.4917, "step": 136 }, { "epoch": 0.22495894909688013, "grad_norm": 0.3881182074546814, "learning_rate": 9.275521405049397e-05, "loss": 0.4588, "step": 137 }, { "epoch": 0.22660098522167488, "grad_norm": 0.4619481563568115, "learning_rate": 9.270032930845226e-05, "loss": 0.46, "step": 138 }, { "epoch": 0.22824302134646962, "grad_norm": 0.4036756455898285, "learning_rate": 9.264544456641054e-05, "loss": 0.4822, "step": 139 }, { "epoch": 0.22988505747126436, "grad_norm": 0.37599611282348633, "learning_rate": 9.259055982436882e-05, "loss": 0.4456, "step": 140 }, { "epoch": 0.2315270935960591, "grad_norm": 0.4055330157279968, "learning_rate": 9.253567508232712e-05, "loss": 0.4307, "step": 141 }, { "epoch": 0.23316912972085385, "grad_norm": 0.38393133878707886, "learning_rate": 9.24807903402854e-05, "loss": 0.4573, "step": 142 }, { "epoch": 0.2348111658456486, "grad_norm": 0.39019426703453064, "learning_rate": 9.242590559824369e-05, "loss": 0.4756, "step": 143 }, { "epoch": 0.23645320197044334, "grad_norm": 0.4846177101135254, "learning_rate": 9.237102085620198e-05, "loss": 0.4894, "step": 144 }, { "epoch": 0.23809523809523808, "grad_norm": 0.42488348484039307, "learning_rate": 9.231613611416027e-05, "loss": 0.4789, "step": 145 }, { "epoch": 0.23973727422003285, "grad_norm": 0.4005594551563263, "learning_rate": 9.226125137211854e-05, "loss": 0.4353, "step": 146 }, { "epoch": 0.2413793103448276, "grad_norm": 0.3968258202075958, "learning_rate": 9.220636663007685e-05, "loss": 0.4591, "step": 147 }, { "epoch": 0.24302134646962234, "grad_norm": 0.4315483868122101, "learning_rate": 9.215148188803512e-05, "loss": 0.4529, "step": 148 }, { "epoch": 0.24466338259441708, "grad_norm": 0.3846581280231476, "learning_rate": 9.209659714599341e-05, "loss": 0.4858, "step": 149 }, { "epoch": 0.24630541871921183, "grad_norm": 0.3982647955417633, "learning_rate": 9.20417124039517e-05, "loss": 0.423, "step": 150 }, { "epoch": 0.24794745484400657, "grad_norm": 0.41849401593208313, "learning_rate": 9.198682766190999e-05, "loss": 0.508, "step": 151 }, { "epoch": 0.24958949096880131, "grad_norm": 0.41111549735069275, "learning_rate": 9.193194291986828e-05, "loss": 0.4778, "step": 152 }, { "epoch": 0.2512315270935961, "grad_norm": 0.43668660521507263, "learning_rate": 9.187705817782657e-05, "loss": 0.4559, "step": 153 }, { "epoch": 0.25287356321839083, "grad_norm": 0.42493247985839844, "learning_rate": 9.182217343578485e-05, "loss": 0.4415, "step": 154 }, { "epoch": 0.2545155993431856, "grad_norm": 0.3875834047794342, "learning_rate": 9.176728869374315e-05, "loss": 0.4582, "step": 155 }, { "epoch": 0.2561576354679803, "grad_norm": 0.4018273651599884, "learning_rate": 9.171240395170143e-05, "loss": 0.5041, "step": 156 }, { "epoch": 0.25779967159277506, "grad_norm": 0.41520723700523376, "learning_rate": 9.165751920965972e-05, "loss": 0.4983, "step": 157 }, { "epoch": 0.2594417077175698, "grad_norm": 0.40852561593055725, "learning_rate": 9.1602634467618e-05, "loss": 0.4393, "step": 158 }, { "epoch": 0.26108374384236455, "grad_norm": 0.39984506368637085, "learning_rate": 9.15477497255763e-05, "loss": 0.4524, "step": 159 }, { "epoch": 0.2627257799671593, "grad_norm": 0.5251211524009705, "learning_rate": 9.149286498353457e-05, "loss": 0.4212, "step": 160 }, { "epoch": 0.26436781609195403, "grad_norm": 0.4093887507915497, "learning_rate": 9.143798024149287e-05, "loss": 0.4602, "step": 161 }, { "epoch": 0.2660098522167488, "grad_norm": 36.77890396118164, "learning_rate": 9.138309549945115e-05, "loss": 0.7834, "step": 162 }, { "epoch": 0.2676518883415435, "grad_norm": 0.3924732506275177, "learning_rate": 9.132821075740945e-05, "loss": 0.4562, "step": 163 }, { "epoch": 0.26929392446633826, "grad_norm": 0.40318822860717773, "learning_rate": 9.127332601536773e-05, "loss": 0.4834, "step": 164 }, { "epoch": 0.270935960591133, "grad_norm": 0.42310184240341187, "learning_rate": 9.121844127332602e-05, "loss": 0.4937, "step": 165 }, { "epoch": 0.27257799671592775, "grad_norm": 0.4420574903488159, "learning_rate": 9.116355653128431e-05, "loss": 0.4797, "step": 166 }, { "epoch": 0.2742200328407225, "grad_norm": 0.40891292691230774, "learning_rate": 9.11086717892426e-05, "loss": 0.497, "step": 167 }, { "epoch": 0.27586206896551724, "grad_norm": 0.4228072762489319, "learning_rate": 9.105378704720087e-05, "loss": 0.4153, "step": 168 }, { "epoch": 0.277504105090312, "grad_norm": 0.4439708888530731, "learning_rate": 9.099890230515918e-05, "loss": 0.4703, "step": 169 }, { "epoch": 0.2791461412151067, "grad_norm": 0.4555833041667938, "learning_rate": 9.094401756311745e-05, "loss": 0.4504, "step": 170 }, { "epoch": 0.28078817733990147, "grad_norm": 0.407587468624115, "learning_rate": 9.088913282107574e-05, "loss": 0.4587, "step": 171 }, { "epoch": 0.2824302134646962, "grad_norm": 0.41558319330215454, "learning_rate": 9.083424807903403e-05, "loss": 0.4754, "step": 172 }, { "epoch": 0.28407224958949095, "grad_norm": 0.4676796793937683, "learning_rate": 9.077936333699232e-05, "loss": 0.4598, "step": 173 }, { "epoch": 0.2857142857142857, "grad_norm": 0.4471384584903717, "learning_rate": 9.072447859495061e-05, "loss": 0.4939, "step": 174 }, { "epoch": 0.28735632183908044, "grad_norm": 0.4201520085334778, "learning_rate": 9.06695938529089e-05, "loss": 0.4865, "step": 175 }, { "epoch": 0.2889983579638752, "grad_norm": 0.40211009979248047, "learning_rate": 9.061470911086718e-05, "loss": 0.4581, "step": 176 }, { "epoch": 0.29064039408866993, "grad_norm": 0.4203701317310333, "learning_rate": 9.055982436882548e-05, "loss": 0.4322, "step": 177 }, { "epoch": 0.2922824302134647, "grad_norm": 0.4127051830291748, "learning_rate": 9.050493962678376e-05, "loss": 0.4475, "step": 178 }, { "epoch": 0.2939244663382594, "grad_norm": 0.42018842697143555, "learning_rate": 9.045005488474205e-05, "loss": 0.4614, "step": 179 }, { "epoch": 0.2955665024630542, "grad_norm": 0.41988110542297363, "learning_rate": 9.039517014270033e-05, "loss": 0.4543, "step": 180 }, { "epoch": 0.29720853858784896, "grad_norm": 16.688312530517578, "learning_rate": 9.034028540065862e-05, "loss": 0.6024, "step": 181 }, { "epoch": 0.2988505747126437, "grad_norm": 0.42974498867988586, "learning_rate": 9.028540065861691e-05, "loss": 0.4025, "step": 182 }, { "epoch": 0.30049261083743845, "grad_norm": 5.872837543487549, "learning_rate": 9.02305159165752e-05, "loss": 1.0616, "step": 183 }, { "epoch": 0.3021346469622332, "grad_norm": 0.4068329632282257, "learning_rate": 9.017563117453348e-05, "loss": 0.44, "step": 184 }, { "epoch": 0.30377668308702793, "grad_norm": 0.38907569646835327, "learning_rate": 9.012074643249178e-05, "loss": 0.4301, "step": 185 }, { "epoch": 0.3054187192118227, "grad_norm": 0.43018120527267456, "learning_rate": 9.006586169045006e-05, "loss": 0.4581, "step": 186 }, { "epoch": 0.3070607553366174, "grad_norm": 0.3698095977306366, "learning_rate": 9.001097694840835e-05, "loss": 0.439, "step": 187 }, { "epoch": 0.30870279146141216, "grad_norm": 0.39086106419563293, "learning_rate": 8.995609220636664e-05, "loss": 0.453, "step": 188 }, { "epoch": 0.3103448275862069, "grad_norm": 0.3831292390823364, "learning_rate": 8.990120746432493e-05, "loss": 0.43, "step": 189 }, { "epoch": 0.31198686371100165, "grad_norm": 1.045174479484558, "learning_rate": 8.98463227222832e-05, "loss": 0.4292, "step": 190 }, { "epoch": 0.3136288998357964, "grad_norm": 193.7196502685547, "learning_rate": 8.97914379802415e-05, "loss": 0.5235, "step": 191 }, { "epoch": 0.31527093596059114, "grad_norm": 189.31814575195312, "learning_rate": 8.973655323819978e-05, "loss": 0.5189, "step": 192 }, { "epoch": 0.3169129720853859, "grad_norm": 0.9655250310897827, "learning_rate": 8.968166849615807e-05, "loss": 0.4639, "step": 193 }, { "epoch": 0.3185550082101806, "grad_norm": 115.3967514038086, "learning_rate": 8.962678375411636e-05, "loss": 1.1458, "step": 194 }, { "epoch": 0.32019704433497537, "grad_norm": 0.4868878722190857, "learning_rate": 8.957189901207465e-05, "loss": 0.5244, "step": 195 }, { "epoch": 0.3218390804597701, "grad_norm": 2.3596913814544678, "learning_rate": 8.951701427003294e-05, "loss": 0.4487, "step": 196 }, { "epoch": 0.32348111658456485, "grad_norm": 0.41834428906440735, "learning_rate": 8.946212952799123e-05, "loss": 0.4357, "step": 197 }, { "epoch": 0.3251231527093596, "grad_norm": 0.4477536380290985, "learning_rate": 8.94072447859495e-05, "loss": 0.4833, "step": 198 }, { "epoch": 0.32676518883415434, "grad_norm": 1.5718289613723755, "learning_rate": 8.93523600439078e-05, "loss": 0.4228, "step": 199 }, { "epoch": 0.3284072249589491, "grad_norm": 0.41846340894699097, "learning_rate": 8.929747530186608e-05, "loss": 0.4191, "step": 200 }, { "epoch": 0.33004926108374383, "grad_norm": 1.9398820400238037, "learning_rate": 8.924259055982437e-05, "loss": 0.708, "step": 201 }, { "epoch": 0.33169129720853857, "grad_norm": 0.5183188915252686, "learning_rate": 8.918770581778266e-05, "loss": 0.438, "step": 202 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4838462471961975, "learning_rate": 8.913282107574095e-05, "loss": 0.4277, "step": 203 }, { "epoch": 0.33497536945812806, "grad_norm": 0.3938547372817993, "learning_rate": 8.907793633369924e-05, "loss": 0.5109, "step": 204 }, { "epoch": 0.3366174055829228, "grad_norm": 0.3979876637458801, "learning_rate": 8.902305159165752e-05, "loss": 0.4594, "step": 205 }, { "epoch": 0.33825944170771755, "grad_norm": 0.44166919589042664, "learning_rate": 8.896816684961581e-05, "loss": 0.4499, "step": 206 }, { "epoch": 0.3399014778325123, "grad_norm": 0.3803441822528839, "learning_rate": 8.89132821075741e-05, "loss": 0.3871, "step": 207 }, { "epoch": 0.3415435139573071, "grad_norm": 0.37302908301353455, "learning_rate": 8.885839736553239e-05, "loss": 0.4785, "step": 208 }, { "epoch": 0.34318555008210183, "grad_norm": 0.3824538588523865, "learning_rate": 8.880351262349066e-05, "loss": 0.4371, "step": 209 }, { "epoch": 0.3448275862068966, "grad_norm": 0.37757256627082825, "learning_rate": 8.874862788144897e-05, "loss": 0.4915, "step": 210 }, { "epoch": 0.3464696223316913, "grad_norm": 0.4052748680114746, "learning_rate": 8.869374313940724e-05, "loss": 0.5331, "step": 211 }, { "epoch": 0.34811165845648606, "grad_norm": 1.4844826459884644, "learning_rate": 8.863885839736553e-05, "loss": 0.7378, "step": 212 }, { "epoch": 0.3497536945812808, "grad_norm": 0.39169180393218994, "learning_rate": 8.858397365532382e-05, "loss": 0.4253, "step": 213 }, { "epoch": 0.35139573070607555, "grad_norm": 1.0175449848175049, "learning_rate": 8.852908891328211e-05, "loss": 0.4489, "step": 214 }, { "epoch": 0.3530377668308703, "grad_norm": 0.3983254134654999, "learning_rate": 8.84742041712404e-05, "loss": 0.4552, "step": 215 }, { "epoch": 0.35467980295566504, "grad_norm": 0.38185176253318787, "learning_rate": 8.841931942919869e-05, "loss": 0.4799, "step": 216 }, { "epoch": 0.3563218390804598, "grad_norm": 0.37171781063079834, "learning_rate": 8.836443468715697e-05, "loss": 0.4402, "step": 217 }, { "epoch": 0.3579638752052545, "grad_norm": 0.40189164876937866, "learning_rate": 8.830954994511527e-05, "loss": 0.4485, "step": 218 }, { "epoch": 0.35960591133004927, "grad_norm": 0.382373571395874, "learning_rate": 8.825466520307355e-05, "loss": 0.4477, "step": 219 }, { "epoch": 0.361247947454844, "grad_norm": 0.3773851692676544, "learning_rate": 8.819978046103183e-05, "loss": 0.4779, "step": 220 }, { "epoch": 0.36288998357963875, "grad_norm": 2.171015501022339, "learning_rate": 8.814489571899012e-05, "loss": 0.7333, "step": 221 }, { "epoch": 0.3645320197044335, "grad_norm": 0.40864044427871704, "learning_rate": 8.809001097694841e-05, "loss": 0.4677, "step": 222 }, { "epoch": 0.36617405582922824, "grad_norm": 0.3959502577781677, "learning_rate": 8.803512623490669e-05, "loss": 0.4574, "step": 223 }, { "epoch": 0.367816091954023, "grad_norm": 0.3971575200557709, "learning_rate": 8.798024149286499e-05, "loss": 0.4242, "step": 224 }, { "epoch": 0.367816091954023, "eval_runtime": 72.5054, "eval_samples_per_second": 2.717, "eval_steps_per_second": 1.365, "step": 224 }, { "epoch": 0.3694581280788177, "grad_norm": 0.35307571291923523, "learning_rate": 8.792535675082327e-05, "loss": 0.4178, "step": 225 }, { "epoch": 0.37110016420361247, "grad_norm": 0.397203266620636, "learning_rate": 8.787047200878157e-05, "loss": 0.4636, "step": 226 }, { "epoch": 0.3727422003284072, "grad_norm": 0.38514411449432373, "learning_rate": 8.781558726673985e-05, "loss": 0.4654, "step": 227 }, { "epoch": 0.37438423645320196, "grad_norm": 0.38593873381614685, "learning_rate": 8.776070252469814e-05, "loss": 0.4499, "step": 228 }, { "epoch": 0.3760262725779967, "grad_norm": 0.3524252474308014, "learning_rate": 8.770581778265643e-05, "loss": 0.4267, "step": 229 }, { "epoch": 0.37766830870279144, "grad_norm": 0.39490625262260437, "learning_rate": 8.765093304061472e-05, "loss": 0.4721, "step": 230 }, { "epoch": 0.3793103448275862, "grad_norm": 0.4169183671474457, "learning_rate": 8.759604829857299e-05, "loss": 0.4653, "step": 231 }, { "epoch": 0.38095238095238093, "grad_norm": 0.38085851073265076, "learning_rate": 8.75411635565313e-05, "loss": 0.4353, "step": 232 }, { "epoch": 0.3825944170771757, "grad_norm": 0.38983282446861267, "learning_rate": 8.748627881448957e-05, "loss": 0.4553, "step": 233 }, { "epoch": 0.3842364532019704, "grad_norm": 0.38444122672080994, "learning_rate": 8.743139407244786e-05, "loss": 0.4628, "step": 234 }, { "epoch": 0.38587848932676516, "grad_norm": 0.41438883543014526, "learning_rate": 8.737650933040615e-05, "loss": 0.4368, "step": 235 }, { "epoch": 0.38752052545155996, "grad_norm": 4.214874744415283, "learning_rate": 8.732162458836444e-05, "loss": 0.5662, "step": 236 }, { "epoch": 0.3891625615763547, "grad_norm": 0.6510725617408752, "learning_rate": 8.726673984632273e-05, "loss": 0.6532, "step": 237 }, { "epoch": 0.39080459770114945, "grad_norm": 0.42591527104377747, "learning_rate": 8.721185510428102e-05, "loss": 0.4573, "step": 238 }, { "epoch": 0.3924466338259442, "grad_norm": 0.40394848585128784, "learning_rate": 8.71569703622393e-05, "loss": 0.443, "step": 239 }, { "epoch": 0.39408866995073893, "grad_norm": 0.41808223724365234, "learning_rate": 8.71020856201976e-05, "loss": 0.4429, "step": 240 }, { "epoch": 0.3957307060755337, "grad_norm": 0.41101914644241333, "learning_rate": 8.704720087815587e-05, "loss": 0.4711, "step": 241 }, { "epoch": 0.3973727422003284, "grad_norm": 0.4102398753166199, "learning_rate": 8.699231613611416e-05, "loss": 0.4441, "step": 242 }, { "epoch": 0.39901477832512317, "grad_norm": 0.4156806468963623, "learning_rate": 8.693743139407245e-05, "loss": 0.4523, "step": 243 }, { "epoch": 0.4006568144499179, "grad_norm": 0.39194008708000183, "learning_rate": 8.688254665203074e-05, "loss": 0.4418, "step": 244 }, { "epoch": 0.40229885057471265, "grad_norm": 0.40409398078918457, "learning_rate": 8.682766190998902e-05, "loss": 0.4425, "step": 245 }, { "epoch": 0.4039408866995074, "grad_norm": 0.40178677439689636, "learning_rate": 8.677277716794732e-05, "loss": 0.4747, "step": 246 }, { "epoch": 0.40558292282430214, "grad_norm": 0.3816560208797455, "learning_rate": 8.67178924259056e-05, "loss": 0.4479, "step": 247 }, { "epoch": 0.4072249589490969, "grad_norm": 0.3683323860168457, "learning_rate": 8.66630076838639e-05, "loss": 0.4228, "step": 248 }, { "epoch": 0.4088669950738916, "grad_norm": 0.3741907477378845, "learning_rate": 8.660812294182218e-05, "loss": 0.4427, "step": 249 }, { "epoch": 0.41050903119868637, "grad_norm": 0.37625035643577576, "learning_rate": 8.655323819978047e-05, "loss": 0.4459, "step": 250 }, { "epoch": 0.4121510673234811, "grad_norm": 0.39176008105278015, "learning_rate": 8.649835345773876e-05, "loss": 0.4963, "step": 251 }, { "epoch": 0.41379310344827586, "grad_norm": 0.3737246096134186, "learning_rate": 8.644346871569705e-05, "loss": 0.4726, "step": 252 }, { "epoch": 0.4154351395730706, "grad_norm": 0.39536264538764954, "learning_rate": 8.638858397365532e-05, "loss": 0.4546, "step": 253 }, { "epoch": 0.41707717569786534, "grad_norm": 0.38121530413627625, "learning_rate": 8.633369923161362e-05, "loss": 0.4438, "step": 254 }, { "epoch": 0.4187192118226601, "grad_norm": 0.3718128204345703, "learning_rate": 8.62788144895719e-05, "loss": 0.4508, "step": 255 }, { "epoch": 0.42036124794745483, "grad_norm": 0.37829530239105225, "learning_rate": 8.622392974753019e-05, "loss": 0.4563, "step": 256 }, { "epoch": 0.4220032840722496, "grad_norm": 0.36806872487068176, "learning_rate": 8.616904500548848e-05, "loss": 0.4906, "step": 257 }, { "epoch": 0.4236453201970443, "grad_norm": 0.37612655758857727, "learning_rate": 8.611416026344677e-05, "loss": 0.4701, "step": 258 }, { "epoch": 0.42528735632183906, "grad_norm": 0.3865252435207367, "learning_rate": 8.605927552140506e-05, "loss": 0.4391, "step": 259 }, { "epoch": 0.4269293924466338, "grad_norm": 0.7533395290374756, "learning_rate": 8.600439077936335e-05, "loss": 0.6815, "step": 260 }, { "epoch": 0.42857142857142855, "grad_norm": 0.38196465373039246, "learning_rate": 8.594950603732162e-05, "loss": 0.4441, "step": 261 }, { "epoch": 0.4302134646962233, "grad_norm": 0.40374839305877686, "learning_rate": 8.589462129527991e-05, "loss": 0.4542, "step": 262 }, { "epoch": 0.4318555008210181, "grad_norm": 0.3830620348453522, "learning_rate": 8.58397365532382e-05, "loss": 0.4378, "step": 263 }, { "epoch": 0.43349753694581283, "grad_norm": 0.3939151167869568, "learning_rate": 8.578485181119649e-05, "loss": 0.4478, "step": 264 }, { "epoch": 0.4351395730706076, "grad_norm": 0.3825796842575073, "learning_rate": 8.572996706915478e-05, "loss": 0.438, "step": 265 }, { "epoch": 0.4367816091954023, "grad_norm": 0.4085426330566406, "learning_rate": 8.567508232711307e-05, "loss": 0.4565, "step": 266 }, { "epoch": 0.43842364532019706, "grad_norm": 0.36484354734420776, "learning_rate": 8.562019758507135e-05, "loss": 0.4228, "step": 267 }, { "epoch": 0.4400656814449918, "grad_norm": 0.39223718643188477, "learning_rate": 8.556531284302964e-05, "loss": 0.5033, "step": 268 }, { "epoch": 0.44170771756978655, "grad_norm": 0.3737963140010834, "learning_rate": 8.551042810098793e-05, "loss": 0.4236, "step": 269 }, { "epoch": 0.4433497536945813, "grad_norm": 0.38886019587516785, "learning_rate": 8.545554335894622e-05, "loss": 0.4614, "step": 270 }, { "epoch": 0.44499178981937604, "grad_norm": 0.36267775297164917, "learning_rate": 8.54006586169045e-05, "loss": 0.4301, "step": 271 }, { "epoch": 0.4466338259441708, "grad_norm": 0.39036551117897034, "learning_rate": 8.53457738748628e-05, "loss": 0.464, "step": 272 }, { "epoch": 0.4482758620689655, "grad_norm": 0.3881411850452423, "learning_rate": 8.529088913282109e-05, "loss": 0.4164, "step": 273 }, { "epoch": 0.44991789819376027, "grad_norm": 0.38660669326782227, "learning_rate": 8.523600439077936e-05, "loss": 0.4483, "step": 274 }, { "epoch": 0.451559934318555, "grad_norm": 0.3897208273410797, "learning_rate": 8.518111964873765e-05, "loss": 0.4425, "step": 275 }, { "epoch": 0.45320197044334976, "grad_norm": 0.3783514201641083, "learning_rate": 8.512623490669594e-05, "loss": 0.4738, "step": 276 }, { "epoch": 0.4548440065681445, "grad_norm": 0.3835807740688324, "learning_rate": 8.507135016465423e-05, "loss": 0.4611, "step": 277 }, { "epoch": 0.45648604269293924, "grad_norm": 0.3897012174129486, "learning_rate": 8.50164654226125e-05, "loss": 0.444, "step": 278 }, { "epoch": 0.458128078817734, "grad_norm": 0.3812754452228546, "learning_rate": 8.496158068057081e-05, "loss": 0.4641, "step": 279 }, { "epoch": 0.45977011494252873, "grad_norm": 0.38262248039245605, "learning_rate": 8.490669593852908e-05, "loss": 0.4707, "step": 280 }, { "epoch": 0.4614121510673235, "grad_norm": 0.3785279393196106, "learning_rate": 8.485181119648739e-05, "loss": 0.4204, "step": 281 }, { "epoch": 0.4630541871921182, "grad_norm": 0.8933166265487671, "learning_rate": 8.479692645444566e-05, "loss": 0.7041, "step": 282 }, { "epoch": 0.46469622331691296, "grad_norm": 0.3715771734714508, "learning_rate": 8.474204171240395e-05, "loss": 0.4569, "step": 283 }, { "epoch": 0.4663382594417077, "grad_norm": 0.36009031534194946, "learning_rate": 8.468715697036224e-05, "loss": 0.4329, "step": 284 }, { "epoch": 0.46798029556650245, "grad_norm": 0.38231948018074036, "learning_rate": 8.463227222832053e-05, "loss": 0.4366, "step": 285 }, { "epoch": 0.4696223316912972, "grad_norm": 0.35655301809310913, "learning_rate": 8.457738748627881e-05, "loss": 0.4767, "step": 286 }, { "epoch": 0.47126436781609193, "grad_norm": 0.3441294729709625, "learning_rate": 8.452250274423711e-05, "loss": 0.4219, "step": 287 }, { "epoch": 0.4729064039408867, "grad_norm": 0.37239736318588257, "learning_rate": 8.446761800219539e-05, "loss": 0.4368, "step": 288 }, { "epoch": 0.4745484400656814, "grad_norm": 0.5180085897445679, "learning_rate": 8.441273326015368e-05, "loss": 0.5885, "step": 289 }, { "epoch": 0.47619047619047616, "grad_norm": 0.365043967962265, "learning_rate": 8.435784851811197e-05, "loss": 0.4411, "step": 290 }, { "epoch": 0.47783251231527096, "grad_norm": 0.43386518955230713, "learning_rate": 8.430296377607026e-05, "loss": 0.6578, "step": 291 }, { "epoch": 0.4794745484400657, "grad_norm": 0.4762422740459442, "learning_rate": 8.424807903402855e-05, "loss": 0.4465, "step": 292 }, { "epoch": 0.48111658456486045, "grad_norm": 0.3735050857067108, "learning_rate": 8.419319429198684e-05, "loss": 0.4579, "step": 293 }, { "epoch": 0.4827586206896552, "grad_norm": 0.3508679270744324, "learning_rate": 8.413830954994511e-05, "loss": 0.4374, "step": 294 }, { "epoch": 0.48440065681444994, "grad_norm": 0.3632643520832062, "learning_rate": 8.408342480790341e-05, "loss": 0.465, "step": 295 }, { "epoch": 0.4860426929392447, "grad_norm": 0.38123026490211487, "learning_rate": 8.402854006586169e-05, "loss": 0.4597, "step": 296 }, { "epoch": 0.4876847290640394, "grad_norm": 0.38209372758865356, "learning_rate": 8.397365532381998e-05, "loss": 0.4645, "step": 297 }, { "epoch": 0.48932676518883417, "grad_norm": 0.38527724146842957, "learning_rate": 8.391877058177827e-05, "loss": 0.5191, "step": 298 }, { "epoch": 0.4909688013136289, "grad_norm": 0.38787147402763367, "learning_rate": 8.386388583973656e-05, "loss": 0.4749, "step": 299 }, { "epoch": 0.49261083743842365, "grad_norm": 0.36668193340301514, "learning_rate": 8.380900109769485e-05, "loss": 0.4274, "step": 300 }, { "epoch": 0.4942528735632184, "grad_norm": 0.3840336501598358, "learning_rate": 8.375411635565314e-05, "loss": 0.4351, "step": 301 }, { "epoch": 0.49589490968801314, "grad_norm": 0.3673497438430786, "learning_rate": 8.369923161361141e-05, "loss": 0.4585, "step": 302 }, { "epoch": 0.4975369458128079, "grad_norm": 0.3443084955215454, "learning_rate": 8.364434687156972e-05, "loss": 0.4094, "step": 303 }, { "epoch": 0.49917898193760263, "grad_norm": 0.3644309937953949, "learning_rate": 8.358946212952799e-05, "loss": 0.4347, "step": 304 }, { "epoch": 0.5008210180623974, "grad_norm": 0.3743632733821869, "learning_rate": 8.353457738748628e-05, "loss": 0.4457, "step": 305 }, { "epoch": 0.5024630541871922, "grad_norm": 0.3582209348678589, "learning_rate": 8.347969264544457e-05, "loss": 0.4408, "step": 306 }, { "epoch": 0.5041050903119869, "grad_norm": 0.3710041642189026, "learning_rate": 8.342480790340286e-05, "loss": 0.4429, "step": 307 }, { "epoch": 0.5057471264367817, "grad_norm": 8.523002624511719, "learning_rate": 8.336992316136114e-05, "loss": 0.5718, "step": 308 }, { "epoch": 0.5073891625615764, "grad_norm": 0.37652868032455444, "learning_rate": 8.331503841931944e-05, "loss": 0.4465, "step": 309 }, { "epoch": 0.5090311986863711, "grad_norm": 0.38204285502433777, "learning_rate": 8.326015367727772e-05, "loss": 0.4542, "step": 310 }, { "epoch": 0.5106732348111659, "grad_norm": 0.3687360882759094, "learning_rate": 8.320526893523602e-05, "loss": 0.4262, "step": 311 }, { "epoch": 0.5123152709359606, "grad_norm": 0.37486550211906433, "learning_rate": 8.31503841931943e-05, "loss": 0.4329, "step": 312 }, { "epoch": 0.5139573070607554, "grad_norm": 0.3730839192867279, "learning_rate": 8.309549945115259e-05, "loss": 0.4654, "step": 313 }, { "epoch": 0.5155993431855501, "grad_norm": 0.459148108959198, "learning_rate": 8.304061470911087e-05, "loss": 0.6377, "step": 314 }, { "epoch": 0.5172413793103449, "grad_norm": 0.4899662137031555, "learning_rate": 8.298572996706916e-05, "loss": 0.6825, "step": 315 }, { "epoch": 0.5188834154351396, "grad_norm": 0.39348316192626953, "learning_rate": 8.293084522502744e-05, "loss": 0.4554, "step": 316 }, { "epoch": 0.5205254515599343, "grad_norm": 0.38743141293525696, "learning_rate": 8.287596048298574e-05, "loss": 0.4595, "step": 317 }, { "epoch": 0.5221674876847291, "grad_norm": 0.36865395307540894, "learning_rate": 8.282107574094402e-05, "loss": 0.4774, "step": 318 }, { "epoch": 0.5238095238095238, "grad_norm": 0.3767486810684204, "learning_rate": 8.276619099890231e-05, "loss": 0.4782, "step": 319 }, { "epoch": 0.5254515599343186, "grad_norm": 0.39057424664497375, "learning_rate": 8.27113062568606e-05, "loss": 0.4421, "step": 320 }, { "epoch": 0.5270935960591133, "grad_norm": 0.3860589265823364, "learning_rate": 8.265642151481889e-05, "loss": 0.4512, "step": 321 }, { "epoch": 0.5287356321839081, "grad_norm": 0.3683719336986542, "learning_rate": 8.260153677277718e-05, "loss": 0.4305, "step": 322 }, { "epoch": 0.5303776683087028, "grad_norm": 0.3533482551574707, "learning_rate": 8.254665203073547e-05, "loss": 0.4492, "step": 323 }, { "epoch": 0.5320197044334976, "grad_norm": 0.3771316707134247, "learning_rate": 8.249176728869374e-05, "loss": 0.4745, "step": 324 }, { "epoch": 0.5336617405582923, "grad_norm": 0.39757585525512695, "learning_rate": 8.243688254665203e-05, "loss": 0.4418, "step": 325 }, { "epoch": 0.535303776683087, "grad_norm": 0.3886704444885254, "learning_rate": 8.238199780461032e-05, "loss": 0.4366, "step": 326 }, { "epoch": 0.5369458128078818, "grad_norm": 0.36773255467414856, "learning_rate": 8.232711306256861e-05, "loss": 0.4195, "step": 327 }, { "epoch": 0.5385878489326765, "grad_norm": 0.3555610775947571, "learning_rate": 8.22722283205269e-05, "loss": 0.409, "step": 328 }, { "epoch": 0.5402298850574713, "grad_norm": 0.3500663638114929, "learning_rate": 8.221734357848519e-05, "loss": 0.4238, "step": 329 }, { "epoch": 0.541871921182266, "grad_norm": 0.3708746135234833, "learning_rate": 8.216245883644347e-05, "loss": 0.4429, "step": 330 }, { "epoch": 0.5435139573070608, "grad_norm": 0.34657180309295654, "learning_rate": 8.210757409440176e-05, "loss": 0.4161, "step": 331 }, { "epoch": 0.5451559934318555, "grad_norm": 0.3708213269710541, "learning_rate": 8.205268935236005e-05, "loss": 0.4554, "step": 332 }, { "epoch": 0.5467980295566502, "grad_norm": 0.3768496811389923, "learning_rate": 8.199780461031834e-05, "loss": 0.4476, "step": 333 }, { "epoch": 0.548440065681445, "grad_norm": 0.3515271842479706, "learning_rate": 8.194291986827662e-05, "loss": 0.4691, "step": 334 }, { "epoch": 0.5500821018062397, "grad_norm": 0.36145442724227905, "learning_rate": 8.188803512623491e-05, "loss": 0.4573, "step": 335 }, { "epoch": 0.5517241379310345, "grad_norm": 0.3574090898036957, "learning_rate": 8.18331503841932e-05, "loss": 0.4491, "step": 336 }, { "epoch": 0.5533661740558292, "grad_norm": 0.5666980743408203, "learning_rate": 8.177826564215148e-05, "loss": 0.4411, "step": 337 }, { "epoch": 0.555008210180624, "grad_norm": 0.35076069831848145, "learning_rate": 8.172338090010977e-05, "loss": 0.4348, "step": 338 }, { "epoch": 0.5566502463054187, "grad_norm": 0.37491166591644287, "learning_rate": 8.166849615806806e-05, "loss": 0.4688, "step": 339 }, { "epoch": 0.5582922824302134, "grad_norm": 0.3747582733631134, "learning_rate": 8.161361141602635e-05, "loss": 0.428, "step": 340 }, { "epoch": 0.5599343185550082, "grad_norm": 0.3803423345088959, "learning_rate": 8.155872667398462e-05, "loss": 0.4499, "step": 341 }, { "epoch": 0.5615763546798029, "grad_norm": 0.3716084063053131, "learning_rate": 8.150384193194293e-05, "loss": 0.4301, "step": 342 }, { "epoch": 0.5632183908045977, "grad_norm": 0.363215833902359, "learning_rate": 8.14489571899012e-05, "loss": 0.4365, "step": 343 }, { "epoch": 0.5648604269293924, "grad_norm": 0.3525783121585846, "learning_rate": 8.13940724478595e-05, "loss": 0.4256, "step": 344 }, { "epoch": 0.5665024630541872, "grad_norm": 0.36122629046440125, "learning_rate": 8.133918770581778e-05, "loss": 0.4183, "step": 345 }, { "epoch": 0.5681444991789819, "grad_norm": 0.36856260895729065, "learning_rate": 8.128430296377607e-05, "loss": 0.4671, "step": 346 }, { "epoch": 0.5697865353037767, "grad_norm": 0.35569101572036743, "learning_rate": 8.122941822173436e-05, "loss": 0.4237, "step": 347 }, { "epoch": 0.5714285714285714, "grad_norm": 3.2163820266723633, "learning_rate": 8.117453347969265e-05, "loss": 0.5731, "step": 348 }, { "epoch": 0.5730706075533661, "grad_norm": 0.47037696838378906, "learning_rate": 8.111964873765093e-05, "loss": 0.3985, "step": 349 }, { "epoch": 0.5747126436781609, "grad_norm": 0.3765190541744232, "learning_rate": 8.106476399560923e-05, "loss": 0.4579, "step": 350 }, { "epoch": 0.5763546798029556, "grad_norm": 0.3708636462688446, "learning_rate": 8.10098792535675e-05, "loss": 0.4557, "step": 351 }, { "epoch": 0.5779967159277504, "grad_norm": 0.7152990102767944, "learning_rate": 8.09549945115258e-05, "loss": 0.6374, "step": 352 }, { "epoch": 0.5796387520525451, "grad_norm": 0.3930763304233551, "learning_rate": 8.090010976948409e-05, "loss": 0.4765, "step": 353 }, { "epoch": 0.5812807881773399, "grad_norm": 1.5390056371688843, "learning_rate": 8.084522502744237e-05, "loss": 0.5686, "step": 354 }, { "epoch": 0.5829228243021346, "grad_norm": 0.38492628931999207, "learning_rate": 8.079034028540066e-05, "loss": 0.4387, "step": 355 }, { "epoch": 0.5845648604269293, "grad_norm": 0.37243515253067017, "learning_rate": 8.073545554335895e-05, "loss": 0.423, "step": 356 }, { "epoch": 0.5862068965517241, "grad_norm": 0.3952900469303131, "learning_rate": 8.068057080131723e-05, "loss": 0.4428, "step": 357 }, { "epoch": 0.5878489326765188, "grad_norm": 6.266873359680176, "learning_rate": 8.062568605927553e-05, "loss": 0.742, "step": 358 }, { "epoch": 0.5894909688013136, "grad_norm": 0.3811399042606354, "learning_rate": 8.057080131723381e-05, "loss": 0.4171, "step": 359 }, { "epoch": 0.5911330049261084, "grad_norm": 0.4073907434940338, "learning_rate": 8.05159165751921e-05, "loss": 0.4586, "step": 360 }, { "epoch": 0.5927750410509032, "grad_norm": 0.3944762647151947, "learning_rate": 8.046103183315039e-05, "loss": 0.4574, "step": 361 }, { "epoch": 0.5944170771756979, "grad_norm": 0.3794701099395752, "learning_rate": 8.040614709110868e-05, "loss": 0.4473, "step": 362 }, { "epoch": 0.5960591133004927, "grad_norm": 0.400260865688324, "learning_rate": 8.035126234906695e-05, "loss": 0.4379, "step": 363 }, { "epoch": 0.5977011494252874, "grad_norm": 0.3719666600227356, "learning_rate": 8.029637760702526e-05, "loss": 0.411, "step": 364 }, { "epoch": 0.5993431855500821, "grad_norm": 0.3694835901260376, "learning_rate": 8.024149286498353e-05, "loss": 0.4467, "step": 365 }, { "epoch": 0.6009852216748769, "grad_norm": 0.3488430380821228, "learning_rate": 8.018660812294184e-05, "loss": 0.4257, "step": 366 }, { "epoch": 0.6026272577996716, "grad_norm": 0.36884555220603943, "learning_rate": 8.013172338090011e-05, "loss": 0.4462, "step": 367 }, { "epoch": 0.6042692939244664, "grad_norm": 0.34274569153785706, "learning_rate": 8.00768386388584e-05, "loss": 0.4383, "step": 368 }, { "epoch": 0.6059113300492611, "grad_norm": 0.3839503824710846, "learning_rate": 8.002195389681669e-05, "loss": 0.4466, "step": 369 }, { "epoch": 0.6075533661740559, "grad_norm": 0.3670174777507782, "learning_rate": 7.996706915477498e-05, "loss": 0.467, "step": 370 }, { "epoch": 0.6091954022988506, "grad_norm": 0.39021462202072144, "learning_rate": 7.991218441273326e-05, "loss": 0.4523, "step": 371 }, { "epoch": 0.6108374384236454, "grad_norm": 0.34886837005615234, "learning_rate": 7.985729967069156e-05, "loss": 0.4746, "step": 372 }, { "epoch": 0.6124794745484401, "grad_norm": 2.4073169231414795, "learning_rate": 7.980241492864984e-05, "loss": 0.4377, "step": 373 }, { "epoch": 0.6141215106732348, "grad_norm": 0.4135701656341553, "learning_rate": 7.974753018660812e-05, "loss": 0.4079, "step": 374 }, { "epoch": 0.6157635467980296, "grad_norm": 0.3664657175540924, "learning_rate": 7.969264544456641e-05, "loss": 0.4315, "step": 375 }, { "epoch": 0.6174055829228243, "grad_norm": 0.35204172134399414, "learning_rate": 7.96377607025247e-05, "loss": 0.4733, "step": 376 }, { "epoch": 0.6190476190476191, "grad_norm": 0.3566751778125763, "learning_rate": 7.9582875960483e-05, "loss": 0.4238, "step": 377 }, { "epoch": 0.6206896551724138, "grad_norm": 0.3561181128025055, "learning_rate": 7.952799121844128e-05, "loss": 0.4541, "step": 378 }, { "epoch": 0.6223316912972086, "grad_norm": 0.39184868335723877, "learning_rate": 7.947310647639956e-05, "loss": 0.4767, "step": 379 }, { "epoch": 0.6239737274220033, "grad_norm": 0.34649932384490967, "learning_rate": 7.941822173435786e-05, "loss": 0.4129, "step": 380 }, { "epoch": 0.625615763546798, "grad_norm": 0.33730626106262207, "learning_rate": 7.936333699231614e-05, "loss": 0.4286, "step": 381 }, { "epoch": 0.6272577996715928, "grad_norm": 0.3442136347293854, "learning_rate": 7.930845225027443e-05, "loss": 0.4288, "step": 382 }, { "epoch": 0.6288998357963875, "grad_norm": 0.33078014850616455, "learning_rate": 7.925356750823272e-05, "loss": 0.4232, "step": 383 }, { "epoch": 0.6305418719211823, "grad_norm": 0.3536471128463745, "learning_rate": 7.9198682766191e-05, "loss": 0.4364, "step": 384 }, { "epoch": 0.632183908045977, "grad_norm": 0.35268905758857727, "learning_rate": 7.914379802414928e-05, "loss": 0.4465, "step": 385 }, { "epoch": 0.6338259441707718, "grad_norm": 0.3333486318588257, "learning_rate": 7.908891328210759e-05, "loss": 0.4311, "step": 386 }, { "epoch": 0.6354679802955665, "grad_norm": 0.3493632376194, "learning_rate": 7.903402854006586e-05, "loss": 0.4532, "step": 387 }, { "epoch": 0.6371100164203612, "grad_norm": 0.3376450836658478, "learning_rate": 7.897914379802415e-05, "loss": 0.4372, "step": 388 }, { "epoch": 0.638752052545156, "grad_norm": 0.3757476508617401, "learning_rate": 7.892425905598244e-05, "loss": 0.4403, "step": 389 }, { "epoch": 0.6403940886699507, "grad_norm": 0.3452053666114807, "learning_rate": 7.886937431394073e-05, "loss": 0.4337, "step": 390 }, { "epoch": 0.6420361247947455, "grad_norm": 0.34478917717933655, "learning_rate": 7.881448957189902e-05, "loss": 0.4216, "step": 391 }, { "epoch": 0.6436781609195402, "grad_norm": 0.34799471497535706, "learning_rate": 7.875960482985731e-05, "loss": 0.4366, "step": 392 }, { "epoch": 0.645320197044335, "grad_norm": 0.3328522741794586, "learning_rate": 7.870472008781559e-05, "loss": 0.4297, "step": 393 }, { "epoch": 0.6469622331691297, "grad_norm": 0.3236556351184845, "learning_rate": 7.864983534577387e-05, "loss": 0.4312, "step": 394 }, { "epoch": 0.6486042692939245, "grad_norm": 0.3533720374107361, "learning_rate": 7.859495060373216e-05, "loss": 0.4432, "step": 395 }, { "epoch": 0.6502463054187192, "grad_norm": 0.3423120677471161, "learning_rate": 7.854006586169045e-05, "loss": 0.4697, "step": 396 }, { "epoch": 0.6518883415435139, "grad_norm": 0.3588278591632843, "learning_rate": 7.848518111964874e-05, "loss": 0.4183, "step": 397 }, { "epoch": 0.6535303776683087, "grad_norm": 0.34922847151756287, "learning_rate": 7.843029637760703e-05, "loss": 0.4478, "step": 398 }, { "epoch": 0.6551724137931034, "grad_norm": 0.3259122967720032, "learning_rate": 7.837541163556532e-05, "loss": 0.4409, "step": 399 }, { "epoch": 0.6568144499178982, "grad_norm": 0.33292415738105774, "learning_rate": 7.83205268935236e-05, "loss": 0.4423, "step": 400 }, { "epoch": 0.6584564860426929, "grad_norm": 0.34763479232788086, "learning_rate": 7.826564215148189e-05, "loss": 0.43, "step": 401 }, { "epoch": 0.6600985221674877, "grad_norm": 0.3363021910190582, "learning_rate": 7.821075740944018e-05, "loss": 0.4435, "step": 402 }, { "epoch": 0.6617405582922824, "grad_norm": 0.3428126573562622, "learning_rate": 7.815587266739847e-05, "loss": 0.4337, "step": 403 }, { "epoch": 0.6633825944170771, "grad_norm": 0.3409019410610199, "learning_rate": 7.810098792535676e-05, "loss": 0.441, "step": 404 }, { "epoch": 0.6650246305418719, "grad_norm": 0.3440620005130768, "learning_rate": 7.804610318331505e-05, "loss": 0.422, "step": 405 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3330303132534027, "learning_rate": 7.799121844127332e-05, "loss": 0.4374, "step": 406 }, { "epoch": 0.6683087027914614, "grad_norm": 0.34392058849334717, "learning_rate": 7.793633369923163e-05, "loss": 0.4425, "step": 407 }, { "epoch": 0.6699507389162561, "grad_norm": 0.3372094929218292, "learning_rate": 7.78814489571899e-05, "loss": 0.4419, "step": 408 }, { "epoch": 0.6715927750410509, "grad_norm": 0.34330740571022034, "learning_rate": 7.782656421514819e-05, "loss": 0.4251, "step": 409 }, { "epoch": 0.6732348111658456, "grad_norm": 0.3496563136577606, "learning_rate": 7.777167947310648e-05, "loss": 0.4393, "step": 410 }, { "epoch": 0.6748768472906403, "grad_norm": 0.3364163041114807, "learning_rate": 7.771679473106477e-05, "loss": 0.4329, "step": 411 }, { "epoch": 0.6765188834154351, "grad_norm": 0.3421562910079956, "learning_rate": 7.766190998902305e-05, "loss": 0.4454, "step": 412 }, { "epoch": 0.6781609195402298, "grad_norm": 0.3287839889526367, "learning_rate": 7.760702524698135e-05, "loss": 0.4338, "step": 413 }, { "epoch": 0.6798029556650246, "grad_norm": 0.34713035821914673, "learning_rate": 7.755214050493962e-05, "loss": 0.4267, "step": 414 }, { "epoch": 0.6814449917898193, "grad_norm": 0.33156299591064453, "learning_rate": 7.749725576289791e-05, "loss": 0.4071, "step": 415 }, { "epoch": 0.6830870279146142, "grad_norm": 0.3401210606098175, "learning_rate": 7.74423710208562e-05, "loss": 0.4477, "step": 416 }, { "epoch": 0.6847290640394089, "grad_norm": 0.34161490201950073, "learning_rate": 7.73874862788145e-05, "loss": 0.4588, "step": 417 }, { "epoch": 0.6863711001642037, "grad_norm": 0.3357692062854767, "learning_rate": 7.733260153677278e-05, "loss": 0.4138, "step": 418 }, { "epoch": 0.6880131362889984, "grad_norm": 0.33759045600891113, "learning_rate": 7.727771679473107e-05, "loss": 0.4379, "step": 419 }, { "epoch": 0.6896551724137931, "grad_norm": 0.35734590888023376, "learning_rate": 7.722283205268935e-05, "loss": 0.445, "step": 420 }, { "epoch": 0.6912972085385879, "grad_norm": 0.33996787667274475, "learning_rate": 7.716794731064765e-05, "loss": 0.415, "step": 421 }, { "epoch": 0.6929392446633826, "grad_norm": 0.34739911556243896, "learning_rate": 7.711306256860593e-05, "loss": 0.4097, "step": 422 }, { "epoch": 0.6945812807881774, "grad_norm": 0.3301583528518677, "learning_rate": 7.705817782656422e-05, "loss": 0.4077, "step": 423 }, { "epoch": 0.6962233169129721, "grad_norm": 0.33429691195487976, "learning_rate": 7.70032930845225e-05, "loss": 0.4141, "step": 424 }, { "epoch": 0.6978653530377669, "grad_norm": 0.34263110160827637, "learning_rate": 7.69484083424808e-05, "loss": 0.4146, "step": 425 }, { "epoch": 0.6995073891625616, "grad_norm": 0.3295872211456299, "learning_rate": 7.689352360043907e-05, "loss": 0.4116, "step": 426 }, { "epoch": 0.7011494252873564, "grad_norm": 0.3661148250102997, "learning_rate": 7.683863885839738e-05, "loss": 0.4234, "step": 427 }, { "epoch": 0.7027914614121511, "grad_norm": 0.3412879407405853, "learning_rate": 7.678375411635565e-05, "loss": 0.4465, "step": 428 }, { "epoch": 0.7044334975369458, "grad_norm": 0.3411206901073456, "learning_rate": 7.672886937431395e-05, "loss": 0.411, "step": 429 }, { "epoch": 0.7060755336617406, "grad_norm": 0.32851916551589966, "learning_rate": 7.667398463227223e-05, "loss": 0.4041, "step": 430 }, { "epoch": 0.7077175697865353, "grad_norm": 0.336635947227478, "learning_rate": 7.661909989023052e-05, "loss": 0.447, "step": 431 }, { "epoch": 0.7093596059113301, "grad_norm": 0.35531318187713623, "learning_rate": 7.656421514818881e-05, "loss": 0.4115, "step": 432 }, { "epoch": 0.7110016420361248, "grad_norm": 0.33751359581947327, "learning_rate": 7.65093304061471e-05, "loss": 0.4199, "step": 433 }, { "epoch": 0.7126436781609196, "grad_norm": 0.34485429525375366, "learning_rate": 7.645444566410537e-05, "loss": 0.3993, "step": 434 }, { "epoch": 0.7142857142857143, "grad_norm": 0.33902156352996826, "learning_rate": 7.639956092206368e-05, "loss": 0.4375, "step": 435 }, { "epoch": 0.715927750410509, "grad_norm": 0.3342963755130768, "learning_rate": 7.634467618002195e-05, "loss": 0.4476, "step": 436 }, { "epoch": 0.7175697865353038, "grad_norm": 0.3154541254043579, "learning_rate": 7.628979143798024e-05, "loss": 0.4052, "step": 437 }, { "epoch": 0.7192118226600985, "grad_norm": 0.33906570076942444, "learning_rate": 7.623490669593853e-05, "loss": 0.4314, "step": 438 }, { "epoch": 0.7208538587848933, "grad_norm": 0.3214341402053833, "learning_rate": 7.618002195389682e-05, "loss": 0.4172, "step": 439 }, { "epoch": 0.722495894909688, "grad_norm": 0.3152031898498535, "learning_rate": 7.612513721185511e-05, "loss": 0.4179, "step": 440 }, { "epoch": 0.7241379310344828, "grad_norm": 0.33467361330986023, "learning_rate": 7.60702524698134e-05, "loss": 0.4223, "step": 441 }, { "epoch": 0.7257799671592775, "grad_norm": 0.35381338000297546, "learning_rate": 7.601536772777168e-05, "loss": 0.4606, "step": 442 }, { "epoch": 0.7274220032840722, "grad_norm": 0.34129199385643005, "learning_rate": 7.596048298572998e-05, "loss": 0.4364, "step": 443 }, { "epoch": 0.729064039408867, "grad_norm": 0.33412957191467285, "learning_rate": 7.590559824368826e-05, "loss": 0.4367, "step": 444 }, { "epoch": 0.7307060755336617, "grad_norm": 0.3414211869239807, "learning_rate": 7.585071350164655e-05, "loss": 0.4167, "step": 445 }, { "epoch": 0.7323481116584565, "grad_norm": 0.3421005606651306, "learning_rate": 7.579582875960484e-05, "loss": 0.4231, "step": 446 }, { "epoch": 0.7339901477832512, "grad_norm": 0.3322966396808624, "learning_rate": 7.574094401756313e-05, "loss": 0.4655, "step": 447 }, { "epoch": 0.735632183908046, "grad_norm": 0.3495261073112488, "learning_rate": 7.56860592755214e-05, "loss": 0.4357, "step": 448 }, { "epoch": 0.735632183908046, "eval_runtime": 72.6421, "eval_samples_per_second": 2.712, "eval_steps_per_second": 1.363, "step": 448 } ], "logging_steps": 1, "max_steps": 1827, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 224, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.879823639484826e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }