{ "best_metric": 6.436838150024414, "best_model_checkpoint": "./results/models/checkpoint-434265", "epoch": 17.0, "eval_steps": 500, "global_step": 434265, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019573302016050106, "grad_norm": 0.59765625, "learning_rate": 0.000999608533959679, "loss": 7.0053, "step": 500 }, { "epoch": 0.03914660403210021, "grad_norm": 0.625, "learning_rate": 0.000999217067919358, "loss": 6.8806, "step": 1000 }, { "epoch": 0.058719906048150326, "grad_norm": 0.79296875, "learning_rate": 0.0009988256018790371, "loss": 6.8512, "step": 1500 }, { "epoch": 0.07829320806420043, "grad_norm": 4.4375, "learning_rate": 0.000998434135838716, "loss": 6.8494, "step": 2000 }, { "epoch": 0.09786651008025053, "grad_norm": 1.3203125, "learning_rate": 0.000998042669798395, "loss": 6.8296, "step": 2500 }, { "epoch": 0.11743981209630065, "grad_norm": 1.7265625, "learning_rate": 0.000997651203758074, "loss": 6.8209, "step": 3000 }, { "epoch": 0.13701311411235076, "grad_norm": 0.93359375, "learning_rate": 0.0009972597377177531, "loss": 6.8119, "step": 3500 }, { "epoch": 0.15658641612840085, "grad_norm": 0.95703125, "learning_rate": 0.000996868271677432, "loss": 6.8096, "step": 4000 }, { "epoch": 0.17615971814445097, "grad_norm": 1.0546875, "learning_rate": 0.0009964768056371109, "loss": 6.7987, "step": 4500 }, { "epoch": 0.19573302016050106, "grad_norm": 0.94921875, "learning_rate": 0.00099608533959679, "loss": 6.7946, "step": 5000 }, { "epoch": 0.21530632217655118, "grad_norm": 2.34375, "learning_rate": 0.000995693873556469, "loss": 6.7825, "step": 5500 }, { "epoch": 0.2348796241926013, "grad_norm": 0.94140625, "learning_rate": 0.000995302407516148, "loss": 6.7724, "step": 6000 }, { "epoch": 0.2544529262086514, "grad_norm": 0.8671875, "learning_rate": 0.0009949109414758269, "loss": 6.7732, "step": 6500 }, { "epoch": 0.2740262282247015, "grad_norm": 1.3671875, "learning_rate": 0.000994519475435506, "loss": 6.7652, "step": 7000 }, { "epoch": 0.29359953024075164, "grad_norm": 1.046875, "learning_rate": 0.000994128009395185, "loss": 6.7589, "step": 7500 }, { "epoch": 0.3131728322568017, "grad_norm": 0.96484375, "learning_rate": 0.000993736543354864, "loss": 6.7583, "step": 8000 }, { "epoch": 0.3327461342728518, "grad_norm": 1.5234375, "learning_rate": 0.0009933450773145429, "loss": 6.7515, "step": 8500 }, { "epoch": 0.35231943628890194, "grad_norm": 2.234375, "learning_rate": 0.000992953611274222, "loss": 6.7467, "step": 9000 }, { "epoch": 0.37189273830495206, "grad_norm": 1.46875, "learning_rate": 0.000992562145233901, "loss": 6.7366, "step": 9500 }, { "epoch": 0.39146604032100213, "grad_norm": 1.1484375, "learning_rate": 0.00099217067919358, "loss": 6.7365, "step": 10000 }, { "epoch": 0.41103934233705225, "grad_norm": 1.4375, "learning_rate": 0.000991779213153259, "loss": 6.7253, "step": 10500 }, { "epoch": 0.43061264435310237, "grad_norm": 0.87109375, "learning_rate": 0.000991387747112938, "loss": 6.7356, "step": 11000 }, { "epoch": 0.4501859463691525, "grad_norm": 1.484375, "learning_rate": 0.000990996281072617, "loss": 6.7177, "step": 11500 }, { "epoch": 0.4697592483852026, "grad_norm": 3.15625, "learning_rate": 0.000990604815032296, "loss": 6.7195, "step": 12000 }, { "epoch": 0.4893325504012527, "grad_norm": 0.9140625, "learning_rate": 0.000990213348991975, "loss": 6.7202, "step": 12500 }, { "epoch": 0.5089058524173028, "grad_norm": 1.046875, "learning_rate": 0.000989821882951654, "loss": 6.7183, "step": 13000 }, { "epoch": 0.5284791544333529, "grad_norm": 0.91015625, "learning_rate": 0.000989430416911333, "loss": 6.7186, "step": 13500 }, { "epoch": 0.548052456449403, "grad_norm": 1.4765625, "learning_rate": 0.000989038950871012, "loss": 6.7221, "step": 14000 }, { "epoch": 0.5676257584654532, "grad_norm": 1.3515625, "learning_rate": 0.000988647484830691, "loss": 6.7046, "step": 14500 }, { "epoch": 0.5871990604815033, "grad_norm": 1.1640625, "learning_rate": 0.00098825601879037, "loss": 6.7078, "step": 15000 }, { "epoch": 0.6067723624975533, "grad_norm": 1.109375, "learning_rate": 0.000987864552750049, "loss": 6.7063, "step": 15500 }, { "epoch": 0.6263456645136034, "grad_norm": 4.8125, "learning_rate": 0.000987473086709728, "loss": 6.6989, "step": 16000 }, { "epoch": 0.6459189665296535, "grad_norm": 1.3125, "learning_rate": 0.000987081620669407, "loss": 6.6951, "step": 16500 }, { "epoch": 0.6654922685457036, "grad_norm": 0.890625, "learning_rate": 0.000986690154629086, "loss": 6.6921, "step": 17000 }, { "epoch": 0.6850655705617538, "grad_norm": 1.7890625, "learning_rate": 0.0009862986885887648, "loss": 6.6921, "step": 17500 }, { "epoch": 0.7046388725778039, "grad_norm": 7.75, "learning_rate": 0.000985907222548444, "loss": 6.6884, "step": 18000 }, { "epoch": 0.724212174593854, "grad_norm": 1.203125, "learning_rate": 0.000985515756508123, "loss": 6.6841, "step": 18500 }, { "epoch": 0.7437854766099041, "grad_norm": 7.46875, "learning_rate": 0.000985124290467802, "loss": 6.6904, "step": 19000 }, { "epoch": 0.7633587786259542, "grad_norm": 1.6171875, "learning_rate": 0.0009847328244274808, "loss": 6.6847, "step": 19500 }, { "epoch": 0.7829320806420043, "grad_norm": 1.125, "learning_rate": 0.00098434135838716, "loss": 6.6799, "step": 20000 }, { "epoch": 0.8025053826580544, "grad_norm": 1.734375, "learning_rate": 0.000983949892346839, "loss": 6.6814, "step": 20500 }, { "epoch": 0.8220786846741045, "grad_norm": 1.4921875, "learning_rate": 0.000983558426306518, "loss": 6.6767, "step": 21000 }, { "epoch": 0.8416519866901546, "grad_norm": 4.5, "learning_rate": 0.0009831669602661968, "loss": 6.6714, "step": 21500 }, { "epoch": 0.8612252887062047, "grad_norm": 1.75, "learning_rate": 0.000982775494225876, "loss": 6.6725, "step": 22000 }, { "epoch": 0.8807985907222549, "grad_norm": 1.7265625, "learning_rate": 0.000982384028185555, "loss": 6.6646, "step": 22500 }, { "epoch": 0.900371892738305, "grad_norm": 2.25, "learning_rate": 0.000981992562145234, "loss": 6.6636, "step": 23000 }, { "epoch": 0.9199451947543551, "grad_norm": 2.203125, "learning_rate": 0.0009816010961049128, "loss": 6.6506, "step": 23500 }, { "epoch": 0.9395184967704052, "grad_norm": 2.96875, "learning_rate": 0.000981209630064592, "loss": 6.6546, "step": 24000 }, { "epoch": 0.9590917987864552, "grad_norm": 1.0546875, "learning_rate": 0.000980818164024271, "loss": 6.6504, "step": 24500 }, { "epoch": 0.9786651008025053, "grad_norm": 3.421875, "learning_rate": 0.0009804266979839499, "loss": 6.6499, "step": 25000 }, { "epoch": 0.9982384028185555, "grad_norm": 3.0625, "learning_rate": 0.0009800352319436288, "loss": 6.6422, "step": 25500 }, { "epoch": 1.0, "eval_loss": 6.643181800842285, "eval_runtime": 23.6647, "eval_samples_per_second": 84.514, "eval_steps_per_second": 5.282, "step": 25545 }, { "epoch": 1.0178117048346056, "grad_norm": 1.7578125, "learning_rate": 0.0009796437659033079, "loss": 6.6453, "step": 26000 }, { "epoch": 1.0373850068506556, "grad_norm": 0.84375, "learning_rate": 0.000979252299862987, "loss": 6.6465, "step": 26500 }, { "epoch": 1.0569583088667058, "grad_norm": 2.421875, "learning_rate": 0.0009788608338226659, "loss": 6.6497, "step": 27000 }, { "epoch": 1.0765316108827558, "grad_norm": 1.3515625, "learning_rate": 0.0009784693677823448, "loss": 6.642, "step": 27500 }, { "epoch": 1.096104912898806, "grad_norm": 0.89453125, "learning_rate": 0.0009780779017420239, "loss": 6.6384, "step": 28000 }, { "epoch": 1.115678214914856, "grad_norm": 0.88671875, "learning_rate": 0.000977686435701703, "loss": 6.6389, "step": 28500 }, { "epoch": 1.1352515169309063, "grad_norm": 1.203125, "learning_rate": 0.0009772949696613819, "loss": 6.6361, "step": 29000 }, { "epoch": 1.1548248189469563, "grad_norm": 1.140625, "learning_rate": 0.0009769035036210608, "loss": 6.6393, "step": 29500 }, { "epoch": 1.1743981209630066, "grad_norm": 1.421875, "learning_rate": 0.00097651203758074, "loss": 6.6413, "step": 30000 }, { "epoch": 1.1939714229790566, "grad_norm": 1.421875, "learning_rate": 0.0009761205715404189, "loss": 6.6334, "step": 30500 }, { "epoch": 1.2135447249951068, "grad_norm": 1.390625, "learning_rate": 0.0009757291055000979, "loss": 6.6316, "step": 31000 }, { "epoch": 1.2331180270111568, "grad_norm": 2.015625, "learning_rate": 0.000975337639459777, "loss": 6.6257, "step": 31500 }, { "epoch": 1.2526913290272068, "grad_norm": 2.0, "learning_rate": 0.000974946173419456, "loss": 6.6277, "step": 32000 }, { "epoch": 1.272264631043257, "grad_norm": 1.4765625, "learning_rate": 0.0009745547073791348, "loss": 6.6333, "step": 32500 }, { "epoch": 1.291837933059307, "grad_norm": 1.28125, "learning_rate": 0.0009741632413388138, "loss": 6.6241, "step": 33000 }, { "epoch": 1.3114112350753573, "grad_norm": 1.078125, "learning_rate": 0.000973771775298493, "loss": 6.6336, "step": 33500 }, { "epoch": 1.3309845370914073, "grad_norm": 3.359375, "learning_rate": 0.0009733803092581718, "loss": 6.6298, "step": 34000 }, { "epoch": 1.3505578391074575, "grad_norm": 1.5, "learning_rate": 0.0009729888432178508, "loss": 6.6248, "step": 34500 }, { "epoch": 1.3701311411235075, "grad_norm": 1.2109375, "learning_rate": 0.0009725973771775298, "loss": 6.6291, "step": 35000 }, { "epoch": 1.3897044431395575, "grad_norm": 2.125, "learning_rate": 0.0009722059111372089, "loss": 6.6224, "step": 35500 }, { "epoch": 1.4092777451556078, "grad_norm": 1.375, "learning_rate": 0.0009718144450968878, "loss": 6.6194, "step": 36000 }, { "epoch": 1.4288510471716578, "grad_norm": 1.859375, "learning_rate": 0.0009714229790565668, "loss": 6.6127, "step": 36500 }, { "epoch": 1.448424349187708, "grad_norm": 1.234375, "learning_rate": 0.0009710315130162458, "loss": 6.6062, "step": 37000 }, { "epoch": 1.467997651203758, "grad_norm": 1.078125, "learning_rate": 0.0009706400469759249, "loss": 6.6188, "step": 37500 }, { "epoch": 1.4875709532198083, "grad_norm": 2.125, "learning_rate": 0.0009702485809356038, "loss": 6.6132, "step": 38000 }, { "epoch": 1.5071442552358583, "grad_norm": 3.34375, "learning_rate": 0.0009698571148952828, "loss": 6.6034, "step": 38500 }, { "epoch": 1.5267175572519083, "grad_norm": 2.359375, "learning_rate": 0.0009694656488549618, "loss": 6.6049, "step": 39000 }, { "epoch": 1.5462908592679585, "grad_norm": 3.65625, "learning_rate": 0.0009690741828146409, "loss": 6.6077, "step": 39500 }, { "epoch": 1.5658641612840087, "grad_norm": 1.3046875, "learning_rate": 0.0009686827167743198, "loss": 6.6054, "step": 40000 }, { "epoch": 1.5854374633000587, "grad_norm": 3.140625, "learning_rate": 0.0009682912507339988, "loss": 6.6109, "step": 40500 }, { "epoch": 1.6050107653161088, "grad_norm": 1.21875, "learning_rate": 0.0009678997846936779, "loss": 6.6122, "step": 41000 }, { "epoch": 1.624584067332159, "grad_norm": 1.3984375, "learning_rate": 0.0009675083186533569, "loss": 6.6049, "step": 41500 }, { "epoch": 1.644157369348209, "grad_norm": 2.5625, "learning_rate": 0.0009671168526130358, "loss": 6.6004, "step": 42000 }, { "epoch": 1.663730671364259, "grad_norm": 6.8125, "learning_rate": 0.0009667253865727148, "loss": 6.608, "step": 42500 }, { "epoch": 1.6833039733803092, "grad_norm": 1.34375, "learning_rate": 0.0009663339205323939, "loss": 6.5973, "step": 43000 }, { "epoch": 1.7028772753963595, "grad_norm": 2.328125, "learning_rate": 0.0009659424544920729, "loss": 6.5984, "step": 43500 }, { "epoch": 1.7224505774124095, "grad_norm": 2.90625, "learning_rate": 0.0009655509884517518, "loss": 6.6001, "step": 44000 }, { "epoch": 1.7420238794284595, "grad_norm": 1.1171875, "learning_rate": 0.0009651595224114308, "loss": 6.6016, "step": 44500 }, { "epoch": 1.7615971814445097, "grad_norm": 2.84375, "learning_rate": 0.0009647680563711099, "loss": 6.598, "step": 45000 }, { "epoch": 1.78117048346056, "grad_norm": 2.03125, "learning_rate": 0.0009643765903307889, "loss": 6.6052, "step": 45500 }, { "epoch": 1.80074378547661, "grad_norm": 1.421875, "learning_rate": 0.0009639851242904678, "loss": 6.5902, "step": 46000 }, { "epoch": 1.82031708749266, "grad_norm": 1.8671875, "learning_rate": 0.0009635936582501468, "loss": 6.5847, "step": 46500 }, { "epoch": 1.8398903895087102, "grad_norm": 6.15625, "learning_rate": 0.0009632021922098259, "loss": 6.5948, "step": 47000 }, { "epoch": 1.8594636915247602, "grad_norm": 9.8125, "learning_rate": 0.0009628107261695049, "loss": 6.6071, "step": 47500 }, { "epoch": 1.8790369935408102, "grad_norm": 3.515625, "learning_rate": 0.0009624192601291838, "loss": 6.5973, "step": 48000 }, { "epoch": 1.8986102955568605, "grad_norm": 4.5625, "learning_rate": 0.0009620277940888628, "loss": 6.5917, "step": 48500 }, { "epoch": 1.9181835975729107, "grad_norm": 0.83984375, "learning_rate": 0.0009616363280485419, "loss": 6.5937, "step": 49000 }, { "epoch": 1.9377568995889607, "grad_norm": 2.984375, "learning_rate": 0.0009612448620082208, "loss": 6.5919, "step": 49500 }, { "epoch": 1.9573302016050107, "grad_norm": 1.90625, "learning_rate": 0.0009608533959678998, "loss": 6.5885, "step": 50000 }, { "epoch": 1.976903503621061, "grad_norm": 3.203125, "learning_rate": 0.0009604619299275788, "loss": 6.6004, "step": 50500 }, { "epoch": 1.996476805637111, "grad_norm": 7.4375, "learning_rate": 0.0009600704638872579, "loss": 6.5952, "step": 51000 }, { "epoch": 2.0, "eval_loss": 6.583548545837402, "eval_runtime": 20.5988, "eval_samples_per_second": 97.093, "eval_steps_per_second": 6.068, "step": 51090 }, { "epoch": 2.016050107653161, "grad_norm": 1.4296875, "learning_rate": 0.0009596789978469367, "loss": 6.5932, "step": 51500 }, { "epoch": 2.035623409669211, "grad_norm": 1.1796875, "learning_rate": 0.0009592875318066157, "loss": 6.5895, "step": 52000 }, { "epoch": 2.0551967116852614, "grad_norm": 9.0625, "learning_rate": 0.0009588960657662949, "loss": 6.5814, "step": 52500 }, { "epoch": 2.074770013701311, "grad_norm": 5.78125, "learning_rate": 0.0009585045997259738, "loss": 6.5826, "step": 53000 }, { "epoch": 2.0943433157173614, "grad_norm": 1.5078125, "learning_rate": 0.0009581131336856527, "loss": 6.591, "step": 53500 }, { "epoch": 2.1139166177334117, "grad_norm": 3.921875, "learning_rate": 0.0009577216676453317, "loss": 6.5807, "step": 54000 }, { "epoch": 2.133489919749462, "grad_norm": 3.09375, "learning_rate": 0.0009573302016050108, "loss": 6.5793, "step": 54500 }, { "epoch": 2.1530632217655117, "grad_norm": 2.953125, "learning_rate": 0.0009569387355646898, "loss": 6.5854, "step": 55000 }, { "epoch": 2.172636523781562, "grad_norm": 5.53125, "learning_rate": 0.0009565472695243687, "loss": 6.5796, "step": 55500 }, { "epoch": 2.192209825797612, "grad_norm": 1.3359375, "learning_rate": 0.0009561558034840477, "loss": 6.5693, "step": 56000 }, { "epoch": 2.2117831278136624, "grad_norm": 2.21875, "learning_rate": 0.0009557643374437268, "loss": 6.5837, "step": 56500 }, { "epoch": 2.231356429829712, "grad_norm": 2.609375, "learning_rate": 0.0009553728714034058, "loss": 6.5791, "step": 57000 }, { "epoch": 2.2509297318457624, "grad_norm": 1.9765625, "learning_rate": 0.0009549814053630847, "loss": 6.5746, "step": 57500 }, { "epoch": 2.2705030338618126, "grad_norm": 1.6171875, "learning_rate": 0.0009545899393227637, "loss": 6.5766, "step": 58000 }, { "epoch": 2.2900763358778624, "grad_norm": 2.9375, "learning_rate": 0.0009541984732824428, "loss": 6.5832, "step": 58500 }, { "epoch": 2.3096496378939126, "grad_norm": 3.28125, "learning_rate": 0.0009538070072421218, "loss": 6.5732, "step": 59000 }, { "epoch": 2.329222939909963, "grad_norm": 1.0625, "learning_rate": 0.0009534155412018007, "loss": 6.5774, "step": 59500 }, { "epoch": 2.348796241926013, "grad_norm": 2.921875, "learning_rate": 0.0009530240751614797, "loss": 6.5692, "step": 60000 }, { "epoch": 2.368369543942063, "grad_norm": 1.8046875, "learning_rate": 0.0009526326091211588, "loss": 6.5789, "step": 60500 }, { "epoch": 2.387942845958113, "grad_norm": 1.4921875, "learning_rate": 0.0009522411430808378, "loss": 6.576, "step": 61000 }, { "epoch": 2.4075161479741634, "grad_norm": 8.6875, "learning_rate": 0.0009518496770405167, "loss": 6.5807, "step": 61500 }, { "epoch": 2.4270894499902136, "grad_norm": 2.71875, "learning_rate": 0.0009514582110001958, "loss": 6.5753, "step": 62000 }, { "epoch": 2.4466627520062634, "grad_norm": 1.9765625, "learning_rate": 0.0009510667449598748, "loss": 6.5799, "step": 62500 }, { "epoch": 2.4662360540223136, "grad_norm": 1.890625, "learning_rate": 0.0009506752789195538, "loss": 6.5678, "step": 63000 }, { "epoch": 2.485809356038364, "grad_norm": 1.421875, "learning_rate": 0.0009502838128792327, "loss": 6.5683, "step": 63500 }, { "epoch": 2.5053826580544136, "grad_norm": 3.03125, "learning_rate": 0.0009498923468389118, "loss": 6.5674, "step": 64000 }, { "epoch": 2.524955960070464, "grad_norm": 0.828125, "learning_rate": 0.0009495008807985908, "loss": 6.5606, "step": 64500 }, { "epoch": 2.544529262086514, "grad_norm": 0.8359375, "learning_rate": 0.0009491094147582697, "loss": 6.5669, "step": 65000 }, { "epoch": 2.564102564102564, "grad_norm": 1.8671875, "learning_rate": 0.0009487179487179487, "loss": 6.5519, "step": 65500 }, { "epoch": 2.583675866118614, "grad_norm": 1.3515625, "learning_rate": 0.0009483264826776278, "loss": 6.5573, "step": 66000 }, { "epoch": 2.6032491681346643, "grad_norm": 3.125, "learning_rate": 0.0009479350166373068, "loss": 6.5511, "step": 66500 }, { "epoch": 2.6228224701507146, "grad_norm": 1.4453125, "learning_rate": 0.0009475435505969857, "loss": 6.5593, "step": 67000 }, { "epoch": 2.642395772166765, "grad_norm": 7.375, "learning_rate": 0.0009471520845566647, "loss": 6.5606, "step": 67500 }, { "epoch": 2.6619690741828146, "grad_norm": 1.4765625, "learning_rate": 0.0009467606185163438, "loss": 6.5601, "step": 68000 }, { "epoch": 2.681542376198865, "grad_norm": 3.546875, "learning_rate": 0.0009463691524760228, "loss": 6.5607, "step": 68500 }, { "epoch": 2.701115678214915, "grad_norm": 3.046875, "learning_rate": 0.0009459776864357017, "loss": 6.5462, "step": 69000 }, { "epoch": 2.720688980230965, "grad_norm": 1.59375, "learning_rate": 0.0009455862203953807, "loss": 6.567, "step": 69500 }, { "epoch": 2.740262282247015, "grad_norm": 1.5625, "learning_rate": 0.0009451947543550598, "loss": 6.5566, "step": 70000 }, { "epoch": 2.7598355842630653, "grad_norm": 4.40625, "learning_rate": 0.0009448032883147388, "loss": 6.5543, "step": 70500 }, { "epoch": 2.779408886279115, "grad_norm": 2.640625, "learning_rate": 0.0009444118222744176, "loss": 6.5527, "step": 71000 }, { "epoch": 2.7989821882951653, "grad_norm": 1.09375, "learning_rate": 0.0009440203562340968, "loss": 6.557, "step": 71500 }, { "epoch": 2.8185554903112156, "grad_norm": 1.0234375, "learning_rate": 0.0009436288901937757, "loss": 6.5603, "step": 72000 }, { "epoch": 2.8381287923272653, "grad_norm": 4.46875, "learning_rate": 0.0009432374241534547, "loss": 6.5515, "step": 72500 }, { "epoch": 2.8577020943433156, "grad_norm": 3.796875, "learning_rate": 0.0009428459581131336, "loss": 6.5506, "step": 73000 }, { "epoch": 2.877275396359366, "grad_norm": 1.1640625, "learning_rate": 0.0009424544920728127, "loss": 6.5583, "step": 73500 }, { "epoch": 2.896848698375416, "grad_norm": 1.8671875, "learning_rate": 0.0009420630260324917, "loss": 6.5545, "step": 74000 }, { "epoch": 2.9164220003914663, "grad_norm": 8.6875, "learning_rate": 0.0009416715599921707, "loss": 6.5564, "step": 74500 }, { "epoch": 2.935995302407516, "grad_norm": 1.28125, "learning_rate": 0.0009412800939518496, "loss": 6.5502, "step": 75000 }, { "epoch": 2.9555686044235663, "grad_norm": 6.375, "learning_rate": 0.0009408886279115287, "loss": 6.5434, "step": 75500 }, { "epoch": 2.9751419064396165, "grad_norm": 1.8125, "learning_rate": 0.0009404971618712077, "loss": 6.5457, "step": 76000 }, { "epoch": 2.9947152084556663, "grad_norm": 2.34375, "learning_rate": 0.0009401056958308867, "loss": 6.5616, "step": 76500 }, { "epoch": 3.0, "eval_loss": 6.546030044555664, "eval_runtime": 20.7765, "eval_samples_per_second": 96.262, "eval_steps_per_second": 6.016, "step": 76635 }, { "epoch": 3.0142885104717165, "grad_norm": 2.34375, "learning_rate": 0.0009397142297905656, "loss": 6.545, "step": 77000 }, { "epoch": 3.0338618124877668, "grad_norm": 1.6328125, "learning_rate": 0.0009393227637502447, "loss": 6.5504, "step": 77500 }, { "epoch": 3.053435114503817, "grad_norm": 1.453125, "learning_rate": 0.0009389312977099237, "loss": 6.5508, "step": 78000 }, { "epoch": 3.073008416519867, "grad_norm": 3.328125, "learning_rate": 0.0009385398316696027, "loss": 6.5495, "step": 78500 }, { "epoch": 3.092581718535917, "grad_norm": 1.9609375, "learning_rate": 0.0009381483656292816, "loss": 6.5518, "step": 79000 }, { "epoch": 3.1121550205519672, "grad_norm": 2.578125, "learning_rate": 0.0009377568995889607, "loss": 6.5535, "step": 79500 }, { "epoch": 3.131728322568017, "grad_norm": 1.65625, "learning_rate": 0.0009373654335486397, "loss": 6.5451, "step": 80000 }, { "epoch": 3.1513016245840673, "grad_norm": 7.125, "learning_rate": 0.0009369739675083186, "loss": 6.545, "step": 80500 }, { "epoch": 3.1708749266001175, "grad_norm": 2.671875, "learning_rate": 0.0009365825014679976, "loss": 6.5379, "step": 81000 }, { "epoch": 3.1904482286161677, "grad_norm": 2.125, "learning_rate": 0.0009361910354276767, "loss": 6.5455, "step": 81500 }, { "epoch": 3.2100215306322175, "grad_norm": 1.171875, "learning_rate": 0.0009357995693873557, "loss": 6.5449, "step": 82000 }, { "epoch": 3.2295948326482677, "grad_norm": 2.375, "learning_rate": 0.0009354081033470346, "loss": 6.5413, "step": 82500 }, { "epoch": 3.249168134664318, "grad_norm": 3.578125, "learning_rate": 0.0009350166373067137, "loss": 6.5442, "step": 83000 }, { "epoch": 3.2687414366803678, "grad_norm": 1.3125, "learning_rate": 0.0009346251712663927, "loss": 6.5454, "step": 83500 }, { "epoch": 3.288314738696418, "grad_norm": 1.265625, "learning_rate": 0.0009342337052260717, "loss": 6.5383, "step": 84000 }, { "epoch": 3.3078880407124682, "grad_norm": 2.28125, "learning_rate": 0.0009338422391857506, "loss": 6.5521, "step": 84500 }, { "epoch": 3.3274613427285185, "grad_norm": 3.40625, "learning_rate": 0.0009334507731454297, "loss": 6.5454, "step": 85000 }, { "epoch": 3.3470346447445682, "grad_norm": 1.546875, "learning_rate": 0.0009330593071051087, "loss": 6.5394, "step": 85500 }, { "epoch": 3.3666079467606185, "grad_norm": 1.453125, "learning_rate": 0.0009326678410647877, "loss": 6.5398, "step": 86000 }, { "epoch": 3.3861812487766687, "grad_norm": 1.703125, "learning_rate": 0.0009322763750244666, "loss": 6.5434, "step": 86500 }, { "epoch": 3.405754550792719, "grad_norm": 1.75, "learning_rate": 0.0009318849089841457, "loss": 6.5355, "step": 87000 }, { "epoch": 3.4253278528087687, "grad_norm": 1.0390625, "learning_rate": 0.0009314934429438247, "loss": 6.5417, "step": 87500 }, { "epoch": 3.444901154824819, "grad_norm": 1.234375, "learning_rate": 0.0009311019769035037, "loss": 6.5454, "step": 88000 }, { "epoch": 3.464474456840869, "grad_norm": 4.15625, "learning_rate": 0.0009307105108631826, "loss": 6.5316, "step": 88500 }, { "epoch": 3.484047758856919, "grad_norm": 3.296875, "learning_rate": 0.0009303190448228617, "loss": 6.5429, "step": 89000 }, { "epoch": 3.503621060872969, "grad_norm": 2.65625, "learning_rate": 0.0009299275787825407, "loss": 6.5424, "step": 89500 }, { "epoch": 3.5231943628890194, "grad_norm": 2.8125, "learning_rate": 0.0009295361127422197, "loss": 6.5447, "step": 90000 }, { "epoch": 3.5427676649050692, "grad_norm": 1.2421875, "learning_rate": 0.0009291446467018985, "loss": 6.5418, "step": 90500 }, { "epoch": 3.5623409669211195, "grad_norm": 1.1484375, "learning_rate": 0.0009287531806615776, "loss": 6.5405, "step": 91000 }, { "epoch": 3.5819142689371697, "grad_norm": 1.328125, "learning_rate": 0.0009283617146212566, "loss": 6.5403, "step": 91500 }, { "epoch": 3.60148757095322, "grad_norm": 1.2734375, "learning_rate": 0.0009279702485809356, "loss": 6.5384, "step": 92000 }, { "epoch": 3.62106087296927, "grad_norm": 1.203125, "learning_rate": 0.0009275787825406146, "loss": 6.542, "step": 92500 }, { "epoch": 3.64063417498532, "grad_norm": 3.75, "learning_rate": 0.0009271873165002936, "loss": 6.538, "step": 93000 }, { "epoch": 3.66020747700137, "grad_norm": 2.09375, "learning_rate": 0.0009267958504599726, "loss": 6.5282, "step": 93500 }, { "epoch": 3.6797807790174204, "grad_norm": 1.4609375, "learning_rate": 0.0009264043844196516, "loss": 6.5442, "step": 94000 }, { "epoch": 3.69935408103347, "grad_norm": 3.015625, "learning_rate": 0.0009260129183793306, "loss": 6.5425, "step": 94500 }, { "epoch": 3.7189273830495204, "grad_norm": 0.953125, "learning_rate": 0.0009256214523390096, "loss": 6.5375, "step": 95000 }, { "epoch": 3.7385006850655707, "grad_norm": 1.734375, "learning_rate": 0.0009252299862986886, "loss": 6.5488, "step": 95500 }, { "epoch": 3.7580739870816204, "grad_norm": 1.4453125, "learning_rate": 0.0009248385202583675, "loss": 6.5497, "step": 96000 }, { "epoch": 3.7776472890976707, "grad_norm": 1.3359375, "learning_rate": 0.0009244470542180466, "loss": 6.5467, "step": 96500 }, { "epoch": 3.797220591113721, "grad_norm": 1.7578125, "learning_rate": 0.0009240555881777256, "loss": 6.5442, "step": 97000 }, { "epoch": 3.816793893129771, "grad_norm": 1.40625, "learning_rate": 0.0009236641221374046, "loss": 6.5393, "step": 97500 }, { "epoch": 3.8363671951458214, "grad_norm": 1.34375, "learning_rate": 0.0009232726560970835, "loss": 6.5378, "step": 98000 }, { "epoch": 3.855940497161871, "grad_norm": 1.6484375, "learning_rate": 0.0009228811900567626, "loss": 6.5469, "step": 98500 }, { "epoch": 3.8755137991779214, "grad_norm": 2.203125, "learning_rate": 0.0009224897240164416, "loss": 6.5322, "step": 99000 }, { "epoch": 3.8950871011939716, "grad_norm": 62.75, "learning_rate": 0.0009220982579761206, "loss": 6.5472, "step": 99500 }, { "epoch": 3.9146604032100214, "grad_norm": 8.0, "learning_rate": 0.0009217067919357995, "loss": 6.5364, "step": 100000 }, { "epoch": 3.9342337052260716, "grad_norm": 2.796875, "learning_rate": 0.0009213153258954786, "loss": 6.5356, "step": 100500 }, { "epoch": 3.953807007242122, "grad_norm": 1.1484375, "learning_rate": 0.0009209238598551576, "loss": 6.5267, "step": 101000 }, { "epoch": 3.9733803092581716, "grad_norm": 4.53125, "learning_rate": 0.0009205323938148366, "loss": 6.5332, "step": 101500 }, { "epoch": 3.992953611274222, "grad_norm": 2.21875, "learning_rate": 0.0009201409277745156, "loss": 6.5363, "step": 102000 }, { "epoch": 4.0, "eval_loss": 6.529191493988037, "eval_runtime": 20.9814, "eval_samples_per_second": 95.322, "eval_steps_per_second": 5.958, "step": 102180 }, { "epoch": 4.012526913290272, "grad_norm": 1.3046875, "learning_rate": 0.0009197494617341946, "loss": 6.541, "step": 102500 }, { "epoch": 4.032100215306322, "grad_norm": 1.453125, "learning_rate": 0.0009193579956938736, "loss": 6.5328, "step": 103000 }, { "epoch": 4.051673517322373, "grad_norm": 1.109375, "learning_rate": 0.0009189665296535526, "loss": 6.5279, "step": 103500 }, { "epoch": 4.071246819338422, "grad_norm": 1.7421875, "learning_rate": 0.0009185750636132316, "loss": 6.5302, "step": 104000 }, { "epoch": 4.090820121354472, "grad_norm": 1.03125, "learning_rate": 0.0009181835975729106, "loss": 6.5393, "step": 104500 }, { "epoch": 4.110393423370523, "grad_norm": 1.6015625, "learning_rate": 0.0009177921315325896, "loss": 6.5323, "step": 105000 }, { "epoch": 4.129966725386573, "grad_norm": 0.91015625, "learning_rate": 0.0009174006654922686, "loss": 6.5308, "step": 105500 }, { "epoch": 4.149540027402622, "grad_norm": 8.6875, "learning_rate": 0.0009170091994519476, "loss": 6.5302, "step": 106000 }, { "epoch": 4.169113329418673, "grad_norm": 1.609375, "learning_rate": 0.0009166177334116266, "loss": 6.5328, "step": 106500 }, { "epoch": 4.188686631434723, "grad_norm": 5.90625, "learning_rate": 0.0009162262673713056, "loss": 6.5346, "step": 107000 }, { "epoch": 4.2082599334507735, "grad_norm": 5.1875, "learning_rate": 0.0009158348013309846, "loss": 6.5306, "step": 107500 }, { "epoch": 4.227833235466823, "grad_norm": 1.5703125, "learning_rate": 0.0009154433352906636, "loss": 6.5278, "step": 108000 }, { "epoch": 4.247406537482873, "grad_norm": 3.09375, "learning_rate": 0.0009150518692503426, "loss": 6.5298, "step": 108500 }, { "epoch": 4.266979839498924, "grad_norm": 7.59375, "learning_rate": 0.0009146604032100216, "loss": 6.5243, "step": 109000 }, { "epoch": 4.286553141514974, "grad_norm": 1.6484375, "learning_rate": 0.0009142689371697005, "loss": 6.5258, "step": 109500 }, { "epoch": 4.306126443531023, "grad_norm": 3.390625, "learning_rate": 0.0009138774711293795, "loss": 6.5237, "step": 110000 }, { "epoch": 4.325699745547074, "grad_norm": 3.171875, "learning_rate": 0.0009134860050890585, "loss": 6.5307, "step": 110500 }, { "epoch": 4.345273047563124, "grad_norm": 2.3125, "learning_rate": 0.0009130945390487375, "loss": 6.5236, "step": 111000 }, { "epoch": 4.364846349579174, "grad_norm": 5.625, "learning_rate": 0.0009127030730084165, "loss": 6.5264, "step": 111500 }, { "epoch": 4.384419651595224, "grad_norm": 1.9375, "learning_rate": 0.0009123116069680955, "loss": 6.5254, "step": 112000 }, { "epoch": 4.403992953611274, "grad_norm": 2.65625, "learning_rate": 0.0009119201409277745, "loss": 6.5323, "step": 112500 }, { "epoch": 4.423566255627325, "grad_norm": 1.703125, "learning_rate": 0.0009115286748874535, "loss": 6.5362, "step": 113000 }, { "epoch": 4.4431395576433745, "grad_norm": 3.21875, "learning_rate": 0.0009111372088471325, "loss": 6.5292, "step": 113500 }, { "epoch": 4.462712859659424, "grad_norm": 1.1640625, "learning_rate": 0.0009107457428068115, "loss": 6.5248, "step": 114000 }, { "epoch": 4.482286161675475, "grad_norm": 1.4453125, "learning_rate": 0.0009103542767664905, "loss": 6.5216, "step": 114500 }, { "epoch": 4.501859463691525, "grad_norm": 1.21875, "learning_rate": 0.0009099628107261695, "loss": 6.5157, "step": 115000 }, { "epoch": 4.521432765707575, "grad_norm": 3.609375, "learning_rate": 0.0009095713446858485, "loss": 6.5172, "step": 115500 }, { "epoch": 4.541006067723625, "grad_norm": 1.0546875, "learning_rate": 0.0009091798786455275, "loss": 6.524, "step": 116000 }, { "epoch": 4.560579369739675, "grad_norm": 1.421875, "learning_rate": 0.0009087884126052065, "loss": 6.5227, "step": 116500 }, { "epoch": 4.580152671755725, "grad_norm": 1.8359375, "learning_rate": 0.0009083969465648855, "loss": 6.5148, "step": 117000 }, { "epoch": 4.5997259737717755, "grad_norm": 3.546875, "learning_rate": 0.0009080054805245645, "loss": 6.5208, "step": 117500 }, { "epoch": 4.619299275787825, "grad_norm": 5.90625, "learning_rate": 0.0009076140144842435, "loss": 6.5183, "step": 118000 }, { "epoch": 4.638872577803875, "grad_norm": 4.0625, "learning_rate": 0.0009072225484439225, "loss": 6.5221, "step": 118500 }, { "epoch": 4.658445879819926, "grad_norm": 3.5625, "learning_rate": 0.0009068310824036015, "loss": 6.5201, "step": 119000 }, { "epoch": 4.6780191818359755, "grad_norm": 1.1640625, "learning_rate": 0.0009064396163632805, "loss": 6.5176, "step": 119500 }, { "epoch": 4.697592483852026, "grad_norm": 1.28125, "learning_rate": 0.0009060481503229595, "loss": 6.5137, "step": 120000 }, { "epoch": 4.717165785868076, "grad_norm": 1.8046875, "learning_rate": 0.0009056566842826385, "loss": 6.5178, "step": 120500 }, { "epoch": 4.736739087884126, "grad_norm": 1.171875, "learning_rate": 0.0009052652182423175, "loss": 6.5129, "step": 121000 }, { "epoch": 4.7563123899001765, "grad_norm": 11.375, "learning_rate": 0.0009048737522019965, "loss": 6.5149, "step": 121500 }, { "epoch": 4.775885691916226, "grad_norm": 2.375, "learning_rate": 0.0009044822861616755, "loss": 6.5124, "step": 122000 }, { "epoch": 4.795458993932276, "grad_norm": 6.375, "learning_rate": 0.0009040908201213545, "loss": 6.5112, "step": 122500 }, { "epoch": 4.815032295948327, "grad_norm": 0.87109375, "learning_rate": 0.0009036993540810336, "loss": 6.5083, "step": 123000 }, { "epoch": 4.8346055979643765, "grad_norm": 8.0, "learning_rate": 0.0009033078880407125, "loss": 6.5072, "step": 123500 }, { "epoch": 4.854178899980427, "grad_norm": 1.203125, "learning_rate": 0.0009029164220003915, "loss": 6.5118, "step": 124000 }, { "epoch": 4.873752201996477, "grad_norm": 1.15625, "learning_rate": 0.0009025249559600705, "loss": 6.5117, "step": 124500 }, { "epoch": 4.893325504012527, "grad_norm": 1.9296875, "learning_rate": 0.0009021334899197496, "loss": 6.5099, "step": 125000 }, { "epoch": 4.912898806028577, "grad_norm": 2.453125, "learning_rate": 0.0009017420238794285, "loss": 6.5095, "step": 125500 }, { "epoch": 4.932472108044627, "grad_norm": 1.3359375, "learning_rate": 0.0009013505578391075, "loss": 6.5088, "step": 126000 }, { "epoch": 4.952045410060677, "grad_norm": 1.28125, "learning_rate": 0.0009009590917987865, "loss": 6.5085, "step": 126500 }, { "epoch": 4.971618712076728, "grad_norm": 3.75, "learning_rate": 0.0009005676257584656, "loss": 6.5163, "step": 127000 }, { "epoch": 4.9911920140927775, "grad_norm": 3.015625, "learning_rate": 0.0009001761597181445, "loss": 6.513, "step": 127500 }, { "epoch": 5.0, "eval_loss": 6.507379055023193, "eval_runtime": 22.1406, "eval_samples_per_second": 90.332, "eval_steps_per_second": 5.646, "step": 127725 }, { "epoch": 5.010765316108827, "grad_norm": 1.6171875, "learning_rate": 0.0008997846936778235, "loss": 6.5096, "step": 128000 }, { "epoch": 5.030338618124878, "grad_norm": 2.203125, "learning_rate": 0.0008993932276375024, "loss": 6.5089, "step": 128500 }, { "epoch": 5.049911920140928, "grad_norm": 2.90625, "learning_rate": 0.0008990017615971816, "loss": 6.5087, "step": 129000 }, { "epoch": 5.0694852221569775, "grad_norm": 2.265625, "learning_rate": 0.0008986102955568604, "loss": 6.5135, "step": 129500 }, { "epoch": 5.089058524173028, "grad_norm": 1.8828125, "learning_rate": 0.0008982188295165394, "loss": 6.5038, "step": 130000 }, { "epoch": 5.108631826189078, "grad_norm": 2.359375, "learning_rate": 0.0008978273634762184, "loss": 6.5035, "step": 130500 }, { "epoch": 5.128205128205128, "grad_norm": 2.15625, "learning_rate": 0.0008974358974358974, "loss": 6.5167, "step": 131000 }, { "epoch": 5.147778430221178, "grad_norm": 2.140625, "learning_rate": 0.0008970444313955764, "loss": 6.5109, "step": 131500 }, { "epoch": 5.167351732237228, "grad_norm": 2.015625, "learning_rate": 0.0008966529653552554, "loss": 6.5009, "step": 132000 }, { "epoch": 5.186925034253279, "grad_norm": 0.75, "learning_rate": 0.0008962614993149345, "loss": 6.5078, "step": 132500 }, { "epoch": 5.206498336269329, "grad_norm": 5.78125, "learning_rate": 0.0008958700332746134, "loss": 6.5062, "step": 133000 }, { "epoch": 5.2260716382853785, "grad_norm": 3.390625, "learning_rate": 0.0008954785672342924, "loss": 6.5045, "step": 133500 }, { "epoch": 5.245644940301429, "grad_norm": 4.09375, "learning_rate": 0.0008950871011939714, "loss": 6.4964, "step": 134000 }, { "epoch": 5.265218242317479, "grad_norm": 1.84375, "learning_rate": 0.0008946956351536505, "loss": 6.5082, "step": 134500 }, { "epoch": 5.284791544333529, "grad_norm": 1.953125, "learning_rate": 0.0008943041691133294, "loss": 6.5047, "step": 135000 }, { "epoch": 5.304364846349579, "grad_norm": 1.09375, "learning_rate": 0.0008939127030730084, "loss": 6.5025, "step": 135500 }, { "epoch": 5.323938148365629, "grad_norm": 2.046875, "learning_rate": 0.0008935212370326874, "loss": 6.4966, "step": 136000 }, { "epoch": 5.34351145038168, "grad_norm": 1.078125, "learning_rate": 0.0008931297709923665, "loss": 6.503, "step": 136500 }, { "epoch": 5.36308475239773, "grad_norm": 5.0625, "learning_rate": 0.0008927383049520454, "loss": 6.5022, "step": 137000 }, { "epoch": 5.382658054413779, "grad_norm": 1.140625, "learning_rate": 0.0008923468389117244, "loss": 6.5025, "step": 137500 }, { "epoch": 5.40223135642983, "grad_norm": 2.4375, "learning_rate": 0.0008919553728714034, "loss": 6.5053, "step": 138000 }, { "epoch": 5.42180465844588, "grad_norm": 1.78125, "learning_rate": 0.0008915639068310825, "loss": 6.5127, "step": 138500 }, { "epoch": 5.44137796046193, "grad_norm": 14.375, "learning_rate": 0.0008911724407907614, "loss": 6.5019, "step": 139000 }, { "epoch": 5.46095126247798, "grad_norm": 4.59375, "learning_rate": 0.0008907809747504404, "loss": 6.5024, "step": 139500 }, { "epoch": 5.48052456449403, "grad_norm": 2.28125, "learning_rate": 0.0008903895087101194, "loss": 6.5059, "step": 140000 }, { "epoch": 5.50009786651008, "grad_norm": 3.84375, "learning_rate": 0.0008899980426697985, "loss": 6.5027, "step": 140500 }, { "epoch": 5.519671168526131, "grad_norm": 6.1875, "learning_rate": 0.0008896065766294774, "loss": 6.4974, "step": 141000 }, { "epoch": 5.53924447054218, "grad_norm": 1.921875, "learning_rate": 0.0008892151105891564, "loss": 6.4957, "step": 141500 }, { "epoch": 5.55881777255823, "grad_norm": 1.78125, "learning_rate": 0.0008888236445488354, "loss": 6.5043, "step": 142000 }, { "epoch": 5.578391074574281, "grad_norm": 1.796875, "learning_rate": 0.0008884321785085145, "loss": 6.4968, "step": 142500 }, { "epoch": 5.597964376590331, "grad_norm": 2.4375, "learning_rate": 0.0008880407124681934, "loss": 6.5016, "step": 143000 }, { "epoch": 5.61753767860638, "grad_norm": 1.078125, "learning_rate": 0.0008876492464278724, "loss": 6.5012, "step": 143500 }, { "epoch": 5.637110980622431, "grad_norm": 3.921875, "learning_rate": 0.0008872577803875515, "loss": 6.5061, "step": 144000 }, { "epoch": 5.656684282638481, "grad_norm": 1.015625, "learning_rate": 0.0008868663143472305, "loss": 6.5026, "step": 144500 }, { "epoch": 5.676257584654532, "grad_norm": 1.484375, "learning_rate": 0.0008864748483069094, "loss": 6.4981, "step": 145000 }, { "epoch": 5.695830886670581, "grad_norm": 1.8359375, "learning_rate": 0.0008860833822665884, "loss": 6.5063, "step": 145500 }, { "epoch": 5.715404188686631, "grad_norm": 1.59375, "learning_rate": 0.0008856919162262675, "loss": 6.5071, "step": 146000 }, { "epoch": 5.734977490702682, "grad_norm": 3.703125, "learning_rate": 0.0008853004501859464, "loss": 6.5037, "step": 146500 }, { "epoch": 5.754550792718732, "grad_norm": 4.1875, "learning_rate": 0.0008849089841456254, "loss": 6.504, "step": 147000 }, { "epoch": 5.774124094734781, "grad_norm": 0.98828125, "learning_rate": 0.0008845175181053043, "loss": 6.4992, "step": 147500 }, { "epoch": 5.793697396750832, "grad_norm": 25.25, "learning_rate": 0.0008841260520649835, "loss": 6.5052, "step": 148000 }, { "epoch": 5.813270698766882, "grad_norm": 1.9765625, "learning_rate": 0.0008837345860246623, "loss": 6.4979, "step": 148500 }, { "epoch": 5.8328440007829325, "grad_norm": 2.03125, "learning_rate": 0.0008833431199843413, "loss": 6.5023, "step": 149000 }, { "epoch": 5.852417302798982, "grad_norm": 1.2890625, "learning_rate": 0.0008829516539440203, "loss": 6.4971, "step": 149500 }, { "epoch": 5.871990604815032, "grad_norm": 1.3125, "learning_rate": 0.0008825601879036994, "loss": 6.4968, "step": 150000 }, { "epoch": 5.891563906831083, "grad_norm": 1.7734375, "learning_rate": 0.0008821687218633783, "loss": 6.5053, "step": 150500 }, { "epoch": 5.911137208847133, "grad_norm": 2.40625, "learning_rate": 0.0008817772558230573, "loss": 6.4988, "step": 151000 }, { "epoch": 5.930710510863182, "grad_norm": 4.25, "learning_rate": 0.0008813857897827363, "loss": 6.5071, "step": 151500 }, { "epoch": 5.950283812879233, "grad_norm": 3.125, "learning_rate": 0.0008809943237424154, "loss": 6.5014, "step": 152000 }, { "epoch": 5.969857114895283, "grad_norm": 1.515625, "learning_rate": 0.0008806028577020943, "loss": 6.5002, "step": 152500 }, { "epoch": 5.989430416911333, "grad_norm": 2.671875, "learning_rate": 0.0008802113916617733, "loss": 6.4993, "step": 153000 }, { "epoch": 6.0, "eval_loss": 6.495845794677734, "eval_runtime": 21.9172, "eval_samples_per_second": 91.253, "eval_steps_per_second": 5.703, "step": 153270 }, { "epoch": 6.009003718927383, "grad_norm": 8.25, "learning_rate": 0.0008798199256214524, "loss": 6.4952, "step": 153500 }, { "epoch": 6.028577020943433, "grad_norm": 2.40625, "learning_rate": 0.0008794284595811314, "loss": 6.4967, "step": 154000 }, { "epoch": 6.048150322959483, "grad_norm": 2.25, "learning_rate": 0.0008790369935408103, "loss": 6.5073, "step": 154500 }, { "epoch": 6.0677236249755335, "grad_norm": 1.3515625, "learning_rate": 0.0008786455275004893, "loss": 6.4969, "step": 155000 }, { "epoch": 6.087296926991583, "grad_norm": 0.9921875, "learning_rate": 0.0008782540614601684, "loss": 6.487, "step": 155500 }, { "epoch": 6.106870229007634, "grad_norm": 1.875, "learning_rate": 0.0008778625954198474, "loss": 6.49, "step": 156000 }, { "epoch": 6.126443531023684, "grad_norm": 5.25, "learning_rate": 0.0008774711293795263, "loss": 6.4948, "step": 156500 }, { "epoch": 6.146016833039734, "grad_norm": 14.25, "learning_rate": 0.0008770796633392053, "loss": 6.4921, "step": 157000 }, { "epoch": 6.165590135055784, "grad_norm": 1.1171875, "learning_rate": 0.0008766881972988844, "loss": 6.4909, "step": 157500 }, { "epoch": 6.185163437071834, "grad_norm": 3.375, "learning_rate": 0.0008762967312585634, "loss": 6.4917, "step": 158000 }, { "epoch": 6.204736739087884, "grad_norm": 10.6875, "learning_rate": 0.0008759052652182423, "loss": 6.494, "step": 158500 }, { "epoch": 6.2243100411039345, "grad_norm": 5.0, "learning_rate": 0.0008755137991779213, "loss": 6.4909, "step": 159000 }, { "epoch": 6.243883343119984, "grad_norm": 1.7421875, "learning_rate": 0.0008751223331376004, "loss": 6.498, "step": 159500 }, { "epoch": 6.263456645136034, "grad_norm": 1.7734375, "learning_rate": 0.0008747308670972794, "loss": 6.4899, "step": 160000 }, { "epoch": 6.283029947152085, "grad_norm": 1.5390625, "learning_rate": 0.0008743394010569583, "loss": 6.4991, "step": 160500 }, { "epoch": 6.3026032491681345, "grad_norm": 5.71875, "learning_rate": 0.0008739479350166373, "loss": 6.4939, "step": 161000 }, { "epoch": 6.322176551184185, "grad_norm": 3.6875, "learning_rate": 0.0008735564689763164, "loss": 6.4888, "step": 161500 }, { "epoch": 6.341749853200235, "grad_norm": 3.359375, "learning_rate": 0.0008731650029359953, "loss": 6.4918, "step": 162000 }, { "epoch": 6.361323155216285, "grad_norm": 1.125, "learning_rate": 0.0008727735368956743, "loss": 6.4992, "step": 162500 }, { "epoch": 6.3808964572323355, "grad_norm": 2.625, "learning_rate": 0.0008723820708553534, "loss": 6.4958, "step": 163000 }, { "epoch": 6.400469759248385, "grad_norm": 1.2421875, "learning_rate": 0.0008719906048150324, "loss": 6.4894, "step": 163500 }, { "epoch": 6.420043061264435, "grad_norm": 2.03125, "learning_rate": 0.0008715991387747113, "loss": 6.4953, "step": 164000 }, { "epoch": 6.439616363280486, "grad_norm": 2.03125, "learning_rate": 0.0008712076727343903, "loss": 6.4929, "step": 164500 }, { "epoch": 6.4591896652965355, "grad_norm": 1.1875, "learning_rate": 0.0008708162066940694, "loss": 6.4848, "step": 165000 }, { "epoch": 6.478762967312585, "grad_norm": 1.1015625, "learning_rate": 0.0008704247406537484, "loss": 6.4871, "step": 165500 }, { "epoch": 6.498336269328636, "grad_norm": 3.75, "learning_rate": 0.0008700332746134272, "loss": 6.4914, "step": 166000 }, { "epoch": 6.517909571344686, "grad_norm": 2.1875, "learning_rate": 0.0008696418085731062, "loss": 6.4888, "step": 166500 }, { "epoch": 6.5374828733607355, "grad_norm": 2.84375, "learning_rate": 0.0008692503425327854, "loss": 6.4932, "step": 167000 }, { "epoch": 6.557056175376786, "grad_norm": 2.828125, "learning_rate": 0.0008688588764924644, "loss": 6.4992, "step": 167500 }, { "epoch": 6.576629477392836, "grad_norm": 4.9375, "learning_rate": 0.0008684674104521432, "loss": 6.4881, "step": 168000 }, { "epoch": 6.596202779408887, "grad_norm": 2.796875, "learning_rate": 0.0008680759444118222, "loss": 6.4944, "step": 168500 }, { "epoch": 6.6157760814249365, "grad_norm": 19.0, "learning_rate": 0.0008676844783715013, "loss": 6.4874, "step": 169000 }, { "epoch": 6.635349383440986, "grad_norm": 8.25, "learning_rate": 0.0008672930123311803, "loss": 6.4871, "step": 169500 }, { "epoch": 6.654922685457037, "grad_norm": 1.484375, "learning_rate": 0.0008669015462908592, "loss": 6.4978, "step": 170000 }, { "epoch": 6.674495987473087, "grad_norm": 3.140625, "learning_rate": 0.0008665100802505382, "loss": 6.4946, "step": 170500 }, { "epoch": 6.6940692894891365, "grad_norm": 8.0625, "learning_rate": 0.0008661186142102173, "loss": 6.4868, "step": 171000 }, { "epoch": 6.713642591505187, "grad_norm": 3.265625, "learning_rate": 0.0008657271481698963, "loss": 6.4876, "step": 171500 }, { "epoch": 6.733215893521237, "grad_norm": 1.6015625, "learning_rate": 0.0008653356821295752, "loss": 6.4887, "step": 172000 }, { "epoch": 6.752789195537288, "grad_norm": 3.828125, "learning_rate": 0.0008649442160892542, "loss": 6.4897, "step": 172500 }, { "epoch": 6.772362497553337, "grad_norm": 0.8515625, "learning_rate": 0.0008645527500489333, "loss": 6.4857, "step": 173000 }, { "epoch": 6.791935799569387, "grad_norm": 1.2734375, "learning_rate": 0.0008641612840086123, "loss": 6.4867, "step": 173500 }, { "epoch": 6.811509101585438, "grad_norm": 1.578125, "learning_rate": 0.0008637698179682912, "loss": 6.4883, "step": 174000 }, { "epoch": 6.831082403601488, "grad_norm": 1.875, "learning_rate": 0.0008633783519279703, "loss": 6.4783, "step": 174500 }, { "epoch": 6.8506557056175374, "grad_norm": 0.84765625, "learning_rate": 0.0008629868858876493, "loss": 6.4862, "step": 175000 }, { "epoch": 6.870229007633588, "grad_norm": 20.875, "learning_rate": 0.0008625954198473283, "loss": 6.486, "step": 175500 }, { "epoch": 6.889802309649638, "grad_norm": 1.75, "learning_rate": 0.0008622039538070072, "loss": 6.4941, "step": 176000 }, { "epoch": 6.909375611665688, "grad_norm": 1.46875, "learning_rate": 0.0008618124877666863, "loss": 6.4904, "step": 176500 }, { "epoch": 6.928948913681738, "grad_norm": 2.171875, "learning_rate": 0.0008614210217263653, "loss": 6.4864, "step": 177000 }, { "epoch": 6.948522215697788, "grad_norm": 1.6875, "learning_rate": 0.0008610295556860442, "loss": 6.4876, "step": 177500 }, { "epoch": 6.968095517713838, "grad_norm": 0.94921875, "learning_rate": 0.0008606380896457232, "loss": 6.4811, "step": 178000 }, { "epoch": 6.987668819729889, "grad_norm": 1.46875, "learning_rate": 0.0008602466236054023, "loss": 6.4881, "step": 178500 }, { "epoch": 7.0, "eval_loss": 6.482935905456543, "eval_runtime": 22.9737, "eval_samples_per_second": 87.056, "eval_steps_per_second": 5.441, "step": 178815 }, { "epoch": 7.007242121745938, "grad_norm": 1.3515625, "learning_rate": 0.0008598551575650813, "loss": 6.4776, "step": 179000 }, { "epoch": 7.026815423761989, "grad_norm": 1.5078125, "learning_rate": 0.0008594636915247602, "loss": 6.4777, "step": 179500 }, { "epoch": 7.046388725778039, "grad_norm": 2.78125, "learning_rate": 0.0008590722254844392, "loss": 6.4789, "step": 180000 }, { "epoch": 7.065962027794089, "grad_norm": 1.5625, "learning_rate": 0.0008586807594441183, "loss": 6.4884, "step": 180500 }, { "epoch": 7.085535329810139, "grad_norm": 1.671875, "learning_rate": 0.0008582892934037973, "loss": 6.4847, "step": 181000 }, { "epoch": 7.105108631826189, "grad_norm": 1.9140625, "learning_rate": 0.0008578978273634762, "loss": 6.4811, "step": 181500 }, { "epoch": 7.124681933842239, "grad_norm": 1.375, "learning_rate": 0.0008575063613231552, "loss": 6.4872, "step": 182000 }, { "epoch": 7.14425523585829, "grad_norm": 1.375, "learning_rate": 0.0008571148952828343, "loss": 6.486, "step": 182500 }, { "epoch": 7.163828537874339, "grad_norm": 1.921875, "learning_rate": 0.0008567234292425133, "loss": 6.4848, "step": 183000 }, { "epoch": 7.183401839890389, "grad_norm": 2.125, "learning_rate": 0.0008563319632021922, "loss": 6.4808, "step": 183500 }, { "epoch": 7.20297514190644, "grad_norm": 1.0859375, "learning_rate": 0.0008559404971618713, "loss": 6.4858, "step": 184000 }, { "epoch": 7.22254844392249, "grad_norm": 1.7578125, "learning_rate": 0.0008555490311215503, "loss": 6.4875, "step": 184500 }, { "epoch": 7.242121745938539, "grad_norm": 10.375, "learning_rate": 0.0008551575650812293, "loss": 6.483, "step": 185000 }, { "epoch": 7.26169504795459, "grad_norm": 2.0, "learning_rate": 0.0008547660990409081, "loss": 6.4803, "step": 185500 }, { "epoch": 7.28126834997064, "grad_norm": 2.734375, "learning_rate": 0.0008543746330005873, "loss": 6.4823, "step": 186000 }, { "epoch": 7.3008416519866906, "grad_norm": 1.5390625, "learning_rate": 0.0008539831669602662, "loss": 6.4725, "step": 186500 }, { "epoch": 7.32041495400274, "grad_norm": 1.53125, "learning_rate": 0.0008535917009199452, "loss": 6.4783, "step": 187000 }, { "epoch": 7.33998825601879, "grad_norm": 2.25, "learning_rate": 0.0008532002348796241, "loss": 6.4714, "step": 187500 }, { "epoch": 7.359561558034841, "grad_norm": 1.3671875, "learning_rate": 0.0008528087688393032, "loss": 6.4807, "step": 188000 }, { "epoch": 7.379134860050891, "grad_norm": 1.4140625, "learning_rate": 0.0008524173027989822, "loss": 6.471, "step": 188500 }, { "epoch": 7.39870816206694, "grad_norm": 1.234375, "learning_rate": 0.0008520258367586612, "loss": 6.4799, "step": 189000 }, { "epoch": 7.418281464082991, "grad_norm": 1.53125, "learning_rate": 0.0008516343707183401, "loss": 6.48, "step": 189500 }, { "epoch": 7.437854766099041, "grad_norm": 1.1328125, "learning_rate": 0.0008512429046780192, "loss": 6.4835, "step": 190000 }, { "epoch": 7.457428068115091, "grad_norm": 3.90625, "learning_rate": 0.0008508514386376982, "loss": 6.4747, "step": 190500 }, { "epoch": 7.477001370131141, "grad_norm": 1.609375, "learning_rate": 0.0008504599725973772, "loss": 6.4759, "step": 191000 }, { "epoch": 7.496574672147191, "grad_norm": 1.6171875, "learning_rate": 0.0008500685065570561, "loss": 6.4765, "step": 191500 }, { "epoch": 7.516147974163241, "grad_norm": 1.0234375, "learning_rate": 0.0008496770405167352, "loss": 6.4753, "step": 192000 }, { "epoch": 7.5357212761792916, "grad_norm": 1.046875, "learning_rate": 0.0008492855744764142, "loss": 6.4782, "step": 192500 }, { "epoch": 7.555294578195341, "grad_norm": 3.140625, "learning_rate": 0.0008488941084360931, "loss": 6.4725, "step": 193000 }, { "epoch": 7.574867880211392, "grad_norm": 1.703125, "learning_rate": 0.0008485026423957722, "loss": 6.4825, "step": 193500 }, { "epoch": 7.594441182227442, "grad_norm": 2.265625, "learning_rate": 0.0008481111763554512, "loss": 6.4758, "step": 194000 }, { "epoch": 7.614014484243492, "grad_norm": 5.9375, "learning_rate": 0.0008477197103151302, "loss": 6.4732, "step": 194500 }, { "epoch": 7.633587786259542, "grad_norm": 1.125, "learning_rate": 0.0008473282442748091, "loss": 6.4812, "step": 195000 }, { "epoch": 7.653161088275592, "grad_norm": 1.8125, "learning_rate": 0.0008469367782344882, "loss": 6.4711, "step": 195500 }, { "epoch": 7.672734390291642, "grad_norm": 3.421875, "learning_rate": 0.0008465453121941672, "loss": 6.4718, "step": 196000 }, { "epoch": 7.6923076923076925, "grad_norm": 9.625, "learning_rate": 0.0008461538461538462, "loss": 6.4793, "step": 196500 }, { "epoch": 7.711880994323742, "grad_norm": 2.390625, "learning_rate": 0.0008457623801135251, "loss": 6.4758, "step": 197000 }, { "epoch": 7.731454296339793, "grad_norm": 2.125, "learning_rate": 0.0008453709140732042, "loss": 6.4706, "step": 197500 }, { "epoch": 7.751027598355843, "grad_norm": 1.296875, "learning_rate": 0.0008449794480328832, "loss": 6.4838, "step": 198000 }, { "epoch": 7.7706009003718925, "grad_norm": 2.53125, "learning_rate": 0.0008445879819925622, "loss": 6.475, "step": 198500 }, { "epoch": 7.790174202387943, "grad_norm": 2.75, "learning_rate": 0.0008441965159522411, "loss": 6.4766, "step": 199000 }, { "epoch": 7.809747504403993, "grad_norm": 1.84375, "learning_rate": 0.0008438050499119202, "loss": 6.4833, "step": 199500 }, { "epoch": 7.829320806420043, "grad_norm": 1.40625, "learning_rate": 0.0008434135838715992, "loss": 6.4786, "step": 200000 }, { "epoch": 7.8488941084360935, "grad_norm": 1.5546875, "learning_rate": 0.0008430221178312782, "loss": 6.4747, "step": 200500 }, { "epoch": 7.868467410452143, "grad_norm": 2.59375, "learning_rate": 0.0008426306517909571, "loss": 6.4839, "step": 201000 }, { "epoch": 7.888040712468193, "grad_norm": 0.92578125, "learning_rate": 0.0008422391857506362, "loss": 6.477, "step": 201500 }, { "epoch": 7.907614014484244, "grad_norm": 2.40625, "learning_rate": 0.0008418477197103152, "loss": 6.4823, "step": 202000 }, { "epoch": 7.9271873165002935, "grad_norm": 4.9375, "learning_rate": 0.0008414562536699942, "loss": 6.4666, "step": 202500 }, { "epoch": 7.946760618516343, "grad_norm": 1.2421875, "learning_rate": 0.000841064787629673, "loss": 6.4716, "step": 203000 }, { "epoch": 7.966333920532394, "grad_norm": 1.6875, "learning_rate": 0.0008406733215893522, "loss": 6.4678, "step": 203500 }, { "epoch": 7.985907222548444, "grad_norm": 1.109375, "learning_rate": 0.0008402818555490312, "loss": 6.471, "step": 204000 }, { "epoch": 8.0, "eval_loss": 6.472127914428711, "eval_runtime": 24.0419, "eval_samples_per_second": 83.188, "eval_steps_per_second": 5.199, "step": 204360 }, { "epoch": 8.005480524564494, "grad_norm": 3.421875, "learning_rate": 0.0008398903895087102, "loss": 6.4769, "step": 204500 }, { "epoch": 8.025053826580544, "grad_norm": 1.3046875, "learning_rate": 0.0008394989234683892, "loss": 6.4651, "step": 205000 }, { "epoch": 8.044627128596595, "grad_norm": 2.359375, "learning_rate": 0.0008391074574280681, "loss": 6.4746, "step": 205500 }, { "epoch": 8.064200430612644, "grad_norm": 1.40625, "learning_rate": 0.0008387159913877471, "loss": 6.4704, "step": 206000 }, { "epoch": 8.083773732628694, "grad_norm": 1.21875, "learning_rate": 0.0008383245253474261, "loss": 6.4648, "step": 206500 }, { "epoch": 8.103347034644745, "grad_norm": 1.21875, "learning_rate": 0.0008379330593071051, "loss": 6.4735, "step": 207000 }, { "epoch": 8.122920336660794, "grad_norm": 17.0, "learning_rate": 0.0008375415932667841, "loss": 6.474, "step": 207500 }, { "epoch": 8.142493638676845, "grad_norm": 6.5, "learning_rate": 0.0008371501272264631, "loss": 6.4643, "step": 208000 }, { "epoch": 8.162066940692895, "grad_norm": 3.0, "learning_rate": 0.000836758661186142, "loss": 6.4722, "step": 208500 }, { "epoch": 8.181640242708944, "grad_norm": 1.3359375, "learning_rate": 0.0008363671951458211, "loss": 6.4671, "step": 209000 }, { "epoch": 8.201213544724995, "grad_norm": 5.0, "learning_rate": 0.0008359757291055001, "loss": 6.4788, "step": 209500 }, { "epoch": 8.220786846741046, "grad_norm": 3.609375, "learning_rate": 0.0008355842630651791, "loss": 6.4751, "step": 210000 }, { "epoch": 8.240360148757095, "grad_norm": 0.93359375, "learning_rate": 0.000835192797024858, "loss": 6.4759, "step": 210500 }, { "epoch": 8.259933450773145, "grad_norm": 12.6875, "learning_rate": 0.0008348013309845371, "loss": 6.4707, "step": 211000 }, { "epoch": 8.279506752789196, "grad_norm": 1.828125, "learning_rate": 0.0008344098649442161, "loss": 6.4734, "step": 211500 }, { "epoch": 8.299080054805245, "grad_norm": 1.625, "learning_rate": 0.0008340183989038951, "loss": 6.4632, "step": 212000 }, { "epoch": 8.318653356821295, "grad_norm": 1.3125, "learning_rate": 0.000833626932863574, "loss": 6.4708, "step": 212500 }, { "epoch": 8.338226658837346, "grad_norm": 1.5390625, "learning_rate": 0.0008332354668232531, "loss": 6.4614, "step": 213000 }, { "epoch": 8.357799960853397, "grad_norm": 1.234375, "learning_rate": 0.0008328440007829321, "loss": 6.4625, "step": 213500 }, { "epoch": 8.377373262869446, "grad_norm": 2.5, "learning_rate": 0.0008324525347426111, "loss": 6.4605, "step": 214000 }, { "epoch": 8.396946564885496, "grad_norm": 8.0, "learning_rate": 0.0008320610687022901, "loss": 6.4681, "step": 214500 }, { "epoch": 8.416519866901547, "grad_norm": 1.6875, "learning_rate": 0.0008316696026619691, "loss": 6.46, "step": 215000 }, { "epoch": 8.436093168917596, "grad_norm": 1.5, "learning_rate": 0.0008312781366216481, "loss": 6.4718, "step": 215500 }, { "epoch": 8.455666470933647, "grad_norm": 1.6015625, "learning_rate": 0.0008308866705813271, "loss": 6.4647, "step": 216000 }, { "epoch": 8.475239772949697, "grad_norm": 1.0859375, "learning_rate": 0.0008304952045410061, "loss": 6.4723, "step": 216500 }, { "epoch": 8.494813074965746, "grad_norm": 1.234375, "learning_rate": 0.0008301037385006851, "loss": 6.4798, "step": 217000 }, { "epoch": 8.514386376981797, "grad_norm": 2.09375, "learning_rate": 0.0008297122724603641, "loss": 6.468, "step": 217500 }, { "epoch": 8.533959678997848, "grad_norm": 1.234375, "learning_rate": 0.0008293208064200431, "loss": 6.4728, "step": 218000 }, { "epoch": 8.553532981013896, "grad_norm": 0.98828125, "learning_rate": 0.0008289293403797221, "loss": 6.4703, "step": 218500 }, { "epoch": 8.573106283029947, "grad_norm": 1.3828125, "learning_rate": 0.0008285378743394011, "loss": 6.4645, "step": 219000 }, { "epoch": 8.592679585045998, "grad_norm": 0.9140625, "learning_rate": 0.0008281464082990801, "loss": 6.4694, "step": 219500 }, { "epoch": 8.612252887062047, "grad_norm": 1.1328125, "learning_rate": 0.0008277549422587591, "loss": 6.4632, "step": 220000 }, { "epoch": 8.631826189078097, "grad_norm": 3.328125, "learning_rate": 0.0008273634762184381, "loss": 6.4733, "step": 220500 }, { "epoch": 8.651399491094148, "grad_norm": 3.25, "learning_rate": 0.0008269720101781171, "loss": 6.4682, "step": 221000 }, { "epoch": 8.670972793110197, "grad_norm": 1.0859375, "learning_rate": 0.0008265805441377961, "loss": 6.4654, "step": 221500 }, { "epoch": 8.690546095126248, "grad_norm": 2.890625, "learning_rate": 0.0008261890780974751, "loss": 6.4569, "step": 222000 }, { "epoch": 8.710119397142298, "grad_norm": 0.9921875, "learning_rate": 0.0008257976120571541, "loss": 6.4643, "step": 222500 }, { "epoch": 8.729692699158347, "grad_norm": 0.890625, "learning_rate": 0.0008254061460168331, "loss": 6.4544, "step": 223000 }, { "epoch": 8.749266001174398, "grad_norm": 1.234375, "learning_rate": 0.000825014679976512, "loss": 6.4667, "step": 223500 }, { "epoch": 8.768839303190449, "grad_norm": 0.98046875, "learning_rate": 0.0008246232139361912, "loss": 6.458, "step": 224000 }, { "epoch": 8.7884126052065, "grad_norm": 1.6796875, "learning_rate": 0.00082423174789587, "loss": 6.4643, "step": 224500 }, { "epoch": 8.807985907222548, "grad_norm": 0.94140625, "learning_rate": 0.000823840281855549, "loss": 6.4635, "step": 225000 }, { "epoch": 8.827559209238599, "grad_norm": 1.5703125, "learning_rate": 0.000823448815815228, "loss": 6.458, "step": 225500 }, { "epoch": 8.84713251125465, "grad_norm": 1.0625, "learning_rate": 0.000823057349774907, "loss": 6.4666, "step": 226000 }, { "epoch": 8.866705813270698, "grad_norm": 1.859375, "learning_rate": 0.000822665883734586, "loss": 6.4582, "step": 226500 }, { "epoch": 8.886279115286749, "grad_norm": 2.515625, "learning_rate": 0.000822274417694265, "loss": 6.4728, "step": 227000 }, { "epoch": 8.9058524173028, "grad_norm": 1.7734375, "learning_rate": 0.000821882951653944, "loss": 6.4735, "step": 227500 }, { "epoch": 8.925425719318849, "grad_norm": 1.3125, "learning_rate": 0.000821491485613623, "loss": 6.4586, "step": 228000 }, { "epoch": 8.9449990213349, "grad_norm": 0.93359375, "learning_rate": 0.000821100019573302, "loss": 6.4698, "step": 228500 }, { "epoch": 8.96457232335095, "grad_norm": 1.1484375, "learning_rate": 0.000820708553532981, "loss": 6.469, "step": 229000 }, { "epoch": 8.984145625366999, "grad_norm": 2.046875, "learning_rate": 0.00082031708749266, "loss": 6.4725, "step": 229500 }, { "epoch": 9.0, "eval_loss": 6.4647536277771, "eval_runtime": 21.362, "eval_samples_per_second": 93.624, "eval_steps_per_second": 5.852, "step": 229905 }, { "epoch": 9.00371892738305, "grad_norm": 1.5859375, "learning_rate": 0.000819925621452339, "loss": 6.4643, "step": 230000 }, { "epoch": 9.0232922293991, "grad_norm": 2.828125, "learning_rate": 0.000819534155412018, "loss": 6.4632, "step": 230500 }, { "epoch": 9.04286553141515, "grad_norm": 1.390625, "learning_rate": 0.000819142689371697, "loss": 6.4633, "step": 231000 }, { "epoch": 9.0624388334312, "grad_norm": 2.296875, "learning_rate": 0.000818751223331376, "loss": 6.4673, "step": 231500 }, { "epoch": 9.08201213544725, "grad_norm": 3.40625, "learning_rate": 0.000818359757291055, "loss": 6.469, "step": 232000 }, { "epoch": 9.1015854374633, "grad_norm": 1.28125, "learning_rate": 0.000817968291250734, "loss": 6.4532, "step": 232500 }, { "epoch": 9.12115873947935, "grad_norm": 1.1953125, "learning_rate": 0.000817576825210413, "loss": 6.4608, "step": 233000 }, { "epoch": 9.1407320414954, "grad_norm": 1.046875, "learning_rate": 0.000817185359170092, "loss": 6.4537, "step": 233500 }, { "epoch": 9.16030534351145, "grad_norm": 2.90625, "learning_rate": 0.000816793893129771, "loss": 6.444, "step": 234000 }, { "epoch": 9.1798786455275, "grad_norm": 2.40625, "learning_rate": 0.00081640242708945, "loss": 6.4606, "step": 234500 }, { "epoch": 9.199451947543551, "grad_norm": 1.3046875, "learning_rate": 0.000816010961049129, "loss": 6.4578, "step": 235000 }, { "epoch": 9.2190252495596, "grad_norm": 3.359375, "learning_rate": 0.0008156194950088081, "loss": 6.4648, "step": 235500 }, { "epoch": 9.23859855157565, "grad_norm": 1.3359375, "learning_rate": 0.000815228028968487, "loss": 6.4649, "step": 236000 }, { "epoch": 9.258171853591701, "grad_norm": 1.6796875, "learning_rate": 0.000814836562928166, "loss": 6.4617, "step": 236500 }, { "epoch": 9.27774515560775, "grad_norm": 5.34375, "learning_rate": 0.000814445096887845, "loss": 6.4598, "step": 237000 }, { "epoch": 9.2973184576238, "grad_norm": 7.15625, "learning_rate": 0.0008140536308475241, "loss": 6.4565, "step": 237500 }, { "epoch": 9.316891759639852, "grad_norm": 2.546875, "learning_rate": 0.000813662164807203, "loss": 6.4612, "step": 238000 }, { "epoch": 9.336465061655902, "grad_norm": 1.375, "learning_rate": 0.000813270698766882, "loss": 6.4583, "step": 238500 }, { "epoch": 9.356038363671951, "grad_norm": 18.5, "learning_rate": 0.000812879232726561, "loss": 6.4549, "step": 239000 }, { "epoch": 9.375611665688002, "grad_norm": 5.25, "learning_rate": 0.0008124877666862401, "loss": 6.4607, "step": 239500 }, { "epoch": 9.395184967704052, "grad_norm": 1.2890625, "learning_rate": 0.000812096300645919, "loss": 6.4573, "step": 240000 }, { "epoch": 9.414758269720101, "grad_norm": 1.03125, "learning_rate": 0.000811704834605598, "loss": 6.4706, "step": 240500 }, { "epoch": 9.434331571736152, "grad_norm": 7.3125, "learning_rate": 0.000811313368565277, "loss": 6.4632, "step": 241000 }, { "epoch": 9.453904873752203, "grad_norm": 5.28125, "learning_rate": 0.000810921902524956, "loss": 6.4549, "step": 241500 }, { "epoch": 9.473478175768252, "grad_norm": 3.546875, "learning_rate": 0.000810530436484635, "loss": 6.4705, "step": 242000 }, { "epoch": 9.493051477784302, "grad_norm": 2.125, "learning_rate": 0.000810138970444314, "loss": 6.4479, "step": 242500 }, { "epoch": 9.512624779800353, "grad_norm": 1.5390625, "learning_rate": 0.000809747504403993, "loss": 6.4602, "step": 243000 }, { "epoch": 9.532198081816402, "grad_norm": 1.1484375, "learning_rate": 0.000809356038363672, "loss": 6.469, "step": 243500 }, { "epoch": 9.551771383832452, "grad_norm": 5.71875, "learning_rate": 0.0008089645723233509, "loss": 6.464, "step": 244000 }, { "epoch": 9.571344685848503, "grad_norm": 2.609375, "learning_rate": 0.0008085731062830299, "loss": 6.4604, "step": 244500 }, { "epoch": 9.590917987864552, "grad_norm": 1.7265625, "learning_rate": 0.000808181640242709, "loss": 6.4589, "step": 245000 }, { "epoch": 9.610491289880603, "grad_norm": 1.15625, "learning_rate": 0.0008077901742023879, "loss": 6.4551, "step": 245500 }, { "epoch": 9.630064591896653, "grad_norm": 1.6796875, "learning_rate": 0.0008073987081620669, "loss": 6.4583, "step": 246000 }, { "epoch": 9.649637893912702, "grad_norm": 2.265625, "learning_rate": 0.0008070072421217459, "loss": 6.4529, "step": 246500 }, { "epoch": 9.669211195928753, "grad_norm": 1.2734375, "learning_rate": 0.000806615776081425, "loss": 6.457, "step": 247000 }, { "epoch": 9.688784497944804, "grad_norm": 1.5390625, "learning_rate": 0.0008062243100411039, "loss": 6.4537, "step": 247500 }, { "epoch": 9.708357799960853, "grad_norm": 1.3671875, "learning_rate": 0.0008058328440007829, "loss": 6.4652, "step": 248000 }, { "epoch": 9.727931101976903, "grad_norm": 1.390625, "learning_rate": 0.0008054413779604619, "loss": 6.4547, "step": 248500 }, { "epoch": 9.747504403992954, "grad_norm": 1.5625, "learning_rate": 0.000805049911920141, "loss": 6.467, "step": 249000 }, { "epoch": 9.767077706009005, "grad_norm": 1.4453125, "learning_rate": 0.0008046584458798199, "loss": 6.4568, "step": 249500 }, { "epoch": 9.786651008025053, "grad_norm": 1.75, "learning_rate": 0.0008042669798394989, "loss": 6.4616, "step": 250000 }, { "epoch": 9.806224310041104, "grad_norm": 1.046875, "learning_rate": 0.0008038755137991779, "loss": 6.455, "step": 250500 }, { "epoch": 9.825797612057155, "grad_norm": 1.625, "learning_rate": 0.000803484047758857, "loss": 6.4547, "step": 251000 }, { "epoch": 9.845370914073204, "grad_norm": 0.95703125, "learning_rate": 0.0008030925817185359, "loss": 6.4668, "step": 251500 }, { "epoch": 9.864944216089254, "grad_norm": 1.515625, "learning_rate": 0.0008027011156782149, "loss": 6.4458, "step": 252000 }, { "epoch": 9.884517518105305, "grad_norm": 2.171875, "learning_rate": 0.0008023096496378939, "loss": 6.4636, "step": 252500 }, { "epoch": 9.904090820121354, "grad_norm": 1.9140625, "learning_rate": 0.000801918183597573, "loss": 6.4622, "step": 253000 }, { "epoch": 9.923664122137405, "grad_norm": 1.7734375, "learning_rate": 0.0008015267175572519, "loss": 6.4557, "step": 253500 }, { "epoch": 9.943237424153455, "grad_norm": 2.015625, "learning_rate": 0.0008011352515169309, "loss": 6.4513, "step": 254000 }, { "epoch": 9.962810726169504, "grad_norm": 1.0859375, "learning_rate": 0.00080074378547661, "loss": 6.4529, "step": 254500 }, { "epoch": 9.982384028185555, "grad_norm": 1.3046875, "learning_rate": 0.000800352319436289, "loss": 6.4559, "step": 255000 }, { "epoch": 10.0, "eval_loss": 6.455629825592041, "eval_runtime": 21.377, "eval_samples_per_second": 93.559, "eval_steps_per_second": 5.847, "step": 255450 }, { "epoch": 10.001957330201606, "grad_norm": 0.984375, "learning_rate": 0.0007999608533959679, "loss": 6.454, "step": 255500 }, { "epoch": 10.021530632217654, "grad_norm": 1.78125, "learning_rate": 0.0007995693873556469, "loss": 6.4444, "step": 256000 }, { "epoch": 10.041103934233705, "grad_norm": 3.0625, "learning_rate": 0.000799177921315326, "loss": 6.4606, "step": 256500 }, { "epoch": 10.060677236249756, "grad_norm": 1.09375, "learning_rate": 0.000798786455275005, "loss": 6.4541, "step": 257000 }, { "epoch": 10.080250538265805, "grad_norm": 2.59375, "learning_rate": 0.0007983949892346839, "loss": 6.4583, "step": 257500 }, { "epoch": 10.099823840281855, "grad_norm": 8.6875, "learning_rate": 0.0007980035231943629, "loss": 6.4542, "step": 258000 }, { "epoch": 10.119397142297906, "grad_norm": 1.0390625, "learning_rate": 0.000797612057154042, "loss": 6.4503, "step": 258500 }, { "epoch": 10.138970444313955, "grad_norm": 1.2890625, "learning_rate": 0.0007972205911137209, "loss": 6.4531, "step": 259000 }, { "epoch": 10.158543746330006, "grad_norm": 2.03125, "learning_rate": 0.0007968291250733999, "loss": 6.4552, "step": 259500 }, { "epoch": 10.178117048346056, "grad_norm": 2.71875, "learning_rate": 0.0007964376590330789, "loss": 6.4557, "step": 260000 }, { "epoch": 10.197690350362105, "grad_norm": 2.359375, "learning_rate": 0.000796046192992758, "loss": 6.4519, "step": 260500 }, { "epoch": 10.217263652378156, "grad_norm": 1.546875, "learning_rate": 0.0007956547269524369, "loss": 6.4584, "step": 261000 }, { "epoch": 10.236836954394207, "grad_norm": 1.2421875, "learning_rate": 0.0007952632609121159, "loss": 6.4519, "step": 261500 }, { "epoch": 10.256410256410255, "grad_norm": 2.46875, "learning_rate": 0.0007948717948717948, "loss": 6.4576, "step": 262000 }, { "epoch": 10.275983558426306, "grad_norm": 1.1015625, "learning_rate": 0.000794480328831474, "loss": 6.4455, "step": 262500 }, { "epoch": 10.295556860442357, "grad_norm": 2.671875, "learning_rate": 0.0007940888627911528, "loss": 6.4499, "step": 263000 }, { "epoch": 10.315130162458408, "grad_norm": 1.4453125, "learning_rate": 0.0007936973967508318, "loss": 6.4572, "step": 263500 }, { "epoch": 10.334703464474456, "grad_norm": 1.0390625, "learning_rate": 0.0007933059307105108, "loss": 6.4513, "step": 264000 }, { "epoch": 10.354276766490507, "grad_norm": 1.140625, "learning_rate": 0.0007929144646701899, "loss": 6.4472, "step": 264500 }, { "epoch": 10.373850068506558, "grad_norm": 1.953125, "learning_rate": 0.0007925229986298688, "loss": 6.4385, "step": 265000 }, { "epoch": 10.393423370522607, "grad_norm": 1.1484375, "learning_rate": 0.0007921315325895478, "loss": 6.4558, "step": 265500 }, { "epoch": 10.412996672538657, "grad_norm": 2.46875, "learning_rate": 0.0007917400665492269, "loss": 6.4503, "step": 266000 }, { "epoch": 10.432569974554708, "grad_norm": 1.359375, "learning_rate": 0.0007913486005089059, "loss": 6.4402, "step": 266500 }, { "epoch": 10.452143276570757, "grad_norm": 1.046875, "learning_rate": 0.0007909571344685848, "loss": 6.4552, "step": 267000 }, { "epoch": 10.471716578586808, "grad_norm": 1.25, "learning_rate": 0.0007905656684282638, "loss": 6.4488, "step": 267500 }, { "epoch": 10.491289880602858, "grad_norm": 1.3515625, "learning_rate": 0.0007901742023879429, "loss": 6.4577, "step": 268000 }, { "epoch": 10.510863182618907, "grad_norm": 2.546875, "learning_rate": 0.0007897827363476219, "loss": 6.4526, "step": 268500 }, { "epoch": 10.530436484634958, "grad_norm": 3.65625, "learning_rate": 0.0007893912703073008, "loss": 6.4473, "step": 269000 }, { "epoch": 10.550009786651009, "grad_norm": 1.15625, "learning_rate": 0.0007889998042669798, "loss": 6.4555, "step": 269500 }, { "epoch": 10.569583088667057, "grad_norm": 1.609375, "learning_rate": 0.0007886083382266589, "loss": 6.4522, "step": 270000 }, { "epoch": 10.589156390683108, "grad_norm": 2.171875, "learning_rate": 0.0007882168721863379, "loss": 6.4481, "step": 270500 }, { "epoch": 10.608729692699159, "grad_norm": 1.234375, "learning_rate": 0.0007878254061460168, "loss": 6.4498, "step": 271000 }, { "epoch": 10.628302994715208, "grad_norm": 1.015625, "learning_rate": 0.0007874339401056958, "loss": 6.4581, "step": 271500 }, { "epoch": 10.647876296731258, "grad_norm": 4.0, "learning_rate": 0.0007870424740653749, "loss": 6.4554, "step": 272000 }, { "epoch": 10.667449598747309, "grad_norm": 1.625, "learning_rate": 0.0007866510080250539, "loss": 6.4506, "step": 272500 }, { "epoch": 10.68702290076336, "grad_norm": 1.1484375, "learning_rate": 0.0007862595419847328, "loss": 6.4586, "step": 273000 }, { "epoch": 10.706596202779409, "grad_norm": 2.453125, "learning_rate": 0.0007858680759444118, "loss": 6.4467, "step": 273500 }, { "epoch": 10.72616950479546, "grad_norm": 1.5703125, "learning_rate": 0.0007854766099040909, "loss": 6.4449, "step": 274000 }, { "epoch": 10.74574280681151, "grad_norm": 2.71875, "learning_rate": 0.0007850851438637698, "loss": 6.4576, "step": 274500 }, { "epoch": 10.765316108827559, "grad_norm": 2.515625, "learning_rate": 0.0007846936778234488, "loss": 6.4427, "step": 275000 }, { "epoch": 10.78488941084361, "grad_norm": 1.328125, "learning_rate": 0.0007843022117831279, "loss": 6.4571, "step": 275500 }, { "epoch": 10.80446271285966, "grad_norm": 1.5703125, "learning_rate": 0.0007839107457428069, "loss": 6.4507, "step": 276000 }, { "epoch": 10.824036014875709, "grad_norm": 4.875, "learning_rate": 0.0007835192797024858, "loss": 6.4542, "step": 276500 }, { "epoch": 10.84360931689176, "grad_norm": 18.125, "learning_rate": 0.0007831278136621648, "loss": 6.4592, "step": 277000 }, { "epoch": 10.86318261890781, "grad_norm": 1.046875, "learning_rate": 0.0007827363476218439, "loss": 6.4503, "step": 277500 }, { "epoch": 10.88275592092386, "grad_norm": 1.265625, "learning_rate": 0.0007823448815815229, "loss": 6.461, "step": 278000 }, { "epoch": 10.90232922293991, "grad_norm": 4.1875, "learning_rate": 0.0007819534155412018, "loss": 6.4549, "step": 278500 }, { "epoch": 10.92190252495596, "grad_norm": 26.375, "learning_rate": 0.0007815619495008808, "loss": 6.4469, "step": 279000 }, { "epoch": 10.94147582697201, "grad_norm": 0.98828125, "learning_rate": 0.0007811704834605599, "loss": 6.453, "step": 279500 }, { "epoch": 10.96104912898806, "grad_norm": 25.5, "learning_rate": 0.0007807790174202389, "loss": 6.4485, "step": 280000 }, { "epoch": 10.980622431004111, "grad_norm": 2.90625, "learning_rate": 0.0007803875513799178, "loss": 6.4544, "step": 280500 }, { "epoch": 11.0, "eval_loss": 6.449069499969482, "eval_runtime": 22.9095, "eval_samples_per_second": 87.3, "eval_steps_per_second": 5.456, "step": 280995 }, { "epoch": 11.00019573302016, "grad_norm": 10.125, "learning_rate": 0.0007799960853395967, "loss": 6.4525, "step": 281000 }, { "epoch": 11.01976903503621, "grad_norm": 1.3984375, "learning_rate": 0.0007796046192992759, "loss": 6.4456, "step": 281500 }, { "epoch": 11.039342337052261, "grad_norm": 7.28125, "learning_rate": 0.0007792131532589549, "loss": 6.4479, "step": 282000 }, { "epoch": 11.05891563906831, "grad_norm": 1.234375, "learning_rate": 0.0007788216872186337, "loss": 6.4415, "step": 282500 }, { "epoch": 11.07848894108436, "grad_norm": 1.234375, "learning_rate": 0.0007784302211783127, "loss": 6.4438, "step": 283000 }, { "epoch": 11.098062243100411, "grad_norm": 1.5703125, "learning_rate": 0.0007780387551379918, "loss": 6.4485, "step": 283500 }, { "epoch": 11.11763554511646, "grad_norm": 2.171875, "learning_rate": 0.0007776472890976708, "loss": 6.4491, "step": 284000 }, { "epoch": 11.137208847132511, "grad_norm": 1.2890625, "learning_rate": 0.0007772558230573497, "loss": 6.449, "step": 284500 }, { "epoch": 11.156782149148562, "grad_norm": 3.046875, "learning_rate": 0.0007768643570170288, "loss": 6.4526, "step": 285000 }, { "epoch": 11.17635545116461, "grad_norm": 3.859375, "learning_rate": 0.0007764728909767078, "loss": 6.4408, "step": 285500 }, { "epoch": 11.195928753180661, "grad_norm": 1.21875, "learning_rate": 0.0007760814249363868, "loss": 6.4384, "step": 286000 }, { "epoch": 11.215502055196712, "grad_norm": 1.4453125, "learning_rate": 0.0007756899588960657, "loss": 6.4557, "step": 286500 }, { "epoch": 11.235075357212763, "grad_norm": 0.9375, "learning_rate": 0.0007752984928557448, "loss": 6.4432, "step": 287000 }, { "epoch": 11.254648659228812, "grad_norm": 1.5859375, "learning_rate": 0.0007749070268154238, "loss": 6.4451, "step": 287500 }, { "epoch": 11.274221961244862, "grad_norm": 2.734375, "learning_rate": 0.0007745155607751028, "loss": 6.4532, "step": 288000 }, { "epoch": 11.293795263260913, "grad_norm": 2.234375, "learning_rate": 0.0007741240947347817, "loss": 6.4489, "step": 288500 }, { "epoch": 11.313368565276962, "grad_norm": 1.2734375, "learning_rate": 0.0007737326286944608, "loss": 6.4448, "step": 289000 }, { "epoch": 11.332941867293012, "grad_norm": 1.5859375, "learning_rate": 0.0007733411626541398, "loss": 6.4421, "step": 289500 }, { "epoch": 11.352515169309063, "grad_norm": 1.2109375, "learning_rate": 0.0007729496966138187, "loss": 6.4408, "step": 290000 }, { "epoch": 11.372088471325112, "grad_norm": 1.125, "learning_rate": 0.0007725582305734977, "loss": 6.4487, "step": 290500 }, { "epoch": 11.391661773341163, "grad_norm": 1.171875, "learning_rate": 0.0007721667645331768, "loss": 6.4488, "step": 291000 }, { "epoch": 11.411235075357213, "grad_norm": 1.5625, "learning_rate": 0.0007717752984928558, "loss": 6.4394, "step": 291500 }, { "epoch": 11.430808377373262, "grad_norm": 1.40625, "learning_rate": 0.0007713838324525347, "loss": 6.4548, "step": 292000 }, { "epoch": 11.450381679389313, "grad_norm": 1.4375, "learning_rate": 0.0007709923664122137, "loss": 6.454, "step": 292500 }, { "epoch": 11.469954981405364, "grad_norm": 1.9375, "learning_rate": 0.0007706009003718928, "loss": 6.4516, "step": 293000 }, { "epoch": 11.489528283421413, "grad_norm": 1.2109375, "learning_rate": 0.0007702094343315718, "loss": 6.4482, "step": 293500 }, { "epoch": 11.509101585437463, "grad_norm": 2.40625, "learning_rate": 0.0007698179682912507, "loss": 6.4602, "step": 294000 }, { "epoch": 11.528674887453514, "grad_norm": 1.453125, "learning_rate": 0.0007694265022509297, "loss": 6.4459, "step": 294500 }, { "epoch": 11.548248189469563, "grad_norm": 1.4375, "learning_rate": 0.0007690350362106088, "loss": 6.4508, "step": 295000 }, { "epoch": 11.567821491485613, "grad_norm": 1.8203125, "learning_rate": 0.0007686435701702878, "loss": 6.4459, "step": 295500 }, { "epoch": 11.587394793501664, "grad_norm": 4.78125, "learning_rate": 0.0007682521041299667, "loss": 6.4517, "step": 296000 }, { "epoch": 11.606968095517713, "grad_norm": 1.3125, "learning_rate": 0.0007678606380896458, "loss": 6.4519, "step": 296500 }, { "epoch": 11.626541397533764, "grad_norm": 1.3515625, "learning_rate": 0.0007674691720493248, "loss": 6.4474, "step": 297000 }, { "epoch": 11.646114699549814, "grad_norm": 1.078125, "learning_rate": 0.0007670777060090038, "loss": 6.4506, "step": 297500 }, { "epoch": 11.665688001565865, "grad_norm": 0.9296875, "learning_rate": 0.0007666862399686827, "loss": 6.4396, "step": 298000 }, { "epoch": 11.685261303581914, "grad_norm": 1.0703125, "learning_rate": 0.0007662947739283618, "loss": 6.4453, "step": 298500 }, { "epoch": 11.704834605597965, "grad_norm": 1.9296875, "learning_rate": 0.0007659033078880408, "loss": 6.4414, "step": 299000 }, { "epoch": 11.724407907614015, "grad_norm": 1.46875, "learning_rate": 0.0007655118418477198, "loss": 6.4513, "step": 299500 }, { "epoch": 11.743981209630064, "grad_norm": 3.53125, "learning_rate": 0.0007651203758073986, "loss": 6.4501, "step": 300000 }, { "epoch": 11.763554511646115, "grad_norm": 1.4765625, "learning_rate": 0.0007647289097670778, "loss": 6.4471, "step": 300500 }, { "epoch": 11.783127813662166, "grad_norm": 2.3125, "learning_rate": 0.0007643374437267568, "loss": 6.4567, "step": 301000 }, { "epoch": 11.802701115678214, "grad_norm": 1.59375, "learning_rate": 0.0007639459776864357, "loss": 6.4509, "step": 301500 }, { "epoch": 11.822274417694265, "grad_norm": 1.140625, "learning_rate": 0.0007635545116461146, "loss": 6.4451, "step": 302000 }, { "epoch": 11.841847719710316, "grad_norm": 1.7421875, "learning_rate": 0.0007631630456057937, "loss": 6.4524, "step": 302500 }, { "epoch": 11.861421021726365, "grad_norm": 12.8125, "learning_rate": 0.0007627715795654727, "loss": 6.4502, "step": 303000 }, { "epoch": 11.880994323742415, "grad_norm": 1.8203125, "learning_rate": 0.0007623801135251517, "loss": 6.4476, "step": 303500 }, { "epoch": 11.900567625758466, "grad_norm": 1.421875, "learning_rate": 0.0007619886474848306, "loss": 6.4338, "step": 304000 }, { "epoch": 11.920140927774515, "grad_norm": 1.9296875, "learning_rate": 0.0007615971814445097, "loss": 6.4472, "step": 304500 }, { "epoch": 11.939714229790566, "grad_norm": 2.78125, "learning_rate": 0.0007612057154041887, "loss": 6.4465, "step": 305000 }, { "epoch": 11.959287531806616, "grad_norm": 1.2734375, "learning_rate": 0.0007608142493638676, "loss": 6.4529, "step": 305500 }, { "epoch": 11.978860833822665, "grad_norm": 1.375, "learning_rate": 0.0007604227833235467, "loss": 6.4456, "step": 306000 }, { "epoch": 11.998434135838716, "grad_norm": 2.015625, "learning_rate": 0.0007600313172832257, "loss": 6.4525, "step": 306500 }, { "epoch": 12.0, "eval_loss": 6.446938514709473, "eval_runtime": 23.9647, "eval_samples_per_second": 83.456, "eval_steps_per_second": 5.216, "step": 306540 }, { "epoch": 12.018007437854767, "grad_norm": 1.4453125, "learning_rate": 0.0007596398512429047, "loss": 6.4458, "step": 307000 }, { "epoch": 12.037580739870815, "grad_norm": 1.734375, "learning_rate": 0.0007592483852025836, "loss": 6.4423, "step": 307500 }, { "epoch": 12.057154041886866, "grad_norm": 11.3125, "learning_rate": 0.0007588569191622627, "loss": 6.4446, "step": 308000 }, { "epoch": 12.076727343902917, "grad_norm": 1.3828125, "learning_rate": 0.0007584654531219417, "loss": 6.4461, "step": 308500 }, { "epoch": 12.096300645918966, "grad_norm": 1.84375, "learning_rate": 0.0007580739870816207, "loss": 6.4431, "step": 309000 }, { "epoch": 12.115873947935016, "grad_norm": 2.421875, "learning_rate": 0.0007576825210412996, "loss": 6.4439, "step": 309500 }, { "epoch": 12.135447249951067, "grad_norm": 3.03125, "learning_rate": 0.0007572910550009787, "loss": 6.4428, "step": 310000 }, { "epoch": 12.155020551967118, "grad_norm": 1.046875, "learning_rate": 0.0007568995889606577, "loss": 6.4456, "step": 310500 }, { "epoch": 12.174593853983167, "grad_norm": 1.0390625, "learning_rate": 0.0007565081229203367, "loss": 6.4479, "step": 311000 }, { "epoch": 12.194167155999217, "grad_norm": 2.640625, "learning_rate": 0.0007561166568800156, "loss": 6.4457, "step": 311500 }, { "epoch": 12.213740458015268, "grad_norm": 2.015625, "learning_rate": 0.0007557251908396947, "loss": 6.4405, "step": 312000 }, { "epoch": 12.233313760031317, "grad_norm": 2.21875, "learning_rate": 0.0007553337247993737, "loss": 6.4557, "step": 312500 }, { "epoch": 12.252887062047368, "grad_norm": 1.28125, "learning_rate": 0.0007549422587590527, "loss": 6.4407, "step": 313000 }, { "epoch": 12.272460364063418, "grad_norm": 1.421875, "learning_rate": 0.0007545507927187316, "loss": 6.4457, "step": 313500 }, { "epoch": 12.292033666079467, "grad_norm": 1.1484375, "learning_rate": 0.0007541593266784107, "loss": 6.4471, "step": 314000 }, { "epoch": 12.311606968095518, "grad_norm": 1.4765625, "learning_rate": 0.0007537678606380897, "loss": 6.4473, "step": 314500 }, { "epoch": 12.331180270111568, "grad_norm": 3.03125, "learning_rate": 0.0007533763945977687, "loss": 6.4451, "step": 315000 }, { "epoch": 12.350753572127617, "grad_norm": 1.1171875, "learning_rate": 0.0007529849285574477, "loss": 6.4336, "step": 315500 }, { "epoch": 12.370326874143668, "grad_norm": 1.1875, "learning_rate": 0.0007525934625171267, "loss": 6.4454, "step": 316000 }, { "epoch": 12.389900176159719, "grad_norm": 1.9609375, "learning_rate": 0.0007522019964768057, "loss": 6.4481, "step": 316500 }, { "epoch": 12.409473478175768, "grad_norm": 1.796875, "learning_rate": 0.0007518105304364847, "loss": 6.4485, "step": 317000 }, { "epoch": 12.429046780191818, "grad_norm": 1.5390625, "learning_rate": 0.0007514190643961637, "loss": 6.4553, "step": 317500 }, { "epoch": 12.448620082207869, "grad_norm": 1.421875, "learning_rate": 0.0007510275983558427, "loss": 6.4435, "step": 318000 }, { "epoch": 12.468193384223918, "grad_norm": 1.8046875, "learning_rate": 0.0007506361323155217, "loss": 6.4435, "step": 318500 }, { "epoch": 12.487766686239969, "grad_norm": 2.578125, "learning_rate": 0.0007502446662752007, "loss": 6.4451, "step": 319000 }, { "epoch": 12.50733998825602, "grad_norm": 1.6953125, "learning_rate": 0.0007498532002348797, "loss": 6.4511, "step": 319500 }, { "epoch": 12.526913290272068, "grad_norm": 1.671875, "learning_rate": 0.0007494617341945587, "loss": 6.4384, "step": 320000 }, { "epoch": 12.546486592288119, "grad_norm": 2.09375, "learning_rate": 0.0007490702681542376, "loss": 6.4448, "step": 320500 }, { "epoch": 12.56605989430417, "grad_norm": 2.03125, "learning_rate": 0.0007486788021139165, "loss": 6.4514, "step": 321000 }, { "epoch": 12.58563319632022, "grad_norm": 1.0390625, "learning_rate": 0.0007482873360735956, "loss": 6.4437, "step": 321500 }, { "epoch": 12.605206498336269, "grad_norm": 1.3671875, "learning_rate": 0.0007478958700332746, "loss": 6.4439, "step": 322000 }, { "epoch": 12.62477980035232, "grad_norm": 1.1953125, "learning_rate": 0.0007475044039929536, "loss": 6.4455, "step": 322500 }, { "epoch": 12.64435310236837, "grad_norm": 0.875, "learning_rate": 0.0007471129379526325, "loss": 6.4413, "step": 323000 }, { "epoch": 12.66392640438442, "grad_norm": 1.8125, "learning_rate": 0.0007467214719123116, "loss": 6.4366, "step": 323500 }, { "epoch": 12.68349970640047, "grad_norm": 1.0234375, "learning_rate": 0.0007463300058719906, "loss": 6.4511, "step": 324000 }, { "epoch": 12.70307300841652, "grad_norm": 1.625, "learning_rate": 0.0007459385398316696, "loss": 6.4384, "step": 324500 }, { "epoch": 12.72264631043257, "grad_norm": 0.8203125, "learning_rate": 0.0007455470737913485, "loss": 6.4402, "step": 325000 }, { "epoch": 12.74221961244862, "grad_norm": 4.0625, "learning_rate": 0.0007451556077510276, "loss": 6.4542, "step": 325500 }, { "epoch": 12.761792914464671, "grad_norm": 2.328125, "learning_rate": 0.0007447641417107066, "loss": 6.4417, "step": 326000 }, { "epoch": 12.78136621648072, "grad_norm": 5.4375, "learning_rate": 0.0007443726756703856, "loss": 6.4429, "step": 326500 }, { "epoch": 12.80093951849677, "grad_norm": 1.375, "learning_rate": 0.0007439812096300646, "loss": 6.45, "step": 327000 }, { "epoch": 12.820512820512821, "grad_norm": 1.703125, "learning_rate": 0.0007435897435897436, "loss": 6.4472, "step": 327500 }, { "epoch": 12.84008612252887, "grad_norm": 1.9453125, "learning_rate": 0.0007431982775494226, "loss": 6.438, "step": 328000 }, { "epoch": 12.85965942454492, "grad_norm": 2.234375, "learning_rate": 0.0007428068115091016, "loss": 6.451, "step": 328500 }, { "epoch": 12.879232726560971, "grad_norm": 1.53125, "learning_rate": 0.0007424153454687806, "loss": 6.4495, "step": 329000 }, { "epoch": 12.89880602857702, "grad_norm": 8.375, "learning_rate": 0.0007420238794284596, "loss": 6.4412, "step": 329500 }, { "epoch": 12.918379330593071, "grad_norm": 1.1640625, "learning_rate": 0.0007416324133881386, "loss": 6.4584, "step": 330000 }, { "epoch": 12.937952632609122, "grad_norm": 1.71875, "learning_rate": 0.0007412409473478176, "loss": 6.4446, "step": 330500 }, { "epoch": 12.95752593462517, "grad_norm": 1.8828125, "learning_rate": 0.0007408494813074966, "loss": 6.4465, "step": 331000 }, { "epoch": 12.977099236641221, "grad_norm": 1.890625, "learning_rate": 0.0007404580152671756, "loss": 6.4441, "step": 331500 }, { "epoch": 12.996672538657272, "grad_norm": 2.453125, "learning_rate": 0.0007400665492268546, "loss": 6.4464, "step": 332000 }, { "epoch": 13.0, "eval_loss": 6.443148612976074, "eval_runtime": 21.9848, "eval_samples_per_second": 90.972, "eval_steps_per_second": 5.686, "step": 332085 }, { "epoch": 13.01624584067332, "grad_norm": 3.203125, "learning_rate": 0.0007396750831865336, "loss": 6.4378, "step": 332500 }, { "epoch": 13.035819142689371, "grad_norm": 1.34375, "learning_rate": 0.0007392836171462126, "loss": 6.4393, "step": 333000 }, { "epoch": 13.055392444705422, "grad_norm": 1.1328125, "learning_rate": 0.0007388921511058916, "loss": 6.441, "step": 333500 }, { "epoch": 13.074965746721471, "grad_norm": 4.875, "learning_rate": 0.0007385006850655706, "loss": 6.4441, "step": 334000 }, { "epoch": 13.094539048737522, "grad_norm": 1.0, "learning_rate": 0.0007381092190252496, "loss": 6.4412, "step": 334500 }, { "epoch": 13.114112350753572, "grad_norm": 1.4296875, "learning_rate": 0.0007377177529849286, "loss": 6.4431, "step": 335000 }, { "epoch": 13.133685652769623, "grad_norm": 1.9921875, "learning_rate": 0.0007373262869446076, "loss": 6.4336, "step": 335500 }, { "epoch": 13.153258954785672, "grad_norm": 1.015625, "learning_rate": 0.0007369348209042866, "loss": 6.4395, "step": 336000 }, { "epoch": 13.172832256801723, "grad_norm": 2.046875, "learning_rate": 0.0007365433548639657, "loss": 6.4369, "step": 336500 }, { "epoch": 13.192405558817773, "grad_norm": 1.1875, "learning_rate": 0.0007361518888236446, "loss": 6.4482, "step": 337000 }, { "epoch": 13.211978860833822, "grad_norm": 4.46875, "learning_rate": 0.0007357604227833236, "loss": 6.45, "step": 337500 }, { "epoch": 13.231552162849873, "grad_norm": 1.6484375, "learning_rate": 0.0007353689567430026, "loss": 6.4423, "step": 338000 }, { "epoch": 13.251125464865924, "grad_norm": 1.59375, "learning_rate": 0.0007349774907026816, "loss": 6.4455, "step": 338500 }, { "epoch": 13.270698766881972, "grad_norm": 1.6640625, "learning_rate": 0.0007345860246623605, "loss": 6.4393, "step": 339000 }, { "epoch": 13.290272068898023, "grad_norm": 1.0, "learning_rate": 0.0007341945586220395, "loss": 6.4441, "step": 339500 }, { "epoch": 13.309845370914074, "grad_norm": 1.4375, "learning_rate": 0.0007338030925817185, "loss": 6.4395, "step": 340000 }, { "epoch": 13.329418672930123, "grad_norm": 1.5703125, "learning_rate": 0.0007334116265413975, "loss": 6.4375, "step": 340500 }, { "epoch": 13.348991974946173, "grad_norm": 1.46875, "learning_rate": 0.0007330201605010765, "loss": 6.438, "step": 341000 }, { "epoch": 13.368565276962224, "grad_norm": 1.6015625, "learning_rate": 0.0007326286944607555, "loss": 6.4392, "step": 341500 }, { "epoch": 13.388138578978273, "grad_norm": 1.3359375, "learning_rate": 0.0007322372284204345, "loss": 6.4433, "step": 342000 }, { "epoch": 13.407711880994324, "grad_norm": 1.46875, "learning_rate": 0.0007318457623801135, "loss": 6.4467, "step": 342500 }, { "epoch": 13.427285183010374, "grad_norm": 0.87890625, "learning_rate": 0.0007314542963397925, "loss": 6.4401, "step": 343000 }, { "epoch": 13.446858485026423, "grad_norm": 0.87109375, "learning_rate": 0.0007310628302994715, "loss": 6.4414, "step": 343500 }, { "epoch": 13.466431787042474, "grad_norm": 0.98046875, "learning_rate": 0.0007306713642591505, "loss": 6.4451, "step": 344000 }, { "epoch": 13.486005089058525, "grad_norm": 2.921875, "learning_rate": 0.0007302798982188295, "loss": 6.4419, "step": 344500 }, { "epoch": 13.505578391074573, "grad_norm": 2.296875, "learning_rate": 0.0007298884321785085, "loss": 6.439, "step": 345000 }, { "epoch": 13.525151693090624, "grad_norm": 14.0625, "learning_rate": 0.0007294969661381875, "loss": 6.4437, "step": 345500 }, { "epoch": 13.544724995106675, "grad_norm": 1.2578125, "learning_rate": 0.0007291055000978665, "loss": 6.4399, "step": 346000 }, { "epoch": 13.564298297122726, "grad_norm": 11.5625, "learning_rate": 0.0007287140340575455, "loss": 6.4447, "step": 346500 }, { "epoch": 13.583871599138774, "grad_norm": 0.91796875, "learning_rate": 0.0007283225680172245, "loss": 6.4373, "step": 347000 }, { "epoch": 13.603444901154825, "grad_norm": 1.9375, "learning_rate": 0.0007279311019769035, "loss": 6.4484, "step": 347500 }, { "epoch": 13.623018203170876, "grad_norm": 1.734375, "learning_rate": 0.0007275396359365826, "loss": 6.4486, "step": 348000 }, { "epoch": 13.642591505186925, "grad_norm": 1.4765625, "learning_rate": 0.0007271481698962615, "loss": 6.4417, "step": 348500 }, { "epoch": 13.662164807202975, "grad_norm": 0.88671875, "learning_rate": 0.0007267567038559405, "loss": 6.4397, "step": 349000 }, { "epoch": 13.681738109219026, "grad_norm": 1.84375, "learning_rate": 0.0007263652378156195, "loss": 6.4328, "step": 349500 }, { "epoch": 13.701311411235075, "grad_norm": 1.1171875, "learning_rate": 0.0007259737717752986, "loss": 6.4462, "step": 350000 }, { "epoch": 13.720884713251126, "grad_norm": 4.15625, "learning_rate": 0.0007255823057349775, "loss": 6.4394, "step": 350500 }, { "epoch": 13.740458015267176, "grad_norm": 1.40625, "learning_rate": 0.0007251908396946565, "loss": 6.4434, "step": 351000 }, { "epoch": 13.760031317283225, "grad_norm": 1.6328125, "learning_rate": 0.0007247993736543355, "loss": 6.4437, "step": 351500 }, { "epoch": 13.779604619299276, "grad_norm": 1.5859375, "learning_rate": 0.0007244079076140146, "loss": 6.4386, "step": 352000 }, { "epoch": 13.799177921315327, "grad_norm": 1.4609375, "learning_rate": 0.0007240164415736935, "loss": 6.4476, "step": 352500 }, { "epoch": 13.818751223331375, "grad_norm": 2.515625, "learning_rate": 0.0007236249755333725, "loss": 6.4442, "step": 353000 }, { "epoch": 13.838324525347426, "grad_norm": 2.203125, "learning_rate": 0.0007232335094930515, "loss": 6.44, "step": 353500 }, { "epoch": 13.857897827363477, "grad_norm": 0.99609375, "learning_rate": 0.0007228420434527305, "loss": 6.4395, "step": 354000 }, { "epoch": 13.877471129379526, "grad_norm": 2.75, "learning_rate": 0.0007224505774124095, "loss": 6.4392, "step": 354500 }, { "epoch": 13.897044431395576, "grad_norm": 0.9140625, "learning_rate": 0.0007220591113720885, "loss": 6.4484, "step": 355000 }, { "epoch": 13.916617733411627, "grad_norm": 1.3203125, "learning_rate": 0.0007216676453317675, "loss": 6.4449, "step": 355500 }, { "epoch": 13.936191035427676, "grad_norm": 1.390625, "learning_rate": 0.0007212761792914465, "loss": 6.4444, "step": 356000 }, { "epoch": 13.955764337443727, "grad_norm": 3.921875, "learning_rate": 0.0007208847132511255, "loss": 6.4438, "step": 356500 }, { "epoch": 13.975337639459777, "grad_norm": 4.71875, "learning_rate": 0.0007204932472108045, "loss": 6.4417, "step": 357000 }, { "epoch": 13.994910941475826, "grad_norm": 3.65625, "learning_rate": 0.0007201017811704836, "loss": 6.4479, "step": 357500 }, { "epoch": 14.0, "eval_loss": 6.441241264343262, "eval_runtime": 23.2496, "eval_samples_per_second": 86.023, "eval_steps_per_second": 5.376, "step": 357630 }, { "epoch": 14.014484243491877, "grad_norm": 1.4296875, "learning_rate": 0.0007197103151301624, "loss": 6.4372, "step": 358000 }, { "epoch": 14.034057545507928, "grad_norm": 0.9375, "learning_rate": 0.0007193188490898414, "loss": 6.4488, "step": 358500 }, { "epoch": 14.053630847523978, "grad_norm": 1.234375, "learning_rate": 0.0007189273830495204, "loss": 6.441, "step": 359000 }, { "epoch": 14.073204149540027, "grad_norm": 0.890625, "learning_rate": 0.0007185359170091995, "loss": 6.4405, "step": 359500 }, { "epoch": 14.092777451556078, "grad_norm": 1.46875, "learning_rate": 0.0007181444509688784, "loss": 6.4363, "step": 360000 }, { "epoch": 14.112350753572128, "grad_norm": 1.421875, "learning_rate": 0.0007177529849285574, "loss": 6.4307, "step": 360500 }, { "epoch": 14.131924055588177, "grad_norm": 1.0078125, "learning_rate": 0.0007173615188882364, "loss": 6.4421, "step": 361000 }, { "epoch": 14.151497357604228, "grad_norm": 2.703125, "learning_rate": 0.0007169700528479155, "loss": 6.4308, "step": 361500 }, { "epoch": 14.171070659620279, "grad_norm": 1.59375, "learning_rate": 0.0007165785868075944, "loss": 6.445, "step": 362000 }, { "epoch": 14.190643961636328, "grad_norm": 1.015625, "learning_rate": 0.0007161871207672734, "loss": 6.4366, "step": 362500 }, { "epoch": 14.210217263652378, "grad_norm": 8.3125, "learning_rate": 0.0007157956547269524, "loss": 6.4377, "step": 363000 }, { "epoch": 14.229790565668429, "grad_norm": 1.578125, "learning_rate": 0.0007154041886866315, "loss": 6.4332, "step": 363500 }, { "epoch": 14.249363867684478, "grad_norm": 20.75, "learning_rate": 0.0007150127226463104, "loss": 6.4339, "step": 364000 }, { "epoch": 14.268937169700529, "grad_norm": 1.5703125, "learning_rate": 0.0007146212566059894, "loss": 6.4443, "step": 364500 }, { "epoch": 14.28851047171658, "grad_norm": 1.265625, "learning_rate": 0.0007142297905656684, "loss": 6.4381, "step": 365000 }, { "epoch": 14.308083773732628, "grad_norm": 0.9765625, "learning_rate": 0.0007138383245253475, "loss": 6.4398, "step": 365500 }, { "epoch": 14.327657075748679, "grad_norm": 1.3125, "learning_rate": 0.0007134468584850264, "loss": 6.4347, "step": 366000 }, { "epoch": 14.34723037776473, "grad_norm": 1.2578125, "learning_rate": 0.0007130553924447054, "loss": 6.4376, "step": 366500 }, { "epoch": 14.366803679780778, "grad_norm": 2.3125, "learning_rate": 0.0007126639264043845, "loss": 6.4392, "step": 367000 }, { "epoch": 14.386376981796829, "grad_norm": 2.5, "learning_rate": 0.0007122724603640635, "loss": 6.4388, "step": 367500 }, { "epoch": 14.40595028381288, "grad_norm": 0.93359375, "learning_rate": 0.0007118809943237424, "loss": 6.4346, "step": 368000 }, { "epoch": 14.425523585828929, "grad_norm": 1.34375, "learning_rate": 0.0007114895282834214, "loss": 6.4427, "step": 368500 }, { "epoch": 14.44509688784498, "grad_norm": 1.015625, "learning_rate": 0.0007110980622431005, "loss": 6.4425, "step": 369000 }, { "epoch": 14.46467018986103, "grad_norm": 1.9921875, "learning_rate": 0.0007107065962027795, "loss": 6.4497, "step": 369500 }, { "epoch": 14.484243491877079, "grad_norm": 1.4140625, "learning_rate": 0.0007103151301624584, "loss": 6.4378, "step": 370000 }, { "epoch": 14.50381679389313, "grad_norm": 1.109375, "learning_rate": 0.0007099236641221374, "loss": 6.4414, "step": 370500 }, { "epoch": 14.52339009590918, "grad_norm": 1.1015625, "learning_rate": 0.0007095321980818165, "loss": 6.4258, "step": 371000 }, { "epoch": 14.54296339792523, "grad_norm": 4.8125, "learning_rate": 0.0007091407320414954, "loss": 6.4274, "step": 371500 }, { "epoch": 14.56253669994128, "grad_norm": 4.5625, "learning_rate": 0.0007087492660011744, "loss": 6.4458, "step": 372000 }, { "epoch": 14.58211000195733, "grad_norm": 1.8671875, "learning_rate": 0.0007083577999608534, "loss": 6.4381, "step": 372500 }, { "epoch": 14.601683303973381, "grad_norm": 4.125, "learning_rate": 0.0007079663339205325, "loss": 6.441, "step": 373000 }, { "epoch": 14.62125660598943, "grad_norm": 1.09375, "learning_rate": 0.0007075748678802114, "loss": 6.4338, "step": 373500 }, { "epoch": 14.64082990800548, "grad_norm": 1.953125, "learning_rate": 0.0007071834018398904, "loss": 6.4328, "step": 374000 }, { "epoch": 14.660403210021531, "grad_norm": 3.375, "learning_rate": 0.0007067919357995694, "loss": 6.4386, "step": 374500 }, { "epoch": 14.67997651203758, "grad_norm": 4.6875, "learning_rate": 0.0007064004697592485, "loss": 6.4317, "step": 375000 }, { "epoch": 14.699549814053631, "grad_norm": 1.0234375, "learning_rate": 0.0007060090037189274, "loss": 6.4334, "step": 375500 }, { "epoch": 14.719123116069682, "grad_norm": 2.296875, "learning_rate": 0.0007056175376786064, "loss": 6.4464, "step": 376000 }, { "epoch": 14.73869641808573, "grad_norm": 0.90625, "learning_rate": 0.0007052260716382854, "loss": 6.4376, "step": 376500 }, { "epoch": 14.758269720101781, "grad_norm": 3.546875, "learning_rate": 0.0007048346055979645, "loss": 6.4439, "step": 377000 }, { "epoch": 14.777843022117832, "grad_norm": 1.6171875, "learning_rate": 0.0007044431395576433, "loss": 6.4408, "step": 377500 }, { "epoch": 14.79741632413388, "grad_norm": 2.015625, "learning_rate": 0.0007040516735173223, "loss": 6.434, "step": 378000 }, { "epoch": 14.816989626149931, "grad_norm": 1.8203125, "learning_rate": 0.0007036602074770014, "loss": 6.4404, "step": 378500 }, { "epoch": 14.836562928165982, "grad_norm": 3.484375, "learning_rate": 0.0007032687414366804, "loss": 6.4377, "step": 379000 }, { "epoch": 14.856136230182031, "grad_norm": 1.890625, "learning_rate": 0.0007028772753963593, "loss": 6.4332, "step": 379500 }, { "epoch": 14.875709532198082, "grad_norm": 3.625, "learning_rate": 0.0007024858093560383, "loss": 6.4384, "step": 380000 }, { "epoch": 14.895282834214132, "grad_norm": 1.328125, "learning_rate": 0.0007020943433157174, "loss": 6.4439, "step": 380500 }, { "epoch": 14.914856136230181, "grad_norm": 6.625, "learning_rate": 0.0007017028772753964, "loss": 6.439, "step": 381000 }, { "epoch": 14.934429438246232, "grad_norm": 3.28125, "learning_rate": 0.0007013114112350753, "loss": 6.4396, "step": 381500 }, { "epoch": 14.954002740262283, "grad_norm": 0.828125, "learning_rate": 0.0007009199451947543, "loss": 6.4498, "step": 382000 }, { "epoch": 14.973576042278331, "grad_norm": 1.4140625, "learning_rate": 0.0007005284791544334, "loss": 6.4371, "step": 382500 }, { "epoch": 14.993149344294382, "grad_norm": 24.375, "learning_rate": 0.0007001370131141124, "loss": 6.4382, "step": 383000 }, { "epoch": 15.0, "eval_loss": 6.4391045570373535, "eval_runtime": 20.2509, "eval_samples_per_second": 98.761, "eval_steps_per_second": 6.173, "step": 383175 }, { "epoch": 15.012722646310433, "grad_norm": 4.21875, "learning_rate": 0.0006997455470737913, "loss": 6.4283, "step": 383500 }, { "epoch": 15.032295948326484, "grad_norm": 1.484375, "learning_rate": 0.0006993540810334703, "loss": 6.4405, "step": 384000 }, { "epoch": 15.051869250342532, "grad_norm": 1.1328125, "learning_rate": 0.0006989626149931494, "loss": 6.4427, "step": 384500 }, { "epoch": 15.071442552358583, "grad_norm": 4.625, "learning_rate": 0.0006985711489528284, "loss": 6.4322, "step": 385000 }, { "epoch": 15.091015854374634, "grad_norm": 1.125, "learning_rate": 0.0006981796829125073, "loss": 6.4296, "step": 385500 }, { "epoch": 15.110589156390683, "grad_norm": 2.296875, "learning_rate": 0.0006977882168721863, "loss": 6.437, "step": 386000 }, { "epoch": 15.130162458406733, "grad_norm": 2.078125, "learning_rate": 0.0006973967508318654, "loss": 6.4392, "step": 386500 }, { "epoch": 15.149735760422784, "grad_norm": 4.5625, "learning_rate": 0.0006970052847915443, "loss": 6.4383, "step": 387000 }, { "epoch": 15.169309062438833, "grad_norm": 1.453125, "learning_rate": 0.0006966138187512233, "loss": 6.4385, "step": 387500 }, { "epoch": 15.188882364454884, "grad_norm": 6.0, "learning_rate": 0.0006962223527109024, "loss": 6.4358, "step": 388000 }, { "epoch": 15.208455666470934, "grad_norm": 1.5234375, "learning_rate": 0.0006958308866705814, "loss": 6.4298, "step": 388500 }, { "epoch": 15.228028968486983, "grad_norm": 2.796875, "learning_rate": 0.0006954394206302603, "loss": 6.4386, "step": 389000 }, { "epoch": 15.247602270503034, "grad_norm": 1.546875, "learning_rate": 0.0006950479545899393, "loss": 6.4419, "step": 389500 }, { "epoch": 15.267175572519085, "grad_norm": 0.80859375, "learning_rate": 0.0006946564885496184, "loss": 6.4326, "step": 390000 }, { "epoch": 15.286748874535133, "grad_norm": 1.3046875, "learning_rate": 0.0006942650225092974, "loss": 6.4367, "step": 390500 }, { "epoch": 15.306322176551184, "grad_norm": 1.40625, "learning_rate": 0.0006938735564689763, "loss": 6.4375, "step": 391000 }, { "epoch": 15.325895478567235, "grad_norm": 1.4921875, "learning_rate": 0.0006934820904286553, "loss": 6.4316, "step": 391500 }, { "epoch": 15.345468780583284, "grad_norm": 9.875, "learning_rate": 0.0006930906243883344, "loss": 6.4386, "step": 392000 }, { "epoch": 15.365042082599334, "grad_norm": 1.421875, "learning_rate": 0.0006926991583480134, "loss": 6.4268, "step": 392500 }, { "epoch": 15.384615384615385, "grad_norm": 0.92578125, "learning_rate": 0.0006923076923076923, "loss": 6.4301, "step": 393000 }, { "epoch": 15.404188686631434, "grad_norm": 0.92578125, "learning_rate": 0.0006919162262673713, "loss": 6.4374, "step": 393500 }, { "epoch": 15.423761988647485, "grad_norm": 2.0625, "learning_rate": 0.0006915247602270504, "loss": 6.4423, "step": 394000 }, { "epoch": 15.443335290663535, "grad_norm": 1.5390625, "learning_rate": 0.0006911332941867294, "loss": 6.4376, "step": 394500 }, { "epoch": 15.462908592679586, "grad_norm": 4.34375, "learning_rate": 0.0006907418281464083, "loss": 6.4384, "step": 395000 }, { "epoch": 15.482481894695635, "grad_norm": 5.375, "learning_rate": 0.0006903503621060873, "loss": 6.4293, "step": 395500 }, { "epoch": 15.502055196711686, "grad_norm": 2.203125, "learning_rate": 0.0006899588960657664, "loss": 6.4331, "step": 396000 }, { "epoch": 15.521628498727736, "grad_norm": 1.7734375, "learning_rate": 0.0006895674300254454, "loss": 6.4326, "step": 396500 }, { "epoch": 15.541201800743785, "grad_norm": 1.890625, "learning_rate": 0.0006891759639851242, "loss": 6.4296, "step": 397000 }, { "epoch": 15.560775102759836, "grad_norm": 1.015625, "learning_rate": 0.0006887844979448033, "loss": 6.4363, "step": 397500 }, { "epoch": 15.580348404775886, "grad_norm": 1.4453125, "learning_rate": 0.0006883930319044823, "loss": 6.4393, "step": 398000 }, { "epoch": 15.599921706791935, "grad_norm": 9.6875, "learning_rate": 0.0006880015658641613, "loss": 6.4373, "step": 398500 }, { "epoch": 15.619495008807986, "grad_norm": 2.953125, "learning_rate": 0.0006876100998238402, "loss": 6.4404, "step": 399000 }, { "epoch": 15.639068310824037, "grad_norm": 1.421875, "learning_rate": 0.0006872186337835193, "loss": 6.4434, "step": 399500 }, { "epoch": 15.658641612840086, "grad_norm": 1.40625, "learning_rate": 0.0006868271677431983, "loss": 6.4329, "step": 400000 }, { "epoch": 15.678214914856136, "grad_norm": 1.046875, "learning_rate": 0.0006864357017028773, "loss": 6.4316, "step": 400500 }, { "epoch": 15.697788216872187, "grad_norm": 3.109375, "learning_rate": 0.0006860442356625562, "loss": 6.4376, "step": 401000 }, { "epoch": 15.717361518888236, "grad_norm": 0.875, "learning_rate": 0.0006856527696222353, "loss": 6.4376, "step": 401500 }, { "epoch": 15.736934820904287, "grad_norm": 1.421875, "learning_rate": 0.0006852613035819143, "loss": 6.434, "step": 402000 }, { "epoch": 15.756508122920337, "grad_norm": 1.8359375, "learning_rate": 0.0006848698375415932, "loss": 6.441, "step": 402500 }, { "epoch": 15.776081424936386, "grad_norm": 1.5546875, "learning_rate": 0.0006844783715012722, "loss": 6.4375, "step": 403000 }, { "epoch": 15.795654726952437, "grad_norm": 15.9375, "learning_rate": 0.0006840869054609513, "loss": 6.431, "step": 403500 }, { "epoch": 15.815228028968487, "grad_norm": 1.453125, "learning_rate": 0.0006836954394206303, "loss": 6.4348, "step": 404000 }, { "epoch": 15.834801330984536, "grad_norm": 1.9140625, "learning_rate": 0.0006833039733803092, "loss": 6.4251, "step": 404500 }, { "epoch": 15.854374633000587, "grad_norm": 1.453125, "learning_rate": 0.0006829125073399882, "loss": 6.4409, "step": 405000 }, { "epoch": 15.873947935016638, "grad_norm": 1.3125, "learning_rate": 0.0006825210412996673, "loss": 6.4516, "step": 405500 }, { "epoch": 15.893521237032687, "grad_norm": 2.65625, "learning_rate": 0.0006821295752593463, "loss": 6.4334, "step": 406000 }, { "epoch": 15.913094539048737, "grad_norm": 2.203125, "learning_rate": 0.0006817381092190252, "loss": 6.4318, "step": 406500 }, { "epoch": 15.932667841064788, "grad_norm": 1.3828125, "learning_rate": 0.0006813466431787042, "loss": 6.4306, "step": 407000 }, { "epoch": 15.952241143080837, "grad_norm": 1.1015625, "learning_rate": 0.0006809551771383833, "loss": 6.437, "step": 407500 }, { "epoch": 15.971814445096888, "grad_norm": 0.9375, "learning_rate": 0.0006805637110980623, "loss": 6.4377, "step": 408000 }, { "epoch": 15.991387747112938, "grad_norm": 4.25, "learning_rate": 0.0006801722450577412, "loss": 6.4369, "step": 408500 }, { "epoch": 16.0, "eval_loss": 6.43704080581665, "eval_runtime": 20.4936, "eval_samples_per_second": 97.592, "eval_steps_per_second": 6.099, "step": 408720 }, { "epoch": 16.010961049128987, "grad_norm": 2.109375, "learning_rate": 0.0006797807790174203, "loss": 6.427, "step": 409000 }, { "epoch": 16.03053435114504, "grad_norm": 1.1953125, "learning_rate": 0.0006793893129770993, "loss": 6.4358, "step": 409500 }, { "epoch": 16.05010765316109, "grad_norm": 1.1796875, "learning_rate": 0.0006789978469367783, "loss": 6.4409, "step": 410000 }, { "epoch": 16.069680955177137, "grad_norm": 1.3828125, "learning_rate": 0.0006786063808964572, "loss": 6.439, "step": 410500 }, { "epoch": 16.08925425719319, "grad_norm": 2.203125, "learning_rate": 0.0006782149148561363, "loss": 6.4372, "step": 411000 }, { "epoch": 16.10882755920924, "grad_norm": 3.09375, "learning_rate": 0.0006778234488158153, "loss": 6.4322, "step": 411500 }, { "epoch": 16.128400861225288, "grad_norm": 1.046875, "learning_rate": 0.0006774319827754943, "loss": 6.4346, "step": 412000 }, { "epoch": 16.14797416324134, "grad_norm": 1.0390625, "learning_rate": 0.0006770405167351732, "loss": 6.433, "step": 412500 }, { "epoch": 16.16754746525739, "grad_norm": 1.5390625, "learning_rate": 0.0006766490506948523, "loss": 6.4358, "step": 413000 }, { "epoch": 16.187120767273438, "grad_norm": 1.9921875, "learning_rate": 0.0006762575846545313, "loss": 6.4389, "step": 413500 }, { "epoch": 16.20669406928949, "grad_norm": 1.03125, "learning_rate": 0.0006758661186142103, "loss": 6.4403, "step": 414000 }, { "epoch": 16.22626737130554, "grad_norm": 5.75, "learning_rate": 0.0006754746525738891, "loss": 6.4404, "step": 414500 }, { "epoch": 16.245840673321588, "grad_norm": 1.5546875, "learning_rate": 0.0006750831865335683, "loss": 6.439, "step": 415000 }, { "epoch": 16.26541397533764, "grad_norm": 3.125, "learning_rate": 0.0006746917204932473, "loss": 6.4303, "step": 415500 }, { "epoch": 16.28498727735369, "grad_norm": 4.40625, "learning_rate": 0.0006743002544529263, "loss": 6.4322, "step": 416000 }, { "epoch": 16.30456057936974, "grad_norm": 2.0625, "learning_rate": 0.0006739087884126051, "loss": 6.4349, "step": 416500 }, { "epoch": 16.32413388138579, "grad_norm": 1.1796875, "learning_rate": 0.0006735173223722842, "loss": 6.4351, "step": 417000 }, { "epoch": 16.34370718340184, "grad_norm": 2.0625, "learning_rate": 0.0006731258563319632, "loss": 6.4387, "step": 417500 }, { "epoch": 16.36328048541789, "grad_norm": 1.46875, "learning_rate": 0.0006727343902916421, "loss": 6.4356, "step": 418000 }, { "epoch": 16.38285378743394, "grad_norm": 1.59375, "learning_rate": 0.0006723429242513212, "loss": 6.4408, "step": 418500 }, { "epoch": 16.40242708944999, "grad_norm": 1.953125, "learning_rate": 0.0006719514582110002, "loss": 6.4244, "step": 419000 }, { "epoch": 16.42200039146604, "grad_norm": 1.1640625, "learning_rate": 0.0006715599921706792, "loss": 6.4338, "step": 419500 }, { "epoch": 16.44157369348209, "grad_norm": 2.25, "learning_rate": 0.0006711685261303581, "loss": 6.435, "step": 420000 }, { "epoch": 16.46114699549814, "grad_norm": 1.125, "learning_rate": 0.0006707770600900372, "loss": 6.4414, "step": 420500 }, { "epoch": 16.48072029751419, "grad_norm": 2.234375, "learning_rate": 0.0006703855940497162, "loss": 6.4342, "step": 421000 }, { "epoch": 16.50029359953024, "grad_norm": 2.5, "learning_rate": 0.0006699941280093952, "loss": 6.4382, "step": 421500 }, { "epoch": 16.51986690154629, "grad_norm": 1.2734375, "learning_rate": 0.0006696026619690741, "loss": 6.433, "step": 422000 }, { "epoch": 16.53944020356234, "grad_norm": 1.4609375, "learning_rate": 0.0006692111959287532, "loss": 6.4282, "step": 422500 }, { "epoch": 16.559013505578392, "grad_norm": 1.0546875, "learning_rate": 0.0006688197298884322, "loss": 6.4268, "step": 423000 }, { "epoch": 16.57858680759444, "grad_norm": 2.5, "learning_rate": 0.0006684282638481112, "loss": 6.429, "step": 423500 }, { "epoch": 16.59816010961049, "grad_norm": 0.9609375, "learning_rate": 0.0006680367978077901, "loss": 6.4419, "step": 424000 }, { "epoch": 16.617733411626542, "grad_norm": 1.234375, "learning_rate": 0.0006676453317674692, "loss": 6.4278, "step": 424500 }, { "epoch": 16.63730671364259, "grad_norm": 1.2265625, "learning_rate": 0.0006672538657271482, "loss": 6.4343, "step": 425000 }, { "epoch": 16.656880015658643, "grad_norm": 1.015625, "learning_rate": 0.0006668623996868272, "loss": 6.4376, "step": 425500 }, { "epoch": 16.676453317674692, "grad_norm": 10.6875, "learning_rate": 0.0006664709336465061, "loss": 6.4388, "step": 426000 }, { "epoch": 16.69602661969074, "grad_norm": 1.015625, "learning_rate": 0.0006660794676061852, "loss": 6.434, "step": 426500 }, { "epoch": 16.715599921706794, "grad_norm": 1.4140625, "learning_rate": 0.0006656880015658642, "loss": 6.4432, "step": 427000 }, { "epoch": 16.735173223722843, "grad_norm": 1.09375, "learning_rate": 0.0006652965355255432, "loss": 6.4351, "step": 427500 }, { "epoch": 16.75474652573889, "grad_norm": 1.484375, "learning_rate": 0.0006649050694852222, "loss": 6.4306, "step": 428000 }, { "epoch": 16.774319827754944, "grad_norm": 1.2421875, "learning_rate": 0.0006645136034449012, "loss": 6.4337, "step": 428500 }, { "epoch": 16.793893129770993, "grad_norm": 1.6640625, "learning_rate": 0.0006641221374045802, "loss": 6.4353, "step": 429000 }, { "epoch": 16.81346643178704, "grad_norm": 1.421875, "learning_rate": 0.0006637306713642592, "loss": 6.4256, "step": 429500 }, { "epoch": 16.833039733803094, "grad_norm": 1.6796875, "learning_rate": 0.0006633392053239382, "loss": 6.4321, "step": 430000 }, { "epoch": 16.852613035819143, "grad_norm": 1.8671875, "learning_rate": 0.0006629477392836172, "loss": 6.4416, "step": 430500 }, { "epoch": 16.872186337835192, "grad_norm": 17.625, "learning_rate": 0.0006625562732432962, "loss": 6.4376, "step": 431000 }, { "epoch": 16.891759639851244, "grad_norm": 1.2109375, "learning_rate": 0.0006621648072029752, "loss": 6.4386, "step": 431500 }, { "epoch": 16.911332941867293, "grad_norm": 2.578125, "learning_rate": 0.0006617733411626542, "loss": 6.4389, "step": 432000 }, { "epoch": 16.930906243883342, "grad_norm": 2.59375, "learning_rate": 0.0006613818751223332, "loss": 6.4396, "step": 432500 }, { "epoch": 16.950479545899395, "grad_norm": 4.40625, "learning_rate": 0.0006609904090820122, "loss": 6.4354, "step": 433000 }, { "epoch": 16.970052847915444, "grad_norm": 1.359375, "learning_rate": 0.000660598943041691, "loss": 6.4352, "step": 433500 }, { "epoch": 16.989626149931492, "grad_norm": 1.109375, "learning_rate": 0.0006602074770013702, "loss": 6.4347, "step": 434000 }, { "epoch": 17.0, "eval_loss": 6.436838150024414, "eval_runtime": 20.9495, "eval_samples_per_second": 95.467, "eval_steps_per_second": 5.967, "step": 434265 } ], "logging_steps": 500, "max_steps": 1277250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3275457063057981e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }