{ "best_metric": null, "best_model_checkpoint": null, "epoch": 80.0, "eval_steps": 500, "global_step": 540640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 0.11938641220331192, "learning_rate": 3.122109906777153e-05, "loss": 0.7845, "step": 500 }, { "epoch": 0.15, "grad_norm": 0.11220108717679977, "learning_rate": 3.119219813554306e-05, "loss": 0.7681, "step": 1000 }, { "epoch": 0.22, "grad_norm": 0.20039337873458862, "learning_rate": 3.116329720331459e-05, "loss": 0.7627, "step": 1500 }, { "epoch": 0.3, "grad_norm": 0.20968303084373474, "learning_rate": 3.113439627108612e-05, "loss": 0.751, "step": 2000 }, { "epoch": 0.37, "grad_norm": 0.46042799949645996, "learning_rate": 3.1105495338857653e-05, "loss": 0.7395, "step": 2500 }, { "epoch": 0.44, "grad_norm": 0.5682156682014465, "learning_rate": 3.107659440662918e-05, "loss": 0.7169, "step": 3000 }, { "epoch": 0.52, "grad_norm": 0.446135938167572, "learning_rate": 3.104769347440071e-05, "loss": 0.7037, "step": 3500 }, { "epoch": 0.59, "grad_norm": 0.5436543822288513, "learning_rate": 3.1018792542172244e-05, "loss": 0.6854, "step": 4000 }, { "epoch": 0.67, "grad_norm": 0.5623897314071655, "learning_rate": 3.098989160994377e-05, "loss": 0.6661, "step": 4500 }, { "epoch": 0.74, "grad_norm": 0.7239806652069092, "learning_rate": 3.09609906777153e-05, "loss": 0.6477, "step": 5000 }, { "epoch": 0.81, "grad_norm": 0.6363663077354431, "learning_rate": 3.0932089745486834e-05, "loss": 0.6302, "step": 5500 }, { "epoch": 0.89, "grad_norm": 0.8511515855789185, "learning_rate": 3.090318881325836e-05, "loss": 0.6156, "step": 6000 }, { "epoch": 0.96, "grad_norm": 0.7209456562995911, "learning_rate": 3.087428788102989e-05, "loss": 0.6002, "step": 6500 }, { "epoch": 1.04, "grad_norm": 0.7105280756950378, "learning_rate": 3.0845386948801424e-05, "loss": 0.5852, "step": 7000 }, { "epoch": 1.11, "grad_norm": 0.7035876512527466, "learning_rate": 3.081648601657295e-05, "loss": 0.5734, "step": 7500 }, { "epoch": 1.18, "grad_norm": 0.6755463480949402, "learning_rate": 3.078758508434448e-05, "loss": 0.5619, "step": 8000 }, { "epoch": 1.26, "grad_norm": 0.6636064648628235, "learning_rate": 3.0758684152116015e-05, "loss": 0.5527, "step": 8500 }, { "epoch": 1.33, "grad_norm": 0.7909913063049316, "learning_rate": 3.072978321988754e-05, "loss": 0.545, "step": 9000 }, { "epoch": 1.41, "grad_norm": 0.7935764789581299, "learning_rate": 3.070088228765907e-05, "loss": 0.5342, "step": 9500 }, { "epoch": 1.48, "grad_norm": 0.7649631500244141, "learning_rate": 3.06719813554306e-05, "loss": 0.5264, "step": 10000 }, { "epoch": 1.55, "grad_norm": 0.7262706160545349, "learning_rate": 3.064308042320213e-05, "loss": 0.5194, "step": 10500 }, { "epoch": 1.63, "grad_norm": 0.7068478465080261, "learning_rate": 3.061417949097366e-05, "loss": 0.5137, "step": 11000 }, { "epoch": 1.7, "grad_norm": 0.6415815353393555, "learning_rate": 3.058527855874519e-05, "loss": 0.51, "step": 11500 }, { "epoch": 1.78, "grad_norm": 0.7167455554008484, "learning_rate": 3.055637762651672e-05, "loss": 0.5024, "step": 12000 }, { "epoch": 1.85, "grad_norm": 0.6563605666160583, "learning_rate": 3.052747669428825e-05, "loss": 0.4985, "step": 12500 }, { "epoch": 1.92, "grad_norm": 0.7427666783332825, "learning_rate": 3.049857576205978e-05, "loss": 0.4939, "step": 13000 }, { "epoch": 2.0, "grad_norm": 0.6371767520904541, "learning_rate": 3.046967482983131e-05, "loss": 0.4923, "step": 13500 }, { "epoch": 2.07, "grad_norm": 0.7062104940414429, "learning_rate": 3.044077389760284e-05, "loss": 0.4894, "step": 14000 }, { "epoch": 2.15, "grad_norm": 0.7556993365287781, "learning_rate": 3.041187296537437e-05, "loss": 0.4846, "step": 14500 }, { "epoch": 2.22, "grad_norm": 0.6561410427093506, "learning_rate": 3.03829720331459e-05, "loss": 0.4831, "step": 15000 }, { "epoch": 2.29, "grad_norm": 0.6414974331855774, "learning_rate": 3.0354071100917432e-05, "loss": 0.4807, "step": 15500 }, { "epoch": 2.37, "grad_norm": 0.6632120609283447, "learning_rate": 3.032517016868896e-05, "loss": 0.472, "step": 16000 }, { "epoch": 2.44, "grad_norm": 0.6413108706474304, "learning_rate": 3.029626923646049e-05, "loss": 0.4723, "step": 16500 }, { "epoch": 2.52, "grad_norm": 0.6478744149208069, "learning_rate": 3.0267368304232022e-05, "loss": 0.4692, "step": 17000 }, { "epoch": 2.59, "grad_norm": 0.5901973247528076, "learning_rate": 3.023846737200355e-05, "loss": 0.4672, "step": 17500 }, { "epoch": 2.66, "grad_norm": 0.5960791707038879, "learning_rate": 3.020956643977508e-05, "loss": 0.4649, "step": 18000 }, { "epoch": 2.74, "grad_norm": 0.6265193819999695, "learning_rate": 3.0180665507546612e-05, "loss": 0.4616, "step": 18500 }, { "epoch": 2.81, "grad_norm": 0.6381145119667053, "learning_rate": 3.0151764575318144e-05, "loss": 0.4592, "step": 19000 }, { "epoch": 2.89, "grad_norm": 0.6370628476142883, "learning_rate": 3.012286364308967e-05, "loss": 0.4594, "step": 19500 }, { "epoch": 2.96, "grad_norm": 0.5658203363418579, "learning_rate": 3.0093962710861203e-05, "loss": 0.4542, "step": 20000 }, { "epoch": 3.03, "grad_norm": 0.5123589038848877, "learning_rate": 3.0065061778632734e-05, "loss": 0.4555, "step": 20500 }, { "epoch": 3.11, "grad_norm": 0.5034360289573669, "learning_rate": 3.0036160846404262e-05, "loss": 0.4501, "step": 21000 }, { "epoch": 3.18, "grad_norm": 0.5025657415390015, "learning_rate": 3.0007259914175793e-05, "loss": 0.4501, "step": 21500 }, { "epoch": 3.26, "grad_norm": 0.5448479056358337, "learning_rate": 2.9978358981947324e-05, "loss": 0.4481, "step": 22000 }, { "epoch": 3.33, "grad_norm": 0.5894014239311218, "learning_rate": 2.9949458049718852e-05, "loss": 0.4438, "step": 22500 }, { "epoch": 3.4, "grad_norm": 0.653883159160614, "learning_rate": 2.9920557117490383e-05, "loss": 0.444, "step": 23000 }, { "epoch": 3.48, "grad_norm": 0.4382980167865753, "learning_rate": 2.9891656185261915e-05, "loss": 0.4437, "step": 23500 }, { "epoch": 3.55, "grad_norm": 0.4639624357223511, "learning_rate": 2.9862755253033443e-05, "loss": 0.4398, "step": 24000 }, { "epoch": 3.63, "grad_norm": 0.527728796005249, "learning_rate": 2.9833854320804974e-05, "loss": 0.4386, "step": 24500 }, { "epoch": 3.7, "grad_norm": 0.543736457824707, "learning_rate": 2.9804953388576505e-05, "loss": 0.4392, "step": 25000 }, { "epoch": 3.77, "grad_norm": 0.5280329585075378, "learning_rate": 2.9776052456348033e-05, "loss": 0.4383, "step": 25500 }, { "epoch": 3.85, "grad_norm": 0.4563904106616974, "learning_rate": 2.9747151524119564e-05, "loss": 0.4371, "step": 26000 }, { "epoch": 3.92, "grad_norm": 0.5162687301635742, "learning_rate": 2.9718250591891095e-05, "loss": 0.4367, "step": 26500 }, { "epoch": 4.0, "grad_norm": 0.4838933050632477, "learning_rate": 2.9689349659662623e-05, "loss": 0.4352, "step": 27000 }, { "epoch": 4.07, "grad_norm": 0.5301242470741272, "learning_rate": 2.9660448727434154e-05, "loss": 0.4319, "step": 27500 }, { "epoch": 4.14, "grad_norm": 0.5619557499885559, "learning_rate": 2.9631547795205686e-05, "loss": 0.4303, "step": 28000 }, { "epoch": 4.22, "grad_norm": 0.4900205433368683, "learning_rate": 2.9602646862977214e-05, "loss": 0.4312, "step": 28500 }, { "epoch": 4.29, "grad_norm": 0.46870502829551697, "learning_rate": 2.9573745930748745e-05, "loss": 0.4302, "step": 29000 }, { "epoch": 4.37, "grad_norm": 0.47382786870002747, "learning_rate": 2.9544844998520273e-05, "loss": 0.4287, "step": 29500 }, { "epoch": 4.44, "grad_norm": 0.5594569444656372, "learning_rate": 2.95159440662918e-05, "loss": 0.4284, "step": 30000 }, { "epoch": 4.51, "grad_norm": 0.511375367641449, "learning_rate": 2.9487043134063332e-05, "loss": 0.4262, "step": 30500 }, { "epoch": 4.59, "grad_norm": 0.5069934725761414, "learning_rate": 2.9458142201834863e-05, "loss": 0.4247, "step": 31000 }, { "epoch": 4.66, "grad_norm": 0.5310338139533997, "learning_rate": 2.942924126960639e-05, "loss": 0.4249, "step": 31500 }, { "epoch": 4.74, "grad_norm": 0.4728649854660034, "learning_rate": 2.9400340337377922e-05, "loss": 0.4225, "step": 32000 }, { "epoch": 4.81, "grad_norm": 0.45557233691215515, "learning_rate": 2.9371439405149453e-05, "loss": 0.4241, "step": 32500 }, { "epoch": 4.88, "grad_norm": 0.4630686938762665, "learning_rate": 2.934253847292098e-05, "loss": 0.4212, "step": 33000 }, { "epoch": 4.96, "grad_norm": 0.509099543094635, "learning_rate": 2.9313637540692512e-05, "loss": 0.4215, "step": 33500 }, { "epoch": 5.03, "grad_norm": 0.4747762084007263, "learning_rate": 2.9284736608464044e-05, "loss": 0.4203, "step": 34000 }, { "epoch": 5.11, "grad_norm": 0.43625542521476746, "learning_rate": 2.925583567623557e-05, "loss": 0.4211, "step": 34500 }, { "epoch": 5.18, "grad_norm": 0.44176748394966125, "learning_rate": 2.9226934744007103e-05, "loss": 0.4209, "step": 35000 }, { "epoch": 5.25, "grad_norm": 0.5236085653305054, "learning_rate": 2.9198033811778634e-05, "loss": 0.422, "step": 35500 }, { "epoch": 5.33, "grad_norm": 0.4237843453884125, "learning_rate": 2.9169132879550162e-05, "loss": 0.4163, "step": 36000 }, { "epoch": 5.4, "grad_norm": 0.44581139087677, "learning_rate": 2.9140231947321693e-05, "loss": 0.4152, "step": 36500 }, { "epoch": 5.47, "grad_norm": 0.4488186836242676, "learning_rate": 2.9111331015093224e-05, "loss": 0.4175, "step": 37000 }, { "epoch": 5.55, "grad_norm": 0.5051326751708984, "learning_rate": 2.9082430082864752e-05, "loss": 0.4149, "step": 37500 }, { "epoch": 5.62, "grad_norm": 0.4836309850215912, "learning_rate": 2.9053529150636283e-05, "loss": 0.4138, "step": 38000 }, { "epoch": 5.7, "grad_norm": 0.46710771322250366, "learning_rate": 2.9024628218407815e-05, "loss": 0.4125, "step": 38500 }, { "epoch": 5.77, "grad_norm": 0.39740118384361267, "learning_rate": 2.8995727286179342e-05, "loss": 0.4169, "step": 39000 }, { "epoch": 5.84, "grad_norm": 0.4491262435913086, "learning_rate": 2.8966826353950874e-05, "loss": 0.4136, "step": 39500 }, { "epoch": 5.92, "grad_norm": 0.4240283966064453, "learning_rate": 2.8937925421722405e-05, "loss": 0.4143, "step": 40000 }, { "epoch": 5.99, "grad_norm": 0.43018123507499695, "learning_rate": 2.8909024489493933e-05, "loss": 0.41, "step": 40500 }, { "epoch": 6.07, "grad_norm": 0.49115487933158875, "learning_rate": 2.8880123557265464e-05, "loss": 0.4086, "step": 41000 }, { "epoch": 6.14, "grad_norm": 0.4617484211921692, "learning_rate": 2.8851222625036995e-05, "loss": 0.4111, "step": 41500 }, { "epoch": 6.21, "grad_norm": 0.4269873797893524, "learning_rate": 2.8822321692808523e-05, "loss": 0.4068, "step": 42000 }, { "epoch": 6.29, "grad_norm": 0.45183584094047546, "learning_rate": 2.8793420760580054e-05, "loss": 0.4104, "step": 42500 }, { "epoch": 6.36, "grad_norm": 0.3999849557876587, "learning_rate": 2.8764519828351586e-05, "loss": 0.4074, "step": 43000 }, { "epoch": 6.44, "grad_norm": 0.3897479772567749, "learning_rate": 2.8735618896123113e-05, "loss": 0.4113, "step": 43500 }, { "epoch": 6.51, "grad_norm": 0.36687174439430237, "learning_rate": 2.8706717963894645e-05, "loss": 0.409, "step": 44000 }, { "epoch": 6.58, "grad_norm": 0.41888511180877686, "learning_rate": 2.8677817031666176e-05, "loss": 0.4072, "step": 44500 }, { "epoch": 6.66, "grad_norm": 0.4102098047733307, "learning_rate": 2.8648916099437704e-05, "loss": 0.4081, "step": 45000 }, { "epoch": 6.73, "grad_norm": 0.42067912220954895, "learning_rate": 2.8620015167209235e-05, "loss": 0.4093, "step": 45500 }, { "epoch": 6.81, "grad_norm": 0.45427748560905457, "learning_rate": 2.8591114234980766e-05, "loss": 0.4076, "step": 46000 }, { "epoch": 6.88, "grad_norm": 0.394954115152359, "learning_rate": 2.8562213302752294e-05, "loss": 0.4067, "step": 46500 }, { "epoch": 6.95, "grad_norm": 0.42659953236579895, "learning_rate": 2.8533312370523825e-05, "loss": 0.4062, "step": 47000 }, { "epoch": 7.03, "grad_norm": 0.38056984543800354, "learning_rate": 2.8504411438295357e-05, "loss": 0.4061, "step": 47500 }, { "epoch": 7.1, "grad_norm": 0.368455708026886, "learning_rate": 2.8475510506066884e-05, "loss": 0.4032, "step": 48000 }, { "epoch": 7.18, "grad_norm": 0.44540271162986755, "learning_rate": 2.8446609573838416e-05, "loss": 0.4054, "step": 48500 }, { "epoch": 7.25, "grad_norm": 0.3926877975463867, "learning_rate": 2.8417708641609943e-05, "loss": 0.4024, "step": 49000 }, { "epoch": 7.32, "grad_norm": 0.4288729727268219, "learning_rate": 2.838880770938147e-05, "loss": 0.4013, "step": 49500 }, { "epoch": 7.4, "grad_norm": 0.4729566276073456, "learning_rate": 2.8359906777153003e-05, "loss": 0.4019, "step": 50000 }, { "epoch": 7.47, "grad_norm": 0.46875321865081787, "learning_rate": 2.8331005844924534e-05, "loss": 0.4, "step": 50500 }, { "epoch": 7.55, "grad_norm": 0.63325035572052, "learning_rate": 2.830210491269606e-05, "loss": 0.4008, "step": 51000 }, { "epoch": 7.62, "grad_norm": 0.4186055064201355, "learning_rate": 2.8273203980467593e-05, "loss": 0.4026, "step": 51500 }, { "epoch": 7.69, "grad_norm": 0.3860541880130768, "learning_rate": 2.8244303048239124e-05, "loss": 0.4022, "step": 52000 }, { "epoch": 7.77, "grad_norm": 0.4552393853664398, "learning_rate": 2.8215402116010652e-05, "loss": 0.3979, "step": 52500 }, { "epoch": 7.84, "grad_norm": 0.4990374743938446, "learning_rate": 2.8186501183782183e-05, "loss": 0.4001, "step": 53000 }, { "epoch": 7.92, "grad_norm": 0.46718060970306396, "learning_rate": 2.8157600251553714e-05, "loss": 0.4, "step": 53500 }, { "epoch": 7.99, "grad_norm": 0.45432960987091064, "learning_rate": 2.8128699319325242e-05, "loss": 0.398, "step": 54000 }, { "epoch": 8.06, "grad_norm": 0.40666621923446655, "learning_rate": 2.8099798387096774e-05, "loss": 0.3996, "step": 54500 }, { "epoch": 8.14, "grad_norm": 0.402972936630249, "learning_rate": 2.8070897454868305e-05, "loss": 0.3985, "step": 55000 }, { "epoch": 8.21, "grad_norm": 0.3767193853855133, "learning_rate": 2.8041996522639836e-05, "loss": 0.4, "step": 55500 }, { "epoch": 8.29, "grad_norm": 0.40102022886276245, "learning_rate": 2.8013095590411364e-05, "loss": 0.3987, "step": 56000 }, { "epoch": 8.36, "grad_norm": 0.4435707926750183, "learning_rate": 2.7984194658182895e-05, "loss": 0.3976, "step": 56500 }, { "epoch": 8.43, "grad_norm": 0.39804941415786743, "learning_rate": 2.7955293725954426e-05, "loss": 0.395, "step": 57000 }, { "epoch": 8.51, "grad_norm": 0.41703784465789795, "learning_rate": 2.7926392793725954e-05, "loss": 0.395, "step": 57500 }, { "epoch": 8.58, "grad_norm": 0.4349576234817505, "learning_rate": 2.7897491861497485e-05, "loss": 0.3946, "step": 58000 }, { "epoch": 8.66, "grad_norm": 0.37204691767692566, "learning_rate": 2.7868590929269017e-05, "loss": 0.394, "step": 58500 }, { "epoch": 8.73, "grad_norm": 0.42759761214256287, "learning_rate": 2.7839689997040545e-05, "loss": 0.3949, "step": 59000 }, { "epoch": 8.8, "grad_norm": 0.37754470109939575, "learning_rate": 2.7810789064812076e-05, "loss": 0.3939, "step": 59500 }, { "epoch": 8.88, "grad_norm": 0.3639107346534729, "learning_rate": 2.7781888132583607e-05, "loss": 0.3932, "step": 60000 }, { "epoch": 8.95, "grad_norm": 0.37291327118873596, "learning_rate": 2.7752987200355135e-05, "loss": 0.394, "step": 60500 }, { "epoch": 9.03, "grad_norm": 0.3964773416519165, "learning_rate": 2.7724086268126666e-05, "loss": 0.3959, "step": 61000 }, { "epoch": 9.1, "grad_norm": 0.4025065004825592, "learning_rate": 2.7695185335898197e-05, "loss": 0.3922, "step": 61500 }, { "epoch": 9.17, "grad_norm": 0.5499910116195679, "learning_rate": 2.7666284403669725e-05, "loss": 0.3893, "step": 62000 }, { "epoch": 9.25, "grad_norm": 0.43492835760116577, "learning_rate": 2.7637383471441256e-05, "loss": 0.3942, "step": 62500 }, { "epoch": 9.32, "grad_norm": 0.38981184363365173, "learning_rate": 2.7608482539212788e-05, "loss": 0.3941, "step": 63000 }, { "epoch": 9.4, "grad_norm": 0.4508809745311737, "learning_rate": 2.7579581606984316e-05, "loss": 0.3922, "step": 63500 }, { "epoch": 9.47, "grad_norm": 0.37447696924209595, "learning_rate": 2.7550680674755847e-05, "loss": 0.3905, "step": 64000 }, { "epoch": 9.54, "grad_norm": 0.40094566345214844, "learning_rate": 2.7521779742527378e-05, "loss": 0.3938, "step": 64500 }, { "epoch": 9.62, "grad_norm": 0.46564099192619324, "learning_rate": 2.7492878810298906e-05, "loss": 0.3914, "step": 65000 }, { "epoch": 9.69, "grad_norm": 0.37548139691352844, "learning_rate": 2.7463977878070437e-05, "loss": 0.3923, "step": 65500 }, { "epoch": 9.77, "grad_norm": 0.39845481514930725, "learning_rate": 2.743507694584197e-05, "loss": 0.3904, "step": 66000 }, { "epoch": 9.84, "grad_norm": 0.46478548645973206, "learning_rate": 2.7406176013613496e-05, "loss": 0.3897, "step": 66500 }, { "epoch": 9.91, "grad_norm": 0.5512229204177856, "learning_rate": 2.7377275081385027e-05, "loss": 0.3898, "step": 67000 }, { "epoch": 9.99, "grad_norm": 0.34783828258514404, "learning_rate": 2.734837414915656e-05, "loss": 0.3901, "step": 67500 }, { "epoch": 10.06, "grad_norm": 0.4403396546840668, "learning_rate": 2.7319473216928083e-05, "loss": 0.388, "step": 68000 }, { "epoch": 10.14, "grad_norm": 0.37262195348739624, "learning_rate": 2.7290572284699614e-05, "loss": 0.3869, "step": 68500 }, { "epoch": 10.21, "grad_norm": 0.38222742080688477, "learning_rate": 2.7261671352471146e-05, "loss": 0.3901, "step": 69000 }, { "epoch": 10.28, "grad_norm": 0.614713191986084, "learning_rate": 2.7232770420242673e-05, "loss": 0.3886, "step": 69500 }, { "epoch": 10.36, "grad_norm": 0.4252707362174988, "learning_rate": 2.7203869488014205e-05, "loss": 0.3874, "step": 70000 }, { "epoch": 10.43, "grad_norm": 0.3792737126350403, "learning_rate": 2.7174968555785736e-05, "loss": 0.3855, "step": 70500 }, { "epoch": 10.51, "grad_norm": 0.40962672233581543, "learning_rate": 2.7146067623557264e-05, "loss": 0.3875, "step": 71000 }, { "epoch": 10.58, "grad_norm": 0.38987433910369873, "learning_rate": 2.7117166691328795e-05, "loss": 0.3855, "step": 71500 }, { "epoch": 10.65, "grad_norm": 0.407105028629303, "learning_rate": 2.7088265759100326e-05, "loss": 0.3857, "step": 72000 }, { "epoch": 10.73, "grad_norm": 0.3527330160140991, "learning_rate": 2.7059364826871854e-05, "loss": 0.3869, "step": 72500 }, { "epoch": 10.8, "grad_norm": 0.3859241306781769, "learning_rate": 2.7030463894643385e-05, "loss": 0.3872, "step": 73000 }, { "epoch": 10.88, "grad_norm": 0.3989656865596771, "learning_rate": 2.7001562962414917e-05, "loss": 0.3855, "step": 73500 }, { "epoch": 10.95, "grad_norm": 0.4163249731063843, "learning_rate": 2.6972662030186444e-05, "loss": 0.3855, "step": 74000 }, { "epoch": 11.02, "grad_norm": 0.39577716588974, "learning_rate": 2.6943761097957976e-05, "loss": 0.3848, "step": 74500 }, { "epoch": 11.1, "grad_norm": 0.3792816400527954, "learning_rate": 2.6914860165729507e-05, "loss": 0.3864, "step": 75000 }, { "epoch": 11.17, "grad_norm": 0.3979376554489136, "learning_rate": 2.6885959233501035e-05, "loss": 0.3847, "step": 75500 }, { "epoch": 11.25, "grad_norm": 0.44446560740470886, "learning_rate": 2.6857058301272566e-05, "loss": 0.3851, "step": 76000 }, { "epoch": 11.32, "grad_norm": 0.3826451599597931, "learning_rate": 2.6828157369044097e-05, "loss": 0.385, "step": 76500 }, { "epoch": 11.39, "grad_norm": 0.38595423102378845, "learning_rate": 2.6799256436815625e-05, "loss": 0.3848, "step": 77000 }, { "epoch": 11.47, "grad_norm": 0.4047674238681793, "learning_rate": 2.6770355504587156e-05, "loss": 0.384, "step": 77500 }, { "epoch": 11.54, "grad_norm": 0.3704206347465515, "learning_rate": 2.6741454572358688e-05, "loss": 0.3846, "step": 78000 }, { "epoch": 11.62, "grad_norm": 0.42468497157096863, "learning_rate": 2.6712553640130215e-05, "loss": 0.3847, "step": 78500 }, { "epoch": 11.69, "grad_norm": 0.43300580978393555, "learning_rate": 2.6683652707901747e-05, "loss": 0.3783, "step": 79000 }, { "epoch": 11.76, "grad_norm": 0.3680213689804077, "learning_rate": 2.6654751775673278e-05, "loss": 0.3811, "step": 79500 }, { "epoch": 11.84, "grad_norm": 0.3971569240093231, "learning_rate": 2.6625850843444806e-05, "loss": 0.3816, "step": 80000 }, { "epoch": 11.91, "grad_norm": 0.3454134464263916, "learning_rate": 2.6596949911216337e-05, "loss": 0.38, "step": 80500 }, { "epoch": 11.99, "grad_norm": 0.38850536942481995, "learning_rate": 2.6568048978987868e-05, "loss": 0.3806, "step": 81000 }, { "epoch": 12.06, "grad_norm": 0.41783374547958374, "learning_rate": 2.6539148046759396e-05, "loss": 0.3805, "step": 81500 }, { "epoch": 12.13, "grad_norm": 0.37427714467048645, "learning_rate": 2.6510247114530927e-05, "loss": 0.3844, "step": 82000 }, { "epoch": 12.21, "grad_norm": 0.3917700946331024, "learning_rate": 2.648134618230246e-05, "loss": 0.3818, "step": 82500 }, { "epoch": 12.28, "grad_norm": 0.36409491300582886, "learning_rate": 2.6452445250073986e-05, "loss": 0.3815, "step": 83000 }, { "epoch": 12.36, "grad_norm": 0.4320700764656067, "learning_rate": 2.6423544317845518e-05, "loss": 0.3813, "step": 83500 }, { "epoch": 12.43, "grad_norm": 0.3927746117115021, "learning_rate": 2.639464338561705e-05, "loss": 0.3827, "step": 84000 }, { "epoch": 12.5, "grad_norm": 0.3693206310272217, "learning_rate": 2.6365742453388577e-05, "loss": 0.3803, "step": 84500 }, { "epoch": 12.58, "grad_norm": 0.4206922948360443, "learning_rate": 2.6336841521160108e-05, "loss": 0.3794, "step": 85000 }, { "epoch": 12.65, "grad_norm": 0.35786914825439453, "learning_rate": 2.630794058893164e-05, "loss": 0.3823, "step": 85500 }, { "epoch": 12.73, "grad_norm": 0.4055446982383728, "learning_rate": 2.6279039656703167e-05, "loss": 0.3797, "step": 86000 }, { "epoch": 12.8, "grad_norm": 0.473630428314209, "learning_rate": 2.62501387244747e-05, "loss": 0.3776, "step": 86500 }, { "epoch": 12.87, "grad_norm": 0.36061763763427734, "learning_rate": 2.622123779224623e-05, "loss": 0.3781, "step": 87000 }, { "epoch": 12.95, "grad_norm": 0.4378969371318817, "learning_rate": 2.6192336860017754e-05, "loss": 0.3778, "step": 87500 }, { "epoch": 13.02, "grad_norm": 0.3772602379322052, "learning_rate": 2.6163435927789285e-05, "loss": 0.3776, "step": 88000 }, { "epoch": 13.1, "grad_norm": 0.42682790756225586, "learning_rate": 2.6134534995560817e-05, "loss": 0.3802, "step": 88500 }, { "epoch": 13.17, "grad_norm": 0.38328275084495544, "learning_rate": 2.6105634063332344e-05, "loss": 0.3813, "step": 89000 }, { "epoch": 13.24, "grad_norm": 0.4136464595794678, "learning_rate": 2.6076733131103876e-05, "loss": 0.3773, "step": 89500 }, { "epoch": 13.32, "grad_norm": 0.39002037048339844, "learning_rate": 2.6047832198875407e-05, "loss": 0.3767, "step": 90000 }, { "epoch": 13.39, "grad_norm": 0.4823383092880249, "learning_rate": 2.6018931266646935e-05, "loss": 0.3789, "step": 90500 }, { "epoch": 13.47, "grad_norm": 0.3532434403896332, "learning_rate": 2.5990030334418466e-05, "loss": 0.3755, "step": 91000 }, { "epoch": 13.54, "grad_norm": 0.3406650424003601, "learning_rate": 2.5961129402189997e-05, "loss": 0.3782, "step": 91500 }, { "epoch": 13.61, "grad_norm": 0.42174699902534485, "learning_rate": 2.5932228469961525e-05, "loss": 0.3792, "step": 92000 }, { "epoch": 13.69, "grad_norm": 0.4112718999385834, "learning_rate": 2.5903327537733056e-05, "loss": 0.3758, "step": 92500 }, { "epoch": 13.76, "grad_norm": 0.39170435070991516, "learning_rate": 2.5874426605504588e-05, "loss": 0.3772, "step": 93000 }, { "epoch": 13.84, "grad_norm": 0.35669615864753723, "learning_rate": 2.584552567327612e-05, "loss": 0.376, "step": 93500 }, { "epoch": 13.91, "grad_norm": 0.36161208152770996, "learning_rate": 2.5816624741047647e-05, "loss": 0.3759, "step": 94000 }, { "epoch": 13.98, "grad_norm": 0.3548930883407593, "learning_rate": 2.5787723808819178e-05, "loss": 0.3772, "step": 94500 }, { "epoch": 14.06, "grad_norm": 0.3934749662876129, "learning_rate": 2.575882287659071e-05, "loss": 0.3774, "step": 95000 }, { "epoch": 14.13, "grad_norm": 0.3642575442790985, "learning_rate": 2.5729921944362237e-05, "loss": 0.3756, "step": 95500 }, { "epoch": 14.21, "grad_norm": 0.34236952662467957, "learning_rate": 2.5701021012133768e-05, "loss": 0.3758, "step": 96000 }, { "epoch": 14.28, "grad_norm": 0.40388983488082886, "learning_rate": 2.56721200799053e-05, "loss": 0.3749, "step": 96500 }, { "epoch": 14.35, "grad_norm": 0.432076632976532, "learning_rate": 2.5643219147676827e-05, "loss": 0.3756, "step": 97000 }, { "epoch": 14.43, "grad_norm": 0.3947947025299072, "learning_rate": 2.561431821544836e-05, "loss": 0.3792, "step": 97500 }, { "epoch": 14.5, "grad_norm": 0.3974926173686981, "learning_rate": 2.558541728321989e-05, "loss": 0.3741, "step": 98000 }, { "epoch": 14.58, "grad_norm": 0.3732788860797882, "learning_rate": 2.5556516350991418e-05, "loss": 0.3741, "step": 98500 }, { "epoch": 14.65, "grad_norm": 0.35293856263160706, "learning_rate": 2.552761541876295e-05, "loss": 0.375, "step": 99000 }, { "epoch": 14.72, "grad_norm": 0.38685211539268494, "learning_rate": 2.549871448653448e-05, "loss": 0.3711, "step": 99500 }, { "epoch": 14.8, "grad_norm": 0.40676021575927734, "learning_rate": 2.5469813554306008e-05, "loss": 0.3786, "step": 100000 }, { "epoch": 14.87, "grad_norm": 0.40946874022483826, "learning_rate": 2.544091262207754e-05, "loss": 0.3749, "step": 100500 }, { "epoch": 14.95, "grad_norm": 0.35838282108306885, "learning_rate": 2.541201168984907e-05, "loss": 0.3733, "step": 101000 }, { "epoch": 15.02, "grad_norm": 0.4110182821750641, "learning_rate": 2.5383110757620598e-05, "loss": 0.3712, "step": 101500 }, { "epoch": 15.09, "grad_norm": 0.35349026322364807, "learning_rate": 2.535420982539213e-05, "loss": 0.3719, "step": 102000 }, { "epoch": 15.17, "grad_norm": 0.38222944736480713, "learning_rate": 2.532530889316366e-05, "loss": 0.3737, "step": 102500 }, { "epoch": 15.24, "grad_norm": 0.47066986560821533, "learning_rate": 2.529640796093519e-05, "loss": 0.3728, "step": 103000 }, { "epoch": 15.32, "grad_norm": 0.3949437439441681, "learning_rate": 2.526750702870672e-05, "loss": 0.3752, "step": 103500 }, { "epoch": 15.39, "grad_norm": 0.42243218421936035, "learning_rate": 2.523860609647825e-05, "loss": 0.3729, "step": 104000 }, { "epoch": 15.46, "grad_norm": 0.4031197726726532, "learning_rate": 2.520970516424978e-05, "loss": 0.3704, "step": 104500 }, { "epoch": 15.54, "grad_norm": 0.4034245014190674, "learning_rate": 2.518080423202131e-05, "loss": 0.3713, "step": 105000 }, { "epoch": 15.61, "grad_norm": 0.4237426817417145, "learning_rate": 2.515190329979284e-05, "loss": 0.3719, "step": 105500 }, { "epoch": 15.69, "grad_norm": 0.41453346610069275, "learning_rate": 2.512300236756437e-05, "loss": 0.3742, "step": 106000 }, { "epoch": 15.76, "grad_norm": 0.4238516390323639, "learning_rate": 2.50941014353359e-05, "loss": 0.3734, "step": 106500 }, { "epoch": 15.83, "grad_norm": 0.3762485384941101, "learning_rate": 2.506520050310743e-05, "loss": 0.3736, "step": 107000 }, { "epoch": 15.91, "grad_norm": 0.36851537227630615, "learning_rate": 2.5036299570878956e-05, "loss": 0.3736, "step": 107500 }, { "epoch": 15.98, "grad_norm": 0.36127322912216187, "learning_rate": 2.5007398638650487e-05, "loss": 0.3716, "step": 108000 }, { "epoch": 16.06, "grad_norm": 0.4159682095050812, "learning_rate": 2.497849770642202e-05, "loss": 0.3695, "step": 108500 }, { "epoch": 16.13, "grad_norm": 0.40441277623176575, "learning_rate": 2.4949596774193547e-05, "loss": 0.3702, "step": 109000 }, { "epoch": 16.2, "grad_norm": 0.3531157076358795, "learning_rate": 2.4920695841965078e-05, "loss": 0.3751, "step": 109500 }, { "epoch": 16.28, "grad_norm": 0.40636512637138367, "learning_rate": 2.489179490973661e-05, "loss": 0.3692, "step": 110000 }, { "epoch": 16.35, "grad_norm": 0.3990442156791687, "learning_rate": 2.4862893977508137e-05, "loss": 0.3697, "step": 110500 }, { "epoch": 16.42, "grad_norm": 0.39944297075271606, "learning_rate": 2.4833993045279668e-05, "loss": 0.3683, "step": 111000 }, { "epoch": 16.5, "grad_norm": 0.3601832985877991, "learning_rate": 2.48050921130512e-05, "loss": 0.3699, "step": 111500 }, { "epoch": 16.57, "grad_norm": 0.40571844577789307, "learning_rate": 2.4776191180822727e-05, "loss": 0.3689, "step": 112000 }, { "epoch": 16.65, "grad_norm": 0.40363049507141113, "learning_rate": 2.474729024859426e-05, "loss": 0.3708, "step": 112500 }, { "epoch": 16.72, "grad_norm": 0.3990092873573303, "learning_rate": 2.471838931636579e-05, "loss": 0.3706, "step": 113000 }, { "epoch": 16.79, "grad_norm": 0.36532795429229736, "learning_rate": 2.4689488384137317e-05, "loss": 0.3721, "step": 113500 }, { "epoch": 16.87, "grad_norm": 0.44025781750679016, "learning_rate": 2.466058745190885e-05, "loss": 0.3733, "step": 114000 }, { "epoch": 16.94, "grad_norm": 0.3850398659706116, "learning_rate": 2.463168651968038e-05, "loss": 0.3693, "step": 114500 }, { "epoch": 17.02, "grad_norm": 0.38132092356681824, "learning_rate": 2.4602785587451908e-05, "loss": 0.3703, "step": 115000 }, { "epoch": 17.09, "grad_norm": 0.39702102541923523, "learning_rate": 2.457388465522344e-05, "loss": 0.3706, "step": 115500 }, { "epoch": 17.16, "grad_norm": 0.35070690512657166, "learning_rate": 2.454498372299497e-05, "loss": 0.3706, "step": 116000 }, { "epoch": 17.24, "grad_norm": 0.42235612869262695, "learning_rate": 2.4516082790766498e-05, "loss": 0.3703, "step": 116500 }, { "epoch": 17.31, "grad_norm": 0.35915622115135193, "learning_rate": 2.448718185853803e-05, "loss": 0.3693, "step": 117000 }, { "epoch": 17.39, "grad_norm": 0.34371674060821533, "learning_rate": 2.445828092630956e-05, "loss": 0.3686, "step": 117500 }, { "epoch": 17.46, "grad_norm": 0.48452600836753845, "learning_rate": 2.442937999408109e-05, "loss": 0.3696, "step": 118000 }, { "epoch": 17.53, "grad_norm": 0.39582401514053345, "learning_rate": 2.440047906185262e-05, "loss": 0.3698, "step": 118500 }, { "epoch": 17.61, "grad_norm": 0.43614286184310913, "learning_rate": 2.437157812962415e-05, "loss": 0.3688, "step": 119000 }, { "epoch": 17.68, "grad_norm": 0.3942769765853882, "learning_rate": 2.434267719739568e-05, "loss": 0.368, "step": 119500 }, { "epoch": 17.76, "grad_norm": 0.34341031312942505, "learning_rate": 2.431377626516721e-05, "loss": 0.3665, "step": 120000 }, { "epoch": 17.83, "grad_norm": 0.3661987781524658, "learning_rate": 2.428487533293874e-05, "loss": 0.3664, "step": 120500 }, { "epoch": 17.9, "grad_norm": 0.32992053031921387, "learning_rate": 2.425597440071027e-05, "loss": 0.3683, "step": 121000 }, { "epoch": 17.98, "grad_norm": 0.40151405334472656, "learning_rate": 2.42270734684818e-05, "loss": 0.3677, "step": 121500 }, { "epoch": 18.05, "grad_norm": 0.3343447148799896, "learning_rate": 2.419817253625333e-05, "loss": 0.3679, "step": 122000 }, { "epoch": 18.13, "grad_norm": 0.3798489272594452, "learning_rate": 2.416927160402486e-05, "loss": 0.3687, "step": 122500 }, { "epoch": 18.2, "grad_norm": 0.3244016766548157, "learning_rate": 2.414037067179639e-05, "loss": 0.3643, "step": 123000 }, { "epoch": 18.27, "grad_norm": 0.4036329984664917, "learning_rate": 2.4111469739567922e-05, "loss": 0.3703, "step": 123500 }, { "epoch": 18.35, "grad_norm": 0.3875889778137207, "learning_rate": 2.408256880733945e-05, "loss": 0.3661, "step": 124000 }, { "epoch": 18.42, "grad_norm": 0.3607046902179718, "learning_rate": 2.405366787511098e-05, "loss": 0.3646, "step": 124500 }, { "epoch": 18.5, "grad_norm": 0.33054810762405396, "learning_rate": 2.4024766942882512e-05, "loss": 0.3675, "step": 125000 }, { "epoch": 18.57, "grad_norm": 0.4091895520687103, "learning_rate": 2.399586601065404e-05, "loss": 0.3652, "step": 125500 }, { "epoch": 18.64, "grad_norm": 0.37667781114578247, "learning_rate": 2.396696507842557e-05, "loss": 0.3691, "step": 126000 }, { "epoch": 18.72, "grad_norm": 0.3683590888977051, "learning_rate": 2.39380641461971e-05, "loss": 0.3675, "step": 126500 }, { "epoch": 18.79, "grad_norm": 0.397073894739151, "learning_rate": 2.3909163213968627e-05, "loss": 0.3637, "step": 127000 }, { "epoch": 18.87, "grad_norm": 0.3522073030471802, "learning_rate": 2.3880262281740158e-05, "loss": 0.3676, "step": 127500 }, { "epoch": 18.94, "grad_norm": 0.3389582633972168, "learning_rate": 2.385136134951169e-05, "loss": 0.3676, "step": 128000 }, { "epoch": 19.01, "grad_norm": 0.3726537823677063, "learning_rate": 2.3822460417283217e-05, "loss": 0.3674, "step": 128500 }, { "epoch": 19.09, "grad_norm": 0.3774533271789551, "learning_rate": 2.379355948505475e-05, "loss": 0.3669, "step": 129000 }, { "epoch": 19.16, "grad_norm": 0.45427048206329346, "learning_rate": 2.376465855282628e-05, "loss": 0.3652, "step": 129500 }, { "epoch": 19.24, "grad_norm": 0.39148712158203125, "learning_rate": 2.373575762059781e-05, "loss": 0.3632, "step": 130000 }, { "epoch": 19.31, "grad_norm": 0.3727419078350067, "learning_rate": 2.370685668836934e-05, "loss": 0.3674, "step": 130500 }, { "epoch": 19.38, "grad_norm": 0.3490041196346283, "learning_rate": 2.367795575614087e-05, "loss": 0.3685, "step": 131000 }, { "epoch": 19.46, "grad_norm": 0.33863797783851624, "learning_rate": 2.36490548239124e-05, "loss": 0.3659, "step": 131500 }, { "epoch": 19.53, "grad_norm": 0.41820865869522095, "learning_rate": 2.362015389168393e-05, "loss": 0.3647, "step": 132000 }, { "epoch": 19.61, "grad_norm": 0.31935831904411316, "learning_rate": 2.359125295945546e-05, "loss": 0.3657, "step": 132500 }, { "epoch": 19.68, "grad_norm": 0.39523938298225403, "learning_rate": 2.3562352027226992e-05, "loss": 0.3643, "step": 133000 }, { "epoch": 19.75, "grad_norm": 0.3851146996021271, "learning_rate": 2.353345109499852e-05, "loss": 0.3624, "step": 133500 }, { "epoch": 19.83, "grad_norm": 0.3778953552246094, "learning_rate": 2.350455016277005e-05, "loss": 0.3658, "step": 134000 }, { "epoch": 19.9, "grad_norm": 0.3673354387283325, "learning_rate": 2.3475649230541582e-05, "loss": 0.3645, "step": 134500 }, { "epoch": 19.98, "grad_norm": 0.40675076842308044, "learning_rate": 2.344674829831311e-05, "loss": 0.3624, "step": 135000 }, { "epoch": 20.05, "grad_norm": 0.32396331429481506, "learning_rate": 2.341784736608464e-05, "loss": 0.3608, "step": 135500 }, { "epoch": 20.12, "grad_norm": 0.4665846526622772, "learning_rate": 2.3388946433856172e-05, "loss": 0.3654, "step": 136000 }, { "epoch": 20.2, "grad_norm": 0.3753814697265625, "learning_rate": 2.33600455016277e-05, "loss": 0.3611, "step": 136500 }, { "epoch": 20.27, "grad_norm": 0.39572277665138245, "learning_rate": 2.333114456939923e-05, "loss": 0.363, "step": 137000 }, { "epoch": 20.35, "grad_norm": 0.36638927459716797, "learning_rate": 2.3302243637170763e-05, "loss": 0.3625, "step": 137500 }, { "epoch": 20.42, "grad_norm": 0.40173882246017456, "learning_rate": 2.327334270494229e-05, "loss": 0.3645, "step": 138000 }, { "epoch": 20.49, "grad_norm": 0.34684666991233826, "learning_rate": 2.3244441772713822e-05, "loss": 0.3636, "step": 138500 }, { "epoch": 20.57, "grad_norm": 0.3533775806427002, "learning_rate": 2.3215540840485353e-05, "loss": 0.3624, "step": 139000 }, { "epoch": 20.64, "grad_norm": 0.36431315541267395, "learning_rate": 2.318663990825688e-05, "loss": 0.3649, "step": 139500 }, { "epoch": 20.72, "grad_norm": 0.3629516363143921, "learning_rate": 2.3157738976028412e-05, "loss": 0.3646, "step": 140000 }, { "epoch": 20.79, "grad_norm": 0.383987158536911, "learning_rate": 2.3128838043799943e-05, "loss": 0.3623, "step": 140500 }, { "epoch": 20.86, "grad_norm": 0.38170096278190613, "learning_rate": 2.309993711157147e-05, "loss": 0.3629, "step": 141000 }, { "epoch": 20.94, "grad_norm": 0.36627018451690674, "learning_rate": 2.3071036179343003e-05, "loss": 0.362, "step": 141500 }, { "epoch": 21.01, "grad_norm": 0.37587088346481323, "learning_rate": 2.3042135247114534e-05, "loss": 0.3624, "step": 142000 }, { "epoch": 21.09, "grad_norm": 0.3648183047771454, "learning_rate": 2.301323431488606e-05, "loss": 0.3633, "step": 142500 }, { "epoch": 21.16, "grad_norm": 0.3818926513195038, "learning_rate": 2.2984333382657593e-05, "loss": 0.3627, "step": 143000 }, { "epoch": 21.23, "grad_norm": 0.38963159918785095, "learning_rate": 2.2955432450429124e-05, "loss": 0.3619, "step": 143500 }, { "epoch": 21.31, "grad_norm": 0.4655527174472809, "learning_rate": 2.2926531518200652e-05, "loss": 0.3616, "step": 144000 }, { "epoch": 21.38, "grad_norm": 0.36112433671951294, "learning_rate": 2.2897630585972183e-05, "loss": 0.3597, "step": 144500 }, { "epoch": 21.46, "grad_norm": 0.3661918044090271, "learning_rate": 2.2868729653743714e-05, "loss": 0.3626, "step": 145000 }, { "epoch": 21.53, "grad_norm": 0.340862512588501, "learning_rate": 2.2839828721515242e-05, "loss": 0.3637, "step": 145500 }, { "epoch": 21.6, "grad_norm": 0.3376435339450836, "learning_rate": 2.281092778928677e-05, "loss": 0.3622, "step": 146000 }, { "epoch": 21.68, "grad_norm": 0.4211704730987549, "learning_rate": 2.27820268570583e-05, "loss": 0.3593, "step": 146500 }, { "epoch": 21.75, "grad_norm": 0.3886992037296295, "learning_rate": 2.275312592482983e-05, "loss": 0.3614, "step": 147000 }, { "epoch": 21.83, "grad_norm": 0.34686926007270813, "learning_rate": 2.272422499260136e-05, "loss": 0.3615, "step": 147500 }, { "epoch": 21.9, "grad_norm": 0.4066803455352783, "learning_rate": 2.269532406037289e-05, "loss": 0.3597, "step": 148000 }, { "epoch": 21.97, "grad_norm": 0.37257149815559387, "learning_rate": 2.266642312814442e-05, "loss": 0.364, "step": 148500 }, { "epoch": 22.05, "grad_norm": 0.3770715892314911, "learning_rate": 2.263752219591595e-05, "loss": 0.359, "step": 149000 }, { "epoch": 22.12, "grad_norm": 0.3486000597476959, "learning_rate": 2.2608621263687482e-05, "loss": 0.3616, "step": 149500 }, { "epoch": 22.2, "grad_norm": 0.4026411771774292, "learning_rate": 2.257972033145901e-05, "loss": 0.3615, "step": 150000 }, { "epoch": 22.27, "grad_norm": 0.4740307629108429, "learning_rate": 2.255081939923054e-05, "loss": 0.359, "step": 150500 }, { "epoch": 22.34, "grad_norm": 0.5692958235740662, "learning_rate": 2.2521918467002072e-05, "loss": 0.3614, "step": 151000 }, { "epoch": 22.42, "grad_norm": 0.4546482264995575, "learning_rate": 2.24930175347736e-05, "loss": 0.3609, "step": 151500 }, { "epoch": 22.49, "grad_norm": 0.3762848675251007, "learning_rate": 2.246411660254513e-05, "loss": 0.3612, "step": 152000 }, { "epoch": 22.57, "grad_norm": 0.3631458282470703, "learning_rate": 2.2435215670316663e-05, "loss": 0.3613, "step": 152500 }, { "epoch": 22.64, "grad_norm": 0.3560335040092468, "learning_rate": 2.240631473808819e-05, "loss": 0.3601, "step": 153000 }, { "epoch": 22.71, "grad_norm": 0.3739969730377197, "learning_rate": 2.2377413805859722e-05, "loss": 0.3586, "step": 153500 }, { "epoch": 22.79, "grad_norm": 0.3538212478160858, "learning_rate": 2.2348512873631253e-05, "loss": 0.3619, "step": 154000 }, { "epoch": 22.86, "grad_norm": 0.33162832260131836, "learning_rate": 2.231961194140278e-05, "loss": 0.3566, "step": 154500 }, { "epoch": 22.94, "grad_norm": 0.3731963634490967, "learning_rate": 2.2290711009174312e-05, "loss": 0.3617, "step": 155000 }, { "epoch": 23.01, "grad_norm": 0.3658241033554077, "learning_rate": 2.2261810076945843e-05, "loss": 0.3621, "step": 155500 }, { "epoch": 23.08, "grad_norm": 0.3295973837375641, "learning_rate": 2.223290914471737e-05, "loss": 0.3592, "step": 156000 }, { "epoch": 23.16, "grad_norm": 0.3476080894470215, "learning_rate": 2.2204008212488902e-05, "loss": 0.358, "step": 156500 }, { "epoch": 23.23, "grad_norm": 0.4091341197490692, "learning_rate": 2.2175107280260434e-05, "loss": 0.3588, "step": 157000 }, { "epoch": 23.31, "grad_norm": 0.4243708848953247, "learning_rate": 2.214620634803196e-05, "loss": 0.3581, "step": 157500 }, { "epoch": 23.38, "grad_norm": 0.4200844168663025, "learning_rate": 2.2117305415803493e-05, "loss": 0.3598, "step": 158000 }, { "epoch": 23.45, "grad_norm": 0.4001297652721405, "learning_rate": 2.2088404483575024e-05, "loss": 0.3608, "step": 158500 }, { "epoch": 23.53, "grad_norm": 0.44927239418029785, "learning_rate": 2.2059503551346552e-05, "loss": 0.3572, "step": 159000 }, { "epoch": 23.6, "grad_norm": 0.3438055217266083, "learning_rate": 2.2030602619118083e-05, "loss": 0.3603, "step": 159500 }, { "epoch": 23.68, "grad_norm": 0.395907461643219, "learning_rate": 2.2001701686889614e-05, "loss": 0.3583, "step": 160000 }, { "epoch": 23.75, "grad_norm": 0.3705403506755829, "learning_rate": 2.1972800754661142e-05, "loss": 0.3606, "step": 160500 }, { "epoch": 23.82, "grad_norm": 0.34676727652549744, "learning_rate": 2.1943899822432673e-05, "loss": 0.3589, "step": 161000 }, { "epoch": 23.9, "grad_norm": 0.3712906837463379, "learning_rate": 2.1914998890204205e-05, "loss": 0.3597, "step": 161500 }, { "epoch": 23.97, "grad_norm": 0.3136620819568634, "learning_rate": 2.1886097957975732e-05, "loss": 0.3571, "step": 162000 }, { "epoch": 24.05, "grad_norm": 0.3915017545223236, "learning_rate": 2.1857197025747264e-05, "loss": 0.3595, "step": 162500 }, { "epoch": 24.12, "grad_norm": 0.3403623700141907, "learning_rate": 2.1828296093518795e-05, "loss": 0.3608, "step": 163000 }, { "epoch": 24.19, "grad_norm": 0.3841993510723114, "learning_rate": 2.1799395161290323e-05, "loss": 0.3596, "step": 163500 }, { "epoch": 24.27, "grad_norm": 0.42894381284713745, "learning_rate": 2.1770494229061854e-05, "loss": 0.3571, "step": 164000 }, { "epoch": 24.34, "grad_norm": 0.4211946129798889, "learning_rate": 2.1741593296833385e-05, "loss": 0.3552, "step": 164500 }, { "epoch": 24.42, "grad_norm": 0.35293659567832947, "learning_rate": 2.1712692364604917e-05, "loss": 0.359, "step": 165000 }, { "epoch": 24.49, "grad_norm": 0.3743543326854706, "learning_rate": 2.168379143237644e-05, "loss": 0.3556, "step": 165500 }, { "epoch": 24.56, "grad_norm": 0.4101512134075165, "learning_rate": 2.1654890500147972e-05, "loss": 0.3561, "step": 166000 }, { "epoch": 24.64, "grad_norm": 0.34685665369033813, "learning_rate": 2.1625989567919503e-05, "loss": 0.3596, "step": 166500 }, { "epoch": 24.71, "grad_norm": 0.3677741289138794, "learning_rate": 2.159708863569103e-05, "loss": 0.3572, "step": 167000 }, { "epoch": 24.79, "grad_norm": 0.3836405575275421, "learning_rate": 2.1568187703462563e-05, "loss": 0.36, "step": 167500 }, { "epoch": 24.86, "grad_norm": 0.3649022579193115, "learning_rate": 2.1539286771234094e-05, "loss": 0.3558, "step": 168000 }, { "epoch": 24.93, "grad_norm": 0.3566337525844574, "learning_rate": 2.151038583900562e-05, "loss": 0.3567, "step": 168500 }, { "epoch": 25.01, "grad_norm": 0.4024845361709595, "learning_rate": 2.1481484906777153e-05, "loss": 0.3555, "step": 169000 }, { "epoch": 25.08, "grad_norm": 0.3865801692008972, "learning_rate": 2.1452583974548684e-05, "loss": 0.3587, "step": 169500 }, { "epoch": 25.16, "grad_norm": 0.3753124475479126, "learning_rate": 2.1423683042320212e-05, "loss": 0.358, "step": 170000 }, { "epoch": 25.23, "grad_norm": 0.3889290690422058, "learning_rate": 2.1394782110091743e-05, "loss": 0.3573, "step": 170500 }, { "epoch": 25.3, "grad_norm": 0.425574392080307, "learning_rate": 2.1365881177863274e-05, "loss": 0.3583, "step": 171000 }, { "epoch": 25.38, "grad_norm": 0.35915040969848633, "learning_rate": 2.1336980245634802e-05, "loss": 0.355, "step": 171500 }, { "epoch": 25.45, "grad_norm": 0.3714876174926758, "learning_rate": 2.1308079313406334e-05, "loss": 0.3558, "step": 172000 }, { "epoch": 25.53, "grad_norm": 0.3659971356391907, "learning_rate": 2.1279178381177865e-05, "loss": 0.3526, "step": 172500 }, { "epoch": 25.6, "grad_norm": 0.35083669424057007, "learning_rate": 2.1250277448949393e-05, "loss": 0.3582, "step": 173000 }, { "epoch": 25.67, "grad_norm": 0.3540023863315582, "learning_rate": 2.1221376516720924e-05, "loss": 0.3555, "step": 173500 }, { "epoch": 25.75, "grad_norm": 0.3811222016811371, "learning_rate": 2.1192475584492455e-05, "loss": 0.3557, "step": 174000 }, { "epoch": 25.82, "grad_norm": 0.37513551115989685, "learning_rate": 2.1163574652263983e-05, "loss": 0.3563, "step": 174500 }, { "epoch": 25.9, "grad_norm": 0.4036356508731842, "learning_rate": 2.1134673720035514e-05, "loss": 0.3548, "step": 175000 }, { "epoch": 25.97, "grad_norm": 0.3446299135684967, "learning_rate": 2.1105772787807045e-05, "loss": 0.3573, "step": 175500 }, { "epoch": 26.04, "grad_norm": 0.4351588487625122, "learning_rate": 2.1076871855578573e-05, "loss": 0.3556, "step": 176000 }, { "epoch": 26.12, "grad_norm": 0.38238152861595154, "learning_rate": 2.1047970923350105e-05, "loss": 0.3566, "step": 176500 }, { "epoch": 26.19, "grad_norm": 0.3972441554069519, "learning_rate": 2.1019069991121636e-05, "loss": 0.3533, "step": 177000 }, { "epoch": 26.27, "grad_norm": 0.40132614970207214, "learning_rate": 2.0990169058893164e-05, "loss": 0.3548, "step": 177500 }, { "epoch": 26.34, "grad_norm": 0.3178902864456177, "learning_rate": 2.0961268126664695e-05, "loss": 0.355, "step": 178000 }, { "epoch": 26.41, "grad_norm": 0.4328124225139618, "learning_rate": 2.0932367194436226e-05, "loss": 0.3554, "step": 178500 }, { "epoch": 26.49, "grad_norm": 0.3971725404262543, "learning_rate": 2.0903466262207754e-05, "loss": 0.3549, "step": 179000 }, { "epoch": 26.56, "grad_norm": 0.3241216540336609, "learning_rate": 2.0874565329979285e-05, "loss": 0.3534, "step": 179500 }, { "epoch": 26.64, "grad_norm": 0.3448522984981537, "learning_rate": 2.0845664397750816e-05, "loss": 0.3559, "step": 180000 }, { "epoch": 26.71, "grad_norm": 0.34117060899734497, "learning_rate": 2.0816763465522344e-05, "loss": 0.3555, "step": 180500 }, { "epoch": 26.78, "grad_norm": 0.39051172137260437, "learning_rate": 2.0787862533293876e-05, "loss": 0.3558, "step": 181000 }, { "epoch": 26.86, "grad_norm": 0.3349858820438385, "learning_rate": 2.0758961601065407e-05, "loss": 0.3546, "step": 181500 }, { "epoch": 26.93, "grad_norm": 0.4579429030418396, "learning_rate": 2.0730060668836935e-05, "loss": 0.3537, "step": 182000 }, { "epoch": 27.01, "grad_norm": 0.3789091110229492, "learning_rate": 2.0701159736608466e-05, "loss": 0.3527, "step": 182500 }, { "epoch": 27.08, "grad_norm": 0.43690434098243713, "learning_rate": 2.0672258804379997e-05, "loss": 0.3563, "step": 183000 }, { "epoch": 27.15, "grad_norm": 0.3886288106441498, "learning_rate": 2.0643357872151525e-05, "loss": 0.3543, "step": 183500 }, { "epoch": 27.23, "grad_norm": 0.40548428893089294, "learning_rate": 2.0614456939923056e-05, "loss": 0.353, "step": 184000 }, { "epoch": 27.3, "grad_norm": 0.4054431915283203, "learning_rate": 2.0585556007694584e-05, "loss": 0.3575, "step": 184500 }, { "epoch": 27.37, "grad_norm": 0.3319009840488434, "learning_rate": 2.0556655075466112e-05, "loss": 0.3524, "step": 185000 }, { "epoch": 27.45, "grad_norm": 0.36432087421417236, "learning_rate": 2.0527754143237643e-05, "loss": 0.3556, "step": 185500 }, { "epoch": 27.52, "grad_norm": 0.3561677038669586, "learning_rate": 2.0498853211009174e-05, "loss": 0.3539, "step": 186000 }, { "epoch": 27.6, "grad_norm": 0.41498541831970215, "learning_rate": 2.0469952278780702e-05, "loss": 0.3561, "step": 186500 }, { "epoch": 27.67, "grad_norm": 0.3646217882633209, "learning_rate": 2.0441051346552233e-05, "loss": 0.3519, "step": 187000 }, { "epoch": 27.74, "grad_norm": 0.34534063935279846, "learning_rate": 2.0412150414323765e-05, "loss": 0.3539, "step": 187500 }, { "epoch": 27.82, "grad_norm": 0.4323655962944031, "learning_rate": 2.0383249482095293e-05, "loss": 0.3559, "step": 188000 }, { "epoch": 27.89, "grad_norm": 0.3833807408809662, "learning_rate": 2.0354348549866824e-05, "loss": 0.3532, "step": 188500 }, { "epoch": 27.97, "grad_norm": 0.37557268142700195, "learning_rate": 2.0325447617638355e-05, "loss": 0.3523, "step": 189000 }, { "epoch": 28.04, "grad_norm": 0.37144702672958374, "learning_rate": 2.0296546685409883e-05, "loss": 0.3536, "step": 189500 }, { "epoch": 28.11, "grad_norm": 0.40455296635627747, "learning_rate": 2.0267645753181414e-05, "loss": 0.3534, "step": 190000 }, { "epoch": 28.19, "grad_norm": 0.3639744818210602, "learning_rate": 2.0238744820952945e-05, "loss": 0.3536, "step": 190500 }, { "epoch": 28.26, "grad_norm": 0.38016533851623535, "learning_rate": 2.0209843888724473e-05, "loss": 0.3569, "step": 191000 }, { "epoch": 28.34, "grad_norm": 0.35611262917518616, "learning_rate": 2.0180942956496004e-05, "loss": 0.3539, "step": 191500 }, { "epoch": 28.41, "grad_norm": 0.3586650490760803, "learning_rate": 2.0152042024267536e-05, "loss": 0.3516, "step": 192000 }, { "epoch": 28.48, "grad_norm": 0.3105120062828064, "learning_rate": 2.0123141092039064e-05, "loss": 0.3518, "step": 192500 }, { "epoch": 28.56, "grad_norm": 0.37972891330718994, "learning_rate": 2.0094240159810595e-05, "loss": 0.3558, "step": 193000 }, { "epoch": 28.63, "grad_norm": 0.35530367493629456, "learning_rate": 2.0065339227582126e-05, "loss": 0.3505, "step": 193500 }, { "epoch": 28.71, "grad_norm": 0.42136579751968384, "learning_rate": 2.0036438295353654e-05, "loss": 0.3537, "step": 194000 }, { "epoch": 28.78, "grad_norm": 0.37874168157577515, "learning_rate": 2.0007537363125185e-05, "loss": 0.3505, "step": 194500 }, { "epoch": 28.85, "grad_norm": 0.33442074060440063, "learning_rate": 1.9978636430896716e-05, "loss": 0.3536, "step": 195000 }, { "epoch": 28.93, "grad_norm": 0.37098708748817444, "learning_rate": 1.9949735498668244e-05, "loss": 0.3553, "step": 195500 }, { "epoch": 29.0, "grad_norm": 0.33478862047195435, "learning_rate": 1.9920834566439775e-05, "loss": 0.3527, "step": 196000 }, { "epoch": 29.08, "grad_norm": 0.3783182203769684, "learning_rate": 1.9891933634211307e-05, "loss": 0.3523, "step": 196500 }, { "epoch": 29.15, "grad_norm": 0.32911786437034607, "learning_rate": 1.9863032701982835e-05, "loss": 0.3526, "step": 197000 }, { "epoch": 29.22, "grad_norm": 0.33882907032966614, "learning_rate": 1.9834131769754366e-05, "loss": 0.3507, "step": 197500 }, { "epoch": 29.3, "grad_norm": 0.4318142235279083, "learning_rate": 1.9805230837525897e-05, "loss": 0.3504, "step": 198000 }, { "epoch": 29.37, "grad_norm": 0.33973386883735657, "learning_rate": 1.9776329905297425e-05, "loss": 0.3526, "step": 198500 }, { "epoch": 29.45, "grad_norm": 0.3557802736759186, "learning_rate": 1.9747428973068956e-05, "loss": 0.3511, "step": 199000 }, { "epoch": 29.52, "grad_norm": 0.4430686831474304, "learning_rate": 1.9718528040840487e-05, "loss": 0.3503, "step": 199500 }, { "epoch": 29.59, "grad_norm": 0.33132269978523254, "learning_rate": 1.9689627108612015e-05, "loss": 0.3531, "step": 200000 }, { "epoch": 29.67, "grad_norm": 0.362075537443161, "learning_rate": 1.9660726176383546e-05, "loss": 0.3517, "step": 200500 }, { "epoch": 29.74, "grad_norm": 0.3604036867618561, "learning_rate": 1.9631825244155078e-05, "loss": 0.3558, "step": 201000 }, { "epoch": 29.82, "grad_norm": 0.39711400866508484, "learning_rate": 1.960292431192661e-05, "loss": 0.3515, "step": 201500 }, { "epoch": 29.89, "grad_norm": 0.30394095182418823, "learning_rate": 1.9574023379698137e-05, "loss": 0.352, "step": 202000 }, { "epoch": 29.96, "grad_norm": 0.3903627097606659, "learning_rate": 1.9545122447469668e-05, "loss": 0.3496, "step": 202500 }, { "epoch": 30.04, "grad_norm": 0.3124367296695709, "learning_rate": 1.95162215152412e-05, "loss": 0.3517, "step": 203000 }, { "epoch": 30.11, "grad_norm": 0.4038899540901184, "learning_rate": 1.9487320583012727e-05, "loss": 0.3518, "step": 203500 }, { "epoch": 30.19, "grad_norm": 0.32454368472099304, "learning_rate": 1.9458419650784255e-05, "loss": 0.3517, "step": 204000 }, { "epoch": 30.26, "grad_norm": 0.35400575399398804, "learning_rate": 1.9429518718555786e-05, "loss": 0.3479, "step": 204500 }, { "epoch": 30.33, "grad_norm": 0.424834281206131, "learning_rate": 1.9400617786327314e-05, "loss": 0.3502, "step": 205000 }, { "epoch": 30.41, "grad_norm": 0.4748223125934601, "learning_rate": 1.9371716854098845e-05, "loss": 0.349, "step": 205500 }, { "epoch": 30.48, "grad_norm": 0.3032289445400238, "learning_rate": 1.9342815921870376e-05, "loss": 0.3543, "step": 206000 }, { "epoch": 30.56, "grad_norm": 0.4162702262401581, "learning_rate": 1.9313914989641904e-05, "loss": 0.3518, "step": 206500 }, { "epoch": 30.63, "grad_norm": 0.3512803912162781, "learning_rate": 1.9285014057413436e-05, "loss": 0.3505, "step": 207000 }, { "epoch": 30.7, "grad_norm": 0.3622516989707947, "learning_rate": 1.9256113125184967e-05, "loss": 0.3538, "step": 207500 }, { "epoch": 30.78, "grad_norm": 0.3330516517162323, "learning_rate": 1.9227212192956495e-05, "loss": 0.3489, "step": 208000 }, { "epoch": 30.85, "grad_norm": 0.3457803726196289, "learning_rate": 1.9198311260728026e-05, "loss": 0.352, "step": 208500 }, { "epoch": 30.93, "grad_norm": 0.3154030442237854, "learning_rate": 1.9169410328499557e-05, "loss": 0.3491, "step": 209000 }, { "epoch": 31.0, "grad_norm": 0.46131783723831177, "learning_rate": 1.9140509396271085e-05, "loss": 0.3509, "step": 209500 }, { "epoch": 31.07, "grad_norm": 0.400088369846344, "learning_rate": 1.9111608464042616e-05, "loss": 0.3509, "step": 210000 }, { "epoch": 31.15, "grad_norm": 0.3647451400756836, "learning_rate": 1.9082707531814147e-05, "loss": 0.35, "step": 210500 }, { "epoch": 31.22, "grad_norm": 0.4007732570171356, "learning_rate": 1.9053806599585675e-05, "loss": 0.3504, "step": 211000 }, { "epoch": 31.3, "grad_norm": 0.3861900269985199, "learning_rate": 1.9024905667357207e-05, "loss": 0.3484, "step": 211500 }, { "epoch": 31.37, "grad_norm": 0.411627858877182, "learning_rate": 1.8996004735128738e-05, "loss": 0.3507, "step": 212000 }, { "epoch": 31.44, "grad_norm": 0.35766077041625977, "learning_rate": 1.8967103802900266e-05, "loss": 0.3485, "step": 212500 }, { "epoch": 31.52, "grad_norm": 0.3537013530731201, "learning_rate": 1.8938202870671797e-05, "loss": 0.3517, "step": 213000 }, { "epoch": 31.59, "grad_norm": 0.3919309675693512, "learning_rate": 1.8909301938443328e-05, "loss": 0.3489, "step": 213500 }, { "epoch": 31.67, "grad_norm": 0.3441930115222931, "learning_rate": 1.8880401006214856e-05, "loss": 0.351, "step": 214000 }, { "epoch": 31.74, "grad_norm": 0.38138172030448914, "learning_rate": 1.8851500073986387e-05, "loss": 0.3495, "step": 214500 }, { "epoch": 31.81, "grad_norm": 0.4080500304698944, "learning_rate": 1.882259914175792e-05, "loss": 0.3497, "step": 215000 }, { "epoch": 31.89, "grad_norm": 0.3864932358264923, "learning_rate": 1.8793698209529446e-05, "loss": 0.3525, "step": 215500 }, { "epoch": 31.96, "grad_norm": 0.4017949104309082, "learning_rate": 1.8764797277300978e-05, "loss": 0.3528, "step": 216000 }, { "epoch": 32.04, "grad_norm": 0.3484615087509155, "learning_rate": 1.873589634507251e-05, "loss": 0.3505, "step": 216500 }, { "epoch": 32.11, "grad_norm": 0.34500235319137573, "learning_rate": 1.8706995412844037e-05, "loss": 0.3498, "step": 217000 }, { "epoch": 32.18, "grad_norm": 0.32486996054649353, "learning_rate": 1.8678094480615568e-05, "loss": 0.3501, "step": 217500 }, { "epoch": 32.26, "grad_norm": 0.3440997302532196, "learning_rate": 1.86491935483871e-05, "loss": 0.3504, "step": 218000 }, { "epoch": 32.33, "grad_norm": 0.359846293926239, "learning_rate": 1.8620292616158627e-05, "loss": 0.3491, "step": 218500 }, { "epoch": 32.41, "grad_norm": 0.36168062686920166, "learning_rate": 1.8591391683930158e-05, "loss": 0.3489, "step": 219000 }, { "epoch": 32.48, "grad_norm": 0.43606841564178467, "learning_rate": 1.856249075170169e-05, "loss": 0.3497, "step": 219500 }, { "epoch": 32.55, "grad_norm": 0.3898315727710724, "learning_rate": 1.8533589819473217e-05, "loss": 0.3487, "step": 220000 }, { "epoch": 32.63, "grad_norm": 0.381244421005249, "learning_rate": 1.850468888724475e-05, "loss": 0.3475, "step": 220500 }, { "epoch": 32.7, "grad_norm": 0.41321712732315063, "learning_rate": 1.847578795501628e-05, "loss": 0.3484, "step": 221000 }, { "epoch": 32.78, "grad_norm": 0.3538101017475128, "learning_rate": 1.8446887022787808e-05, "loss": 0.3484, "step": 221500 }, { "epoch": 32.85, "grad_norm": 0.38104715943336487, "learning_rate": 1.841798609055934e-05, "loss": 0.3482, "step": 222000 }, { "epoch": 32.92, "grad_norm": 0.37761756777763367, "learning_rate": 1.838908515833087e-05, "loss": 0.3474, "step": 222500 }, { "epoch": 33.0, "grad_norm": 0.3524073362350464, "learning_rate": 1.8360184226102398e-05, "loss": 0.3518, "step": 223000 }, { "epoch": 33.07, "grad_norm": 0.3452695608139038, "learning_rate": 1.8331283293873926e-05, "loss": 0.3509, "step": 223500 }, { "epoch": 33.15, "grad_norm": 0.4063817262649536, "learning_rate": 1.8302382361645457e-05, "loss": 0.3496, "step": 224000 }, { "epoch": 33.22, "grad_norm": 0.41099056601524353, "learning_rate": 1.8273481429416985e-05, "loss": 0.3489, "step": 224500 }, { "epoch": 33.29, "grad_norm": 0.3691389560699463, "learning_rate": 1.8244580497188516e-05, "loss": 0.3507, "step": 225000 }, { "epoch": 33.37, "grad_norm": 0.36765575408935547, "learning_rate": 1.8215679564960047e-05, "loss": 0.3464, "step": 225500 }, { "epoch": 33.44, "grad_norm": 0.39067772030830383, "learning_rate": 1.8186778632731575e-05, "loss": 0.3479, "step": 226000 }, { "epoch": 33.52, "grad_norm": 0.3263433873653412, "learning_rate": 1.8157877700503106e-05, "loss": 0.3457, "step": 226500 }, { "epoch": 33.59, "grad_norm": 0.45672136545181274, "learning_rate": 1.8128976768274638e-05, "loss": 0.3488, "step": 227000 }, { "epoch": 33.66, "grad_norm": 0.42077481746673584, "learning_rate": 1.8100075836046166e-05, "loss": 0.3447, "step": 227500 }, { "epoch": 33.74, "grad_norm": 0.3391963243484497, "learning_rate": 1.8071174903817697e-05, "loss": 0.3471, "step": 228000 }, { "epoch": 33.81, "grad_norm": 0.3776566684246063, "learning_rate": 1.8042273971589228e-05, "loss": 0.3498, "step": 228500 }, { "epoch": 33.89, "grad_norm": 0.3776949942111969, "learning_rate": 1.8013373039360756e-05, "loss": 0.3463, "step": 229000 }, { "epoch": 33.96, "grad_norm": 0.3695808947086334, "learning_rate": 1.7984472107132287e-05, "loss": 0.3492, "step": 229500 }, { "epoch": 34.03, "grad_norm": 0.36413583159446716, "learning_rate": 1.795557117490382e-05, "loss": 0.3472, "step": 230000 }, { "epoch": 34.11, "grad_norm": 0.39360344409942627, "learning_rate": 1.7926670242675346e-05, "loss": 0.3505, "step": 230500 }, { "epoch": 34.18, "grad_norm": 0.3416905999183655, "learning_rate": 1.7897769310446877e-05, "loss": 0.3472, "step": 231000 }, { "epoch": 34.26, "grad_norm": 0.40496984124183655, "learning_rate": 1.786886837821841e-05, "loss": 0.3477, "step": 231500 }, { "epoch": 34.33, "grad_norm": 0.3441724479198456, "learning_rate": 1.7839967445989937e-05, "loss": 0.3479, "step": 232000 }, { "epoch": 34.4, "grad_norm": 0.37928706407546997, "learning_rate": 1.7811066513761468e-05, "loss": 0.3486, "step": 232500 }, { "epoch": 34.48, "grad_norm": 0.3675363063812256, "learning_rate": 1.7782165581533e-05, "loss": 0.3458, "step": 233000 }, { "epoch": 34.55, "grad_norm": 0.40871405601501465, "learning_rate": 1.7753264649304527e-05, "loss": 0.3493, "step": 233500 }, { "epoch": 34.63, "grad_norm": 0.417258620262146, "learning_rate": 1.7724363717076058e-05, "loss": 0.3453, "step": 234000 }, { "epoch": 34.7, "grad_norm": 0.3210906386375427, "learning_rate": 1.769546278484759e-05, "loss": 0.3457, "step": 234500 }, { "epoch": 34.77, "grad_norm": 0.3412734270095825, "learning_rate": 1.7666561852619117e-05, "loss": 0.3475, "step": 235000 }, { "epoch": 34.85, "grad_norm": 0.39695098996162415, "learning_rate": 1.763766092039065e-05, "loss": 0.3509, "step": 235500 }, { "epoch": 34.92, "grad_norm": 0.36834970116615295, "learning_rate": 1.760875998816218e-05, "loss": 0.3485, "step": 236000 }, { "epoch": 35.0, "grad_norm": 0.3971041738986969, "learning_rate": 1.7579859055933708e-05, "loss": 0.3468, "step": 236500 }, { "epoch": 35.07, "grad_norm": 0.3513786196708679, "learning_rate": 1.755095812370524e-05, "loss": 0.3478, "step": 237000 }, { "epoch": 35.14, "grad_norm": 0.35932984948158264, "learning_rate": 1.752205719147677e-05, "loss": 0.348, "step": 237500 }, { "epoch": 35.22, "grad_norm": 0.40685245394706726, "learning_rate": 1.7493156259248298e-05, "loss": 0.3447, "step": 238000 }, { "epoch": 35.29, "grad_norm": 0.37929338216781616, "learning_rate": 1.746425532701983e-05, "loss": 0.3465, "step": 238500 }, { "epoch": 35.37, "grad_norm": 0.40910473465919495, "learning_rate": 1.743535439479136e-05, "loss": 0.3465, "step": 239000 }, { "epoch": 35.44, "grad_norm": 0.34920281171798706, "learning_rate": 1.740645346256289e-05, "loss": 0.3463, "step": 239500 }, { "epoch": 35.51, "grad_norm": 0.37716421484947205, "learning_rate": 1.737755253033442e-05, "loss": 0.3487, "step": 240000 }, { "epoch": 35.59, "grad_norm": 0.4301624596118927, "learning_rate": 1.734865159810595e-05, "loss": 0.3449, "step": 240500 }, { "epoch": 35.66, "grad_norm": 0.37206390500068665, "learning_rate": 1.7319750665877482e-05, "loss": 0.3453, "step": 241000 }, { "epoch": 35.74, "grad_norm": 0.38183438777923584, "learning_rate": 1.729084973364901e-05, "loss": 0.3493, "step": 241500 }, { "epoch": 35.81, "grad_norm": 0.3732428252696991, "learning_rate": 1.726194880142054e-05, "loss": 0.349, "step": 242000 }, { "epoch": 35.88, "grad_norm": 0.3665318787097931, "learning_rate": 1.7233047869192072e-05, "loss": 0.3482, "step": 242500 }, { "epoch": 35.96, "grad_norm": 0.3693431317806244, "learning_rate": 1.7204146936963597e-05, "loss": 0.3445, "step": 243000 }, { "epoch": 36.03, "grad_norm": 0.3580050766468048, "learning_rate": 1.7175246004735128e-05, "loss": 0.3469, "step": 243500 }, { "epoch": 36.11, "grad_norm": 0.3874383568763733, "learning_rate": 1.714634507250666e-05, "loss": 0.3483, "step": 244000 }, { "epoch": 36.18, "grad_norm": 0.39490264654159546, "learning_rate": 1.7117444140278187e-05, "loss": 0.3479, "step": 244500 }, { "epoch": 36.25, "grad_norm": 0.38647031784057617, "learning_rate": 1.7088543208049718e-05, "loss": 0.3458, "step": 245000 }, { "epoch": 36.33, "grad_norm": 0.3899756073951721, "learning_rate": 1.705964227582125e-05, "loss": 0.3448, "step": 245500 }, { "epoch": 36.4, "grad_norm": 0.4022426903247833, "learning_rate": 1.7030741343592777e-05, "loss": 0.3463, "step": 246000 }, { "epoch": 36.48, "grad_norm": 0.37726154923439026, "learning_rate": 1.700184041136431e-05, "loss": 0.3469, "step": 246500 }, { "epoch": 36.55, "grad_norm": 0.4022297263145447, "learning_rate": 1.697293947913584e-05, "loss": 0.3457, "step": 247000 }, { "epoch": 36.62, "grad_norm": 0.43506869673728943, "learning_rate": 1.6944038546907368e-05, "loss": 0.3467, "step": 247500 }, { "epoch": 36.7, "grad_norm": 0.4157175123691559, "learning_rate": 1.69151376146789e-05, "loss": 0.3443, "step": 248000 }, { "epoch": 36.77, "grad_norm": 0.4038371443748474, "learning_rate": 1.688623668245043e-05, "loss": 0.3408, "step": 248500 }, { "epoch": 36.85, "grad_norm": 0.3598155081272125, "learning_rate": 1.6857335750221958e-05, "loss": 0.3455, "step": 249000 }, { "epoch": 36.92, "grad_norm": 0.3888005018234253, "learning_rate": 1.682843481799349e-05, "loss": 0.3465, "step": 249500 }, { "epoch": 36.99, "grad_norm": 0.3933923840522766, "learning_rate": 1.679953388576502e-05, "loss": 0.3488, "step": 250000 }, { "epoch": 37.07, "grad_norm": 0.36610084772109985, "learning_rate": 1.677063295353655e-05, "loss": 0.3448, "step": 250500 }, { "epoch": 37.14, "grad_norm": 0.3755366802215576, "learning_rate": 1.674173202130808e-05, "loss": 0.3475, "step": 251000 }, { "epoch": 37.22, "grad_norm": 0.3687468469142914, "learning_rate": 1.671283108907961e-05, "loss": 0.3441, "step": 251500 }, { "epoch": 37.29, "grad_norm": 0.3150022327899933, "learning_rate": 1.668393015685114e-05, "loss": 0.3439, "step": 252000 }, { "epoch": 37.36, "grad_norm": 0.37440210580825806, "learning_rate": 1.665502922462267e-05, "loss": 0.3474, "step": 252500 }, { "epoch": 37.44, "grad_norm": 0.3700256943702698, "learning_rate": 1.66261282923942e-05, "loss": 0.3453, "step": 253000 }, { "epoch": 37.51, "grad_norm": 0.40626585483551025, "learning_rate": 1.659722736016573e-05, "loss": 0.345, "step": 253500 }, { "epoch": 37.59, "grad_norm": 0.39384424686431885, "learning_rate": 1.656832642793726e-05, "loss": 0.3465, "step": 254000 }, { "epoch": 37.66, "grad_norm": 0.38323110342025757, "learning_rate": 1.653942549570879e-05, "loss": 0.3458, "step": 254500 }, { "epoch": 37.73, "grad_norm": 0.3620944619178772, "learning_rate": 1.651052456348032e-05, "loss": 0.3457, "step": 255000 }, { "epoch": 37.81, "grad_norm": 0.3920278251171112, "learning_rate": 1.648162363125185e-05, "loss": 0.3433, "step": 255500 }, { "epoch": 37.88, "grad_norm": 0.3547744154930115, "learning_rate": 1.6452722699023382e-05, "loss": 0.3427, "step": 256000 }, { "epoch": 37.96, "grad_norm": 0.3048088252544403, "learning_rate": 1.642382176679491e-05, "loss": 0.3462, "step": 256500 }, { "epoch": 38.03, "grad_norm": 0.3744346499443054, "learning_rate": 1.639492083456644e-05, "loss": 0.3451, "step": 257000 }, { "epoch": 38.1, "grad_norm": 0.3640407621860504, "learning_rate": 1.6366019902337972e-05, "loss": 0.3485, "step": 257500 }, { "epoch": 38.18, "grad_norm": 0.37335264682769775, "learning_rate": 1.63371189701095e-05, "loss": 0.3466, "step": 258000 }, { "epoch": 38.25, "grad_norm": 0.3745587170124054, "learning_rate": 1.630821803788103e-05, "loss": 0.344, "step": 258500 }, { "epoch": 38.32, "grad_norm": 0.462406724691391, "learning_rate": 1.6279317105652562e-05, "loss": 0.3432, "step": 259000 }, { "epoch": 38.4, "grad_norm": 0.3359210193157196, "learning_rate": 1.625041617342409e-05, "loss": 0.3469, "step": 259500 }, { "epoch": 38.47, "grad_norm": 0.3449317514896393, "learning_rate": 1.622151524119562e-05, "loss": 0.3435, "step": 260000 }, { "epoch": 38.55, "grad_norm": 0.4265647232532501, "learning_rate": 1.6192614308967153e-05, "loss": 0.3458, "step": 260500 }, { "epoch": 38.62, "grad_norm": 0.40118905901908875, "learning_rate": 1.616371337673868e-05, "loss": 0.3461, "step": 261000 }, { "epoch": 38.69, "grad_norm": 0.36499762535095215, "learning_rate": 1.6134812444510212e-05, "loss": 0.3457, "step": 261500 }, { "epoch": 38.77, "grad_norm": 0.37067875266075134, "learning_rate": 1.6105911512281743e-05, "loss": 0.3426, "step": 262000 }, { "epoch": 38.84, "grad_norm": 0.402778685092926, "learning_rate": 1.6077010580053268e-05, "loss": 0.3432, "step": 262500 }, { "epoch": 38.92, "grad_norm": 0.37418636679649353, "learning_rate": 1.60481096478248e-05, "loss": 0.347, "step": 263000 }, { "epoch": 38.99, "grad_norm": 0.4147396981716156, "learning_rate": 1.601920871559633e-05, "loss": 0.3456, "step": 263500 }, { "epoch": 39.06, "grad_norm": 0.42823702096939087, "learning_rate": 1.5990307783367858e-05, "loss": 0.342, "step": 264000 }, { "epoch": 39.14, "grad_norm": 0.40999341011047363, "learning_rate": 1.596140685113939e-05, "loss": 0.3413, "step": 264500 }, { "epoch": 39.21, "grad_norm": 0.32551825046539307, "learning_rate": 1.593250591891092e-05, "loss": 0.3433, "step": 265000 }, { "epoch": 39.29, "grad_norm": 0.3688596487045288, "learning_rate": 1.5903604986682448e-05, "loss": 0.3457, "step": 265500 }, { "epoch": 39.36, "grad_norm": 0.39799386262893677, "learning_rate": 1.587470405445398e-05, "loss": 0.343, "step": 266000 }, { "epoch": 39.43, "grad_norm": 0.34967321157455444, "learning_rate": 1.584580312222551e-05, "loss": 0.3418, "step": 266500 }, { "epoch": 39.51, "grad_norm": 0.36091017723083496, "learning_rate": 1.581690218999704e-05, "loss": 0.3435, "step": 267000 }, { "epoch": 39.58, "grad_norm": 0.3361178934574127, "learning_rate": 1.578800125776857e-05, "loss": 0.3471, "step": 267500 }, { "epoch": 39.66, "grad_norm": 0.36311858892440796, "learning_rate": 1.57591003255401e-05, "loss": 0.3442, "step": 268000 }, { "epoch": 39.73, "grad_norm": 0.37522685527801514, "learning_rate": 1.573019939331163e-05, "loss": 0.3432, "step": 268500 }, { "epoch": 39.8, "grad_norm": 0.42775389552116394, "learning_rate": 1.570129846108316e-05, "loss": 0.344, "step": 269000 }, { "epoch": 39.88, "grad_norm": 0.40960633754730225, "learning_rate": 1.567239752885469e-05, "loss": 0.3428, "step": 269500 }, { "epoch": 39.95, "grad_norm": 0.35652443766593933, "learning_rate": 1.564349659662622e-05, "loss": 0.3435, "step": 270000 }, { "epoch": 40.03, "grad_norm": 0.41139841079711914, "learning_rate": 1.561459566439775e-05, "loss": 0.3436, "step": 270500 }, { "epoch": 40.1, "grad_norm": 0.3651288151741028, "learning_rate": 1.5585694732169282e-05, "loss": 0.3447, "step": 271000 }, { "epoch": 40.17, "grad_norm": 0.37484046816825867, "learning_rate": 1.555679379994081e-05, "loss": 0.3456, "step": 271500 }, { "epoch": 40.25, "grad_norm": 0.3306860625743866, "learning_rate": 1.552789286771234e-05, "loss": 0.3428, "step": 272000 }, { "epoch": 40.32, "grad_norm": 0.37026843428611755, "learning_rate": 1.5498991935483872e-05, "loss": 0.3433, "step": 272500 }, { "epoch": 40.4, "grad_norm": 0.3959224224090576, "learning_rate": 1.54700910032554e-05, "loss": 0.3453, "step": 273000 }, { "epoch": 40.47, "grad_norm": 0.3823704719543457, "learning_rate": 1.544119007102693e-05, "loss": 0.3413, "step": 273500 }, { "epoch": 40.54, "grad_norm": 0.33115535974502563, "learning_rate": 1.5412289138798462e-05, "loss": 0.3442, "step": 274000 }, { "epoch": 40.62, "grad_norm": 0.5036399364471436, "learning_rate": 1.538338820656999e-05, "loss": 0.3417, "step": 274500 }, { "epoch": 40.69, "grad_norm": 0.3805595934391022, "learning_rate": 1.535448727434152e-05, "loss": 0.3453, "step": 275000 }, { "epoch": 40.77, "grad_norm": 0.4390459656715393, "learning_rate": 1.5325586342113053e-05, "loss": 0.343, "step": 275500 }, { "epoch": 40.84, "grad_norm": 0.3673398792743683, "learning_rate": 1.5296685409884584e-05, "loss": 0.3402, "step": 276000 }, { "epoch": 40.91, "grad_norm": 0.36677980422973633, "learning_rate": 1.5267784477656112e-05, "loss": 0.3418, "step": 276500 }, { "epoch": 40.99, "grad_norm": 0.37628763914108276, "learning_rate": 1.5238883545427641e-05, "loss": 0.3423, "step": 277000 }, { "epoch": 41.06, "grad_norm": 0.3959880769252777, "learning_rate": 1.5209982613199171e-05, "loss": 0.3413, "step": 277500 }, { "epoch": 41.14, "grad_norm": 0.35615137219429016, "learning_rate": 1.51810816809707e-05, "loss": 0.3438, "step": 278000 }, { "epoch": 41.21, "grad_norm": 0.4133353531360626, "learning_rate": 1.5152180748742232e-05, "loss": 0.3429, "step": 278500 }, { "epoch": 41.28, "grad_norm": 0.35143953561782837, "learning_rate": 1.5123279816513761e-05, "loss": 0.3437, "step": 279000 }, { "epoch": 41.36, "grad_norm": 0.37390509247779846, "learning_rate": 1.509437888428529e-05, "loss": 0.341, "step": 279500 }, { "epoch": 41.43, "grad_norm": 0.39959460496902466, "learning_rate": 1.5065477952056822e-05, "loss": 0.3423, "step": 280000 }, { "epoch": 41.51, "grad_norm": 0.3992210030555725, "learning_rate": 1.5036577019828352e-05, "loss": 0.3462, "step": 280500 }, { "epoch": 41.58, "grad_norm": 0.3677886724472046, "learning_rate": 1.5007676087599881e-05, "loss": 0.3427, "step": 281000 }, { "epoch": 41.65, "grad_norm": 0.3906817138195038, "learning_rate": 1.4978775155371412e-05, "loss": 0.3415, "step": 281500 }, { "epoch": 41.73, "grad_norm": 0.39412081241607666, "learning_rate": 1.4949874223142942e-05, "loss": 0.3434, "step": 282000 }, { "epoch": 41.8, "grad_norm": 0.4120485782623291, "learning_rate": 1.4920973290914471e-05, "loss": 0.344, "step": 282500 }, { "epoch": 41.88, "grad_norm": 0.4464050829410553, "learning_rate": 1.4892072358686003e-05, "loss": 0.34, "step": 283000 }, { "epoch": 41.95, "grad_norm": 0.3044414520263672, "learning_rate": 1.4863171426457532e-05, "loss": 0.3417, "step": 283500 }, { "epoch": 42.02, "grad_norm": 0.32554033398628235, "learning_rate": 1.4834270494229062e-05, "loss": 0.3417, "step": 284000 }, { "epoch": 42.1, "grad_norm": 0.3944820463657379, "learning_rate": 1.4805369562000593e-05, "loss": 0.3436, "step": 284500 }, { "epoch": 42.17, "grad_norm": 0.37802961468696594, "learning_rate": 1.4776468629772123e-05, "loss": 0.3439, "step": 285000 }, { "epoch": 42.25, "grad_norm": 0.4089604616165161, "learning_rate": 1.4747567697543654e-05, "loss": 0.3408, "step": 285500 }, { "epoch": 42.32, "grad_norm": 0.37038755416870117, "learning_rate": 1.4718666765315183e-05, "loss": 0.3426, "step": 286000 }, { "epoch": 42.39, "grad_norm": 0.35514524579048157, "learning_rate": 1.4689765833086711e-05, "loss": 0.3451, "step": 286500 }, { "epoch": 42.47, "grad_norm": 0.4710882902145386, "learning_rate": 1.4660864900858242e-05, "loss": 0.3429, "step": 287000 }, { "epoch": 42.54, "grad_norm": 0.34002232551574707, "learning_rate": 1.4631963968629772e-05, "loss": 0.3418, "step": 287500 }, { "epoch": 42.62, "grad_norm": 0.4424833059310913, "learning_rate": 1.4603063036401302e-05, "loss": 0.3425, "step": 288000 }, { "epoch": 42.69, "grad_norm": 0.4154003858566284, "learning_rate": 1.4574162104172833e-05, "loss": 0.3428, "step": 288500 }, { "epoch": 42.76, "grad_norm": 0.40983182191848755, "learning_rate": 1.4545261171944362e-05, "loss": 0.343, "step": 289000 }, { "epoch": 42.84, "grad_norm": 0.3568328320980072, "learning_rate": 1.4516360239715892e-05, "loss": 0.3387, "step": 289500 }, { "epoch": 42.91, "grad_norm": 0.3948083221912384, "learning_rate": 1.4487459307487423e-05, "loss": 0.3388, "step": 290000 }, { "epoch": 42.99, "grad_norm": 0.3891525864601135, "learning_rate": 1.4458558375258953e-05, "loss": 0.3415, "step": 290500 }, { "epoch": 43.06, "grad_norm": 0.3312503695487976, "learning_rate": 1.4429657443030482e-05, "loss": 0.3407, "step": 291000 }, { "epoch": 43.13, "grad_norm": 0.37773850560188293, "learning_rate": 1.4400756510802013e-05, "loss": 0.3411, "step": 291500 }, { "epoch": 43.21, "grad_norm": 0.29978179931640625, "learning_rate": 1.4371855578573543e-05, "loss": 0.3423, "step": 292000 }, { "epoch": 43.28, "grad_norm": 0.4314406216144562, "learning_rate": 1.4342954646345072e-05, "loss": 0.3373, "step": 292500 }, { "epoch": 43.36, "grad_norm": 0.3975353538990021, "learning_rate": 1.4314053714116604e-05, "loss": 0.3405, "step": 293000 }, { "epoch": 43.43, "grad_norm": 0.35734960436820984, "learning_rate": 1.4285152781888133e-05, "loss": 0.3416, "step": 293500 }, { "epoch": 43.5, "grad_norm": 0.44908031821250916, "learning_rate": 1.4256251849659663e-05, "loss": 0.3392, "step": 294000 }, { "epoch": 43.58, "grad_norm": 0.3516298532485962, "learning_rate": 1.4227350917431194e-05, "loss": 0.3385, "step": 294500 }, { "epoch": 43.65, "grad_norm": 0.3821066915988922, "learning_rate": 1.4198449985202724e-05, "loss": 0.3448, "step": 295000 }, { "epoch": 43.73, "grad_norm": 0.3824633061885834, "learning_rate": 1.4169549052974253e-05, "loss": 0.3415, "step": 295500 }, { "epoch": 43.8, "grad_norm": 0.3336328864097595, "learning_rate": 1.4140648120745784e-05, "loss": 0.3403, "step": 296000 }, { "epoch": 43.87, "grad_norm": 0.41100433468818665, "learning_rate": 1.4111747188517312e-05, "loss": 0.3428, "step": 296500 }, { "epoch": 43.95, "grad_norm": 0.38574787974357605, "learning_rate": 1.4082846256288842e-05, "loss": 0.3388, "step": 297000 }, { "epoch": 44.02, "grad_norm": 0.3482591509819031, "learning_rate": 1.4053945324060373e-05, "loss": 0.3392, "step": 297500 }, { "epoch": 44.1, "grad_norm": 0.39932170510292053, "learning_rate": 1.4025044391831903e-05, "loss": 0.3385, "step": 298000 }, { "epoch": 44.17, "grad_norm": 0.3750057518482208, "learning_rate": 1.3996143459603432e-05, "loss": 0.3419, "step": 298500 }, { "epoch": 44.24, "grad_norm": 0.3343985974788666, "learning_rate": 1.3967242527374963e-05, "loss": 0.3398, "step": 299000 }, { "epoch": 44.32, "grad_norm": 0.32805758714675903, "learning_rate": 1.3938341595146493e-05, "loss": 0.339, "step": 299500 }, { "epoch": 44.39, "grad_norm": 0.376280814409256, "learning_rate": 1.3909440662918022e-05, "loss": 0.3407, "step": 300000 }, { "epoch": 44.47, "grad_norm": 0.27181968092918396, "learning_rate": 1.3880539730689554e-05, "loss": 0.3392, "step": 300500 }, { "epoch": 44.54, "grad_norm": 0.3351672887802124, "learning_rate": 1.3851638798461083e-05, "loss": 0.3416, "step": 301000 }, { "epoch": 44.61, "grad_norm": 0.3780210614204407, "learning_rate": 1.3822737866232613e-05, "loss": 0.3429, "step": 301500 }, { "epoch": 44.69, "grad_norm": 0.3951726257801056, "learning_rate": 1.3793836934004144e-05, "loss": 0.3407, "step": 302000 }, { "epoch": 44.76, "grad_norm": 0.3675825893878937, "learning_rate": 1.3764936001775674e-05, "loss": 0.339, "step": 302500 }, { "epoch": 44.84, "grad_norm": 0.3629719913005829, "learning_rate": 1.3736035069547205e-05, "loss": 0.3431, "step": 303000 }, { "epoch": 44.91, "grad_norm": 0.4170212745666504, "learning_rate": 1.3707134137318734e-05, "loss": 0.3394, "step": 303500 }, { "epoch": 44.98, "grad_norm": 0.3839627206325531, "learning_rate": 1.3678233205090264e-05, "loss": 0.3422, "step": 304000 }, { "epoch": 45.06, "grad_norm": 0.4395899176597595, "learning_rate": 1.3649332272861795e-05, "loss": 0.3402, "step": 304500 }, { "epoch": 45.13, "grad_norm": 0.37044864892959595, "learning_rate": 1.3620431340633325e-05, "loss": 0.342, "step": 305000 }, { "epoch": 45.21, "grad_norm": 0.4869653582572937, "learning_rate": 1.3591530408404854e-05, "loss": 0.3391, "step": 305500 }, { "epoch": 45.28, "grad_norm": 0.416748046875, "learning_rate": 1.3562629476176384e-05, "loss": 0.3394, "step": 306000 }, { "epoch": 45.35, "grad_norm": 0.37101954221725464, "learning_rate": 1.3533728543947913e-05, "loss": 0.3425, "step": 306500 }, { "epoch": 45.43, "grad_norm": 0.3808073103427887, "learning_rate": 1.3504827611719443e-05, "loss": 0.3399, "step": 307000 }, { "epoch": 45.5, "grad_norm": 0.3832837641239166, "learning_rate": 1.3475926679490974e-05, "loss": 0.3413, "step": 307500 }, { "epoch": 45.58, "grad_norm": 0.3216901421546936, "learning_rate": 1.3447025747262504e-05, "loss": 0.3403, "step": 308000 }, { "epoch": 45.65, "grad_norm": 0.36098387837409973, "learning_rate": 1.3418124815034033e-05, "loss": 0.3426, "step": 308500 }, { "epoch": 45.72, "grad_norm": 0.5177834033966064, "learning_rate": 1.3389223882805564e-05, "loss": 0.3411, "step": 309000 }, { "epoch": 45.8, "grad_norm": 0.41095811128616333, "learning_rate": 1.3360322950577094e-05, "loss": 0.3381, "step": 309500 }, { "epoch": 45.87, "grad_norm": 0.38759657740592957, "learning_rate": 1.3331422018348624e-05, "loss": 0.3385, "step": 310000 }, { "epoch": 45.95, "grad_norm": 0.34995037317276, "learning_rate": 1.3302521086120155e-05, "loss": 0.3387, "step": 310500 }, { "epoch": 46.02, "grad_norm": 0.40866127610206604, "learning_rate": 1.3273620153891684e-05, "loss": 0.3406, "step": 311000 }, { "epoch": 46.09, "grad_norm": 0.40558964014053345, "learning_rate": 1.3244719221663214e-05, "loss": 0.34, "step": 311500 }, { "epoch": 46.17, "grad_norm": 0.3268815875053406, "learning_rate": 1.3215818289434745e-05, "loss": 0.3428, "step": 312000 }, { "epoch": 46.24, "grad_norm": 0.4113875925540924, "learning_rate": 1.3186917357206275e-05, "loss": 0.3397, "step": 312500 }, { "epoch": 46.32, "grad_norm": 0.3797847032546997, "learning_rate": 1.3158016424977804e-05, "loss": 0.3404, "step": 313000 }, { "epoch": 46.39, "grad_norm": 0.3348693251609802, "learning_rate": 1.3129115492749335e-05, "loss": 0.34, "step": 313500 }, { "epoch": 46.46, "grad_norm": 0.3879573941230774, "learning_rate": 1.3100214560520865e-05, "loss": 0.341, "step": 314000 }, { "epoch": 46.54, "grad_norm": 0.40568268299102783, "learning_rate": 1.3071313628292394e-05, "loss": 0.3387, "step": 314500 }, { "epoch": 46.61, "grad_norm": 0.4025332033634186, "learning_rate": 1.3042412696063926e-05, "loss": 0.3415, "step": 315000 }, { "epoch": 46.69, "grad_norm": 0.38457706570625305, "learning_rate": 1.3013511763835455e-05, "loss": 0.3372, "step": 315500 }, { "epoch": 46.76, "grad_norm": 0.36499732732772827, "learning_rate": 1.2984610831606983e-05, "loss": 0.3402, "step": 316000 }, { "epoch": 46.83, "grad_norm": 0.5976847410202026, "learning_rate": 1.2955709899378514e-05, "loss": 0.3402, "step": 316500 }, { "epoch": 46.91, "grad_norm": 0.38978490233421326, "learning_rate": 1.2926808967150044e-05, "loss": 0.3378, "step": 317000 }, { "epoch": 46.98, "grad_norm": 0.4110495448112488, "learning_rate": 1.2897908034921573e-05, "loss": 0.339, "step": 317500 }, { "epoch": 47.06, "grad_norm": 0.37475350499153137, "learning_rate": 1.2869007102693105e-05, "loss": 0.3402, "step": 318000 }, { "epoch": 47.13, "grad_norm": 0.34179574251174927, "learning_rate": 1.2840106170464634e-05, "loss": 0.3397, "step": 318500 }, { "epoch": 47.2, "grad_norm": 0.3006201982498169, "learning_rate": 1.2811205238236164e-05, "loss": 0.342, "step": 319000 }, { "epoch": 47.28, "grad_norm": 0.38113319873809814, "learning_rate": 1.2782304306007695e-05, "loss": 0.341, "step": 319500 }, { "epoch": 47.35, "grad_norm": 0.48638325929641724, "learning_rate": 1.2753403373779225e-05, "loss": 0.3403, "step": 320000 }, { "epoch": 47.43, "grad_norm": 0.38058334589004517, "learning_rate": 1.2724502441550754e-05, "loss": 0.3401, "step": 320500 }, { "epoch": 47.5, "grad_norm": 0.3784169554710388, "learning_rate": 1.2695601509322285e-05, "loss": 0.3378, "step": 321000 }, { "epoch": 47.57, "grad_norm": 0.3679274916648865, "learning_rate": 1.2666700577093815e-05, "loss": 0.3388, "step": 321500 }, { "epoch": 47.65, "grad_norm": 0.4005269706249237, "learning_rate": 1.2637799644865346e-05, "loss": 0.3382, "step": 322000 }, { "epoch": 47.72, "grad_norm": 0.36187744140625, "learning_rate": 1.2608898712636876e-05, "loss": 0.3372, "step": 322500 }, { "epoch": 47.8, "grad_norm": 0.4188271164894104, "learning_rate": 1.2579997780408405e-05, "loss": 0.339, "step": 323000 }, { "epoch": 47.87, "grad_norm": 0.40314343571662903, "learning_rate": 1.2551096848179936e-05, "loss": 0.3404, "step": 323500 }, { "epoch": 47.94, "grad_norm": 0.4233045279979706, "learning_rate": 1.2522195915951466e-05, "loss": 0.3398, "step": 324000 }, { "epoch": 48.02, "grad_norm": 0.3816595673561096, "learning_rate": 1.2493294983722996e-05, "loss": 0.3382, "step": 324500 }, { "epoch": 48.09, "grad_norm": 0.4000893831253052, "learning_rate": 1.2464394051494527e-05, "loss": 0.3402, "step": 325000 }, { "epoch": 48.17, "grad_norm": 0.4130527377128601, "learning_rate": 1.2435493119266055e-05, "loss": 0.3394, "step": 325500 }, { "epoch": 48.24, "grad_norm": 0.38113564252853394, "learning_rate": 1.2406592187037584e-05, "loss": 0.3365, "step": 326000 }, { "epoch": 48.31, "grad_norm": 0.346966415643692, "learning_rate": 1.2377691254809115e-05, "loss": 0.3395, "step": 326500 }, { "epoch": 48.39, "grad_norm": 0.4276494085788727, "learning_rate": 1.2348790322580645e-05, "loss": 0.3362, "step": 327000 }, { "epoch": 48.46, "grad_norm": 0.39347225427627563, "learning_rate": 1.2319889390352175e-05, "loss": 0.3422, "step": 327500 }, { "epoch": 48.54, "grad_norm": 0.3483811616897583, "learning_rate": 1.2290988458123706e-05, "loss": 0.3395, "step": 328000 }, { "epoch": 48.61, "grad_norm": 0.36153608560562134, "learning_rate": 1.2262087525895235e-05, "loss": 0.3365, "step": 328500 }, { "epoch": 48.68, "grad_norm": 0.39289888739585876, "learning_rate": 1.2233186593666765e-05, "loss": 0.3421, "step": 329000 }, { "epoch": 48.76, "grad_norm": 0.4176575839519501, "learning_rate": 1.2204285661438296e-05, "loss": 0.3364, "step": 329500 }, { "epoch": 48.83, "grad_norm": 0.3840237855911255, "learning_rate": 1.2175384729209826e-05, "loss": 0.3357, "step": 330000 }, { "epoch": 48.91, "grad_norm": 0.44171571731567383, "learning_rate": 1.2146483796981355e-05, "loss": 0.3413, "step": 330500 }, { "epoch": 48.98, "grad_norm": 0.42055392265319824, "learning_rate": 1.2117582864752886e-05, "loss": 0.335, "step": 331000 }, { "epoch": 49.05, "grad_norm": 0.44252675771713257, "learning_rate": 1.2088681932524416e-05, "loss": 0.3385, "step": 331500 }, { "epoch": 49.13, "grad_norm": 0.378095805644989, "learning_rate": 1.2059781000295946e-05, "loss": 0.3413, "step": 332000 }, { "epoch": 49.2, "grad_norm": 0.3892216980457306, "learning_rate": 1.2030880068067477e-05, "loss": 0.3374, "step": 332500 }, { "epoch": 49.27, "grad_norm": 0.3788653612136841, "learning_rate": 1.2001979135839006e-05, "loss": 0.3398, "step": 333000 }, { "epoch": 49.35, "grad_norm": 0.38030126690864563, "learning_rate": 1.1973078203610536e-05, "loss": 0.3375, "step": 333500 }, { "epoch": 49.42, "grad_norm": 0.4031144082546234, "learning_rate": 1.1944177271382067e-05, "loss": 0.3398, "step": 334000 }, { "epoch": 49.5, "grad_norm": 0.3956519663333893, "learning_rate": 1.1915276339153597e-05, "loss": 0.3374, "step": 334500 }, { "epoch": 49.57, "grad_norm": 0.3961743414402008, "learning_rate": 1.1886375406925124e-05, "loss": 0.3349, "step": 335000 }, { "epoch": 49.64, "grad_norm": 0.3616986572742462, "learning_rate": 1.1857474474696656e-05, "loss": 0.3374, "step": 335500 }, { "epoch": 49.72, "grad_norm": 0.36143624782562256, "learning_rate": 1.1828573542468185e-05, "loss": 0.3361, "step": 336000 }, { "epoch": 49.79, "grad_norm": 0.389981746673584, "learning_rate": 1.1799672610239715e-05, "loss": 0.3378, "step": 336500 }, { "epoch": 49.87, "grad_norm": 0.4078088104724884, "learning_rate": 1.1770771678011246e-05, "loss": 0.3382, "step": 337000 }, { "epoch": 49.94, "grad_norm": 0.3802012801170349, "learning_rate": 1.1741870745782776e-05, "loss": 0.3356, "step": 337500 }, { "epoch": 50.01, "grad_norm": 0.46680396795272827, "learning_rate": 1.1712969813554305e-05, "loss": 0.339, "step": 338000 }, { "epoch": 50.09, "grad_norm": 0.45273512601852417, "learning_rate": 1.1684068881325836e-05, "loss": 0.3381, "step": 338500 }, { "epoch": 50.16, "grad_norm": 0.3563522398471832, "learning_rate": 1.1655167949097366e-05, "loss": 0.3371, "step": 339000 }, { "epoch": 50.24, "grad_norm": 0.43655216693878174, "learning_rate": 1.1626267016868895e-05, "loss": 0.3354, "step": 339500 }, { "epoch": 50.31, "grad_norm": 0.3371317982673645, "learning_rate": 1.1597366084640427e-05, "loss": 0.3374, "step": 340000 }, { "epoch": 50.38, "grad_norm": 0.39056339859962463, "learning_rate": 1.1568465152411956e-05, "loss": 0.3376, "step": 340500 }, { "epoch": 50.46, "grad_norm": 0.40476441383361816, "learning_rate": 1.1539564220183487e-05, "loss": 0.3381, "step": 341000 }, { "epoch": 50.53, "grad_norm": 0.3706866502761841, "learning_rate": 1.1510663287955017e-05, "loss": 0.3355, "step": 341500 }, { "epoch": 50.61, "grad_norm": 0.43677544593811035, "learning_rate": 1.1481762355726547e-05, "loss": 0.3352, "step": 342000 }, { "epoch": 50.68, "grad_norm": 0.3938286602497101, "learning_rate": 1.1452861423498078e-05, "loss": 0.3375, "step": 342500 }, { "epoch": 50.75, "grad_norm": 0.3463038504123688, "learning_rate": 1.1423960491269607e-05, "loss": 0.3379, "step": 343000 }, { "epoch": 50.83, "grad_norm": 0.3810366988182068, "learning_rate": 1.1395059559041137e-05, "loss": 0.3367, "step": 343500 }, { "epoch": 50.9, "grad_norm": 0.3845095932483673, "learning_rate": 1.1366158626812668e-05, "loss": 0.3366, "step": 344000 }, { "epoch": 50.98, "grad_norm": 0.5161909461021423, "learning_rate": 1.1337257694584198e-05, "loss": 0.3382, "step": 344500 }, { "epoch": 51.05, "grad_norm": 0.4319625794887543, "learning_rate": 1.1308356762355726e-05, "loss": 0.3359, "step": 345000 }, { "epoch": 51.12, "grad_norm": 0.34908732771873474, "learning_rate": 1.1279455830127257e-05, "loss": 0.3353, "step": 345500 }, { "epoch": 51.2, "grad_norm": 0.38367515802383423, "learning_rate": 1.1250554897898786e-05, "loss": 0.3367, "step": 346000 }, { "epoch": 51.27, "grad_norm": 0.3939116597175598, "learning_rate": 1.1221653965670316e-05, "loss": 0.3374, "step": 346500 }, { "epoch": 51.35, "grad_norm": 0.44843488931655884, "learning_rate": 1.1192753033441847e-05, "loss": 0.3376, "step": 347000 }, { "epoch": 51.42, "grad_norm": 0.4169764816761017, "learning_rate": 1.1163852101213377e-05, "loss": 0.3385, "step": 347500 }, { "epoch": 51.49, "grad_norm": 0.3487055003643036, "learning_rate": 1.1134951168984906e-05, "loss": 0.3372, "step": 348000 }, { "epoch": 51.57, "grad_norm": 0.3876706063747406, "learning_rate": 1.1106050236756437e-05, "loss": 0.3379, "step": 348500 }, { "epoch": 51.64, "grad_norm": 0.33344700932502747, "learning_rate": 1.1077149304527967e-05, "loss": 0.3389, "step": 349000 }, { "epoch": 51.72, "grad_norm": 0.41183948516845703, "learning_rate": 1.1048248372299497e-05, "loss": 0.3363, "step": 349500 }, { "epoch": 51.79, "grad_norm": 0.3549967110157013, "learning_rate": 1.1019347440071028e-05, "loss": 0.3374, "step": 350000 }, { "epoch": 51.86, "grad_norm": 0.4144490659236908, "learning_rate": 1.0990446507842557e-05, "loss": 0.3347, "step": 350500 }, { "epoch": 51.94, "grad_norm": 0.3781343400478363, "learning_rate": 1.0961545575614087e-05, "loss": 0.3365, "step": 351000 }, { "epoch": 52.01, "grad_norm": 0.4050437808036804, "learning_rate": 1.0932644643385618e-05, "loss": 0.3384, "step": 351500 }, { "epoch": 52.09, "grad_norm": 0.3758808374404907, "learning_rate": 1.0903743711157148e-05, "loss": 0.3382, "step": 352000 }, { "epoch": 52.16, "grad_norm": 0.456534206867218, "learning_rate": 1.0874842778928677e-05, "loss": 0.3392, "step": 352500 }, { "epoch": 52.23, "grad_norm": 0.38857728242874146, "learning_rate": 1.0845941846700208e-05, "loss": 0.3374, "step": 353000 }, { "epoch": 52.31, "grad_norm": 0.39419788122177124, "learning_rate": 1.0817040914471738e-05, "loss": 0.3374, "step": 353500 }, { "epoch": 52.38, "grad_norm": 0.41852855682373047, "learning_rate": 1.0788139982243268e-05, "loss": 0.3335, "step": 354000 }, { "epoch": 52.46, "grad_norm": 0.3561359941959381, "learning_rate": 1.0759239050014797e-05, "loss": 0.3359, "step": 354500 }, { "epoch": 52.53, "grad_norm": 0.3975025713443756, "learning_rate": 1.0730338117786327e-05, "loss": 0.336, "step": 355000 }, { "epoch": 52.6, "grad_norm": 0.39150169491767883, "learning_rate": 1.0701437185557856e-05, "loss": 0.337, "step": 355500 }, { "epoch": 52.68, "grad_norm": 0.404354453086853, "learning_rate": 1.0672536253329387e-05, "loss": 0.3378, "step": 356000 }, { "epoch": 52.75, "grad_norm": 0.3414269685745239, "learning_rate": 1.0643635321100917e-05, "loss": 0.3338, "step": 356500 }, { "epoch": 52.83, "grad_norm": 0.4378945827484131, "learning_rate": 1.0614734388872446e-05, "loss": 0.3369, "step": 357000 }, { "epoch": 52.9, "grad_norm": 0.5136425495147705, "learning_rate": 1.0585833456643978e-05, "loss": 0.3348, "step": 357500 }, { "epoch": 52.97, "grad_norm": 0.3793259263038635, "learning_rate": 1.0556932524415507e-05, "loss": 0.3354, "step": 358000 }, { "epoch": 53.05, "grad_norm": 0.3828275203704834, "learning_rate": 1.0528031592187039e-05, "loss": 0.3348, "step": 358500 }, { "epoch": 53.12, "grad_norm": 0.380776971578598, "learning_rate": 1.0499130659958568e-05, "loss": 0.3375, "step": 359000 }, { "epoch": 53.2, "grad_norm": 0.40259137749671936, "learning_rate": 1.0470229727730098e-05, "loss": 0.3366, "step": 359500 }, { "epoch": 53.27, "grad_norm": 0.3794288635253906, "learning_rate": 1.0441328795501629e-05, "loss": 0.3343, "step": 360000 }, { "epoch": 53.34, "grad_norm": 0.44558051228523254, "learning_rate": 1.0412427863273158e-05, "loss": 0.3355, "step": 360500 }, { "epoch": 53.42, "grad_norm": 0.42926931381225586, "learning_rate": 1.0383526931044688e-05, "loss": 0.3368, "step": 361000 }, { "epoch": 53.49, "grad_norm": 0.3846406936645508, "learning_rate": 1.035462599881622e-05, "loss": 0.3363, "step": 361500 }, { "epoch": 53.57, "grad_norm": 0.43000903725624084, "learning_rate": 1.0325725066587749e-05, "loss": 0.3338, "step": 362000 }, { "epoch": 53.64, "grad_norm": 0.42310836911201477, "learning_rate": 1.0296824134359278e-05, "loss": 0.336, "step": 362500 }, { "epoch": 53.71, "grad_norm": 0.3451327681541443, "learning_rate": 1.026792320213081e-05, "loss": 0.3384, "step": 363000 }, { "epoch": 53.79, "grad_norm": 0.4068630337715149, "learning_rate": 1.0239022269902339e-05, "loss": 0.3389, "step": 363500 }, { "epoch": 53.86, "grad_norm": 0.36988091468811035, "learning_rate": 1.0210121337673869e-05, "loss": 0.3368, "step": 364000 }, { "epoch": 53.94, "grad_norm": 0.37670448422431946, "learning_rate": 1.0181220405445398e-05, "loss": 0.3361, "step": 364500 }, { "epoch": 54.01, "grad_norm": 0.4235304296016693, "learning_rate": 1.0152319473216928e-05, "loss": 0.3339, "step": 365000 }, { "epoch": 54.08, "grad_norm": 0.4179520606994629, "learning_rate": 1.0123418540988457e-05, "loss": 0.3372, "step": 365500 }, { "epoch": 54.16, "grad_norm": 0.3763734996318817, "learning_rate": 1.0094517608759988e-05, "loss": 0.3368, "step": 366000 }, { "epoch": 54.23, "grad_norm": 0.4098796844482422, "learning_rate": 1.0065616676531518e-05, "loss": 0.3326, "step": 366500 }, { "epoch": 54.31, "grad_norm": 0.41570228338241577, "learning_rate": 1.0036715744303048e-05, "loss": 0.3366, "step": 367000 }, { "epoch": 54.38, "grad_norm": 0.38217049837112427, "learning_rate": 1.0007814812074579e-05, "loss": 0.3338, "step": 367500 }, { "epoch": 54.45, "grad_norm": 0.36770564317703247, "learning_rate": 9.978913879846108e-06, "loss": 0.3323, "step": 368000 }, { "epoch": 54.53, "grad_norm": 0.43568935990333557, "learning_rate": 9.950012947617638e-06, "loss": 0.3361, "step": 368500 }, { "epoch": 54.6, "grad_norm": 0.47602441906929016, "learning_rate": 9.921112015389169e-06, "loss": 0.3349, "step": 369000 }, { "epoch": 54.68, "grad_norm": 0.4022866487503052, "learning_rate": 9.892211083160699e-06, "loss": 0.3347, "step": 369500 }, { "epoch": 54.75, "grad_norm": 0.3981685936450958, "learning_rate": 9.863310150932228e-06, "loss": 0.3351, "step": 370000 }, { "epoch": 54.82, "grad_norm": 0.3706594705581665, "learning_rate": 9.83440921870376e-06, "loss": 0.3342, "step": 370500 }, { "epoch": 54.9, "grad_norm": 0.36316171288490295, "learning_rate": 9.805508286475289e-06, "loss": 0.337, "step": 371000 }, { "epoch": 54.97, "grad_norm": 0.3705138564109802, "learning_rate": 9.776607354246819e-06, "loss": 0.3358, "step": 371500 }, { "epoch": 55.05, "grad_norm": 0.4264328181743622, "learning_rate": 9.74770642201835e-06, "loss": 0.3349, "step": 372000 }, { "epoch": 55.12, "grad_norm": 0.39624592661857605, "learning_rate": 9.71880548978988e-06, "loss": 0.3327, "step": 372500 }, { "epoch": 55.19, "grad_norm": 0.41520076990127563, "learning_rate": 9.689904557561409e-06, "loss": 0.3363, "step": 373000 }, { "epoch": 55.27, "grad_norm": 0.37249574065208435, "learning_rate": 9.66100362533294e-06, "loss": 0.335, "step": 373500 }, { "epoch": 55.34, "grad_norm": 0.42657721042633057, "learning_rate": 9.632102693104468e-06, "loss": 0.3353, "step": 374000 }, { "epoch": 55.42, "grad_norm": 0.3780669569969177, "learning_rate": 9.603201760875998e-06, "loss": 0.337, "step": 374500 }, { "epoch": 55.49, "grad_norm": 0.3783871829509735, "learning_rate": 9.574300828647529e-06, "loss": 0.3348, "step": 375000 }, { "epoch": 55.56, "grad_norm": 0.4328089952468872, "learning_rate": 9.545399896419058e-06, "loss": 0.3366, "step": 375500 }, { "epoch": 55.64, "grad_norm": 0.3957238793373108, "learning_rate": 9.516498964190588e-06, "loss": 0.3344, "step": 376000 }, { "epoch": 55.71, "grad_norm": 0.3606773614883423, "learning_rate": 9.487598031962119e-06, "loss": 0.3342, "step": 376500 }, { "epoch": 55.79, "grad_norm": 0.4170531928539276, "learning_rate": 9.458697099733649e-06, "loss": 0.3349, "step": 377000 }, { "epoch": 55.86, "grad_norm": 0.3830915093421936, "learning_rate": 9.42979616750518e-06, "loss": 0.3371, "step": 377500 }, { "epoch": 55.93, "grad_norm": 0.4350239634513855, "learning_rate": 9.40089523527671e-06, "loss": 0.3377, "step": 378000 }, { "epoch": 56.01, "grad_norm": 0.37382885813713074, "learning_rate": 9.371994303048239e-06, "loss": 0.3362, "step": 378500 }, { "epoch": 56.08, "grad_norm": 0.3806856870651245, "learning_rate": 9.34309337081977e-06, "loss": 0.3347, "step": 379000 }, { "epoch": 56.16, "grad_norm": 0.3189554214477539, "learning_rate": 9.3141924385913e-06, "loss": 0.3363, "step": 379500 }, { "epoch": 56.23, "grad_norm": 0.33894240856170654, "learning_rate": 9.28529150636283e-06, "loss": 0.3362, "step": 380000 }, { "epoch": 56.3, "grad_norm": 0.4565516710281372, "learning_rate": 9.25639057413436e-06, "loss": 0.3331, "step": 380500 }, { "epoch": 56.38, "grad_norm": 0.4101388156414032, "learning_rate": 9.22748964190589e-06, "loss": 0.335, "step": 381000 }, { "epoch": 56.45, "grad_norm": 0.40449845790863037, "learning_rate": 9.19858870967742e-06, "loss": 0.3337, "step": 381500 }, { "epoch": 56.53, "grad_norm": 0.47349539399147034, "learning_rate": 9.16968777744895e-06, "loss": 0.3328, "step": 382000 }, { "epoch": 56.6, "grad_norm": 0.42848438024520874, "learning_rate": 9.14078684522048e-06, "loss": 0.334, "step": 382500 }, { "epoch": 56.67, "grad_norm": 0.3625510334968567, "learning_rate": 9.11188591299201e-06, "loss": 0.3321, "step": 383000 }, { "epoch": 56.75, "grad_norm": 0.3561297357082367, "learning_rate": 9.082984980763541e-06, "loss": 0.3349, "step": 383500 }, { "epoch": 56.82, "grad_norm": 0.3738841414451599, "learning_rate": 9.054084048535069e-06, "loss": 0.3366, "step": 384000 }, { "epoch": 56.9, "grad_norm": 0.33738991618156433, "learning_rate": 9.025183116306599e-06, "loss": 0.3327, "step": 384500 }, { "epoch": 56.97, "grad_norm": 0.42749759554862976, "learning_rate": 8.99628218407813e-06, "loss": 0.336, "step": 385000 }, { "epoch": 57.04, "grad_norm": 0.4089387059211731, "learning_rate": 8.96738125184966e-06, "loss": 0.3334, "step": 385500 }, { "epoch": 57.12, "grad_norm": 0.3684140145778656, "learning_rate": 8.938480319621189e-06, "loss": 0.3345, "step": 386000 }, { "epoch": 57.19, "grad_norm": 0.3694292902946472, "learning_rate": 8.90957938739272e-06, "loss": 0.3333, "step": 386500 }, { "epoch": 57.27, "grad_norm": 0.31505081057548523, "learning_rate": 8.88067845516425e-06, "loss": 0.3339, "step": 387000 }, { "epoch": 57.34, "grad_norm": 0.4051445722579956, "learning_rate": 8.85177752293578e-06, "loss": 0.3348, "step": 387500 }, { "epoch": 57.41, "grad_norm": 0.426145076751709, "learning_rate": 8.82287659070731e-06, "loss": 0.3307, "step": 388000 }, { "epoch": 57.49, "grad_norm": 0.4356764256954193, "learning_rate": 8.79397565847884e-06, "loss": 0.3336, "step": 388500 }, { "epoch": 57.56, "grad_norm": 0.39635592699050903, "learning_rate": 8.76507472625037e-06, "loss": 0.3355, "step": 389000 }, { "epoch": 57.64, "grad_norm": 0.4467043876647949, "learning_rate": 8.7361737940219e-06, "loss": 0.3369, "step": 389500 }, { "epoch": 57.71, "grad_norm": 0.5042401552200317, "learning_rate": 8.70727286179343e-06, "loss": 0.3352, "step": 390000 }, { "epoch": 57.78, "grad_norm": 0.38742733001708984, "learning_rate": 8.67837192956496e-06, "loss": 0.3349, "step": 390500 }, { "epoch": 57.86, "grad_norm": 0.35748493671417236, "learning_rate": 8.649470997336491e-06, "loss": 0.3331, "step": 391000 }, { "epoch": 57.93, "grad_norm": 0.406547486782074, "learning_rate": 8.62057006510802e-06, "loss": 0.3345, "step": 391500 }, { "epoch": 58.01, "grad_norm": 0.37016528844833374, "learning_rate": 8.59166913287955e-06, "loss": 0.3338, "step": 392000 }, { "epoch": 58.08, "grad_norm": 0.39589524269104004, "learning_rate": 8.562768200651081e-06, "loss": 0.3334, "step": 392500 }, { "epoch": 58.15, "grad_norm": 0.42654627561569214, "learning_rate": 8.533867268422611e-06, "loss": 0.3336, "step": 393000 }, { "epoch": 58.23, "grad_norm": 0.4174553453922272, "learning_rate": 8.504966336194139e-06, "loss": 0.3339, "step": 393500 }, { "epoch": 58.3, "grad_norm": 0.43379977345466614, "learning_rate": 8.47606540396567e-06, "loss": 0.3329, "step": 394000 }, { "epoch": 58.38, "grad_norm": 0.3706502914428711, "learning_rate": 8.4471644717372e-06, "loss": 0.332, "step": 394500 }, { "epoch": 58.45, "grad_norm": 0.4529905319213867, "learning_rate": 8.41826353950873e-06, "loss": 0.3342, "step": 395000 }, { "epoch": 58.52, "grad_norm": 0.4060870110988617, "learning_rate": 8.38936260728026e-06, "loss": 0.3331, "step": 395500 }, { "epoch": 58.6, "grad_norm": 0.4102860689163208, "learning_rate": 8.36046167505179e-06, "loss": 0.3339, "step": 396000 }, { "epoch": 58.67, "grad_norm": 0.38025009632110596, "learning_rate": 8.331560742823321e-06, "loss": 0.3334, "step": 396500 }, { "epoch": 58.75, "grad_norm": 0.3559959828853607, "learning_rate": 8.30265981059485e-06, "loss": 0.334, "step": 397000 }, { "epoch": 58.82, "grad_norm": 0.48199519515037537, "learning_rate": 8.27375887836638e-06, "loss": 0.3328, "step": 397500 }, { "epoch": 58.89, "grad_norm": 0.40932905673980713, "learning_rate": 8.244857946137912e-06, "loss": 0.3314, "step": 398000 }, { "epoch": 58.97, "grad_norm": 0.4070405960083008, "learning_rate": 8.215957013909441e-06, "loss": 0.3354, "step": 398500 }, { "epoch": 59.04, "grad_norm": 0.392281711101532, "learning_rate": 8.18705608168097e-06, "loss": 0.3324, "step": 399000 }, { "epoch": 59.12, "grad_norm": 0.38242244720458984, "learning_rate": 8.158155149452502e-06, "loss": 0.3313, "step": 399500 }, { "epoch": 59.19, "grad_norm": 0.4169810712337494, "learning_rate": 8.129254217224031e-06, "loss": 0.3354, "step": 400000 }, { "epoch": 59.26, "grad_norm": 0.335362046957016, "learning_rate": 8.100353284995561e-06, "loss": 0.3312, "step": 400500 }, { "epoch": 59.34, "grad_norm": 0.41095077991485596, "learning_rate": 8.071452352767092e-06, "loss": 0.3331, "step": 401000 }, { "epoch": 59.41, "grad_norm": 0.39492741227149963, "learning_rate": 8.042551420538622e-06, "loss": 0.3314, "step": 401500 }, { "epoch": 59.49, "grad_norm": 0.42789730429649353, "learning_rate": 8.013650488310151e-06, "loss": 0.333, "step": 402000 }, { "epoch": 59.56, "grad_norm": 0.35511842370033264, "learning_rate": 7.984749556081683e-06, "loss": 0.3346, "step": 402500 }, { "epoch": 59.63, "grad_norm": 0.36928626894950867, "learning_rate": 7.95584862385321e-06, "loss": 0.335, "step": 403000 }, { "epoch": 59.71, "grad_norm": 0.4076744318008423, "learning_rate": 7.92694769162474e-06, "loss": 0.3294, "step": 403500 }, { "epoch": 59.78, "grad_norm": 0.35494473576545715, "learning_rate": 7.898046759396271e-06, "loss": 0.3336, "step": 404000 }, { "epoch": 59.85, "grad_norm": 0.3991703689098358, "learning_rate": 7.8691458271678e-06, "loss": 0.3294, "step": 404500 }, { "epoch": 59.93, "grad_norm": 0.3891808092594147, "learning_rate": 7.84024489493933e-06, "loss": 0.3349, "step": 405000 }, { "epoch": 60.0, "grad_norm": 0.5921450257301331, "learning_rate": 7.811343962710861e-06, "loss": 0.3331, "step": 405500 }, { "epoch": 60.08, "grad_norm": 0.387185275554657, "learning_rate": 7.782443030482391e-06, "loss": 0.3326, "step": 406000 }, { "epoch": 60.15, "grad_norm": 0.5411362648010254, "learning_rate": 7.75354209825392e-06, "loss": 0.3303, "step": 406500 }, { "epoch": 60.22, "grad_norm": 0.35113802552223206, "learning_rate": 7.724641166025452e-06, "loss": 0.3343, "step": 407000 }, { "epoch": 60.3, "grad_norm": 0.3711684048175812, "learning_rate": 7.695740233796981e-06, "loss": 0.3316, "step": 407500 }, { "epoch": 60.37, "grad_norm": 0.40576910972595215, "learning_rate": 7.666839301568511e-06, "loss": 0.3344, "step": 408000 }, { "epoch": 60.45, "grad_norm": 0.4487907588481903, "learning_rate": 7.637938369340042e-06, "loss": 0.3337, "step": 408500 }, { "epoch": 60.52, "grad_norm": 0.4065958857536316, "learning_rate": 7.609037437111572e-06, "loss": 0.3314, "step": 409000 }, { "epoch": 60.59, "grad_norm": 0.4283113479614258, "learning_rate": 7.580136504883102e-06, "loss": 0.3337, "step": 409500 }, { "epoch": 60.67, "grad_norm": 0.4433044493198395, "learning_rate": 7.5512355726546325e-06, "loss": 0.3317, "step": 410000 }, { "epoch": 60.74, "grad_norm": 0.38607364892959595, "learning_rate": 7.522334640426161e-06, "loss": 0.333, "step": 410500 }, { "epoch": 60.82, "grad_norm": 0.45367687940597534, "learning_rate": 7.4934337081976916e-06, "loss": 0.3298, "step": 411000 }, { "epoch": 60.89, "grad_norm": 0.4054895043373108, "learning_rate": 7.464532775969222e-06, "loss": 0.3318, "step": 411500 }, { "epoch": 60.96, "grad_norm": 0.41600409150123596, "learning_rate": 7.4356318437407515e-06, "loss": 0.3313, "step": 412000 }, { "epoch": 61.04, "grad_norm": 0.4171212911605835, "learning_rate": 7.406730911512282e-06, "loss": 0.3318, "step": 412500 }, { "epoch": 61.11, "grad_norm": 0.40264466404914856, "learning_rate": 7.377829979283812e-06, "loss": 0.3335, "step": 413000 }, { "epoch": 61.19, "grad_norm": 0.37919875979423523, "learning_rate": 7.348929047055342e-06, "loss": 0.3324, "step": 413500 }, { "epoch": 61.26, "grad_norm": 0.47246700525283813, "learning_rate": 7.320028114826872e-06, "loss": 0.3341, "step": 414000 }, { "epoch": 61.33, "grad_norm": 0.4305689036846161, "learning_rate": 7.291127182598403e-06, "loss": 0.3335, "step": 414500 }, { "epoch": 61.41, "grad_norm": 0.38494426012039185, "learning_rate": 7.262226250369932e-06, "loss": 0.3337, "step": 415000 }, { "epoch": 61.48, "grad_norm": 0.45139452815055847, "learning_rate": 7.233325318141462e-06, "loss": 0.3322, "step": 415500 }, { "epoch": 61.56, "grad_norm": 0.4199995994567871, "learning_rate": 7.204424385912992e-06, "loss": 0.3302, "step": 416000 }, { "epoch": 61.63, "grad_norm": 0.3823252022266388, "learning_rate": 7.175523453684522e-06, "loss": 0.333, "step": 416500 }, { "epoch": 61.7, "grad_norm": 0.38762542605400085, "learning_rate": 7.146622521456052e-06, "loss": 0.3338, "step": 417000 }, { "epoch": 61.78, "grad_norm": 0.3889346718788147, "learning_rate": 7.117721589227582e-06, "loss": 0.3333, "step": 417500 }, { "epoch": 61.85, "grad_norm": 0.43703803420066833, "learning_rate": 7.088820656999113e-06, "loss": 0.3313, "step": 418000 }, { "epoch": 61.93, "grad_norm": 0.37083032727241516, "learning_rate": 7.059919724770642e-06, "loss": 0.3327, "step": 418500 }, { "epoch": 62.0, "grad_norm": 0.431436687707901, "learning_rate": 7.031018792542173e-06, "loss": 0.3275, "step": 419000 }, { "epoch": 62.07, "grad_norm": 0.38710957765579224, "learning_rate": 7.002117860313703e-06, "loss": 0.3315, "step": 419500 }, { "epoch": 62.15, "grad_norm": 0.4548743963241577, "learning_rate": 6.973216928085232e-06, "loss": 0.3314, "step": 420000 }, { "epoch": 62.22, "grad_norm": 0.4413709342479706, "learning_rate": 6.944315995856762e-06, "loss": 0.3317, "step": 420500 }, { "epoch": 62.3, "grad_norm": 0.42544716596603394, "learning_rate": 6.915415063628293e-06, "loss": 0.3327, "step": 421000 }, { "epoch": 62.37, "grad_norm": 0.4307864010334015, "learning_rate": 6.886514131399822e-06, "loss": 0.3335, "step": 421500 }, { "epoch": 62.44, "grad_norm": 0.4296441376209259, "learning_rate": 6.8576131991713526e-06, "loss": 0.3317, "step": 422000 }, { "epoch": 62.52, "grad_norm": 0.3624299466609955, "learning_rate": 6.828712266942883e-06, "loss": 0.3307, "step": 422500 }, { "epoch": 62.59, "grad_norm": 0.4123700261116028, "learning_rate": 6.7998113347144125e-06, "loss": 0.3317, "step": 423000 }, { "epoch": 62.67, "grad_norm": 0.4546355903148651, "learning_rate": 6.770910402485943e-06, "loss": 0.3288, "step": 423500 }, { "epoch": 62.74, "grad_norm": 0.4328787922859192, "learning_rate": 6.742009470257473e-06, "loss": 0.3321, "step": 424000 }, { "epoch": 62.81, "grad_norm": 0.39879125356674194, "learning_rate": 6.713108538029003e-06, "loss": 0.334, "step": 424500 }, { "epoch": 62.89, "grad_norm": 0.42407459020614624, "learning_rate": 6.684207605800532e-06, "loss": 0.3312, "step": 425000 }, { "epoch": 62.96, "grad_norm": 0.5664127469062805, "learning_rate": 6.655306673572063e-06, "loss": 0.3323, "step": 425500 }, { "epoch": 63.04, "grad_norm": 0.47169846296310425, "learning_rate": 6.626405741343592e-06, "loss": 0.3309, "step": 426000 }, { "epoch": 63.11, "grad_norm": 0.3552204668521881, "learning_rate": 6.597504809115123e-06, "loss": 0.33, "step": 426500 }, { "epoch": 63.18, "grad_norm": 0.44585150480270386, "learning_rate": 6.568603876886653e-06, "loss": 0.3306, "step": 427000 }, { "epoch": 63.26, "grad_norm": 0.4512608051300049, "learning_rate": 6.5397029446581835e-06, "loss": 0.3308, "step": 427500 }, { "epoch": 63.33, "grad_norm": 0.40121740102767944, "learning_rate": 6.510802012429713e-06, "loss": 0.3302, "step": 428000 }, { "epoch": 63.41, "grad_norm": 0.4354041516780853, "learning_rate": 6.481901080201243e-06, "loss": 0.3327, "step": 428500 }, { "epoch": 63.48, "grad_norm": 0.4612290561199188, "learning_rate": 6.453000147972774e-06, "loss": 0.3311, "step": 429000 }, { "epoch": 63.55, "grad_norm": 0.4508548676967621, "learning_rate": 6.424099215744303e-06, "loss": 0.3312, "step": 429500 }, { "epoch": 63.63, "grad_norm": 0.4045092761516571, "learning_rate": 6.395198283515833e-06, "loss": 0.3313, "step": 430000 }, { "epoch": 63.7, "grad_norm": 0.4180326759815216, "learning_rate": 6.366297351287363e-06, "loss": 0.3324, "step": 430500 }, { "epoch": 63.78, "grad_norm": 0.3800413906574249, "learning_rate": 6.337396419058893e-06, "loss": 0.3357, "step": 431000 }, { "epoch": 63.85, "grad_norm": 0.4264669716358185, "learning_rate": 6.308495486830423e-06, "loss": 0.3314, "step": 431500 }, { "epoch": 63.92, "grad_norm": 0.4021168351173401, "learning_rate": 6.279594554601954e-06, "loss": 0.3301, "step": 432000 }, { "epoch": 64.0, "grad_norm": 0.4635623097419739, "learning_rate": 6.250693622373483e-06, "loss": 0.3304, "step": 432500 }, { "epoch": 64.07, "grad_norm": 0.4012512266635895, "learning_rate": 6.2217926901450136e-06, "loss": 0.3322, "step": 433000 }, { "epoch": 64.15, "grad_norm": 0.4430687725543976, "learning_rate": 6.192891757916544e-06, "loss": 0.3302, "step": 433500 }, { "epoch": 64.22, "grad_norm": 0.43903249502182007, "learning_rate": 6.1639908256880735e-06, "loss": 0.3326, "step": 434000 }, { "epoch": 64.29, "grad_norm": 0.5228444337844849, "learning_rate": 6.135089893459604e-06, "loss": 0.3298, "step": 434500 }, { "epoch": 64.37, "grad_norm": 0.43113288283348083, "learning_rate": 6.1061889612311334e-06, "loss": 0.3291, "step": 435000 }, { "epoch": 64.44, "grad_norm": 0.47652667760849, "learning_rate": 6.077288029002663e-06, "loss": 0.3299, "step": 435500 }, { "epoch": 64.52, "grad_norm": 0.4017566442489624, "learning_rate": 6.048387096774193e-06, "loss": 0.3312, "step": 436000 }, { "epoch": 64.59, "grad_norm": 0.4369170069694519, "learning_rate": 6.019486164545724e-06, "loss": 0.3339, "step": 436500 }, { "epoch": 64.66, "grad_norm": 0.36806294322013855, "learning_rate": 5.990585232317254e-06, "loss": 0.3317, "step": 437000 }, { "epoch": 64.74, "grad_norm": 0.42576882243156433, "learning_rate": 5.961684300088784e-06, "loss": 0.3309, "step": 437500 }, { "epoch": 64.81, "grad_norm": 0.4077777564525604, "learning_rate": 5.932783367860314e-06, "loss": 0.3319, "step": 438000 }, { "epoch": 64.89, "grad_norm": 0.4394007921218872, "learning_rate": 5.9038824356318445e-06, "loss": 0.3327, "step": 438500 }, { "epoch": 64.96, "grad_norm": 0.32965216040611267, "learning_rate": 5.874981503403374e-06, "loss": 0.3277, "step": 439000 }, { "epoch": 65.03, "grad_norm": 0.4312441945075989, "learning_rate": 5.846080571174904e-06, "loss": 0.3291, "step": 439500 }, { "epoch": 65.11, "grad_norm": 0.3752184808254242, "learning_rate": 5.817179638946434e-06, "loss": 0.3319, "step": 440000 }, { "epoch": 65.18, "grad_norm": 0.4169740080833435, "learning_rate": 5.7882787067179635e-06, "loss": 0.331, "step": 440500 }, { "epoch": 65.26, "grad_norm": 0.43580740690231323, "learning_rate": 5.759377774489494e-06, "loss": 0.3301, "step": 441000 }, { "epoch": 65.33, "grad_norm": 0.46015655994415283, "learning_rate": 5.730476842261024e-06, "loss": 0.3326, "step": 441500 }, { "epoch": 65.4, "grad_norm": 0.4646316468715668, "learning_rate": 5.701575910032554e-06, "loss": 0.3307, "step": 442000 }, { "epoch": 65.48, "grad_norm": 0.4371485114097595, "learning_rate": 5.672674977804084e-06, "loss": 0.3303, "step": 442500 }, { "epoch": 65.55, "grad_norm": 0.443768173456192, "learning_rate": 5.643774045575615e-06, "loss": 0.3315, "step": 443000 }, { "epoch": 65.63, "grad_norm": 0.44002553820610046, "learning_rate": 5.614873113347144e-06, "loss": 0.3305, "step": 443500 }, { "epoch": 65.7, "grad_norm": 0.39671292901039124, "learning_rate": 5.5859721811186746e-06, "loss": 0.3312, "step": 444000 }, { "epoch": 65.77, "grad_norm": 0.4188387393951416, "learning_rate": 5.557071248890204e-06, "loss": 0.3302, "step": 444500 }, { "epoch": 65.85, "grad_norm": 0.44623398780822754, "learning_rate": 5.528170316661734e-06, "loss": 0.3308, "step": 445000 }, { "epoch": 65.92, "grad_norm": 0.36335235834121704, "learning_rate": 5.499269384433264e-06, "loss": 0.3293, "step": 445500 }, { "epoch": 66.0, "grad_norm": 0.41810572147369385, "learning_rate": 5.4703684522047944e-06, "loss": 0.329, "step": 446000 }, { "epoch": 66.07, "grad_norm": 0.4002617299556732, "learning_rate": 5.441467519976325e-06, "loss": 0.3278, "step": 446500 }, { "epoch": 66.14, "grad_norm": 0.45273175835609436, "learning_rate": 5.412566587747854e-06, "loss": 0.3303, "step": 447000 }, { "epoch": 66.22, "grad_norm": 0.48169875144958496, "learning_rate": 5.383665655519385e-06, "loss": 0.332, "step": 447500 }, { "epoch": 66.29, "grad_norm": 0.39927640557289124, "learning_rate": 5.354764723290915e-06, "loss": 0.3296, "step": 448000 }, { "epoch": 66.37, "grad_norm": 0.42319226264953613, "learning_rate": 5.325863791062445e-06, "loss": 0.3309, "step": 448500 }, { "epoch": 66.44, "grad_norm": 0.4284779131412506, "learning_rate": 5.296962858833975e-06, "loss": 0.3321, "step": 449000 }, { "epoch": 66.51, "grad_norm": 0.5179397463798523, "learning_rate": 5.268061926605505e-06, "loss": 0.33, "step": 449500 }, { "epoch": 66.59, "grad_norm": 0.44250035285949707, "learning_rate": 5.239160994377034e-06, "loss": 0.3295, "step": 450000 }, { "epoch": 66.66, "grad_norm": 0.46015605330467224, "learning_rate": 5.210260062148565e-06, "loss": 0.3313, "step": 450500 }, { "epoch": 66.74, "grad_norm": 0.5012817978858948, "learning_rate": 5.181359129920095e-06, "loss": 0.3302, "step": 451000 }, { "epoch": 66.81, "grad_norm": 0.403338223695755, "learning_rate": 5.1524581976916245e-06, "loss": 0.3306, "step": 451500 }, { "epoch": 66.88, "grad_norm": 0.4086831212043762, "learning_rate": 5.123557265463155e-06, "loss": 0.3286, "step": 452000 }, { "epoch": 66.96, "grad_norm": 0.3715237081050873, "learning_rate": 5.094656333234685e-06, "loss": 0.3301, "step": 452500 }, { "epoch": 67.03, "grad_norm": 0.46829870343208313, "learning_rate": 5.065755401006215e-06, "loss": 0.3307, "step": 453000 }, { "epoch": 67.11, "grad_norm": 0.4667709767818451, "learning_rate": 5.036854468777745e-06, "loss": 0.3298, "step": 453500 }, { "epoch": 67.18, "grad_norm": 0.4758981466293335, "learning_rate": 5.007953536549275e-06, "loss": 0.3272, "step": 454000 }, { "epoch": 67.25, "grad_norm": 0.48276805877685547, "learning_rate": 4.979052604320804e-06, "loss": 0.3288, "step": 454500 }, { "epoch": 67.33, "grad_norm": 0.400806725025177, "learning_rate": 4.950151672092335e-06, "loss": 0.3258, "step": 455000 }, { "epoch": 67.4, "grad_norm": 0.40156251192092896, "learning_rate": 4.921250739863865e-06, "loss": 0.3351, "step": 455500 }, { "epoch": 67.48, "grad_norm": 0.5024535655975342, "learning_rate": 4.8923498076353955e-06, "loss": 0.3306, "step": 456000 }, { "epoch": 67.55, "grad_norm": 0.52587890625, "learning_rate": 4.863448875406925e-06, "loss": 0.3331, "step": 456500 }, { "epoch": 67.62, "grad_norm": 0.41265735030174255, "learning_rate": 4.8345479431784554e-06, "loss": 0.3328, "step": 457000 }, { "epoch": 67.7, "grad_norm": 0.34202754497528076, "learning_rate": 4.805647010949986e-06, "loss": 0.3321, "step": 457500 }, { "epoch": 67.77, "grad_norm": 0.4898373484611511, "learning_rate": 4.776746078721515e-06, "loss": 0.331, "step": 458000 }, { "epoch": 67.85, "grad_norm": 0.52295982837677, "learning_rate": 4.747845146493046e-06, "loss": 0.3306, "step": 458500 }, { "epoch": 67.92, "grad_norm": 0.46750620007514954, "learning_rate": 4.718944214264575e-06, "loss": 0.3315, "step": 459000 }, { "epoch": 67.99, "grad_norm": 0.35533860325813293, "learning_rate": 4.690043282036105e-06, "loss": 0.3315, "step": 459500 }, { "epoch": 68.07, "grad_norm": 0.41508856415748596, "learning_rate": 4.661142349807635e-06, "loss": 0.3291, "step": 460000 }, { "epoch": 68.14, "grad_norm": 0.4271659851074219, "learning_rate": 4.632241417579166e-06, "loss": 0.3286, "step": 460500 }, { "epoch": 68.22, "grad_norm": 0.44648808240890503, "learning_rate": 4.603340485350695e-06, "loss": 0.3299, "step": 461000 }, { "epoch": 68.29, "grad_norm": 0.4843562841415405, "learning_rate": 4.574439553122226e-06, "loss": 0.3282, "step": 461500 }, { "epoch": 68.36, "grad_norm": 0.41266024112701416, "learning_rate": 4.545538620893756e-06, "loss": 0.3299, "step": 462000 }, { "epoch": 68.44, "grad_norm": 0.4088280200958252, "learning_rate": 4.5166376886652855e-06, "loss": 0.332, "step": 462500 }, { "epoch": 68.51, "grad_norm": 0.48477041721343994, "learning_rate": 4.487736756436816e-06, "loss": 0.3312, "step": 463000 }, { "epoch": 68.59, "grad_norm": 0.42487454414367676, "learning_rate": 4.458835824208346e-06, "loss": 0.3296, "step": 463500 }, { "epoch": 68.66, "grad_norm": 0.4671236276626587, "learning_rate": 4.429934891979876e-06, "loss": 0.3288, "step": 464000 }, { "epoch": 68.73, "grad_norm": 0.4430939257144928, "learning_rate": 4.401033959751405e-06, "loss": 0.3297, "step": 464500 }, { "epoch": 68.81, "grad_norm": 0.4080400764942169, "learning_rate": 4.372133027522936e-06, "loss": 0.3305, "step": 465000 }, { "epoch": 68.88, "grad_norm": 0.3743002712726593, "learning_rate": 4.343232095294466e-06, "loss": 0.3294, "step": 465500 }, { "epoch": 68.96, "grad_norm": 0.3991639316082001, "learning_rate": 4.314331163065996e-06, "loss": 0.3293, "step": 466000 }, { "epoch": 69.03, "grad_norm": 0.40404531359672546, "learning_rate": 4.285430230837526e-06, "loss": 0.3307, "step": 466500 }, { "epoch": 69.1, "grad_norm": 0.4253464639186859, "learning_rate": 4.2565292986090565e-06, "loss": 0.3278, "step": 467000 }, { "epoch": 69.18, "grad_norm": 0.43970435857772827, "learning_rate": 4.227628366380586e-06, "loss": 0.3284, "step": 467500 }, { "epoch": 69.25, "grad_norm": 0.42423635721206665, "learning_rate": 4.1987274341521164e-06, "loss": 0.3337, "step": 468000 }, { "epoch": 69.33, "grad_norm": 0.4581485092639923, "learning_rate": 4.169826501923646e-06, "loss": 0.3273, "step": 468500 }, { "epoch": 69.4, "grad_norm": 0.4594268500804901, "learning_rate": 4.1409255696951755e-06, "loss": 0.3295, "step": 469000 }, { "epoch": 69.47, "grad_norm": 0.49994996190071106, "learning_rate": 4.112024637466706e-06, "loss": 0.3267, "step": 469500 }, { "epoch": 69.55, "grad_norm": 0.4062737822532654, "learning_rate": 4.083123705238236e-06, "loss": 0.3283, "step": 470000 }, { "epoch": 69.62, "grad_norm": 0.4764838218688965, "learning_rate": 4.054222773009766e-06, "loss": 0.3318, "step": 470500 }, { "epoch": 69.7, "grad_norm": 0.42747876048088074, "learning_rate": 4.025321840781296e-06, "loss": 0.3311, "step": 471000 }, { "epoch": 69.77, "grad_norm": 0.45434367656707764, "learning_rate": 3.996420908552827e-06, "loss": 0.3293, "step": 471500 }, { "epoch": 69.84, "grad_norm": 0.387123703956604, "learning_rate": 3.967519976324356e-06, "loss": 0.3311, "step": 472000 }, { "epoch": 69.92, "grad_norm": 0.412826806306839, "learning_rate": 3.938619044095887e-06, "loss": 0.329, "step": 472500 }, { "epoch": 69.99, "grad_norm": 0.532727062702179, "learning_rate": 3.909718111867417e-06, "loss": 0.3266, "step": 473000 }, { "epoch": 70.07, "grad_norm": 0.4674714505672455, "learning_rate": 3.8808171796389465e-06, "loss": 0.3257, "step": 473500 }, { "epoch": 70.14, "grad_norm": 0.3989239037036896, "learning_rate": 3.851916247410477e-06, "loss": 0.3288, "step": 474000 }, { "epoch": 70.21, "grad_norm": 0.5390828251838684, "learning_rate": 3.8230153151820065e-06, "loss": 0.3316, "step": 474500 }, { "epoch": 70.29, "grad_norm": 0.4232146143913269, "learning_rate": 3.7941143829535364e-06, "loss": 0.3297, "step": 475000 }, { "epoch": 70.36, "grad_norm": 0.4476439654827118, "learning_rate": 3.765213450725067e-06, "loss": 0.3308, "step": 475500 }, { "epoch": 70.44, "grad_norm": 0.46341538429260254, "learning_rate": 3.7363125184965968e-06, "loss": 0.3241, "step": 476000 }, { "epoch": 70.51, "grad_norm": 0.3792473077774048, "learning_rate": 3.7074115862681268e-06, "loss": 0.3282, "step": 476500 }, { "epoch": 70.58, "grad_norm": 0.42449694871902466, "learning_rate": 3.6785106540396567e-06, "loss": 0.3282, "step": 477000 }, { "epoch": 70.66, "grad_norm": 0.389700323343277, "learning_rate": 3.6496097218111867e-06, "loss": 0.3332, "step": 477500 }, { "epoch": 70.73, "grad_norm": 0.4011322855949402, "learning_rate": 3.6207087895827167e-06, "loss": 0.3295, "step": 478000 }, { "epoch": 70.8, "grad_norm": 0.485365092754364, "learning_rate": 3.591807857354247e-06, "loss": 0.33, "step": 478500 }, { "epoch": 70.88, "grad_norm": 0.39829009771347046, "learning_rate": 3.562906925125777e-06, "loss": 0.3291, "step": 479000 }, { "epoch": 70.95, "grad_norm": 0.46039626002311707, "learning_rate": 3.5340059928973066e-06, "loss": 0.331, "step": 479500 }, { "epoch": 71.03, "grad_norm": 0.40389734506607056, "learning_rate": 3.505105060668837e-06, "loss": 0.3286, "step": 480000 }, { "epoch": 71.1, "grad_norm": 0.4375256299972534, "learning_rate": 3.476204128440367e-06, "loss": 0.3298, "step": 480500 }, { "epoch": 71.17, "grad_norm": 0.42462676763534546, "learning_rate": 3.4473031962118973e-06, "loss": 0.3302, "step": 481000 }, { "epoch": 71.25, "grad_norm": 0.3216535747051239, "learning_rate": 3.4184022639834273e-06, "loss": 0.3283, "step": 481500 }, { "epoch": 71.32, "grad_norm": 0.45945799350738525, "learning_rate": 3.389501331754957e-06, "loss": 0.3278, "step": 482000 }, { "epoch": 71.4, "grad_norm": 0.4495971202850342, "learning_rate": 3.3606003995264872e-06, "loss": 0.3265, "step": 482500 }, { "epoch": 71.47, "grad_norm": 0.4159165322780609, "learning_rate": 3.331699467298017e-06, "loss": 0.3292, "step": 483000 }, { "epoch": 71.54, "grad_norm": 0.410427063703537, "learning_rate": 3.302798535069547e-06, "loss": 0.3299, "step": 483500 }, { "epoch": 71.62, "grad_norm": 0.5130240321159363, "learning_rate": 3.2738976028410776e-06, "loss": 0.3316, "step": 484000 }, { "epoch": 71.69, "grad_norm": 0.4405277669429779, "learning_rate": 3.244996670612607e-06, "loss": 0.33, "step": 484500 }, { "epoch": 71.77, "grad_norm": 0.575674295425415, "learning_rate": 3.2160957383841375e-06, "loss": 0.3298, "step": 485000 }, { "epoch": 71.84, "grad_norm": 0.4434616267681122, "learning_rate": 3.1871948061556675e-06, "loss": 0.3298, "step": 485500 }, { "epoch": 71.91, "grad_norm": 0.3960082530975342, "learning_rate": 3.1582938739271974e-06, "loss": 0.3282, "step": 486000 }, { "epoch": 71.99, "grad_norm": 0.42698296904563904, "learning_rate": 3.129392941698728e-06, "loss": 0.3296, "step": 486500 }, { "epoch": 72.06, "grad_norm": 0.5218748450279236, "learning_rate": 3.1004920094702574e-06, "loss": 0.3296, "step": 487000 }, { "epoch": 72.14, "grad_norm": 0.46763402223587036, "learning_rate": 3.0715910772417873e-06, "loss": 0.3262, "step": 487500 }, { "epoch": 72.21, "grad_norm": 0.42345327138900757, "learning_rate": 3.0426901450133177e-06, "loss": 0.33, "step": 488000 }, { "epoch": 72.28, "grad_norm": 0.4526881277561188, "learning_rate": 3.0137892127848477e-06, "loss": 0.3304, "step": 488500 }, { "epoch": 72.36, "grad_norm": 0.42106348276138306, "learning_rate": 2.984888280556378e-06, "loss": 0.3292, "step": 489000 }, { "epoch": 72.43, "grad_norm": 0.5022510886192322, "learning_rate": 2.9559873483279076e-06, "loss": 0.3266, "step": 489500 }, { "epoch": 72.51, "grad_norm": 0.4436812996864319, "learning_rate": 2.9270864160994376e-06, "loss": 0.3269, "step": 490000 }, { "epoch": 72.58, "grad_norm": 0.42252251505851746, "learning_rate": 2.898185483870968e-06, "loss": 0.3281, "step": 490500 }, { "epoch": 72.65, "grad_norm": 0.5339802503585815, "learning_rate": 2.869284551642498e-06, "loss": 0.3289, "step": 491000 }, { "epoch": 72.73, "grad_norm": 0.3937510550022125, "learning_rate": 2.840383619414028e-06, "loss": 0.328, "step": 491500 }, { "epoch": 72.8, "grad_norm": 0.3894229829311371, "learning_rate": 2.811482687185558e-06, "loss": 0.3282, "step": 492000 }, { "epoch": 72.88, "grad_norm": 0.4481090307235718, "learning_rate": 2.782581754957088e-06, "loss": 0.3301, "step": 492500 }, { "epoch": 72.95, "grad_norm": 0.45495444536209106, "learning_rate": 2.753680822728618e-06, "loss": 0.3278, "step": 493000 }, { "epoch": 73.02, "grad_norm": 0.49259716272354126, "learning_rate": 2.7247798905001482e-06, "loss": 0.3295, "step": 493500 }, { "epoch": 73.1, "grad_norm": 0.4257282018661499, "learning_rate": 2.6958789582716778e-06, "loss": 0.3269, "step": 494000 }, { "epoch": 73.17, "grad_norm": 0.43159300088882446, "learning_rate": 2.666978026043208e-06, "loss": 0.3276, "step": 494500 }, { "epoch": 73.25, "grad_norm": 0.4048108458518982, "learning_rate": 2.638077093814738e-06, "loss": 0.3333, "step": 495000 }, { "epoch": 73.32, "grad_norm": 0.4666566252708435, "learning_rate": 2.609176161586268e-06, "loss": 0.3275, "step": 495500 }, { "epoch": 73.39, "grad_norm": 0.3985891342163086, "learning_rate": 2.5802752293577985e-06, "loss": 0.3279, "step": 496000 }, { "epoch": 73.47, "grad_norm": 0.5439868569374084, "learning_rate": 2.551374297129328e-06, "loss": 0.3275, "step": 496500 }, { "epoch": 73.54, "grad_norm": 0.45784690976142883, "learning_rate": 2.522473364900858e-06, "loss": 0.3309, "step": 497000 }, { "epoch": 73.62, "grad_norm": 0.4779771864414215, "learning_rate": 2.4935724326723884e-06, "loss": 0.3288, "step": 497500 }, { "epoch": 73.69, "grad_norm": 0.47680574655532837, "learning_rate": 2.4646715004439184e-06, "loss": 0.3285, "step": 498000 }, { "epoch": 73.76, "grad_norm": 0.3629719913005829, "learning_rate": 2.4357705682154488e-06, "loss": 0.3284, "step": 498500 }, { "epoch": 73.84, "grad_norm": 0.46253129839897156, "learning_rate": 2.4068696359869783e-06, "loss": 0.3284, "step": 499000 }, { "epoch": 73.91, "grad_norm": 0.44531476497650146, "learning_rate": 2.3779687037585083e-06, "loss": 0.3281, "step": 499500 }, { "epoch": 73.99, "grad_norm": 0.39289695024490356, "learning_rate": 2.3490677715300387e-06, "loss": 0.326, "step": 500000 }, { "epoch": 74.06, "grad_norm": 0.48103997111320496, "learning_rate": 2.3201668393015686e-06, "loss": 0.3272, "step": 500500 }, { "epoch": 74.13, "grad_norm": 0.4336768388748169, "learning_rate": 2.2912659070730986e-06, "loss": 0.3265, "step": 501000 }, { "epoch": 74.21, "grad_norm": 0.4040307402610779, "learning_rate": 2.2623649748446286e-06, "loss": 0.3271, "step": 501500 }, { "epoch": 74.28, "grad_norm": 0.49081218242645264, "learning_rate": 2.2334640426161585e-06, "loss": 0.328, "step": 502000 }, { "epoch": 74.36, "grad_norm": 0.44683390855789185, "learning_rate": 2.2045631103876885e-06, "loss": 0.3266, "step": 502500 }, { "epoch": 74.43, "grad_norm": 0.4362635612487793, "learning_rate": 2.175662178159219e-06, "loss": 0.3293, "step": 503000 }, { "epoch": 74.5, "grad_norm": 0.4326813220977783, "learning_rate": 2.146761245930749e-06, "loss": 0.3302, "step": 503500 }, { "epoch": 74.58, "grad_norm": 0.5289288759231567, "learning_rate": 2.117860313702279e-06, "loss": 0.3288, "step": 504000 }, { "epoch": 74.65, "grad_norm": 0.5708897709846497, "learning_rate": 2.088959381473809e-06, "loss": 0.3271, "step": 504500 }, { "epoch": 74.73, "grad_norm": 0.38460394740104675, "learning_rate": 2.0600584492453388e-06, "loss": 0.3262, "step": 505000 }, { "epoch": 74.8, "grad_norm": 0.4401102066040039, "learning_rate": 2.031157517016869e-06, "loss": 0.3285, "step": 505500 }, { "epoch": 74.87, "grad_norm": 0.4699185788631439, "learning_rate": 2.002256584788399e-06, "loss": 0.3292, "step": 506000 }, { "epoch": 74.95, "grad_norm": 0.43969598412513733, "learning_rate": 1.9733556525599287e-06, "loss": 0.3282, "step": 506500 }, { "epoch": 75.02, "grad_norm": 0.5226773619651794, "learning_rate": 1.944454720331459e-06, "loss": 0.3275, "step": 507000 }, { "epoch": 75.1, "grad_norm": 0.42381104826927185, "learning_rate": 1.915553788102989e-06, "loss": 0.3287, "step": 507500 }, { "epoch": 75.17, "grad_norm": 0.47836771607398987, "learning_rate": 1.886652855874519e-06, "loss": 0.3248, "step": 508000 }, { "epoch": 75.24, "grad_norm": 0.4760962128639221, "learning_rate": 1.8577519236460492e-06, "loss": 0.3255, "step": 508500 }, { "epoch": 75.32, "grad_norm": 0.4954340159893036, "learning_rate": 1.8288509914175794e-06, "loss": 0.3274, "step": 509000 }, { "epoch": 75.39, "grad_norm": 0.3998168110847473, "learning_rate": 1.7999500591891091e-06, "loss": 0.328, "step": 509500 }, { "epoch": 75.47, "grad_norm": 0.3899104595184326, "learning_rate": 1.7710491269606393e-06, "loss": 0.3291, "step": 510000 }, { "epoch": 75.54, "grad_norm": 0.4677903652191162, "learning_rate": 1.7421481947321693e-06, "loss": 0.3261, "step": 510500 }, { "epoch": 75.61, "grad_norm": 0.41607698798179626, "learning_rate": 1.7132472625036995e-06, "loss": 0.3291, "step": 511000 }, { "epoch": 75.69, "grad_norm": 0.44930896162986755, "learning_rate": 1.6843463302752294e-06, "loss": 0.3307, "step": 511500 }, { "epoch": 75.76, "grad_norm": 0.48138096928596497, "learning_rate": 1.6554453980467594e-06, "loss": 0.3256, "step": 512000 }, { "epoch": 75.84, "grad_norm": 0.44024384021759033, "learning_rate": 1.6265444658182896e-06, "loss": 0.3278, "step": 512500 }, { "epoch": 75.91, "grad_norm": 0.42400872707366943, "learning_rate": 1.5976435335898193e-06, "loss": 0.3295, "step": 513000 }, { "epoch": 75.98, "grad_norm": 0.4450230896472931, "learning_rate": 1.5687426013613495e-06, "loss": 0.3293, "step": 513500 }, { "epoch": 76.06, "grad_norm": 0.4113300144672394, "learning_rate": 1.5398416691328797e-06, "loss": 0.3277, "step": 514000 }, { "epoch": 76.13, "grad_norm": 0.4838961064815521, "learning_rate": 1.5109407369044097e-06, "loss": 0.3257, "step": 514500 }, { "epoch": 76.21, "grad_norm": 0.45890524983406067, "learning_rate": 1.4820398046759396e-06, "loss": 0.3263, "step": 515000 }, { "epoch": 76.28, "grad_norm": 0.4421687424182892, "learning_rate": 1.4531388724474696e-06, "loss": 0.3287, "step": 515500 }, { "epoch": 76.35, "grad_norm": 0.4234231114387512, "learning_rate": 1.4242379402189998e-06, "loss": 0.3308, "step": 516000 }, { "epoch": 76.43, "grad_norm": 0.4239380657672882, "learning_rate": 1.3953370079905297e-06, "loss": 0.3291, "step": 516500 }, { "epoch": 76.5, "grad_norm": 0.4606933891773224, "learning_rate": 1.3664360757620597e-06, "loss": 0.3288, "step": 517000 }, { "epoch": 76.58, "grad_norm": 0.4070008099079132, "learning_rate": 1.33753514353359e-06, "loss": 0.3268, "step": 517500 }, { "epoch": 76.65, "grad_norm": 0.5978463888168335, "learning_rate": 1.3086342113051199e-06, "loss": 0.3272, "step": 518000 }, { "epoch": 76.72, "grad_norm": 0.43075379729270935, "learning_rate": 1.27973327907665e-06, "loss": 0.3287, "step": 518500 }, { "epoch": 76.8, "grad_norm": 0.46790510416030884, "learning_rate": 1.2508323468481798e-06, "loss": 0.3266, "step": 519000 }, { "epoch": 76.87, "grad_norm": 0.45541101694107056, "learning_rate": 1.22193141461971e-06, "loss": 0.3254, "step": 519500 }, { "epoch": 76.95, "grad_norm": 0.44363468885421753, "learning_rate": 1.1930304823912402e-06, "loss": 0.3248, "step": 520000 }, { "epoch": 77.02, "grad_norm": 0.5055235624313354, "learning_rate": 1.1641295501627701e-06, "loss": 0.3265, "step": 520500 }, { "epoch": 77.09, "grad_norm": 0.3572923541069031, "learning_rate": 1.1352286179343e-06, "loss": 0.324, "step": 521000 }, { "epoch": 77.17, "grad_norm": 0.40502551198005676, "learning_rate": 1.10632768570583e-06, "loss": 0.3254, "step": 521500 }, { "epoch": 77.24, "grad_norm": 0.45639294385910034, "learning_rate": 1.0774267534773602e-06, "loss": 0.328, "step": 522000 }, { "epoch": 77.32, "grad_norm": 0.4580610990524292, "learning_rate": 1.0485258212488904e-06, "loss": 0.3278, "step": 522500 }, { "epoch": 77.39, "grad_norm": 0.4812680184841156, "learning_rate": 1.0196248890204202e-06, "loss": 0.3274, "step": 523000 }, { "epoch": 77.46, "grad_norm": 0.416979044675827, "learning_rate": 9.907239567919504e-07, "loss": 0.3261, "step": 523500 }, { "epoch": 77.54, "grad_norm": 0.39473670721054077, "learning_rate": 9.618230245634803e-07, "loss": 0.328, "step": 524000 }, { "epoch": 77.61, "grad_norm": 0.4831089675426483, "learning_rate": 9.329220923350104e-07, "loss": 0.328, "step": 524500 }, { "epoch": 77.69, "grad_norm": 0.4752112627029419, "learning_rate": 9.040211601065404e-07, "loss": 0.3249, "step": 525000 }, { "epoch": 77.76, "grad_norm": 0.4114755690097809, "learning_rate": 8.751202278780705e-07, "loss": 0.3282, "step": 525500 }, { "epoch": 77.83, "grad_norm": 0.539284348487854, "learning_rate": 8.462192956496004e-07, "loss": 0.327, "step": 526000 }, { "epoch": 77.91, "grad_norm": 0.4160568118095398, "learning_rate": 8.173183634211305e-07, "loss": 0.3278, "step": 526500 }, { "epoch": 77.98, "grad_norm": 0.42335689067840576, "learning_rate": 7.884174311926606e-07, "loss": 0.3291, "step": 527000 }, { "epoch": 78.06, "grad_norm": 0.4964425563812256, "learning_rate": 7.595164989641906e-07, "loss": 0.3269, "step": 527500 }, { "epoch": 78.13, "grad_norm": 0.5482224822044373, "learning_rate": 7.306155667357206e-07, "loss": 0.3257, "step": 528000 }, { "epoch": 78.2, "grad_norm": 0.4845934808254242, "learning_rate": 7.017146345072507e-07, "loss": 0.3257, "step": 528500 }, { "epoch": 78.28, "grad_norm": 0.44311293959617615, "learning_rate": 6.728137022787807e-07, "loss": 0.3267, "step": 529000 }, { "epoch": 78.35, "grad_norm": 0.49295201897621155, "learning_rate": 6.439127700503107e-07, "loss": 0.3262, "step": 529500 }, { "epoch": 78.43, "grad_norm": 0.45838817954063416, "learning_rate": 6.150118378218408e-07, "loss": 0.3272, "step": 530000 }, { "epoch": 78.5, "grad_norm": 0.4277520477771759, "learning_rate": 5.861109055933709e-07, "loss": 0.3275, "step": 530500 }, { "epoch": 78.57, "grad_norm": 0.49568185210227966, "learning_rate": 5.572099733649008e-07, "loss": 0.3238, "step": 531000 }, { "epoch": 78.65, "grad_norm": 0.3964736759662628, "learning_rate": 5.283090411364309e-07, "loss": 0.3265, "step": 531500 }, { "epoch": 78.72, "grad_norm": 0.38991761207580566, "learning_rate": 4.994081089079609e-07, "loss": 0.3297, "step": 532000 }, { "epoch": 78.8, "grad_norm": 0.514043390750885, "learning_rate": 4.7050717667949096e-07, "loss": 0.3274, "step": 532500 }, { "epoch": 78.87, "grad_norm": 0.44372057914733887, "learning_rate": 4.41606244451021e-07, "loss": 0.3265, "step": 533000 }, { "epoch": 78.94, "grad_norm": 0.40556496381759644, "learning_rate": 4.1270531222255106e-07, "loss": 0.3268, "step": 533500 }, { "epoch": 79.02, "grad_norm": 0.37113696336746216, "learning_rate": 3.838043799940811e-07, "loss": 0.331, "step": 534000 }, { "epoch": 79.09, "grad_norm": 0.42463332414627075, "learning_rate": 3.549034477656111e-07, "loss": 0.3269, "step": 534500 }, { "epoch": 79.17, "grad_norm": 0.456259548664093, "learning_rate": 3.260025155371412e-07, "loss": 0.3296, "step": 535000 }, { "epoch": 79.24, "grad_norm": 0.39561837911605835, "learning_rate": 2.971015833086712e-07, "loss": 0.3272, "step": 535500 }, { "epoch": 79.31, "grad_norm": 0.42246511578559875, "learning_rate": 2.6820065108020127e-07, "loss": 0.3274, "step": 536000 }, { "epoch": 79.39, "grad_norm": 0.42932552099227905, "learning_rate": 2.392997188517313e-07, "loss": 0.3259, "step": 536500 }, { "epoch": 79.46, "grad_norm": 0.4081755578517914, "learning_rate": 2.1039878662326132e-07, "loss": 0.3254, "step": 537000 }, { "epoch": 79.54, "grad_norm": 0.43017107248306274, "learning_rate": 1.8149785439479136e-07, "loss": 0.3294, "step": 537500 }, { "epoch": 79.61, "grad_norm": 0.3940086364746094, "learning_rate": 1.525969221663214e-07, "loss": 0.3284, "step": 538000 }, { "epoch": 79.68, "grad_norm": 0.37287628650665283, "learning_rate": 1.2369598993785146e-07, "loss": 0.3269, "step": 538500 }, { "epoch": 79.76, "grad_norm": 0.451742559671402, "learning_rate": 9.479505770938148e-08, "loss": 0.3259, "step": 539000 }, { "epoch": 79.83, "grad_norm": 0.4438938796520233, "learning_rate": 6.589412548091152e-08, "loss": 0.3238, "step": 539500 }, { "epoch": 79.91, "grad_norm": 0.4649119973182678, "learning_rate": 3.699319325244155e-08, "loss": 0.3257, "step": 540000 }, { "epoch": 79.98, "grad_norm": 0.4151638150215149, "learning_rate": 8.09226102397159e-09, "loss": 0.3303, "step": 540500 }, { "epoch": 80.0, "step": 540640, "total_flos": 4.3712245507093955e+20, "train_loss": 0.35890252478696355, "train_runtime": 56544.5558, "train_samples_per_second": 76.486, "train_steps_per_second": 9.561 } ], "logging_steps": 500, "max_steps": 540640, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 1000000000, "total_flos": 4.3712245507093955e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }