diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7597 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 80.0, + "eval_steps": 500, + "global_step": 540640, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.07, + "grad_norm": 0.11938641220331192, + "learning_rate": 3.122109906777153e-05, + "loss": 0.7845, + "step": 500 + }, + { + "epoch": 0.15, + "grad_norm": 0.11220108717679977, + "learning_rate": 3.119219813554306e-05, + "loss": 0.7681, + "step": 1000 + }, + { + "epoch": 0.22, + "grad_norm": 0.20039337873458862, + "learning_rate": 3.116329720331459e-05, + "loss": 0.7627, + "step": 1500 + }, + { + "epoch": 0.3, + "grad_norm": 0.20968303084373474, + "learning_rate": 3.113439627108612e-05, + "loss": 0.751, + "step": 2000 + }, + { + "epoch": 0.37, + "grad_norm": 0.46042799949645996, + "learning_rate": 3.1105495338857653e-05, + "loss": 0.7395, + "step": 2500 + }, + { + "epoch": 0.44, + "grad_norm": 0.5682156682014465, + "learning_rate": 3.107659440662918e-05, + "loss": 0.7169, + "step": 3000 + }, + { + "epoch": 0.52, + "grad_norm": 0.446135938167572, + "learning_rate": 3.104769347440071e-05, + "loss": 0.7037, + "step": 3500 + }, + { + "epoch": 0.59, + "grad_norm": 0.5436543822288513, + "learning_rate": 3.1018792542172244e-05, + "loss": 0.6854, + "step": 4000 + }, + { + "epoch": 0.67, + "grad_norm": 0.5623897314071655, + "learning_rate": 3.098989160994377e-05, + "loss": 0.6661, + "step": 4500 + }, + { + "epoch": 0.74, + "grad_norm": 0.7239806652069092, + "learning_rate": 3.09609906777153e-05, + "loss": 0.6477, + "step": 5000 + }, + { + "epoch": 0.81, + "grad_norm": 0.6363663077354431, + "learning_rate": 3.0932089745486834e-05, + "loss": 0.6302, + "step": 5500 + }, + { + "epoch": 0.89, + "grad_norm": 0.8511515855789185, + "learning_rate": 3.090318881325836e-05, + "loss": 0.6156, + "step": 6000 + }, + { + "epoch": 0.96, + "grad_norm": 0.7209456562995911, + "learning_rate": 3.087428788102989e-05, + "loss": 0.6002, + "step": 6500 + }, + { + "epoch": 1.04, + "grad_norm": 0.7105280756950378, + "learning_rate": 3.0845386948801424e-05, + "loss": 0.5852, + "step": 7000 + }, + { + "epoch": 1.11, + "grad_norm": 0.7035876512527466, + "learning_rate": 3.081648601657295e-05, + "loss": 0.5734, + "step": 7500 + }, + { + "epoch": 1.18, + "grad_norm": 0.6755463480949402, + "learning_rate": 3.078758508434448e-05, + "loss": 0.5619, + "step": 8000 + }, + { + "epoch": 1.26, + "grad_norm": 0.6636064648628235, + "learning_rate": 3.0758684152116015e-05, + "loss": 0.5527, + "step": 8500 + }, + { + "epoch": 1.33, + "grad_norm": 0.7909913063049316, + "learning_rate": 3.072978321988754e-05, + "loss": 0.545, + "step": 9000 + }, + { + "epoch": 1.41, + "grad_norm": 0.7935764789581299, + "learning_rate": 3.070088228765907e-05, + "loss": 0.5342, + "step": 9500 + }, + { + "epoch": 1.48, + "grad_norm": 0.7649631500244141, + "learning_rate": 3.06719813554306e-05, + "loss": 0.5264, + "step": 10000 + }, + { + "epoch": 1.55, + "grad_norm": 0.7262706160545349, + "learning_rate": 3.064308042320213e-05, + "loss": 0.5194, + "step": 10500 + }, + { + "epoch": 1.63, + "grad_norm": 0.7068478465080261, + "learning_rate": 3.061417949097366e-05, + "loss": 0.5137, + "step": 11000 + }, + { + "epoch": 1.7, + "grad_norm": 0.6415815353393555, + "learning_rate": 3.058527855874519e-05, + "loss": 0.51, + "step": 11500 + }, + { + "epoch": 1.78, + "grad_norm": 0.7167455554008484, + "learning_rate": 3.055637762651672e-05, + "loss": 0.5024, + "step": 12000 + }, + { + "epoch": 1.85, + "grad_norm": 0.6563605666160583, + "learning_rate": 3.052747669428825e-05, + "loss": 0.4985, + "step": 12500 + }, + { + "epoch": 1.92, + "grad_norm": 0.7427666783332825, + "learning_rate": 3.049857576205978e-05, + "loss": 0.4939, + "step": 13000 + }, + { + "epoch": 2.0, + "grad_norm": 0.6371767520904541, + "learning_rate": 3.046967482983131e-05, + "loss": 0.4923, + "step": 13500 + }, + { + "epoch": 2.07, + "grad_norm": 0.7062104940414429, + "learning_rate": 3.044077389760284e-05, + "loss": 0.4894, + "step": 14000 + }, + { + "epoch": 2.15, + "grad_norm": 0.7556993365287781, + "learning_rate": 3.041187296537437e-05, + "loss": 0.4846, + "step": 14500 + }, + { + "epoch": 2.22, + "grad_norm": 0.6561410427093506, + "learning_rate": 3.03829720331459e-05, + "loss": 0.4831, + "step": 15000 + }, + { + "epoch": 2.29, + "grad_norm": 0.6414974331855774, + "learning_rate": 3.0354071100917432e-05, + "loss": 0.4807, + "step": 15500 + }, + { + "epoch": 2.37, + "grad_norm": 0.6632120609283447, + "learning_rate": 3.032517016868896e-05, + "loss": 0.472, + "step": 16000 + }, + { + "epoch": 2.44, + "grad_norm": 0.6413108706474304, + "learning_rate": 3.029626923646049e-05, + "loss": 0.4723, + "step": 16500 + }, + { + "epoch": 2.52, + "grad_norm": 0.6478744149208069, + "learning_rate": 3.0267368304232022e-05, + "loss": 0.4692, + "step": 17000 + }, + { + "epoch": 2.59, + "grad_norm": 0.5901973247528076, + "learning_rate": 3.023846737200355e-05, + "loss": 0.4672, + "step": 17500 + }, + { + "epoch": 2.66, + "grad_norm": 0.5960791707038879, + "learning_rate": 3.020956643977508e-05, + "loss": 0.4649, + "step": 18000 + }, + { + "epoch": 2.74, + "grad_norm": 0.6265193819999695, + "learning_rate": 3.0180665507546612e-05, + "loss": 0.4616, + "step": 18500 + }, + { + "epoch": 2.81, + "grad_norm": 0.6381145119667053, + "learning_rate": 3.0151764575318144e-05, + "loss": 0.4592, + "step": 19000 + }, + { + "epoch": 2.89, + "grad_norm": 0.6370628476142883, + "learning_rate": 3.012286364308967e-05, + "loss": 0.4594, + "step": 19500 + }, + { + "epoch": 2.96, + "grad_norm": 0.5658203363418579, + "learning_rate": 3.0093962710861203e-05, + "loss": 0.4542, + "step": 20000 + }, + { + "epoch": 3.03, + "grad_norm": 0.5123589038848877, + "learning_rate": 3.0065061778632734e-05, + "loss": 0.4555, + "step": 20500 + }, + { + "epoch": 3.11, + "grad_norm": 0.5034360289573669, + "learning_rate": 3.0036160846404262e-05, + "loss": 0.4501, + "step": 21000 + }, + { + "epoch": 3.18, + "grad_norm": 0.5025657415390015, + "learning_rate": 3.0007259914175793e-05, + "loss": 0.4501, + "step": 21500 + }, + { + "epoch": 3.26, + "grad_norm": 0.5448479056358337, + "learning_rate": 2.9978358981947324e-05, + "loss": 0.4481, + "step": 22000 + }, + { + "epoch": 3.33, + "grad_norm": 0.5894014239311218, + "learning_rate": 2.9949458049718852e-05, + "loss": 0.4438, + "step": 22500 + }, + { + "epoch": 3.4, + "grad_norm": 0.653883159160614, + "learning_rate": 2.9920557117490383e-05, + "loss": 0.444, + "step": 23000 + }, + { + "epoch": 3.48, + "grad_norm": 0.4382980167865753, + "learning_rate": 2.9891656185261915e-05, + "loss": 0.4437, + "step": 23500 + }, + { + "epoch": 3.55, + "grad_norm": 0.4639624357223511, + "learning_rate": 2.9862755253033443e-05, + "loss": 0.4398, + "step": 24000 + }, + { + "epoch": 3.63, + "grad_norm": 0.527728796005249, + "learning_rate": 2.9833854320804974e-05, + "loss": 0.4386, + "step": 24500 + }, + { + "epoch": 3.7, + "grad_norm": 0.543736457824707, + "learning_rate": 2.9804953388576505e-05, + "loss": 0.4392, + "step": 25000 + }, + { + "epoch": 3.77, + "grad_norm": 0.5280329585075378, + "learning_rate": 2.9776052456348033e-05, + "loss": 0.4383, + "step": 25500 + }, + { + "epoch": 3.85, + "grad_norm": 0.4563904106616974, + "learning_rate": 2.9747151524119564e-05, + "loss": 0.4371, + "step": 26000 + }, + { + "epoch": 3.92, + "grad_norm": 0.5162687301635742, + "learning_rate": 2.9718250591891095e-05, + "loss": 0.4367, + "step": 26500 + }, + { + "epoch": 4.0, + "grad_norm": 0.4838933050632477, + "learning_rate": 2.9689349659662623e-05, + "loss": 0.4352, + "step": 27000 + }, + { + "epoch": 4.07, + "grad_norm": 0.5301242470741272, + "learning_rate": 2.9660448727434154e-05, + "loss": 0.4319, + "step": 27500 + }, + { + "epoch": 4.14, + "grad_norm": 0.5619557499885559, + "learning_rate": 2.9631547795205686e-05, + "loss": 0.4303, + "step": 28000 + }, + { + "epoch": 4.22, + "grad_norm": 0.4900205433368683, + "learning_rate": 2.9602646862977214e-05, + "loss": 0.4312, + "step": 28500 + }, + { + "epoch": 4.29, + "grad_norm": 0.46870502829551697, + "learning_rate": 2.9573745930748745e-05, + "loss": 0.4302, + "step": 29000 + }, + { + "epoch": 4.37, + "grad_norm": 0.47382786870002747, + "learning_rate": 2.9544844998520273e-05, + "loss": 0.4287, + "step": 29500 + }, + { + "epoch": 4.44, + "grad_norm": 0.5594569444656372, + "learning_rate": 2.95159440662918e-05, + "loss": 0.4284, + "step": 30000 + }, + { + "epoch": 4.51, + "grad_norm": 0.511375367641449, + "learning_rate": 2.9487043134063332e-05, + "loss": 0.4262, + "step": 30500 + }, + { + "epoch": 4.59, + "grad_norm": 0.5069934725761414, + "learning_rate": 2.9458142201834863e-05, + "loss": 0.4247, + "step": 31000 + }, + { + "epoch": 4.66, + "grad_norm": 0.5310338139533997, + "learning_rate": 2.942924126960639e-05, + "loss": 0.4249, + "step": 31500 + }, + { + "epoch": 4.74, + "grad_norm": 0.4728649854660034, + "learning_rate": 2.9400340337377922e-05, + "loss": 0.4225, + "step": 32000 + }, + { + "epoch": 4.81, + "grad_norm": 0.45557233691215515, + "learning_rate": 2.9371439405149453e-05, + "loss": 0.4241, + "step": 32500 + }, + { + "epoch": 4.88, + "grad_norm": 0.4630686938762665, + "learning_rate": 2.934253847292098e-05, + "loss": 0.4212, + "step": 33000 + }, + { + "epoch": 4.96, + "grad_norm": 0.509099543094635, + "learning_rate": 2.9313637540692512e-05, + "loss": 0.4215, + "step": 33500 + }, + { + "epoch": 5.03, + "grad_norm": 0.4747762084007263, + "learning_rate": 2.9284736608464044e-05, + "loss": 0.4203, + "step": 34000 + }, + { + "epoch": 5.11, + "grad_norm": 0.43625542521476746, + "learning_rate": 2.925583567623557e-05, + "loss": 0.4211, + "step": 34500 + }, + { + "epoch": 5.18, + "grad_norm": 0.44176748394966125, + "learning_rate": 2.9226934744007103e-05, + "loss": 0.4209, + "step": 35000 + }, + { + "epoch": 5.25, + "grad_norm": 0.5236085653305054, + "learning_rate": 2.9198033811778634e-05, + "loss": 0.422, + "step": 35500 + }, + { + "epoch": 5.33, + "grad_norm": 0.4237843453884125, + "learning_rate": 2.9169132879550162e-05, + "loss": 0.4163, + "step": 36000 + }, + { + "epoch": 5.4, + "grad_norm": 0.44581139087677, + "learning_rate": 2.9140231947321693e-05, + "loss": 0.4152, + "step": 36500 + }, + { + "epoch": 5.47, + "grad_norm": 0.4488186836242676, + "learning_rate": 2.9111331015093224e-05, + "loss": 0.4175, + "step": 37000 + }, + { + "epoch": 5.55, + "grad_norm": 0.5051326751708984, + "learning_rate": 2.9082430082864752e-05, + "loss": 0.4149, + "step": 37500 + }, + { + "epoch": 5.62, + "grad_norm": 0.4836309850215912, + "learning_rate": 2.9053529150636283e-05, + "loss": 0.4138, + "step": 38000 + }, + { + "epoch": 5.7, + "grad_norm": 0.46710771322250366, + "learning_rate": 2.9024628218407815e-05, + "loss": 0.4125, + "step": 38500 + }, + { + "epoch": 5.77, + "grad_norm": 0.39740118384361267, + "learning_rate": 2.8995727286179342e-05, + "loss": 0.4169, + "step": 39000 + }, + { + "epoch": 5.84, + "grad_norm": 0.4491262435913086, + "learning_rate": 2.8966826353950874e-05, + "loss": 0.4136, + "step": 39500 + }, + { + "epoch": 5.92, + "grad_norm": 0.4240283966064453, + "learning_rate": 2.8937925421722405e-05, + "loss": 0.4143, + "step": 40000 + }, + { + "epoch": 5.99, + "grad_norm": 0.43018123507499695, + "learning_rate": 2.8909024489493933e-05, + "loss": 0.41, + "step": 40500 + }, + { + "epoch": 6.07, + "grad_norm": 0.49115487933158875, + "learning_rate": 2.8880123557265464e-05, + "loss": 0.4086, + "step": 41000 + }, + { + "epoch": 6.14, + "grad_norm": 0.4617484211921692, + "learning_rate": 2.8851222625036995e-05, + "loss": 0.4111, + "step": 41500 + }, + { + "epoch": 6.21, + "grad_norm": 0.4269873797893524, + "learning_rate": 2.8822321692808523e-05, + "loss": 0.4068, + "step": 42000 + }, + { + "epoch": 6.29, + "grad_norm": 0.45183584094047546, + "learning_rate": 2.8793420760580054e-05, + "loss": 0.4104, + "step": 42500 + }, + { + "epoch": 6.36, + "grad_norm": 0.3999849557876587, + "learning_rate": 2.8764519828351586e-05, + "loss": 0.4074, + "step": 43000 + }, + { + "epoch": 6.44, + "grad_norm": 0.3897479772567749, + "learning_rate": 2.8735618896123113e-05, + "loss": 0.4113, + "step": 43500 + }, + { + "epoch": 6.51, + "grad_norm": 0.36687174439430237, + "learning_rate": 2.8706717963894645e-05, + "loss": 0.409, + "step": 44000 + }, + { + "epoch": 6.58, + "grad_norm": 0.41888511180877686, + "learning_rate": 2.8677817031666176e-05, + "loss": 0.4072, + "step": 44500 + }, + { + "epoch": 6.66, + "grad_norm": 0.4102098047733307, + "learning_rate": 2.8648916099437704e-05, + "loss": 0.4081, + "step": 45000 + }, + { + "epoch": 6.73, + "grad_norm": 0.42067912220954895, + "learning_rate": 2.8620015167209235e-05, + "loss": 0.4093, + "step": 45500 + }, + { + "epoch": 6.81, + "grad_norm": 0.45427748560905457, + "learning_rate": 2.8591114234980766e-05, + "loss": 0.4076, + "step": 46000 + }, + { + "epoch": 6.88, + "grad_norm": 0.394954115152359, + "learning_rate": 2.8562213302752294e-05, + "loss": 0.4067, + "step": 46500 + }, + { + "epoch": 6.95, + "grad_norm": 0.42659953236579895, + "learning_rate": 2.8533312370523825e-05, + "loss": 0.4062, + "step": 47000 + }, + { + "epoch": 7.03, + "grad_norm": 0.38056984543800354, + "learning_rate": 2.8504411438295357e-05, + "loss": 0.4061, + "step": 47500 + }, + { + "epoch": 7.1, + "grad_norm": 0.368455708026886, + "learning_rate": 2.8475510506066884e-05, + "loss": 0.4032, + "step": 48000 + }, + { + "epoch": 7.18, + "grad_norm": 0.44540271162986755, + "learning_rate": 2.8446609573838416e-05, + "loss": 0.4054, + "step": 48500 + }, + { + "epoch": 7.25, + "grad_norm": 0.3926877975463867, + "learning_rate": 2.8417708641609943e-05, + "loss": 0.4024, + "step": 49000 + }, + { + "epoch": 7.32, + "grad_norm": 0.4288729727268219, + "learning_rate": 2.838880770938147e-05, + "loss": 0.4013, + "step": 49500 + }, + { + "epoch": 7.4, + "grad_norm": 0.4729566276073456, + "learning_rate": 2.8359906777153003e-05, + "loss": 0.4019, + "step": 50000 + }, + { + "epoch": 7.47, + "grad_norm": 0.46875321865081787, + "learning_rate": 2.8331005844924534e-05, + "loss": 0.4, + "step": 50500 + }, + { + "epoch": 7.55, + "grad_norm": 0.63325035572052, + "learning_rate": 2.830210491269606e-05, + "loss": 0.4008, + "step": 51000 + }, + { + "epoch": 7.62, + "grad_norm": 0.4186055064201355, + "learning_rate": 2.8273203980467593e-05, + "loss": 0.4026, + "step": 51500 + }, + { + "epoch": 7.69, + "grad_norm": 0.3860541880130768, + "learning_rate": 2.8244303048239124e-05, + "loss": 0.4022, + "step": 52000 + }, + { + "epoch": 7.77, + "grad_norm": 0.4552393853664398, + "learning_rate": 2.8215402116010652e-05, + "loss": 0.3979, + "step": 52500 + }, + { + "epoch": 7.84, + "grad_norm": 0.4990374743938446, + "learning_rate": 2.8186501183782183e-05, + "loss": 0.4001, + "step": 53000 + }, + { + "epoch": 7.92, + "grad_norm": 0.46718060970306396, + "learning_rate": 2.8157600251553714e-05, + "loss": 0.4, + "step": 53500 + }, + { + "epoch": 7.99, + "grad_norm": 0.45432960987091064, + "learning_rate": 2.8128699319325242e-05, + "loss": 0.398, + "step": 54000 + }, + { + "epoch": 8.06, + "grad_norm": 0.40666621923446655, + "learning_rate": 2.8099798387096774e-05, + "loss": 0.3996, + "step": 54500 + }, + { + "epoch": 8.14, + "grad_norm": 0.402972936630249, + "learning_rate": 2.8070897454868305e-05, + "loss": 0.3985, + "step": 55000 + }, + { + "epoch": 8.21, + "grad_norm": 0.3767193853855133, + "learning_rate": 2.8041996522639836e-05, + "loss": 0.4, + "step": 55500 + }, + { + "epoch": 8.29, + "grad_norm": 0.40102022886276245, + "learning_rate": 2.8013095590411364e-05, + "loss": 0.3987, + "step": 56000 + }, + { + "epoch": 8.36, + "grad_norm": 0.4435707926750183, + "learning_rate": 2.7984194658182895e-05, + "loss": 0.3976, + "step": 56500 + }, + { + "epoch": 8.43, + "grad_norm": 0.39804941415786743, + "learning_rate": 2.7955293725954426e-05, + "loss": 0.395, + "step": 57000 + }, + { + "epoch": 8.51, + "grad_norm": 0.41703784465789795, + "learning_rate": 2.7926392793725954e-05, + "loss": 0.395, + "step": 57500 + }, + { + "epoch": 8.58, + "grad_norm": 0.4349576234817505, + "learning_rate": 2.7897491861497485e-05, + "loss": 0.3946, + "step": 58000 + }, + { + "epoch": 8.66, + "grad_norm": 0.37204691767692566, + "learning_rate": 2.7868590929269017e-05, + "loss": 0.394, + "step": 58500 + }, + { + "epoch": 8.73, + "grad_norm": 0.42759761214256287, + "learning_rate": 2.7839689997040545e-05, + "loss": 0.3949, + "step": 59000 + }, + { + "epoch": 8.8, + "grad_norm": 0.37754470109939575, + "learning_rate": 2.7810789064812076e-05, + "loss": 0.3939, + "step": 59500 + }, + { + "epoch": 8.88, + "grad_norm": 0.3639107346534729, + "learning_rate": 2.7781888132583607e-05, + "loss": 0.3932, + "step": 60000 + }, + { + "epoch": 8.95, + "grad_norm": 0.37291327118873596, + "learning_rate": 2.7752987200355135e-05, + "loss": 0.394, + "step": 60500 + }, + { + "epoch": 9.03, + "grad_norm": 0.3964773416519165, + "learning_rate": 2.7724086268126666e-05, + "loss": 0.3959, + "step": 61000 + }, + { + "epoch": 9.1, + "grad_norm": 0.4025065004825592, + "learning_rate": 2.7695185335898197e-05, + "loss": 0.3922, + "step": 61500 + }, + { + "epoch": 9.17, + "grad_norm": 0.5499910116195679, + "learning_rate": 2.7666284403669725e-05, + "loss": 0.3893, + "step": 62000 + }, + { + "epoch": 9.25, + "grad_norm": 0.43492835760116577, + "learning_rate": 2.7637383471441256e-05, + "loss": 0.3942, + "step": 62500 + }, + { + "epoch": 9.32, + "grad_norm": 0.38981184363365173, + "learning_rate": 2.7608482539212788e-05, + "loss": 0.3941, + "step": 63000 + }, + { + "epoch": 9.4, + "grad_norm": 0.4508809745311737, + "learning_rate": 2.7579581606984316e-05, + "loss": 0.3922, + "step": 63500 + }, + { + "epoch": 9.47, + "grad_norm": 0.37447696924209595, + "learning_rate": 2.7550680674755847e-05, + "loss": 0.3905, + "step": 64000 + }, + { + "epoch": 9.54, + "grad_norm": 0.40094566345214844, + "learning_rate": 2.7521779742527378e-05, + "loss": 0.3938, + "step": 64500 + }, + { + "epoch": 9.62, + "grad_norm": 0.46564099192619324, + "learning_rate": 2.7492878810298906e-05, + "loss": 0.3914, + "step": 65000 + }, + { + "epoch": 9.69, + "grad_norm": 0.37548139691352844, + "learning_rate": 2.7463977878070437e-05, + "loss": 0.3923, + "step": 65500 + }, + { + "epoch": 9.77, + "grad_norm": 0.39845481514930725, + "learning_rate": 2.743507694584197e-05, + "loss": 0.3904, + "step": 66000 + }, + { + "epoch": 9.84, + "grad_norm": 0.46478548645973206, + "learning_rate": 2.7406176013613496e-05, + "loss": 0.3897, + "step": 66500 + }, + { + "epoch": 9.91, + "grad_norm": 0.5512229204177856, + "learning_rate": 2.7377275081385027e-05, + "loss": 0.3898, + "step": 67000 + }, + { + "epoch": 9.99, + "grad_norm": 0.34783828258514404, + "learning_rate": 2.734837414915656e-05, + "loss": 0.3901, + "step": 67500 + }, + { + "epoch": 10.06, + "grad_norm": 0.4403396546840668, + "learning_rate": 2.7319473216928083e-05, + "loss": 0.388, + "step": 68000 + }, + { + "epoch": 10.14, + "grad_norm": 0.37262195348739624, + "learning_rate": 2.7290572284699614e-05, + "loss": 0.3869, + "step": 68500 + }, + { + "epoch": 10.21, + "grad_norm": 0.38222742080688477, + "learning_rate": 2.7261671352471146e-05, + "loss": 0.3901, + "step": 69000 + }, + { + "epoch": 10.28, + "grad_norm": 0.614713191986084, + "learning_rate": 2.7232770420242673e-05, + "loss": 0.3886, + "step": 69500 + }, + { + "epoch": 10.36, + "grad_norm": 0.4252707362174988, + "learning_rate": 2.7203869488014205e-05, + "loss": 0.3874, + "step": 70000 + }, + { + "epoch": 10.43, + "grad_norm": 0.3792737126350403, + "learning_rate": 2.7174968555785736e-05, + "loss": 0.3855, + "step": 70500 + }, + { + "epoch": 10.51, + "grad_norm": 0.40962672233581543, + "learning_rate": 2.7146067623557264e-05, + "loss": 0.3875, + "step": 71000 + }, + { + "epoch": 10.58, + "grad_norm": 0.38987433910369873, + "learning_rate": 2.7117166691328795e-05, + "loss": 0.3855, + "step": 71500 + }, + { + "epoch": 10.65, + "grad_norm": 0.407105028629303, + "learning_rate": 2.7088265759100326e-05, + "loss": 0.3857, + "step": 72000 + }, + { + "epoch": 10.73, + "grad_norm": 0.3527330160140991, + "learning_rate": 2.7059364826871854e-05, + "loss": 0.3869, + "step": 72500 + }, + { + "epoch": 10.8, + "grad_norm": 0.3859241306781769, + "learning_rate": 2.7030463894643385e-05, + "loss": 0.3872, + "step": 73000 + }, + { + "epoch": 10.88, + "grad_norm": 0.3989656865596771, + "learning_rate": 2.7001562962414917e-05, + "loss": 0.3855, + "step": 73500 + }, + { + "epoch": 10.95, + "grad_norm": 0.4163249731063843, + "learning_rate": 2.6972662030186444e-05, + "loss": 0.3855, + "step": 74000 + }, + { + "epoch": 11.02, + "grad_norm": 0.39577716588974, + "learning_rate": 2.6943761097957976e-05, + "loss": 0.3848, + "step": 74500 + }, + { + "epoch": 11.1, + "grad_norm": 0.3792816400527954, + "learning_rate": 2.6914860165729507e-05, + "loss": 0.3864, + "step": 75000 + }, + { + "epoch": 11.17, + "grad_norm": 0.3979376554489136, + "learning_rate": 2.6885959233501035e-05, + "loss": 0.3847, + "step": 75500 + }, + { + "epoch": 11.25, + "grad_norm": 0.44446560740470886, + "learning_rate": 2.6857058301272566e-05, + "loss": 0.3851, + "step": 76000 + }, + { + "epoch": 11.32, + "grad_norm": 0.3826451599597931, + "learning_rate": 2.6828157369044097e-05, + "loss": 0.385, + "step": 76500 + }, + { + "epoch": 11.39, + "grad_norm": 0.38595423102378845, + "learning_rate": 2.6799256436815625e-05, + "loss": 0.3848, + "step": 77000 + }, + { + "epoch": 11.47, + "grad_norm": 0.4047674238681793, + "learning_rate": 2.6770355504587156e-05, + "loss": 0.384, + "step": 77500 + }, + { + "epoch": 11.54, + "grad_norm": 0.3704206347465515, + "learning_rate": 2.6741454572358688e-05, + "loss": 0.3846, + "step": 78000 + }, + { + "epoch": 11.62, + "grad_norm": 0.42468497157096863, + "learning_rate": 2.6712553640130215e-05, + "loss": 0.3847, + "step": 78500 + }, + { + "epoch": 11.69, + "grad_norm": 0.43300580978393555, + "learning_rate": 2.6683652707901747e-05, + "loss": 0.3783, + "step": 79000 + }, + { + "epoch": 11.76, + "grad_norm": 0.3680213689804077, + "learning_rate": 2.6654751775673278e-05, + "loss": 0.3811, + "step": 79500 + }, + { + "epoch": 11.84, + "grad_norm": 0.3971569240093231, + "learning_rate": 2.6625850843444806e-05, + "loss": 0.3816, + "step": 80000 + }, + { + "epoch": 11.91, + "grad_norm": 0.3454134464263916, + "learning_rate": 2.6596949911216337e-05, + "loss": 0.38, + "step": 80500 + }, + { + "epoch": 11.99, + "grad_norm": 0.38850536942481995, + "learning_rate": 2.6568048978987868e-05, + "loss": 0.3806, + "step": 81000 + }, + { + "epoch": 12.06, + "grad_norm": 0.41783374547958374, + "learning_rate": 2.6539148046759396e-05, + "loss": 0.3805, + "step": 81500 + }, + { + "epoch": 12.13, + "grad_norm": 0.37427714467048645, + "learning_rate": 2.6510247114530927e-05, + "loss": 0.3844, + "step": 82000 + }, + { + "epoch": 12.21, + "grad_norm": 0.3917700946331024, + "learning_rate": 2.648134618230246e-05, + "loss": 0.3818, + "step": 82500 + }, + { + "epoch": 12.28, + "grad_norm": 0.36409491300582886, + "learning_rate": 2.6452445250073986e-05, + "loss": 0.3815, + "step": 83000 + }, + { + "epoch": 12.36, + "grad_norm": 0.4320700764656067, + "learning_rate": 2.6423544317845518e-05, + "loss": 0.3813, + "step": 83500 + }, + { + "epoch": 12.43, + "grad_norm": 0.3927746117115021, + "learning_rate": 2.639464338561705e-05, + "loss": 0.3827, + "step": 84000 + }, + { + "epoch": 12.5, + "grad_norm": 0.3693206310272217, + "learning_rate": 2.6365742453388577e-05, + "loss": 0.3803, + "step": 84500 + }, + { + "epoch": 12.58, + "grad_norm": 0.4206922948360443, + "learning_rate": 2.6336841521160108e-05, + "loss": 0.3794, + "step": 85000 + }, + { + "epoch": 12.65, + "grad_norm": 0.35786914825439453, + "learning_rate": 2.630794058893164e-05, + "loss": 0.3823, + "step": 85500 + }, + { + "epoch": 12.73, + "grad_norm": 0.4055446982383728, + "learning_rate": 2.6279039656703167e-05, + "loss": 0.3797, + "step": 86000 + }, + { + "epoch": 12.8, + "grad_norm": 0.473630428314209, + "learning_rate": 2.62501387244747e-05, + "loss": 0.3776, + "step": 86500 + }, + { + "epoch": 12.87, + "grad_norm": 0.36061763763427734, + "learning_rate": 2.622123779224623e-05, + "loss": 0.3781, + "step": 87000 + }, + { + "epoch": 12.95, + "grad_norm": 0.4378969371318817, + "learning_rate": 2.6192336860017754e-05, + "loss": 0.3778, + "step": 87500 + }, + { + "epoch": 13.02, + "grad_norm": 0.3772602379322052, + "learning_rate": 2.6163435927789285e-05, + "loss": 0.3776, + "step": 88000 + }, + { + "epoch": 13.1, + "grad_norm": 0.42682790756225586, + "learning_rate": 2.6134534995560817e-05, + "loss": 0.3802, + "step": 88500 + }, + { + "epoch": 13.17, + "grad_norm": 0.38328275084495544, + "learning_rate": 2.6105634063332344e-05, + "loss": 0.3813, + "step": 89000 + }, + { + "epoch": 13.24, + "grad_norm": 0.4136464595794678, + "learning_rate": 2.6076733131103876e-05, + "loss": 0.3773, + "step": 89500 + }, + { + "epoch": 13.32, + "grad_norm": 0.39002037048339844, + "learning_rate": 2.6047832198875407e-05, + "loss": 0.3767, + "step": 90000 + }, + { + "epoch": 13.39, + "grad_norm": 0.4823383092880249, + "learning_rate": 2.6018931266646935e-05, + "loss": 0.3789, + "step": 90500 + }, + { + "epoch": 13.47, + "grad_norm": 0.3532434403896332, + "learning_rate": 2.5990030334418466e-05, + "loss": 0.3755, + "step": 91000 + }, + { + "epoch": 13.54, + "grad_norm": 0.3406650424003601, + "learning_rate": 2.5961129402189997e-05, + "loss": 0.3782, + "step": 91500 + }, + { + "epoch": 13.61, + "grad_norm": 0.42174699902534485, + "learning_rate": 2.5932228469961525e-05, + "loss": 0.3792, + "step": 92000 + }, + { + "epoch": 13.69, + "grad_norm": 0.4112718999385834, + "learning_rate": 2.5903327537733056e-05, + "loss": 0.3758, + "step": 92500 + }, + { + "epoch": 13.76, + "grad_norm": 0.39170435070991516, + "learning_rate": 2.5874426605504588e-05, + "loss": 0.3772, + "step": 93000 + }, + { + "epoch": 13.84, + "grad_norm": 0.35669615864753723, + "learning_rate": 2.584552567327612e-05, + "loss": 0.376, + "step": 93500 + }, + { + "epoch": 13.91, + "grad_norm": 0.36161208152770996, + "learning_rate": 2.5816624741047647e-05, + "loss": 0.3759, + "step": 94000 + }, + { + "epoch": 13.98, + "grad_norm": 0.3548930883407593, + "learning_rate": 2.5787723808819178e-05, + "loss": 0.3772, + "step": 94500 + }, + { + "epoch": 14.06, + "grad_norm": 0.3934749662876129, + "learning_rate": 2.575882287659071e-05, + "loss": 0.3774, + "step": 95000 + }, + { + "epoch": 14.13, + "grad_norm": 0.3642575442790985, + "learning_rate": 2.5729921944362237e-05, + "loss": 0.3756, + "step": 95500 + }, + { + "epoch": 14.21, + "grad_norm": 0.34236952662467957, + "learning_rate": 2.5701021012133768e-05, + "loss": 0.3758, + "step": 96000 + }, + { + "epoch": 14.28, + "grad_norm": 0.40388983488082886, + "learning_rate": 2.56721200799053e-05, + "loss": 0.3749, + "step": 96500 + }, + { + "epoch": 14.35, + "grad_norm": 0.432076632976532, + "learning_rate": 2.5643219147676827e-05, + "loss": 0.3756, + "step": 97000 + }, + { + "epoch": 14.43, + "grad_norm": 0.3947947025299072, + "learning_rate": 2.561431821544836e-05, + "loss": 0.3792, + "step": 97500 + }, + { + "epoch": 14.5, + "grad_norm": 0.3974926173686981, + "learning_rate": 2.558541728321989e-05, + "loss": 0.3741, + "step": 98000 + }, + { + "epoch": 14.58, + "grad_norm": 0.3732788860797882, + "learning_rate": 2.5556516350991418e-05, + "loss": 0.3741, + "step": 98500 + }, + { + "epoch": 14.65, + "grad_norm": 0.35293856263160706, + "learning_rate": 2.552761541876295e-05, + "loss": 0.375, + "step": 99000 + }, + { + "epoch": 14.72, + "grad_norm": 0.38685211539268494, + "learning_rate": 2.549871448653448e-05, + "loss": 0.3711, + "step": 99500 + }, + { + "epoch": 14.8, + "grad_norm": 0.40676021575927734, + "learning_rate": 2.5469813554306008e-05, + "loss": 0.3786, + "step": 100000 + }, + { + "epoch": 14.87, + "grad_norm": 0.40946874022483826, + "learning_rate": 2.544091262207754e-05, + "loss": 0.3749, + "step": 100500 + }, + { + "epoch": 14.95, + "grad_norm": 0.35838282108306885, + "learning_rate": 2.541201168984907e-05, + "loss": 0.3733, + "step": 101000 + }, + { + "epoch": 15.02, + "grad_norm": 0.4110182821750641, + "learning_rate": 2.5383110757620598e-05, + "loss": 0.3712, + "step": 101500 + }, + { + "epoch": 15.09, + "grad_norm": 0.35349026322364807, + "learning_rate": 2.535420982539213e-05, + "loss": 0.3719, + "step": 102000 + }, + { + "epoch": 15.17, + "grad_norm": 0.38222944736480713, + "learning_rate": 2.532530889316366e-05, + "loss": 0.3737, + "step": 102500 + }, + { + "epoch": 15.24, + "grad_norm": 0.47066986560821533, + "learning_rate": 2.529640796093519e-05, + "loss": 0.3728, + "step": 103000 + }, + { + "epoch": 15.32, + "grad_norm": 0.3949437439441681, + "learning_rate": 2.526750702870672e-05, + "loss": 0.3752, + "step": 103500 + }, + { + "epoch": 15.39, + "grad_norm": 0.42243218421936035, + "learning_rate": 2.523860609647825e-05, + "loss": 0.3729, + "step": 104000 + }, + { + "epoch": 15.46, + "grad_norm": 0.4031197726726532, + "learning_rate": 2.520970516424978e-05, + "loss": 0.3704, + "step": 104500 + }, + { + "epoch": 15.54, + "grad_norm": 0.4034245014190674, + "learning_rate": 2.518080423202131e-05, + "loss": 0.3713, + "step": 105000 + }, + { + "epoch": 15.61, + "grad_norm": 0.4237426817417145, + "learning_rate": 2.515190329979284e-05, + "loss": 0.3719, + "step": 105500 + }, + { + "epoch": 15.69, + "grad_norm": 0.41453346610069275, + "learning_rate": 2.512300236756437e-05, + "loss": 0.3742, + "step": 106000 + }, + { + "epoch": 15.76, + "grad_norm": 0.4238516390323639, + "learning_rate": 2.50941014353359e-05, + "loss": 0.3734, + "step": 106500 + }, + { + "epoch": 15.83, + "grad_norm": 0.3762485384941101, + "learning_rate": 2.506520050310743e-05, + "loss": 0.3736, + "step": 107000 + }, + { + "epoch": 15.91, + "grad_norm": 0.36851537227630615, + "learning_rate": 2.5036299570878956e-05, + "loss": 0.3736, + "step": 107500 + }, + { + "epoch": 15.98, + "grad_norm": 0.36127322912216187, + "learning_rate": 2.5007398638650487e-05, + "loss": 0.3716, + "step": 108000 + }, + { + "epoch": 16.06, + "grad_norm": 0.4159682095050812, + "learning_rate": 2.497849770642202e-05, + "loss": 0.3695, + "step": 108500 + }, + { + "epoch": 16.13, + "grad_norm": 0.40441277623176575, + "learning_rate": 2.4949596774193547e-05, + "loss": 0.3702, + "step": 109000 + }, + { + "epoch": 16.2, + "grad_norm": 0.3531157076358795, + "learning_rate": 2.4920695841965078e-05, + "loss": 0.3751, + "step": 109500 + }, + { + "epoch": 16.28, + "grad_norm": 0.40636512637138367, + "learning_rate": 2.489179490973661e-05, + "loss": 0.3692, + "step": 110000 + }, + { + "epoch": 16.35, + "grad_norm": 0.3990442156791687, + "learning_rate": 2.4862893977508137e-05, + "loss": 0.3697, + "step": 110500 + }, + { + "epoch": 16.42, + "grad_norm": 0.39944297075271606, + "learning_rate": 2.4833993045279668e-05, + "loss": 0.3683, + "step": 111000 + }, + { + "epoch": 16.5, + "grad_norm": 0.3601832985877991, + "learning_rate": 2.48050921130512e-05, + "loss": 0.3699, + "step": 111500 + }, + { + "epoch": 16.57, + "grad_norm": 0.40571844577789307, + "learning_rate": 2.4776191180822727e-05, + "loss": 0.3689, + "step": 112000 + }, + { + "epoch": 16.65, + "grad_norm": 0.40363049507141113, + "learning_rate": 2.474729024859426e-05, + "loss": 0.3708, + "step": 112500 + }, + { + "epoch": 16.72, + "grad_norm": 0.3990092873573303, + "learning_rate": 2.471838931636579e-05, + "loss": 0.3706, + "step": 113000 + }, + { + "epoch": 16.79, + "grad_norm": 0.36532795429229736, + "learning_rate": 2.4689488384137317e-05, + "loss": 0.3721, + "step": 113500 + }, + { + "epoch": 16.87, + "grad_norm": 0.44025781750679016, + "learning_rate": 2.466058745190885e-05, + "loss": 0.3733, + "step": 114000 + }, + { + "epoch": 16.94, + "grad_norm": 0.3850398659706116, + "learning_rate": 2.463168651968038e-05, + "loss": 0.3693, + "step": 114500 + }, + { + "epoch": 17.02, + "grad_norm": 0.38132092356681824, + "learning_rate": 2.4602785587451908e-05, + "loss": 0.3703, + "step": 115000 + }, + { + "epoch": 17.09, + "grad_norm": 0.39702102541923523, + "learning_rate": 2.457388465522344e-05, + "loss": 0.3706, + "step": 115500 + }, + { + "epoch": 17.16, + "grad_norm": 0.35070690512657166, + "learning_rate": 2.454498372299497e-05, + "loss": 0.3706, + "step": 116000 + }, + { + "epoch": 17.24, + "grad_norm": 0.42235612869262695, + "learning_rate": 2.4516082790766498e-05, + "loss": 0.3703, + "step": 116500 + }, + { + "epoch": 17.31, + "grad_norm": 0.35915622115135193, + "learning_rate": 2.448718185853803e-05, + "loss": 0.3693, + "step": 117000 + }, + { + "epoch": 17.39, + "grad_norm": 0.34371674060821533, + "learning_rate": 2.445828092630956e-05, + "loss": 0.3686, + "step": 117500 + }, + { + "epoch": 17.46, + "grad_norm": 0.48452600836753845, + "learning_rate": 2.442937999408109e-05, + "loss": 0.3696, + "step": 118000 + }, + { + "epoch": 17.53, + "grad_norm": 0.39582401514053345, + "learning_rate": 2.440047906185262e-05, + "loss": 0.3698, + "step": 118500 + }, + { + "epoch": 17.61, + "grad_norm": 0.43614286184310913, + "learning_rate": 2.437157812962415e-05, + "loss": 0.3688, + "step": 119000 + }, + { + "epoch": 17.68, + "grad_norm": 0.3942769765853882, + "learning_rate": 2.434267719739568e-05, + "loss": 0.368, + "step": 119500 + }, + { + "epoch": 17.76, + "grad_norm": 0.34341031312942505, + "learning_rate": 2.431377626516721e-05, + "loss": 0.3665, + "step": 120000 + }, + { + "epoch": 17.83, + "grad_norm": 0.3661987781524658, + "learning_rate": 2.428487533293874e-05, + "loss": 0.3664, + "step": 120500 + }, + { + "epoch": 17.9, + "grad_norm": 0.32992053031921387, + "learning_rate": 2.425597440071027e-05, + "loss": 0.3683, + "step": 121000 + }, + { + "epoch": 17.98, + "grad_norm": 0.40151405334472656, + "learning_rate": 2.42270734684818e-05, + "loss": 0.3677, + "step": 121500 + }, + { + "epoch": 18.05, + "grad_norm": 0.3343447148799896, + "learning_rate": 2.419817253625333e-05, + "loss": 0.3679, + "step": 122000 + }, + { + "epoch": 18.13, + "grad_norm": 0.3798489272594452, + "learning_rate": 2.416927160402486e-05, + "loss": 0.3687, + "step": 122500 + }, + { + "epoch": 18.2, + "grad_norm": 0.3244016766548157, + "learning_rate": 2.414037067179639e-05, + "loss": 0.3643, + "step": 123000 + }, + { + "epoch": 18.27, + "grad_norm": 0.4036329984664917, + "learning_rate": 2.4111469739567922e-05, + "loss": 0.3703, + "step": 123500 + }, + { + "epoch": 18.35, + "grad_norm": 0.3875889778137207, + "learning_rate": 2.408256880733945e-05, + "loss": 0.3661, + "step": 124000 + }, + { + "epoch": 18.42, + "grad_norm": 0.3607046902179718, + "learning_rate": 2.405366787511098e-05, + "loss": 0.3646, + "step": 124500 + }, + { + "epoch": 18.5, + "grad_norm": 0.33054810762405396, + "learning_rate": 2.4024766942882512e-05, + "loss": 0.3675, + "step": 125000 + }, + { + "epoch": 18.57, + "grad_norm": 0.4091895520687103, + "learning_rate": 2.399586601065404e-05, + "loss": 0.3652, + "step": 125500 + }, + { + "epoch": 18.64, + "grad_norm": 0.37667781114578247, + "learning_rate": 2.396696507842557e-05, + "loss": 0.3691, + "step": 126000 + }, + { + "epoch": 18.72, + "grad_norm": 0.3683590888977051, + "learning_rate": 2.39380641461971e-05, + "loss": 0.3675, + "step": 126500 + }, + { + "epoch": 18.79, + "grad_norm": 0.397073894739151, + "learning_rate": 2.3909163213968627e-05, + "loss": 0.3637, + "step": 127000 + }, + { + "epoch": 18.87, + "grad_norm": 0.3522073030471802, + "learning_rate": 2.3880262281740158e-05, + "loss": 0.3676, + "step": 127500 + }, + { + "epoch": 18.94, + "grad_norm": 0.3389582633972168, + "learning_rate": 2.385136134951169e-05, + "loss": 0.3676, + "step": 128000 + }, + { + "epoch": 19.01, + "grad_norm": 0.3726537823677063, + "learning_rate": 2.3822460417283217e-05, + "loss": 0.3674, + "step": 128500 + }, + { + "epoch": 19.09, + "grad_norm": 0.3774533271789551, + "learning_rate": 2.379355948505475e-05, + "loss": 0.3669, + "step": 129000 + }, + { + "epoch": 19.16, + "grad_norm": 0.45427048206329346, + "learning_rate": 2.376465855282628e-05, + "loss": 0.3652, + "step": 129500 + }, + { + "epoch": 19.24, + "grad_norm": 0.39148712158203125, + "learning_rate": 2.373575762059781e-05, + "loss": 0.3632, + "step": 130000 + }, + { + "epoch": 19.31, + "grad_norm": 0.3727419078350067, + "learning_rate": 2.370685668836934e-05, + "loss": 0.3674, + "step": 130500 + }, + { + "epoch": 19.38, + "grad_norm": 0.3490041196346283, + "learning_rate": 2.367795575614087e-05, + "loss": 0.3685, + "step": 131000 + }, + { + "epoch": 19.46, + "grad_norm": 0.33863797783851624, + "learning_rate": 2.36490548239124e-05, + "loss": 0.3659, + "step": 131500 + }, + { + "epoch": 19.53, + "grad_norm": 0.41820865869522095, + "learning_rate": 2.362015389168393e-05, + "loss": 0.3647, + "step": 132000 + }, + { + "epoch": 19.61, + "grad_norm": 0.31935831904411316, + "learning_rate": 2.359125295945546e-05, + "loss": 0.3657, + "step": 132500 + }, + { + "epoch": 19.68, + "grad_norm": 0.39523938298225403, + "learning_rate": 2.3562352027226992e-05, + "loss": 0.3643, + "step": 133000 + }, + { + "epoch": 19.75, + "grad_norm": 0.3851146996021271, + "learning_rate": 2.353345109499852e-05, + "loss": 0.3624, + "step": 133500 + }, + { + "epoch": 19.83, + "grad_norm": 0.3778953552246094, + "learning_rate": 2.350455016277005e-05, + "loss": 0.3658, + "step": 134000 + }, + { + "epoch": 19.9, + "grad_norm": 0.3673354387283325, + "learning_rate": 2.3475649230541582e-05, + "loss": 0.3645, + "step": 134500 + }, + { + "epoch": 19.98, + "grad_norm": 0.40675076842308044, + "learning_rate": 2.344674829831311e-05, + "loss": 0.3624, + "step": 135000 + }, + { + "epoch": 20.05, + "grad_norm": 0.32396331429481506, + "learning_rate": 2.341784736608464e-05, + "loss": 0.3608, + "step": 135500 + }, + { + "epoch": 20.12, + "grad_norm": 0.4665846526622772, + "learning_rate": 2.3388946433856172e-05, + "loss": 0.3654, + "step": 136000 + }, + { + "epoch": 20.2, + "grad_norm": 0.3753814697265625, + "learning_rate": 2.33600455016277e-05, + "loss": 0.3611, + "step": 136500 + }, + { + "epoch": 20.27, + "grad_norm": 0.39572277665138245, + "learning_rate": 2.333114456939923e-05, + "loss": 0.363, + "step": 137000 + }, + { + "epoch": 20.35, + "grad_norm": 0.36638927459716797, + "learning_rate": 2.3302243637170763e-05, + "loss": 0.3625, + "step": 137500 + }, + { + "epoch": 20.42, + "grad_norm": 0.40173882246017456, + "learning_rate": 2.327334270494229e-05, + "loss": 0.3645, + "step": 138000 + }, + { + "epoch": 20.49, + "grad_norm": 0.34684666991233826, + "learning_rate": 2.3244441772713822e-05, + "loss": 0.3636, + "step": 138500 + }, + { + "epoch": 20.57, + "grad_norm": 0.3533775806427002, + "learning_rate": 2.3215540840485353e-05, + "loss": 0.3624, + "step": 139000 + }, + { + "epoch": 20.64, + "grad_norm": 0.36431315541267395, + "learning_rate": 2.318663990825688e-05, + "loss": 0.3649, + "step": 139500 + }, + { + "epoch": 20.72, + "grad_norm": 0.3629516363143921, + "learning_rate": 2.3157738976028412e-05, + "loss": 0.3646, + "step": 140000 + }, + { + "epoch": 20.79, + "grad_norm": 0.383987158536911, + "learning_rate": 2.3128838043799943e-05, + "loss": 0.3623, + "step": 140500 + }, + { + "epoch": 20.86, + "grad_norm": 0.38170096278190613, + "learning_rate": 2.309993711157147e-05, + "loss": 0.3629, + "step": 141000 + }, + { + "epoch": 20.94, + "grad_norm": 0.36627018451690674, + "learning_rate": 2.3071036179343003e-05, + "loss": 0.362, + "step": 141500 + }, + { + "epoch": 21.01, + "grad_norm": 0.37587088346481323, + "learning_rate": 2.3042135247114534e-05, + "loss": 0.3624, + "step": 142000 + }, + { + "epoch": 21.09, + "grad_norm": 0.3648183047771454, + "learning_rate": 2.301323431488606e-05, + "loss": 0.3633, + "step": 142500 + }, + { + "epoch": 21.16, + "grad_norm": 0.3818926513195038, + "learning_rate": 2.2984333382657593e-05, + "loss": 0.3627, + "step": 143000 + }, + { + "epoch": 21.23, + "grad_norm": 0.38963159918785095, + "learning_rate": 2.2955432450429124e-05, + "loss": 0.3619, + "step": 143500 + }, + { + "epoch": 21.31, + "grad_norm": 0.4655527174472809, + "learning_rate": 2.2926531518200652e-05, + "loss": 0.3616, + "step": 144000 + }, + { + "epoch": 21.38, + "grad_norm": 0.36112433671951294, + "learning_rate": 2.2897630585972183e-05, + "loss": 0.3597, + "step": 144500 + }, + { + "epoch": 21.46, + "grad_norm": 0.3661918044090271, + "learning_rate": 2.2868729653743714e-05, + "loss": 0.3626, + "step": 145000 + }, + { + "epoch": 21.53, + "grad_norm": 0.340862512588501, + "learning_rate": 2.2839828721515242e-05, + "loss": 0.3637, + "step": 145500 + }, + { + "epoch": 21.6, + "grad_norm": 0.3376435339450836, + "learning_rate": 2.281092778928677e-05, + "loss": 0.3622, + "step": 146000 + }, + { + "epoch": 21.68, + "grad_norm": 0.4211704730987549, + "learning_rate": 2.27820268570583e-05, + "loss": 0.3593, + "step": 146500 + }, + { + "epoch": 21.75, + "grad_norm": 0.3886992037296295, + "learning_rate": 2.275312592482983e-05, + "loss": 0.3614, + "step": 147000 + }, + { + "epoch": 21.83, + "grad_norm": 0.34686926007270813, + "learning_rate": 2.272422499260136e-05, + "loss": 0.3615, + "step": 147500 + }, + { + "epoch": 21.9, + "grad_norm": 0.4066803455352783, + "learning_rate": 2.269532406037289e-05, + "loss": 0.3597, + "step": 148000 + }, + { + "epoch": 21.97, + "grad_norm": 0.37257149815559387, + "learning_rate": 2.266642312814442e-05, + "loss": 0.364, + "step": 148500 + }, + { + "epoch": 22.05, + "grad_norm": 0.3770715892314911, + "learning_rate": 2.263752219591595e-05, + "loss": 0.359, + "step": 149000 + }, + { + "epoch": 22.12, + "grad_norm": 0.3486000597476959, + "learning_rate": 2.2608621263687482e-05, + "loss": 0.3616, + "step": 149500 + }, + { + "epoch": 22.2, + "grad_norm": 0.4026411771774292, + "learning_rate": 2.257972033145901e-05, + "loss": 0.3615, + "step": 150000 + }, + { + "epoch": 22.27, + "grad_norm": 0.4740307629108429, + "learning_rate": 2.255081939923054e-05, + "loss": 0.359, + "step": 150500 + }, + { + "epoch": 22.34, + "grad_norm": 0.5692958235740662, + "learning_rate": 2.2521918467002072e-05, + "loss": 0.3614, + "step": 151000 + }, + { + "epoch": 22.42, + "grad_norm": 0.4546482264995575, + "learning_rate": 2.24930175347736e-05, + "loss": 0.3609, + "step": 151500 + }, + { + "epoch": 22.49, + "grad_norm": 0.3762848675251007, + "learning_rate": 2.246411660254513e-05, + "loss": 0.3612, + "step": 152000 + }, + { + "epoch": 22.57, + "grad_norm": 0.3631458282470703, + "learning_rate": 2.2435215670316663e-05, + "loss": 0.3613, + "step": 152500 + }, + { + "epoch": 22.64, + "grad_norm": 0.3560335040092468, + "learning_rate": 2.240631473808819e-05, + "loss": 0.3601, + "step": 153000 + }, + { + "epoch": 22.71, + "grad_norm": 0.3739969730377197, + "learning_rate": 2.2377413805859722e-05, + "loss": 0.3586, + "step": 153500 + }, + { + "epoch": 22.79, + "grad_norm": 0.3538212478160858, + "learning_rate": 2.2348512873631253e-05, + "loss": 0.3619, + "step": 154000 + }, + { + "epoch": 22.86, + "grad_norm": 0.33162832260131836, + "learning_rate": 2.231961194140278e-05, + "loss": 0.3566, + "step": 154500 + }, + { + "epoch": 22.94, + "grad_norm": 0.3731963634490967, + "learning_rate": 2.2290711009174312e-05, + "loss": 0.3617, + "step": 155000 + }, + { + "epoch": 23.01, + "grad_norm": 0.3658241033554077, + "learning_rate": 2.2261810076945843e-05, + "loss": 0.3621, + "step": 155500 + }, + { + "epoch": 23.08, + "grad_norm": 0.3295973837375641, + "learning_rate": 2.223290914471737e-05, + "loss": 0.3592, + "step": 156000 + }, + { + "epoch": 23.16, + "grad_norm": 0.3476080894470215, + "learning_rate": 2.2204008212488902e-05, + "loss": 0.358, + "step": 156500 + }, + { + "epoch": 23.23, + "grad_norm": 0.4091341197490692, + "learning_rate": 2.2175107280260434e-05, + "loss": 0.3588, + "step": 157000 + }, + { + "epoch": 23.31, + "grad_norm": 0.4243708848953247, + "learning_rate": 2.214620634803196e-05, + "loss": 0.3581, + "step": 157500 + }, + { + "epoch": 23.38, + "grad_norm": 0.4200844168663025, + "learning_rate": 2.2117305415803493e-05, + "loss": 0.3598, + "step": 158000 + }, + { + "epoch": 23.45, + "grad_norm": 0.4001297652721405, + "learning_rate": 2.2088404483575024e-05, + "loss": 0.3608, + "step": 158500 + }, + { + "epoch": 23.53, + "grad_norm": 0.44927239418029785, + "learning_rate": 2.2059503551346552e-05, + "loss": 0.3572, + "step": 159000 + }, + { + "epoch": 23.6, + "grad_norm": 0.3438055217266083, + "learning_rate": 2.2030602619118083e-05, + "loss": 0.3603, + "step": 159500 + }, + { + "epoch": 23.68, + "grad_norm": 0.395907461643219, + "learning_rate": 2.2001701686889614e-05, + "loss": 0.3583, + "step": 160000 + }, + { + "epoch": 23.75, + "grad_norm": 0.3705403506755829, + "learning_rate": 2.1972800754661142e-05, + "loss": 0.3606, + "step": 160500 + }, + { + "epoch": 23.82, + "grad_norm": 0.34676727652549744, + "learning_rate": 2.1943899822432673e-05, + "loss": 0.3589, + "step": 161000 + }, + { + "epoch": 23.9, + "grad_norm": 0.3712906837463379, + "learning_rate": 2.1914998890204205e-05, + "loss": 0.3597, + "step": 161500 + }, + { + "epoch": 23.97, + "grad_norm": 0.3136620819568634, + "learning_rate": 2.1886097957975732e-05, + "loss": 0.3571, + "step": 162000 + }, + { + "epoch": 24.05, + "grad_norm": 0.3915017545223236, + "learning_rate": 2.1857197025747264e-05, + "loss": 0.3595, + "step": 162500 + }, + { + "epoch": 24.12, + "grad_norm": 0.3403623700141907, + "learning_rate": 2.1828296093518795e-05, + "loss": 0.3608, + "step": 163000 + }, + { + "epoch": 24.19, + "grad_norm": 0.3841993510723114, + "learning_rate": 2.1799395161290323e-05, + "loss": 0.3596, + "step": 163500 + }, + { + "epoch": 24.27, + "grad_norm": 0.42894381284713745, + "learning_rate": 2.1770494229061854e-05, + "loss": 0.3571, + "step": 164000 + }, + { + "epoch": 24.34, + "grad_norm": 0.4211946129798889, + "learning_rate": 2.1741593296833385e-05, + "loss": 0.3552, + "step": 164500 + }, + { + "epoch": 24.42, + "grad_norm": 0.35293659567832947, + "learning_rate": 2.1712692364604917e-05, + "loss": 0.359, + "step": 165000 + }, + { + "epoch": 24.49, + "grad_norm": 0.3743543326854706, + "learning_rate": 2.168379143237644e-05, + "loss": 0.3556, + "step": 165500 + }, + { + "epoch": 24.56, + "grad_norm": 0.4101512134075165, + "learning_rate": 2.1654890500147972e-05, + "loss": 0.3561, + "step": 166000 + }, + { + "epoch": 24.64, + "grad_norm": 0.34685665369033813, + "learning_rate": 2.1625989567919503e-05, + "loss": 0.3596, + "step": 166500 + }, + { + "epoch": 24.71, + "grad_norm": 0.3677741289138794, + "learning_rate": 2.159708863569103e-05, + "loss": 0.3572, + "step": 167000 + }, + { + "epoch": 24.79, + "grad_norm": 0.3836405575275421, + "learning_rate": 2.1568187703462563e-05, + "loss": 0.36, + "step": 167500 + }, + { + "epoch": 24.86, + "grad_norm": 0.3649022579193115, + "learning_rate": 2.1539286771234094e-05, + "loss": 0.3558, + "step": 168000 + }, + { + "epoch": 24.93, + "grad_norm": 0.3566337525844574, + "learning_rate": 2.151038583900562e-05, + "loss": 0.3567, + "step": 168500 + }, + { + "epoch": 25.01, + "grad_norm": 0.4024845361709595, + "learning_rate": 2.1481484906777153e-05, + "loss": 0.3555, + "step": 169000 + }, + { + "epoch": 25.08, + "grad_norm": 0.3865801692008972, + "learning_rate": 2.1452583974548684e-05, + "loss": 0.3587, + "step": 169500 + }, + { + "epoch": 25.16, + "grad_norm": 0.3753124475479126, + "learning_rate": 2.1423683042320212e-05, + "loss": 0.358, + "step": 170000 + }, + { + "epoch": 25.23, + "grad_norm": 0.3889290690422058, + "learning_rate": 2.1394782110091743e-05, + "loss": 0.3573, + "step": 170500 + }, + { + "epoch": 25.3, + "grad_norm": 0.425574392080307, + "learning_rate": 2.1365881177863274e-05, + "loss": 0.3583, + "step": 171000 + }, + { + "epoch": 25.38, + "grad_norm": 0.35915040969848633, + "learning_rate": 2.1336980245634802e-05, + "loss": 0.355, + "step": 171500 + }, + { + "epoch": 25.45, + "grad_norm": 0.3714876174926758, + "learning_rate": 2.1308079313406334e-05, + "loss": 0.3558, + "step": 172000 + }, + { + "epoch": 25.53, + "grad_norm": 0.3659971356391907, + "learning_rate": 2.1279178381177865e-05, + "loss": 0.3526, + "step": 172500 + }, + { + "epoch": 25.6, + "grad_norm": 0.35083669424057007, + "learning_rate": 2.1250277448949393e-05, + "loss": 0.3582, + "step": 173000 + }, + { + "epoch": 25.67, + "grad_norm": 0.3540023863315582, + "learning_rate": 2.1221376516720924e-05, + "loss": 0.3555, + "step": 173500 + }, + { + "epoch": 25.75, + "grad_norm": 0.3811222016811371, + "learning_rate": 2.1192475584492455e-05, + "loss": 0.3557, + "step": 174000 + }, + { + "epoch": 25.82, + "grad_norm": 0.37513551115989685, + "learning_rate": 2.1163574652263983e-05, + "loss": 0.3563, + "step": 174500 + }, + { + "epoch": 25.9, + "grad_norm": 0.4036356508731842, + "learning_rate": 2.1134673720035514e-05, + "loss": 0.3548, + "step": 175000 + }, + { + "epoch": 25.97, + "grad_norm": 0.3446299135684967, + "learning_rate": 2.1105772787807045e-05, + "loss": 0.3573, + "step": 175500 + }, + { + "epoch": 26.04, + "grad_norm": 0.4351588487625122, + "learning_rate": 2.1076871855578573e-05, + "loss": 0.3556, + "step": 176000 + }, + { + "epoch": 26.12, + "grad_norm": 0.38238152861595154, + "learning_rate": 2.1047970923350105e-05, + "loss": 0.3566, + "step": 176500 + }, + { + "epoch": 26.19, + "grad_norm": 0.3972441554069519, + "learning_rate": 2.1019069991121636e-05, + "loss": 0.3533, + "step": 177000 + }, + { + "epoch": 26.27, + "grad_norm": 0.40132614970207214, + "learning_rate": 2.0990169058893164e-05, + "loss": 0.3548, + "step": 177500 + }, + { + "epoch": 26.34, + "grad_norm": 0.3178902864456177, + "learning_rate": 2.0961268126664695e-05, + "loss": 0.355, + "step": 178000 + }, + { + "epoch": 26.41, + "grad_norm": 0.4328124225139618, + "learning_rate": 2.0932367194436226e-05, + "loss": 0.3554, + "step": 178500 + }, + { + "epoch": 26.49, + "grad_norm": 0.3971725404262543, + "learning_rate": 2.0903466262207754e-05, + "loss": 0.3549, + "step": 179000 + }, + { + "epoch": 26.56, + "grad_norm": 0.3241216540336609, + "learning_rate": 2.0874565329979285e-05, + "loss": 0.3534, + "step": 179500 + }, + { + "epoch": 26.64, + "grad_norm": 0.3448522984981537, + "learning_rate": 2.0845664397750816e-05, + "loss": 0.3559, + "step": 180000 + }, + { + "epoch": 26.71, + "grad_norm": 0.34117060899734497, + "learning_rate": 2.0816763465522344e-05, + "loss": 0.3555, + "step": 180500 + }, + { + "epoch": 26.78, + "grad_norm": 0.39051172137260437, + "learning_rate": 2.0787862533293876e-05, + "loss": 0.3558, + "step": 181000 + }, + { + "epoch": 26.86, + "grad_norm": 0.3349858820438385, + "learning_rate": 2.0758961601065407e-05, + "loss": 0.3546, + "step": 181500 + }, + { + "epoch": 26.93, + "grad_norm": 0.4579429030418396, + "learning_rate": 2.0730060668836935e-05, + "loss": 0.3537, + "step": 182000 + }, + { + "epoch": 27.01, + "grad_norm": 0.3789091110229492, + "learning_rate": 2.0701159736608466e-05, + "loss": 0.3527, + "step": 182500 + }, + { + "epoch": 27.08, + "grad_norm": 0.43690434098243713, + "learning_rate": 2.0672258804379997e-05, + "loss": 0.3563, + "step": 183000 + }, + { + "epoch": 27.15, + "grad_norm": 0.3886288106441498, + "learning_rate": 2.0643357872151525e-05, + "loss": 0.3543, + "step": 183500 + }, + { + "epoch": 27.23, + "grad_norm": 0.40548428893089294, + "learning_rate": 2.0614456939923056e-05, + "loss": 0.353, + "step": 184000 + }, + { + "epoch": 27.3, + "grad_norm": 0.4054431915283203, + "learning_rate": 2.0585556007694584e-05, + "loss": 0.3575, + "step": 184500 + }, + { + "epoch": 27.37, + "grad_norm": 0.3319009840488434, + "learning_rate": 2.0556655075466112e-05, + "loss": 0.3524, + "step": 185000 + }, + { + "epoch": 27.45, + "grad_norm": 0.36432087421417236, + "learning_rate": 2.0527754143237643e-05, + "loss": 0.3556, + "step": 185500 + }, + { + "epoch": 27.52, + "grad_norm": 0.3561677038669586, + "learning_rate": 2.0498853211009174e-05, + "loss": 0.3539, + "step": 186000 + }, + { + "epoch": 27.6, + "grad_norm": 0.41498541831970215, + "learning_rate": 2.0469952278780702e-05, + "loss": 0.3561, + "step": 186500 + }, + { + "epoch": 27.67, + "grad_norm": 0.3646217882633209, + "learning_rate": 2.0441051346552233e-05, + "loss": 0.3519, + "step": 187000 + }, + { + "epoch": 27.74, + "grad_norm": 0.34534063935279846, + "learning_rate": 2.0412150414323765e-05, + "loss": 0.3539, + "step": 187500 + }, + { + "epoch": 27.82, + "grad_norm": 0.4323655962944031, + "learning_rate": 2.0383249482095293e-05, + "loss": 0.3559, + "step": 188000 + }, + { + "epoch": 27.89, + "grad_norm": 0.3833807408809662, + "learning_rate": 2.0354348549866824e-05, + "loss": 0.3532, + "step": 188500 + }, + { + "epoch": 27.97, + "grad_norm": 0.37557268142700195, + "learning_rate": 2.0325447617638355e-05, + "loss": 0.3523, + "step": 189000 + }, + { + "epoch": 28.04, + "grad_norm": 0.37144702672958374, + "learning_rate": 2.0296546685409883e-05, + "loss": 0.3536, + "step": 189500 + }, + { + "epoch": 28.11, + "grad_norm": 0.40455296635627747, + "learning_rate": 2.0267645753181414e-05, + "loss": 0.3534, + "step": 190000 + }, + { + "epoch": 28.19, + "grad_norm": 0.3639744818210602, + "learning_rate": 2.0238744820952945e-05, + "loss": 0.3536, + "step": 190500 + }, + { + "epoch": 28.26, + "grad_norm": 0.38016533851623535, + "learning_rate": 2.0209843888724473e-05, + "loss": 0.3569, + "step": 191000 + }, + { + "epoch": 28.34, + "grad_norm": 0.35611262917518616, + "learning_rate": 2.0180942956496004e-05, + "loss": 0.3539, + "step": 191500 + }, + { + "epoch": 28.41, + "grad_norm": 0.3586650490760803, + "learning_rate": 2.0152042024267536e-05, + "loss": 0.3516, + "step": 192000 + }, + { + "epoch": 28.48, + "grad_norm": 0.3105120062828064, + "learning_rate": 2.0123141092039064e-05, + "loss": 0.3518, + "step": 192500 + }, + { + "epoch": 28.56, + "grad_norm": 0.37972891330718994, + "learning_rate": 2.0094240159810595e-05, + "loss": 0.3558, + "step": 193000 + }, + { + "epoch": 28.63, + "grad_norm": 0.35530367493629456, + "learning_rate": 2.0065339227582126e-05, + "loss": 0.3505, + "step": 193500 + }, + { + "epoch": 28.71, + "grad_norm": 0.42136579751968384, + "learning_rate": 2.0036438295353654e-05, + "loss": 0.3537, + "step": 194000 + }, + { + "epoch": 28.78, + "grad_norm": 0.37874168157577515, + "learning_rate": 2.0007537363125185e-05, + "loss": 0.3505, + "step": 194500 + }, + { + "epoch": 28.85, + "grad_norm": 0.33442074060440063, + "learning_rate": 1.9978636430896716e-05, + "loss": 0.3536, + "step": 195000 + }, + { + "epoch": 28.93, + "grad_norm": 0.37098708748817444, + "learning_rate": 1.9949735498668244e-05, + "loss": 0.3553, + "step": 195500 + }, + { + "epoch": 29.0, + "grad_norm": 0.33478862047195435, + "learning_rate": 1.9920834566439775e-05, + "loss": 0.3527, + "step": 196000 + }, + { + "epoch": 29.08, + "grad_norm": 0.3783182203769684, + "learning_rate": 1.9891933634211307e-05, + "loss": 0.3523, + "step": 196500 + }, + { + "epoch": 29.15, + "grad_norm": 0.32911786437034607, + "learning_rate": 1.9863032701982835e-05, + "loss": 0.3526, + "step": 197000 + }, + { + "epoch": 29.22, + "grad_norm": 0.33882907032966614, + "learning_rate": 1.9834131769754366e-05, + "loss": 0.3507, + "step": 197500 + }, + { + "epoch": 29.3, + "grad_norm": 0.4318142235279083, + "learning_rate": 1.9805230837525897e-05, + "loss": 0.3504, + "step": 198000 + }, + { + "epoch": 29.37, + "grad_norm": 0.33973386883735657, + "learning_rate": 1.9776329905297425e-05, + "loss": 0.3526, + "step": 198500 + }, + { + "epoch": 29.45, + "grad_norm": 0.3557802736759186, + "learning_rate": 1.9747428973068956e-05, + "loss": 0.3511, + "step": 199000 + }, + { + "epoch": 29.52, + "grad_norm": 0.4430686831474304, + "learning_rate": 1.9718528040840487e-05, + "loss": 0.3503, + "step": 199500 + }, + { + "epoch": 29.59, + "grad_norm": 0.33132269978523254, + "learning_rate": 1.9689627108612015e-05, + "loss": 0.3531, + "step": 200000 + }, + { + "epoch": 29.67, + "grad_norm": 0.362075537443161, + "learning_rate": 1.9660726176383546e-05, + "loss": 0.3517, + "step": 200500 + }, + { + "epoch": 29.74, + "grad_norm": 0.3604036867618561, + "learning_rate": 1.9631825244155078e-05, + "loss": 0.3558, + "step": 201000 + }, + { + "epoch": 29.82, + "grad_norm": 0.39711400866508484, + "learning_rate": 1.960292431192661e-05, + "loss": 0.3515, + "step": 201500 + }, + { + "epoch": 29.89, + "grad_norm": 0.30394095182418823, + "learning_rate": 1.9574023379698137e-05, + "loss": 0.352, + "step": 202000 + }, + { + "epoch": 29.96, + "grad_norm": 0.3903627097606659, + "learning_rate": 1.9545122447469668e-05, + "loss": 0.3496, + "step": 202500 + }, + { + "epoch": 30.04, + "grad_norm": 0.3124367296695709, + "learning_rate": 1.95162215152412e-05, + "loss": 0.3517, + "step": 203000 + }, + { + "epoch": 30.11, + "grad_norm": 0.4038899540901184, + "learning_rate": 1.9487320583012727e-05, + "loss": 0.3518, + "step": 203500 + }, + { + "epoch": 30.19, + "grad_norm": 0.32454368472099304, + "learning_rate": 1.9458419650784255e-05, + "loss": 0.3517, + "step": 204000 + }, + { + "epoch": 30.26, + "grad_norm": 0.35400575399398804, + "learning_rate": 1.9429518718555786e-05, + "loss": 0.3479, + "step": 204500 + }, + { + "epoch": 30.33, + "grad_norm": 0.424834281206131, + "learning_rate": 1.9400617786327314e-05, + "loss": 0.3502, + "step": 205000 + }, + { + "epoch": 30.41, + "grad_norm": 0.4748223125934601, + "learning_rate": 1.9371716854098845e-05, + "loss": 0.349, + "step": 205500 + }, + { + "epoch": 30.48, + "grad_norm": 0.3032289445400238, + "learning_rate": 1.9342815921870376e-05, + "loss": 0.3543, + "step": 206000 + }, + { + "epoch": 30.56, + "grad_norm": 0.4162702262401581, + "learning_rate": 1.9313914989641904e-05, + "loss": 0.3518, + "step": 206500 + }, + { + "epoch": 30.63, + "grad_norm": 0.3512803912162781, + "learning_rate": 1.9285014057413436e-05, + "loss": 0.3505, + "step": 207000 + }, + { + "epoch": 30.7, + "grad_norm": 0.3622516989707947, + "learning_rate": 1.9256113125184967e-05, + "loss": 0.3538, + "step": 207500 + }, + { + "epoch": 30.78, + "grad_norm": 0.3330516517162323, + "learning_rate": 1.9227212192956495e-05, + "loss": 0.3489, + "step": 208000 + }, + { + "epoch": 30.85, + "grad_norm": 0.3457803726196289, + "learning_rate": 1.9198311260728026e-05, + "loss": 0.352, + "step": 208500 + }, + { + "epoch": 30.93, + "grad_norm": 0.3154030442237854, + "learning_rate": 1.9169410328499557e-05, + "loss": 0.3491, + "step": 209000 + }, + { + "epoch": 31.0, + "grad_norm": 0.46131783723831177, + "learning_rate": 1.9140509396271085e-05, + "loss": 0.3509, + "step": 209500 + }, + { + "epoch": 31.07, + "grad_norm": 0.400088369846344, + "learning_rate": 1.9111608464042616e-05, + "loss": 0.3509, + "step": 210000 + }, + { + "epoch": 31.15, + "grad_norm": 0.3647451400756836, + "learning_rate": 1.9082707531814147e-05, + "loss": 0.35, + "step": 210500 + }, + { + "epoch": 31.22, + "grad_norm": 0.4007732570171356, + "learning_rate": 1.9053806599585675e-05, + "loss": 0.3504, + "step": 211000 + }, + { + "epoch": 31.3, + "grad_norm": 0.3861900269985199, + "learning_rate": 1.9024905667357207e-05, + "loss": 0.3484, + "step": 211500 + }, + { + "epoch": 31.37, + "grad_norm": 0.411627858877182, + "learning_rate": 1.8996004735128738e-05, + "loss": 0.3507, + "step": 212000 + }, + { + "epoch": 31.44, + "grad_norm": 0.35766077041625977, + "learning_rate": 1.8967103802900266e-05, + "loss": 0.3485, + "step": 212500 + }, + { + "epoch": 31.52, + "grad_norm": 0.3537013530731201, + "learning_rate": 1.8938202870671797e-05, + "loss": 0.3517, + "step": 213000 + }, + { + "epoch": 31.59, + "grad_norm": 0.3919309675693512, + "learning_rate": 1.8909301938443328e-05, + "loss": 0.3489, + "step": 213500 + }, + { + "epoch": 31.67, + "grad_norm": 0.3441930115222931, + "learning_rate": 1.8880401006214856e-05, + "loss": 0.351, + "step": 214000 + }, + { + "epoch": 31.74, + "grad_norm": 0.38138172030448914, + "learning_rate": 1.8851500073986387e-05, + "loss": 0.3495, + "step": 214500 + }, + { + "epoch": 31.81, + "grad_norm": 0.4080500304698944, + "learning_rate": 1.882259914175792e-05, + "loss": 0.3497, + "step": 215000 + }, + { + "epoch": 31.89, + "grad_norm": 0.3864932358264923, + "learning_rate": 1.8793698209529446e-05, + "loss": 0.3525, + "step": 215500 + }, + { + "epoch": 31.96, + "grad_norm": 0.4017949104309082, + "learning_rate": 1.8764797277300978e-05, + "loss": 0.3528, + "step": 216000 + }, + { + "epoch": 32.04, + "grad_norm": 0.3484615087509155, + "learning_rate": 1.873589634507251e-05, + "loss": 0.3505, + "step": 216500 + }, + { + "epoch": 32.11, + "grad_norm": 0.34500235319137573, + "learning_rate": 1.8706995412844037e-05, + "loss": 0.3498, + "step": 217000 + }, + { + "epoch": 32.18, + "grad_norm": 0.32486996054649353, + "learning_rate": 1.8678094480615568e-05, + "loss": 0.3501, + "step": 217500 + }, + { + "epoch": 32.26, + "grad_norm": 0.3440997302532196, + "learning_rate": 1.86491935483871e-05, + "loss": 0.3504, + "step": 218000 + }, + { + "epoch": 32.33, + "grad_norm": 0.359846293926239, + "learning_rate": 1.8620292616158627e-05, + "loss": 0.3491, + "step": 218500 + }, + { + "epoch": 32.41, + "grad_norm": 0.36168062686920166, + "learning_rate": 1.8591391683930158e-05, + "loss": 0.3489, + "step": 219000 + }, + { + "epoch": 32.48, + "grad_norm": 0.43606841564178467, + "learning_rate": 1.856249075170169e-05, + "loss": 0.3497, + "step": 219500 + }, + { + "epoch": 32.55, + "grad_norm": 0.3898315727710724, + "learning_rate": 1.8533589819473217e-05, + "loss": 0.3487, + "step": 220000 + }, + { + "epoch": 32.63, + "grad_norm": 0.381244421005249, + "learning_rate": 1.850468888724475e-05, + "loss": 0.3475, + "step": 220500 + }, + { + "epoch": 32.7, + "grad_norm": 0.41321712732315063, + "learning_rate": 1.847578795501628e-05, + "loss": 0.3484, + "step": 221000 + }, + { + "epoch": 32.78, + "grad_norm": 0.3538101017475128, + "learning_rate": 1.8446887022787808e-05, + "loss": 0.3484, + "step": 221500 + }, + { + "epoch": 32.85, + "grad_norm": 0.38104715943336487, + "learning_rate": 1.841798609055934e-05, + "loss": 0.3482, + "step": 222000 + }, + { + "epoch": 32.92, + "grad_norm": 0.37761756777763367, + "learning_rate": 1.838908515833087e-05, + "loss": 0.3474, + "step": 222500 + }, + { + "epoch": 33.0, + "grad_norm": 0.3524073362350464, + "learning_rate": 1.8360184226102398e-05, + "loss": 0.3518, + "step": 223000 + }, + { + "epoch": 33.07, + "grad_norm": 0.3452695608139038, + "learning_rate": 1.8331283293873926e-05, + "loss": 0.3509, + "step": 223500 + }, + { + "epoch": 33.15, + "grad_norm": 0.4063817262649536, + "learning_rate": 1.8302382361645457e-05, + "loss": 0.3496, + "step": 224000 + }, + { + "epoch": 33.22, + "grad_norm": 0.41099056601524353, + "learning_rate": 1.8273481429416985e-05, + "loss": 0.3489, + "step": 224500 + }, + { + "epoch": 33.29, + "grad_norm": 0.3691389560699463, + "learning_rate": 1.8244580497188516e-05, + "loss": 0.3507, + "step": 225000 + }, + { + "epoch": 33.37, + "grad_norm": 0.36765575408935547, + "learning_rate": 1.8215679564960047e-05, + "loss": 0.3464, + "step": 225500 + }, + { + "epoch": 33.44, + "grad_norm": 0.39067772030830383, + "learning_rate": 1.8186778632731575e-05, + "loss": 0.3479, + "step": 226000 + }, + { + "epoch": 33.52, + "grad_norm": 0.3263433873653412, + "learning_rate": 1.8157877700503106e-05, + "loss": 0.3457, + "step": 226500 + }, + { + "epoch": 33.59, + "grad_norm": 0.45672136545181274, + "learning_rate": 1.8128976768274638e-05, + "loss": 0.3488, + "step": 227000 + }, + { + "epoch": 33.66, + "grad_norm": 0.42077481746673584, + "learning_rate": 1.8100075836046166e-05, + "loss": 0.3447, + "step": 227500 + }, + { + "epoch": 33.74, + "grad_norm": 0.3391963243484497, + "learning_rate": 1.8071174903817697e-05, + "loss": 0.3471, + "step": 228000 + }, + { + "epoch": 33.81, + "grad_norm": 0.3776566684246063, + "learning_rate": 1.8042273971589228e-05, + "loss": 0.3498, + "step": 228500 + }, + { + "epoch": 33.89, + "grad_norm": 0.3776949942111969, + "learning_rate": 1.8013373039360756e-05, + "loss": 0.3463, + "step": 229000 + }, + { + "epoch": 33.96, + "grad_norm": 0.3695808947086334, + "learning_rate": 1.7984472107132287e-05, + "loss": 0.3492, + "step": 229500 + }, + { + "epoch": 34.03, + "grad_norm": 0.36413583159446716, + "learning_rate": 1.795557117490382e-05, + "loss": 0.3472, + "step": 230000 + }, + { + "epoch": 34.11, + "grad_norm": 0.39360344409942627, + "learning_rate": 1.7926670242675346e-05, + "loss": 0.3505, + "step": 230500 + }, + { + "epoch": 34.18, + "grad_norm": 0.3416905999183655, + "learning_rate": 1.7897769310446877e-05, + "loss": 0.3472, + "step": 231000 + }, + { + "epoch": 34.26, + "grad_norm": 0.40496984124183655, + "learning_rate": 1.786886837821841e-05, + "loss": 0.3477, + "step": 231500 + }, + { + "epoch": 34.33, + "grad_norm": 0.3441724479198456, + "learning_rate": 1.7839967445989937e-05, + "loss": 0.3479, + "step": 232000 + }, + { + "epoch": 34.4, + "grad_norm": 0.37928706407546997, + "learning_rate": 1.7811066513761468e-05, + "loss": 0.3486, + "step": 232500 + }, + { + "epoch": 34.48, + "grad_norm": 0.3675363063812256, + "learning_rate": 1.7782165581533e-05, + "loss": 0.3458, + "step": 233000 + }, + { + "epoch": 34.55, + "grad_norm": 0.40871405601501465, + "learning_rate": 1.7753264649304527e-05, + "loss": 0.3493, + "step": 233500 + }, + { + "epoch": 34.63, + "grad_norm": 0.417258620262146, + "learning_rate": 1.7724363717076058e-05, + "loss": 0.3453, + "step": 234000 + }, + { + "epoch": 34.7, + "grad_norm": 0.3210906386375427, + "learning_rate": 1.769546278484759e-05, + "loss": 0.3457, + "step": 234500 + }, + { + "epoch": 34.77, + "grad_norm": 0.3412734270095825, + "learning_rate": 1.7666561852619117e-05, + "loss": 0.3475, + "step": 235000 + }, + { + "epoch": 34.85, + "grad_norm": 0.39695098996162415, + "learning_rate": 1.763766092039065e-05, + "loss": 0.3509, + "step": 235500 + }, + { + "epoch": 34.92, + "grad_norm": 0.36834970116615295, + "learning_rate": 1.760875998816218e-05, + "loss": 0.3485, + "step": 236000 + }, + { + "epoch": 35.0, + "grad_norm": 0.3971041738986969, + "learning_rate": 1.7579859055933708e-05, + "loss": 0.3468, + "step": 236500 + }, + { + "epoch": 35.07, + "grad_norm": 0.3513786196708679, + "learning_rate": 1.755095812370524e-05, + "loss": 0.3478, + "step": 237000 + }, + { + "epoch": 35.14, + "grad_norm": 0.35932984948158264, + "learning_rate": 1.752205719147677e-05, + "loss": 0.348, + "step": 237500 + }, + { + "epoch": 35.22, + "grad_norm": 0.40685245394706726, + "learning_rate": 1.7493156259248298e-05, + "loss": 0.3447, + "step": 238000 + }, + { + "epoch": 35.29, + "grad_norm": 0.37929338216781616, + "learning_rate": 1.746425532701983e-05, + "loss": 0.3465, + "step": 238500 + }, + { + "epoch": 35.37, + "grad_norm": 0.40910473465919495, + "learning_rate": 1.743535439479136e-05, + "loss": 0.3465, + "step": 239000 + }, + { + "epoch": 35.44, + "grad_norm": 0.34920281171798706, + "learning_rate": 1.740645346256289e-05, + "loss": 0.3463, + "step": 239500 + }, + { + "epoch": 35.51, + "grad_norm": 0.37716421484947205, + "learning_rate": 1.737755253033442e-05, + "loss": 0.3487, + "step": 240000 + }, + { + "epoch": 35.59, + "grad_norm": 0.4301624596118927, + "learning_rate": 1.734865159810595e-05, + "loss": 0.3449, + "step": 240500 + }, + { + "epoch": 35.66, + "grad_norm": 0.37206390500068665, + "learning_rate": 1.7319750665877482e-05, + "loss": 0.3453, + "step": 241000 + }, + { + "epoch": 35.74, + "grad_norm": 0.38183438777923584, + "learning_rate": 1.729084973364901e-05, + "loss": 0.3493, + "step": 241500 + }, + { + "epoch": 35.81, + "grad_norm": 0.3732428252696991, + "learning_rate": 1.726194880142054e-05, + "loss": 0.349, + "step": 242000 + }, + { + "epoch": 35.88, + "grad_norm": 0.3665318787097931, + "learning_rate": 1.7233047869192072e-05, + "loss": 0.3482, + "step": 242500 + }, + { + "epoch": 35.96, + "grad_norm": 0.3693431317806244, + "learning_rate": 1.7204146936963597e-05, + "loss": 0.3445, + "step": 243000 + }, + { + "epoch": 36.03, + "grad_norm": 0.3580050766468048, + "learning_rate": 1.7175246004735128e-05, + "loss": 0.3469, + "step": 243500 + }, + { + "epoch": 36.11, + "grad_norm": 0.3874383568763733, + "learning_rate": 1.714634507250666e-05, + "loss": 0.3483, + "step": 244000 + }, + { + "epoch": 36.18, + "grad_norm": 0.39490264654159546, + "learning_rate": 1.7117444140278187e-05, + "loss": 0.3479, + "step": 244500 + }, + { + "epoch": 36.25, + "grad_norm": 0.38647031784057617, + "learning_rate": 1.7088543208049718e-05, + "loss": 0.3458, + "step": 245000 + }, + { + "epoch": 36.33, + "grad_norm": 0.3899756073951721, + "learning_rate": 1.705964227582125e-05, + "loss": 0.3448, + "step": 245500 + }, + { + "epoch": 36.4, + "grad_norm": 0.4022426903247833, + "learning_rate": 1.7030741343592777e-05, + "loss": 0.3463, + "step": 246000 + }, + { + "epoch": 36.48, + "grad_norm": 0.37726154923439026, + "learning_rate": 1.700184041136431e-05, + "loss": 0.3469, + "step": 246500 + }, + { + "epoch": 36.55, + "grad_norm": 0.4022297263145447, + "learning_rate": 1.697293947913584e-05, + "loss": 0.3457, + "step": 247000 + }, + { + "epoch": 36.62, + "grad_norm": 0.43506869673728943, + "learning_rate": 1.6944038546907368e-05, + "loss": 0.3467, + "step": 247500 + }, + { + "epoch": 36.7, + "grad_norm": 0.4157175123691559, + "learning_rate": 1.69151376146789e-05, + "loss": 0.3443, + "step": 248000 + }, + { + "epoch": 36.77, + "grad_norm": 0.4038371443748474, + "learning_rate": 1.688623668245043e-05, + "loss": 0.3408, + "step": 248500 + }, + { + "epoch": 36.85, + "grad_norm": 0.3598155081272125, + "learning_rate": 1.6857335750221958e-05, + "loss": 0.3455, + "step": 249000 + }, + { + "epoch": 36.92, + "grad_norm": 0.3888005018234253, + "learning_rate": 1.682843481799349e-05, + "loss": 0.3465, + "step": 249500 + }, + { + "epoch": 36.99, + "grad_norm": 0.3933923840522766, + "learning_rate": 1.679953388576502e-05, + "loss": 0.3488, + "step": 250000 + }, + { + "epoch": 37.07, + "grad_norm": 0.36610084772109985, + "learning_rate": 1.677063295353655e-05, + "loss": 0.3448, + "step": 250500 + }, + { + "epoch": 37.14, + "grad_norm": 0.3755366802215576, + "learning_rate": 1.674173202130808e-05, + "loss": 0.3475, + "step": 251000 + }, + { + "epoch": 37.22, + "grad_norm": 0.3687468469142914, + "learning_rate": 1.671283108907961e-05, + "loss": 0.3441, + "step": 251500 + }, + { + "epoch": 37.29, + "grad_norm": 0.3150022327899933, + "learning_rate": 1.668393015685114e-05, + "loss": 0.3439, + "step": 252000 + }, + { + "epoch": 37.36, + "grad_norm": 0.37440210580825806, + "learning_rate": 1.665502922462267e-05, + "loss": 0.3474, + "step": 252500 + }, + { + "epoch": 37.44, + "grad_norm": 0.3700256943702698, + "learning_rate": 1.66261282923942e-05, + "loss": 0.3453, + "step": 253000 + }, + { + "epoch": 37.51, + "grad_norm": 0.40626585483551025, + "learning_rate": 1.659722736016573e-05, + "loss": 0.345, + "step": 253500 + }, + { + "epoch": 37.59, + "grad_norm": 0.39384424686431885, + "learning_rate": 1.656832642793726e-05, + "loss": 0.3465, + "step": 254000 + }, + { + "epoch": 37.66, + "grad_norm": 0.38323110342025757, + "learning_rate": 1.653942549570879e-05, + "loss": 0.3458, + "step": 254500 + }, + { + "epoch": 37.73, + "grad_norm": 0.3620944619178772, + "learning_rate": 1.651052456348032e-05, + "loss": 0.3457, + "step": 255000 + }, + { + "epoch": 37.81, + "grad_norm": 0.3920278251171112, + "learning_rate": 1.648162363125185e-05, + "loss": 0.3433, + "step": 255500 + }, + { + "epoch": 37.88, + "grad_norm": 0.3547744154930115, + "learning_rate": 1.6452722699023382e-05, + "loss": 0.3427, + "step": 256000 + }, + { + "epoch": 37.96, + "grad_norm": 0.3048088252544403, + "learning_rate": 1.642382176679491e-05, + "loss": 0.3462, + "step": 256500 + }, + { + "epoch": 38.03, + "grad_norm": 0.3744346499443054, + "learning_rate": 1.639492083456644e-05, + "loss": 0.3451, + "step": 257000 + }, + { + "epoch": 38.1, + "grad_norm": 0.3640407621860504, + "learning_rate": 1.6366019902337972e-05, + "loss": 0.3485, + "step": 257500 + }, + { + "epoch": 38.18, + "grad_norm": 0.37335264682769775, + "learning_rate": 1.63371189701095e-05, + "loss": 0.3466, + "step": 258000 + }, + { + "epoch": 38.25, + "grad_norm": 0.3745587170124054, + "learning_rate": 1.630821803788103e-05, + "loss": 0.344, + "step": 258500 + }, + { + "epoch": 38.32, + "grad_norm": 0.462406724691391, + "learning_rate": 1.6279317105652562e-05, + "loss": 0.3432, + "step": 259000 + }, + { + "epoch": 38.4, + "grad_norm": 0.3359210193157196, + "learning_rate": 1.625041617342409e-05, + "loss": 0.3469, + "step": 259500 + }, + { + "epoch": 38.47, + "grad_norm": 0.3449317514896393, + "learning_rate": 1.622151524119562e-05, + "loss": 0.3435, + "step": 260000 + }, + { + "epoch": 38.55, + "grad_norm": 0.4265647232532501, + "learning_rate": 1.6192614308967153e-05, + "loss": 0.3458, + "step": 260500 + }, + { + "epoch": 38.62, + "grad_norm": 0.40118905901908875, + "learning_rate": 1.616371337673868e-05, + "loss": 0.3461, + "step": 261000 + }, + { + "epoch": 38.69, + "grad_norm": 0.36499762535095215, + "learning_rate": 1.6134812444510212e-05, + "loss": 0.3457, + "step": 261500 + }, + { + "epoch": 38.77, + "grad_norm": 0.37067875266075134, + "learning_rate": 1.6105911512281743e-05, + "loss": 0.3426, + "step": 262000 + }, + { + "epoch": 38.84, + "grad_norm": 0.402778685092926, + "learning_rate": 1.6077010580053268e-05, + "loss": 0.3432, + "step": 262500 + }, + { + "epoch": 38.92, + "grad_norm": 0.37418636679649353, + "learning_rate": 1.60481096478248e-05, + "loss": 0.347, + "step": 263000 + }, + { + "epoch": 38.99, + "grad_norm": 0.4147396981716156, + "learning_rate": 1.601920871559633e-05, + "loss": 0.3456, + "step": 263500 + }, + { + "epoch": 39.06, + "grad_norm": 0.42823702096939087, + "learning_rate": 1.5990307783367858e-05, + "loss": 0.342, + "step": 264000 + }, + { + "epoch": 39.14, + "grad_norm": 0.40999341011047363, + "learning_rate": 1.596140685113939e-05, + "loss": 0.3413, + "step": 264500 + }, + { + "epoch": 39.21, + "grad_norm": 0.32551825046539307, + "learning_rate": 1.593250591891092e-05, + "loss": 0.3433, + "step": 265000 + }, + { + "epoch": 39.29, + "grad_norm": 0.3688596487045288, + "learning_rate": 1.5903604986682448e-05, + "loss": 0.3457, + "step": 265500 + }, + { + "epoch": 39.36, + "grad_norm": 0.39799386262893677, + "learning_rate": 1.587470405445398e-05, + "loss": 0.343, + "step": 266000 + }, + { + "epoch": 39.43, + "grad_norm": 0.34967321157455444, + "learning_rate": 1.584580312222551e-05, + "loss": 0.3418, + "step": 266500 + }, + { + "epoch": 39.51, + "grad_norm": 0.36091017723083496, + "learning_rate": 1.581690218999704e-05, + "loss": 0.3435, + "step": 267000 + }, + { + "epoch": 39.58, + "grad_norm": 0.3361178934574127, + "learning_rate": 1.578800125776857e-05, + "loss": 0.3471, + "step": 267500 + }, + { + "epoch": 39.66, + "grad_norm": 0.36311858892440796, + "learning_rate": 1.57591003255401e-05, + "loss": 0.3442, + "step": 268000 + }, + { + "epoch": 39.73, + "grad_norm": 0.37522685527801514, + "learning_rate": 1.573019939331163e-05, + "loss": 0.3432, + "step": 268500 + }, + { + "epoch": 39.8, + "grad_norm": 0.42775389552116394, + "learning_rate": 1.570129846108316e-05, + "loss": 0.344, + "step": 269000 + }, + { + "epoch": 39.88, + "grad_norm": 0.40960633754730225, + "learning_rate": 1.567239752885469e-05, + "loss": 0.3428, + "step": 269500 + }, + { + "epoch": 39.95, + "grad_norm": 0.35652443766593933, + "learning_rate": 1.564349659662622e-05, + "loss": 0.3435, + "step": 270000 + }, + { + "epoch": 40.03, + "grad_norm": 0.41139841079711914, + "learning_rate": 1.561459566439775e-05, + "loss": 0.3436, + "step": 270500 + }, + { + "epoch": 40.1, + "grad_norm": 0.3651288151741028, + "learning_rate": 1.5585694732169282e-05, + "loss": 0.3447, + "step": 271000 + }, + { + "epoch": 40.17, + "grad_norm": 0.37484046816825867, + "learning_rate": 1.555679379994081e-05, + "loss": 0.3456, + "step": 271500 + }, + { + "epoch": 40.25, + "grad_norm": 0.3306860625743866, + "learning_rate": 1.552789286771234e-05, + "loss": 0.3428, + "step": 272000 + }, + { + "epoch": 40.32, + "grad_norm": 0.37026843428611755, + "learning_rate": 1.5498991935483872e-05, + "loss": 0.3433, + "step": 272500 + }, + { + "epoch": 40.4, + "grad_norm": 0.3959224224090576, + "learning_rate": 1.54700910032554e-05, + "loss": 0.3453, + "step": 273000 + }, + { + "epoch": 40.47, + "grad_norm": 0.3823704719543457, + "learning_rate": 1.544119007102693e-05, + "loss": 0.3413, + "step": 273500 + }, + { + "epoch": 40.54, + "grad_norm": 0.33115535974502563, + "learning_rate": 1.5412289138798462e-05, + "loss": 0.3442, + "step": 274000 + }, + { + "epoch": 40.62, + "grad_norm": 0.5036399364471436, + "learning_rate": 1.538338820656999e-05, + "loss": 0.3417, + "step": 274500 + }, + { + "epoch": 40.69, + "grad_norm": 0.3805595934391022, + "learning_rate": 1.535448727434152e-05, + "loss": 0.3453, + "step": 275000 + }, + { + "epoch": 40.77, + "grad_norm": 0.4390459656715393, + "learning_rate": 1.5325586342113053e-05, + "loss": 0.343, + "step": 275500 + }, + { + "epoch": 40.84, + "grad_norm": 0.3673398792743683, + "learning_rate": 1.5296685409884584e-05, + "loss": 0.3402, + "step": 276000 + }, + { + "epoch": 40.91, + "grad_norm": 0.36677980422973633, + "learning_rate": 1.5267784477656112e-05, + "loss": 0.3418, + "step": 276500 + }, + { + "epoch": 40.99, + "grad_norm": 0.37628763914108276, + "learning_rate": 1.5238883545427641e-05, + "loss": 0.3423, + "step": 277000 + }, + { + "epoch": 41.06, + "grad_norm": 0.3959880769252777, + "learning_rate": 1.5209982613199171e-05, + "loss": 0.3413, + "step": 277500 + }, + { + "epoch": 41.14, + "grad_norm": 0.35615137219429016, + "learning_rate": 1.51810816809707e-05, + "loss": 0.3438, + "step": 278000 + }, + { + "epoch": 41.21, + "grad_norm": 0.4133353531360626, + "learning_rate": 1.5152180748742232e-05, + "loss": 0.3429, + "step": 278500 + }, + { + "epoch": 41.28, + "grad_norm": 0.35143953561782837, + "learning_rate": 1.5123279816513761e-05, + "loss": 0.3437, + "step": 279000 + }, + { + "epoch": 41.36, + "grad_norm": 0.37390509247779846, + "learning_rate": 1.509437888428529e-05, + "loss": 0.341, + "step": 279500 + }, + { + "epoch": 41.43, + "grad_norm": 0.39959460496902466, + "learning_rate": 1.5065477952056822e-05, + "loss": 0.3423, + "step": 280000 + }, + { + "epoch": 41.51, + "grad_norm": 0.3992210030555725, + "learning_rate": 1.5036577019828352e-05, + "loss": 0.3462, + "step": 280500 + }, + { + "epoch": 41.58, + "grad_norm": 0.3677886724472046, + "learning_rate": 1.5007676087599881e-05, + "loss": 0.3427, + "step": 281000 + }, + { + "epoch": 41.65, + "grad_norm": 0.3906817138195038, + "learning_rate": 1.4978775155371412e-05, + "loss": 0.3415, + "step": 281500 + }, + { + "epoch": 41.73, + "grad_norm": 0.39412081241607666, + "learning_rate": 1.4949874223142942e-05, + "loss": 0.3434, + "step": 282000 + }, + { + "epoch": 41.8, + "grad_norm": 0.4120485782623291, + "learning_rate": 1.4920973290914471e-05, + "loss": 0.344, + "step": 282500 + }, + { + "epoch": 41.88, + "grad_norm": 0.4464050829410553, + "learning_rate": 1.4892072358686003e-05, + "loss": 0.34, + "step": 283000 + }, + { + "epoch": 41.95, + "grad_norm": 0.3044414520263672, + "learning_rate": 1.4863171426457532e-05, + "loss": 0.3417, + "step": 283500 + }, + { + "epoch": 42.02, + "grad_norm": 0.32554033398628235, + "learning_rate": 1.4834270494229062e-05, + "loss": 0.3417, + "step": 284000 + }, + { + "epoch": 42.1, + "grad_norm": 0.3944820463657379, + "learning_rate": 1.4805369562000593e-05, + "loss": 0.3436, + "step": 284500 + }, + { + "epoch": 42.17, + "grad_norm": 0.37802961468696594, + "learning_rate": 1.4776468629772123e-05, + "loss": 0.3439, + "step": 285000 + }, + { + "epoch": 42.25, + "grad_norm": 0.4089604616165161, + "learning_rate": 1.4747567697543654e-05, + "loss": 0.3408, + "step": 285500 + }, + { + "epoch": 42.32, + "grad_norm": 0.37038755416870117, + "learning_rate": 1.4718666765315183e-05, + "loss": 0.3426, + "step": 286000 + }, + { + "epoch": 42.39, + "grad_norm": 0.35514524579048157, + "learning_rate": 1.4689765833086711e-05, + "loss": 0.3451, + "step": 286500 + }, + { + "epoch": 42.47, + "grad_norm": 0.4710882902145386, + "learning_rate": 1.4660864900858242e-05, + "loss": 0.3429, + "step": 287000 + }, + { + "epoch": 42.54, + "grad_norm": 0.34002232551574707, + "learning_rate": 1.4631963968629772e-05, + "loss": 0.3418, + "step": 287500 + }, + { + "epoch": 42.62, + "grad_norm": 0.4424833059310913, + "learning_rate": 1.4603063036401302e-05, + "loss": 0.3425, + "step": 288000 + }, + { + "epoch": 42.69, + "grad_norm": 0.4154003858566284, + "learning_rate": 1.4574162104172833e-05, + "loss": 0.3428, + "step": 288500 + }, + { + "epoch": 42.76, + "grad_norm": 0.40983182191848755, + "learning_rate": 1.4545261171944362e-05, + "loss": 0.343, + "step": 289000 + }, + { + "epoch": 42.84, + "grad_norm": 0.3568328320980072, + "learning_rate": 1.4516360239715892e-05, + "loss": 0.3387, + "step": 289500 + }, + { + "epoch": 42.91, + "grad_norm": 0.3948083221912384, + "learning_rate": 1.4487459307487423e-05, + "loss": 0.3388, + "step": 290000 + }, + { + "epoch": 42.99, + "grad_norm": 0.3891525864601135, + "learning_rate": 1.4458558375258953e-05, + "loss": 0.3415, + "step": 290500 + }, + { + "epoch": 43.06, + "grad_norm": 0.3312503695487976, + "learning_rate": 1.4429657443030482e-05, + "loss": 0.3407, + "step": 291000 + }, + { + "epoch": 43.13, + "grad_norm": 0.37773850560188293, + "learning_rate": 1.4400756510802013e-05, + "loss": 0.3411, + "step": 291500 + }, + { + "epoch": 43.21, + "grad_norm": 0.29978179931640625, + "learning_rate": 1.4371855578573543e-05, + "loss": 0.3423, + "step": 292000 + }, + { + "epoch": 43.28, + "grad_norm": 0.4314406216144562, + "learning_rate": 1.4342954646345072e-05, + "loss": 0.3373, + "step": 292500 + }, + { + "epoch": 43.36, + "grad_norm": 0.3975353538990021, + "learning_rate": 1.4314053714116604e-05, + "loss": 0.3405, + "step": 293000 + }, + { + "epoch": 43.43, + "grad_norm": 0.35734960436820984, + "learning_rate": 1.4285152781888133e-05, + "loss": 0.3416, + "step": 293500 + }, + { + "epoch": 43.5, + "grad_norm": 0.44908031821250916, + "learning_rate": 1.4256251849659663e-05, + "loss": 0.3392, + "step": 294000 + }, + { + "epoch": 43.58, + "grad_norm": 0.3516298532485962, + "learning_rate": 1.4227350917431194e-05, + "loss": 0.3385, + "step": 294500 + }, + { + "epoch": 43.65, + "grad_norm": 0.3821066915988922, + "learning_rate": 1.4198449985202724e-05, + "loss": 0.3448, + "step": 295000 + }, + { + "epoch": 43.73, + "grad_norm": 0.3824633061885834, + "learning_rate": 1.4169549052974253e-05, + "loss": 0.3415, + "step": 295500 + }, + { + "epoch": 43.8, + "grad_norm": 0.3336328864097595, + "learning_rate": 1.4140648120745784e-05, + "loss": 0.3403, + "step": 296000 + }, + { + "epoch": 43.87, + "grad_norm": 0.41100433468818665, + "learning_rate": 1.4111747188517312e-05, + "loss": 0.3428, + "step": 296500 + }, + { + "epoch": 43.95, + "grad_norm": 0.38574787974357605, + "learning_rate": 1.4082846256288842e-05, + "loss": 0.3388, + "step": 297000 + }, + { + "epoch": 44.02, + "grad_norm": 0.3482591509819031, + "learning_rate": 1.4053945324060373e-05, + "loss": 0.3392, + "step": 297500 + }, + { + "epoch": 44.1, + "grad_norm": 0.39932170510292053, + "learning_rate": 1.4025044391831903e-05, + "loss": 0.3385, + "step": 298000 + }, + { + "epoch": 44.17, + "grad_norm": 0.3750057518482208, + "learning_rate": 1.3996143459603432e-05, + "loss": 0.3419, + "step": 298500 + }, + { + "epoch": 44.24, + "grad_norm": 0.3343985974788666, + "learning_rate": 1.3967242527374963e-05, + "loss": 0.3398, + "step": 299000 + }, + { + "epoch": 44.32, + "grad_norm": 0.32805758714675903, + "learning_rate": 1.3938341595146493e-05, + "loss": 0.339, + "step": 299500 + }, + { + "epoch": 44.39, + "grad_norm": 0.376280814409256, + "learning_rate": 1.3909440662918022e-05, + "loss": 0.3407, + "step": 300000 + }, + { + "epoch": 44.47, + "grad_norm": 0.27181968092918396, + "learning_rate": 1.3880539730689554e-05, + "loss": 0.3392, + "step": 300500 + }, + { + "epoch": 44.54, + "grad_norm": 0.3351672887802124, + "learning_rate": 1.3851638798461083e-05, + "loss": 0.3416, + "step": 301000 + }, + { + "epoch": 44.61, + "grad_norm": 0.3780210614204407, + "learning_rate": 1.3822737866232613e-05, + "loss": 0.3429, + "step": 301500 + }, + { + "epoch": 44.69, + "grad_norm": 0.3951726257801056, + "learning_rate": 1.3793836934004144e-05, + "loss": 0.3407, + "step": 302000 + }, + { + "epoch": 44.76, + "grad_norm": 0.3675825893878937, + "learning_rate": 1.3764936001775674e-05, + "loss": 0.339, + "step": 302500 + }, + { + "epoch": 44.84, + "grad_norm": 0.3629719913005829, + "learning_rate": 1.3736035069547205e-05, + "loss": 0.3431, + "step": 303000 + }, + { + "epoch": 44.91, + "grad_norm": 0.4170212745666504, + "learning_rate": 1.3707134137318734e-05, + "loss": 0.3394, + "step": 303500 + }, + { + "epoch": 44.98, + "grad_norm": 0.3839627206325531, + "learning_rate": 1.3678233205090264e-05, + "loss": 0.3422, + "step": 304000 + }, + { + "epoch": 45.06, + "grad_norm": 0.4395899176597595, + "learning_rate": 1.3649332272861795e-05, + "loss": 0.3402, + "step": 304500 + }, + { + "epoch": 45.13, + "grad_norm": 0.37044864892959595, + "learning_rate": 1.3620431340633325e-05, + "loss": 0.342, + "step": 305000 + }, + { + "epoch": 45.21, + "grad_norm": 0.4869653582572937, + "learning_rate": 1.3591530408404854e-05, + "loss": 0.3391, + "step": 305500 + }, + { + "epoch": 45.28, + "grad_norm": 0.416748046875, + "learning_rate": 1.3562629476176384e-05, + "loss": 0.3394, + "step": 306000 + }, + { + "epoch": 45.35, + "grad_norm": 0.37101954221725464, + "learning_rate": 1.3533728543947913e-05, + "loss": 0.3425, + "step": 306500 + }, + { + "epoch": 45.43, + "grad_norm": 0.3808073103427887, + "learning_rate": 1.3504827611719443e-05, + "loss": 0.3399, + "step": 307000 + }, + { + "epoch": 45.5, + "grad_norm": 0.3832837641239166, + "learning_rate": 1.3475926679490974e-05, + "loss": 0.3413, + "step": 307500 + }, + { + "epoch": 45.58, + "grad_norm": 0.3216901421546936, + "learning_rate": 1.3447025747262504e-05, + "loss": 0.3403, + "step": 308000 + }, + { + "epoch": 45.65, + "grad_norm": 0.36098387837409973, + "learning_rate": 1.3418124815034033e-05, + "loss": 0.3426, + "step": 308500 + }, + { + "epoch": 45.72, + "grad_norm": 0.5177834033966064, + "learning_rate": 1.3389223882805564e-05, + "loss": 0.3411, + "step": 309000 + }, + { + "epoch": 45.8, + "grad_norm": 0.41095811128616333, + "learning_rate": 1.3360322950577094e-05, + "loss": 0.3381, + "step": 309500 + }, + { + "epoch": 45.87, + "grad_norm": 0.38759657740592957, + "learning_rate": 1.3331422018348624e-05, + "loss": 0.3385, + "step": 310000 + }, + { + "epoch": 45.95, + "grad_norm": 0.34995037317276, + "learning_rate": 1.3302521086120155e-05, + "loss": 0.3387, + "step": 310500 + }, + { + "epoch": 46.02, + "grad_norm": 0.40866127610206604, + "learning_rate": 1.3273620153891684e-05, + "loss": 0.3406, + "step": 311000 + }, + { + "epoch": 46.09, + "grad_norm": 0.40558964014053345, + "learning_rate": 1.3244719221663214e-05, + "loss": 0.34, + "step": 311500 + }, + { + "epoch": 46.17, + "grad_norm": 0.3268815875053406, + "learning_rate": 1.3215818289434745e-05, + "loss": 0.3428, + "step": 312000 + }, + { + "epoch": 46.24, + "grad_norm": 0.4113875925540924, + "learning_rate": 1.3186917357206275e-05, + "loss": 0.3397, + "step": 312500 + }, + { + "epoch": 46.32, + "grad_norm": 0.3797847032546997, + "learning_rate": 1.3158016424977804e-05, + "loss": 0.3404, + "step": 313000 + }, + { + "epoch": 46.39, + "grad_norm": 0.3348693251609802, + "learning_rate": 1.3129115492749335e-05, + "loss": 0.34, + "step": 313500 + }, + { + "epoch": 46.46, + "grad_norm": 0.3879573941230774, + "learning_rate": 1.3100214560520865e-05, + "loss": 0.341, + "step": 314000 + }, + { + "epoch": 46.54, + "grad_norm": 0.40568268299102783, + "learning_rate": 1.3071313628292394e-05, + "loss": 0.3387, + "step": 314500 + }, + { + "epoch": 46.61, + "grad_norm": 0.4025332033634186, + "learning_rate": 1.3042412696063926e-05, + "loss": 0.3415, + "step": 315000 + }, + { + "epoch": 46.69, + "grad_norm": 0.38457706570625305, + "learning_rate": 1.3013511763835455e-05, + "loss": 0.3372, + "step": 315500 + }, + { + "epoch": 46.76, + "grad_norm": 0.36499732732772827, + "learning_rate": 1.2984610831606983e-05, + "loss": 0.3402, + "step": 316000 + }, + { + "epoch": 46.83, + "grad_norm": 0.5976847410202026, + "learning_rate": 1.2955709899378514e-05, + "loss": 0.3402, + "step": 316500 + }, + { + "epoch": 46.91, + "grad_norm": 0.38978490233421326, + "learning_rate": 1.2926808967150044e-05, + "loss": 0.3378, + "step": 317000 + }, + { + "epoch": 46.98, + "grad_norm": 0.4110495448112488, + "learning_rate": 1.2897908034921573e-05, + "loss": 0.339, + "step": 317500 + }, + { + "epoch": 47.06, + "grad_norm": 0.37475350499153137, + "learning_rate": 1.2869007102693105e-05, + "loss": 0.3402, + "step": 318000 + }, + { + "epoch": 47.13, + "grad_norm": 0.34179574251174927, + "learning_rate": 1.2840106170464634e-05, + "loss": 0.3397, + "step": 318500 + }, + { + "epoch": 47.2, + "grad_norm": 0.3006201982498169, + "learning_rate": 1.2811205238236164e-05, + "loss": 0.342, + "step": 319000 + }, + { + "epoch": 47.28, + "grad_norm": 0.38113319873809814, + "learning_rate": 1.2782304306007695e-05, + "loss": 0.341, + "step": 319500 + }, + { + "epoch": 47.35, + "grad_norm": 0.48638325929641724, + "learning_rate": 1.2753403373779225e-05, + "loss": 0.3403, + "step": 320000 + }, + { + "epoch": 47.43, + "grad_norm": 0.38058334589004517, + "learning_rate": 1.2724502441550754e-05, + "loss": 0.3401, + "step": 320500 + }, + { + "epoch": 47.5, + "grad_norm": 0.3784169554710388, + "learning_rate": 1.2695601509322285e-05, + "loss": 0.3378, + "step": 321000 + }, + { + "epoch": 47.57, + "grad_norm": 0.3679274916648865, + "learning_rate": 1.2666700577093815e-05, + "loss": 0.3388, + "step": 321500 + }, + { + "epoch": 47.65, + "grad_norm": 0.4005269706249237, + "learning_rate": 1.2637799644865346e-05, + "loss": 0.3382, + "step": 322000 + }, + { + "epoch": 47.72, + "grad_norm": 0.36187744140625, + "learning_rate": 1.2608898712636876e-05, + "loss": 0.3372, + "step": 322500 + }, + { + "epoch": 47.8, + "grad_norm": 0.4188271164894104, + "learning_rate": 1.2579997780408405e-05, + "loss": 0.339, + "step": 323000 + }, + { + "epoch": 47.87, + "grad_norm": 0.40314343571662903, + "learning_rate": 1.2551096848179936e-05, + "loss": 0.3404, + "step": 323500 + }, + { + "epoch": 47.94, + "grad_norm": 0.4233045279979706, + "learning_rate": 1.2522195915951466e-05, + "loss": 0.3398, + "step": 324000 + }, + { + "epoch": 48.02, + "grad_norm": 0.3816595673561096, + "learning_rate": 1.2493294983722996e-05, + "loss": 0.3382, + "step": 324500 + }, + { + "epoch": 48.09, + "grad_norm": 0.4000893831253052, + "learning_rate": 1.2464394051494527e-05, + "loss": 0.3402, + "step": 325000 + }, + { + "epoch": 48.17, + "grad_norm": 0.4130527377128601, + "learning_rate": 1.2435493119266055e-05, + "loss": 0.3394, + "step": 325500 + }, + { + "epoch": 48.24, + "grad_norm": 0.38113564252853394, + "learning_rate": 1.2406592187037584e-05, + "loss": 0.3365, + "step": 326000 + }, + { + "epoch": 48.31, + "grad_norm": 0.346966415643692, + "learning_rate": 1.2377691254809115e-05, + "loss": 0.3395, + "step": 326500 + }, + { + "epoch": 48.39, + "grad_norm": 0.4276494085788727, + "learning_rate": 1.2348790322580645e-05, + "loss": 0.3362, + "step": 327000 + }, + { + "epoch": 48.46, + "grad_norm": 0.39347225427627563, + "learning_rate": 1.2319889390352175e-05, + "loss": 0.3422, + "step": 327500 + }, + { + "epoch": 48.54, + "grad_norm": 0.3483811616897583, + "learning_rate": 1.2290988458123706e-05, + "loss": 0.3395, + "step": 328000 + }, + { + "epoch": 48.61, + "grad_norm": 0.36153608560562134, + "learning_rate": 1.2262087525895235e-05, + "loss": 0.3365, + "step": 328500 + }, + { + "epoch": 48.68, + "grad_norm": 0.39289888739585876, + "learning_rate": 1.2233186593666765e-05, + "loss": 0.3421, + "step": 329000 + }, + { + "epoch": 48.76, + "grad_norm": 0.4176575839519501, + "learning_rate": 1.2204285661438296e-05, + "loss": 0.3364, + "step": 329500 + }, + { + "epoch": 48.83, + "grad_norm": 0.3840237855911255, + "learning_rate": 1.2175384729209826e-05, + "loss": 0.3357, + "step": 330000 + }, + { + "epoch": 48.91, + "grad_norm": 0.44171571731567383, + "learning_rate": 1.2146483796981355e-05, + "loss": 0.3413, + "step": 330500 + }, + { + "epoch": 48.98, + "grad_norm": 0.42055392265319824, + "learning_rate": 1.2117582864752886e-05, + "loss": 0.335, + "step": 331000 + }, + { + "epoch": 49.05, + "grad_norm": 0.44252675771713257, + "learning_rate": 1.2088681932524416e-05, + "loss": 0.3385, + "step": 331500 + }, + { + "epoch": 49.13, + "grad_norm": 0.378095805644989, + "learning_rate": 1.2059781000295946e-05, + "loss": 0.3413, + "step": 332000 + }, + { + "epoch": 49.2, + "grad_norm": 0.3892216980457306, + "learning_rate": 1.2030880068067477e-05, + "loss": 0.3374, + "step": 332500 + }, + { + "epoch": 49.27, + "grad_norm": 0.3788653612136841, + "learning_rate": 1.2001979135839006e-05, + "loss": 0.3398, + "step": 333000 + }, + { + "epoch": 49.35, + "grad_norm": 0.38030126690864563, + "learning_rate": 1.1973078203610536e-05, + "loss": 0.3375, + "step": 333500 + }, + { + "epoch": 49.42, + "grad_norm": 0.4031144082546234, + "learning_rate": 1.1944177271382067e-05, + "loss": 0.3398, + "step": 334000 + }, + { + "epoch": 49.5, + "grad_norm": 0.3956519663333893, + "learning_rate": 1.1915276339153597e-05, + "loss": 0.3374, + "step": 334500 + }, + { + "epoch": 49.57, + "grad_norm": 0.3961743414402008, + "learning_rate": 1.1886375406925124e-05, + "loss": 0.3349, + "step": 335000 + }, + { + "epoch": 49.64, + "grad_norm": 0.3616986572742462, + "learning_rate": 1.1857474474696656e-05, + "loss": 0.3374, + "step": 335500 + }, + { + "epoch": 49.72, + "grad_norm": 0.36143624782562256, + "learning_rate": 1.1828573542468185e-05, + "loss": 0.3361, + "step": 336000 + }, + { + "epoch": 49.79, + "grad_norm": 0.389981746673584, + "learning_rate": 1.1799672610239715e-05, + "loss": 0.3378, + "step": 336500 + }, + { + "epoch": 49.87, + "grad_norm": 0.4078088104724884, + "learning_rate": 1.1770771678011246e-05, + "loss": 0.3382, + "step": 337000 + }, + { + "epoch": 49.94, + "grad_norm": 0.3802012801170349, + "learning_rate": 1.1741870745782776e-05, + "loss": 0.3356, + "step": 337500 + }, + { + "epoch": 50.01, + "grad_norm": 0.46680396795272827, + "learning_rate": 1.1712969813554305e-05, + "loss": 0.339, + "step": 338000 + }, + { + "epoch": 50.09, + "grad_norm": 0.45273512601852417, + "learning_rate": 1.1684068881325836e-05, + "loss": 0.3381, + "step": 338500 + }, + { + "epoch": 50.16, + "grad_norm": 0.3563522398471832, + "learning_rate": 1.1655167949097366e-05, + "loss": 0.3371, + "step": 339000 + }, + { + "epoch": 50.24, + "grad_norm": 0.43655216693878174, + "learning_rate": 1.1626267016868895e-05, + "loss": 0.3354, + "step": 339500 + }, + { + "epoch": 50.31, + "grad_norm": 0.3371317982673645, + "learning_rate": 1.1597366084640427e-05, + "loss": 0.3374, + "step": 340000 + }, + { + "epoch": 50.38, + "grad_norm": 0.39056339859962463, + "learning_rate": 1.1568465152411956e-05, + "loss": 0.3376, + "step": 340500 + }, + { + "epoch": 50.46, + "grad_norm": 0.40476441383361816, + "learning_rate": 1.1539564220183487e-05, + "loss": 0.3381, + "step": 341000 + }, + { + "epoch": 50.53, + "grad_norm": 0.3706866502761841, + "learning_rate": 1.1510663287955017e-05, + "loss": 0.3355, + "step": 341500 + }, + { + "epoch": 50.61, + "grad_norm": 0.43677544593811035, + "learning_rate": 1.1481762355726547e-05, + "loss": 0.3352, + "step": 342000 + }, + { + "epoch": 50.68, + "grad_norm": 0.3938286602497101, + "learning_rate": 1.1452861423498078e-05, + "loss": 0.3375, + "step": 342500 + }, + { + "epoch": 50.75, + "grad_norm": 0.3463038504123688, + "learning_rate": 1.1423960491269607e-05, + "loss": 0.3379, + "step": 343000 + }, + { + "epoch": 50.83, + "grad_norm": 0.3810366988182068, + "learning_rate": 1.1395059559041137e-05, + "loss": 0.3367, + "step": 343500 + }, + { + "epoch": 50.9, + "grad_norm": 0.3845095932483673, + "learning_rate": 1.1366158626812668e-05, + "loss": 0.3366, + "step": 344000 + }, + { + "epoch": 50.98, + "grad_norm": 0.5161909461021423, + "learning_rate": 1.1337257694584198e-05, + "loss": 0.3382, + "step": 344500 + }, + { + "epoch": 51.05, + "grad_norm": 0.4319625794887543, + "learning_rate": 1.1308356762355726e-05, + "loss": 0.3359, + "step": 345000 + }, + { + "epoch": 51.12, + "grad_norm": 0.34908732771873474, + "learning_rate": 1.1279455830127257e-05, + "loss": 0.3353, + "step": 345500 + }, + { + "epoch": 51.2, + "grad_norm": 0.38367515802383423, + "learning_rate": 1.1250554897898786e-05, + "loss": 0.3367, + "step": 346000 + }, + { + "epoch": 51.27, + "grad_norm": 0.3939116597175598, + "learning_rate": 1.1221653965670316e-05, + "loss": 0.3374, + "step": 346500 + }, + { + "epoch": 51.35, + "grad_norm": 0.44843488931655884, + "learning_rate": 1.1192753033441847e-05, + "loss": 0.3376, + "step": 347000 + }, + { + "epoch": 51.42, + "grad_norm": 0.4169764816761017, + "learning_rate": 1.1163852101213377e-05, + "loss": 0.3385, + "step": 347500 + }, + { + "epoch": 51.49, + "grad_norm": 0.3487055003643036, + "learning_rate": 1.1134951168984906e-05, + "loss": 0.3372, + "step": 348000 + }, + { + "epoch": 51.57, + "grad_norm": 0.3876706063747406, + "learning_rate": 1.1106050236756437e-05, + "loss": 0.3379, + "step": 348500 + }, + { + "epoch": 51.64, + "grad_norm": 0.33344700932502747, + "learning_rate": 1.1077149304527967e-05, + "loss": 0.3389, + "step": 349000 + }, + { + "epoch": 51.72, + "grad_norm": 0.41183948516845703, + "learning_rate": 1.1048248372299497e-05, + "loss": 0.3363, + "step": 349500 + }, + { + "epoch": 51.79, + "grad_norm": 0.3549967110157013, + "learning_rate": 1.1019347440071028e-05, + "loss": 0.3374, + "step": 350000 + }, + { + "epoch": 51.86, + "grad_norm": 0.4144490659236908, + "learning_rate": 1.0990446507842557e-05, + "loss": 0.3347, + "step": 350500 + }, + { + "epoch": 51.94, + "grad_norm": 0.3781343400478363, + "learning_rate": 1.0961545575614087e-05, + "loss": 0.3365, + "step": 351000 + }, + { + "epoch": 52.01, + "grad_norm": 0.4050437808036804, + "learning_rate": 1.0932644643385618e-05, + "loss": 0.3384, + "step": 351500 + }, + { + "epoch": 52.09, + "grad_norm": 0.3758808374404907, + "learning_rate": 1.0903743711157148e-05, + "loss": 0.3382, + "step": 352000 + }, + { + "epoch": 52.16, + "grad_norm": 0.456534206867218, + "learning_rate": 1.0874842778928677e-05, + "loss": 0.3392, + "step": 352500 + }, + { + "epoch": 52.23, + "grad_norm": 0.38857728242874146, + "learning_rate": 1.0845941846700208e-05, + "loss": 0.3374, + "step": 353000 + }, + { + "epoch": 52.31, + "grad_norm": 0.39419788122177124, + "learning_rate": 1.0817040914471738e-05, + "loss": 0.3374, + "step": 353500 + }, + { + "epoch": 52.38, + "grad_norm": 0.41852855682373047, + "learning_rate": 1.0788139982243268e-05, + "loss": 0.3335, + "step": 354000 + }, + { + "epoch": 52.46, + "grad_norm": 0.3561359941959381, + "learning_rate": 1.0759239050014797e-05, + "loss": 0.3359, + "step": 354500 + }, + { + "epoch": 52.53, + "grad_norm": 0.3975025713443756, + "learning_rate": 1.0730338117786327e-05, + "loss": 0.336, + "step": 355000 + }, + { + "epoch": 52.6, + "grad_norm": 0.39150169491767883, + "learning_rate": 1.0701437185557856e-05, + "loss": 0.337, + "step": 355500 + }, + { + "epoch": 52.68, + "grad_norm": 0.404354453086853, + "learning_rate": 1.0672536253329387e-05, + "loss": 0.3378, + "step": 356000 + }, + { + "epoch": 52.75, + "grad_norm": 0.3414269685745239, + "learning_rate": 1.0643635321100917e-05, + "loss": 0.3338, + "step": 356500 + }, + { + "epoch": 52.83, + "grad_norm": 0.4378945827484131, + "learning_rate": 1.0614734388872446e-05, + "loss": 0.3369, + "step": 357000 + }, + { + "epoch": 52.9, + "grad_norm": 0.5136425495147705, + "learning_rate": 1.0585833456643978e-05, + "loss": 0.3348, + "step": 357500 + }, + { + "epoch": 52.97, + "grad_norm": 0.3793259263038635, + "learning_rate": 1.0556932524415507e-05, + "loss": 0.3354, + "step": 358000 + }, + { + "epoch": 53.05, + "grad_norm": 0.3828275203704834, + "learning_rate": 1.0528031592187039e-05, + "loss": 0.3348, + "step": 358500 + }, + { + "epoch": 53.12, + "grad_norm": 0.380776971578598, + "learning_rate": 1.0499130659958568e-05, + "loss": 0.3375, + "step": 359000 + }, + { + "epoch": 53.2, + "grad_norm": 0.40259137749671936, + "learning_rate": 1.0470229727730098e-05, + "loss": 0.3366, + "step": 359500 + }, + { + "epoch": 53.27, + "grad_norm": 0.3794288635253906, + "learning_rate": 1.0441328795501629e-05, + "loss": 0.3343, + "step": 360000 + }, + { + "epoch": 53.34, + "grad_norm": 0.44558051228523254, + "learning_rate": 1.0412427863273158e-05, + "loss": 0.3355, + "step": 360500 + }, + { + "epoch": 53.42, + "grad_norm": 0.42926931381225586, + "learning_rate": 1.0383526931044688e-05, + "loss": 0.3368, + "step": 361000 + }, + { + "epoch": 53.49, + "grad_norm": 0.3846406936645508, + "learning_rate": 1.035462599881622e-05, + "loss": 0.3363, + "step": 361500 + }, + { + "epoch": 53.57, + "grad_norm": 0.43000903725624084, + "learning_rate": 1.0325725066587749e-05, + "loss": 0.3338, + "step": 362000 + }, + { + "epoch": 53.64, + "grad_norm": 0.42310836911201477, + "learning_rate": 1.0296824134359278e-05, + "loss": 0.336, + "step": 362500 + }, + { + "epoch": 53.71, + "grad_norm": 0.3451327681541443, + "learning_rate": 1.026792320213081e-05, + "loss": 0.3384, + "step": 363000 + }, + { + "epoch": 53.79, + "grad_norm": 0.4068630337715149, + "learning_rate": 1.0239022269902339e-05, + "loss": 0.3389, + "step": 363500 + }, + { + "epoch": 53.86, + "grad_norm": 0.36988091468811035, + "learning_rate": 1.0210121337673869e-05, + "loss": 0.3368, + "step": 364000 + }, + { + "epoch": 53.94, + "grad_norm": 0.37670448422431946, + "learning_rate": 1.0181220405445398e-05, + "loss": 0.3361, + "step": 364500 + }, + { + "epoch": 54.01, + "grad_norm": 0.4235304296016693, + "learning_rate": 1.0152319473216928e-05, + "loss": 0.3339, + "step": 365000 + }, + { + "epoch": 54.08, + "grad_norm": 0.4179520606994629, + "learning_rate": 1.0123418540988457e-05, + "loss": 0.3372, + "step": 365500 + }, + { + "epoch": 54.16, + "grad_norm": 0.3763734996318817, + "learning_rate": 1.0094517608759988e-05, + "loss": 0.3368, + "step": 366000 + }, + { + "epoch": 54.23, + "grad_norm": 0.4098796844482422, + "learning_rate": 1.0065616676531518e-05, + "loss": 0.3326, + "step": 366500 + }, + { + "epoch": 54.31, + "grad_norm": 0.41570228338241577, + "learning_rate": 1.0036715744303048e-05, + "loss": 0.3366, + "step": 367000 + }, + { + "epoch": 54.38, + "grad_norm": 0.38217049837112427, + "learning_rate": 1.0007814812074579e-05, + "loss": 0.3338, + "step": 367500 + }, + { + "epoch": 54.45, + "grad_norm": 0.36770564317703247, + "learning_rate": 9.978913879846108e-06, + "loss": 0.3323, + "step": 368000 + }, + { + "epoch": 54.53, + "grad_norm": 0.43568935990333557, + "learning_rate": 9.950012947617638e-06, + "loss": 0.3361, + "step": 368500 + }, + { + "epoch": 54.6, + "grad_norm": 0.47602441906929016, + "learning_rate": 9.921112015389169e-06, + "loss": 0.3349, + "step": 369000 + }, + { + "epoch": 54.68, + "grad_norm": 0.4022866487503052, + "learning_rate": 9.892211083160699e-06, + "loss": 0.3347, + "step": 369500 + }, + { + "epoch": 54.75, + "grad_norm": 0.3981685936450958, + "learning_rate": 9.863310150932228e-06, + "loss": 0.3351, + "step": 370000 + }, + { + "epoch": 54.82, + "grad_norm": 0.3706594705581665, + "learning_rate": 9.83440921870376e-06, + "loss": 0.3342, + "step": 370500 + }, + { + "epoch": 54.9, + "grad_norm": 0.36316171288490295, + "learning_rate": 9.805508286475289e-06, + "loss": 0.337, + "step": 371000 + }, + { + "epoch": 54.97, + "grad_norm": 0.3705138564109802, + "learning_rate": 9.776607354246819e-06, + "loss": 0.3358, + "step": 371500 + }, + { + "epoch": 55.05, + "grad_norm": 0.4264328181743622, + "learning_rate": 9.74770642201835e-06, + "loss": 0.3349, + "step": 372000 + }, + { + "epoch": 55.12, + "grad_norm": 0.39624592661857605, + "learning_rate": 9.71880548978988e-06, + "loss": 0.3327, + "step": 372500 + }, + { + "epoch": 55.19, + "grad_norm": 0.41520076990127563, + "learning_rate": 9.689904557561409e-06, + "loss": 0.3363, + "step": 373000 + }, + { + "epoch": 55.27, + "grad_norm": 0.37249574065208435, + "learning_rate": 9.66100362533294e-06, + "loss": 0.335, + "step": 373500 + }, + { + "epoch": 55.34, + "grad_norm": 0.42657721042633057, + "learning_rate": 9.632102693104468e-06, + "loss": 0.3353, + "step": 374000 + }, + { + "epoch": 55.42, + "grad_norm": 0.3780669569969177, + "learning_rate": 9.603201760875998e-06, + "loss": 0.337, + "step": 374500 + }, + { + "epoch": 55.49, + "grad_norm": 0.3783871829509735, + "learning_rate": 9.574300828647529e-06, + "loss": 0.3348, + "step": 375000 + }, + { + "epoch": 55.56, + "grad_norm": 0.4328089952468872, + "learning_rate": 9.545399896419058e-06, + "loss": 0.3366, + "step": 375500 + }, + { + "epoch": 55.64, + "grad_norm": 0.3957238793373108, + "learning_rate": 9.516498964190588e-06, + "loss": 0.3344, + "step": 376000 + }, + { + "epoch": 55.71, + "grad_norm": 0.3606773614883423, + "learning_rate": 9.487598031962119e-06, + "loss": 0.3342, + "step": 376500 + }, + { + "epoch": 55.79, + "grad_norm": 0.4170531928539276, + "learning_rate": 9.458697099733649e-06, + "loss": 0.3349, + "step": 377000 + }, + { + "epoch": 55.86, + "grad_norm": 0.3830915093421936, + "learning_rate": 9.42979616750518e-06, + "loss": 0.3371, + "step": 377500 + }, + { + "epoch": 55.93, + "grad_norm": 0.4350239634513855, + "learning_rate": 9.40089523527671e-06, + "loss": 0.3377, + "step": 378000 + }, + { + "epoch": 56.01, + "grad_norm": 0.37382885813713074, + "learning_rate": 9.371994303048239e-06, + "loss": 0.3362, + "step": 378500 + }, + { + "epoch": 56.08, + "grad_norm": 0.3806856870651245, + "learning_rate": 9.34309337081977e-06, + "loss": 0.3347, + "step": 379000 + }, + { + "epoch": 56.16, + "grad_norm": 0.3189554214477539, + "learning_rate": 9.3141924385913e-06, + "loss": 0.3363, + "step": 379500 + }, + { + "epoch": 56.23, + "grad_norm": 0.33894240856170654, + "learning_rate": 9.28529150636283e-06, + "loss": 0.3362, + "step": 380000 + }, + { + "epoch": 56.3, + "grad_norm": 0.4565516710281372, + "learning_rate": 9.25639057413436e-06, + "loss": 0.3331, + "step": 380500 + }, + { + "epoch": 56.38, + "grad_norm": 0.4101388156414032, + "learning_rate": 9.22748964190589e-06, + "loss": 0.335, + "step": 381000 + }, + { + "epoch": 56.45, + "grad_norm": 0.40449845790863037, + "learning_rate": 9.19858870967742e-06, + "loss": 0.3337, + "step": 381500 + }, + { + "epoch": 56.53, + "grad_norm": 0.47349539399147034, + "learning_rate": 9.16968777744895e-06, + "loss": 0.3328, + "step": 382000 + }, + { + "epoch": 56.6, + "grad_norm": 0.42848438024520874, + "learning_rate": 9.14078684522048e-06, + "loss": 0.334, + "step": 382500 + }, + { + "epoch": 56.67, + "grad_norm": 0.3625510334968567, + "learning_rate": 9.11188591299201e-06, + "loss": 0.3321, + "step": 383000 + }, + { + "epoch": 56.75, + "grad_norm": 0.3561297357082367, + "learning_rate": 9.082984980763541e-06, + "loss": 0.3349, + "step": 383500 + }, + { + "epoch": 56.82, + "grad_norm": 0.3738841414451599, + "learning_rate": 9.054084048535069e-06, + "loss": 0.3366, + "step": 384000 + }, + { + "epoch": 56.9, + "grad_norm": 0.33738991618156433, + "learning_rate": 9.025183116306599e-06, + "loss": 0.3327, + "step": 384500 + }, + { + "epoch": 56.97, + "grad_norm": 0.42749759554862976, + "learning_rate": 8.99628218407813e-06, + "loss": 0.336, + "step": 385000 + }, + { + "epoch": 57.04, + "grad_norm": 0.4089387059211731, + "learning_rate": 8.96738125184966e-06, + "loss": 0.3334, + "step": 385500 + }, + { + "epoch": 57.12, + "grad_norm": 0.3684140145778656, + "learning_rate": 8.938480319621189e-06, + "loss": 0.3345, + "step": 386000 + }, + { + "epoch": 57.19, + "grad_norm": 0.3694292902946472, + "learning_rate": 8.90957938739272e-06, + "loss": 0.3333, + "step": 386500 + }, + { + "epoch": 57.27, + "grad_norm": 0.31505081057548523, + "learning_rate": 8.88067845516425e-06, + "loss": 0.3339, + "step": 387000 + }, + { + "epoch": 57.34, + "grad_norm": 0.4051445722579956, + "learning_rate": 8.85177752293578e-06, + "loss": 0.3348, + "step": 387500 + }, + { + "epoch": 57.41, + "grad_norm": 0.426145076751709, + "learning_rate": 8.82287659070731e-06, + "loss": 0.3307, + "step": 388000 + }, + { + "epoch": 57.49, + "grad_norm": 0.4356764256954193, + "learning_rate": 8.79397565847884e-06, + "loss": 0.3336, + "step": 388500 + }, + { + "epoch": 57.56, + "grad_norm": 0.39635592699050903, + "learning_rate": 8.76507472625037e-06, + "loss": 0.3355, + "step": 389000 + }, + { + "epoch": 57.64, + "grad_norm": 0.4467043876647949, + "learning_rate": 8.7361737940219e-06, + "loss": 0.3369, + "step": 389500 + }, + { + "epoch": 57.71, + "grad_norm": 0.5042401552200317, + "learning_rate": 8.70727286179343e-06, + "loss": 0.3352, + "step": 390000 + }, + { + "epoch": 57.78, + "grad_norm": 0.38742733001708984, + "learning_rate": 8.67837192956496e-06, + "loss": 0.3349, + "step": 390500 + }, + { + "epoch": 57.86, + "grad_norm": 0.35748493671417236, + "learning_rate": 8.649470997336491e-06, + "loss": 0.3331, + "step": 391000 + }, + { + "epoch": 57.93, + "grad_norm": 0.406547486782074, + "learning_rate": 8.62057006510802e-06, + "loss": 0.3345, + "step": 391500 + }, + { + "epoch": 58.01, + "grad_norm": 0.37016528844833374, + "learning_rate": 8.59166913287955e-06, + "loss": 0.3338, + "step": 392000 + }, + { + "epoch": 58.08, + "grad_norm": 0.39589524269104004, + "learning_rate": 8.562768200651081e-06, + "loss": 0.3334, + "step": 392500 + }, + { + "epoch": 58.15, + "grad_norm": 0.42654627561569214, + "learning_rate": 8.533867268422611e-06, + "loss": 0.3336, + "step": 393000 + }, + { + "epoch": 58.23, + "grad_norm": 0.4174553453922272, + "learning_rate": 8.504966336194139e-06, + "loss": 0.3339, + "step": 393500 + }, + { + "epoch": 58.3, + "grad_norm": 0.43379977345466614, + "learning_rate": 8.47606540396567e-06, + "loss": 0.3329, + "step": 394000 + }, + { + "epoch": 58.38, + "grad_norm": 0.3706502914428711, + "learning_rate": 8.4471644717372e-06, + "loss": 0.332, + "step": 394500 + }, + { + "epoch": 58.45, + "grad_norm": 0.4529905319213867, + "learning_rate": 8.41826353950873e-06, + "loss": 0.3342, + "step": 395000 + }, + { + "epoch": 58.52, + "grad_norm": 0.4060870110988617, + "learning_rate": 8.38936260728026e-06, + "loss": 0.3331, + "step": 395500 + }, + { + "epoch": 58.6, + "grad_norm": 0.4102860689163208, + "learning_rate": 8.36046167505179e-06, + "loss": 0.3339, + "step": 396000 + }, + { + "epoch": 58.67, + "grad_norm": 0.38025009632110596, + "learning_rate": 8.331560742823321e-06, + "loss": 0.3334, + "step": 396500 + }, + { + "epoch": 58.75, + "grad_norm": 0.3559959828853607, + "learning_rate": 8.30265981059485e-06, + "loss": 0.334, + "step": 397000 + }, + { + "epoch": 58.82, + "grad_norm": 0.48199519515037537, + "learning_rate": 8.27375887836638e-06, + "loss": 0.3328, + "step": 397500 + }, + { + "epoch": 58.89, + "grad_norm": 0.40932905673980713, + "learning_rate": 8.244857946137912e-06, + "loss": 0.3314, + "step": 398000 + }, + { + "epoch": 58.97, + "grad_norm": 0.4070405960083008, + "learning_rate": 8.215957013909441e-06, + "loss": 0.3354, + "step": 398500 + }, + { + "epoch": 59.04, + "grad_norm": 0.392281711101532, + "learning_rate": 8.18705608168097e-06, + "loss": 0.3324, + "step": 399000 + }, + { + "epoch": 59.12, + "grad_norm": 0.38242244720458984, + "learning_rate": 8.158155149452502e-06, + "loss": 0.3313, + "step": 399500 + }, + { + "epoch": 59.19, + "grad_norm": 0.4169810712337494, + "learning_rate": 8.129254217224031e-06, + "loss": 0.3354, + "step": 400000 + }, + { + "epoch": 59.26, + "grad_norm": 0.335362046957016, + "learning_rate": 8.100353284995561e-06, + "loss": 0.3312, + "step": 400500 + }, + { + "epoch": 59.34, + "grad_norm": 0.41095077991485596, + "learning_rate": 8.071452352767092e-06, + "loss": 0.3331, + "step": 401000 + }, + { + "epoch": 59.41, + "grad_norm": 0.39492741227149963, + "learning_rate": 8.042551420538622e-06, + "loss": 0.3314, + "step": 401500 + }, + { + "epoch": 59.49, + "grad_norm": 0.42789730429649353, + "learning_rate": 8.013650488310151e-06, + "loss": 0.333, + "step": 402000 + }, + { + "epoch": 59.56, + "grad_norm": 0.35511842370033264, + "learning_rate": 7.984749556081683e-06, + "loss": 0.3346, + "step": 402500 + }, + { + "epoch": 59.63, + "grad_norm": 0.36928626894950867, + "learning_rate": 7.95584862385321e-06, + "loss": 0.335, + "step": 403000 + }, + { + "epoch": 59.71, + "grad_norm": 0.4076744318008423, + "learning_rate": 7.92694769162474e-06, + "loss": 0.3294, + "step": 403500 + }, + { + "epoch": 59.78, + "grad_norm": 0.35494473576545715, + "learning_rate": 7.898046759396271e-06, + "loss": 0.3336, + "step": 404000 + }, + { + "epoch": 59.85, + "grad_norm": 0.3991703689098358, + "learning_rate": 7.8691458271678e-06, + "loss": 0.3294, + "step": 404500 + }, + { + "epoch": 59.93, + "grad_norm": 0.3891808092594147, + "learning_rate": 7.84024489493933e-06, + "loss": 0.3349, + "step": 405000 + }, + { + "epoch": 60.0, + "grad_norm": 0.5921450257301331, + "learning_rate": 7.811343962710861e-06, + "loss": 0.3331, + "step": 405500 + }, + { + "epoch": 60.08, + "grad_norm": 0.387185275554657, + "learning_rate": 7.782443030482391e-06, + "loss": 0.3326, + "step": 406000 + }, + { + "epoch": 60.15, + "grad_norm": 0.5411362648010254, + "learning_rate": 7.75354209825392e-06, + "loss": 0.3303, + "step": 406500 + }, + { + "epoch": 60.22, + "grad_norm": 0.35113802552223206, + "learning_rate": 7.724641166025452e-06, + "loss": 0.3343, + "step": 407000 + }, + { + "epoch": 60.3, + "grad_norm": 0.3711684048175812, + "learning_rate": 7.695740233796981e-06, + "loss": 0.3316, + "step": 407500 + }, + { + "epoch": 60.37, + "grad_norm": 0.40576910972595215, + "learning_rate": 7.666839301568511e-06, + "loss": 0.3344, + "step": 408000 + }, + { + "epoch": 60.45, + "grad_norm": 0.4487907588481903, + "learning_rate": 7.637938369340042e-06, + "loss": 0.3337, + "step": 408500 + }, + { + "epoch": 60.52, + "grad_norm": 0.4065958857536316, + "learning_rate": 7.609037437111572e-06, + "loss": 0.3314, + "step": 409000 + }, + { + "epoch": 60.59, + "grad_norm": 0.4283113479614258, + "learning_rate": 7.580136504883102e-06, + "loss": 0.3337, + "step": 409500 + }, + { + "epoch": 60.67, + "grad_norm": 0.4433044493198395, + "learning_rate": 7.5512355726546325e-06, + "loss": 0.3317, + "step": 410000 + }, + { + "epoch": 60.74, + "grad_norm": 0.38607364892959595, + "learning_rate": 7.522334640426161e-06, + "loss": 0.333, + "step": 410500 + }, + { + "epoch": 60.82, + "grad_norm": 0.45367687940597534, + "learning_rate": 7.4934337081976916e-06, + "loss": 0.3298, + "step": 411000 + }, + { + "epoch": 60.89, + "grad_norm": 0.4054895043373108, + "learning_rate": 7.464532775969222e-06, + "loss": 0.3318, + "step": 411500 + }, + { + "epoch": 60.96, + "grad_norm": 0.41600409150123596, + "learning_rate": 7.4356318437407515e-06, + "loss": 0.3313, + "step": 412000 + }, + { + "epoch": 61.04, + "grad_norm": 0.4171212911605835, + "learning_rate": 7.406730911512282e-06, + "loss": 0.3318, + "step": 412500 + }, + { + "epoch": 61.11, + "grad_norm": 0.40264466404914856, + "learning_rate": 7.377829979283812e-06, + "loss": 0.3335, + "step": 413000 + }, + { + "epoch": 61.19, + "grad_norm": 0.37919875979423523, + "learning_rate": 7.348929047055342e-06, + "loss": 0.3324, + "step": 413500 + }, + { + "epoch": 61.26, + "grad_norm": 0.47246700525283813, + "learning_rate": 7.320028114826872e-06, + "loss": 0.3341, + "step": 414000 + }, + { + "epoch": 61.33, + "grad_norm": 0.4305689036846161, + "learning_rate": 7.291127182598403e-06, + "loss": 0.3335, + "step": 414500 + }, + { + "epoch": 61.41, + "grad_norm": 0.38494426012039185, + "learning_rate": 7.262226250369932e-06, + "loss": 0.3337, + "step": 415000 + }, + { + "epoch": 61.48, + "grad_norm": 0.45139452815055847, + "learning_rate": 7.233325318141462e-06, + "loss": 0.3322, + "step": 415500 + }, + { + "epoch": 61.56, + "grad_norm": 0.4199995994567871, + "learning_rate": 7.204424385912992e-06, + "loss": 0.3302, + "step": 416000 + }, + { + "epoch": 61.63, + "grad_norm": 0.3823252022266388, + "learning_rate": 7.175523453684522e-06, + "loss": 0.333, + "step": 416500 + }, + { + "epoch": 61.7, + "grad_norm": 0.38762542605400085, + "learning_rate": 7.146622521456052e-06, + "loss": 0.3338, + "step": 417000 + }, + { + "epoch": 61.78, + "grad_norm": 0.3889346718788147, + "learning_rate": 7.117721589227582e-06, + "loss": 0.3333, + "step": 417500 + }, + { + "epoch": 61.85, + "grad_norm": 0.43703803420066833, + "learning_rate": 7.088820656999113e-06, + "loss": 0.3313, + "step": 418000 + }, + { + "epoch": 61.93, + "grad_norm": 0.37083032727241516, + "learning_rate": 7.059919724770642e-06, + "loss": 0.3327, + "step": 418500 + }, + { + "epoch": 62.0, + "grad_norm": 0.431436687707901, + "learning_rate": 7.031018792542173e-06, + "loss": 0.3275, + "step": 419000 + }, + { + "epoch": 62.07, + "grad_norm": 0.38710957765579224, + "learning_rate": 7.002117860313703e-06, + "loss": 0.3315, + "step": 419500 + }, + { + "epoch": 62.15, + "grad_norm": 0.4548743963241577, + "learning_rate": 6.973216928085232e-06, + "loss": 0.3314, + "step": 420000 + }, + { + "epoch": 62.22, + "grad_norm": 0.4413709342479706, + "learning_rate": 6.944315995856762e-06, + "loss": 0.3317, + "step": 420500 + }, + { + "epoch": 62.3, + "grad_norm": 0.42544716596603394, + "learning_rate": 6.915415063628293e-06, + "loss": 0.3327, + "step": 421000 + }, + { + "epoch": 62.37, + "grad_norm": 0.4307864010334015, + "learning_rate": 6.886514131399822e-06, + "loss": 0.3335, + "step": 421500 + }, + { + "epoch": 62.44, + "grad_norm": 0.4296441376209259, + "learning_rate": 6.8576131991713526e-06, + "loss": 0.3317, + "step": 422000 + }, + { + "epoch": 62.52, + "grad_norm": 0.3624299466609955, + "learning_rate": 6.828712266942883e-06, + "loss": 0.3307, + "step": 422500 + }, + { + "epoch": 62.59, + "grad_norm": 0.4123700261116028, + "learning_rate": 6.7998113347144125e-06, + "loss": 0.3317, + "step": 423000 + }, + { + "epoch": 62.67, + "grad_norm": 0.4546355903148651, + "learning_rate": 6.770910402485943e-06, + "loss": 0.3288, + "step": 423500 + }, + { + "epoch": 62.74, + "grad_norm": 0.4328787922859192, + "learning_rate": 6.742009470257473e-06, + "loss": 0.3321, + "step": 424000 + }, + { + "epoch": 62.81, + "grad_norm": 0.39879125356674194, + "learning_rate": 6.713108538029003e-06, + "loss": 0.334, + "step": 424500 + }, + { + "epoch": 62.89, + "grad_norm": 0.42407459020614624, + "learning_rate": 6.684207605800532e-06, + "loss": 0.3312, + "step": 425000 + }, + { + "epoch": 62.96, + "grad_norm": 0.5664127469062805, + "learning_rate": 6.655306673572063e-06, + "loss": 0.3323, + "step": 425500 + }, + { + "epoch": 63.04, + "grad_norm": 0.47169846296310425, + "learning_rate": 6.626405741343592e-06, + "loss": 0.3309, + "step": 426000 + }, + { + "epoch": 63.11, + "grad_norm": 0.3552204668521881, + "learning_rate": 6.597504809115123e-06, + "loss": 0.33, + "step": 426500 + }, + { + "epoch": 63.18, + "grad_norm": 0.44585150480270386, + "learning_rate": 6.568603876886653e-06, + "loss": 0.3306, + "step": 427000 + }, + { + "epoch": 63.26, + "grad_norm": 0.4512608051300049, + "learning_rate": 6.5397029446581835e-06, + "loss": 0.3308, + "step": 427500 + }, + { + "epoch": 63.33, + "grad_norm": 0.40121740102767944, + "learning_rate": 6.510802012429713e-06, + "loss": 0.3302, + "step": 428000 + }, + { + "epoch": 63.41, + "grad_norm": 0.4354041516780853, + "learning_rate": 6.481901080201243e-06, + "loss": 0.3327, + "step": 428500 + }, + { + "epoch": 63.48, + "grad_norm": 0.4612290561199188, + "learning_rate": 6.453000147972774e-06, + "loss": 0.3311, + "step": 429000 + }, + { + "epoch": 63.55, + "grad_norm": 0.4508548676967621, + "learning_rate": 6.424099215744303e-06, + "loss": 0.3312, + "step": 429500 + }, + { + "epoch": 63.63, + "grad_norm": 0.4045092761516571, + "learning_rate": 6.395198283515833e-06, + "loss": 0.3313, + "step": 430000 + }, + { + "epoch": 63.7, + "grad_norm": 0.4180326759815216, + "learning_rate": 6.366297351287363e-06, + "loss": 0.3324, + "step": 430500 + }, + { + "epoch": 63.78, + "grad_norm": 0.3800413906574249, + "learning_rate": 6.337396419058893e-06, + "loss": 0.3357, + "step": 431000 + }, + { + "epoch": 63.85, + "grad_norm": 0.4264669716358185, + "learning_rate": 6.308495486830423e-06, + "loss": 0.3314, + "step": 431500 + }, + { + "epoch": 63.92, + "grad_norm": 0.4021168351173401, + "learning_rate": 6.279594554601954e-06, + "loss": 0.3301, + "step": 432000 + }, + { + "epoch": 64.0, + "grad_norm": 0.4635623097419739, + "learning_rate": 6.250693622373483e-06, + "loss": 0.3304, + "step": 432500 + }, + { + "epoch": 64.07, + "grad_norm": 0.4012512266635895, + "learning_rate": 6.2217926901450136e-06, + "loss": 0.3322, + "step": 433000 + }, + { + "epoch": 64.15, + "grad_norm": 0.4430687725543976, + "learning_rate": 6.192891757916544e-06, + "loss": 0.3302, + "step": 433500 + }, + { + "epoch": 64.22, + "grad_norm": 0.43903249502182007, + "learning_rate": 6.1639908256880735e-06, + "loss": 0.3326, + "step": 434000 + }, + { + "epoch": 64.29, + "grad_norm": 0.5228444337844849, + "learning_rate": 6.135089893459604e-06, + "loss": 0.3298, + "step": 434500 + }, + { + "epoch": 64.37, + "grad_norm": 0.43113288283348083, + "learning_rate": 6.1061889612311334e-06, + "loss": 0.3291, + "step": 435000 + }, + { + "epoch": 64.44, + "grad_norm": 0.47652667760849, + "learning_rate": 6.077288029002663e-06, + "loss": 0.3299, + "step": 435500 + }, + { + "epoch": 64.52, + "grad_norm": 0.4017566442489624, + "learning_rate": 6.048387096774193e-06, + "loss": 0.3312, + "step": 436000 + }, + { + "epoch": 64.59, + "grad_norm": 0.4369170069694519, + "learning_rate": 6.019486164545724e-06, + "loss": 0.3339, + "step": 436500 + }, + { + "epoch": 64.66, + "grad_norm": 0.36806294322013855, + "learning_rate": 5.990585232317254e-06, + "loss": 0.3317, + "step": 437000 + }, + { + "epoch": 64.74, + "grad_norm": 0.42576882243156433, + "learning_rate": 5.961684300088784e-06, + "loss": 0.3309, + "step": 437500 + }, + { + "epoch": 64.81, + "grad_norm": 0.4077777564525604, + "learning_rate": 5.932783367860314e-06, + "loss": 0.3319, + "step": 438000 + }, + { + "epoch": 64.89, + "grad_norm": 0.4394007921218872, + "learning_rate": 5.9038824356318445e-06, + "loss": 0.3327, + "step": 438500 + }, + { + "epoch": 64.96, + "grad_norm": 0.32965216040611267, + "learning_rate": 5.874981503403374e-06, + "loss": 0.3277, + "step": 439000 + }, + { + "epoch": 65.03, + "grad_norm": 0.4312441945075989, + "learning_rate": 5.846080571174904e-06, + "loss": 0.3291, + "step": 439500 + }, + { + "epoch": 65.11, + "grad_norm": 0.3752184808254242, + "learning_rate": 5.817179638946434e-06, + "loss": 0.3319, + "step": 440000 + }, + { + "epoch": 65.18, + "grad_norm": 0.4169740080833435, + "learning_rate": 5.7882787067179635e-06, + "loss": 0.331, + "step": 440500 + }, + { + "epoch": 65.26, + "grad_norm": 0.43580740690231323, + "learning_rate": 5.759377774489494e-06, + "loss": 0.3301, + "step": 441000 + }, + { + "epoch": 65.33, + "grad_norm": 0.46015655994415283, + "learning_rate": 5.730476842261024e-06, + "loss": 0.3326, + "step": 441500 + }, + { + "epoch": 65.4, + "grad_norm": 0.4646316468715668, + "learning_rate": 5.701575910032554e-06, + "loss": 0.3307, + "step": 442000 + }, + { + "epoch": 65.48, + "grad_norm": 0.4371485114097595, + "learning_rate": 5.672674977804084e-06, + "loss": 0.3303, + "step": 442500 + }, + { + "epoch": 65.55, + "grad_norm": 0.443768173456192, + "learning_rate": 5.643774045575615e-06, + "loss": 0.3315, + "step": 443000 + }, + { + "epoch": 65.63, + "grad_norm": 0.44002553820610046, + "learning_rate": 5.614873113347144e-06, + "loss": 0.3305, + "step": 443500 + }, + { + "epoch": 65.7, + "grad_norm": 0.39671292901039124, + "learning_rate": 5.5859721811186746e-06, + "loss": 0.3312, + "step": 444000 + }, + { + "epoch": 65.77, + "grad_norm": 0.4188387393951416, + "learning_rate": 5.557071248890204e-06, + "loss": 0.3302, + "step": 444500 + }, + { + "epoch": 65.85, + "grad_norm": 0.44623398780822754, + "learning_rate": 5.528170316661734e-06, + "loss": 0.3308, + "step": 445000 + }, + { + "epoch": 65.92, + "grad_norm": 0.36335235834121704, + "learning_rate": 5.499269384433264e-06, + "loss": 0.3293, + "step": 445500 + }, + { + "epoch": 66.0, + "grad_norm": 0.41810572147369385, + "learning_rate": 5.4703684522047944e-06, + "loss": 0.329, + "step": 446000 + }, + { + "epoch": 66.07, + "grad_norm": 0.4002617299556732, + "learning_rate": 5.441467519976325e-06, + "loss": 0.3278, + "step": 446500 + }, + { + "epoch": 66.14, + "grad_norm": 0.45273175835609436, + "learning_rate": 5.412566587747854e-06, + "loss": 0.3303, + "step": 447000 + }, + { + "epoch": 66.22, + "grad_norm": 0.48169875144958496, + "learning_rate": 5.383665655519385e-06, + "loss": 0.332, + "step": 447500 + }, + { + "epoch": 66.29, + "grad_norm": 0.39927640557289124, + "learning_rate": 5.354764723290915e-06, + "loss": 0.3296, + "step": 448000 + }, + { + "epoch": 66.37, + "grad_norm": 0.42319226264953613, + "learning_rate": 5.325863791062445e-06, + "loss": 0.3309, + "step": 448500 + }, + { + "epoch": 66.44, + "grad_norm": 0.4284779131412506, + "learning_rate": 5.296962858833975e-06, + "loss": 0.3321, + "step": 449000 + }, + { + "epoch": 66.51, + "grad_norm": 0.5179397463798523, + "learning_rate": 5.268061926605505e-06, + "loss": 0.33, + "step": 449500 + }, + { + "epoch": 66.59, + "grad_norm": 0.44250035285949707, + "learning_rate": 5.239160994377034e-06, + "loss": 0.3295, + "step": 450000 + }, + { + "epoch": 66.66, + "grad_norm": 0.46015605330467224, + "learning_rate": 5.210260062148565e-06, + "loss": 0.3313, + "step": 450500 + }, + { + "epoch": 66.74, + "grad_norm": 0.5012817978858948, + "learning_rate": 5.181359129920095e-06, + "loss": 0.3302, + "step": 451000 + }, + { + "epoch": 66.81, + "grad_norm": 0.403338223695755, + "learning_rate": 5.1524581976916245e-06, + "loss": 0.3306, + "step": 451500 + }, + { + "epoch": 66.88, + "grad_norm": 0.4086831212043762, + "learning_rate": 5.123557265463155e-06, + "loss": 0.3286, + "step": 452000 + }, + { + "epoch": 66.96, + "grad_norm": 0.3715237081050873, + "learning_rate": 5.094656333234685e-06, + "loss": 0.3301, + "step": 452500 + }, + { + "epoch": 67.03, + "grad_norm": 0.46829870343208313, + "learning_rate": 5.065755401006215e-06, + "loss": 0.3307, + "step": 453000 + }, + { + "epoch": 67.11, + "grad_norm": 0.4667709767818451, + "learning_rate": 5.036854468777745e-06, + "loss": 0.3298, + "step": 453500 + }, + { + "epoch": 67.18, + "grad_norm": 0.4758981466293335, + "learning_rate": 5.007953536549275e-06, + "loss": 0.3272, + "step": 454000 + }, + { + "epoch": 67.25, + "grad_norm": 0.48276805877685547, + "learning_rate": 4.979052604320804e-06, + "loss": 0.3288, + "step": 454500 + }, + { + "epoch": 67.33, + "grad_norm": 0.400806725025177, + "learning_rate": 4.950151672092335e-06, + "loss": 0.3258, + "step": 455000 + }, + { + "epoch": 67.4, + "grad_norm": 0.40156251192092896, + "learning_rate": 4.921250739863865e-06, + "loss": 0.3351, + "step": 455500 + }, + { + "epoch": 67.48, + "grad_norm": 0.5024535655975342, + "learning_rate": 4.8923498076353955e-06, + "loss": 0.3306, + "step": 456000 + }, + { + "epoch": 67.55, + "grad_norm": 0.52587890625, + "learning_rate": 4.863448875406925e-06, + "loss": 0.3331, + "step": 456500 + }, + { + "epoch": 67.62, + "grad_norm": 0.41265735030174255, + "learning_rate": 4.8345479431784554e-06, + "loss": 0.3328, + "step": 457000 + }, + { + "epoch": 67.7, + "grad_norm": 0.34202754497528076, + "learning_rate": 4.805647010949986e-06, + "loss": 0.3321, + "step": 457500 + }, + { + "epoch": 67.77, + "grad_norm": 0.4898373484611511, + "learning_rate": 4.776746078721515e-06, + "loss": 0.331, + "step": 458000 + }, + { + "epoch": 67.85, + "grad_norm": 0.52295982837677, + "learning_rate": 4.747845146493046e-06, + "loss": 0.3306, + "step": 458500 + }, + { + "epoch": 67.92, + "grad_norm": 0.46750620007514954, + "learning_rate": 4.718944214264575e-06, + "loss": 0.3315, + "step": 459000 + }, + { + "epoch": 67.99, + "grad_norm": 0.35533860325813293, + "learning_rate": 4.690043282036105e-06, + "loss": 0.3315, + "step": 459500 + }, + { + "epoch": 68.07, + "grad_norm": 0.41508856415748596, + "learning_rate": 4.661142349807635e-06, + "loss": 0.3291, + "step": 460000 + }, + { + "epoch": 68.14, + "grad_norm": 0.4271659851074219, + "learning_rate": 4.632241417579166e-06, + "loss": 0.3286, + "step": 460500 + }, + { + "epoch": 68.22, + "grad_norm": 0.44648808240890503, + "learning_rate": 4.603340485350695e-06, + "loss": 0.3299, + "step": 461000 + }, + { + "epoch": 68.29, + "grad_norm": 0.4843562841415405, + "learning_rate": 4.574439553122226e-06, + "loss": 0.3282, + "step": 461500 + }, + { + "epoch": 68.36, + "grad_norm": 0.41266024112701416, + "learning_rate": 4.545538620893756e-06, + "loss": 0.3299, + "step": 462000 + }, + { + "epoch": 68.44, + "grad_norm": 0.4088280200958252, + "learning_rate": 4.5166376886652855e-06, + "loss": 0.332, + "step": 462500 + }, + { + "epoch": 68.51, + "grad_norm": 0.48477041721343994, + "learning_rate": 4.487736756436816e-06, + "loss": 0.3312, + "step": 463000 + }, + { + "epoch": 68.59, + "grad_norm": 0.42487454414367676, + "learning_rate": 4.458835824208346e-06, + "loss": 0.3296, + "step": 463500 + }, + { + "epoch": 68.66, + "grad_norm": 0.4671236276626587, + "learning_rate": 4.429934891979876e-06, + "loss": 0.3288, + "step": 464000 + }, + { + "epoch": 68.73, + "grad_norm": 0.4430939257144928, + "learning_rate": 4.401033959751405e-06, + "loss": 0.3297, + "step": 464500 + }, + { + "epoch": 68.81, + "grad_norm": 0.4080400764942169, + "learning_rate": 4.372133027522936e-06, + "loss": 0.3305, + "step": 465000 + }, + { + "epoch": 68.88, + "grad_norm": 0.3743002712726593, + "learning_rate": 4.343232095294466e-06, + "loss": 0.3294, + "step": 465500 + }, + { + "epoch": 68.96, + "grad_norm": 0.3991639316082001, + "learning_rate": 4.314331163065996e-06, + "loss": 0.3293, + "step": 466000 + }, + { + "epoch": 69.03, + "grad_norm": 0.40404531359672546, + "learning_rate": 4.285430230837526e-06, + "loss": 0.3307, + "step": 466500 + }, + { + "epoch": 69.1, + "grad_norm": 0.4253464639186859, + "learning_rate": 4.2565292986090565e-06, + "loss": 0.3278, + "step": 467000 + }, + { + "epoch": 69.18, + "grad_norm": 0.43970435857772827, + "learning_rate": 4.227628366380586e-06, + "loss": 0.3284, + "step": 467500 + }, + { + "epoch": 69.25, + "grad_norm": 0.42423635721206665, + "learning_rate": 4.1987274341521164e-06, + "loss": 0.3337, + "step": 468000 + }, + { + "epoch": 69.33, + "grad_norm": 0.4581485092639923, + "learning_rate": 4.169826501923646e-06, + "loss": 0.3273, + "step": 468500 + }, + { + "epoch": 69.4, + "grad_norm": 0.4594268500804901, + "learning_rate": 4.1409255696951755e-06, + "loss": 0.3295, + "step": 469000 + }, + { + "epoch": 69.47, + "grad_norm": 0.49994996190071106, + "learning_rate": 4.112024637466706e-06, + "loss": 0.3267, + "step": 469500 + }, + { + "epoch": 69.55, + "grad_norm": 0.4062737822532654, + "learning_rate": 4.083123705238236e-06, + "loss": 0.3283, + "step": 470000 + }, + { + "epoch": 69.62, + "grad_norm": 0.4764838218688965, + "learning_rate": 4.054222773009766e-06, + "loss": 0.3318, + "step": 470500 + }, + { + "epoch": 69.7, + "grad_norm": 0.42747876048088074, + "learning_rate": 4.025321840781296e-06, + "loss": 0.3311, + "step": 471000 + }, + { + "epoch": 69.77, + "grad_norm": 0.45434367656707764, + "learning_rate": 3.996420908552827e-06, + "loss": 0.3293, + "step": 471500 + }, + { + "epoch": 69.84, + "grad_norm": 0.387123703956604, + "learning_rate": 3.967519976324356e-06, + "loss": 0.3311, + "step": 472000 + }, + { + "epoch": 69.92, + "grad_norm": 0.412826806306839, + "learning_rate": 3.938619044095887e-06, + "loss": 0.329, + "step": 472500 + }, + { + "epoch": 69.99, + "grad_norm": 0.532727062702179, + "learning_rate": 3.909718111867417e-06, + "loss": 0.3266, + "step": 473000 + }, + { + "epoch": 70.07, + "grad_norm": 0.4674714505672455, + "learning_rate": 3.8808171796389465e-06, + "loss": 0.3257, + "step": 473500 + }, + { + "epoch": 70.14, + "grad_norm": 0.3989239037036896, + "learning_rate": 3.851916247410477e-06, + "loss": 0.3288, + "step": 474000 + }, + { + "epoch": 70.21, + "grad_norm": 0.5390828251838684, + "learning_rate": 3.8230153151820065e-06, + "loss": 0.3316, + "step": 474500 + }, + { + "epoch": 70.29, + "grad_norm": 0.4232146143913269, + "learning_rate": 3.7941143829535364e-06, + "loss": 0.3297, + "step": 475000 + }, + { + "epoch": 70.36, + "grad_norm": 0.4476439654827118, + "learning_rate": 3.765213450725067e-06, + "loss": 0.3308, + "step": 475500 + }, + { + "epoch": 70.44, + "grad_norm": 0.46341538429260254, + "learning_rate": 3.7363125184965968e-06, + "loss": 0.3241, + "step": 476000 + }, + { + "epoch": 70.51, + "grad_norm": 0.3792473077774048, + "learning_rate": 3.7074115862681268e-06, + "loss": 0.3282, + "step": 476500 + }, + { + "epoch": 70.58, + "grad_norm": 0.42449694871902466, + "learning_rate": 3.6785106540396567e-06, + "loss": 0.3282, + "step": 477000 + }, + { + "epoch": 70.66, + "grad_norm": 0.389700323343277, + "learning_rate": 3.6496097218111867e-06, + "loss": 0.3332, + "step": 477500 + }, + { + "epoch": 70.73, + "grad_norm": 0.4011322855949402, + "learning_rate": 3.6207087895827167e-06, + "loss": 0.3295, + "step": 478000 + }, + { + "epoch": 70.8, + "grad_norm": 0.485365092754364, + "learning_rate": 3.591807857354247e-06, + "loss": 0.33, + "step": 478500 + }, + { + "epoch": 70.88, + "grad_norm": 0.39829009771347046, + "learning_rate": 3.562906925125777e-06, + "loss": 0.3291, + "step": 479000 + }, + { + "epoch": 70.95, + "grad_norm": 0.46039626002311707, + "learning_rate": 3.5340059928973066e-06, + "loss": 0.331, + "step": 479500 + }, + { + "epoch": 71.03, + "grad_norm": 0.40389734506607056, + "learning_rate": 3.505105060668837e-06, + "loss": 0.3286, + "step": 480000 + }, + { + "epoch": 71.1, + "grad_norm": 0.4375256299972534, + "learning_rate": 3.476204128440367e-06, + "loss": 0.3298, + "step": 480500 + }, + { + "epoch": 71.17, + "grad_norm": 0.42462676763534546, + "learning_rate": 3.4473031962118973e-06, + "loss": 0.3302, + "step": 481000 + }, + { + "epoch": 71.25, + "grad_norm": 0.3216535747051239, + "learning_rate": 3.4184022639834273e-06, + "loss": 0.3283, + "step": 481500 + }, + { + "epoch": 71.32, + "grad_norm": 0.45945799350738525, + "learning_rate": 3.389501331754957e-06, + "loss": 0.3278, + "step": 482000 + }, + { + "epoch": 71.4, + "grad_norm": 0.4495971202850342, + "learning_rate": 3.3606003995264872e-06, + "loss": 0.3265, + "step": 482500 + }, + { + "epoch": 71.47, + "grad_norm": 0.4159165322780609, + "learning_rate": 3.331699467298017e-06, + "loss": 0.3292, + "step": 483000 + }, + { + "epoch": 71.54, + "grad_norm": 0.410427063703537, + "learning_rate": 3.302798535069547e-06, + "loss": 0.3299, + "step": 483500 + }, + { + "epoch": 71.62, + "grad_norm": 0.5130240321159363, + "learning_rate": 3.2738976028410776e-06, + "loss": 0.3316, + "step": 484000 + }, + { + "epoch": 71.69, + "grad_norm": 0.4405277669429779, + "learning_rate": 3.244996670612607e-06, + "loss": 0.33, + "step": 484500 + }, + { + "epoch": 71.77, + "grad_norm": 0.575674295425415, + "learning_rate": 3.2160957383841375e-06, + "loss": 0.3298, + "step": 485000 + }, + { + "epoch": 71.84, + "grad_norm": 0.4434616267681122, + "learning_rate": 3.1871948061556675e-06, + "loss": 0.3298, + "step": 485500 + }, + { + "epoch": 71.91, + "grad_norm": 0.3960082530975342, + "learning_rate": 3.1582938739271974e-06, + "loss": 0.3282, + "step": 486000 + }, + { + "epoch": 71.99, + "grad_norm": 0.42698296904563904, + "learning_rate": 3.129392941698728e-06, + "loss": 0.3296, + "step": 486500 + }, + { + "epoch": 72.06, + "grad_norm": 0.5218748450279236, + "learning_rate": 3.1004920094702574e-06, + "loss": 0.3296, + "step": 487000 + }, + { + "epoch": 72.14, + "grad_norm": 0.46763402223587036, + "learning_rate": 3.0715910772417873e-06, + "loss": 0.3262, + "step": 487500 + }, + { + "epoch": 72.21, + "grad_norm": 0.42345327138900757, + "learning_rate": 3.0426901450133177e-06, + "loss": 0.33, + "step": 488000 + }, + { + "epoch": 72.28, + "grad_norm": 0.4526881277561188, + "learning_rate": 3.0137892127848477e-06, + "loss": 0.3304, + "step": 488500 + }, + { + "epoch": 72.36, + "grad_norm": 0.42106348276138306, + "learning_rate": 2.984888280556378e-06, + "loss": 0.3292, + "step": 489000 + }, + { + "epoch": 72.43, + "grad_norm": 0.5022510886192322, + "learning_rate": 2.9559873483279076e-06, + "loss": 0.3266, + "step": 489500 + }, + { + "epoch": 72.51, + "grad_norm": 0.4436812996864319, + "learning_rate": 2.9270864160994376e-06, + "loss": 0.3269, + "step": 490000 + }, + { + "epoch": 72.58, + "grad_norm": 0.42252251505851746, + "learning_rate": 2.898185483870968e-06, + "loss": 0.3281, + "step": 490500 + }, + { + "epoch": 72.65, + "grad_norm": 0.5339802503585815, + "learning_rate": 2.869284551642498e-06, + "loss": 0.3289, + "step": 491000 + }, + { + "epoch": 72.73, + "grad_norm": 0.3937510550022125, + "learning_rate": 2.840383619414028e-06, + "loss": 0.328, + "step": 491500 + }, + { + "epoch": 72.8, + "grad_norm": 0.3894229829311371, + "learning_rate": 2.811482687185558e-06, + "loss": 0.3282, + "step": 492000 + }, + { + "epoch": 72.88, + "grad_norm": 0.4481090307235718, + "learning_rate": 2.782581754957088e-06, + "loss": 0.3301, + "step": 492500 + }, + { + "epoch": 72.95, + "grad_norm": 0.45495444536209106, + "learning_rate": 2.753680822728618e-06, + "loss": 0.3278, + "step": 493000 + }, + { + "epoch": 73.02, + "grad_norm": 0.49259716272354126, + "learning_rate": 2.7247798905001482e-06, + "loss": 0.3295, + "step": 493500 + }, + { + "epoch": 73.1, + "grad_norm": 0.4257282018661499, + "learning_rate": 2.6958789582716778e-06, + "loss": 0.3269, + "step": 494000 + }, + { + "epoch": 73.17, + "grad_norm": 0.43159300088882446, + "learning_rate": 2.666978026043208e-06, + "loss": 0.3276, + "step": 494500 + }, + { + "epoch": 73.25, + "grad_norm": 0.4048108458518982, + "learning_rate": 2.638077093814738e-06, + "loss": 0.3333, + "step": 495000 + }, + { + "epoch": 73.32, + "grad_norm": 0.4666566252708435, + "learning_rate": 2.609176161586268e-06, + "loss": 0.3275, + "step": 495500 + }, + { + "epoch": 73.39, + "grad_norm": 0.3985891342163086, + "learning_rate": 2.5802752293577985e-06, + "loss": 0.3279, + "step": 496000 + }, + { + "epoch": 73.47, + "grad_norm": 0.5439868569374084, + "learning_rate": 2.551374297129328e-06, + "loss": 0.3275, + "step": 496500 + }, + { + "epoch": 73.54, + "grad_norm": 0.45784690976142883, + "learning_rate": 2.522473364900858e-06, + "loss": 0.3309, + "step": 497000 + }, + { + "epoch": 73.62, + "grad_norm": 0.4779771864414215, + "learning_rate": 2.4935724326723884e-06, + "loss": 0.3288, + "step": 497500 + }, + { + "epoch": 73.69, + "grad_norm": 0.47680574655532837, + "learning_rate": 2.4646715004439184e-06, + "loss": 0.3285, + "step": 498000 + }, + { + "epoch": 73.76, + "grad_norm": 0.3629719913005829, + "learning_rate": 2.4357705682154488e-06, + "loss": 0.3284, + "step": 498500 + }, + { + "epoch": 73.84, + "grad_norm": 0.46253129839897156, + "learning_rate": 2.4068696359869783e-06, + "loss": 0.3284, + "step": 499000 + }, + { + "epoch": 73.91, + "grad_norm": 0.44531476497650146, + "learning_rate": 2.3779687037585083e-06, + "loss": 0.3281, + "step": 499500 + }, + { + "epoch": 73.99, + "grad_norm": 0.39289695024490356, + "learning_rate": 2.3490677715300387e-06, + "loss": 0.326, + "step": 500000 + }, + { + "epoch": 74.06, + "grad_norm": 0.48103997111320496, + "learning_rate": 2.3201668393015686e-06, + "loss": 0.3272, + "step": 500500 + }, + { + "epoch": 74.13, + "grad_norm": 0.4336768388748169, + "learning_rate": 2.2912659070730986e-06, + "loss": 0.3265, + "step": 501000 + }, + { + "epoch": 74.21, + "grad_norm": 0.4040307402610779, + "learning_rate": 2.2623649748446286e-06, + "loss": 0.3271, + "step": 501500 + }, + { + "epoch": 74.28, + "grad_norm": 0.49081218242645264, + "learning_rate": 2.2334640426161585e-06, + "loss": 0.328, + "step": 502000 + }, + { + "epoch": 74.36, + "grad_norm": 0.44683390855789185, + "learning_rate": 2.2045631103876885e-06, + "loss": 0.3266, + "step": 502500 + }, + { + "epoch": 74.43, + "grad_norm": 0.4362635612487793, + "learning_rate": 2.175662178159219e-06, + "loss": 0.3293, + "step": 503000 + }, + { + "epoch": 74.5, + "grad_norm": 0.4326813220977783, + "learning_rate": 2.146761245930749e-06, + "loss": 0.3302, + "step": 503500 + }, + { + "epoch": 74.58, + "grad_norm": 0.5289288759231567, + "learning_rate": 2.117860313702279e-06, + "loss": 0.3288, + "step": 504000 + }, + { + "epoch": 74.65, + "grad_norm": 0.5708897709846497, + "learning_rate": 2.088959381473809e-06, + "loss": 0.3271, + "step": 504500 + }, + { + "epoch": 74.73, + "grad_norm": 0.38460394740104675, + "learning_rate": 2.0600584492453388e-06, + "loss": 0.3262, + "step": 505000 + }, + { + "epoch": 74.8, + "grad_norm": 0.4401102066040039, + "learning_rate": 2.031157517016869e-06, + "loss": 0.3285, + "step": 505500 + }, + { + "epoch": 74.87, + "grad_norm": 0.4699185788631439, + "learning_rate": 2.002256584788399e-06, + "loss": 0.3292, + "step": 506000 + }, + { + "epoch": 74.95, + "grad_norm": 0.43969598412513733, + "learning_rate": 1.9733556525599287e-06, + "loss": 0.3282, + "step": 506500 + }, + { + "epoch": 75.02, + "grad_norm": 0.5226773619651794, + "learning_rate": 1.944454720331459e-06, + "loss": 0.3275, + "step": 507000 + }, + { + "epoch": 75.1, + "grad_norm": 0.42381104826927185, + "learning_rate": 1.915553788102989e-06, + "loss": 0.3287, + "step": 507500 + }, + { + "epoch": 75.17, + "grad_norm": 0.47836771607398987, + "learning_rate": 1.886652855874519e-06, + "loss": 0.3248, + "step": 508000 + }, + { + "epoch": 75.24, + "grad_norm": 0.4760962128639221, + "learning_rate": 1.8577519236460492e-06, + "loss": 0.3255, + "step": 508500 + }, + { + "epoch": 75.32, + "grad_norm": 0.4954340159893036, + "learning_rate": 1.8288509914175794e-06, + "loss": 0.3274, + "step": 509000 + }, + { + "epoch": 75.39, + "grad_norm": 0.3998168110847473, + "learning_rate": 1.7999500591891091e-06, + "loss": 0.328, + "step": 509500 + }, + { + "epoch": 75.47, + "grad_norm": 0.3899104595184326, + "learning_rate": 1.7710491269606393e-06, + "loss": 0.3291, + "step": 510000 + }, + { + "epoch": 75.54, + "grad_norm": 0.4677903652191162, + "learning_rate": 1.7421481947321693e-06, + "loss": 0.3261, + "step": 510500 + }, + { + "epoch": 75.61, + "grad_norm": 0.41607698798179626, + "learning_rate": 1.7132472625036995e-06, + "loss": 0.3291, + "step": 511000 + }, + { + "epoch": 75.69, + "grad_norm": 0.44930896162986755, + "learning_rate": 1.6843463302752294e-06, + "loss": 0.3307, + "step": 511500 + }, + { + "epoch": 75.76, + "grad_norm": 0.48138096928596497, + "learning_rate": 1.6554453980467594e-06, + "loss": 0.3256, + "step": 512000 + }, + { + "epoch": 75.84, + "grad_norm": 0.44024384021759033, + "learning_rate": 1.6265444658182896e-06, + "loss": 0.3278, + "step": 512500 + }, + { + "epoch": 75.91, + "grad_norm": 0.42400872707366943, + "learning_rate": 1.5976435335898193e-06, + "loss": 0.3295, + "step": 513000 + }, + { + "epoch": 75.98, + "grad_norm": 0.4450230896472931, + "learning_rate": 1.5687426013613495e-06, + "loss": 0.3293, + "step": 513500 + }, + { + "epoch": 76.06, + "grad_norm": 0.4113300144672394, + "learning_rate": 1.5398416691328797e-06, + "loss": 0.3277, + "step": 514000 + }, + { + "epoch": 76.13, + "grad_norm": 0.4838961064815521, + "learning_rate": 1.5109407369044097e-06, + "loss": 0.3257, + "step": 514500 + }, + { + "epoch": 76.21, + "grad_norm": 0.45890524983406067, + "learning_rate": 1.4820398046759396e-06, + "loss": 0.3263, + "step": 515000 + }, + { + "epoch": 76.28, + "grad_norm": 0.4421687424182892, + "learning_rate": 1.4531388724474696e-06, + "loss": 0.3287, + "step": 515500 + }, + { + "epoch": 76.35, + "grad_norm": 0.4234231114387512, + "learning_rate": 1.4242379402189998e-06, + "loss": 0.3308, + "step": 516000 + }, + { + "epoch": 76.43, + "grad_norm": 0.4239380657672882, + "learning_rate": 1.3953370079905297e-06, + "loss": 0.3291, + "step": 516500 + }, + { + "epoch": 76.5, + "grad_norm": 0.4606933891773224, + "learning_rate": 1.3664360757620597e-06, + "loss": 0.3288, + "step": 517000 + }, + { + "epoch": 76.58, + "grad_norm": 0.4070008099079132, + "learning_rate": 1.33753514353359e-06, + "loss": 0.3268, + "step": 517500 + }, + { + "epoch": 76.65, + "grad_norm": 0.5978463888168335, + "learning_rate": 1.3086342113051199e-06, + "loss": 0.3272, + "step": 518000 + }, + { + "epoch": 76.72, + "grad_norm": 0.43075379729270935, + "learning_rate": 1.27973327907665e-06, + "loss": 0.3287, + "step": 518500 + }, + { + "epoch": 76.8, + "grad_norm": 0.46790510416030884, + "learning_rate": 1.2508323468481798e-06, + "loss": 0.3266, + "step": 519000 + }, + { + "epoch": 76.87, + "grad_norm": 0.45541101694107056, + "learning_rate": 1.22193141461971e-06, + "loss": 0.3254, + "step": 519500 + }, + { + "epoch": 76.95, + "grad_norm": 0.44363468885421753, + "learning_rate": 1.1930304823912402e-06, + "loss": 0.3248, + "step": 520000 + }, + { + "epoch": 77.02, + "grad_norm": 0.5055235624313354, + "learning_rate": 1.1641295501627701e-06, + "loss": 0.3265, + "step": 520500 + }, + { + "epoch": 77.09, + "grad_norm": 0.3572923541069031, + "learning_rate": 1.1352286179343e-06, + "loss": 0.324, + "step": 521000 + }, + { + "epoch": 77.17, + "grad_norm": 0.40502551198005676, + "learning_rate": 1.10632768570583e-06, + "loss": 0.3254, + "step": 521500 + }, + { + "epoch": 77.24, + "grad_norm": 0.45639294385910034, + "learning_rate": 1.0774267534773602e-06, + "loss": 0.328, + "step": 522000 + }, + { + "epoch": 77.32, + "grad_norm": 0.4580610990524292, + "learning_rate": 1.0485258212488904e-06, + "loss": 0.3278, + "step": 522500 + }, + { + "epoch": 77.39, + "grad_norm": 0.4812680184841156, + "learning_rate": 1.0196248890204202e-06, + "loss": 0.3274, + "step": 523000 + }, + { + "epoch": 77.46, + "grad_norm": 0.416979044675827, + "learning_rate": 9.907239567919504e-07, + "loss": 0.3261, + "step": 523500 + }, + { + "epoch": 77.54, + "grad_norm": 0.39473670721054077, + "learning_rate": 9.618230245634803e-07, + "loss": 0.328, + "step": 524000 + }, + { + "epoch": 77.61, + "grad_norm": 0.4831089675426483, + "learning_rate": 9.329220923350104e-07, + "loss": 0.328, + "step": 524500 + }, + { + "epoch": 77.69, + "grad_norm": 0.4752112627029419, + "learning_rate": 9.040211601065404e-07, + "loss": 0.3249, + "step": 525000 + }, + { + "epoch": 77.76, + "grad_norm": 0.4114755690097809, + "learning_rate": 8.751202278780705e-07, + "loss": 0.3282, + "step": 525500 + }, + { + "epoch": 77.83, + "grad_norm": 0.539284348487854, + "learning_rate": 8.462192956496004e-07, + "loss": 0.327, + "step": 526000 + }, + { + "epoch": 77.91, + "grad_norm": 0.4160568118095398, + "learning_rate": 8.173183634211305e-07, + "loss": 0.3278, + "step": 526500 + }, + { + "epoch": 77.98, + "grad_norm": 0.42335689067840576, + "learning_rate": 7.884174311926606e-07, + "loss": 0.3291, + "step": 527000 + }, + { + "epoch": 78.06, + "grad_norm": 0.4964425563812256, + "learning_rate": 7.595164989641906e-07, + "loss": 0.3269, + "step": 527500 + }, + { + "epoch": 78.13, + "grad_norm": 0.5482224822044373, + "learning_rate": 7.306155667357206e-07, + "loss": 0.3257, + "step": 528000 + }, + { + "epoch": 78.2, + "grad_norm": 0.4845934808254242, + "learning_rate": 7.017146345072507e-07, + "loss": 0.3257, + "step": 528500 + }, + { + "epoch": 78.28, + "grad_norm": 0.44311293959617615, + "learning_rate": 6.728137022787807e-07, + "loss": 0.3267, + "step": 529000 + }, + { + "epoch": 78.35, + "grad_norm": 0.49295201897621155, + "learning_rate": 6.439127700503107e-07, + "loss": 0.3262, + "step": 529500 + }, + { + "epoch": 78.43, + "grad_norm": 0.45838817954063416, + "learning_rate": 6.150118378218408e-07, + "loss": 0.3272, + "step": 530000 + }, + { + "epoch": 78.5, + "grad_norm": 0.4277520477771759, + "learning_rate": 5.861109055933709e-07, + "loss": 0.3275, + "step": 530500 + }, + { + "epoch": 78.57, + "grad_norm": 0.49568185210227966, + "learning_rate": 5.572099733649008e-07, + "loss": 0.3238, + "step": 531000 + }, + { + "epoch": 78.65, + "grad_norm": 0.3964736759662628, + "learning_rate": 5.283090411364309e-07, + "loss": 0.3265, + "step": 531500 + }, + { + "epoch": 78.72, + "grad_norm": 0.38991761207580566, + "learning_rate": 4.994081089079609e-07, + "loss": 0.3297, + "step": 532000 + }, + { + "epoch": 78.8, + "grad_norm": 0.514043390750885, + "learning_rate": 4.7050717667949096e-07, + "loss": 0.3274, + "step": 532500 + }, + { + "epoch": 78.87, + "grad_norm": 0.44372057914733887, + "learning_rate": 4.41606244451021e-07, + "loss": 0.3265, + "step": 533000 + }, + { + "epoch": 78.94, + "grad_norm": 0.40556496381759644, + "learning_rate": 4.1270531222255106e-07, + "loss": 0.3268, + "step": 533500 + }, + { + "epoch": 79.02, + "grad_norm": 0.37113696336746216, + "learning_rate": 3.838043799940811e-07, + "loss": 0.331, + "step": 534000 + }, + { + "epoch": 79.09, + "grad_norm": 0.42463332414627075, + "learning_rate": 3.549034477656111e-07, + "loss": 0.3269, + "step": 534500 + }, + { + "epoch": 79.17, + "grad_norm": 0.456259548664093, + "learning_rate": 3.260025155371412e-07, + "loss": 0.3296, + "step": 535000 + }, + { + "epoch": 79.24, + "grad_norm": 0.39561837911605835, + "learning_rate": 2.971015833086712e-07, + "loss": 0.3272, + "step": 535500 + }, + { + "epoch": 79.31, + "grad_norm": 0.42246511578559875, + "learning_rate": 2.6820065108020127e-07, + "loss": 0.3274, + "step": 536000 + }, + { + "epoch": 79.39, + "grad_norm": 0.42932552099227905, + "learning_rate": 2.392997188517313e-07, + "loss": 0.3259, + "step": 536500 + }, + { + "epoch": 79.46, + "grad_norm": 0.4081755578517914, + "learning_rate": 2.1039878662326132e-07, + "loss": 0.3254, + "step": 537000 + }, + { + "epoch": 79.54, + "grad_norm": 0.43017107248306274, + "learning_rate": 1.8149785439479136e-07, + "loss": 0.3294, + "step": 537500 + }, + { + "epoch": 79.61, + "grad_norm": 0.3940086364746094, + "learning_rate": 1.525969221663214e-07, + "loss": 0.3284, + "step": 538000 + }, + { + "epoch": 79.68, + "grad_norm": 0.37287628650665283, + "learning_rate": 1.2369598993785146e-07, + "loss": 0.3269, + "step": 538500 + }, + { + "epoch": 79.76, + "grad_norm": 0.451742559671402, + "learning_rate": 9.479505770938148e-08, + "loss": 0.3259, + "step": 539000 + }, + { + "epoch": 79.83, + "grad_norm": 0.4438938796520233, + "learning_rate": 6.589412548091152e-08, + "loss": 0.3238, + "step": 539500 + }, + { + "epoch": 79.91, + "grad_norm": 0.4649119973182678, + "learning_rate": 3.699319325244155e-08, + "loss": 0.3257, + "step": 540000 + }, + { + "epoch": 79.98, + "grad_norm": 0.4151638150215149, + "learning_rate": 8.09226102397159e-09, + "loss": 0.3303, + "step": 540500 + }, + { + "epoch": 80.0, + "step": 540640, + "total_flos": 4.3712245507093955e+20, + "train_loss": 0.35890252478696355, + "train_runtime": 56544.5558, + "train_samples_per_second": 76.486, + "train_steps_per_second": 9.561 + } + ], + "logging_steps": 500, + "max_steps": 540640, + "num_input_tokens_seen": 0, + "num_train_epochs": 80, + "save_steps": 1000000000, + "total_flos": 4.3712245507093955e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}