{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 80.0,
"eval_steps": 500,
"global_step": 540640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07,
"grad_norm": 0.11938641220331192,
"learning_rate": 3.122109906777153e-05,
"loss": 0.7845,
"step": 500
},
{
"epoch": 0.15,
"grad_norm": 0.11220108717679977,
"learning_rate": 3.119219813554306e-05,
"loss": 0.7681,
"step": 1000
},
{
"epoch": 0.22,
"grad_norm": 0.20039337873458862,
"learning_rate": 3.116329720331459e-05,
"loss": 0.7627,
"step": 1500
},
{
"epoch": 0.3,
"grad_norm": 0.20968303084373474,
"learning_rate": 3.113439627108612e-05,
"loss": 0.751,
"step": 2000
},
{
"epoch": 0.37,
"grad_norm": 0.46042799949645996,
"learning_rate": 3.1105495338857653e-05,
"loss": 0.7395,
"step": 2500
},
{
"epoch": 0.44,
"grad_norm": 0.5682156682014465,
"learning_rate": 3.107659440662918e-05,
"loss": 0.7169,
"step": 3000
},
{
"epoch": 0.52,
"grad_norm": 0.446135938167572,
"learning_rate": 3.104769347440071e-05,
"loss": 0.7037,
"step": 3500
},
{
"epoch": 0.59,
"grad_norm": 0.5436543822288513,
"learning_rate": 3.1018792542172244e-05,
"loss": 0.6854,
"step": 4000
},
{
"epoch": 0.67,
"grad_norm": 0.5623897314071655,
"learning_rate": 3.098989160994377e-05,
"loss": 0.6661,
"step": 4500
},
{
"epoch": 0.74,
"grad_norm": 0.7239806652069092,
"learning_rate": 3.09609906777153e-05,
"loss": 0.6477,
"step": 5000
},
{
"epoch": 0.81,
"grad_norm": 0.6363663077354431,
"learning_rate": 3.0932089745486834e-05,
"loss": 0.6302,
"step": 5500
},
{
"epoch": 0.89,
"grad_norm": 0.8511515855789185,
"learning_rate": 3.090318881325836e-05,
"loss": 0.6156,
"step": 6000
},
{
"epoch": 0.96,
"grad_norm": 0.7209456562995911,
"learning_rate": 3.087428788102989e-05,
"loss": 0.6002,
"step": 6500
},
{
"epoch": 1.04,
"grad_norm": 0.7105280756950378,
"learning_rate": 3.0845386948801424e-05,
"loss": 0.5852,
"step": 7000
},
{
"epoch": 1.11,
"grad_norm": 0.7035876512527466,
"learning_rate": 3.081648601657295e-05,
"loss": 0.5734,
"step": 7500
},
{
"epoch": 1.18,
"grad_norm": 0.6755463480949402,
"learning_rate": 3.078758508434448e-05,
"loss": 0.5619,
"step": 8000
},
{
"epoch": 1.26,
"grad_norm": 0.6636064648628235,
"learning_rate": 3.0758684152116015e-05,
"loss": 0.5527,
"step": 8500
},
{
"epoch": 1.33,
"grad_norm": 0.7909913063049316,
"learning_rate": 3.072978321988754e-05,
"loss": 0.545,
"step": 9000
},
{
"epoch": 1.41,
"grad_norm": 0.7935764789581299,
"learning_rate": 3.070088228765907e-05,
"loss": 0.5342,
"step": 9500
},
{
"epoch": 1.48,
"grad_norm": 0.7649631500244141,
"learning_rate": 3.06719813554306e-05,
"loss": 0.5264,
"step": 10000
},
{
"epoch": 1.55,
"grad_norm": 0.7262706160545349,
"learning_rate": 3.064308042320213e-05,
"loss": 0.5194,
"step": 10500
},
{
"epoch": 1.63,
"grad_norm": 0.7068478465080261,
"learning_rate": 3.061417949097366e-05,
"loss": 0.5137,
"step": 11000
},
{
"epoch": 1.7,
"grad_norm": 0.6415815353393555,
"learning_rate": 3.058527855874519e-05,
"loss": 0.51,
"step": 11500
},
{
"epoch": 1.78,
"grad_norm": 0.7167455554008484,
"learning_rate": 3.055637762651672e-05,
"loss": 0.5024,
"step": 12000
},
{
"epoch": 1.85,
"grad_norm": 0.6563605666160583,
"learning_rate": 3.052747669428825e-05,
"loss": 0.4985,
"step": 12500
},
{
"epoch": 1.92,
"grad_norm": 0.7427666783332825,
"learning_rate": 3.049857576205978e-05,
"loss": 0.4939,
"step": 13000
},
{
"epoch": 2.0,
"grad_norm": 0.6371767520904541,
"learning_rate": 3.046967482983131e-05,
"loss": 0.4923,
"step": 13500
},
{
"epoch": 2.07,
"grad_norm": 0.7062104940414429,
"learning_rate": 3.044077389760284e-05,
"loss": 0.4894,
"step": 14000
},
{
"epoch": 2.15,
"grad_norm": 0.7556993365287781,
"learning_rate": 3.041187296537437e-05,
"loss": 0.4846,
"step": 14500
},
{
"epoch": 2.22,
"grad_norm": 0.6561410427093506,
"learning_rate": 3.03829720331459e-05,
"loss": 0.4831,
"step": 15000
},
{
"epoch": 2.29,
"grad_norm": 0.6414974331855774,
"learning_rate": 3.0354071100917432e-05,
"loss": 0.4807,
"step": 15500
},
{
"epoch": 2.37,
"grad_norm": 0.6632120609283447,
"learning_rate": 3.032517016868896e-05,
"loss": 0.472,
"step": 16000
},
{
"epoch": 2.44,
"grad_norm": 0.6413108706474304,
"learning_rate": 3.029626923646049e-05,
"loss": 0.4723,
"step": 16500
},
{
"epoch": 2.52,
"grad_norm": 0.6478744149208069,
"learning_rate": 3.0267368304232022e-05,
"loss": 0.4692,
"step": 17000
},
{
"epoch": 2.59,
"grad_norm": 0.5901973247528076,
"learning_rate": 3.023846737200355e-05,
"loss": 0.4672,
"step": 17500
},
{
"epoch": 2.66,
"grad_norm": 0.5960791707038879,
"learning_rate": 3.020956643977508e-05,
"loss": 0.4649,
"step": 18000
},
{
"epoch": 2.74,
"grad_norm": 0.6265193819999695,
"learning_rate": 3.0180665507546612e-05,
"loss": 0.4616,
"step": 18500
},
{
"epoch": 2.81,
"grad_norm": 0.6381145119667053,
"learning_rate": 3.0151764575318144e-05,
"loss": 0.4592,
"step": 19000
},
{
"epoch": 2.89,
"grad_norm": 0.6370628476142883,
"learning_rate": 3.012286364308967e-05,
"loss": 0.4594,
"step": 19500
},
{
"epoch": 2.96,
"grad_norm": 0.5658203363418579,
"learning_rate": 3.0093962710861203e-05,
"loss": 0.4542,
"step": 20000
},
{
"epoch": 3.03,
"grad_norm": 0.5123589038848877,
"learning_rate": 3.0065061778632734e-05,
"loss": 0.4555,
"step": 20500
},
{
"epoch": 3.11,
"grad_norm": 0.5034360289573669,
"learning_rate": 3.0036160846404262e-05,
"loss": 0.4501,
"step": 21000
},
{
"epoch": 3.18,
"grad_norm": 0.5025657415390015,
"learning_rate": 3.0007259914175793e-05,
"loss": 0.4501,
"step": 21500
},
{
"epoch": 3.26,
"grad_norm": 0.5448479056358337,
"learning_rate": 2.9978358981947324e-05,
"loss": 0.4481,
"step": 22000
},
{
"epoch": 3.33,
"grad_norm": 0.5894014239311218,
"learning_rate": 2.9949458049718852e-05,
"loss": 0.4438,
"step": 22500
},
{
"epoch": 3.4,
"grad_norm": 0.653883159160614,
"learning_rate": 2.9920557117490383e-05,
"loss": 0.444,
"step": 23000
},
{
"epoch": 3.48,
"grad_norm": 0.4382980167865753,
"learning_rate": 2.9891656185261915e-05,
"loss": 0.4437,
"step": 23500
},
{
"epoch": 3.55,
"grad_norm": 0.4639624357223511,
"learning_rate": 2.9862755253033443e-05,
"loss": 0.4398,
"step": 24000
},
{
"epoch": 3.63,
"grad_norm": 0.527728796005249,
"learning_rate": 2.9833854320804974e-05,
"loss": 0.4386,
"step": 24500
},
{
"epoch": 3.7,
"grad_norm": 0.543736457824707,
"learning_rate": 2.9804953388576505e-05,
"loss": 0.4392,
"step": 25000
},
{
"epoch": 3.77,
"grad_norm": 0.5280329585075378,
"learning_rate": 2.9776052456348033e-05,
"loss": 0.4383,
"step": 25500
},
{
"epoch": 3.85,
"grad_norm": 0.4563904106616974,
"learning_rate": 2.9747151524119564e-05,
"loss": 0.4371,
"step": 26000
},
{
"epoch": 3.92,
"grad_norm": 0.5162687301635742,
"learning_rate": 2.9718250591891095e-05,
"loss": 0.4367,
"step": 26500
},
{
"epoch": 4.0,
"grad_norm": 0.4838933050632477,
"learning_rate": 2.9689349659662623e-05,
"loss": 0.4352,
"step": 27000
},
{
"epoch": 4.07,
"grad_norm": 0.5301242470741272,
"learning_rate": 2.9660448727434154e-05,
"loss": 0.4319,
"step": 27500
},
{
"epoch": 4.14,
"grad_norm": 0.5619557499885559,
"learning_rate": 2.9631547795205686e-05,
"loss": 0.4303,
"step": 28000
},
{
"epoch": 4.22,
"grad_norm": 0.4900205433368683,
"learning_rate": 2.9602646862977214e-05,
"loss": 0.4312,
"step": 28500
},
{
"epoch": 4.29,
"grad_norm": 0.46870502829551697,
"learning_rate": 2.9573745930748745e-05,
"loss": 0.4302,
"step": 29000
},
{
"epoch": 4.37,
"grad_norm": 0.47382786870002747,
"learning_rate": 2.9544844998520273e-05,
"loss": 0.4287,
"step": 29500
},
{
"epoch": 4.44,
"grad_norm": 0.5594569444656372,
"learning_rate": 2.95159440662918e-05,
"loss": 0.4284,
"step": 30000
},
{
"epoch": 4.51,
"grad_norm": 0.511375367641449,
"learning_rate": 2.9487043134063332e-05,
"loss": 0.4262,
"step": 30500
},
{
"epoch": 4.59,
"grad_norm": 0.5069934725761414,
"learning_rate": 2.9458142201834863e-05,
"loss": 0.4247,
"step": 31000
},
{
"epoch": 4.66,
"grad_norm": 0.5310338139533997,
"learning_rate": 2.942924126960639e-05,
"loss": 0.4249,
"step": 31500
},
{
"epoch": 4.74,
"grad_norm": 0.4728649854660034,
"learning_rate": 2.9400340337377922e-05,
"loss": 0.4225,
"step": 32000
},
{
"epoch": 4.81,
"grad_norm": 0.45557233691215515,
"learning_rate": 2.9371439405149453e-05,
"loss": 0.4241,
"step": 32500
},
{
"epoch": 4.88,
"grad_norm": 0.4630686938762665,
"learning_rate": 2.934253847292098e-05,
"loss": 0.4212,
"step": 33000
},
{
"epoch": 4.96,
"grad_norm": 0.509099543094635,
"learning_rate": 2.9313637540692512e-05,
"loss": 0.4215,
"step": 33500
},
{
"epoch": 5.03,
"grad_norm": 0.4747762084007263,
"learning_rate": 2.9284736608464044e-05,
"loss": 0.4203,
"step": 34000
},
{
"epoch": 5.11,
"grad_norm": 0.43625542521476746,
"learning_rate": 2.925583567623557e-05,
"loss": 0.4211,
"step": 34500
},
{
"epoch": 5.18,
"grad_norm": 0.44176748394966125,
"learning_rate": 2.9226934744007103e-05,
"loss": 0.4209,
"step": 35000
},
{
"epoch": 5.25,
"grad_norm": 0.5236085653305054,
"learning_rate": 2.9198033811778634e-05,
"loss": 0.422,
"step": 35500
},
{
"epoch": 5.33,
"grad_norm": 0.4237843453884125,
"learning_rate": 2.9169132879550162e-05,
"loss": 0.4163,
"step": 36000
},
{
"epoch": 5.4,
"grad_norm": 0.44581139087677,
"learning_rate": 2.9140231947321693e-05,
"loss": 0.4152,
"step": 36500
},
{
"epoch": 5.47,
"grad_norm": 0.4488186836242676,
"learning_rate": 2.9111331015093224e-05,
"loss": 0.4175,
"step": 37000
},
{
"epoch": 5.55,
"grad_norm": 0.5051326751708984,
"learning_rate": 2.9082430082864752e-05,
"loss": 0.4149,
"step": 37500
},
{
"epoch": 5.62,
"grad_norm": 0.4836309850215912,
"learning_rate": 2.9053529150636283e-05,
"loss": 0.4138,
"step": 38000
},
{
"epoch": 5.7,
"grad_norm": 0.46710771322250366,
"learning_rate": 2.9024628218407815e-05,
"loss": 0.4125,
"step": 38500
},
{
"epoch": 5.77,
"grad_norm": 0.39740118384361267,
"learning_rate": 2.8995727286179342e-05,
"loss": 0.4169,
"step": 39000
},
{
"epoch": 5.84,
"grad_norm": 0.4491262435913086,
"learning_rate": 2.8966826353950874e-05,
"loss": 0.4136,
"step": 39500
},
{
"epoch": 5.92,
"grad_norm": 0.4240283966064453,
"learning_rate": 2.8937925421722405e-05,
"loss": 0.4143,
"step": 40000
},
{
"epoch": 5.99,
"grad_norm": 0.43018123507499695,
"learning_rate": 2.8909024489493933e-05,
"loss": 0.41,
"step": 40500
},
{
"epoch": 6.07,
"grad_norm": 0.49115487933158875,
"learning_rate": 2.8880123557265464e-05,
"loss": 0.4086,
"step": 41000
},
{
"epoch": 6.14,
"grad_norm": 0.4617484211921692,
"learning_rate": 2.8851222625036995e-05,
"loss": 0.4111,
"step": 41500
},
{
"epoch": 6.21,
"grad_norm": 0.4269873797893524,
"learning_rate": 2.8822321692808523e-05,
"loss": 0.4068,
"step": 42000
},
{
"epoch": 6.29,
"grad_norm": 0.45183584094047546,
"learning_rate": 2.8793420760580054e-05,
"loss": 0.4104,
"step": 42500
},
{
"epoch": 6.36,
"grad_norm": 0.3999849557876587,
"learning_rate": 2.8764519828351586e-05,
"loss": 0.4074,
"step": 43000
},
{
"epoch": 6.44,
"grad_norm": 0.3897479772567749,
"learning_rate": 2.8735618896123113e-05,
"loss": 0.4113,
"step": 43500
},
{
"epoch": 6.51,
"grad_norm": 0.36687174439430237,
"learning_rate": 2.8706717963894645e-05,
"loss": 0.409,
"step": 44000
},
{
"epoch": 6.58,
"grad_norm": 0.41888511180877686,
"learning_rate": 2.8677817031666176e-05,
"loss": 0.4072,
"step": 44500
},
{
"epoch": 6.66,
"grad_norm": 0.4102098047733307,
"learning_rate": 2.8648916099437704e-05,
"loss": 0.4081,
"step": 45000
},
{
"epoch": 6.73,
"grad_norm": 0.42067912220954895,
"learning_rate": 2.8620015167209235e-05,
"loss": 0.4093,
"step": 45500
},
{
"epoch": 6.81,
"grad_norm": 0.45427748560905457,
"learning_rate": 2.8591114234980766e-05,
"loss": 0.4076,
"step": 46000
},
{
"epoch": 6.88,
"grad_norm": 0.394954115152359,
"learning_rate": 2.8562213302752294e-05,
"loss": 0.4067,
"step": 46500
},
{
"epoch": 6.95,
"grad_norm": 0.42659953236579895,
"learning_rate": 2.8533312370523825e-05,
"loss": 0.4062,
"step": 47000
},
{
"epoch": 7.03,
"grad_norm": 0.38056984543800354,
"learning_rate": 2.8504411438295357e-05,
"loss": 0.4061,
"step": 47500
},
{
"epoch": 7.1,
"grad_norm": 0.368455708026886,
"learning_rate": 2.8475510506066884e-05,
"loss": 0.4032,
"step": 48000
},
{
"epoch": 7.18,
"grad_norm": 0.44540271162986755,
"learning_rate": 2.8446609573838416e-05,
"loss": 0.4054,
"step": 48500
},
{
"epoch": 7.25,
"grad_norm": 0.3926877975463867,
"learning_rate": 2.8417708641609943e-05,
"loss": 0.4024,
"step": 49000
},
{
"epoch": 7.32,
"grad_norm": 0.4288729727268219,
"learning_rate": 2.838880770938147e-05,
"loss": 0.4013,
"step": 49500
},
{
"epoch": 7.4,
"grad_norm": 0.4729566276073456,
"learning_rate": 2.8359906777153003e-05,
"loss": 0.4019,
"step": 50000
},
{
"epoch": 7.47,
"grad_norm": 0.46875321865081787,
"learning_rate": 2.8331005844924534e-05,
"loss": 0.4,
"step": 50500
},
{
"epoch": 7.55,
"grad_norm": 0.63325035572052,
"learning_rate": 2.830210491269606e-05,
"loss": 0.4008,
"step": 51000
},
{
"epoch": 7.62,
"grad_norm": 0.4186055064201355,
"learning_rate": 2.8273203980467593e-05,
"loss": 0.4026,
"step": 51500
},
{
"epoch": 7.69,
"grad_norm": 0.3860541880130768,
"learning_rate": 2.8244303048239124e-05,
"loss": 0.4022,
"step": 52000
},
{
"epoch": 7.77,
"grad_norm": 0.4552393853664398,
"learning_rate": 2.8215402116010652e-05,
"loss": 0.3979,
"step": 52500
},
{
"epoch": 7.84,
"grad_norm": 0.4990374743938446,
"learning_rate": 2.8186501183782183e-05,
"loss": 0.4001,
"step": 53000
},
{
"epoch": 7.92,
"grad_norm": 0.46718060970306396,
"learning_rate": 2.8157600251553714e-05,
"loss": 0.4,
"step": 53500
},
{
"epoch": 7.99,
"grad_norm": 0.45432960987091064,
"learning_rate": 2.8128699319325242e-05,
"loss": 0.398,
"step": 54000
},
{
"epoch": 8.06,
"grad_norm": 0.40666621923446655,
"learning_rate": 2.8099798387096774e-05,
"loss": 0.3996,
"step": 54500
},
{
"epoch": 8.14,
"grad_norm": 0.402972936630249,
"learning_rate": 2.8070897454868305e-05,
"loss": 0.3985,
"step": 55000
},
{
"epoch": 8.21,
"grad_norm": 0.3767193853855133,
"learning_rate": 2.8041996522639836e-05,
"loss": 0.4,
"step": 55500
},
{
"epoch": 8.29,
"grad_norm": 0.40102022886276245,
"learning_rate": 2.8013095590411364e-05,
"loss": 0.3987,
"step": 56000
},
{
"epoch": 8.36,
"grad_norm": 0.4435707926750183,
"learning_rate": 2.7984194658182895e-05,
"loss": 0.3976,
"step": 56500
},
{
"epoch": 8.43,
"grad_norm": 0.39804941415786743,
"learning_rate": 2.7955293725954426e-05,
"loss": 0.395,
"step": 57000
},
{
"epoch": 8.51,
"grad_norm": 0.41703784465789795,
"learning_rate": 2.7926392793725954e-05,
"loss": 0.395,
"step": 57500
},
{
"epoch": 8.58,
"grad_norm": 0.4349576234817505,
"learning_rate": 2.7897491861497485e-05,
"loss": 0.3946,
"step": 58000
},
{
"epoch": 8.66,
"grad_norm": 0.37204691767692566,
"learning_rate": 2.7868590929269017e-05,
"loss": 0.394,
"step": 58500
},
{
"epoch": 8.73,
"grad_norm": 0.42759761214256287,
"learning_rate": 2.7839689997040545e-05,
"loss": 0.3949,
"step": 59000
},
{
"epoch": 8.8,
"grad_norm": 0.37754470109939575,
"learning_rate": 2.7810789064812076e-05,
"loss": 0.3939,
"step": 59500
},
{
"epoch": 8.88,
"grad_norm": 0.3639107346534729,
"learning_rate": 2.7781888132583607e-05,
"loss": 0.3932,
"step": 60000
},
{
"epoch": 8.95,
"grad_norm": 0.37291327118873596,
"learning_rate": 2.7752987200355135e-05,
"loss": 0.394,
"step": 60500
},
{
"epoch": 9.03,
"grad_norm": 0.3964773416519165,
"learning_rate": 2.7724086268126666e-05,
"loss": 0.3959,
"step": 61000
},
{
"epoch": 9.1,
"grad_norm": 0.4025065004825592,
"learning_rate": 2.7695185335898197e-05,
"loss": 0.3922,
"step": 61500
},
{
"epoch": 9.17,
"grad_norm": 0.5499910116195679,
"learning_rate": 2.7666284403669725e-05,
"loss": 0.3893,
"step": 62000
},
{
"epoch": 9.25,
"grad_norm": 0.43492835760116577,
"learning_rate": 2.7637383471441256e-05,
"loss": 0.3942,
"step": 62500
},
{
"epoch": 9.32,
"grad_norm": 0.38981184363365173,
"learning_rate": 2.7608482539212788e-05,
"loss": 0.3941,
"step": 63000
},
{
"epoch": 9.4,
"grad_norm": 0.4508809745311737,
"learning_rate": 2.7579581606984316e-05,
"loss": 0.3922,
"step": 63500
},
{
"epoch": 9.47,
"grad_norm": 0.37447696924209595,
"learning_rate": 2.7550680674755847e-05,
"loss": 0.3905,
"step": 64000
},
{
"epoch": 9.54,
"grad_norm": 0.40094566345214844,
"learning_rate": 2.7521779742527378e-05,
"loss": 0.3938,
"step": 64500
},
{
"epoch": 9.62,
"grad_norm": 0.46564099192619324,
"learning_rate": 2.7492878810298906e-05,
"loss": 0.3914,
"step": 65000
},
{
"epoch": 9.69,
"grad_norm": 0.37548139691352844,
"learning_rate": 2.7463977878070437e-05,
"loss": 0.3923,
"step": 65500
},
{
"epoch": 9.77,
"grad_norm": 0.39845481514930725,
"learning_rate": 2.743507694584197e-05,
"loss": 0.3904,
"step": 66000
},
{
"epoch": 9.84,
"grad_norm": 0.46478548645973206,
"learning_rate": 2.7406176013613496e-05,
"loss": 0.3897,
"step": 66500
},
{
"epoch": 9.91,
"grad_norm": 0.5512229204177856,
"learning_rate": 2.7377275081385027e-05,
"loss": 0.3898,
"step": 67000
},
{
"epoch": 9.99,
"grad_norm": 0.34783828258514404,
"learning_rate": 2.734837414915656e-05,
"loss": 0.3901,
"step": 67500
},
{
"epoch": 10.06,
"grad_norm": 0.4403396546840668,
"learning_rate": 2.7319473216928083e-05,
"loss": 0.388,
"step": 68000
},
{
"epoch": 10.14,
"grad_norm": 0.37262195348739624,
"learning_rate": 2.7290572284699614e-05,
"loss": 0.3869,
"step": 68500
},
{
"epoch": 10.21,
"grad_norm": 0.38222742080688477,
"learning_rate": 2.7261671352471146e-05,
"loss": 0.3901,
"step": 69000
},
{
"epoch": 10.28,
"grad_norm": 0.614713191986084,
"learning_rate": 2.7232770420242673e-05,
"loss": 0.3886,
"step": 69500
},
{
"epoch": 10.36,
"grad_norm": 0.4252707362174988,
"learning_rate": 2.7203869488014205e-05,
"loss": 0.3874,
"step": 70000
},
{
"epoch": 10.43,
"grad_norm": 0.3792737126350403,
"learning_rate": 2.7174968555785736e-05,
"loss": 0.3855,
"step": 70500
},
{
"epoch": 10.51,
"grad_norm": 0.40962672233581543,
"learning_rate": 2.7146067623557264e-05,
"loss": 0.3875,
"step": 71000
},
{
"epoch": 10.58,
"grad_norm": 0.38987433910369873,
"learning_rate": 2.7117166691328795e-05,
"loss": 0.3855,
"step": 71500
},
{
"epoch": 10.65,
"grad_norm": 0.407105028629303,
"learning_rate": 2.7088265759100326e-05,
"loss": 0.3857,
"step": 72000
},
{
"epoch": 10.73,
"grad_norm": 0.3527330160140991,
"learning_rate": 2.7059364826871854e-05,
"loss": 0.3869,
"step": 72500
},
{
"epoch": 10.8,
"grad_norm": 0.3859241306781769,
"learning_rate": 2.7030463894643385e-05,
"loss": 0.3872,
"step": 73000
},
{
"epoch": 10.88,
"grad_norm": 0.3989656865596771,
"learning_rate": 2.7001562962414917e-05,
"loss": 0.3855,
"step": 73500
},
{
"epoch": 10.95,
"grad_norm": 0.4163249731063843,
"learning_rate": 2.6972662030186444e-05,
"loss": 0.3855,
"step": 74000
},
{
"epoch": 11.02,
"grad_norm": 0.39577716588974,
"learning_rate": 2.6943761097957976e-05,
"loss": 0.3848,
"step": 74500
},
{
"epoch": 11.1,
"grad_norm": 0.3792816400527954,
"learning_rate": 2.6914860165729507e-05,
"loss": 0.3864,
"step": 75000
},
{
"epoch": 11.17,
"grad_norm": 0.3979376554489136,
"learning_rate": 2.6885959233501035e-05,
"loss": 0.3847,
"step": 75500
},
{
"epoch": 11.25,
"grad_norm": 0.44446560740470886,
"learning_rate": 2.6857058301272566e-05,
"loss": 0.3851,
"step": 76000
},
{
"epoch": 11.32,
"grad_norm": 0.3826451599597931,
"learning_rate": 2.6828157369044097e-05,
"loss": 0.385,
"step": 76500
},
{
"epoch": 11.39,
"grad_norm": 0.38595423102378845,
"learning_rate": 2.6799256436815625e-05,
"loss": 0.3848,
"step": 77000
},
{
"epoch": 11.47,
"grad_norm": 0.4047674238681793,
"learning_rate": 2.6770355504587156e-05,
"loss": 0.384,
"step": 77500
},
{
"epoch": 11.54,
"grad_norm": 0.3704206347465515,
"learning_rate": 2.6741454572358688e-05,
"loss": 0.3846,
"step": 78000
},
{
"epoch": 11.62,
"grad_norm": 0.42468497157096863,
"learning_rate": 2.6712553640130215e-05,
"loss": 0.3847,
"step": 78500
},
{
"epoch": 11.69,
"grad_norm": 0.43300580978393555,
"learning_rate": 2.6683652707901747e-05,
"loss": 0.3783,
"step": 79000
},
{
"epoch": 11.76,
"grad_norm": 0.3680213689804077,
"learning_rate": 2.6654751775673278e-05,
"loss": 0.3811,
"step": 79500
},
{
"epoch": 11.84,
"grad_norm": 0.3971569240093231,
"learning_rate": 2.6625850843444806e-05,
"loss": 0.3816,
"step": 80000
},
{
"epoch": 11.91,
"grad_norm": 0.3454134464263916,
"learning_rate": 2.6596949911216337e-05,
"loss": 0.38,
"step": 80500
},
{
"epoch": 11.99,
"grad_norm": 0.38850536942481995,
"learning_rate": 2.6568048978987868e-05,
"loss": 0.3806,
"step": 81000
},
{
"epoch": 12.06,
"grad_norm": 0.41783374547958374,
"learning_rate": 2.6539148046759396e-05,
"loss": 0.3805,
"step": 81500
},
{
"epoch": 12.13,
"grad_norm": 0.37427714467048645,
"learning_rate": 2.6510247114530927e-05,
"loss": 0.3844,
"step": 82000
},
{
"epoch": 12.21,
"grad_norm": 0.3917700946331024,
"learning_rate": 2.648134618230246e-05,
"loss": 0.3818,
"step": 82500
},
{
"epoch": 12.28,
"grad_norm": 0.36409491300582886,
"learning_rate": 2.6452445250073986e-05,
"loss": 0.3815,
"step": 83000
},
{
"epoch": 12.36,
"grad_norm": 0.4320700764656067,
"learning_rate": 2.6423544317845518e-05,
"loss": 0.3813,
"step": 83500
},
{
"epoch": 12.43,
"grad_norm": 0.3927746117115021,
"learning_rate": 2.639464338561705e-05,
"loss": 0.3827,
"step": 84000
},
{
"epoch": 12.5,
"grad_norm": 0.3693206310272217,
"learning_rate": 2.6365742453388577e-05,
"loss": 0.3803,
"step": 84500
},
{
"epoch": 12.58,
"grad_norm": 0.4206922948360443,
"learning_rate": 2.6336841521160108e-05,
"loss": 0.3794,
"step": 85000
},
{
"epoch": 12.65,
"grad_norm": 0.35786914825439453,
"learning_rate": 2.630794058893164e-05,
"loss": 0.3823,
"step": 85500
},
{
"epoch": 12.73,
"grad_norm": 0.4055446982383728,
"learning_rate": 2.6279039656703167e-05,
"loss": 0.3797,
"step": 86000
},
{
"epoch": 12.8,
"grad_norm": 0.473630428314209,
"learning_rate": 2.62501387244747e-05,
"loss": 0.3776,
"step": 86500
},
{
"epoch": 12.87,
"grad_norm": 0.36061763763427734,
"learning_rate": 2.622123779224623e-05,
"loss": 0.3781,
"step": 87000
},
{
"epoch": 12.95,
"grad_norm": 0.4378969371318817,
"learning_rate": 2.6192336860017754e-05,
"loss": 0.3778,
"step": 87500
},
{
"epoch": 13.02,
"grad_norm": 0.3772602379322052,
"learning_rate": 2.6163435927789285e-05,
"loss": 0.3776,
"step": 88000
},
{
"epoch": 13.1,
"grad_norm": 0.42682790756225586,
"learning_rate": 2.6134534995560817e-05,
"loss": 0.3802,
"step": 88500
},
{
"epoch": 13.17,
"grad_norm": 0.38328275084495544,
"learning_rate": 2.6105634063332344e-05,
"loss": 0.3813,
"step": 89000
},
{
"epoch": 13.24,
"grad_norm": 0.4136464595794678,
"learning_rate": 2.6076733131103876e-05,
"loss": 0.3773,
"step": 89500
},
{
"epoch": 13.32,
"grad_norm": 0.39002037048339844,
"learning_rate": 2.6047832198875407e-05,
"loss": 0.3767,
"step": 90000
},
{
"epoch": 13.39,
"grad_norm": 0.4823383092880249,
"learning_rate": 2.6018931266646935e-05,
"loss": 0.3789,
"step": 90500
},
{
"epoch": 13.47,
"grad_norm": 0.3532434403896332,
"learning_rate": 2.5990030334418466e-05,
"loss": 0.3755,
"step": 91000
},
{
"epoch": 13.54,
"grad_norm": 0.3406650424003601,
"learning_rate": 2.5961129402189997e-05,
"loss": 0.3782,
"step": 91500
},
{
"epoch": 13.61,
"grad_norm": 0.42174699902534485,
"learning_rate": 2.5932228469961525e-05,
"loss": 0.3792,
"step": 92000
},
{
"epoch": 13.69,
"grad_norm": 0.4112718999385834,
"learning_rate": 2.5903327537733056e-05,
"loss": 0.3758,
"step": 92500
},
{
"epoch": 13.76,
"grad_norm": 0.39170435070991516,
"learning_rate": 2.5874426605504588e-05,
"loss": 0.3772,
"step": 93000
},
{
"epoch": 13.84,
"grad_norm": 0.35669615864753723,
"learning_rate": 2.584552567327612e-05,
"loss": 0.376,
"step": 93500
},
{
"epoch": 13.91,
"grad_norm": 0.36161208152770996,
"learning_rate": 2.5816624741047647e-05,
"loss": 0.3759,
"step": 94000
},
{
"epoch": 13.98,
"grad_norm": 0.3548930883407593,
"learning_rate": 2.5787723808819178e-05,
"loss": 0.3772,
"step": 94500
},
{
"epoch": 14.06,
"grad_norm": 0.3934749662876129,
"learning_rate": 2.575882287659071e-05,
"loss": 0.3774,
"step": 95000
},
{
"epoch": 14.13,
"grad_norm": 0.3642575442790985,
"learning_rate": 2.5729921944362237e-05,
"loss": 0.3756,
"step": 95500
},
{
"epoch": 14.21,
"grad_norm": 0.34236952662467957,
"learning_rate": 2.5701021012133768e-05,
"loss": 0.3758,
"step": 96000
},
{
"epoch": 14.28,
"grad_norm": 0.40388983488082886,
"learning_rate": 2.56721200799053e-05,
"loss": 0.3749,
"step": 96500
},
{
"epoch": 14.35,
"grad_norm": 0.432076632976532,
"learning_rate": 2.5643219147676827e-05,
"loss": 0.3756,
"step": 97000
},
{
"epoch": 14.43,
"grad_norm": 0.3947947025299072,
"learning_rate": 2.561431821544836e-05,
"loss": 0.3792,
"step": 97500
},
{
"epoch": 14.5,
"grad_norm": 0.3974926173686981,
"learning_rate": 2.558541728321989e-05,
"loss": 0.3741,
"step": 98000
},
{
"epoch": 14.58,
"grad_norm": 0.3732788860797882,
"learning_rate": 2.5556516350991418e-05,
"loss": 0.3741,
"step": 98500
},
{
"epoch": 14.65,
"grad_norm": 0.35293856263160706,
"learning_rate": 2.552761541876295e-05,
"loss": 0.375,
"step": 99000
},
{
"epoch": 14.72,
"grad_norm": 0.38685211539268494,
"learning_rate": 2.549871448653448e-05,
"loss": 0.3711,
"step": 99500
},
{
"epoch": 14.8,
"grad_norm": 0.40676021575927734,
"learning_rate": 2.5469813554306008e-05,
"loss": 0.3786,
"step": 100000
},
{
"epoch": 14.87,
"grad_norm": 0.40946874022483826,
"learning_rate": 2.544091262207754e-05,
"loss": 0.3749,
"step": 100500
},
{
"epoch": 14.95,
"grad_norm": 0.35838282108306885,
"learning_rate": 2.541201168984907e-05,
"loss": 0.3733,
"step": 101000
},
{
"epoch": 15.02,
"grad_norm": 0.4110182821750641,
"learning_rate": 2.5383110757620598e-05,
"loss": 0.3712,
"step": 101500
},
{
"epoch": 15.09,
"grad_norm": 0.35349026322364807,
"learning_rate": 2.535420982539213e-05,
"loss": 0.3719,
"step": 102000
},
{
"epoch": 15.17,
"grad_norm": 0.38222944736480713,
"learning_rate": 2.532530889316366e-05,
"loss": 0.3737,
"step": 102500
},
{
"epoch": 15.24,
"grad_norm": 0.47066986560821533,
"learning_rate": 2.529640796093519e-05,
"loss": 0.3728,
"step": 103000
},
{
"epoch": 15.32,
"grad_norm": 0.3949437439441681,
"learning_rate": 2.526750702870672e-05,
"loss": 0.3752,
"step": 103500
},
{
"epoch": 15.39,
"grad_norm": 0.42243218421936035,
"learning_rate": 2.523860609647825e-05,
"loss": 0.3729,
"step": 104000
},
{
"epoch": 15.46,
"grad_norm": 0.4031197726726532,
"learning_rate": 2.520970516424978e-05,
"loss": 0.3704,
"step": 104500
},
{
"epoch": 15.54,
"grad_norm": 0.4034245014190674,
"learning_rate": 2.518080423202131e-05,
"loss": 0.3713,
"step": 105000
},
{
"epoch": 15.61,
"grad_norm": 0.4237426817417145,
"learning_rate": 2.515190329979284e-05,
"loss": 0.3719,
"step": 105500
},
{
"epoch": 15.69,
"grad_norm": 0.41453346610069275,
"learning_rate": 2.512300236756437e-05,
"loss": 0.3742,
"step": 106000
},
{
"epoch": 15.76,
"grad_norm": 0.4238516390323639,
"learning_rate": 2.50941014353359e-05,
"loss": 0.3734,
"step": 106500
},
{
"epoch": 15.83,
"grad_norm": 0.3762485384941101,
"learning_rate": 2.506520050310743e-05,
"loss": 0.3736,
"step": 107000
},
{
"epoch": 15.91,
"grad_norm": 0.36851537227630615,
"learning_rate": 2.5036299570878956e-05,
"loss": 0.3736,
"step": 107500
},
{
"epoch": 15.98,
"grad_norm": 0.36127322912216187,
"learning_rate": 2.5007398638650487e-05,
"loss": 0.3716,
"step": 108000
},
{
"epoch": 16.06,
"grad_norm": 0.4159682095050812,
"learning_rate": 2.497849770642202e-05,
"loss": 0.3695,
"step": 108500
},
{
"epoch": 16.13,
"grad_norm": 0.40441277623176575,
"learning_rate": 2.4949596774193547e-05,
"loss": 0.3702,
"step": 109000
},
{
"epoch": 16.2,
"grad_norm": 0.3531157076358795,
"learning_rate": 2.4920695841965078e-05,
"loss": 0.3751,
"step": 109500
},
{
"epoch": 16.28,
"grad_norm": 0.40636512637138367,
"learning_rate": 2.489179490973661e-05,
"loss": 0.3692,
"step": 110000
},
{
"epoch": 16.35,
"grad_norm": 0.3990442156791687,
"learning_rate": 2.4862893977508137e-05,
"loss": 0.3697,
"step": 110500
},
{
"epoch": 16.42,
"grad_norm": 0.39944297075271606,
"learning_rate": 2.4833993045279668e-05,
"loss": 0.3683,
"step": 111000
},
{
"epoch": 16.5,
"grad_norm": 0.3601832985877991,
"learning_rate": 2.48050921130512e-05,
"loss": 0.3699,
"step": 111500
},
{
"epoch": 16.57,
"grad_norm": 0.40571844577789307,
"learning_rate": 2.4776191180822727e-05,
"loss": 0.3689,
"step": 112000
},
{
"epoch": 16.65,
"grad_norm": 0.40363049507141113,
"learning_rate": 2.474729024859426e-05,
"loss": 0.3708,
"step": 112500
},
{
"epoch": 16.72,
"grad_norm": 0.3990092873573303,
"learning_rate": 2.471838931636579e-05,
"loss": 0.3706,
"step": 113000
},
{
"epoch": 16.79,
"grad_norm": 0.36532795429229736,
"learning_rate": 2.4689488384137317e-05,
"loss": 0.3721,
"step": 113500
},
{
"epoch": 16.87,
"grad_norm": 0.44025781750679016,
"learning_rate": 2.466058745190885e-05,
"loss": 0.3733,
"step": 114000
},
{
"epoch": 16.94,
"grad_norm": 0.3850398659706116,
"learning_rate": 2.463168651968038e-05,
"loss": 0.3693,
"step": 114500
},
{
"epoch": 17.02,
"grad_norm": 0.38132092356681824,
"learning_rate": 2.4602785587451908e-05,
"loss": 0.3703,
"step": 115000
},
{
"epoch": 17.09,
"grad_norm": 0.39702102541923523,
"learning_rate": 2.457388465522344e-05,
"loss": 0.3706,
"step": 115500
},
{
"epoch": 17.16,
"grad_norm": 0.35070690512657166,
"learning_rate": 2.454498372299497e-05,
"loss": 0.3706,
"step": 116000
},
{
"epoch": 17.24,
"grad_norm": 0.42235612869262695,
"learning_rate": 2.4516082790766498e-05,
"loss": 0.3703,
"step": 116500
},
{
"epoch": 17.31,
"grad_norm": 0.35915622115135193,
"learning_rate": 2.448718185853803e-05,
"loss": 0.3693,
"step": 117000
},
{
"epoch": 17.39,
"grad_norm": 0.34371674060821533,
"learning_rate": 2.445828092630956e-05,
"loss": 0.3686,
"step": 117500
},
{
"epoch": 17.46,
"grad_norm": 0.48452600836753845,
"learning_rate": 2.442937999408109e-05,
"loss": 0.3696,
"step": 118000
},
{
"epoch": 17.53,
"grad_norm": 0.39582401514053345,
"learning_rate": 2.440047906185262e-05,
"loss": 0.3698,
"step": 118500
},
{
"epoch": 17.61,
"grad_norm": 0.43614286184310913,
"learning_rate": 2.437157812962415e-05,
"loss": 0.3688,
"step": 119000
},
{
"epoch": 17.68,
"grad_norm": 0.3942769765853882,
"learning_rate": 2.434267719739568e-05,
"loss": 0.368,
"step": 119500
},
{
"epoch": 17.76,
"grad_norm": 0.34341031312942505,
"learning_rate": 2.431377626516721e-05,
"loss": 0.3665,
"step": 120000
},
{
"epoch": 17.83,
"grad_norm": 0.3661987781524658,
"learning_rate": 2.428487533293874e-05,
"loss": 0.3664,
"step": 120500
},
{
"epoch": 17.9,
"grad_norm": 0.32992053031921387,
"learning_rate": 2.425597440071027e-05,
"loss": 0.3683,
"step": 121000
},
{
"epoch": 17.98,
"grad_norm": 0.40151405334472656,
"learning_rate": 2.42270734684818e-05,
"loss": 0.3677,
"step": 121500
},
{
"epoch": 18.05,
"grad_norm": 0.3343447148799896,
"learning_rate": 2.419817253625333e-05,
"loss": 0.3679,
"step": 122000
},
{
"epoch": 18.13,
"grad_norm": 0.3798489272594452,
"learning_rate": 2.416927160402486e-05,
"loss": 0.3687,
"step": 122500
},
{
"epoch": 18.2,
"grad_norm": 0.3244016766548157,
"learning_rate": 2.414037067179639e-05,
"loss": 0.3643,
"step": 123000
},
{
"epoch": 18.27,
"grad_norm": 0.4036329984664917,
"learning_rate": 2.4111469739567922e-05,
"loss": 0.3703,
"step": 123500
},
{
"epoch": 18.35,
"grad_norm": 0.3875889778137207,
"learning_rate": 2.408256880733945e-05,
"loss": 0.3661,
"step": 124000
},
{
"epoch": 18.42,
"grad_norm": 0.3607046902179718,
"learning_rate": 2.405366787511098e-05,
"loss": 0.3646,
"step": 124500
},
{
"epoch": 18.5,
"grad_norm": 0.33054810762405396,
"learning_rate": 2.4024766942882512e-05,
"loss": 0.3675,
"step": 125000
},
{
"epoch": 18.57,
"grad_norm": 0.4091895520687103,
"learning_rate": 2.399586601065404e-05,
"loss": 0.3652,
"step": 125500
},
{
"epoch": 18.64,
"grad_norm": 0.37667781114578247,
"learning_rate": 2.396696507842557e-05,
"loss": 0.3691,
"step": 126000
},
{
"epoch": 18.72,
"grad_norm": 0.3683590888977051,
"learning_rate": 2.39380641461971e-05,
"loss": 0.3675,
"step": 126500
},
{
"epoch": 18.79,
"grad_norm": 0.397073894739151,
"learning_rate": 2.3909163213968627e-05,
"loss": 0.3637,
"step": 127000
},
{
"epoch": 18.87,
"grad_norm": 0.3522073030471802,
"learning_rate": 2.3880262281740158e-05,
"loss": 0.3676,
"step": 127500
},
{
"epoch": 18.94,
"grad_norm": 0.3389582633972168,
"learning_rate": 2.385136134951169e-05,
"loss": 0.3676,
"step": 128000
},
{
"epoch": 19.01,
"grad_norm": 0.3726537823677063,
"learning_rate": 2.3822460417283217e-05,
"loss": 0.3674,
"step": 128500
},
{
"epoch": 19.09,
"grad_norm": 0.3774533271789551,
"learning_rate": 2.379355948505475e-05,
"loss": 0.3669,
"step": 129000
},
{
"epoch": 19.16,
"grad_norm": 0.45427048206329346,
"learning_rate": 2.376465855282628e-05,
"loss": 0.3652,
"step": 129500
},
{
"epoch": 19.24,
"grad_norm": 0.39148712158203125,
"learning_rate": 2.373575762059781e-05,
"loss": 0.3632,
"step": 130000
},
{
"epoch": 19.31,
"grad_norm": 0.3727419078350067,
"learning_rate": 2.370685668836934e-05,
"loss": 0.3674,
"step": 130500
},
{
"epoch": 19.38,
"grad_norm": 0.3490041196346283,
"learning_rate": 2.367795575614087e-05,
"loss": 0.3685,
"step": 131000
},
{
"epoch": 19.46,
"grad_norm": 0.33863797783851624,
"learning_rate": 2.36490548239124e-05,
"loss": 0.3659,
"step": 131500
},
{
"epoch": 19.53,
"grad_norm": 0.41820865869522095,
"learning_rate": 2.362015389168393e-05,
"loss": 0.3647,
"step": 132000
},
{
"epoch": 19.61,
"grad_norm": 0.31935831904411316,
"learning_rate": 2.359125295945546e-05,
"loss": 0.3657,
"step": 132500
},
{
"epoch": 19.68,
"grad_norm": 0.39523938298225403,
"learning_rate": 2.3562352027226992e-05,
"loss": 0.3643,
"step": 133000
},
{
"epoch": 19.75,
"grad_norm": 0.3851146996021271,
"learning_rate": 2.353345109499852e-05,
"loss": 0.3624,
"step": 133500
},
{
"epoch": 19.83,
"grad_norm": 0.3778953552246094,
"learning_rate": 2.350455016277005e-05,
"loss": 0.3658,
"step": 134000
},
{
"epoch": 19.9,
"grad_norm": 0.3673354387283325,
"learning_rate": 2.3475649230541582e-05,
"loss": 0.3645,
"step": 134500
},
{
"epoch": 19.98,
"grad_norm": 0.40675076842308044,
"learning_rate": 2.344674829831311e-05,
"loss": 0.3624,
"step": 135000
},
{
"epoch": 20.05,
"grad_norm": 0.32396331429481506,
"learning_rate": 2.341784736608464e-05,
"loss": 0.3608,
"step": 135500
},
{
"epoch": 20.12,
"grad_norm": 0.4665846526622772,
"learning_rate": 2.3388946433856172e-05,
"loss": 0.3654,
"step": 136000
},
{
"epoch": 20.2,
"grad_norm": 0.3753814697265625,
"learning_rate": 2.33600455016277e-05,
"loss": 0.3611,
"step": 136500
},
{
"epoch": 20.27,
"grad_norm": 0.39572277665138245,
"learning_rate": 2.333114456939923e-05,
"loss": 0.363,
"step": 137000
},
{
"epoch": 20.35,
"grad_norm": 0.36638927459716797,
"learning_rate": 2.3302243637170763e-05,
"loss": 0.3625,
"step": 137500
},
{
"epoch": 20.42,
"grad_norm": 0.40173882246017456,
"learning_rate": 2.327334270494229e-05,
"loss": 0.3645,
"step": 138000
},
{
"epoch": 20.49,
"grad_norm": 0.34684666991233826,
"learning_rate": 2.3244441772713822e-05,
"loss": 0.3636,
"step": 138500
},
{
"epoch": 20.57,
"grad_norm": 0.3533775806427002,
"learning_rate": 2.3215540840485353e-05,
"loss": 0.3624,
"step": 139000
},
{
"epoch": 20.64,
"grad_norm": 0.36431315541267395,
"learning_rate": 2.318663990825688e-05,
"loss": 0.3649,
"step": 139500
},
{
"epoch": 20.72,
"grad_norm": 0.3629516363143921,
"learning_rate": 2.3157738976028412e-05,
"loss": 0.3646,
"step": 140000
},
{
"epoch": 20.79,
"grad_norm": 0.383987158536911,
"learning_rate": 2.3128838043799943e-05,
"loss": 0.3623,
"step": 140500
},
{
"epoch": 20.86,
"grad_norm": 0.38170096278190613,
"learning_rate": 2.309993711157147e-05,
"loss": 0.3629,
"step": 141000
},
{
"epoch": 20.94,
"grad_norm": 0.36627018451690674,
"learning_rate": 2.3071036179343003e-05,
"loss": 0.362,
"step": 141500
},
{
"epoch": 21.01,
"grad_norm": 0.37587088346481323,
"learning_rate": 2.3042135247114534e-05,
"loss": 0.3624,
"step": 142000
},
{
"epoch": 21.09,
"grad_norm": 0.3648183047771454,
"learning_rate": 2.301323431488606e-05,
"loss": 0.3633,
"step": 142500
},
{
"epoch": 21.16,
"grad_norm": 0.3818926513195038,
"learning_rate": 2.2984333382657593e-05,
"loss": 0.3627,
"step": 143000
},
{
"epoch": 21.23,
"grad_norm": 0.38963159918785095,
"learning_rate": 2.2955432450429124e-05,
"loss": 0.3619,
"step": 143500
},
{
"epoch": 21.31,
"grad_norm": 0.4655527174472809,
"learning_rate": 2.2926531518200652e-05,
"loss": 0.3616,
"step": 144000
},
{
"epoch": 21.38,
"grad_norm": 0.36112433671951294,
"learning_rate": 2.2897630585972183e-05,
"loss": 0.3597,
"step": 144500
},
{
"epoch": 21.46,
"grad_norm": 0.3661918044090271,
"learning_rate": 2.2868729653743714e-05,
"loss": 0.3626,
"step": 145000
},
{
"epoch": 21.53,
"grad_norm": 0.340862512588501,
"learning_rate": 2.2839828721515242e-05,
"loss": 0.3637,
"step": 145500
},
{
"epoch": 21.6,
"grad_norm": 0.3376435339450836,
"learning_rate": 2.281092778928677e-05,
"loss": 0.3622,
"step": 146000
},
{
"epoch": 21.68,
"grad_norm": 0.4211704730987549,
"learning_rate": 2.27820268570583e-05,
"loss": 0.3593,
"step": 146500
},
{
"epoch": 21.75,
"grad_norm": 0.3886992037296295,
"learning_rate": 2.275312592482983e-05,
"loss": 0.3614,
"step": 147000
},
{
"epoch": 21.83,
"grad_norm": 0.34686926007270813,
"learning_rate": 2.272422499260136e-05,
"loss": 0.3615,
"step": 147500
},
{
"epoch": 21.9,
"grad_norm": 0.4066803455352783,
"learning_rate": 2.269532406037289e-05,
"loss": 0.3597,
"step": 148000
},
{
"epoch": 21.97,
"grad_norm": 0.37257149815559387,
"learning_rate": 2.266642312814442e-05,
"loss": 0.364,
"step": 148500
},
{
"epoch": 22.05,
"grad_norm": 0.3770715892314911,
"learning_rate": 2.263752219591595e-05,
"loss": 0.359,
"step": 149000
},
{
"epoch": 22.12,
"grad_norm": 0.3486000597476959,
"learning_rate": 2.2608621263687482e-05,
"loss": 0.3616,
"step": 149500
},
{
"epoch": 22.2,
"grad_norm": 0.4026411771774292,
"learning_rate": 2.257972033145901e-05,
"loss": 0.3615,
"step": 150000
},
{
"epoch": 22.27,
"grad_norm": 0.4740307629108429,
"learning_rate": 2.255081939923054e-05,
"loss": 0.359,
"step": 150500
},
{
"epoch": 22.34,
"grad_norm": 0.5692958235740662,
"learning_rate": 2.2521918467002072e-05,
"loss": 0.3614,
"step": 151000
},
{
"epoch": 22.42,
"grad_norm": 0.4546482264995575,
"learning_rate": 2.24930175347736e-05,
"loss": 0.3609,
"step": 151500
},
{
"epoch": 22.49,
"grad_norm": 0.3762848675251007,
"learning_rate": 2.246411660254513e-05,
"loss": 0.3612,
"step": 152000
},
{
"epoch": 22.57,
"grad_norm": 0.3631458282470703,
"learning_rate": 2.2435215670316663e-05,
"loss": 0.3613,
"step": 152500
},
{
"epoch": 22.64,
"grad_norm": 0.3560335040092468,
"learning_rate": 2.240631473808819e-05,
"loss": 0.3601,
"step": 153000
},
{
"epoch": 22.71,
"grad_norm": 0.3739969730377197,
"learning_rate": 2.2377413805859722e-05,
"loss": 0.3586,
"step": 153500
},
{
"epoch": 22.79,
"grad_norm": 0.3538212478160858,
"learning_rate": 2.2348512873631253e-05,
"loss": 0.3619,
"step": 154000
},
{
"epoch": 22.86,
"grad_norm": 0.33162832260131836,
"learning_rate": 2.231961194140278e-05,
"loss": 0.3566,
"step": 154500
},
{
"epoch": 22.94,
"grad_norm": 0.3731963634490967,
"learning_rate": 2.2290711009174312e-05,
"loss": 0.3617,
"step": 155000
},
{
"epoch": 23.01,
"grad_norm": 0.3658241033554077,
"learning_rate": 2.2261810076945843e-05,
"loss": 0.3621,
"step": 155500
},
{
"epoch": 23.08,
"grad_norm": 0.3295973837375641,
"learning_rate": 2.223290914471737e-05,
"loss": 0.3592,
"step": 156000
},
{
"epoch": 23.16,
"grad_norm": 0.3476080894470215,
"learning_rate": 2.2204008212488902e-05,
"loss": 0.358,
"step": 156500
},
{
"epoch": 23.23,
"grad_norm": 0.4091341197490692,
"learning_rate": 2.2175107280260434e-05,
"loss": 0.3588,
"step": 157000
},
{
"epoch": 23.31,
"grad_norm": 0.4243708848953247,
"learning_rate": 2.214620634803196e-05,
"loss": 0.3581,
"step": 157500
},
{
"epoch": 23.38,
"grad_norm": 0.4200844168663025,
"learning_rate": 2.2117305415803493e-05,
"loss": 0.3598,
"step": 158000
},
{
"epoch": 23.45,
"grad_norm": 0.4001297652721405,
"learning_rate": 2.2088404483575024e-05,
"loss": 0.3608,
"step": 158500
},
{
"epoch": 23.53,
"grad_norm": 0.44927239418029785,
"learning_rate": 2.2059503551346552e-05,
"loss": 0.3572,
"step": 159000
},
{
"epoch": 23.6,
"grad_norm": 0.3438055217266083,
"learning_rate": 2.2030602619118083e-05,
"loss": 0.3603,
"step": 159500
},
{
"epoch": 23.68,
"grad_norm": 0.395907461643219,
"learning_rate": 2.2001701686889614e-05,
"loss": 0.3583,
"step": 160000
},
{
"epoch": 23.75,
"grad_norm": 0.3705403506755829,
"learning_rate": 2.1972800754661142e-05,
"loss": 0.3606,
"step": 160500
},
{
"epoch": 23.82,
"grad_norm": 0.34676727652549744,
"learning_rate": 2.1943899822432673e-05,
"loss": 0.3589,
"step": 161000
},
{
"epoch": 23.9,
"grad_norm": 0.3712906837463379,
"learning_rate": 2.1914998890204205e-05,
"loss": 0.3597,
"step": 161500
},
{
"epoch": 23.97,
"grad_norm": 0.3136620819568634,
"learning_rate": 2.1886097957975732e-05,
"loss": 0.3571,
"step": 162000
},
{
"epoch": 24.05,
"grad_norm": 0.3915017545223236,
"learning_rate": 2.1857197025747264e-05,
"loss": 0.3595,
"step": 162500
},
{
"epoch": 24.12,
"grad_norm": 0.3403623700141907,
"learning_rate": 2.1828296093518795e-05,
"loss": 0.3608,
"step": 163000
},
{
"epoch": 24.19,
"grad_norm": 0.3841993510723114,
"learning_rate": 2.1799395161290323e-05,
"loss": 0.3596,
"step": 163500
},
{
"epoch": 24.27,
"grad_norm": 0.42894381284713745,
"learning_rate": 2.1770494229061854e-05,
"loss": 0.3571,
"step": 164000
},
{
"epoch": 24.34,
"grad_norm": 0.4211946129798889,
"learning_rate": 2.1741593296833385e-05,
"loss": 0.3552,
"step": 164500
},
{
"epoch": 24.42,
"grad_norm": 0.35293659567832947,
"learning_rate": 2.1712692364604917e-05,
"loss": 0.359,
"step": 165000
},
{
"epoch": 24.49,
"grad_norm": 0.3743543326854706,
"learning_rate": 2.168379143237644e-05,
"loss": 0.3556,
"step": 165500
},
{
"epoch": 24.56,
"grad_norm": 0.4101512134075165,
"learning_rate": 2.1654890500147972e-05,
"loss": 0.3561,
"step": 166000
},
{
"epoch": 24.64,
"grad_norm": 0.34685665369033813,
"learning_rate": 2.1625989567919503e-05,
"loss": 0.3596,
"step": 166500
},
{
"epoch": 24.71,
"grad_norm": 0.3677741289138794,
"learning_rate": 2.159708863569103e-05,
"loss": 0.3572,
"step": 167000
},
{
"epoch": 24.79,
"grad_norm": 0.3836405575275421,
"learning_rate": 2.1568187703462563e-05,
"loss": 0.36,
"step": 167500
},
{
"epoch": 24.86,
"grad_norm": 0.3649022579193115,
"learning_rate": 2.1539286771234094e-05,
"loss": 0.3558,
"step": 168000
},
{
"epoch": 24.93,
"grad_norm": 0.3566337525844574,
"learning_rate": 2.151038583900562e-05,
"loss": 0.3567,
"step": 168500
},
{
"epoch": 25.01,
"grad_norm": 0.4024845361709595,
"learning_rate": 2.1481484906777153e-05,
"loss": 0.3555,
"step": 169000
},
{
"epoch": 25.08,
"grad_norm": 0.3865801692008972,
"learning_rate": 2.1452583974548684e-05,
"loss": 0.3587,
"step": 169500
},
{
"epoch": 25.16,
"grad_norm": 0.3753124475479126,
"learning_rate": 2.1423683042320212e-05,
"loss": 0.358,
"step": 170000
},
{
"epoch": 25.23,
"grad_norm": 0.3889290690422058,
"learning_rate": 2.1394782110091743e-05,
"loss": 0.3573,
"step": 170500
},
{
"epoch": 25.3,
"grad_norm": 0.425574392080307,
"learning_rate": 2.1365881177863274e-05,
"loss": 0.3583,
"step": 171000
},
{
"epoch": 25.38,
"grad_norm": 0.35915040969848633,
"learning_rate": 2.1336980245634802e-05,
"loss": 0.355,
"step": 171500
},
{
"epoch": 25.45,
"grad_norm": 0.3714876174926758,
"learning_rate": 2.1308079313406334e-05,
"loss": 0.3558,
"step": 172000
},
{
"epoch": 25.53,
"grad_norm": 0.3659971356391907,
"learning_rate": 2.1279178381177865e-05,
"loss": 0.3526,
"step": 172500
},
{
"epoch": 25.6,
"grad_norm": 0.35083669424057007,
"learning_rate": 2.1250277448949393e-05,
"loss": 0.3582,
"step": 173000
},
{
"epoch": 25.67,
"grad_norm": 0.3540023863315582,
"learning_rate": 2.1221376516720924e-05,
"loss": 0.3555,
"step": 173500
},
{
"epoch": 25.75,
"grad_norm": 0.3811222016811371,
"learning_rate": 2.1192475584492455e-05,
"loss": 0.3557,
"step": 174000
},
{
"epoch": 25.82,
"grad_norm": 0.37513551115989685,
"learning_rate": 2.1163574652263983e-05,
"loss": 0.3563,
"step": 174500
},
{
"epoch": 25.9,
"grad_norm": 0.4036356508731842,
"learning_rate": 2.1134673720035514e-05,
"loss": 0.3548,
"step": 175000
},
{
"epoch": 25.97,
"grad_norm": 0.3446299135684967,
"learning_rate": 2.1105772787807045e-05,
"loss": 0.3573,
"step": 175500
},
{
"epoch": 26.04,
"grad_norm": 0.4351588487625122,
"learning_rate": 2.1076871855578573e-05,
"loss": 0.3556,
"step": 176000
},
{
"epoch": 26.12,
"grad_norm": 0.38238152861595154,
"learning_rate": 2.1047970923350105e-05,
"loss": 0.3566,
"step": 176500
},
{
"epoch": 26.19,
"grad_norm": 0.3972441554069519,
"learning_rate": 2.1019069991121636e-05,
"loss": 0.3533,
"step": 177000
},
{
"epoch": 26.27,
"grad_norm": 0.40132614970207214,
"learning_rate": 2.0990169058893164e-05,
"loss": 0.3548,
"step": 177500
},
{
"epoch": 26.34,
"grad_norm": 0.3178902864456177,
"learning_rate": 2.0961268126664695e-05,
"loss": 0.355,
"step": 178000
},
{
"epoch": 26.41,
"grad_norm": 0.4328124225139618,
"learning_rate": 2.0932367194436226e-05,
"loss": 0.3554,
"step": 178500
},
{
"epoch": 26.49,
"grad_norm": 0.3971725404262543,
"learning_rate": 2.0903466262207754e-05,
"loss": 0.3549,
"step": 179000
},
{
"epoch": 26.56,
"grad_norm": 0.3241216540336609,
"learning_rate": 2.0874565329979285e-05,
"loss": 0.3534,
"step": 179500
},
{
"epoch": 26.64,
"grad_norm": 0.3448522984981537,
"learning_rate": 2.0845664397750816e-05,
"loss": 0.3559,
"step": 180000
},
{
"epoch": 26.71,
"grad_norm": 0.34117060899734497,
"learning_rate": 2.0816763465522344e-05,
"loss": 0.3555,
"step": 180500
},
{
"epoch": 26.78,
"grad_norm": 0.39051172137260437,
"learning_rate": 2.0787862533293876e-05,
"loss": 0.3558,
"step": 181000
},
{
"epoch": 26.86,
"grad_norm": 0.3349858820438385,
"learning_rate": 2.0758961601065407e-05,
"loss": 0.3546,
"step": 181500
},
{
"epoch": 26.93,
"grad_norm": 0.4579429030418396,
"learning_rate": 2.0730060668836935e-05,
"loss": 0.3537,
"step": 182000
},
{
"epoch": 27.01,
"grad_norm": 0.3789091110229492,
"learning_rate": 2.0701159736608466e-05,
"loss": 0.3527,
"step": 182500
},
{
"epoch": 27.08,
"grad_norm": 0.43690434098243713,
"learning_rate": 2.0672258804379997e-05,
"loss": 0.3563,
"step": 183000
},
{
"epoch": 27.15,
"grad_norm": 0.3886288106441498,
"learning_rate": 2.0643357872151525e-05,
"loss": 0.3543,
"step": 183500
},
{
"epoch": 27.23,
"grad_norm": 0.40548428893089294,
"learning_rate": 2.0614456939923056e-05,
"loss": 0.353,
"step": 184000
},
{
"epoch": 27.3,
"grad_norm": 0.4054431915283203,
"learning_rate": 2.0585556007694584e-05,
"loss": 0.3575,
"step": 184500
},
{
"epoch": 27.37,
"grad_norm": 0.3319009840488434,
"learning_rate": 2.0556655075466112e-05,
"loss": 0.3524,
"step": 185000
},
{
"epoch": 27.45,
"grad_norm": 0.36432087421417236,
"learning_rate": 2.0527754143237643e-05,
"loss": 0.3556,
"step": 185500
},
{
"epoch": 27.52,
"grad_norm": 0.3561677038669586,
"learning_rate": 2.0498853211009174e-05,
"loss": 0.3539,
"step": 186000
},
{
"epoch": 27.6,
"grad_norm": 0.41498541831970215,
"learning_rate": 2.0469952278780702e-05,
"loss": 0.3561,
"step": 186500
},
{
"epoch": 27.67,
"grad_norm": 0.3646217882633209,
"learning_rate": 2.0441051346552233e-05,
"loss": 0.3519,
"step": 187000
},
{
"epoch": 27.74,
"grad_norm": 0.34534063935279846,
"learning_rate": 2.0412150414323765e-05,
"loss": 0.3539,
"step": 187500
},
{
"epoch": 27.82,
"grad_norm": 0.4323655962944031,
"learning_rate": 2.0383249482095293e-05,
"loss": 0.3559,
"step": 188000
},
{
"epoch": 27.89,
"grad_norm": 0.3833807408809662,
"learning_rate": 2.0354348549866824e-05,
"loss": 0.3532,
"step": 188500
},
{
"epoch": 27.97,
"grad_norm": 0.37557268142700195,
"learning_rate": 2.0325447617638355e-05,
"loss": 0.3523,
"step": 189000
},
{
"epoch": 28.04,
"grad_norm": 0.37144702672958374,
"learning_rate": 2.0296546685409883e-05,
"loss": 0.3536,
"step": 189500
},
{
"epoch": 28.11,
"grad_norm": 0.40455296635627747,
"learning_rate": 2.0267645753181414e-05,
"loss": 0.3534,
"step": 190000
},
{
"epoch": 28.19,
"grad_norm": 0.3639744818210602,
"learning_rate": 2.0238744820952945e-05,
"loss": 0.3536,
"step": 190500
},
{
"epoch": 28.26,
"grad_norm": 0.38016533851623535,
"learning_rate": 2.0209843888724473e-05,
"loss": 0.3569,
"step": 191000
},
{
"epoch": 28.34,
"grad_norm": 0.35611262917518616,
"learning_rate": 2.0180942956496004e-05,
"loss": 0.3539,
"step": 191500
},
{
"epoch": 28.41,
"grad_norm": 0.3586650490760803,
"learning_rate": 2.0152042024267536e-05,
"loss": 0.3516,
"step": 192000
},
{
"epoch": 28.48,
"grad_norm": 0.3105120062828064,
"learning_rate": 2.0123141092039064e-05,
"loss": 0.3518,
"step": 192500
},
{
"epoch": 28.56,
"grad_norm": 0.37972891330718994,
"learning_rate": 2.0094240159810595e-05,
"loss": 0.3558,
"step": 193000
},
{
"epoch": 28.63,
"grad_norm": 0.35530367493629456,
"learning_rate": 2.0065339227582126e-05,
"loss": 0.3505,
"step": 193500
},
{
"epoch": 28.71,
"grad_norm": 0.42136579751968384,
"learning_rate": 2.0036438295353654e-05,
"loss": 0.3537,
"step": 194000
},
{
"epoch": 28.78,
"grad_norm": 0.37874168157577515,
"learning_rate": 2.0007537363125185e-05,
"loss": 0.3505,
"step": 194500
},
{
"epoch": 28.85,
"grad_norm": 0.33442074060440063,
"learning_rate": 1.9978636430896716e-05,
"loss": 0.3536,
"step": 195000
},
{
"epoch": 28.93,
"grad_norm": 0.37098708748817444,
"learning_rate": 1.9949735498668244e-05,
"loss": 0.3553,
"step": 195500
},
{
"epoch": 29.0,
"grad_norm": 0.33478862047195435,
"learning_rate": 1.9920834566439775e-05,
"loss": 0.3527,
"step": 196000
},
{
"epoch": 29.08,
"grad_norm": 0.3783182203769684,
"learning_rate": 1.9891933634211307e-05,
"loss": 0.3523,
"step": 196500
},
{
"epoch": 29.15,
"grad_norm": 0.32911786437034607,
"learning_rate": 1.9863032701982835e-05,
"loss": 0.3526,
"step": 197000
},
{
"epoch": 29.22,
"grad_norm": 0.33882907032966614,
"learning_rate": 1.9834131769754366e-05,
"loss": 0.3507,
"step": 197500
},
{
"epoch": 29.3,
"grad_norm": 0.4318142235279083,
"learning_rate": 1.9805230837525897e-05,
"loss": 0.3504,
"step": 198000
},
{
"epoch": 29.37,
"grad_norm": 0.33973386883735657,
"learning_rate": 1.9776329905297425e-05,
"loss": 0.3526,
"step": 198500
},
{
"epoch": 29.45,
"grad_norm": 0.3557802736759186,
"learning_rate": 1.9747428973068956e-05,
"loss": 0.3511,
"step": 199000
},
{
"epoch": 29.52,
"grad_norm": 0.4430686831474304,
"learning_rate": 1.9718528040840487e-05,
"loss": 0.3503,
"step": 199500
},
{
"epoch": 29.59,
"grad_norm": 0.33132269978523254,
"learning_rate": 1.9689627108612015e-05,
"loss": 0.3531,
"step": 200000
},
{
"epoch": 29.67,
"grad_norm": 0.362075537443161,
"learning_rate": 1.9660726176383546e-05,
"loss": 0.3517,
"step": 200500
},
{
"epoch": 29.74,
"grad_norm": 0.3604036867618561,
"learning_rate": 1.9631825244155078e-05,
"loss": 0.3558,
"step": 201000
},
{
"epoch": 29.82,
"grad_norm": 0.39711400866508484,
"learning_rate": 1.960292431192661e-05,
"loss": 0.3515,
"step": 201500
},
{
"epoch": 29.89,
"grad_norm": 0.30394095182418823,
"learning_rate": 1.9574023379698137e-05,
"loss": 0.352,
"step": 202000
},
{
"epoch": 29.96,
"grad_norm": 0.3903627097606659,
"learning_rate": 1.9545122447469668e-05,
"loss": 0.3496,
"step": 202500
},
{
"epoch": 30.04,
"grad_norm": 0.3124367296695709,
"learning_rate": 1.95162215152412e-05,
"loss": 0.3517,
"step": 203000
},
{
"epoch": 30.11,
"grad_norm": 0.4038899540901184,
"learning_rate": 1.9487320583012727e-05,
"loss": 0.3518,
"step": 203500
},
{
"epoch": 30.19,
"grad_norm": 0.32454368472099304,
"learning_rate": 1.9458419650784255e-05,
"loss": 0.3517,
"step": 204000
},
{
"epoch": 30.26,
"grad_norm": 0.35400575399398804,
"learning_rate": 1.9429518718555786e-05,
"loss": 0.3479,
"step": 204500
},
{
"epoch": 30.33,
"grad_norm": 0.424834281206131,
"learning_rate": 1.9400617786327314e-05,
"loss": 0.3502,
"step": 205000
},
{
"epoch": 30.41,
"grad_norm": 0.4748223125934601,
"learning_rate": 1.9371716854098845e-05,
"loss": 0.349,
"step": 205500
},
{
"epoch": 30.48,
"grad_norm": 0.3032289445400238,
"learning_rate": 1.9342815921870376e-05,
"loss": 0.3543,
"step": 206000
},
{
"epoch": 30.56,
"grad_norm": 0.4162702262401581,
"learning_rate": 1.9313914989641904e-05,
"loss": 0.3518,
"step": 206500
},
{
"epoch": 30.63,
"grad_norm": 0.3512803912162781,
"learning_rate": 1.9285014057413436e-05,
"loss": 0.3505,
"step": 207000
},
{
"epoch": 30.7,
"grad_norm": 0.3622516989707947,
"learning_rate": 1.9256113125184967e-05,
"loss": 0.3538,
"step": 207500
},
{
"epoch": 30.78,
"grad_norm": 0.3330516517162323,
"learning_rate": 1.9227212192956495e-05,
"loss": 0.3489,
"step": 208000
},
{
"epoch": 30.85,
"grad_norm": 0.3457803726196289,
"learning_rate": 1.9198311260728026e-05,
"loss": 0.352,
"step": 208500
},
{
"epoch": 30.93,
"grad_norm": 0.3154030442237854,
"learning_rate": 1.9169410328499557e-05,
"loss": 0.3491,
"step": 209000
},
{
"epoch": 31.0,
"grad_norm": 0.46131783723831177,
"learning_rate": 1.9140509396271085e-05,
"loss": 0.3509,
"step": 209500
},
{
"epoch": 31.07,
"grad_norm": 0.400088369846344,
"learning_rate": 1.9111608464042616e-05,
"loss": 0.3509,
"step": 210000
},
{
"epoch": 31.15,
"grad_norm": 0.3647451400756836,
"learning_rate": 1.9082707531814147e-05,
"loss": 0.35,
"step": 210500
},
{
"epoch": 31.22,
"grad_norm": 0.4007732570171356,
"learning_rate": 1.9053806599585675e-05,
"loss": 0.3504,
"step": 211000
},
{
"epoch": 31.3,
"grad_norm": 0.3861900269985199,
"learning_rate": 1.9024905667357207e-05,
"loss": 0.3484,
"step": 211500
},
{
"epoch": 31.37,
"grad_norm": 0.411627858877182,
"learning_rate": 1.8996004735128738e-05,
"loss": 0.3507,
"step": 212000
},
{
"epoch": 31.44,
"grad_norm": 0.35766077041625977,
"learning_rate": 1.8967103802900266e-05,
"loss": 0.3485,
"step": 212500
},
{
"epoch": 31.52,
"grad_norm": 0.3537013530731201,
"learning_rate": 1.8938202870671797e-05,
"loss": 0.3517,
"step": 213000
},
{
"epoch": 31.59,
"grad_norm": 0.3919309675693512,
"learning_rate": 1.8909301938443328e-05,
"loss": 0.3489,
"step": 213500
},
{
"epoch": 31.67,
"grad_norm": 0.3441930115222931,
"learning_rate": 1.8880401006214856e-05,
"loss": 0.351,
"step": 214000
},
{
"epoch": 31.74,
"grad_norm": 0.38138172030448914,
"learning_rate": 1.8851500073986387e-05,
"loss": 0.3495,
"step": 214500
},
{
"epoch": 31.81,
"grad_norm": 0.4080500304698944,
"learning_rate": 1.882259914175792e-05,
"loss": 0.3497,
"step": 215000
},
{
"epoch": 31.89,
"grad_norm": 0.3864932358264923,
"learning_rate": 1.8793698209529446e-05,
"loss": 0.3525,
"step": 215500
},
{
"epoch": 31.96,
"grad_norm": 0.4017949104309082,
"learning_rate": 1.8764797277300978e-05,
"loss": 0.3528,
"step": 216000
},
{
"epoch": 32.04,
"grad_norm": 0.3484615087509155,
"learning_rate": 1.873589634507251e-05,
"loss": 0.3505,
"step": 216500
},
{
"epoch": 32.11,
"grad_norm": 0.34500235319137573,
"learning_rate": 1.8706995412844037e-05,
"loss": 0.3498,
"step": 217000
},
{
"epoch": 32.18,
"grad_norm": 0.32486996054649353,
"learning_rate": 1.8678094480615568e-05,
"loss": 0.3501,
"step": 217500
},
{
"epoch": 32.26,
"grad_norm": 0.3440997302532196,
"learning_rate": 1.86491935483871e-05,
"loss": 0.3504,
"step": 218000
},
{
"epoch": 32.33,
"grad_norm": 0.359846293926239,
"learning_rate": 1.8620292616158627e-05,
"loss": 0.3491,
"step": 218500
},
{
"epoch": 32.41,
"grad_norm": 0.36168062686920166,
"learning_rate": 1.8591391683930158e-05,
"loss": 0.3489,
"step": 219000
},
{
"epoch": 32.48,
"grad_norm": 0.43606841564178467,
"learning_rate": 1.856249075170169e-05,
"loss": 0.3497,
"step": 219500
},
{
"epoch": 32.55,
"grad_norm": 0.3898315727710724,
"learning_rate": 1.8533589819473217e-05,
"loss": 0.3487,
"step": 220000
},
{
"epoch": 32.63,
"grad_norm": 0.381244421005249,
"learning_rate": 1.850468888724475e-05,
"loss": 0.3475,
"step": 220500
},
{
"epoch": 32.7,
"grad_norm": 0.41321712732315063,
"learning_rate": 1.847578795501628e-05,
"loss": 0.3484,
"step": 221000
},
{
"epoch": 32.78,
"grad_norm": 0.3538101017475128,
"learning_rate": 1.8446887022787808e-05,
"loss": 0.3484,
"step": 221500
},
{
"epoch": 32.85,
"grad_norm": 0.38104715943336487,
"learning_rate": 1.841798609055934e-05,
"loss": 0.3482,
"step": 222000
},
{
"epoch": 32.92,
"grad_norm": 0.37761756777763367,
"learning_rate": 1.838908515833087e-05,
"loss": 0.3474,
"step": 222500
},
{
"epoch": 33.0,
"grad_norm": 0.3524073362350464,
"learning_rate": 1.8360184226102398e-05,
"loss": 0.3518,
"step": 223000
},
{
"epoch": 33.07,
"grad_norm": 0.3452695608139038,
"learning_rate": 1.8331283293873926e-05,
"loss": 0.3509,
"step": 223500
},
{
"epoch": 33.15,
"grad_norm": 0.4063817262649536,
"learning_rate": 1.8302382361645457e-05,
"loss": 0.3496,
"step": 224000
},
{
"epoch": 33.22,
"grad_norm": 0.41099056601524353,
"learning_rate": 1.8273481429416985e-05,
"loss": 0.3489,
"step": 224500
},
{
"epoch": 33.29,
"grad_norm": 0.3691389560699463,
"learning_rate": 1.8244580497188516e-05,
"loss": 0.3507,
"step": 225000
},
{
"epoch": 33.37,
"grad_norm": 0.36765575408935547,
"learning_rate": 1.8215679564960047e-05,
"loss": 0.3464,
"step": 225500
},
{
"epoch": 33.44,
"grad_norm": 0.39067772030830383,
"learning_rate": 1.8186778632731575e-05,
"loss": 0.3479,
"step": 226000
},
{
"epoch": 33.52,
"grad_norm": 0.3263433873653412,
"learning_rate": 1.8157877700503106e-05,
"loss": 0.3457,
"step": 226500
},
{
"epoch": 33.59,
"grad_norm": 0.45672136545181274,
"learning_rate": 1.8128976768274638e-05,
"loss": 0.3488,
"step": 227000
},
{
"epoch": 33.66,
"grad_norm": 0.42077481746673584,
"learning_rate": 1.8100075836046166e-05,
"loss": 0.3447,
"step": 227500
},
{
"epoch": 33.74,
"grad_norm": 0.3391963243484497,
"learning_rate": 1.8071174903817697e-05,
"loss": 0.3471,
"step": 228000
},
{
"epoch": 33.81,
"grad_norm": 0.3776566684246063,
"learning_rate": 1.8042273971589228e-05,
"loss": 0.3498,
"step": 228500
},
{
"epoch": 33.89,
"grad_norm": 0.3776949942111969,
"learning_rate": 1.8013373039360756e-05,
"loss": 0.3463,
"step": 229000
},
{
"epoch": 33.96,
"grad_norm": 0.3695808947086334,
"learning_rate": 1.7984472107132287e-05,
"loss": 0.3492,
"step": 229500
},
{
"epoch": 34.03,
"grad_norm": 0.36413583159446716,
"learning_rate": 1.795557117490382e-05,
"loss": 0.3472,
"step": 230000
},
{
"epoch": 34.11,
"grad_norm": 0.39360344409942627,
"learning_rate": 1.7926670242675346e-05,
"loss": 0.3505,
"step": 230500
},
{
"epoch": 34.18,
"grad_norm": 0.3416905999183655,
"learning_rate": 1.7897769310446877e-05,
"loss": 0.3472,
"step": 231000
},
{
"epoch": 34.26,
"grad_norm": 0.40496984124183655,
"learning_rate": 1.786886837821841e-05,
"loss": 0.3477,
"step": 231500
},
{
"epoch": 34.33,
"grad_norm": 0.3441724479198456,
"learning_rate": 1.7839967445989937e-05,
"loss": 0.3479,
"step": 232000
},
{
"epoch": 34.4,
"grad_norm": 0.37928706407546997,
"learning_rate": 1.7811066513761468e-05,
"loss": 0.3486,
"step": 232500
},
{
"epoch": 34.48,
"grad_norm": 0.3675363063812256,
"learning_rate": 1.7782165581533e-05,
"loss": 0.3458,
"step": 233000
},
{
"epoch": 34.55,
"grad_norm": 0.40871405601501465,
"learning_rate": 1.7753264649304527e-05,
"loss": 0.3493,
"step": 233500
},
{
"epoch": 34.63,
"grad_norm": 0.417258620262146,
"learning_rate": 1.7724363717076058e-05,
"loss": 0.3453,
"step": 234000
},
{
"epoch": 34.7,
"grad_norm": 0.3210906386375427,
"learning_rate": 1.769546278484759e-05,
"loss": 0.3457,
"step": 234500
},
{
"epoch": 34.77,
"grad_norm": 0.3412734270095825,
"learning_rate": 1.7666561852619117e-05,
"loss": 0.3475,
"step": 235000
},
{
"epoch": 34.85,
"grad_norm": 0.39695098996162415,
"learning_rate": 1.763766092039065e-05,
"loss": 0.3509,
"step": 235500
},
{
"epoch": 34.92,
"grad_norm": 0.36834970116615295,
"learning_rate": 1.760875998816218e-05,
"loss": 0.3485,
"step": 236000
},
{
"epoch": 35.0,
"grad_norm": 0.3971041738986969,
"learning_rate": 1.7579859055933708e-05,
"loss": 0.3468,
"step": 236500
},
{
"epoch": 35.07,
"grad_norm": 0.3513786196708679,
"learning_rate": 1.755095812370524e-05,
"loss": 0.3478,
"step": 237000
},
{
"epoch": 35.14,
"grad_norm": 0.35932984948158264,
"learning_rate": 1.752205719147677e-05,
"loss": 0.348,
"step": 237500
},
{
"epoch": 35.22,
"grad_norm": 0.40685245394706726,
"learning_rate": 1.7493156259248298e-05,
"loss": 0.3447,
"step": 238000
},
{
"epoch": 35.29,
"grad_norm": 0.37929338216781616,
"learning_rate": 1.746425532701983e-05,
"loss": 0.3465,
"step": 238500
},
{
"epoch": 35.37,
"grad_norm": 0.40910473465919495,
"learning_rate": 1.743535439479136e-05,
"loss": 0.3465,
"step": 239000
},
{
"epoch": 35.44,
"grad_norm": 0.34920281171798706,
"learning_rate": 1.740645346256289e-05,
"loss": 0.3463,
"step": 239500
},
{
"epoch": 35.51,
"grad_norm": 0.37716421484947205,
"learning_rate": 1.737755253033442e-05,
"loss": 0.3487,
"step": 240000
},
{
"epoch": 35.59,
"grad_norm": 0.4301624596118927,
"learning_rate": 1.734865159810595e-05,
"loss": 0.3449,
"step": 240500
},
{
"epoch": 35.66,
"grad_norm": 0.37206390500068665,
"learning_rate": 1.7319750665877482e-05,
"loss": 0.3453,
"step": 241000
},
{
"epoch": 35.74,
"grad_norm": 0.38183438777923584,
"learning_rate": 1.729084973364901e-05,
"loss": 0.3493,
"step": 241500
},
{
"epoch": 35.81,
"grad_norm": 0.3732428252696991,
"learning_rate": 1.726194880142054e-05,
"loss": 0.349,
"step": 242000
},
{
"epoch": 35.88,
"grad_norm": 0.3665318787097931,
"learning_rate": 1.7233047869192072e-05,
"loss": 0.3482,
"step": 242500
},
{
"epoch": 35.96,
"grad_norm": 0.3693431317806244,
"learning_rate": 1.7204146936963597e-05,
"loss": 0.3445,
"step": 243000
},
{
"epoch": 36.03,
"grad_norm": 0.3580050766468048,
"learning_rate": 1.7175246004735128e-05,
"loss": 0.3469,
"step": 243500
},
{
"epoch": 36.11,
"grad_norm": 0.3874383568763733,
"learning_rate": 1.714634507250666e-05,
"loss": 0.3483,
"step": 244000
},
{
"epoch": 36.18,
"grad_norm": 0.39490264654159546,
"learning_rate": 1.7117444140278187e-05,
"loss": 0.3479,
"step": 244500
},
{
"epoch": 36.25,
"grad_norm": 0.38647031784057617,
"learning_rate": 1.7088543208049718e-05,
"loss": 0.3458,
"step": 245000
},
{
"epoch": 36.33,
"grad_norm": 0.3899756073951721,
"learning_rate": 1.705964227582125e-05,
"loss": 0.3448,
"step": 245500
},
{
"epoch": 36.4,
"grad_norm": 0.4022426903247833,
"learning_rate": 1.7030741343592777e-05,
"loss": 0.3463,
"step": 246000
},
{
"epoch": 36.48,
"grad_norm": 0.37726154923439026,
"learning_rate": 1.700184041136431e-05,
"loss": 0.3469,
"step": 246500
},
{
"epoch": 36.55,
"grad_norm": 0.4022297263145447,
"learning_rate": 1.697293947913584e-05,
"loss": 0.3457,
"step": 247000
},
{
"epoch": 36.62,
"grad_norm": 0.43506869673728943,
"learning_rate": 1.6944038546907368e-05,
"loss": 0.3467,
"step": 247500
},
{
"epoch": 36.7,
"grad_norm": 0.4157175123691559,
"learning_rate": 1.69151376146789e-05,
"loss": 0.3443,
"step": 248000
},
{
"epoch": 36.77,
"grad_norm": 0.4038371443748474,
"learning_rate": 1.688623668245043e-05,
"loss": 0.3408,
"step": 248500
},
{
"epoch": 36.85,
"grad_norm": 0.3598155081272125,
"learning_rate": 1.6857335750221958e-05,
"loss": 0.3455,
"step": 249000
},
{
"epoch": 36.92,
"grad_norm": 0.3888005018234253,
"learning_rate": 1.682843481799349e-05,
"loss": 0.3465,
"step": 249500
},
{
"epoch": 36.99,
"grad_norm": 0.3933923840522766,
"learning_rate": 1.679953388576502e-05,
"loss": 0.3488,
"step": 250000
},
{
"epoch": 37.07,
"grad_norm": 0.36610084772109985,
"learning_rate": 1.677063295353655e-05,
"loss": 0.3448,
"step": 250500
},
{
"epoch": 37.14,
"grad_norm": 0.3755366802215576,
"learning_rate": 1.674173202130808e-05,
"loss": 0.3475,
"step": 251000
},
{
"epoch": 37.22,
"grad_norm": 0.3687468469142914,
"learning_rate": 1.671283108907961e-05,
"loss": 0.3441,
"step": 251500
},
{
"epoch": 37.29,
"grad_norm": 0.3150022327899933,
"learning_rate": 1.668393015685114e-05,
"loss": 0.3439,
"step": 252000
},
{
"epoch": 37.36,
"grad_norm": 0.37440210580825806,
"learning_rate": 1.665502922462267e-05,
"loss": 0.3474,
"step": 252500
},
{
"epoch": 37.44,
"grad_norm": 0.3700256943702698,
"learning_rate": 1.66261282923942e-05,
"loss": 0.3453,
"step": 253000
},
{
"epoch": 37.51,
"grad_norm": 0.40626585483551025,
"learning_rate": 1.659722736016573e-05,
"loss": 0.345,
"step": 253500
},
{
"epoch": 37.59,
"grad_norm": 0.39384424686431885,
"learning_rate": 1.656832642793726e-05,
"loss": 0.3465,
"step": 254000
},
{
"epoch": 37.66,
"grad_norm": 0.38323110342025757,
"learning_rate": 1.653942549570879e-05,
"loss": 0.3458,
"step": 254500
},
{
"epoch": 37.73,
"grad_norm": 0.3620944619178772,
"learning_rate": 1.651052456348032e-05,
"loss": 0.3457,
"step": 255000
},
{
"epoch": 37.81,
"grad_norm": 0.3920278251171112,
"learning_rate": 1.648162363125185e-05,
"loss": 0.3433,
"step": 255500
},
{
"epoch": 37.88,
"grad_norm": 0.3547744154930115,
"learning_rate": 1.6452722699023382e-05,
"loss": 0.3427,
"step": 256000
},
{
"epoch": 37.96,
"grad_norm": 0.3048088252544403,
"learning_rate": 1.642382176679491e-05,
"loss": 0.3462,
"step": 256500
},
{
"epoch": 38.03,
"grad_norm": 0.3744346499443054,
"learning_rate": 1.639492083456644e-05,
"loss": 0.3451,
"step": 257000
},
{
"epoch": 38.1,
"grad_norm": 0.3640407621860504,
"learning_rate": 1.6366019902337972e-05,
"loss": 0.3485,
"step": 257500
},
{
"epoch": 38.18,
"grad_norm": 0.37335264682769775,
"learning_rate": 1.63371189701095e-05,
"loss": 0.3466,
"step": 258000
},
{
"epoch": 38.25,
"grad_norm": 0.3745587170124054,
"learning_rate": 1.630821803788103e-05,
"loss": 0.344,
"step": 258500
},
{
"epoch": 38.32,
"grad_norm": 0.462406724691391,
"learning_rate": 1.6279317105652562e-05,
"loss": 0.3432,
"step": 259000
},
{
"epoch": 38.4,
"grad_norm": 0.3359210193157196,
"learning_rate": 1.625041617342409e-05,
"loss": 0.3469,
"step": 259500
},
{
"epoch": 38.47,
"grad_norm": 0.3449317514896393,
"learning_rate": 1.622151524119562e-05,
"loss": 0.3435,
"step": 260000
},
{
"epoch": 38.55,
"grad_norm": 0.4265647232532501,
"learning_rate": 1.6192614308967153e-05,
"loss": 0.3458,
"step": 260500
},
{
"epoch": 38.62,
"grad_norm": 0.40118905901908875,
"learning_rate": 1.616371337673868e-05,
"loss": 0.3461,
"step": 261000
},
{
"epoch": 38.69,
"grad_norm": 0.36499762535095215,
"learning_rate": 1.6134812444510212e-05,
"loss": 0.3457,
"step": 261500
},
{
"epoch": 38.77,
"grad_norm": 0.37067875266075134,
"learning_rate": 1.6105911512281743e-05,
"loss": 0.3426,
"step": 262000
},
{
"epoch": 38.84,
"grad_norm": 0.402778685092926,
"learning_rate": 1.6077010580053268e-05,
"loss": 0.3432,
"step": 262500
},
{
"epoch": 38.92,
"grad_norm": 0.37418636679649353,
"learning_rate": 1.60481096478248e-05,
"loss": 0.347,
"step": 263000
},
{
"epoch": 38.99,
"grad_norm": 0.4147396981716156,
"learning_rate": 1.601920871559633e-05,
"loss": 0.3456,
"step": 263500
},
{
"epoch": 39.06,
"grad_norm": 0.42823702096939087,
"learning_rate": 1.5990307783367858e-05,
"loss": 0.342,
"step": 264000
},
{
"epoch": 39.14,
"grad_norm": 0.40999341011047363,
"learning_rate": 1.596140685113939e-05,
"loss": 0.3413,
"step": 264500
},
{
"epoch": 39.21,
"grad_norm": 0.32551825046539307,
"learning_rate": 1.593250591891092e-05,
"loss": 0.3433,
"step": 265000
},
{
"epoch": 39.29,
"grad_norm": 0.3688596487045288,
"learning_rate": 1.5903604986682448e-05,
"loss": 0.3457,
"step": 265500
},
{
"epoch": 39.36,
"grad_norm": 0.39799386262893677,
"learning_rate": 1.587470405445398e-05,
"loss": 0.343,
"step": 266000
},
{
"epoch": 39.43,
"grad_norm": 0.34967321157455444,
"learning_rate": 1.584580312222551e-05,
"loss": 0.3418,
"step": 266500
},
{
"epoch": 39.51,
"grad_norm": 0.36091017723083496,
"learning_rate": 1.581690218999704e-05,
"loss": 0.3435,
"step": 267000
},
{
"epoch": 39.58,
"grad_norm": 0.3361178934574127,
"learning_rate": 1.578800125776857e-05,
"loss": 0.3471,
"step": 267500
},
{
"epoch": 39.66,
"grad_norm": 0.36311858892440796,
"learning_rate": 1.57591003255401e-05,
"loss": 0.3442,
"step": 268000
},
{
"epoch": 39.73,
"grad_norm": 0.37522685527801514,
"learning_rate": 1.573019939331163e-05,
"loss": 0.3432,
"step": 268500
},
{
"epoch": 39.8,
"grad_norm": 0.42775389552116394,
"learning_rate": 1.570129846108316e-05,
"loss": 0.344,
"step": 269000
},
{
"epoch": 39.88,
"grad_norm": 0.40960633754730225,
"learning_rate": 1.567239752885469e-05,
"loss": 0.3428,
"step": 269500
},
{
"epoch": 39.95,
"grad_norm": 0.35652443766593933,
"learning_rate": 1.564349659662622e-05,
"loss": 0.3435,
"step": 270000
},
{
"epoch": 40.03,
"grad_norm": 0.41139841079711914,
"learning_rate": 1.561459566439775e-05,
"loss": 0.3436,
"step": 270500
},
{
"epoch": 40.1,
"grad_norm": 0.3651288151741028,
"learning_rate": 1.5585694732169282e-05,
"loss": 0.3447,
"step": 271000
},
{
"epoch": 40.17,
"grad_norm": 0.37484046816825867,
"learning_rate": 1.555679379994081e-05,
"loss": 0.3456,
"step": 271500
},
{
"epoch": 40.25,
"grad_norm": 0.3306860625743866,
"learning_rate": 1.552789286771234e-05,
"loss": 0.3428,
"step": 272000
},
{
"epoch": 40.32,
"grad_norm": 0.37026843428611755,
"learning_rate": 1.5498991935483872e-05,
"loss": 0.3433,
"step": 272500
},
{
"epoch": 40.4,
"grad_norm": 0.3959224224090576,
"learning_rate": 1.54700910032554e-05,
"loss": 0.3453,
"step": 273000
},
{
"epoch": 40.47,
"grad_norm": 0.3823704719543457,
"learning_rate": 1.544119007102693e-05,
"loss": 0.3413,
"step": 273500
},
{
"epoch": 40.54,
"grad_norm": 0.33115535974502563,
"learning_rate": 1.5412289138798462e-05,
"loss": 0.3442,
"step": 274000
},
{
"epoch": 40.62,
"grad_norm": 0.5036399364471436,
"learning_rate": 1.538338820656999e-05,
"loss": 0.3417,
"step": 274500
},
{
"epoch": 40.69,
"grad_norm": 0.3805595934391022,
"learning_rate": 1.535448727434152e-05,
"loss": 0.3453,
"step": 275000
},
{
"epoch": 40.77,
"grad_norm": 0.4390459656715393,
"learning_rate": 1.5325586342113053e-05,
"loss": 0.343,
"step": 275500
},
{
"epoch": 40.84,
"grad_norm": 0.3673398792743683,
"learning_rate": 1.5296685409884584e-05,
"loss": 0.3402,
"step": 276000
},
{
"epoch": 40.91,
"grad_norm": 0.36677980422973633,
"learning_rate": 1.5267784477656112e-05,
"loss": 0.3418,
"step": 276500
},
{
"epoch": 40.99,
"grad_norm": 0.37628763914108276,
"learning_rate": 1.5238883545427641e-05,
"loss": 0.3423,
"step": 277000
},
{
"epoch": 41.06,
"grad_norm": 0.3959880769252777,
"learning_rate": 1.5209982613199171e-05,
"loss": 0.3413,
"step": 277500
},
{
"epoch": 41.14,
"grad_norm": 0.35615137219429016,
"learning_rate": 1.51810816809707e-05,
"loss": 0.3438,
"step": 278000
},
{
"epoch": 41.21,
"grad_norm": 0.4133353531360626,
"learning_rate": 1.5152180748742232e-05,
"loss": 0.3429,
"step": 278500
},
{
"epoch": 41.28,
"grad_norm": 0.35143953561782837,
"learning_rate": 1.5123279816513761e-05,
"loss": 0.3437,
"step": 279000
},
{
"epoch": 41.36,
"grad_norm": 0.37390509247779846,
"learning_rate": 1.509437888428529e-05,
"loss": 0.341,
"step": 279500
},
{
"epoch": 41.43,
"grad_norm": 0.39959460496902466,
"learning_rate": 1.5065477952056822e-05,
"loss": 0.3423,
"step": 280000
},
{
"epoch": 41.51,
"grad_norm": 0.3992210030555725,
"learning_rate": 1.5036577019828352e-05,
"loss": 0.3462,
"step": 280500
},
{
"epoch": 41.58,
"grad_norm": 0.3677886724472046,
"learning_rate": 1.5007676087599881e-05,
"loss": 0.3427,
"step": 281000
},
{
"epoch": 41.65,
"grad_norm": 0.3906817138195038,
"learning_rate": 1.4978775155371412e-05,
"loss": 0.3415,
"step": 281500
},
{
"epoch": 41.73,
"grad_norm": 0.39412081241607666,
"learning_rate": 1.4949874223142942e-05,
"loss": 0.3434,
"step": 282000
},
{
"epoch": 41.8,
"grad_norm": 0.4120485782623291,
"learning_rate": 1.4920973290914471e-05,
"loss": 0.344,
"step": 282500
},
{
"epoch": 41.88,
"grad_norm": 0.4464050829410553,
"learning_rate": 1.4892072358686003e-05,
"loss": 0.34,
"step": 283000
},
{
"epoch": 41.95,
"grad_norm": 0.3044414520263672,
"learning_rate": 1.4863171426457532e-05,
"loss": 0.3417,
"step": 283500
},
{
"epoch": 42.02,
"grad_norm": 0.32554033398628235,
"learning_rate": 1.4834270494229062e-05,
"loss": 0.3417,
"step": 284000
},
{
"epoch": 42.1,
"grad_norm": 0.3944820463657379,
"learning_rate": 1.4805369562000593e-05,
"loss": 0.3436,
"step": 284500
},
{
"epoch": 42.17,
"grad_norm": 0.37802961468696594,
"learning_rate": 1.4776468629772123e-05,
"loss": 0.3439,
"step": 285000
},
{
"epoch": 42.25,
"grad_norm": 0.4089604616165161,
"learning_rate": 1.4747567697543654e-05,
"loss": 0.3408,
"step": 285500
},
{
"epoch": 42.32,
"grad_norm": 0.37038755416870117,
"learning_rate": 1.4718666765315183e-05,
"loss": 0.3426,
"step": 286000
},
{
"epoch": 42.39,
"grad_norm": 0.35514524579048157,
"learning_rate": 1.4689765833086711e-05,
"loss": 0.3451,
"step": 286500
},
{
"epoch": 42.47,
"grad_norm": 0.4710882902145386,
"learning_rate": 1.4660864900858242e-05,
"loss": 0.3429,
"step": 287000
},
{
"epoch": 42.54,
"grad_norm": 0.34002232551574707,
"learning_rate": 1.4631963968629772e-05,
"loss": 0.3418,
"step": 287500
},
{
"epoch": 42.62,
"grad_norm": 0.4424833059310913,
"learning_rate": 1.4603063036401302e-05,
"loss": 0.3425,
"step": 288000
},
{
"epoch": 42.69,
"grad_norm": 0.4154003858566284,
"learning_rate": 1.4574162104172833e-05,
"loss": 0.3428,
"step": 288500
},
{
"epoch": 42.76,
"grad_norm": 0.40983182191848755,
"learning_rate": 1.4545261171944362e-05,
"loss": 0.343,
"step": 289000
},
{
"epoch": 42.84,
"grad_norm": 0.3568328320980072,
"learning_rate": 1.4516360239715892e-05,
"loss": 0.3387,
"step": 289500
},
{
"epoch": 42.91,
"grad_norm": 0.3948083221912384,
"learning_rate": 1.4487459307487423e-05,
"loss": 0.3388,
"step": 290000
},
{
"epoch": 42.99,
"grad_norm": 0.3891525864601135,
"learning_rate": 1.4458558375258953e-05,
"loss": 0.3415,
"step": 290500
},
{
"epoch": 43.06,
"grad_norm": 0.3312503695487976,
"learning_rate": 1.4429657443030482e-05,
"loss": 0.3407,
"step": 291000
},
{
"epoch": 43.13,
"grad_norm": 0.37773850560188293,
"learning_rate": 1.4400756510802013e-05,
"loss": 0.3411,
"step": 291500
},
{
"epoch": 43.21,
"grad_norm": 0.29978179931640625,
"learning_rate": 1.4371855578573543e-05,
"loss": 0.3423,
"step": 292000
},
{
"epoch": 43.28,
"grad_norm": 0.4314406216144562,
"learning_rate": 1.4342954646345072e-05,
"loss": 0.3373,
"step": 292500
},
{
"epoch": 43.36,
"grad_norm": 0.3975353538990021,
"learning_rate": 1.4314053714116604e-05,
"loss": 0.3405,
"step": 293000
},
{
"epoch": 43.43,
"grad_norm": 0.35734960436820984,
"learning_rate": 1.4285152781888133e-05,
"loss": 0.3416,
"step": 293500
},
{
"epoch": 43.5,
"grad_norm": 0.44908031821250916,
"learning_rate": 1.4256251849659663e-05,
"loss": 0.3392,
"step": 294000
},
{
"epoch": 43.58,
"grad_norm": 0.3516298532485962,
"learning_rate": 1.4227350917431194e-05,
"loss": 0.3385,
"step": 294500
},
{
"epoch": 43.65,
"grad_norm": 0.3821066915988922,
"learning_rate": 1.4198449985202724e-05,
"loss": 0.3448,
"step": 295000
},
{
"epoch": 43.73,
"grad_norm": 0.3824633061885834,
"learning_rate": 1.4169549052974253e-05,
"loss": 0.3415,
"step": 295500
},
{
"epoch": 43.8,
"grad_norm": 0.3336328864097595,
"learning_rate": 1.4140648120745784e-05,
"loss": 0.3403,
"step": 296000
},
{
"epoch": 43.87,
"grad_norm": 0.41100433468818665,
"learning_rate": 1.4111747188517312e-05,
"loss": 0.3428,
"step": 296500
},
{
"epoch": 43.95,
"grad_norm": 0.38574787974357605,
"learning_rate": 1.4082846256288842e-05,
"loss": 0.3388,
"step": 297000
},
{
"epoch": 44.02,
"grad_norm": 0.3482591509819031,
"learning_rate": 1.4053945324060373e-05,
"loss": 0.3392,
"step": 297500
},
{
"epoch": 44.1,
"grad_norm": 0.39932170510292053,
"learning_rate": 1.4025044391831903e-05,
"loss": 0.3385,
"step": 298000
},
{
"epoch": 44.17,
"grad_norm": 0.3750057518482208,
"learning_rate": 1.3996143459603432e-05,
"loss": 0.3419,
"step": 298500
},
{
"epoch": 44.24,
"grad_norm": 0.3343985974788666,
"learning_rate": 1.3967242527374963e-05,
"loss": 0.3398,
"step": 299000
},
{
"epoch": 44.32,
"grad_norm": 0.32805758714675903,
"learning_rate": 1.3938341595146493e-05,
"loss": 0.339,
"step": 299500
},
{
"epoch": 44.39,
"grad_norm": 0.376280814409256,
"learning_rate": 1.3909440662918022e-05,
"loss": 0.3407,
"step": 300000
},
{
"epoch": 44.47,
"grad_norm": 0.27181968092918396,
"learning_rate": 1.3880539730689554e-05,
"loss": 0.3392,
"step": 300500
},
{
"epoch": 44.54,
"grad_norm": 0.3351672887802124,
"learning_rate": 1.3851638798461083e-05,
"loss": 0.3416,
"step": 301000
},
{
"epoch": 44.61,
"grad_norm": 0.3780210614204407,
"learning_rate": 1.3822737866232613e-05,
"loss": 0.3429,
"step": 301500
},
{
"epoch": 44.69,
"grad_norm": 0.3951726257801056,
"learning_rate": 1.3793836934004144e-05,
"loss": 0.3407,
"step": 302000
},
{
"epoch": 44.76,
"grad_norm": 0.3675825893878937,
"learning_rate": 1.3764936001775674e-05,
"loss": 0.339,
"step": 302500
},
{
"epoch": 44.84,
"grad_norm": 0.3629719913005829,
"learning_rate": 1.3736035069547205e-05,
"loss": 0.3431,
"step": 303000
},
{
"epoch": 44.91,
"grad_norm": 0.4170212745666504,
"learning_rate": 1.3707134137318734e-05,
"loss": 0.3394,
"step": 303500
},
{
"epoch": 44.98,
"grad_norm": 0.3839627206325531,
"learning_rate": 1.3678233205090264e-05,
"loss": 0.3422,
"step": 304000
},
{
"epoch": 45.06,
"grad_norm": 0.4395899176597595,
"learning_rate": 1.3649332272861795e-05,
"loss": 0.3402,
"step": 304500
},
{
"epoch": 45.13,
"grad_norm": 0.37044864892959595,
"learning_rate": 1.3620431340633325e-05,
"loss": 0.342,
"step": 305000
},
{
"epoch": 45.21,
"grad_norm": 0.4869653582572937,
"learning_rate": 1.3591530408404854e-05,
"loss": 0.3391,
"step": 305500
},
{
"epoch": 45.28,
"grad_norm": 0.416748046875,
"learning_rate": 1.3562629476176384e-05,
"loss": 0.3394,
"step": 306000
},
{
"epoch": 45.35,
"grad_norm": 0.37101954221725464,
"learning_rate": 1.3533728543947913e-05,
"loss": 0.3425,
"step": 306500
},
{
"epoch": 45.43,
"grad_norm": 0.3808073103427887,
"learning_rate": 1.3504827611719443e-05,
"loss": 0.3399,
"step": 307000
},
{
"epoch": 45.5,
"grad_norm": 0.3832837641239166,
"learning_rate": 1.3475926679490974e-05,
"loss": 0.3413,
"step": 307500
},
{
"epoch": 45.58,
"grad_norm": 0.3216901421546936,
"learning_rate": 1.3447025747262504e-05,
"loss": 0.3403,
"step": 308000
},
{
"epoch": 45.65,
"grad_norm": 0.36098387837409973,
"learning_rate": 1.3418124815034033e-05,
"loss": 0.3426,
"step": 308500
},
{
"epoch": 45.72,
"grad_norm": 0.5177834033966064,
"learning_rate": 1.3389223882805564e-05,
"loss": 0.3411,
"step": 309000
},
{
"epoch": 45.8,
"grad_norm": 0.41095811128616333,
"learning_rate": 1.3360322950577094e-05,
"loss": 0.3381,
"step": 309500
},
{
"epoch": 45.87,
"grad_norm": 0.38759657740592957,
"learning_rate": 1.3331422018348624e-05,
"loss": 0.3385,
"step": 310000
},
{
"epoch": 45.95,
"grad_norm": 0.34995037317276,
"learning_rate": 1.3302521086120155e-05,
"loss": 0.3387,
"step": 310500
},
{
"epoch": 46.02,
"grad_norm": 0.40866127610206604,
"learning_rate": 1.3273620153891684e-05,
"loss": 0.3406,
"step": 311000
},
{
"epoch": 46.09,
"grad_norm": 0.40558964014053345,
"learning_rate": 1.3244719221663214e-05,
"loss": 0.34,
"step": 311500
},
{
"epoch": 46.17,
"grad_norm": 0.3268815875053406,
"learning_rate": 1.3215818289434745e-05,
"loss": 0.3428,
"step": 312000
},
{
"epoch": 46.24,
"grad_norm": 0.4113875925540924,
"learning_rate": 1.3186917357206275e-05,
"loss": 0.3397,
"step": 312500
},
{
"epoch": 46.32,
"grad_norm": 0.3797847032546997,
"learning_rate": 1.3158016424977804e-05,
"loss": 0.3404,
"step": 313000
},
{
"epoch": 46.39,
"grad_norm": 0.3348693251609802,
"learning_rate": 1.3129115492749335e-05,
"loss": 0.34,
"step": 313500
},
{
"epoch": 46.46,
"grad_norm": 0.3879573941230774,
"learning_rate": 1.3100214560520865e-05,
"loss": 0.341,
"step": 314000
},
{
"epoch": 46.54,
"grad_norm": 0.40568268299102783,
"learning_rate": 1.3071313628292394e-05,
"loss": 0.3387,
"step": 314500
},
{
"epoch": 46.61,
"grad_norm": 0.4025332033634186,
"learning_rate": 1.3042412696063926e-05,
"loss": 0.3415,
"step": 315000
},
{
"epoch": 46.69,
"grad_norm": 0.38457706570625305,
"learning_rate": 1.3013511763835455e-05,
"loss": 0.3372,
"step": 315500
},
{
"epoch": 46.76,
"grad_norm": 0.36499732732772827,
"learning_rate": 1.2984610831606983e-05,
"loss": 0.3402,
"step": 316000
},
{
"epoch": 46.83,
"grad_norm": 0.5976847410202026,
"learning_rate": 1.2955709899378514e-05,
"loss": 0.3402,
"step": 316500
},
{
"epoch": 46.91,
"grad_norm": 0.38978490233421326,
"learning_rate": 1.2926808967150044e-05,
"loss": 0.3378,
"step": 317000
},
{
"epoch": 46.98,
"grad_norm": 0.4110495448112488,
"learning_rate": 1.2897908034921573e-05,
"loss": 0.339,
"step": 317500
},
{
"epoch": 47.06,
"grad_norm": 0.37475350499153137,
"learning_rate": 1.2869007102693105e-05,
"loss": 0.3402,
"step": 318000
},
{
"epoch": 47.13,
"grad_norm": 0.34179574251174927,
"learning_rate": 1.2840106170464634e-05,
"loss": 0.3397,
"step": 318500
},
{
"epoch": 47.2,
"grad_norm": 0.3006201982498169,
"learning_rate": 1.2811205238236164e-05,
"loss": 0.342,
"step": 319000
},
{
"epoch": 47.28,
"grad_norm": 0.38113319873809814,
"learning_rate": 1.2782304306007695e-05,
"loss": 0.341,
"step": 319500
},
{
"epoch": 47.35,
"grad_norm": 0.48638325929641724,
"learning_rate": 1.2753403373779225e-05,
"loss": 0.3403,
"step": 320000
},
{
"epoch": 47.43,
"grad_norm": 0.38058334589004517,
"learning_rate": 1.2724502441550754e-05,
"loss": 0.3401,
"step": 320500
},
{
"epoch": 47.5,
"grad_norm": 0.3784169554710388,
"learning_rate": 1.2695601509322285e-05,
"loss": 0.3378,
"step": 321000
},
{
"epoch": 47.57,
"grad_norm": 0.3679274916648865,
"learning_rate": 1.2666700577093815e-05,
"loss": 0.3388,
"step": 321500
},
{
"epoch": 47.65,
"grad_norm": 0.4005269706249237,
"learning_rate": 1.2637799644865346e-05,
"loss": 0.3382,
"step": 322000
},
{
"epoch": 47.72,
"grad_norm": 0.36187744140625,
"learning_rate": 1.2608898712636876e-05,
"loss": 0.3372,
"step": 322500
},
{
"epoch": 47.8,
"grad_norm": 0.4188271164894104,
"learning_rate": 1.2579997780408405e-05,
"loss": 0.339,
"step": 323000
},
{
"epoch": 47.87,
"grad_norm": 0.40314343571662903,
"learning_rate": 1.2551096848179936e-05,
"loss": 0.3404,
"step": 323500
},
{
"epoch": 47.94,
"grad_norm": 0.4233045279979706,
"learning_rate": 1.2522195915951466e-05,
"loss": 0.3398,
"step": 324000
},
{
"epoch": 48.02,
"grad_norm": 0.3816595673561096,
"learning_rate": 1.2493294983722996e-05,
"loss": 0.3382,
"step": 324500
},
{
"epoch": 48.09,
"grad_norm": 0.4000893831253052,
"learning_rate": 1.2464394051494527e-05,
"loss": 0.3402,
"step": 325000
},
{
"epoch": 48.17,
"grad_norm": 0.4130527377128601,
"learning_rate": 1.2435493119266055e-05,
"loss": 0.3394,
"step": 325500
},
{
"epoch": 48.24,
"grad_norm": 0.38113564252853394,
"learning_rate": 1.2406592187037584e-05,
"loss": 0.3365,
"step": 326000
},
{
"epoch": 48.31,
"grad_norm": 0.346966415643692,
"learning_rate": 1.2377691254809115e-05,
"loss": 0.3395,
"step": 326500
},
{
"epoch": 48.39,
"grad_norm": 0.4276494085788727,
"learning_rate": 1.2348790322580645e-05,
"loss": 0.3362,
"step": 327000
},
{
"epoch": 48.46,
"grad_norm": 0.39347225427627563,
"learning_rate": 1.2319889390352175e-05,
"loss": 0.3422,
"step": 327500
},
{
"epoch": 48.54,
"grad_norm": 0.3483811616897583,
"learning_rate": 1.2290988458123706e-05,
"loss": 0.3395,
"step": 328000
},
{
"epoch": 48.61,
"grad_norm": 0.36153608560562134,
"learning_rate": 1.2262087525895235e-05,
"loss": 0.3365,
"step": 328500
},
{
"epoch": 48.68,
"grad_norm": 0.39289888739585876,
"learning_rate": 1.2233186593666765e-05,
"loss": 0.3421,
"step": 329000
},
{
"epoch": 48.76,
"grad_norm": 0.4176575839519501,
"learning_rate": 1.2204285661438296e-05,
"loss": 0.3364,
"step": 329500
},
{
"epoch": 48.83,
"grad_norm": 0.3840237855911255,
"learning_rate": 1.2175384729209826e-05,
"loss": 0.3357,
"step": 330000
},
{
"epoch": 48.91,
"grad_norm": 0.44171571731567383,
"learning_rate": 1.2146483796981355e-05,
"loss": 0.3413,
"step": 330500
},
{
"epoch": 48.98,
"grad_norm": 0.42055392265319824,
"learning_rate": 1.2117582864752886e-05,
"loss": 0.335,
"step": 331000
},
{
"epoch": 49.05,
"grad_norm": 0.44252675771713257,
"learning_rate": 1.2088681932524416e-05,
"loss": 0.3385,
"step": 331500
},
{
"epoch": 49.13,
"grad_norm": 0.378095805644989,
"learning_rate": 1.2059781000295946e-05,
"loss": 0.3413,
"step": 332000
},
{
"epoch": 49.2,
"grad_norm": 0.3892216980457306,
"learning_rate": 1.2030880068067477e-05,
"loss": 0.3374,
"step": 332500
},
{
"epoch": 49.27,
"grad_norm": 0.3788653612136841,
"learning_rate": 1.2001979135839006e-05,
"loss": 0.3398,
"step": 333000
},
{
"epoch": 49.35,
"grad_norm": 0.38030126690864563,
"learning_rate": 1.1973078203610536e-05,
"loss": 0.3375,
"step": 333500
},
{
"epoch": 49.42,
"grad_norm": 0.4031144082546234,
"learning_rate": 1.1944177271382067e-05,
"loss": 0.3398,
"step": 334000
},
{
"epoch": 49.5,
"grad_norm": 0.3956519663333893,
"learning_rate": 1.1915276339153597e-05,
"loss": 0.3374,
"step": 334500
},
{
"epoch": 49.57,
"grad_norm": 0.3961743414402008,
"learning_rate": 1.1886375406925124e-05,
"loss": 0.3349,
"step": 335000
},
{
"epoch": 49.64,
"grad_norm": 0.3616986572742462,
"learning_rate": 1.1857474474696656e-05,
"loss": 0.3374,
"step": 335500
},
{
"epoch": 49.72,
"grad_norm": 0.36143624782562256,
"learning_rate": 1.1828573542468185e-05,
"loss": 0.3361,
"step": 336000
},
{
"epoch": 49.79,
"grad_norm": 0.389981746673584,
"learning_rate": 1.1799672610239715e-05,
"loss": 0.3378,
"step": 336500
},
{
"epoch": 49.87,
"grad_norm": 0.4078088104724884,
"learning_rate": 1.1770771678011246e-05,
"loss": 0.3382,
"step": 337000
},
{
"epoch": 49.94,
"grad_norm": 0.3802012801170349,
"learning_rate": 1.1741870745782776e-05,
"loss": 0.3356,
"step": 337500
},
{
"epoch": 50.01,
"grad_norm": 0.46680396795272827,
"learning_rate": 1.1712969813554305e-05,
"loss": 0.339,
"step": 338000
},
{
"epoch": 50.09,
"grad_norm": 0.45273512601852417,
"learning_rate": 1.1684068881325836e-05,
"loss": 0.3381,
"step": 338500
},
{
"epoch": 50.16,
"grad_norm": 0.3563522398471832,
"learning_rate": 1.1655167949097366e-05,
"loss": 0.3371,
"step": 339000
},
{
"epoch": 50.24,
"grad_norm": 0.43655216693878174,
"learning_rate": 1.1626267016868895e-05,
"loss": 0.3354,
"step": 339500
},
{
"epoch": 50.31,
"grad_norm": 0.3371317982673645,
"learning_rate": 1.1597366084640427e-05,
"loss": 0.3374,
"step": 340000
},
{
"epoch": 50.38,
"grad_norm": 0.39056339859962463,
"learning_rate": 1.1568465152411956e-05,
"loss": 0.3376,
"step": 340500
},
{
"epoch": 50.46,
"grad_norm": 0.40476441383361816,
"learning_rate": 1.1539564220183487e-05,
"loss": 0.3381,
"step": 341000
},
{
"epoch": 50.53,
"grad_norm": 0.3706866502761841,
"learning_rate": 1.1510663287955017e-05,
"loss": 0.3355,
"step": 341500
},
{
"epoch": 50.61,
"grad_norm": 0.43677544593811035,
"learning_rate": 1.1481762355726547e-05,
"loss": 0.3352,
"step": 342000
},
{
"epoch": 50.68,
"grad_norm": 0.3938286602497101,
"learning_rate": 1.1452861423498078e-05,
"loss": 0.3375,
"step": 342500
},
{
"epoch": 50.75,
"grad_norm": 0.3463038504123688,
"learning_rate": 1.1423960491269607e-05,
"loss": 0.3379,
"step": 343000
},
{
"epoch": 50.83,
"grad_norm": 0.3810366988182068,
"learning_rate": 1.1395059559041137e-05,
"loss": 0.3367,
"step": 343500
},
{
"epoch": 50.9,
"grad_norm": 0.3845095932483673,
"learning_rate": 1.1366158626812668e-05,
"loss": 0.3366,
"step": 344000
},
{
"epoch": 50.98,
"grad_norm": 0.5161909461021423,
"learning_rate": 1.1337257694584198e-05,
"loss": 0.3382,
"step": 344500
},
{
"epoch": 51.05,
"grad_norm": 0.4319625794887543,
"learning_rate": 1.1308356762355726e-05,
"loss": 0.3359,
"step": 345000
},
{
"epoch": 51.12,
"grad_norm": 0.34908732771873474,
"learning_rate": 1.1279455830127257e-05,
"loss": 0.3353,
"step": 345500
},
{
"epoch": 51.2,
"grad_norm": 0.38367515802383423,
"learning_rate": 1.1250554897898786e-05,
"loss": 0.3367,
"step": 346000
},
{
"epoch": 51.27,
"grad_norm": 0.3939116597175598,
"learning_rate": 1.1221653965670316e-05,
"loss": 0.3374,
"step": 346500
},
{
"epoch": 51.35,
"grad_norm": 0.44843488931655884,
"learning_rate": 1.1192753033441847e-05,
"loss": 0.3376,
"step": 347000
},
{
"epoch": 51.42,
"grad_norm": 0.4169764816761017,
"learning_rate": 1.1163852101213377e-05,
"loss": 0.3385,
"step": 347500
},
{
"epoch": 51.49,
"grad_norm": 0.3487055003643036,
"learning_rate": 1.1134951168984906e-05,
"loss": 0.3372,
"step": 348000
},
{
"epoch": 51.57,
"grad_norm": 0.3876706063747406,
"learning_rate": 1.1106050236756437e-05,
"loss": 0.3379,
"step": 348500
},
{
"epoch": 51.64,
"grad_norm": 0.33344700932502747,
"learning_rate": 1.1077149304527967e-05,
"loss": 0.3389,
"step": 349000
},
{
"epoch": 51.72,
"grad_norm": 0.41183948516845703,
"learning_rate": 1.1048248372299497e-05,
"loss": 0.3363,
"step": 349500
},
{
"epoch": 51.79,
"grad_norm": 0.3549967110157013,
"learning_rate": 1.1019347440071028e-05,
"loss": 0.3374,
"step": 350000
},
{
"epoch": 51.86,
"grad_norm": 0.4144490659236908,
"learning_rate": 1.0990446507842557e-05,
"loss": 0.3347,
"step": 350500
},
{
"epoch": 51.94,
"grad_norm": 0.3781343400478363,
"learning_rate": 1.0961545575614087e-05,
"loss": 0.3365,
"step": 351000
},
{
"epoch": 52.01,
"grad_norm": 0.4050437808036804,
"learning_rate": 1.0932644643385618e-05,
"loss": 0.3384,
"step": 351500
},
{
"epoch": 52.09,
"grad_norm": 0.3758808374404907,
"learning_rate": 1.0903743711157148e-05,
"loss": 0.3382,
"step": 352000
},
{
"epoch": 52.16,
"grad_norm": 0.456534206867218,
"learning_rate": 1.0874842778928677e-05,
"loss": 0.3392,
"step": 352500
},
{
"epoch": 52.23,
"grad_norm": 0.38857728242874146,
"learning_rate": 1.0845941846700208e-05,
"loss": 0.3374,
"step": 353000
},
{
"epoch": 52.31,
"grad_norm": 0.39419788122177124,
"learning_rate": 1.0817040914471738e-05,
"loss": 0.3374,
"step": 353500
},
{
"epoch": 52.38,
"grad_norm": 0.41852855682373047,
"learning_rate": 1.0788139982243268e-05,
"loss": 0.3335,
"step": 354000
},
{
"epoch": 52.46,
"grad_norm": 0.3561359941959381,
"learning_rate": 1.0759239050014797e-05,
"loss": 0.3359,
"step": 354500
},
{
"epoch": 52.53,
"grad_norm": 0.3975025713443756,
"learning_rate": 1.0730338117786327e-05,
"loss": 0.336,
"step": 355000
},
{
"epoch": 52.6,
"grad_norm": 0.39150169491767883,
"learning_rate": 1.0701437185557856e-05,
"loss": 0.337,
"step": 355500
},
{
"epoch": 52.68,
"grad_norm": 0.404354453086853,
"learning_rate": 1.0672536253329387e-05,
"loss": 0.3378,
"step": 356000
},
{
"epoch": 52.75,
"grad_norm": 0.3414269685745239,
"learning_rate": 1.0643635321100917e-05,
"loss": 0.3338,
"step": 356500
},
{
"epoch": 52.83,
"grad_norm": 0.4378945827484131,
"learning_rate": 1.0614734388872446e-05,
"loss": 0.3369,
"step": 357000
},
{
"epoch": 52.9,
"grad_norm": 0.5136425495147705,
"learning_rate": 1.0585833456643978e-05,
"loss": 0.3348,
"step": 357500
},
{
"epoch": 52.97,
"grad_norm": 0.3793259263038635,
"learning_rate": 1.0556932524415507e-05,
"loss": 0.3354,
"step": 358000
},
{
"epoch": 53.05,
"grad_norm": 0.3828275203704834,
"learning_rate": 1.0528031592187039e-05,
"loss": 0.3348,
"step": 358500
},
{
"epoch": 53.12,
"grad_norm": 0.380776971578598,
"learning_rate": 1.0499130659958568e-05,
"loss": 0.3375,
"step": 359000
},
{
"epoch": 53.2,
"grad_norm": 0.40259137749671936,
"learning_rate": 1.0470229727730098e-05,
"loss": 0.3366,
"step": 359500
},
{
"epoch": 53.27,
"grad_norm": 0.3794288635253906,
"learning_rate": 1.0441328795501629e-05,
"loss": 0.3343,
"step": 360000
},
{
"epoch": 53.34,
"grad_norm": 0.44558051228523254,
"learning_rate": 1.0412427863273158e-05,
"loss": 0.3355,
"step": 360500
},
{
"epoch": 53.42,
"grad_norm": 0.42926931381225586,
"learning_rate": 1.0383526931044688e-05,
"loss": 0.3368,
"step": 361000
},
{
"epoch": 53.49,
"grad_norm": 0.3846406936645508,
"learning_rate": 1.035462599881622e-05,
"loss": 0.3363,
"step": 361500
},
{
"epoch": 53.57,
"grad_norm": 0.43000903725624084,
"learning_rate": 1.0325725066587749e-05,
"loss": 0.3338,
"step": 362000
},
{
"epoch": 53.64,
"grad_norm": 0.42310836911201477,
"learning_rate": 1.0296824134359278e-05,
"loss": 0.336,
"step": 362500
},
{
"epoch": 53.71,
"grad_norm": 0.3451327681541443,
"learning_rate": 1.026792320213081e-05,
"loss": 0.3384,
"step": 363000
},
{
"epoch": 53.79,
"grad_norm": 0.4068630337715149,
"learning_rate": 1.0239022269902339e-05,
"loss": 0.3389,
"step": 363500
},
{
"epoch": 53.86,
"grad_norm": 0.36988091468811035,
"learning_rate": 1.0210121337673869e-05,
"loss": 0.3368,
"step": 364000
},
{
"epoch": 53.94,
"grad_norm": 0.37670448422431946,
"learning_rate": 1.0181220405445398e-05,
"loss": 0.3361,
"step": 364500
},
{
"epoch": 54.01,
"grad_norm": 0.4235304296016693,
"learning_rate": 1.0152319473216928e-05,
"loss": 0.3339,
"step": 365000
},
{
"epoch": 54.08,
"grad_norm": 0.4179520606994629,
"learning_rate": 1.0123418540988457e-05,
"loss": 0.3372,
"step": 365500
},
{
"epoch": 54.16,
"grad_norm": 0.3763734996318817,
"learning_rate": 1.0094517608759988e-05,
"loss": 0.3368,
"step": 366000
},
{
"epoch": 54.23,
"grad_norm": 0.4098796844482422,
"learning_rate": 1.0065616676531518e-05,
"loss": 0.3326,
"step": 366500
},
{
"epoch": 54.31,
"grad_norm": 0.41570228338241577,
"learning_rate": 1.0036715744303048e-05,
"loss": 0.3366,
"step": 367000
},
{
"epoch": 54.38,
"grad_norm": 0.38217049837112427,
"learning_rate": 1.0007814812074579e-05,
"loss": 0.3338,
"step": 367500
},
{
"epoch": 54.45,
"grad_norm": 0.36770564317703247,
"learning_rate": 9.978913879846108e-06,
"loss": 0.3323,
"step": 368000
},
{
"epoch": 54.53,
"grad_norm": 0.43568935990333557,
"learning_rate": 9.950012947617638e-06,
"loss": 0.3361,
"step": 368500
},
{
"epoch": 54.6,
"grad_norm": 0.47602441906929016,
"learning_rate": 9.921112015389169e-06,
"loss": 0.3349,
"step": 369000
},
{
"epoch": 54.68,
"grad_norm": 0.4022866487503052,
"learning_rate": 9.892211083160699e-06,
"loss": 0.3347,
"step": 369500
},
{
"epoch": 54.75,
"grad_norm": 0.3981685936450958,
"learning_rate": 9.863310150932228e-06,
"loss": 0.3351,
"step": 370000
},
{
"epoch": 54.82,
"grad_norm": 0.3706594705581665,
"learning_rate": 9.83440921870376e-06,
"loss": 0.3342,
"step": 370500
},
{
"epoch": 54.9,
"grad_norm": 0.36316171288490295,
"learning_rate": 9.805508286475289e-06,
"loss": 0.337,
"step": 371000
},
{
"epoch": 54.97,
"grad_norm": 0.3705138564109802,
"learning_rate": 9.776607354246819e-06,
"loss": 0.3358,
"step": 371500
},
{
"epoch": 55.05,
"grad_norm": 0.4264328181743622,
"learning_rate": 9.74770642201835e-06,
"loss": 0.3349,
"step": 372000
},
{
"epoch": 55.12,
"grad_norm": 0.39624592661857605,
"learning_rate": 9.71880548978988e-06,
"loss": 0.3327,
"step": 372500
},
{
"epoch": 55.19,
"grad_norm": 0.41520076990127563,
"learning_rate": 9.689904557561409e-06,
"loss": 0.3363,
"step": 373000
},
{
"epoch": 55.27,
"grad_norm": 0.37249574065208435,
"learning_rate": 9.66100362533294e-06,
"loss": 0.335,
"step": 373500
},
{
"epoch": 55.34,
"grad_norm": 0.42657721042633057,
"learning_rate": 9.632102693104468e-06,
"loss": 0.3353,
"step": 374000
},
{
"epoch": 55.42,
"grad_norm": 0.3780669569969177,
"learning_rate": 9.603201760875998e-06,
"loss": 0.337,
"step": 374500
},
{
"epoch": 55.49,
"grad_norm": 0.3783871829509735,
"learning_rate": 9.574300828647529e-06,
"loss": 0.3348,
"step": 375000
},
{
"epoch": 55.56,
"grad_norm": 0.4328089952468872,
"learning_rate": 9.545399896419058e-06,
"loss": 0.3366,
"step": 375500
},
{
"epoch": 55.64,
"grad_norm": 0.3957238793373108,
"learning_rate": 9.516498964190588e-06,
"loss": 0.3344,
"step": 376000
},
{
"epoch": 55.71,
"grad_norm": 0.3606773614883423,
"learning_rate": 9.487598031962119e-06,
"loss": 0.3342,
"step": 376500
},
{
"epoch": 55.79,
"grad_norm": 0.4170531928539276,
"learning_rate": 9.458697099733649e-06,
"loss": 0.3349,
"step": 377000
},
{
"epoch": 55.86,
"grad_norm": 0.3830915093421936,
"learning_rate": 9.42979616750518e-06,
"loss": 0.3371,
"step": 377500
},
{
"epoch": 55.93,
"grad_norm": 0.4350239634513855,
"learning_rate": 9.40089523527671e-06,
"loss": 0.3377,
"step": 378000
},
{
"epoch": 56.01,
"grad_norm": 0.37382885813713074,
"learning_rate": 9.371994303048239e-06,
"loss": 0.3362,
"step": 378500
},
{
"epoch": 56.08,
"grad_norm": 0.3806856870651245,
"learning_rate": 9.34309337081977e-06,
"loss": 0.3347,
"step": 379000
},
{
"epoch": 56.16,
"grad_norm": 0.3189554214477539,
"learning_rate": 9.3141924385913e-06,
"loss": 0.3363,
"step": 379500
},
{
"epoch": 56.23,
"grad_norm": 0.33894240856170654,
"learning_rate": 9.28529150636283e-06,
"loss": 0.3362,
"step": 380000
},
{
"epoch": 56.3,
"grad_norm": 0.4565516710281372,
"learning_rate": 9.25639057413436e-06,
"loss": 0.3331,
"step": 380500
},
{
"epoch": 56.38,
"grad_norm": 0.4101388156414032,
"learning_rate": 9.22748964190589e-06,
"loss": 0.335,
"step": 381000
},
{
"epoch": 56.45,
"grad_norm": 0.40449845790863037,
"learning_rate": 9.19858870967742e-06,
"loss": 0.3337,
"step": 381500
},
{
"epoch": 56.53,
"grad_norm": 0.47349539399147034,
"learning_rate": 9.16968777744895e-06,
"loss": 0.3328,
"step": 382000
},
{
"epoch": 56.6,
"grad_norm": 0.42848438024520874,
"learning_rate": 9.14078684522048e-06,
"loss": 0.334,
"step": 382500
},
{
"epoch": 56.67,
"grad_norm": 0.3625510334968567,
"learning_rate": 9.11188591299201e-06,
"loss": 0.3321,
"step": 383000
},
{
"epoch": 56.75,
"grad_norm": 0.3561297357082367,
"learning_rate": 9.082984980763541e-06,
"loss": 0.3349,
"step": 383500
},
{
"epoch": 56.82,
"grad_norm": 0.3738841414451599,
"learning_rate": 9.054084048535069e-06,
"loss": 0.3366,
"step": 384000
},
{
"epoch": 56.9,
"grad_norm": 0.33738991618156433,
"learning_rate": 9.025183116306599e-06,
"loss": 0.3327,
"step": 384500
},
{
"epoch": 56.97,
"grad_norm": 0.42749759554862976,
"learning_rate": 8.99628218407813e-06,
"loss": 0.336,
"step": 385000
},
{
"epoch": 57.04,
"grad_norm": 0.4089387059211731,
"learning_rate": 8.96738125184966e-06,
"loss": 0.3334,
"step": 385500
},
{
"epoch": 57.12,
"grad_norm": 0.3684140145778656,
"learning_rate": 8.938480319621189e-06,
"loss": 0.3345,
"step": 386000
},
{
"epoch": 57.19,
"grad_norm": 0.3694292902946472,
"learning_rate": 8.90957938739272e-06,
"loss": 0.3333,
"step": 386500
},
{
"epoch": 57.27,
"grad_norm": 0.31505081057548523,
"learning_rate": 8.88067845516425e-06,
"loss": 0.3339,
"step": 387000
},
{
"epoch": 57.34,
"grad_norm": 0.4051445722579956,
"learning_rate": 8.85177752293578e-06,
"loss": 0.3348,
"step": 387500
},
{
"epoch": 57.41,
"grad_norm": 0.426145076751709,
"learning_rate": 8.82287659070731e-06,
"loss": 0.3307,
"step": 388000
},
{
"epoch": 57.49,
"grad_norm": 0.4356764256954193,
"learning_rate": 8.79397565847884e-06,
"loss": 0.3336,
"step": 388500
},
{
"epoch": 57.56,
"grad_norm": 0.39635592699050903,
"learning_rate": 8.76507472625037e-06,
"loss": 0.3355,
"step": 389000
},
{
"epoch": 57.64,
"grad_norm": 0.4467043876647949,
"learning_rate": 8.7361737940219e-06,
"loss": 0.3369,
"step": 389500
},
{
"epoch": 57.71,
"grad_norm": 0.5042401552200317,
"learning_rate": 8.70727286179343e-06,
"loss": 0.3352,
"step": 390000
},
{
"epoch": 57.78,
"grad_norm": 0.38742733001708984,
"learning_rate": 8.67837192956496e-06,
"loss": 0.3349,
"step": 390500
},
{
"epoch": 57.86,
"grad_norm": 0.35748493671417236,
"learning_rate": 8.649470997336491e-06,
"loss": 0.3331,
"step": 391000
},
{
"epoch": 57.93,
"grad_norm": 0.406547486782074,
"learning_rate": 8.62057006510802e-06,
"loss": 0.3345,
"step": 391500
},
{
"epoch": 58.01,
"grad_norm": 0.37016528844833374,
"learning_rate": 8.59166913287955e-06,
"loss": 0.3338,
"step": 392000
},
{
"epoch": 58.08,
"grad_norm": 0.39589524269104004,
"learning_rate": 8.562768200651081e-06,
"loss": 0.3334,
"step": 392500
},
{
"epoch": 58.15,
"grad_norm": 0.42654627561569214,
"learning_rate": 8.533867268422611e-06,
"loss": 0.3336,
"step": 393000
},
{
"epoch": 58.23,
"grad_norm": 0.4174553453922272,
"learning_rate": 8.504966336194139e-06,
"loss": 0.3339,
"step": 393500
},
{
"epoch": 58.3,
"grad_norm": 0.43379977345466614,
"learning_rate": 8.47606540396567e-06,
"loss": 0.3329,
"step": 394000
},
{
"epoch": 58.38,
"grad_norm": 0.3706502914428711,
"learning_rate": 8.4471644717372e-06,
"loss": 0.332,
"step": 394500
},
{
"epoch": 58.45,
"grad_norm": 0.4529905319213867,
"learning_rate": 8.41826353950873e-06,
"loss": 0.3342,
"step": 395000
},
{
"epoch": 58.52,
"grad_norm": 0.4060870110988617,
"learning_rate": 8.38936260728026e-06,
"loss": 0.3331,
"step": 395500
},
{
"epoch": 58.6,
"grad_norm": 0.4102860689163208,
"learning_rate": 8.36046167505179e-06,
"loss": 0.3339,
"step": 396000
},
{
"epoch": 58.67,
"grad_norm": 0.38025009632110596,
"learning_rate": 8.331560742823321e-06,
"loss": 0.3334,
"step": 396500
},
{
"epoch": 58.75,
"grad_norm": 0.3559959828853607,
"learning_rate": 8.30265981059485e-06,
"loss": 0.334,
"step": 397000
},
{
"epoch": 58.82,
"grad_norm": 0.48199519515037537,
"learning_rate": 8.27375887836638e-06,
"loss": 0.3328,
"step": 397500
},
{
"epoch": 58.89,
"grad_norm": 0.40932905673980713,
"learning_rate": 8.244857946137912e-06,
"loss": 0.3314,
"step": 398000
},
{
"epoch": 58.97,
"grad_norm": 0.4070405960083008,
"learning_rate": 8.215957013909441e-06,
"loss": 0.3354,
"step": 398500
},
{
"epoch": 59.04,
"grad_norm": 0.392281711101532,
"learning_rate": 8.18705608168097e-06,
"loss": 0.3324,
"step": 399000
},
{
"epoch": 59.12,
"grad_norm": 0.38242244720458984,
"learning_rate": 8.158155149452502e-06,
"loss": 0.3313,
"step": 399500
},
{
"epoch": 59.19,
"grad_norm": 0.4169810712337494,
"learning_rate": 8.129254217224031e-06,
"loss": 0.3354,
"step": 400000
},
{
"epoch": 59.26,
"grad_norm": 0.335362046957016,
"learning_rate": 8.100353284995561e-06,
"loss": 0.3312,
"step": 400500
},
{
"epoch": 59.34,
"grad_norm": 0.41095077991485596,
"learning_rate": 8.071452352767092e-06,
"loss": 0.3331,
"step": 401000
},
{
"epoch": 59.41,
"grad_norm": 0.39492741227149963,
"learning_rate": 8.042551420538622e-06,
"loss": 0.3314,
"step": 401500
},
{
"epoch": 59.49,
"grad_norm": 0.42789730429649353,
"learning_rate": 8.013650488310151e-06,
"loss": 0.333,
"step": 402000
},
{
"epoch": 59.56,
"grad_norm": 0.35511842370033264,
"learning_rate": 7.984749556081683e-06,
"loss": 0.3346,
"step": 402500
},
{
"epoch": 59.63,
"grad_norm": 0.36928626894950867,
"learning_rate": 7.95584862385321e-06,
"loss": 0.335,
"step": 403000
},
{
"epoch": 59.71,
"grad_norm": 0.4076744318008423,
"learning_rate": 7.92694769162474e-06,
"loss": 0.3294,
"step": 403500
},
{
"epoch": 59.78,
"grad_norm": 0.35494473576545715,
"learning_rate": 7.898046759396271e-06,
"loss": 0.3336,
"step": 404000
},
{
"epoch": 59.85,
"grad_norm": 0.3991703689098358,
"learning_rate": 7.8691458271678e-06,
"loss": 0.3294,
"step": 404500
},
{
"epoch": 59.93,
"grad_norm": 0.3891808092594147,
"learning_rate": 7.84024489493933e-06,
"loss": 0.3349,
"step": 405000
},
{
"epoch": 60.0,
"grad_norm": 0.5921450257301331,
"learning_rate": 7.811343962710861e-06,
"loss": 0.3331,
"step": 405500
},
{
"epoch": 60.08,
"grad_norm": 0.387185275554657,
"learning_rate": 7.782443030482391e-06,
"loss": 0.3326,
"step": 406000
},
{
"epoch": 60.15,
"grad_norm": 0.5411362648010254,
"learning_rate": 7.75354209825392e-06,
"loss": 0.3303,
"step": 406500
},
{
"epoch": 60.22,
"grad_norm": 0.35113802552223206,
"learning_rate": 7.724641166025452e-06,
"loss": 0.3343,
"step": 407000
},
{
"epoch": 60.3,
"grad_norm": 0.3711684048175812,
"learning_rate": 7.695740233796981e-06,
"loss": 0.3316,
"step": 407500
},
{
"epoch": 60.37,
"grad_norm": 0.40576910972595215,
"learning_rate": 7.666839301568511e-06,
"loss": 0.3344,
"step": 408000
},
{
"epoch": 60.45,
"grad_norm": 0.4487907588481903,
"learning_rate": 7.637938369340042e-06,
"loss": 0.3337,
"step": 408500
},
{
"epoch": 60.52,
"grad_norm": 0.4065958857536316,
"learning_rate": 7.609037437111572e-06,
"loss": 0.3314,
"step": 409000
},
{
"epoch": 60.59,
"grad_norm": 0.4283113479614258,
"learning_rate": 7.580136504883102e-06,
"loss": 0.3337,
"step": 409500
},
{
"epoch": 60.67,
"grad_norm": 0.4433044493198395,
"learning_rate": 7.5512355726546325e-06,
"loss": 0.3317,
"step": 410000
},
{
"epoch": 60.74,
"grad_norm": 0.38607364892959595,
"learning_rate": 7.522334640426161e-06,
"loss": 0.333,
"step": 410500
},
{
"epoch": 60.82,
"grad_norm": 0.45367687940597534,
"learning_rate": 7.4934337081976916e-06,
"loss": 0.3298,
"step": 411000
},
{
"epoch": 60.89,
"grad_norm": 0.4054895043373108,
"learning_rate": 7.464532775969222e-06,
"loss": 0.3318,
"step": 411500
},
{
"epoch": 60.96,
"grad_norm": 0.41600409150123596,
"learning_rate": 7.4356318437407515e-06,
"loss": 0.3313,
"step": 412000
},
{
"epoch": 61.04,
"grad_norm": 0.4171212911605835,
"learning_rate": 7.406730911512282e-06,
"loss": 0.3318,
"step": 412500
},
{
"epoch": 61.11,
"grad_norm": 0.40264466404914856,
"learning_rate": 7.377829979283812e-06,
"loss": 0.3335,
"step": 413000
},
{
"epoch": 61.19,
"grad_norm": 0.37919875979423523,
"learning_rate": 7.348929047055342e-06,
"loss": 0.3324,
"step": 413500
},
{
"epoch": 61.26,
"grad_norm": 0.47246700525283813,
"learning_rate": 7.320028114826872e-06,
"loss": 0.3341,
"step": 414000
},
{
"epoch": 61.33,
"grad_norm": 0.4305689036846161,
"learning_rate": 7.291127182598403e-06,
"loss": 0.3335,
"step": 414500
},
{
"epoch": 61.41,
"grad_norm": 0.38494426012039185,
"learning_rate": 7.262226250369932e-06,
"loss": 0.3337,
"step": 415000
},
{
"epoch": 61.48,
"grad_norm": 0.45139452815055847,
"learning_rate": 7.233325318141462e-06,
"loss": 0.3322,
"step": 415500
},
{
"epoch": 61.56,
"grad_norm": 0.4199995994567871,
"learning_rate": 7.204424385912992e-06,
"loss": 0.3302,
"step": 416000
},
{
"epoch": 61.63,
"grad_norm": 0.3823252022266388,
"learning_rate": 7.175523453684522e-06,
"loss": 0.333,
"step": 416500
},
{
"epoch": 61.7,
"grad_norm": 0.38762542605400085,
"learning_rate": 7.146622521456052e-06,
"loss": 0.3338,
"step": 417000
},
{
"epoch": 61.78,
"grad_norm": 0.3889346718788147,
"learning_rate": 7.117721589227582e-06,
"loss": 0.3333,
"step": 417500
},
{
"epoch": 61.85,
"grad_norm": 0.43703803420066833,
"learning_rate": 7.088820656999113e-06,
"loss": 0.3313,
"step": 418000
},
{
"epoch": 61.93,
"grad_norm": 0.37083032727241516,
"learning_rate": 7.059919724770642e-06,
"loss": 0.3327,
"step": 418500
},
{
"epoch": 62.0,
"grad_norm": 0.431436687707901,
"learning_rate": 7.031018792542173e-06,
"loss": 0.3275,
"step": 419000
},
{
"epoch": 62.07,
"grad_norm": 0.38710957765579224,
"learning_rate": 7.002117860313703e-06,
"loss": 0.3315,
"step": 419500
},
{
"epoch": 62.15,
"grad_norm": 0.4548743963241577,
"learning_rate": 6.973216928085232e-06,
"loss": 0.3314,
"step": 420000
},
{
"epoch": 62.22,
"grad_norm": 0.4413709342479706,
"learning_rate": 6.944315995856762e-06,
"loss": 0.3317,
"step": 420500
},
{
"epoch": 62.3,
"grad_norm": 0.42544716596603394,
"learning_rate": 6.915415063628293e-06,
"loss": 0.3327,
"step": 421000
},
{
"epoch": 62.37,
"grad_norm": 0.4307864010334015,
"learning_rate": 6.886514131399822e-06,
"loss": 0.3335,
"step": 421500
},
{
"epoch": 62.44,
"grad_norm": 0.4296441376209259,
"learning_rate": 6.8576131991713526e-06,
"loss": 0.3317,
"step": 422000
},
{
"epoch": 62.52,
"grad_norm": 0.3624299466609955,
"learning_rate": 6.828712266942883e-06,
"loss": 0.3307,
"step": 422500
},
{
"epoch": 62.59,
"grad_norm": 0.4123700261116028,
"learning_rate": 6.7998113347144125e-06,
"loss": 0.3317,
"step": 423000
},
{
"epoch": 62.67,
"grad_norm": 0.4546355903148651,
"learning_rate": 6.770910402485943e-06,
"loss": 0.3288,
"step": 423500
},
{
"epoch": 62.74,
"grad_norm": 0.4328787922859192,
"learning_rate": 6.742009470257473e-06,
"loss": 0.3321,
"step": 424000
},
{
"epoch": 62.81,
"grad_norm": 0.39879125356674194,
"learning_rate": 6.713108538029003e-06,
"loss": 0.334,
"step": 424500
},
{
"epoch": 62.89,
"grad_norm": 0.42407459020614624,
"learning_rate": 6.684207605800532e-06,
"loss": 0.3312,
"step": 425000
},
{
"epoch": 62.96,
"grad_norm": 0.5664127469062805,
"learning_rate": 6.655306673572063e-06,
"loss": 0.3323,
"step": 425500
},
{
"epoch": 63.04,
"grad_norm": 0.47169846296310425,
"learning_rate": 6.626405741343592e-06,
"loss": 0.3309,
"step": 426000
},
{
"epoch": 63.11,
"grad_norm": 0.3552204668521881,
"learning_rate": 6.597504809115123e-06,
"loss": 0.33,
"step": 426500
},
{
"epoch": 63.18,
"grad_norm": 0.44585150480270386,
"learning_rate": 6.568603876886653e-06,
"loss": 0.3306,
"step": 427000
},
{
"epoch": 63.26,
"grad_norm": 0.4512608051300049,
"learning_rate": 6.5397029446581835e-06,
"loss": 0.3308,
"step": 427500
},
{
"epoch": 63.33,
"grad_norm": 0.40121740102767944,
"learning_rate": 6.510802012429713e-06,
"loss": 0.3302,
"step": 428000
},
{
"epoch": 63.41,
"grad_norm": 0.4354041516780853,
"learning_rate": 6.481901080201243e-06,
"loss": 0.3327,
"step": 428500
},
{
"epoch": 63.48,
"grad_norm": 0.4612290561199188,
"learning_rate": 6.453000147972774e-06,
"loss": 0.3311,
"step": 429000
},
{
"epoch": 63.55,
"grad_norm": 0.4508548676967621,
"learning_rate": 6.424099215744303e-06,
"loss": 0.3312,
"step": 429500
},
{
"epoch": 63.63,
"grad_norm": 0.4045092761516571,
"learning_rate": 6.395198283515833e-06,
"loss": 0.3313,
"step": 430000
},
{
"epoch": 63.7,
"grad_norm": 0.4180326759815216,
"learning_rate": 6.366297351287363e-06,
"loss": 0.3324,
"step": 430500
},
{
"epoch": 63.78,
"grad_norm": 0.3800413906574249,
"learning_rate": 6.337396419058893e-06,
"loss": 0.3357,
"step": 431000
},
{
"epoch": 63.85,
"grad_norm": 0.4264669716358185,
"learning_rate": 6.308495486830423e-06,
"loss": 0.3314,
"step": 431500
},
{
"epoch": 63.92,
"grad_norm": 0.4021168351173401,
"learning_rate": 6.279594554601954e-06,
"loss": 0.3301,
"step": 432000
},
{
"epoch": 64.0,
"grad_norm": 0.4635623097419739,
"learning_rate": 6.250693622373483e-06,
"loss": 0.3304,
"step": 432500
},
{
"epoch": 64.07,
"grad_norm": 0.4012512266635895,
"learning_rate": 6.2217926901450136e-06,
"loss": 0.3322,
"step": 433000
},
{
"epoch": 64.15,
"grad_norm": 0.4430687725543976,
"learning_rate": 6.192891757916544e-06,
"loss": 0.3302,
"step": 433500
},
{
"epoch": 64.22,
"grad_norm": 0.43903249502182007,
"learning_rate": 6.1639908256880735e-06,
"loss": 0.3326,
"step": 434000
},
{
"epoch": 64.29,
"grad_norm": 0.5228444337844849,
"learning_rate": 6.135089893459604e-06,
"loss": 0.3298,
"step": 434500
},
{
"epoch": 64.37,
"grad_norm": 0.43113288283348083,
"learning_rate": 6.1061889612311334e-06,
"loss": 0.3291,
"step": 435000
},
{
"epoch": 64.44,
"grad_norm": 0.47652667760849,
"learning_rate": 6.077288029002663e-06,
"loss": 0.3299,
"step": 435500
},
{
"epoch": 64.52,
"grad_norm": 0.4017566442489624,
"learning_rate": 6.048387096774193e-06,
"loss": 0.3312,
"step": 436000
},
{
"epoch": 64.59,
"grad_norm": 0.4369170069694519,
"learning_rate": 6.019486164545724e-06,
"loss": 0.3339,
"step": 436500
},
{
"epoch": 64.66,
"grad_norm": 0.36806294322013855,
"learning_rate": 5.990585232317254e-06,
"loss": 0.3317,
"step": 437000
},
{
"epoch": 64.74,
"grad_norm": 0.42576882243156433,
"learning_rate": 5.961684300088784e-06,
"loss": 0.3309,
"step": 437500
},
{
"epoch": 64.81,
"grad_norm": 0.4077777564525604,
"learning_rate": 5.932783367860314e-06,
"loss": 0.3319,
"step": 438000
},
{
"epoch": 64.89,
"grad_norm": 0.4394007921218872,
"learning_rate": 5.9038824356318445e-06,
"loss": 0.3327,
"step": 438500
},
{
"epoch": 64.96,
"grad_norm": 0.32965216040611267,
"learning_rate": 5.874981503403374e-06,
"loss": 0.3277,
"step": 439000
},
{
"epoch": 65.03,
"grad_norm": 0.4312441945075989,
"learning_rate": 5.846080571174904e-06,
"loss": 0.3291,
"step": 439500
},
{
"epoch": 65.11,
"grad_norm": 0.3752184808254242,
"learning_rate": 5.817179638946434e-06,
"loss": 0.3319,
"step": 440000
},
{
"epoch": 65.18,
"grad_norm": 0.4169740080833435,
"learning_rate": 5.7882787067179635e-06,
"loss": 0.331,
"step": 440500
},
{
"epoch": 65.26,
"grad_norm": 0.43580740690231323,
"learning_rate": 5.759377774489494e-06,
"loss": 0.3301,
"step": 441000
},
{
"epoch": 65.33,
"grad_norm": 0.46015655994415283,
"learning_rate": 5.730476842261024e-06,
"loss": 0.3326,
"step": 441500
},
{
"epoch": 65.4,
"grad_norm": 0.4646316468715668,
"learning_rate": 5.701575910032554e-06,
"loss": 0.3307,
"step": 442000
},
{
"epoch": 65.48,
"grad_norm": 0.4371485114097595,
"learning_rate": 5.672674977804084e-06,
"loss": 0.3303,
"step": 442500
},
{
"epoch": 65.55,
"grad_norm": 0.443768173456192,
"learning_rate": 5.643774045575615e-06,
"loss": 0.3315,
"step": 443000
},
{
"epoch": 65.63,
"grad_norm": 0.44002553820610046,
"learning_rate": 5.614873113347144e-06,
"loss": 0.3305,
"step": 443500
},
{
"epoch": 65.7,
"grad_norm": 0.39671292901039124,
"learning_rate": 5.5859721811186746e-06,
"loss": 0.3312,
"step": 444000
},
{
"epoch": 65.77,
"grad_norm": 0.4188387393951416,
"learning_rate": 5.557071248890204e-06,
"loss": 0.3302,
"step": 444500
},
{
"epoch": 65.85,
"grad_norm": 0.44623398780822754,
"learning_rate": 5.528170316661734e-06,
"loss": 0.3308,
"step": 445000
},
{
"epoch": 65.92,
"grad_norm": 0.36335235834121704,
"learning_rate": 5.499269384433264e-06,
"loss": 0.3293,
"step": 445500
},
{
"epoch": 66.0,
"grad_norm": 0.41810572147369385,
"learning_rate": 5.4703684522047944e-06,
"loss": 0.329,
"step": 446000
},
{
"epoch": 66.07,
"grad_norm": 0.4002617299556732,
"learning_rate": 5.441467519976325e-06,
"loss": 0.3278,
"step": 446500
},
{
"epoch": 66.14,
"grad_norm": 0.45273175835609436,
"learning_rate": 5.412566587747854e-06,
"loss": 0.3303,
"step": 447000
},
{
"epoch": 66.22,
"grad_norm": 0.48169875144958496,
"learning_rate": 5.383665655519385e-06,
"loss": 0.332,
"step": 447500
},
{
"epoch": 66.29,
"grad_norm": 0.39927640557289124,
"learning_rate": 5.354764723290915e-06,
"loss": 0.3296,
"step": 448000
},
{
"epoch": 66.37,
"grad_norm": 0.42319226264953613,
"learning_rate": 5.325863791062445e-06,
"loss": 0.3309,
"step": 448500
},
{
"epoch": 66.44,
"grad_norm": 0.4284779131412506,
"learning_rate": 5.296962858833975e-06,
"loss": 0.3321,
"step": 449000
},
{
"epoch": 66.51,
"grad_norm": 0.5179397463798523,
"learning_rate": 5.268061926605505e-06,
"loss": 0.33,
"step": 449500
},
{
"epoch": 66.59,
"grad_norm": 0.44250035285949707,
"learning_rate": 5.239160994377034e-06,
"loss": 0.3295,
"step": 450000
},
{
"epoch": 66.66,
"grad_norm": 0.46015605330467224,
"learning_rate": 5.210260062148565e-06,
"loss": 0.3313,
"step": 450500
},
{
"epoch": 66.74,
"grad_norm": 0.5012817978858948,
"learning_rate": 5.181359129920095e-06,
"loss": 0.3302,
"step": 451000
},
{
"epoch": 66.81,
"grad_norm": 0.403338223695755,
"learning_rate": 5.1524581976916245e-06,
"loss": 0.3306,
"step": 451500
},
{
"epoch": 66.88,
"grad_norm": 0.4086831212043762,
"learning_rate": 5.123557265463155e-06,
"loss": 0.3286,
"step": 452000
},
{
"epoch": 66.96,
"grad_norm": 0.3715237081050873,
"learning_rate": 5.094656333234685e-06,
"loss": 0.3301,
"step": 452500
},
{
"epoch": 67.03,
"grad_norm": 0.46829870343208313,
"learning_rate": 5.065755401006215e-06,
"loss": 0.3307,
"step": 453000
},
{
"epoch": 67.11,
"grad_norm": 0.4667709767818451,
"learning_rate": 5.036854468777745e-06,
"loss": 0.3298,
"step": 453500
},
{
"epoch": 67.18,
"grad_norm": 0.4758981466293335,
"learning_rate": 5.007953536549275e-06,
"loss": 0.3272,
"step": 454000
},
{
"epoch": 67.25,
"grad_norm": 0.48276805877685547,
"learning_rate": 4.979052604320804e-06,
"loss": 0.3288,
"step": 454500
},
{
"epoch": 67.33,
"grad_norm": 0.400806725025177,
"learning_rate": 4.950151672092335e-06,
"loss": 0.3258,
"step": 455000
},
{
"epoch": 67.4,
"grad_norm": 0.40156251192092896,
"learning_rate": 4.921250739863865e-06,
"loss": 0.3351,
"step": 455500
},
{
"epoch": 67.48,
"grad_norm": 0.5024535655975342,
"learning_rate": 4.8923498076353955e-06,
"loss": 0.3306,
"step": 456000
},
{
"epoch": 67.55,
"grad_norm": 0.52587890625,
"learning_rate": 4.863448875406925e-06,
"loss": 0.3331,
"step": 456500
},
{
"epoch": 67.62,
"grad_norm": 0.41265735030174255,
"learning_rate": 4.8345479431784554e-06,
"loss": 0.3328,
"step": 457000
},
{
"epoch": 67.7,
"grad_norm": 0.34202754497528076,
"learning_rate": 4.805647010949986e-06,
"loss": 0.3321,
"step": 457500
},
{
"epoch": 67.77,
"grad_norm": 0.4898373484611511,
"learning_rate": 4.776746078721515e-06,
"loss": 0.331,
"step": 458000
},
{
"epoch": 67.85,
"grad_norm": 0.52295982837677,
"learning_rate": 4.747845146493046e-06,
"loss": 0.3306,
"step": 458500
},
{
"epoch": 67.92,
"grad_norm": 0.46750620007514954,
"learning_rate": 4.718944214264575e-06,
"loss": 0.3315,
"step": 459000
},
{
"epoch": 67.99,
"grad_norm": 0.35533860325813293,
"learning_rate": 4.690043282036105e-06,
"loss": 0.3315,
"step": 459500
},
{
"epoch": 68.07,
"grad_norm": 0.41508856415748596,
"learning_rate": 4.661142349807635e-06,
"loss": 0.3291,
"step": 460000
},
{
"epoch": 68.14,
"grad_norm": 0.4271659851074219,
"learning_rate": 4.632241417579166e-06,
"loss": 0.3286,
"step": 460500
},
{
"epoch": 68.22,
"grad_norm": 0.44648808240890503,
"learning_rate": 4.603340485350695e-06,
"loss": 0.3299,
"step": 461000
},
{
"epoch": 68.29,
"grad_norm": 0.4843562841415405,
"learning_rate": 4.574439553122226e-06,
"loss": 0.3282,
"step": 461500
},
{
"epoch": 68.36,
"grad_norm": 0.41266024112701416,
"learning_rate": 4.545538620893756e-06,
"loss": 0.3299,
"step": 462000
},
{
"epoch": 68.44,
"grad_norm": 0.4088280200958252,
"learning_rate": 4.5166376886652855e-06,
"loss": 0.332,
"step": 462500
},
{
"epoch": 68.51,
"grad_norm": 0.48477041721343994,
"learning_rate": 4.487736756436816e-06,
"loss": 0.3312,
"step": 463000
},
{
"epoch": 68.59,
"grad_norm": 0.42487454414367676,
"learning_rate": 4.458835824208346e-06,
"loss": 0.3296,
"step": 463500
},
{
"epoch": 68.66,
"grad_norm": 0.4671236276626587,
"learning_rate": 4.429934891979876e-06,
"loss": 0.3288,
"step": 464000
},
{
"epoch": 68.73,
"grad_norm": 0.4430939257144928,
"learning_rate": 4.401033959751405e-06,
"loss": 0.3297,
"step": 464500
},
{
"epoch": 68.81,
"grad_norm": 0.4080400764942169,
"learning_rate": 4.372133027522936e-06,
"loss": 0.3305,
"step": 465000
},
{
"epoch": 68.88,
"grad_norm": 0.3743002712726593,
"learning_rate": 4.343232095294466e-06,
"loss": 0.3294,
"step": 465500
},
{
"epoch": 68.96,
"grad_norm": 0.3991639316082001,
"learning_rate": 4.314331163065996e-06,
"loss": 0.3293,
"step": 466000
},
{
"epoch": 69.03,
"grad_norm": 0.40404531359672546,
"learning_rate": 4.285430230837526e-06,
"loss": 0.3307,
"step": 466500
},
{
"epoch": 69.1,
"grad_norm": 0.4253464639186859,
"learning_rate": 4.2565292986090565e-06,
"loss": 0.3278,
"step": 467000
},
{
"epoch": 69.18,
"grad_norm": 0.43970435857772827,
"learning_rate": 4.227628366380586e-06,
"loss": 0.3284,
"step": 467500
},
{
"epoch": 69.25,
"grad_norm": 0.42423635721206665,
"learning_rate": 4.1987274341521164e-06,
"loss": 0.3337,
"step": 468000
},
{
"epoch": 69.33,
"grad_norm": 0.4581485092639923,
"learning_rate": 4.169826501923646e-06,
"loss": 0.3273,
"step": 468500
},
{
"epoch": 69.4,
"grad_norm": 0.4594268500804901,
"learning_rate": 4.1409255696951755e-06,
"loss": 0.3295,
"step": 469000
},
{
"epoch": 69.47,
"grad_norm": 0.49994996190071106,
"learning_rate": 4.112024637466706e-06,
"loss": 0.3267,
"step": 469500
},
{
"epoch": 69.55,
"grad_norm": 0.4062737822532654,
"learning_rate": 4.083123705238236e-06,
"loss": 0.3283,
"step": 470000
},
{
"epoch": 69.62,
"grad_norm": 0.4764838218688965,
"learning_rate": 4.054222773009766e-06,
"loss": 0.3318,
"step": 470500
},
{
"epoch": 69.7,
"grad_norm": 0.42747876048088074,
"learning_rate": 4.025321840781296e-06,
"loss": 0.3311,
"step": 471000
},
{
"epoch": 69.77,
"grad_norm": 0.45434367656707764,
"learning_rate": 3.996420908552827e-06,
"loss": 0.3293,
"step": 471500
},
{
"epoch": 69.84,
"grad_norm": 0.387123703956604,
"learning_rate": 3.967519976324356e-06,
"loss": 0.3311,
"step": 472000
},
{
"epoch": 69.92,
"grad_norm": 0.412826806306839,
"learning_rate": 3.938619044095887e-06,
"loss": 0.329,
"step": 472500
},
{
"epoch": 69.99,
"grad_norm": 0.532727062702179,
"learning_rate": 3.909718111867417e-06,
"loss": 0.3266,
"step": 473000
},
{
"epoch": 70.07,
"grad_norm": 0.4674714505672455,
"learning_rate": 3.8808171796389465e-06,
"loss": 0.3257,
"step": 473500
},
{
"epoch": 70.14,
"grad_norm": 0.3989239037036896,
"learning_rate": 3.851916247410477e-06,
"loss": 0.3288,
"step": 474000
},
{
"epoch": 70.21,
"grad_norm": 0.5390828251838684,
"learning_rate": 3.8230153151820065e-06,
"loss": 0.3316,
"step": 474500
},
{
"epoch": 70.29,
"grad_norm": 0.4232146143913269,
"learning_rate": 3.7941143829535364e-06,
"loss": 0.3297,
"step": 475000
},
{
"epoch": 70.36,
"grad_norm": 0.4476439654827118,
"learning_rate": 3.765213450725067e-06,
"loss": 0.3308,
"step": 475500
},
{
"epoch": 70.44,
"grad_norm": 0.46341538429260254,
"learning_rate": 3.7363125184965968e-06,
"loss": 0.3241,
"step": 476000
},
{
"epoch": 70.51,
"grad_norm": 0.3792473077774048,
"learning_rate": 3.7074115862681268e-06,
"loss": 0.3282,
"step": 476500
},
{
"epoch": 70.58,
"grad_norm": 0.42449694871902466,
"learning_rate": 3.6785106540396567e-06,
"loss": 0.3282,
"step": 477000
},
{
"epoch": 70.66,
"grad_norm": 0.389700323343277,
"learning_rate": 3.6496097218111867e-06,
"loss": 0.3332,
"step": 477500
},
{
"epoch": 70.73,
"grad_norm": 0.4011322855949402,
"learning_rate": 3.6207087895827167e-06,
"loss": 0.3295,
"step": 478000
},
{
"epoch": 70.8,
"grad_norm": 0.485365092754364,
"learning_rate": 3.591807857354247e-06,
"loss": 0.33,
"step": 478500
},
{
"epoch": 70.88,
"grad_norm": 0.39829009771347046,
"learning_rate": 3.562906925125777e-06,
"loss": 0.3291,
"step": 479000
},
{
"epoch": 70.95,
"grad_norm": 0.46039626002311707,
"learning_rate": 3.5340059928973066e-06,
"loss": 0.331,
"step": 479500
},
{
"epoch": 71.03,
"grad_norm": 0.40389734506607056,
"learning_rate": 3.505105060668837e-06,
"loss": 0.3286,
"step": 480000
},
{
"epoch": 71.1,
"grad_norm": 0.4375256299972534,
"learning_rate": 3.476204128440367e-06,
"loss": 0.3298,
"step": 480500
},
{
"epoch": 71.17,
"grad_norm": 0.42462676763534546,
"learning_rate": 3.4473031962118973e-06,
"loss": 0.3302,
"step": 481000
},
{
"epoch": 71.25,
"grad_norm": 0.3216535747051239,
"learning_rate": 3.4184022639834273e-06,
"loss": 0.3283,
"step": 481500
},
{
"epoch": 71.32,
"grad_norm": 0.45945799350738525,
"learning_rate": 3.389501331754957e-06,
"loss": 0.3278,
"step": 482000
},
{
"epoch": 71.4,
"grad_norm": 0.4495971202850342,
"learning_rate": 3.3606003995264872e-06,
"loss": 0.3265,
"step": 482500
},
{
"epoch": 71.47,
"grad_norm": 0.4159165322780609,
"learning_rate": 3.331699467298017e-06,
"loss": 0.3292,
"step": 483000
},
{
"epoch": 71.54,
"grad_norm": 0.410427063703537,
"learning_rate": 3.302798535069547e-06,
"loss": 0.3299,
"step": 483500
},
{
"epoch": 71.62,
"grad_norm": 0.5130240321159363,
"learning_rate": 3.2738976028410776e-06,
"loss": 0.3316,
"step": 484000
},
{
"epoch": 71.69,
"grad_norm": 0.4405277669429779,
"learning_rate": 3.244996670612607e-06,
"loss": 0.33,
"step": 484500
},
{
"epoch": 71.77,
"grad_norm": 0.575674295425415,
"learning_rate": 3.2160957383841375e-06,
"loss": 0.3298,
"step": 485000
},
{
"epoch": 71.84,
"grad_norm": 0.4434616267681122,
"learning_rate": 3.1871948061556675e-06,
"loss": 0.3298,
"step": 485500
},
{
"epoch": 71.91,
"grad_norm": 0.3960082530975342,
"learning_rate": 3.1582938739271974e-06,
"loss": 0.3282,
"step": 486000
},
{
"epoch": 71.99,
"grad_norm": 0.42698296904563904,
"learning_rate": 3.129392941698728e-06,
"loss": 0.3296,
"step": 486500
},
{
"epoch": 72.06,
"grad_norm": 0.5218748450279236,
"learning_rate": 3.1004920094702574e-06,
"loss": 0.3296,
"step": 487000
},
{
"epoch": 72.14,
"grad_norm": 0.46763402223587036,
"learning_rate": 3.0715910772417873e-06,
"loss": 0.3262,
"step": 487500
},
{
"epoch": 72.21,
"grad_norm": 0.42345327138900757,
"learning_rate": 3.0426901450133177e-06,
"loss": 0.33,
"step": 488000
},
{
"epoch": 72.28,
"grad_norm": 0.4526881277561188,
"learning_rate": 3.0137892127848477e-06,
"loss": 0.3304,
"step": 488500
},
{
"epoch": 72.36,
"grad_norm": 0.42106348276138306,
"learning_rate": 2.984888280556378e-06,
"loss": 0.3292,
"step": 489000
},
{
"epoch": 72.43,
"grad_norm": 0.5022510886192322,
"learning_rate": 2.9559873483279076e-06,
"loss": 0.3266,
"step": 489500
},
{
"epoch": 72.51,
"grad_norm": 0.4436812996864319,
"learning_rate": 2.9270864160994376e-06,
"loss": 0.3269,
"step": 490000
},
{
"epoch": 72.58,
"grad_norm": 0.42252251505851746,
"learning_rate": 2.898185483870968e-06,
"loss": 0.3281,
"step": 490500
},
{
"epoch": 72.65,
"grad_norm": 0.5339802503585815,
"learning_rate": 2.869284551642498e-06,
"loss": 0.3289,
"step": 491000
},
{
"epoch": 72.73,
"grad_norm": 0.3937510550022125,
"learning_rate": 2.840383619414028e-06,
"loss": 0.328,
"step": 491500
},
{
"epoch": 72.8,
"grad_norm": 0.3894229829311371,
"learning_rate": 2.811482687185558e-06,
"loss": 0.3282,
"step": 492000
},
{
"epoch": 72.88,
"grad_norm": 0.4481090307235718,
"learning_rate": 2.782581754957088e-06,
"loss": 0.3301,
"step": 492500
},
{
"epoch": 72.95,
"grad_norm": 0.45495444536209106,
"learning_rate": 2.753680822728618e-06,
"loss": 0.3278,
"step": 493000
},
{
"epoch": 73.02,
"grad_norm": 0.49259716272354126,
"learning_rate": 2.7247798905001482e-06,
"loss": 0.3295,
"step": 493500
},
{
"epoch": 73.1,
"grad_norm": 0.4257282018661499,
"learning_rate": 2.6958789582716778e-06,
"loss": 0.3269,
"step": 494000
},
{
"epoch": 73.17,
"grad_norm": 0.43159300088882446,
"learning_rate": 2.666978026043208e-06,
"loss": 0.3276,
"step": 494500
},
{
"epoch": 73.25,
"grad_norm": 0.4048108458518982,
"learning_rate": 2.638077093814738e-06,
"loss": 0.3333,
"step": 495000
},
{
"epoch": 73.32,
"grad_norm": 0.4666566252708435,
"learning_rate": 2.609176161586268e-06,
"loss": 0.3275,
"step": 495500
},
{
"epoch": 73.39,
"grad_norm": 0.3985891342163086,
"learning_rate": 2.5802752293577985e-06,
"loss": 0.3279,
"step": 496000
},
{
"epoch": 73.47,
"grad_norm": 0.5439868569374084,
"learning_rate": 2.551374297129328e-06,
"loss": 0.3275,
"step": 496500
},
{
"epoch": 73.54,
"grad_norm": 0.45784690976142883,
"learning_rate": 2.522473364900858e-06,
"loss": 0.3309,
"step": 497000
},
{
"epoch": 73.62,
"grad_norm": 0.4779771864414215,
"learning_rate": 2.4935724326723884e-06,
"loss": 0.3288,
"step": 497500
},
{
"epoch": 73.69,
"grad_norm": 0.47680574655532837,
"learning_rate": 2.4646715004439184e-06,
"loss": 0.3285,
"step": 498000
},
{
"epoch": 73.76,
"grad_norm": 0.3629719913005829,
"learning_rate": 2.4357705682154488e-06,
"loss": 0.3284,
"step": 498500
},
{
"epoch": 73.84,
"grad_norm": 0.46253129839897156,
"learning_rate": 2.4068696359869783e-06,
"loss": 0.3284,
"step": 499000
},
{
"epoch": 73.91,
"grad_norm": 0.44531476497650146,
"learning_rate": 2.3779687037585083e-06,
"loss": 0.3281,
"step": 499500
},
{
"epoch": 73.99,
"grad_norm": 0.39289695024490356,
"learning_rate": 2.3490677715300387e-06,
"loss": 0.326,
"step": 500000
},
{
"epoch": 74.06,
"grad_norm": 0.48103997111320496,
"learning_rate": 2.3201668393015686e-06,
"loss": 0.3272,
"step": 500500
},
{
"epoch": 74.13,
"grad_norm": 0.4336768388748169,
"learning_rate": 2.2912659070730986e-06,
"loss": 0.3265,
"step": 501000
},
{
"epoch": 74.21,
"grad_norm": 0.4040307402610779,
"learning_rate": 2.2623649748446286e-06,
"loss": 0.3271,
"step": 501500
},
{
"epoch": 74.28,
"grad_norm": 0.49081218242645264,
"learning_rate": 2.2334640426161585e-06,
"loss": 0.328,
"step": 502000
},
{
"epoch": 74.36,
"grad_norm": 0.44683390855789185,
"learning_rate": 2.2045631103876885e-06,
"loss": 0.3266,
"step": 502500
},
{
"epoch": 74.43,
"grad_norm": 0.4362635612487793,
"learning_rate": 2.175662178159219e-06,
"loss": 0.3293,
"step": 503000
},
{
"epoch": 74.5,
"grad_norm": 0.4326813220977783,
"learning_rate": 2.146761245930749e-06,
"loss": 0.3302,
"step": 503500
},
{
"epoch": 74.58,
"grad_norm": 0.5289288759231567,
"learning_rate": 2.117860313702279e-06,
"loss": 0.3288,
"step": 504000
},
{
"epoch": 74.65,
"grad_norm": 0.5708897709846497,
"learning_rate": 2.088959381473809e-06,
"loss": 0.3271,
"step": 504500
},
{
"epoch": 74.73,
"grad_norm": 0.38460394740104675,
"learning_rate": 2.0600584492453388e-06,
"loss": 0.3262,
"step": 505000
},
{
"epoch": 74.8,
"grad_norm": 0.4401102066040039,
"learning_rate": 2.031157517016869e-06,
"loss": 0.3285,
"step": 505500
},
{
"epoch": 74.87,
"grad_norm": 0.4699185788631439,
"learning_rate": 2.002256584788399e-06,
"loss": 0.3292,
"step": 506000
},
{
"epoch": 74.95,
"grad_norm": 0.43969598412513733,
"learning_rate": 1.9733556525599287e-06,
"loss": 0.3282,
"step": 506500
},
{
"epoch": 75.02,
"grad_norm": 0.5226773619651794,
"learning_rate": 1.944454720331459e-06,
"loss": 0.3275,
"step": 507000
},
{
"epoch": 75.1,
"grad_norm": 0.42381104826927185,
"learning_rate": 1.915553788102989e-06,
"loss": 0.3287,
"step": 507500
},
{
"epoch": 75.17,
"grad_norm": 0.47836771607398987,
"learning_rate": 1.886652855874519e-06,
"loss": 0.3248,
"step": 508000
},
{
"epoch": 75.24,
"grad_norm": 0.4760962128639221,
"learning_rate": 1.8577519236460492e-06,
"loss": 0.3255,
"step": 508500
},
{
"epoch": 75.32,
"grad_norm": 0.4954340159893036,
"learning_rate": 1.8288509914175794e-06,
"loss": 0.3274,
"step": 509000
},
{
"epoch": 75.39,
"grad_norm": 0.3998168110847473,
"learning_rate": 1.7999500591891091e-06,
"loss": 0.328,
"step": 509500
},
{
"epoch": 75.47,
"grad_norm": 0.3899104595184326,
"learning_rate": 1.7710491269606393e-06,
"loss": 0.3291,
"step": 510000
},
{
"epoch": 75.54,
"grad_norm": 0.4677903652191162,
"learning_rate": 1.7421481947321693e-06,
"loss": 0.3261,
"step": 510500
},
{
"epoch": 75.61,
"grad_norm": 0.41607698798179626,
"learning_rate": 1.7132472625036995e-06,
"loss": 0.3291,
"step": 511000
},
{
"epoch": 75.69,
"grad_norm": 0.44930896162986755,
"learning_rate": 1.6843463302752294e-06,
"loss": 0.3307,
"step": 511500
},
{
"epoch": 75.76,
"grad_norm": 0.48138096928596497,
"learning_rate": 1.6554453980467594e-06,
"loss": 0.3256,
"step": 512000
},
{
"epoch": 75.84,
"grad_norm": 0.44024384021759033,
"learning_rate": 1.6265444658182896e-06,
"loss": 0.3278,
"step": 512500
},
{
"epoch": 75.91,
"grad_norm": 0.42400872707366943,
"learning_rate": 1.5976435335898193e-06,
"loss": 0.3295,
"step": 513000
},
{
"epoch": 75.98,
"grad_norm": 0.4450230896472931,
"learning_rate": 1.5687426013613495e-06,
"loss": 0.3293,
"step": 513500
},
{
"epoch": 76.06,
"grad_norm": 0.4113300144672394,
"learning_rate": 1.5398416691328797e-06,
"loss": 0.3277,
"step": 514000
},
{
"epoch": 76.13,
"grad_norm": 0.4838961064815521,
"learning_rate": 1.5109407369044097e-06,
"loss": 0.3257,
"step": 514500
},
{
"epoch": 76.21,
"grad_norm": 0.45890524983406067,
"learning_rate": 1.4820398046759396e-06,
"loss": 0.3263,
"step": 515000
},
{
"epoch": 76.28,
"grad_norm": 0.4421687424182892,
"learning_rate": 1.4531388724474696e-06,
"loss": 0.3287,
"step": 515500
},
{
"epoch": 76.35,
"grad_norm": 0.4234231114387512,
"learning_rate": 1.4242379402189998e-06,
"loss": 0.3308,
"step": 516000
},
{
"epoch": 76.43,
"grad_norm": 0.4239380657672882,
"learning_rate": 1.3953370079905297e-06,
"loss": 0.3291,
"step": 516500
},
{
"epoch": 76.5,
"grad_norm": 0.4606933891773224,
"learning_rate": 1.3664360757620597e-06,
"loss": 0.3288,
"step": 517000
},
{
"epoch": 76.58,
"grad_norm": 0.4070008099079132,
"learning_rate": 1.33753514353359e-06,
"loss": 0.3268,
"step": 517500
},
{
"epoch": 76.65,
"grad_norm": 0.5978463888168335,
"learning_rate": 1.3086342113051199e-06,
"loss": 0.3272,
"step": 518000
},
{
"epoch": 76.72,
"grad_norm": 0.43075379729270935,
"learning_rate": 1.27973327907665e-06,
"loss": 0.3287,
"step": 518500
},
{
"epoch": 76.8,
"grad_norm": 0.46790510416030884,
"learning_rate": 1.2508323468481798e-06,
"loss": 0.3266,
"step": 519000
},
{
"epoch": 76.87,
"grad_norm": 0.45541101694107056,
"learning_rate": 1.22193141461971e-06,
"loss": 0.3254,
"step": 519500
},
{
"epoch": 76.95,
"grad_norm": 0.44363468885421753,
"learning_rate": 1.1930304823912402e-06,
"loss": 0.3248,
"step": 520000
},
{
"epoch": 77.02,
"grad_norm": 0.5055235624313354,
"learning_rate": 1.1641295501627701e-06,
"loss": 0.3265,
"step": 520500
},
{
"epoch": 77.09,
"grad_norm": 0.3572923541069031,
"learning_rate": 1.1352286179343e-06,
"loss": 0.324,
"step": 521000
},
{
"epoch": 77.17,
"grad_norm": 0.40502551198005676,
"learning_rate": 1.10632768570583e-06,
"loss": 0.3254,
"step": 521500
},
{
"epoch": 77.24,
"grad_norm": 0.45639294385910034,
"learning_rate": 1.0774267534773602e-06,
"loss": 0.328,
"step": 522000
},
{
"epoch": 77.32,
"grad_norm": 0.4580610990524292,
"learning_rate": 1.0485258212488904e-06,
"loss": 0.3278,
"step": 522500
},
{
"epoch": 77.39,
"grad_norm": 0.4812680184841156,
"learning_rate": 1.0196248890204202e-06,
"loss": 0.3274,
"step": 523000
},
{
"epoch": 77.46,
"grad_norm": 0.416979044675827,
"learning_rate": 9.907239567919504e-07,
"loss": 0.3261,
"step": 523500
},
{
"epoch": 77.54,
"grad_norm": 0.39473670721054077,
"learning_rate": 9.618230245634803e-07,
"loss": 0.328,
"step": 524000
},
{
"epoch": 77.61,
"grad_norm": 0.4831089675426483,
"learning_rate": 9.329220923350104e-07,
"loss": 0.328,
"step": 524500
},
{
"epoch": 77.69,
"grad_norm": 0.4752112627029419,
"learning_rate": 9.040211601065404e-07,
"loss": 0.3249,
"step": 525000
},
{
"epoch": 77.76,
"grad_norm": 0.4114755690097809,
"learning_rate": 8.751202278780705e-07,
"loss": 0.3282,
"step": 525500
},
{
"epoch": 77.83,
"grad_norm": 0.539284348487854,
"learning_rate": 8.462192956496004e-07,
"loss": 0.327,
"step": 526000
},
{
"epoch": 77.91,
"grad_norm": 0.4160568118095398,
"learning_rate": 8.173183634211305e-07,
"loss": 0.3278,
"step": 526500
},
{
"epoch": 77.98,
"grad_norm": 0.42335689067840576,
"learning_rate": 7.884174311926606e-07,
"loss": 0.3291,
"step": 527000
},
{
"epoch": 78.06,
"grad_norm": 0.4964425563812256,
"learning_rate": 7.595164989641906e-07,
"loss": 0.3269,
"step": 527500
},
{
"epoch": 78.13,
"grad_norm": 0.5482224822044373,
"learning_rate": 7.306155667357206e-07,
"loss": 0.3257,
"step": 528000
},
{
"epoch": 78.2,
"grad_norm": 0.4845934808254242,
"learning_rate": 7.017146345072507e-07,
"loss": 0.3257,
"step": 528500
},
{
"epoch": 78.28,
"grad_norm": 0.44311293959617615,
"learning_rate": 6.728137022787807e-07,
"loss": 0.3267,
"step": 529000
},
{
"epoch": 78.35,
"grad_norm": 0.49295201897621155,
"learning_rate": 6.439127700503107e-07,
"loss": 0.3262,
"step": 529500
},
{
"epoch": 78.43,
"grad_norm": 0.45838817954063416,
"learning_rate": 6.150118378218408e-07,
"loss": 0.3272,
"step": 530000
},
{
"epoch": 78.5,
"grad_norm": 0.4277520477771759,
"learning_rate": 5.861109055933709e-07,
"loss": 0.3275,
"step": 530500
},
{
"epoch": 78.57,
"grad_norm": 0.49568185210227966,
"learning_rate": 5.572099733649008e-07,
"loss": 0.3238,
"step": 531000
},
{
"epoch": 78.65,
"grad_norm": 0.3964736759662628,
"learning_rate": 5.283090411364309e-07,
"loss": 0.3265,
"step": 531500
},
{
"epoch": 78.72,
"grad_norm": 0.38991761207580566,
"learning_rate": 4.994081089079609e-07,
"loss": 0.3297,
"step": 532000
},
{
"epoch": 78.8,
"grad_norm": 0.514043390750885,
"learning_rate": 4.7050717667949096e-07,
"loss": 0.3274,
"step": 532500
},
{
"epoch": 78.87,
"grad_norm": 0.44372057914733887,
"learning_rate": 4.41606244451021e-07,
"loss": 0.3265,
"step": 533000
},
{
"epoch": 78.94,
"grad_norm": 0.40556496381759644,
"learning_rate": 4.1270531222255106e-07,
"loss": 0.3268,
"step": 533500
},
{
"epoch": 79.02,
"grad_norm": 0.37113696336746216,
"learning_rate": 3.838043799940811e-07,
"loss": 0.331,
"step": 534000
},
{
"epoch": 79.09,
"grad_norm": 0.42463332414627075,
"learning_rate": 3.549034477656111e-07,
"loss": 0.3269,
"step": 534500
},
{
"epoch": 79.17,
"grad_norm": 0.456259548664093,
"learning_rate": 3.260025155371412e-07,
"loss": 0.3296,
"step": 535000
},
{
"epoch": 79.24,
"grad_norm": 0.39561837911605835,
"learning_rate": 2.971015833086712e-07,
"loss": 0.3272,
"step": 535500
},
{
"epoch": 79.31,
"grad_norm": 0.42246511578559875,
"learning_rate": 2.6820065108020127e-07,
"loss": 0.3274,
"step": 536000
},
{
"epoch": 79.39,
"grad_norm": 0.42932552099227905,
"learning_rate": 2.392997188517313e-07,
"loss": 0.3259,
"step": 536500
},
{
"epoch": 79.46,
"grad_norm": 0.4081755578517914,
"learning_rate": 2.1039878662326132e-07,
"loss": 0.3254,
"step": 537000
},
{
"epoch": 79.54,
"grad_norm": 0.43017107248306274,
"learning_rate": 1.8149785439479136e-07,
"loss": 0.3294,
"step": 537500
},
{
"epoch": 79.61,
"grad_norm": 0.3940086364746094,
"learning_rate": 1.525969221663214e-07,
"loss": 0.3284,
"step": 538000
},
{
"epoch": 79.68,
"grad_norm": 0.37287628650665283,
"learning_rate": 1.2369598993785146e-07,
"loss": 0.3269,
"step": 538500
},
{
"epoch": 79.76,
"grad_norm": 0.451742559671402,
"learning_rate": 9.479505770938148e-08,
"loss": 0.3259,
"step": 539000
},
{
"epoch": 79.83,
"grad_norm": 0.4438938796520233,
"learning_rate": 6.589412548091152e-08,
"loss": 0.3238,
"step": 539500
},
{
"epoch": 79.91,
"grad_norm": 0.4649119973182678,
"learning_rate": 3.699319325244155e-08,
"loss": 0.3257,
"step": 540000
},
{
"epoch": 79.98,
"grad_norm": 0.4151638150215149,
"learning_rate": 8.09226102397159e-09,
"loss": 0.3303,
"step": 540500
},
{
"epoch": 80.0,
"step": 540640,
"total_flos": 4.3712245507093955e+20,
"train_loss": 0.35890252478696355,
"train_runtime": 56544.5558,
"train_samples_per_second": 76.486,
"train_steps_per_second": 9.561
}
],
"logging_steps": 500,
"max_steps": 540640,
"num_input_tokens_seen": 0,
"num_train_epochs": 80,
"save_steps": 1000000000,
"total_flos": 4.3712245507093955e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}