{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9999981318415465, "eval_steps": 500, "global_step": 535286, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018681584534637058, "grad_norm": 7.906698703765869, "learning_rate": 9.999978643682902e-05, "loss": 3.2962, "step": 500 }, { "epoch": 0.0037363169069274116, "grad_norm": 8.97018051147461, "learning_rate": 9.99991423149794e-05, "loss": 3.1699, "step": 1000 }, { "epoch": 0.005604475360391117, "grad_norm": 15.646541595458984, "learning_rate": 9.999806763655335e-05, "loss": 3.0952, "step": 1500 }, { "epoch": 0.007472633813854823, "grad_norm": 7.570132732391357, "learning_rate": 9.999656241080522e-05, "loss": 3.0763, "step": 2000 }, { "epoch": 0.009340792267318529, "grad_norm": 5.853657245635986, "learning_rate": 9.999462665069693e-05, "loss": 3.0355, "step": 2500 }, { "epoch": 0.011208950720782235, "grad_norm": 6.587205410003662, "learning_rate": 9.999226553509718e-05, "loss": 3.0291, "step": 3000 }, { "epoch": 0.01307710917424594, "grad_norm": 6.537953853607178, "learning_rate": 9.998946962095583e-05, "loss": 2.9902, "step": 3500 }, { "epoch": 0.014945267627709646, "grad_norm": 5.199572563171387, "learning_rate": 9.998624323353232e-05, "loss": 2.9598, "step": 4000 }, { "epoch": 0.016813426081173352, "grad_norm": 6.676388263702393, "learning_rate": 9.998258640060996e-05, "loss": 2.9677, "step": 4500 }, { "epoch": 0.018681584534637058, "grad_norm": 5.827391147613525, "learning_rate": 9.997849915367876e-05, "loss": 2.9525, "step": 5000 }, { "epoch": 0.020549742988100764, "grad_norm": 7.3195953369140625, "learning_rate": 9.997398152793517e-05, "loss": 2.8857, "step": 5500 }, { "epoch": 0.02241790144156447, "grad_norm": 5.648918151855469, "learning_rate": 9.99690335622817e-05, "loss": 2.8672, "step": 6000 }, { "epoch": 0.024286059895028175, "grad_norm": 5.5112528800964355, "learning_rate": 9.996366648525912e-05, "loss": 2.9211, "step": 6500 }, { "epoch": 0.02615421834849188, "grad_norm": 5.33510684967041, "learning_rate": 9.995785883176955e-05, "loss": 2.8746, "step": 7000 }, { "epoch": 0.028022376801955587, "grad_norm": 4.034855365753174, "learning_rate": 9.995162097720716e-05, "loss": 2.8976, "step": 7500 }, { "epoch": 0.029890535255419293, "grad_norm": 6.461133003234863, "learning_rate": 9.994495297528784e-05, "loss": 2.8834, "step": 8000 }, { "epoch": 0.031758693708883, "grad_norm": 5.551321029663086, "learning_rate": 9.993785488343162e-05, "loss": 2.7976, "step": 8500 }, { "epoch": 0.033626852162346704, "grad_norm": 4.432355880737305, "learning_rate": 9.993032676276217e-05, "loss": 2.8252, "step": 9000 }, { "epoch": 0.03549501061581041, "grad_norm": 4.1577839851379395, "learning_rate": 9.99223686781062e-05, "loss": 2.8277, "step": 9500 }, { "epoch": 0.037363169069274116, "grad_norm": 4.840776443481445, "learning_rate": 9.991398069799303e-05, "loss": 2.8151, "step": 10000 }, { "epoch": 0.03923132752273782, "grad_norm": 5.0539093017578125, "learning_rate": 9.99051628946539e-05, "loss": 2.8342, "step": 10500 }, { "epoch": 0.04109948597620153, "grad_norm": 4.669162750244141, "learning_rate": 9.989593426795811e-05, "loss": 2.8473, "step": 11000 }, { "epoch": 0.04296764442966523, "grad_norm": 6.490626811981201, "learning_rate": 9.98862579089188e-05, "loss": 2.7918, "step": 11500 }, { "epoch": 0.04483580288312894, "grad_norm": 5.523123741149902, "learning_rate": 9.98761519653822e-05, "loss": 2.8058, "step": 12000 }, { "epoch": 0.046703961336592645, "grad_norm": 4.8026227951049805, "learning_rate": 9.98656165243734e-05, "loss": 2.76, "step": 12500 }, { "epoch": 0.04857211979005635, "grad_norm": 4.3018083572387695, "learning_rate": 9.985467403479736e-05, "loss": 2.7533, "step": 13000 }, { "epoch": 0.050440278243520056, "grad_norm": 3.7984395027160645, "learning_rate": 9.984330394823319e-05, "loss": 2.7928, "step": 13500 }, { "epoch": 0.05230843669698376, "grad_norm": 3.971073627471924, "learning_rate": 9.983148229059621e-05, "loss": 2.7542, "step": 14000 }, { "epoch": 0.05417659515044747, "grad_norm": 4.415925979614258, "learning_rate": 9.98192315201501e-05, "loss": 2.7767, "step": 14500 }, { "epoch": 0.056044753603911174, "grad_norm": 4.695183277130127, "learning_rate": 9.980655174238964e-05, "loss": 2.7724, "step": 15000 }, { "epoch": 0.05791291205737488, "grad_norm": 5.4851484298706055, "learning_rate": 9.979344306650395e-05, "loss": 2.7768, "step": 15500 }, { "epoch": 0.059781070510838585, "grad_norm": 4.120709419250488, "learning_rate": 9.977990560537549e-05, "loss": 2.7775, "step": 16000 }, { "epoch": 0.06164922896430229, "grad_norm": 3.63053560256958, "learning_rate": 9.976593947557912e-05, "loss": 2.7329, "step": 16500 }, { "epoch": 0.063517387417766, "grad_norm": 4.178781509399414, "learning_rate": 9.97515447973811e-05, "loss": 2.7428, "step": 17000 }, { "epoch": 0.0653855458712297, "grad_norm": 3.8429136276245117, "learning_rate": 9.973675176842667e-05, "loss": 2.7136, "step": 17500 }, { "epoch": 0.06725370432469341, "grad_norm": 3.6935720443725586, "learning_rate": 9.972150122544814e-05, "loss": 2.6918, "step": 18000 }, { "epoch": 0.06912186277815711, "grad_norm": 4.678779125213623, "learning_rate": 9.970582251673812e-05, "loss": 2.686, "step": 18500 }, { "epoch": 0.07099002123162082, "grad_norm": 5.219886779785156, "learning_rate": 9.968971577731036e-05, "loss": 2.7664, "step": 19000 }, { "epoch": 0.07285817968508453, "grad_norm": 3.985466241836548, "learning_rate": 9.967318114586451e-05, "loss": 2.7409, "step": 19500 }, { "epoch": 0.07472633813854823, "grad_norm": 5.018237590789795, "learning_rate": 9.965621876478483e-05, "loss": 2.7278, "step": 20000 }, { "epoch": 0.07659449659201194, "grad_norm": 4.305635452270508, "learning_rate": 9.963882878013921e-05, "loss": 2.7453, "step": 20500 }, { "epoch": 0.07846265504547564, "grad_norm": 3.6431195735931396, "learning_rate": 9.962101134167761e-05, "loss": 2.6693, "step": 21000 }, { "epoch": 0.08033081349893935, "grad_norm": 3.750077962875366, "learning_rate": 9.960280351865064e-05, "loss": 2.7108, "step": 21500 }, { "epoch": 0.08219897195240305, "grad_norm": 3.730613946914673, "learning_rate": 9.95841324906568e-05, "loss": 2.6607, "step": 22000 }, { "epoch": 0.08406713040586676, "grad_norm": 4.009971618652344, "learning_rate": 9.956503447985205e-05, "loss": 2.7232, "step": 22500 }, { "epoch": 0.08593528885933047, "grad_norm": 3.1298115253448486, "learning_rate": 9.954550965069465e-05, "loss": 2.6655, "step": 23000 }, { "epoch": 0.08780344731279417, "grad_norm": 3.9897303581237793, "learning_rate": 9.952555817131835e-05, "loss": 2.6755, "step": 23500 }, { "epoch": 0.08967160576625788, "grad_norm": 5.565286636352539, "learning_rate": 9.950522139495593e-05, "loss": 2.6854, "step": 24000 }, { "epoch": 0.09153976421972158, "grad_norm": 3.5274269580841064, "learning_rate": 9.948441798666596e-05, "loss": 2.6821, "step": 24500 }, { "epoch": 0.09340792267318529, "grad_norm": 4.026999473571777, "learning_rate": 9.946323133845033e-05, "loss": 2.6389, "step": 25000 }, { "epoch": 0.095276081126649, "grad_norm": 4.1627326011657715, "learning_rate": 9.944157671638854e-05, "loss": 2.6786, "step": 25500 }, { "epoch": 0.0971442395801127, "grad_norm": 3.341585159301758, "learning_rate": 9.94194963391034e-05, "loss": 2.6419, "step": 26000 }, { "epoch": 0.09901239803357641, "grad_norm": 3.5735983848571777, "learning_rate": 9.939699039673516e-05, "loss": 2.652, "step": 26500 }, { "epoch": 0.10088055648704011, "grad_norm": 3.736764669418335, "learning_rate": 9.937405908308882e-05, "loss": 2.701, "step": 27000 }, { "epoch": 0.10274871494050382, "grad_norm": 3.172218084335327, "learning_rate": 9.935070259563231e-05, "loss": 2.6086, "step": 27500 }, { "epoch": 0.10461687339396752, "grad_norm": 3.945516347885132, "learning_rate": 9.932692113549484e-05, "loss": 2.6714, "step": 28000 }, { "epoch": 0.10648503184743123, "grad_norm": 2.7730209827423096, "learning_rate": 9.930271490746525e-05, "loss": 2.6346, "step": 28500 }, { "epoch": 0.10835319030089494, "grad_norm": 3.7872776985168457, "learning_rate": 9.92780841199901e-05, "loss": 2.6376, "step": 29000 }, { "epoch": 0.11022134875435864, "grad_norm": 3.9411559104919434, "learning_rate": 9.925302898517198e-05, "loss": 2.6674, "step": 29500 }, { "epoch": 0.11208950720782235, "grad_norm": 4.368437767028809, "learning_rate": 9.922760110043857e-05, "loss": 2.6232, "step": 30000 }, { "epoch": 0.11395766566128605, "grad_norm": 4.385318279266357, "learning_rate": 9.920169876946009e-05, "loss": 2.595, "step": 30500 }, { "epoch": 0.11582582411474976, "grad_norm": 3.4636647701263428, "learning_rate": 9.917537274891421e-05, "loss": 2.6073, "step": 31000 }, { "epoch": 0.11769398256821346, "grad_norm": 2.474412202835083, "learning_rate": 9.914862326550168e-05, "loss": 2.655, "step": 31500 }, { "epoch": 0.11956214102167717, "grad_norm": 3.5162529945373535, "learning_rate": 9.912145054956974e-05, "loss": 2.6259, "step": 32000 }, { "epoch": 0.12143029947514088, "grad_norm": 3.149369716644287, "learning_rate": 9.909385483511026e-05, "loss": 2.6045, "step": 32500 }, { "epoch": 0.12329845792860458, "grad_norm": 3.873689651489258, "learning_rate": 9.906583635975763e-05, "loss": 2.6476, "step": 33000 }, { "epoch": 0.1251666163820683, "grad_norm": 4.371992588043213, "learning_rate": 9.90374526682891e-05, "loss": 2.6149, "step": 33500 }, { "epoch": 0.127034774835532, "grad_norm": 4.554148197174072, "learning_rate": 9.900859024291592e-05, "loss": 2.6146, "step": 34000 }, { "epoch": 0.1289029332889957, "grad_norm": 4.277965545654297, "learning_rate": 9.897930579088681e-05, "loss": 2.5902, "step": 34500 }, { "epoch": 0.1307710917424594, "grad_norm": 4.317843914031982, "learning_rate": 9.894959956437835e-05, "loss": 2.6276, "step": 35000 }, { "epoch": 0.13263925019592313, "grad_norm": 3.6088337898254395, "learning_rate": 9.891953249519332e-05, "loss": 2.5647, "step": 35500 }, { "epoch": 0.13450740864938682, "grad_norm": 2.6994011402130127, "learning_rate": 9.888898433303897e-05, "loss": 2.6306, "step": 36000 }, { "epoch": 0.13637556710285054, "grad_norm": 3.670053005218506, "learning_rate": 9.885801517418857e-05, "loss": 2.6103, "step": 36500 }, { "epoch": 0.13824372555631423, "grad_norm": 3.3493151664733887, "learning_rate": 9.882662528532621e-05, "loss": 2.5293, "step": 37000 }, { "epoch": 0.14011188400977795, "grad_norm": 4.308838844299316, "learning_rate": 9.879481493675895e-05, "loss": 2.5701, "step": 37500 }, { "epoch": 0.14198004246324164, "grad_norm": 3.550856828689575, "learning_rate": 9.876258440241463e-05, "loss": 2.5949, "step": 38000 }, { "epoch": 0.14384820091670536, "grad_norm": 3.9775218963623047, "learning_rate": 9.872999967960666e-05, "loss": 2.5844, "step": 38500 }, { "epoch": 0.14571635937016905, "grad_norm": 3.936997413635254, "learning_rate": 9.869693044893364e-05, "loss": 2.5558, "step": 39000 }, { "epoch": 0.14758451782363277, "grad_norm": 4.209615707397461, "learning_rate": 9.866344187539423e-05, "loss": 2.5605, "step": 39500 }, { "epoch": 0.14945267627709646, "grad_norm": 4.603176116943359, "learning_rate": 9.862960248064681e-05, "loss": 2.6045, "step": 40000 }, { "epoch": 0.15132083473056018, "grad_norm": 3.0863678455352783, "learning_rate": 9.859527692735271e-05, "loss": 2.5638, "step": 40500 }, { "epoch": 0.15318899318402387, "grad_norm": 3.8357596397399902, "learning_rate": 9.856053290655904e-05, "loss": 2.5569, "step": 41000 }, { "epoch": 0.1550571516374876, "grad_norm": 3.3822269439697266, "learning_rate": 9.85253707174563e-05, "loss": 2.5459, "step": 41500 }, { "epoch": 0.1569253100909513, "grad_norm": 4.058901309967041, "learning_rate": 9.848979066283589e-05, "loss": 2.6128, "step": 42000 }, { "epoch": 0.158793468544415, "grad_norm": 4.78932523727417, "learning_rate": 9.84537930490876e-05, "loss": 2.5862, "step": 42500 }, { "epoch": 0.1606616269978787, "grad_norm": 3.3654229640960693, "learning_rate": 9.841737818619692e-05, "loss": 2.5509, "step": 43000 }, { "epoch": 0.16252978545134242, "grad_norm": 3.9686570167541504, "learning_rate": 9.838054638774244e-05, "loss": 2.5089, "step": 43500 }, { "epoch": 0.1643979439048061, "grad_norm": 2.973649740219116, "learning_rate": 9.834329797089303e-05, "loss": 2.5321, "step": 44000 }, { "epoch": 0.16626610235826983, "grad_norm": 2.5326201915740967, "learning_rate": 9.83056332564052e-05, "loss": 2.5408, "step": 44500 }, { "epoch": 0.16813426081173352, "grad_norm": 3.884883165359497, "learning_rate": 9.826762914491992e-05, "loss": 2.5352, "step": 45000 }, { "epoch": 0.17000241926519724, "grad_norm": 3.9567508697509766, "learning_rate": 9.822913364272259e-05, "loss": 2.5619, "step": 45500 }, { "epoch": 0.17187057771866093, "grad_norm": 3.041057825088501, "learning_rate": 9.819022282598776e-05, "loss": 2.555, "step": 46000 }, { "epoch": 0.17373873617212465, "grad_norm": 3.1877288818359375, "learning_rate": 9.815089702978735e-05, "loss": 2.5458, "step": 46500 }, { "epoch": 0.17560689462558834, "grad_norm": 3.142703056335449, "learning_rate": 9.811115659276677e-05, "loss": 2.5607, "step": 47000 }, { "epoch": 0.17747505307905206, "grad_norm": 3.609555959701538, "learning_rate": 9.807100185714202e-05, "loss": 2.5683, "step": 47500 }, { "epoch": 0.17934321153251576, "grad_norm": 3.200345277786255, "learning_rate": 9.803051471896693e-05, "loss": 2.5496, "step": 48000 }, { "epoch": 0.18121136998597948, "grad_norm": 3.56850266456604, "learning_rate": 9.798953325390536e-05, "loss": 2.5425, "step": 48500 }, { "epoch": 0.18307952843944317, "grad_norm": 3.4314849376678467, "learning_rate": 9.794813853757214e-05, "loss": 2.5238, "step": 49000 }, { "epoch": 0.1849476868929069, "grad_norm": 3.024343967437744, "learning_rate": 9.790633092642875e-05, "loss": 2.5786, "step": 49500 }, { "epoch": 0.18681584534637058, "grad_norm": 3.2595534324645996, "learning_rate": 9.786419563225273e-05, "loss": 2.5386, "step": 50000 }, { "epoch": 0.1886840037998343, "grad_norm": 3.6985089778900146, "learning_rate": 9.782156413906974e-05, "loss": 2.5338, "step": 50500 }, { "epoch": 0.190552162253298, "grad_norm": 2.9342880249023438, "learning_rate": 9.777852084104404e-05, "loss": 2.4992, "step": 51000 }, { "epoch": 0.1924203207067617, "grad_norm": 2.8690543174743652, "learning_rate": 9.773506610883352e-05, "loss": 2.571, "step": 51500 }, { "epoch": 0.1942884791602254, "grad_norm": 2.8353734016418457, "learning_rate": 9.769120031663902e-05, "loss": 2.4895, "step": 52000 }, { "epoch": 0.19615663761368912, "grad_norm": 3.6773738861083984, "learning_rate": 9.764692384220111e-05, "loss": 2.5121, "step": 52500 }, { "epoch": 0.19802479606715281, "grad_norm": 3.3569443225860596, "learning_rate": 9.760223706679688e-05, "loss": 2.527, "step": 53000 }, { "epoch": 0.19989295452061653, "grad_norm": 2.970712184906006, "learning_rate": 9.755714037523662e-05, "loss": 2.5337, "step": 53500 }, { "epoch": 0.20176111297408023, "grad_norm": 3.2004318237304688, "learning_rate": 9.751172557674817e-05, "loss": 2.5342, "step": 54000 }, { "epoch": 0.20362927142754395, "grad_norm": 3.16782546043396, "learning_rate": 9.746581103930153e-05, "loss": 2.524, "step": 54500 }, { "epoch": 0.20549742988100764, "grad_norm": 3.3260490894317627, "learning_rate": 9.741948776050147e-05, "loss": 2.4701, "step": 55000 }, { "epoch": 0.20736558833447136, "grad_norm": 3.6631577014923096, "learning_rate": 9.737275613925072e-05, "loss": 2.5314, "step": 55500 }, { "epoch": 0.20923374678793505, "grad_norm": 2.5733258724212646, "learning_rate": 9.732561657796828e-05, "loss": 2.5362, "step": 56000 }, { "epoch": 0.21110190524139877, "grad_norm": 3.8227956295013428, "learning_rate": 9.727816498322433e-05, "loss": 2.4807, "step": 56500 }, { "epoch": 0.21297006369486246, "grad_norm": 3.5182738304138184, "learning_rate": 9.723021157702207e-05, "loss": 2.5263, "step": 57000 }, { "epoch": 0.21483822214832618, "grad_norm": 3.405224084854126, "learning_rate": 9.71818514582792e-05, "loss": 2.5105, "step": 57500 }, { "epoch": 0.21670638060178987, "grad_norm": 2.988802671432495, "learning_rate": 9.713308504343815e-05, "loss": 2.5297, "step": 58000 }, { "epoch": 0.2185745390552536, "grad_norm": 2.3862366676330566, "learning_rate": 9.708391275244016e-05, "loss": 2.5006, "step": 58500 }, { "epoch": 0.22044269750871728, "grad_norm": 3.3643691539764404, "learning_rate": 9.703433500872156e-05, "loss": 2.5255, "step": 59000 }, { "epoch": 0.222310855962181, "grad_norm": 3.6664035320281982, "learning_rate": 9.698435223921016e-05, "loss": 2.4421, "step": 59500 }, { "epoch": 0.2241790144156447, "grad_norm": 3.3508718013763428, "learning_rate": 9.693396487432153e-05, "loss": 2.4893, "step": 60000 }, { "epoch": 0.22604717286910841, "grad_norm": 3.5202372074127197, "learning_rate": 9.688337731857194e-05, "loss": 2.505, "step": 60500 }, { "epoch": 0.2279153313225721, "grad_norm": 4.265177249908447, "learning_rate": 9.683218368212872e-05, "loss": 2.5134, "step": 61000 }, { "epoch": 0.22978348977603583, "grad_norm": 3.761479377746582, "learning_rate": 9.67805867606742e-05, "loss": 2.477, "step": 61500 }, { "epoch": 0.23165164822949952, "grad_norm": 3.254711866378784, "learning_rate": 9.67285869985239e-05, "loss": 2.4894, "step": 62000 }, { "epoch": 0.23351980668296324, "grad_norm": 3.4447569847106934, "learning_rate": 9.667629004906115e-05, "loss": 2.5338, "step": 62500 }, { "epoch": 0.23538796513642693, "grad_norm": 3.283677577972412, "learning_rate": 9.662348675576849e-05, "loss": 2.5028, "step": 63000 }, { "epoch": 0.23725612358989065, "grad_norm": 3.641008138656616, "learning_rate": 9.657028197461201e-05, "loss": 2.5102, "step": 63500 }, { "epoch": 0.23912428204335434, "grad_norm": 2.3239517211914062, "learning_rate": 9.651667616375301e-05, "loss": 2.4692, "step": 64000 }, { "epoch": 0.24099244049681806, "grad_norm": 2.590287446975708, "learning_rate": 9.646266978480605e-05, "loss": 2.4753, "step": 64500 }, { "epoch": 0.24286059895028175, "grad_norm": 3.5106756687164307, "learning_rate": 9.640826330283514e-05, "loss": 2.4541, "step": 65000 }, { "epoch": 0.24472875740374547, "grad_norm": 2.9911463260650635, "learning_rate": 9.635345718634972e-05, "loss": 2.5228, "step": 65500 }, { "epoch": 0.24659691585720916, "grad_norm": 3.7811479568481445, "learning_rate": 9.629825190730053e-05, "loss": 2.468, "step": 66000 }, { "epoch": 0.24846507431067288, "grad_norm": 3.073608875274658, "learning_rate": 9.624275954658023e-05, "loss": 2.5416, "step": 66500 }, { "epoch": 0.2503332327641366, "grad_norm": 2.943208932876587, "learning_rate": 9.618675816793752e-05, "loss": 2.4685, "step": 67000 }, { "epoch": 0.25220139121760027, "grad_norm": 2.2683610916137695, "learning_rate": 9.613047225704368e-05, "loss": 2.4953, "step": 67500 }, { "epoch": 0.254069549671064, "grad_norm": 3.0341203212738037, "learning_rate": 9.607367670392133e-05, "loss": 2.4601, "step": 68000 }, { "epoch": 0.2559377081245277, "grad_norm": 3.2594239711761475, "learning_rate": 9.60164843975031e-05, "loss": 2.4339, "step": 68500 }, { "epoch": 0.2578058665779914, "grad_norm": 3.045818328857422, "learning_rate": 9.595889583028791e-05, "loss": 2.4237, "step": 69000 }, { "epoch": 0.2596740250314551, "grad_norm": 3.0980165004730225, "learning_rate": 9.590091149818697e-05, "loss": 2.5111, "step": 69500 }, { "epoch": 0.2615421834849188, "grad_norm": 2.206389904022217, "learning_rate": 9.584253190051957e-05, "loss": 2.4885, "step": 70000 }, { "epoch": 0.26341034193838253, "grad_norm": 3.909090518951416, "learning_rate": 9.578387548236723e-05, "loss": 2.4945, "step": 70500 }, { "epoch": 0.26527850039184625, "grad_norm": 3.3355019092559814, "learning_rate": 9.572470765314143e-05, "loss": 2.4225, "step": 71000 }, { "epoch": 0.2671466588453099, "grad_norm": 2.9104554653167725, "learning_rate": 9.56651460756897e-05, "loss": 2.4666, "step": 71500 }, { "epoch": 0.26901481729877363, "grad_norm": 2.195571184158325, "learning_rate": 9.560519126291337e-05, "loss": 2.4738, "step": 72000 }, { "epoch": 0.27088297575223735, "grad_norm": 2.8600668907165527, "learning_rate": 9.554484373110011e-05, "loss": 2.3982, "step": 72500 }, { "epoch": 0.2727511342057011, "grad_norm": 2.985612630844116, "learning_rate": 9.54842258704496e-05, "loss": 2.4708, "step": 73000 }, { "epoch": 0.27461929265916474, "grad_norm": 2.609339475631714, "learning_rate": 9.542309524577655e-05, "loss": 2.4385, "step": 73500 }, { "epoch": 0.27648745111262846, "grad_norm": 2.9328203201293945, "learning_rate": 9.536157347014623e-05, "loss": 2.3942, "step": 74000 }, { "epoch": 0.2783556095660922, "grad_norm": 3.242722511291504, "learning_rate": 9.529966107333978e-05, "loss": 2.4568, "step": 74500 }, { "epoch": 0.2802237680195559, "grad_norm": 2.90252423286438, "learning_rate": 9.523735858850218e-05, "loss": 2.4495, "step": 75000 }, { "epoch": 0.2820919264730196, "grad_norm": 2.491132974624634, "learning_rate": 9.517466655213752e-05, "loss": 2.4401, "step": 75500 }, { "epoch": 0.2839600849264833, "grad_norm": 2.714989185333252, "learning_rate": 9.511171205407364e-05, "loss": 2.4607, "step": 76000 }, { "epoch": 0.285828243379947, "grad_norm": 3.1541576385498047, "learning_rate": 9.50482433139732e-05, "loss": 2.4522, "step": 76500 }, { "epoch": 0.2876964018334107, "grad_norm": 3.280564546585083, "learning_rate": 9.498438665087013e-05, "loss": 2.4696, "step": 77000 }, { "epoch": 0.28956456028687444, "grad_norm": 3.0421793460845947, "learning_rate": 9.492014261465201e-05, "loss": 2.482, "step": 77500 }, { "epoch": 0.2914327187403381, "grad_norm": 2.658756971359253, "learning_rate": 9.485551175854214e-05, "loss": 2.4464, "step": 78000 }, { "epoch": 0.2933008771938018, "grad_norm": 4.537105083465576, "learning_rate": 9.479049463909488e-05, "loss": 2.444, "step": 78500 }, { "epoch": 0.29516903564726554, "grad_norm": 2.9097115993499756, "learning_rate": 9.472509181619083e-05, "loss": 2.4631, "step": 79000 }, { "epoch": 0.29703719410072926, "grad_norm": 2.133843421936035, "learning_rate": 9.465943581295223e-05, "loss": 2.4159, "step": 79500 }, { "epoch": 0.2989053525541929, "grad_norm": 2.5699055194854736, "learning_rate": 9.459326404463687e-05, "loss": 2.4392, "step": 80000 }, { "epoch": 0.30077351100765665, "grad_norm": 2.927656412124634, "learning_rate": 9.452684176567582e-05, "loss": 2.4121, "step": 80500 }, { "epoch": 0.30264166946112037, "grad_norm": 3.3542892932891846, "learning_rate": 9.44599033266823e-05, "loss": 2.4138, "step": 81000 }, { "epoch": 0.3045098279145841, "grad_norm": 2.9518256187438965, "learning_rate": 9.439258203104611e-05, "loss": 2.4193, "step": 81500 }, { "epoch": 0.30637798636804775, "grad_norm": 2.9476184844970703, "learning_rate": 9.432487845848965e-05, "loss": 2.3944, "step": 82000 }, { "epoch": 0.30824614482151147, "grad_norm": 2.688512086868286, "learning_rate": 9.425679319202733e-05, "loss": 2.4331, "step": 82500 }, { "epoch": 0.3101143032749752, "grad_norm": 2.971700429916382, "learning_rate": 9.418832681796042e-05, "loss": 2.4513, "step": 83000 }, { "epoch": 0.3119824617284389, "grad_norm": 2.495612382888794, "learning_rate": 9.411947992587194e-05, "loss": 2.3972, "step": 83500 }, { "epoch": 0.3138506201819026, "grad_norm": 3.071038246154785, "learning_rate": 9.405025310862172e-05, "loss": 2.4309, "step": 84000 }, { "epoch": 0.3157187786353663, "grad_norm": 3.627650260925293, "learning_rate": 9.398064696234121e-05, "loss": 2.4297, "step": 84500 }, { "epoch": 0.31758693708883, "grad_norm": 2.077777147293091, "learning_rate": 9.391066208642838e-05, "loss": 2.4245, "step": 85000 }, { "epoch": 0.31945509554229373, "grad_norm": 3.0603654384613037, "learning_rate": 9.384044018651683e-05, "loss": 2.4145, "step": 85500 }, { "epoch": 0.3213232539957574, "grad_norm": 2.993283271789551, "learning_rate": 9.37697004170087e-05, "loss": 2.4095, "step": 86000 }, { "epoch": 0.3231914124492211, "grad_norm": 2.8521878719329834, "learning_rate": 9.369858373438785e-05, "loss": 2.3967, "step": 86500 }, { "epoch": 0.32505957090268484, "grad_norm": 3.297847032546997, "learning_rate": 9.362709075105988e-05, "loss": 2.4343, "step": 87000 }, { "epoch": 0.32692772935614856, "grad_norm": 2.3240292072296143, "learning_rate": 9.355522208267086e-05, "loss": 2.3947, "step": 87500 }, { "epoch": 0.3287958878096122, "grad_norm": 3.8041253089904785, "learning_rate": 9.348297834810195e-05, "loss": 2.4111, "step": 88000 }, { "epoch": 0.33066404626307594, "grad_norm": 2.6961183547973633, "learning_rate": 9.341036016946413e-05, "loss": 2.4159, "step": 88500 }, { "epoch": 0.33253220471653966, "grad_norm": 3.0299246311187744, "learning_rate": 9.33373681720928e-05, "loss": 2.4012, "step": 89000 }, { "epoch": 0.3344003631700034, "grad_norm": 2.75026273727417, "learning_rate": 9.326415008694199e-05, "loss": 2.3755, "step": 89500 }, { "epoch": 0.33626852162346704, "grad_norm": 2.4696195125579834, "learning_rate": 9.319056093086089e-05, "loss": 2.3953, "step": 90000 }, { "epoch": 0.33813668007693076, "grad_norm": 2.428610324859619, "learning_rate": 9.311645274788967e-05, "loss": 2.4433, "step": 90500 }, { "epoch": 0.3400048385303945, "grad_norm": 2.851217269897461, "learning_rate": 9.304197327710381e-05, "loss": 2.429, "step": 91000 }, { "epoch": 0.3418729969838582, "grad_norm": 3.0488922595977783, "learning_rate": 9.296712315986686e-05, "loss": 2.417, "step": 91500 }, { "epoch": 0.34374115543732187, "grad_norm": 2.7306880950927734, "learning_rate": 9.289190304073406e-05, "loss": 2.4539, "step": 92000 }, { "epoch": 0.3456093138907856, "grad_norm": 3.2483866214752197, "learning_rate": 9.281631356744687e-05, "loss": 2.3616, "step": 92500 }, { "epoch": 0.3474774723442493, "grad_norm": 2.66874098777771, "learning_rate": 9.274035539092736e-05, "loss": 2.3984, "step": 93000 }, { "epoch": 0.349345630797713, "grad_norm": 2.5911643505096436, "learning_rate": 9.266402916527259e-05, "loss": 2.4403, "step": 93500 }, { "epoch": 0.3512137892511767, "grad_norm": 3.084787607192993, "learning_rate": 9.258748930120269e-05, "loss": 2.3685, "step": 94000 }, { "epoch": 0.3530819477046404, "grad_norm": 3.077162742614746, "learning_rate": 9.251042968504211e-05, "loss": 2.4033, "step": 94500 }, { "epoch": 0.35495010615810413, "grad_norm": 2.7327165603637695, "learning_rate": 9.243300399970075e-05, "loss": 2.357, "step": 95000 }, { "epoch": 0.35681826461156785, "grad_norm": 2.942444324493408, "learning_rate": 9.235521291191276e-05, "loss": 2.4114, "step": 95500 }, { "epoch": 0.3586864230650315, "grad_norm": 2.504429817199707, "learning_rate": 9.227705709155896e-05, "loss": 2.3763, "step": 96000 }, { "epoch": 0.36055458151849523, "grad_norm": 3.322981119155884, "learning_rate": 9.219853721166094e-05, "loss": 2.4037, "step": 96500 }, { "epoch": 0.36242273997195895, "grad_norm": 2.8509936332702637, "learning_rate": 9.21196539483753e-05, "loss": 2.4089, "step": 97000 }, { "epoch": 0.36429089842542267, "grad_norm": 3.585662603378296, "learning_rate": 9.204040798098783e-05, "loss": 2.4132, "step": 97500 }, { "epoch": 0.36615905687888634, "grad_norm": 2.8213889598846436, "learning_rate": 9.196095956872841e-05, "loss": 2.3647, "step": 98000 }, { "epoch": 0.36802721533235006, "grad_norm": 3.3626108169555664, "learning_rate": 9.188099096546838e-05, "loss": 2.4143, "step": 98500 }, { "epoch": 0.3698953737858138, "grad_norm": 2.993591785430908, "learning_rate": 9.180066171330013e-05, "loss": 2.3806, "step": 99000 }, { "epoch": 0.3717635322392775, "grad_norm": 2.9788472652435303, "learning_rate": 9.171997250396128e-05, "loss": 2.3571, "step": 99500 }, { "epoch": 0.37363169069274116, "grad_norm": 2.3888766765594482, "learning_rate": 9.163908648731292e-05, "loss": 2.3841, "step": 100000 }, { "epoch": 0.3754998491462049, "grad_norm": 3.0424160957336426, "learning_rate": 9.155768016766876e-05, "loss": 2.4152, "step": 100500 }, { "epoch": 0.3773680075996686, "grad_norm": 2.592036724090576, "learning_rate": 9.147591598323593e-05, "loss": 2.3465, "step": 101000 }, { "epoch": 0.3792361660531323, "grad_norm": 2.8690261840820312, "learning_rate": 9.139379463810866e-05, "loss": 2.3974, "step": 101500 }, { "epoch": 0.381104324506596, "grad_norm": 2.7227180004119873, "learning_rate": 9.131148215032317e-05, "loss": 2.3688, "step": 102000 }, { "epoch": 0.3829724829600597, "grad_norm": 2.856623888015747, "learning_rate": 9.12286493191618e-05, "loss": 2.4341, "step": 102500 }, { "epoch": 0.3848406414135234, "grad_norm": 2.56028151512146, "learning_rate": 9.114546145658827e-05, "loss": 2.427, "step": 103000 }, { "epoch": 0.38670879986698714, "grad_norm": 3.3118507862091064, "learning_rate": 9.106208671644056e-05, "loss": 2.3166, "step": 103500 }, { "epoch": 0.3885769583204508, "grad_norm": 3.2025699615478516, "learning_rate": 9.097819164962692e-05, "loss": 2.4462, "step": 104000 }, { "epoch": 0.3904451167739145, "grad_norm": 3.240300416946411, "learning_rate": 9.089394370816208e-05, "loss": 2.4285, "step": 104500 }, { "epoch": 0.39231327522737824, "grad_norm": 3.5723962783813477, "learning_rate": 9.080934361752857e-05, "loss": 2.355, "step": 105000 }, { "epoch": 0.39418143368084196, "grad_norm": 3.186774253845215, "learning_rate": 9.072456235949608e-05, "loss": 2.4029, "step": 105500 }, { "epoch": 0.39604959213430563, "grad_norm": 2.629359006881714, "learning_rate": 9.063926085974259e-05, "loss": 2.3459, "step": 106000 }, { "epoch": 0.39791775058776935, "grad_norm": 3.2429652214050293, "learning_rate": 9.055360940396558e-05, "loss": 2.3847, "step": 106500 }, { "epoch": 0.39978590904123307, "grad_norm": 2.427645206451416, "learning_rate": 9.046760872973364e-05, "loss": 2.3435, "step": 107000 }, { "epoch": 0.4016540674946968, "grad_norm": 2.556652784347534, "learning_rate": 9.038143262321399e-05, "loss": 2.4121, "step": 107500 }, { "epoch": 0.40352222594816045, "grad_norm": 2.9563798904418945, "learning_rate": 9.029473643152501e-05, "loss": 2.3786, "step": 108000 }, { "epoch": 0.40539038440162417, "grad_norm": 2.457141876220703, "learning_rate": 9.020769325060857e-05, "loss": 2.3734, "step": 108500 }, { "epoch": 0.4072585428550879, "grad_norm": 2.489871025085449, "learning_rate": 9.012030383001778e-05, "loss": 2.3934, "step": 109000 }, { "epoch": 0.4091267013085516, "grad_norm": 2.9061882495880127, "learning_rate": 9.003256892228738e-05, "loss": 2.3507, "step": 109500 }, { "epoch": 0.4109948597620153, "grad_norm": 3.2263598442077637, "learning_rate": 8.994448928292711e-05, "loss": 2.3866, "step": 110000 }, { "epoch": 0.412863018215479, "grad_norm": 2.9006874561309814, "learning_rate": 8.985606567041537e-05, "loss": 2.3546, "step": 110500 }, { "epoch": 0.4147311766689427, "grad_norm": 2.51509428024292, "learning_rate": 8.976747672185874e-05, "loss": 2.3669, "step": 111000 }, { "epoch": 0.41659933512240643, "grad_norm": 2.6938908100128174, "learning_rate": 8.967836813445061e-05, "loss": 2.3485, "step": 111500 }, { "epoch": 0.4184674935758701, "grad_norm": 2.7218174934387207, "learning_rate": 8.958891786553452e-05, "loss": 2.3798, "step": 112000 }, { "epoch": 0.4203356520293338, "grad_norm": 3.0031161308288574, "learning_rate": 8.949912668539173e-05, "loss": 2.3501, "step": 112500 }, { "epoch": 0.42220381048279754, "grad_norm": 2.5878889560699463, "learning_rate": 8.940899536723916e-05, "loss": 2.3512, "step": 113000 }, { "epoch": 0.42407196893626126, "grad_norm": 2.7273967266082764, "learning_rate": 8.931852468722277e-05, "loss": 2.3394, "step": 113500 }, { "epoch": 0.4259401273897249, "grad_norm": 2.3990983963012695, "learning_rate": 8.922771542441081e-05, "loss": 2.3104, "step": 114000 }, { "epoch": 0.42780828584318864, "grad_norm": 3.0549476146698, "learning_rate": 8.913656836078725e-05, "loss": 2.3557, "step": 114500 }, { "epoch": 0.42967644429665236, "grad_norm": 2.417224168777466, "learning_rate": 8.904508428124488e-05, "loss": 2.32, "step": 115000 }, { "epoch": 0.4315446027501161, "grad_norm": 2.56392502784729, "learning_rate": 8.895363192352878e-05, "loss": 2.3651, "step": 115500 }, { "epoch": 0.43341276120357974, "grad_norm": 2.027083396911621, "learning_rate": 8.886147751859986e-05, "loss": 2.3277, "step": 116000 }, { "epoch": 0.43528091965704346, "grad_norm": 1.902034044265747, "learning_rate": 8.876898846663621e-05, "loss": 2.3185, "step": 116500 }, { "epoch": 0.4371490781105072, "grad_norm": 2.7564985752105713, "learning_rate": 8.867616556408684e-05, "loss": 2.3674, "step": 117000 }, { "epoch": 0.4390172365639709, "grad_norm": 3.024198532104492, "learning_rate": 8.858300961027575e-05, "loss": 2.3832, "step": 117500 }, { "epoch": 0.44088539501743457, "grad_norm": 2.2952866554260254, "learning_rate": 8.84895214073948e-05, "loss": 2.3799, "step": 118000 }, { "epoch": 0.4427535534708983, "grad_norm": 2.352498769760132, "learning_rate": 8.839570176049705e-05, "loss": 2.3958, "step": 118500 }, { "epoch": 0.444621711924362, "grad_norm": 3.565748453140259, "learning_rate": 8.830155147748969e-05, "loss": 2.3614, "step": 119000 }, { "epoch": 0.4464898703778257, "grad_norm": 3.0577287673950195, "learning_rate": 8.82072606579692e-05, "loss": 2.3458, "step": 119500 }, { "epoch": 0.4483580288312894, "grad_norm": 2.6253695487976074, "learning_rate": 8.81124521950556e-05, "loss": 2.3273, "step": 120000 }, { "epoch": 0.4502261872847531, "grad_norm": 2.1585161685943604, "learning_rate": 8.801731553517346e-05, "loss": 2.3298, "step": 120500 }, { "epoch": 0.45209434573821683, "grad_norm": 2.5908641815185547, "learning_rate": 8.792185149757116e-05, "loss": 2.323, "step": 121000 }, { "epoch": 0.45396250419168055, "grad_norm": 1.9700515270233154, "learning_rate": 8.78262528108574e-05, "loss": 2.3285, "step": 121500 }, { "epoch": 0.4558306626451442, "grad_norm": 2.0091867446899414, "learning_rate": 8.773013713746569e-05, "loss": 2.3353, "step": 122000 }, { "epoch": 0.45769882109860793, "grad_norm": 3.026522159576416, "learning_rate": 8.763369655932719e-05, "loss": 2.3478, "step": 122500 }, { "epoch": 0.45956697955207165, "grad_norm": 2.7834973335266113, "learning_rate": 8.753693190691863e-05, "loss": 2.3256, "step": 123000 }, { "epoch": 0.4614351380055354, "grad_norm": 3.004798173904419, "learning_rate": 8.743984401350747e-05, "loss": 2.3466, "step": 123500 }, { "epoch": 0.46330329645899904, "grad_norm": 2.611668586730957, "learning_rate": 8.734262885694443e-05, "loss": 2.3222, "step": 124000 }, { "epoch": 0.46517145491246276, "grad_norm": 2.902439594268799, "learning_rate": 8.72448976347505e-05, "loss": 2.3485, "step": 124500 }, { "epoch": 0.4670396133659265, "grad_norm": 2.932037353515625, "learning_rate": 8.714684568634262e-05, "loss": 2.3258, "step": 125000 }, { "epoch": 0.4689077718193902, "grad_norm": 2.526458263397217, "learning_rate": 8.70484738560735e-05, "loss": 2.3549, "step": 125500 }, { "epoch": 0.47077593027285386, "grad_norm": 2.8670670986175537, "learning_rate": 8.694978299105044e-05, "loss": 2.3685, "step": 126000 }, { "epoch": 0.4726440887263176, "grad_norm": 2.95123553276062, "learning_rate": 8.685077394112803e-05, "loss": 2.327, "step": 126500 }, { "epoch": 0.4745122471797813, "grad_norm": 3.010820150375366, "learning_rate": 8.675164652779493e-05, "loss": 2.3247, "step": 127000 }, { "epoch": 0.476380405633245, "grad_norm": 1.896767258644104, "learning_rate": 8.665200430068873e-05, "loss": 2.3158, "step": 127500 }, { "epoch": 0.4782485640867087, "grad_norm": 2.559565305709839, "learning_rate": 8.655204645293866e-05, "loss": 2.3425, "step": 128000 }, { "epoch": 0.4801167225401724, "grad_norm": 2.658048391342163, "learning_rate": 8.645177384530965e-05, "loss": 2.3565, "step": 128500 }, { "epoch": 0.4819848809936361, "grad_norm": 1.818748116493225, "learning_rate": 8.635118734127712e-05, "loss": 2.3441, "step": 129000 }, { "epoch": 0.48385303944709984, "grad_norm": 2.627014398574829, "learning_rate": 8.625028780701953e-05, "loss": 2.3296, "step": 129500 }, { "epoch": 0.4857211979005635, "grad_norm": 2.687391519546509, "learning_rate": 8.614907611141099e-05, "loss": 2.3334, "step": 130000 }, { "epoch": 0.4875893563540272, "grad_norm": 3.092353582382202, "learning_rate": 8.604755312601363e-05, "loss": 2.3278, "step": 130500 }, { "epoch": 0.48945751480749095, "grad_norm": 3.0431768894195557, "learning_rate": 8.59459237010844e-05, "loss": 2.299, "step": 131000 }, { "epoch": 0.49132567326095467, "grad_norm": 2.2302520275115967, "learning_rate": 8.584378137971116e-05, "loss": 2.2837, "step": 131500 }, { "epoch": 0.49319383171441833, "grad_norm": 2.7669031620025635, "learning_rate": 8.574133039752728e-05, "loss": 2.3202, "step": 132000 }, { "epoch": 0.49506199016788205, "grad_norm": 2.6957993507385254, "learning_rate": 8.563857163676681e-05, "loss": 2.3214, "step": 132500 }, { "epoch": 0.49693014862134577, "grad_norm": 2.662504196166992, "learning_rate": 8.553571241931346e-05, "loss": 2.2907, "step": 133000 }, { "epoch": 0.4987983070748095, "grad_norm": 2.6600215435028076, "learning_rate": 8.54323413698205e-05, "loss": 2.2866, "step": 133500 }, { "epoch": 0.5006664655282732, "grad_norm": 1.6196849346160889, "learning_rate": 8.532866520254174e-05, "loss": 2.3064, "step": 134000 }, { "epoch": 0.5025346239817369, "grad_norm": 2.3502981662750244, "learning_rate": 8.522468481026161e-05, "loss": 2.3447, "step": 134500 }, { "epoch": 0.5044027824352005, "grad_norm": 2.94901442527771, "learning_rate": 8.512040108838428e-05, "loss": 2.3602, "step": 135000 }, { "epoch": 0.5062709408886643, "grad_norm": 2.749366283416748, "learning_rate": 8.501581493492603e-05, "loss": 2.3389, "step": 135500 }, { "epoch": 0.508139099342128, "grad_norm": 3.2299070358276367, "learning_rate": 8.491113732620424e-05, "loss": 2.3348, "step": 136000 }, { "epoch": 0.5100072577955918, "grad_norm": 2.3727314472198486, "learning_rate": 8.480616028924504e-05, "loss": 2.2864, "step": 136500 }, { "epoch": 0.5118754162490554, "grad_norm": 1.8499844074249268, "learning_rate": 8.470067345222588e-05, "loss": 2.271, "step": 137000 }, { "epoch": 0.5137435747025191, "grad_norm": 3.1945462226867676, "learning_rate": 8.459488779801767e-05, "loss": 2.2967, "step": 137500 }, { "epoch": 0.5156117331559829, "grad_norm": 2.6457462310791016, "learning_rate": 8.448880423757021e-05, "loss": 2.2784, "step": 138000 }, { "epoch": 0.5174798916094465, "grad_norm": 2.016098976135254, "learning_rate": 8.438242368439869e-05, "loss": 2.3013, "step": 138500 }, { "epoch": 0.5193480500629102, "grad_norm": 1.97508704662323, "learning_rate": 8.42757470545757e-05, "loss": 2.3232, "step": 139000 }, { "epoch": 0.521216208516374, "grad_norm": 2.349184274673462, "learning_rate": 8.416877526672355e-05, "loss": 2.3266, "step": 139500 }, { "epoch": 0.5230843669698376, "grad_norm": 2.6522152423858643, "learning_rate": 8.406150924200616e-05, "loss": 2.2941, "step": 140000 }, { "epoch": 0.5249525254233014, "grad_norm": 3.5393903255462646, "learning_rate": 8.395394990412121e-05, "loss": 2.3459, "step": 140500 }, { "epoch": 0.5268206838767651, "grad_norm": 2.5476553440093994, "learning_rate": 8.38460981792922e-05, "loss": 2.2942, "step": 141000 }, { "epoch": 0.5286888423302287, "grad_norm": 2.8197927474975586, "learning_rate": 8.373817157288324e-05, "loss": 2.3426, "step": 141500 }, { "epoch": 0.5305570007836925, "grad_norm": 2.1316707134246826, "learning_rate": 8.362973844302275e-05, "loss": 2.2985, "step": 142000 }, { "epoch": 0.5324251592371562, "grad_norm": 1.9890694618225098, "learning_rate": 8.352101571809362e-05, "loss": 2.2896, "step": 142500 }, { "epoch": 0.5342933176906198, "grad_norm": 3.057724952697754, "learning_rate": 8.34120043343376e-05, "loss": 2.3079, "step": 143000 }, { "epoch": 0.5361614761440836, "grad_norm": 2.373011350631714, "learning_rate": 8.330270523048216e-05, "loss": 2.3294, "step": 143500 }, { "epoch": 0.5380296345975473, "grad_norm": 2.1205389499664307, "learning_rate": 8.31931193477324e-05, "loss": 2.2969, "step": 144000 }, { "epoch": 0.539897793051011, "grad_norm": 2.767277956008911, "learning_rate": 8.308324762976294e-05, "loss": 2.2901, "step": 144500 }, { "epoch": 0.5417659515044747, "grad_norm": 2.847618579864502, "learning_rate": 8.297309102270986e-05, "loss": 2.3128, "step": 145000 }, { "epoch": 0.5436341099579384, "grad_norm": 2.3643147945404053, "learning_rate": 8.286287163899844e-05, "loss": 2.2991, "step": 145500 }, { "epoch": 0.5455022684114021, "grad_norm": 3.874725103378296, "learning_rate": 8.275214866701926e-05, "loss": 2.2602, "step": 146000 }, { "epoch": 0.5473704268648658, "grad_norm": 2.4457411766052246, "learning_rate": 8.264114365714206e-05, "loss": 2.3038, "step": 146500 }, { "epoch": 0.5492385853183295, "grad_norm": 2.56156063079834, "learning_rate": 8.252985756526198e-05, "loss": 2.3193, "step": 147000 }, { "epoch": 0.5511067437717933, "grad_norm": 3.2425754070281982, "learning_rate": 8.241851476105105e-05, "loss": 2.294, "step": 147500 }, { "epoch": 0.5529749022252569, "grad_norm": 3.299207925796509, "learning_rate": 8.23066699398898e-05, "loss": 2.2933, "step": 148000 }, { "epoch": 0.5548430606787207, "grad_norm": 2.3422181606292725, "learning_rate": 8.219454691697226e-05, "loss": 2.3066, "step": 148500 }, { "epoch": 0.5567112191321844, "grad_norm": 2.9155092239379883, "learning_rate": 8.208214665782109e-05, "loss": 2.2698, "step": 149000 }, { "epoch": 0.558579377585648, "grad_norm": 3.0940420627593994, "learning_rate": 8.196969575847251e-05, "loss": 2.2787, "step": 149500 }, { "epoch": 0.5604475360391118, "grad_norm": 3.761610507965088, "learning_rate": 8.185674448258929e-05, "loss": 2.3008, "step": 150000 }, { "epoch": 0.5623156944925755, "grad_norm": 2.735173463821411, "learning_rate": 8.174374560372093e-05, "loss": 2.3122, "step": 150500 }, { "epoch": 0.5641838529460392, "grad_norm": 2.3430800437927246, "learning_rate": 8.163024719393988e-05, "loss": 2.2645, "step": 151000 }, { "epoch": 0.5660520113995029, "grad_norm": 2.489206314086914, "learning_rate": 8.151647640726769e-05, "loss": 2.2695, "step": 151500 }, { "epoch": 0.5679201698529666, "grad_norm": 3.2072606086730957, "learning_rate": 8.140243422341638e-05, "loss": 2.2641, "step": 152000 }, { "epoch": 0.5697883283064303, "grad_norm": 3.0480380058288574, "learning_rate": 8.128812162443502e-05, "loss": 2.3294, "step": 152500 }, { "epoch": 0.571656486759894, "grad_norm": 3.000128746032715, "learning_rate": 8.117353959470134e-05, "loss": 2.2637, "step": 153000 }, { "epoch": 0.5735246452133577, "grad_norm": 3.1820998191833496, "learning_rate": 8.105868912091317e-05, "loss": 2.2759, "step": 153500 }, { "epoch": 0.5753928036668214, "grad_norm": 2.6837666034698486, "learning_rate": 8.094357119208004e-05, "loss": 2.2549, "step": 154000 }, { "epoch": 0.5772609621202851, "grad_norm": 2.4082396030426025, "learning_rate": 8.082841783357048e-05, "loss": 2.3007, "step": 154500 }, { "epoch": 0.5791291205737489, "grad_norm": 2.461305618286133, "learning_rate": 8.0712768500827e-05, "loss": 2.2654, "step": 155000 }, { "epoch": 0.5809972790272125, "grad_norm": 2.9279286861419678, "learning_rate": 8.059708678275976e-05, "loss": 2.2669, "step": 155500 }, { "epoch": 0.5828654374806762, "grad_norm": 2.3760006427764893, "learning_rate": 8.048091002168906e-05, "loss": 2.2429, "step": 156000 }, { "epoch": 0.58473359593414, "grad_norm": 2.879556894302368, "learning_rate": 8.036447078099056e-05, "loss": 2.2694, "step": 156500 }, { "epoch": 0.5866017543876036, "grad_norm": 1.9433120489120483, "learning_rate": 8.024777006335506e-05, "loss": 2.243, "step": 157000 }, { "epoch": 0.5884699128410673, "grad_norm": 2.5363948345184326, "learning_rate": 8.013080887372506e-05, "loss": 2.267, "step": 157500 }, { "epoch": 0.5903380712945311, "grad_norm": 2.3004775047302246, "learning_rate": 8.001358821928599e-05, "loss": 2.2711, "step": 158000 }, { "epoch": 0.5922062297479948, "grad_norm": 2.1187326908111572, "learning_rate": 7.989610910945766e-05, "loss": 2.2733, "step": 158500 }, { "epoch": 0.5940743882014585, "grad_norm": 2.612976312637329, "learning_rate": 7.977860828524794e-05, "loss": 2.2617, "step": 159000 }, { "epoch": 0.5959425466549222, "grad_norm": 2.5254204273223877, "learning_rate": 7.96606158136407e-05, "loss": 2.2624, "step": 159500 }, { "epoch": 0.5978107051083859, "grad_norm": 2.352216958999634, "learning_rate": 7.954236792618814e-05, "loss": 2.2923, "step": 160000 }, { "epoch": 0.5996788635618496, "grad_norm": 2.5276451110839844, "learning_rate": 7.942386564115584e-05, "loss": 2.281, "step": 160500 }, { "epoch": 0.6015470220153133, "grad_norm": 2.3592355251312256, "learning_rate": 7.930510997900007e-05, "loss": 2.252, "step": 161000 }, { "epoch": 0.603415180468777, "grad_norm": 3.495464324951172, "learning_rate": 7.918610196235899e-05, "loss": 2.2379, "step": 161500 }, { "epoch": 0.6052833389222407, "grad_norm": 2.2157094478607178, "learning_rate": 7.906684261604388e-05, "loss": 2.2813, "step": 162000 }, { "epoch": 0.6071514973757044, "grad_norm": 3.170558452606201, "learning_rate": 7.894733296703025e-05, "loss": 2.2457, "step": 162500 }, { "epoch": 0.6090196558291682, "grad_norm": 3.1325762271881104, "learning_rate": 7.882781381038415e-05, "loss": 2.2531, "step": 163000 }, { "epoch": 0.6108878142826318, "grad_norm": 2.3855438232421875, "learning_rate": 7.87078071409669e-05, "loss": 2.2665, "step": 163500 }, { "epoch": 0.6127559727360955, "grad_norm": 2.261495351791382, "learning_rate": 7.858755326060588e-05, "loss": 2.2769, "step": 164000 }, { "epoch": 0.6146241311895593, "grad_norm": 3.212700128555298, "learning_rate": 7.846705320484082e-05, "loss": 2.2719, "step": 164500 }, { "epoch": 0.6164922896430229, "grad_norm": 2.875687837600708, "learning_rate": 7.83465497456751e-05, "loss": 2.2756, "step": 165000 }, { "epoch": 0.6183604480964866, "grad_norm": 3.213188886642456, "learning_rate": 7.822556094134869e-05, "loss": 2.2475, "step": 165500 }, { "epoch": 0.6202286065499504, "grad_norm": 2.9114816188812256, "learning_rate": 7.81043290788352e-05, "loss": 2.2411, "step": 166000 }, { "epoch": 0.622096765003414, "grad_norm": 2.960690498352051, "learning_rate": 7.798285520209603e-05, "loss": 2.2823, "step": 166500 }, { "epoch": 0.6239649234568778, "grad_norm": 2.9522547721862793, "learning_rate": 7.786138402665644e-05, "loss": 2.2186, "step": 167000 }, { "epoch": 0.6258330819103415, "grad_norm": 2.8541057109832764, "learning_rate": 7.773942974047013e-05, "loss": 2.2735, "step": 167500 }, { "epoch": 0.6277012403638051, "grad_norm": 2.182999849319458, "learning_rate": 7.761723658230827e-05, "loss": 2.2556, "step": 168000 }, { "epoch": 0.6295693988172689, "grad_norm": 2.0711419582366943, "learning_rate": 7.749480560441025e-05, "loss": 2.2949, "step": 168500 }, { "epoch": 0.6314375572707326, "grad_norm": 2.7931690216064453, "learning_rate": 7.737238343214024e-05, "loss": 2.2579, "step": 169000 }, { "epoch": 0.6333057157241962, "grad_norm": 2.2357709407806396, "learning_rate": 7.724948045003347e-05, "loss": 2.2145, "step": 169500 }, { "epoch": 0.63517387417766, "grad_norm": 2.4123311042785645, "learning_rate": 7.712634281504125e-05, "loss": 2.2908, "step": 170000 }, { "epoch": 0.6370420326311237, "grad_norm": 3.390855312347412, "learning_rate": 7.700321856241075e-05, "loss": 2.1975, "step": 170500 }, { "epoch": 0.6389101910845875, "grad_norm": 2.8016293048858643, "learning_rate": 7.687961526877562e-05, "loss": 2.2842, "step": 171000 }, { "epoch": 0.6407783495380511, "grad_norm": 2.734112501144409, "learning_rate": 7.675578050726744e-05, "loss": 2.2881, "step": 171500 }, { "epoch": 0.6426465079915148, "grad_norm": 2.7221627235412598, "learning_rate": 7.66317153442619e-05, "loss": 2.2748, "step": 172000 }, { "epoch": 0.6445146664449786, "grad_norm": 2.9320507049560547, "learning_rate": 7.650766966527448e-05, "loss": 2.2157, "step": 172500 }, { "epoch": 0.6463828248984422, "grad_norm": 2.428924798965454, "learning_rate": 7.638314736178451e-05, "loss": 2.2613, "step": 173000 }, { "epoch": 0.6482509833519059, "grad_norm": 2.5038206577301025, "learning_rate": 7.62583978656453e-05, "loss": 2.2606, "step": 173500 }, { "epoch": 0.6501191418053697, "grad_norm": 2.3970868587493896, "learning_rate": 7.613342225110954e-05, "loss": 2.2383, "step": 174000 }, { "epoch": 0.6519873002588333, "grad_norm": 2.124425172805786, "learning_rate": 7.60082215943772e-05, "loss": 2.2513, "step": 174500 }, { "epoch": 0.6538554587122971, "grad_norm": 3.180497884750366, "learning_rate": 7.58830480456262e-05, "loss": 2.2722, "step": 175000 }, { "epoch": 0.6557236171657608, "grad_norm": 2.8902299404144287, "learning_rate": 7.575740098553152e-05, "loss": 2.2439, "step": 175500 }, { "epoch": 0.6575917756192244, "grad_norm": 2.987680196762085, "learning_rate": 7.563153212126435e-05, "loss": 2.233, "step": 176000 }, { "epoch": 0.6594599340726882, "grad_norm": 2.5328335762023926, "learning_rate": 7.550544253671663e-05, "loss": 2.2434, "step": 176500 }, { "epoch": 0.6613280925261519, "grad_norm": 2.5823991298675537, "learning_rate": 7.537913331768098e-05, "loss": 2.2261, "step": 177000 }, { "epoch": 0.6631962509796155, "grad_norm": 3.252668619155884, "learning_rate": 7.525260555184135e-05, "loss": 2.2626, "step": 177500 }, { "epoch": 0.6650644094330793, "grad_norm": 2.427614688873291, "learning_rate": 7.512586032876367e-05, "loss": 2.2249, "step": 178000 }, { "epoch": 0.666932567886543, "grad_norm": 2.6210880279541016, "learning_rate": 7.49988987398865e-05, "loss": 2.2602, "step": 178500 }, { "epoch": 0.6688007263400068, "grad_norm": 2.7572479248046875, "learning_rate": 7.487223101332892e-05, "loss": 2.2325, "step": 179000 }, { "epoch": 0.6706688847934704, "grad_norm": 3.2144672870635986, "learning_rate": 7.474484082913688e-05, "loss": 2.2835, "step": 179500 }, { "epoch": 0.6725370432469341, "grad_norm": 2.4524009227752686, "learning_rate": 7.461723756021062e-05, "loss": 2.274, "step": 180000 }, { "epoch": 0.6744052017003979, "grad_norm": 2.676546335220337, "learning_rate": 7.44894223053775e-05, "loss": 2.2941, "step": 180500 }, { "epoch": 0.6762733601538615, "grad_norm": 3.0090246200561523, "learning_rate": 7.43613961652904e-05, "loss": 2.2545, "step": 181000 }, { "epoch": 0.6781415186073252, "grad_norm": 2.6397953033447266, "learning_rate": 7.423316024241814e-05, "loss": 2.2541, "step": 181500 }, { "epoch": 0.680009677060789, "grad_norm": 3.0165371894836426, "learning_rate": 7.410471564103606e-05, "loss": 2.2319, "step": 182000 }, { "epoch": 0.6818778355142526, "grad_norm": 2.1070499420166016, "learning_rate": 7.39760634672165e-05, "loss": 2.2617, "step": 182500 }, { "epoch": 0.6837459939677164, "grad_norm": 2.777233123779297, "learning_rate": 7.384746275141047e-05, "loss": 2.2206, "step": 183000 }, { "epoch": 0.6856141524211801, "grad_norm": 2.188089370727539, "learning_rate": 7.371839916767453e-05, "loss": 2.2428, "step": 183500 }, { "epoch": 0.6874823108746437, "grad_norm": 2.427400827407837, "learning_rate": 7.358913133818016e-05, "loss": 2.2161, "step": 184000 }, { "epoch": 0.6893504693281075, "grad_norm": 2.542616605758667, "learning_rate": 7.34596603760887e-05, "loss": 2.266, "step": 184500 }, { "epoch": 0.6912186277815712, "grad_norm": 2.6249241828918457, "learning_rate": 7.333024694314207e-05, "loss": 2.2383, "step": 185000 }, { "epoch": 0.6930867862350348, "grad_norm": 2.5798895359039307, "learning_rate": 7.320037346301442e-05, "loss": 2.2524, "step": 185500 }, { "epoch": 0.6949549446884986, "grad_norm": 2.9020352363586426, "learning_rate": 7.307030019799232e-05, "loss": 2.2251, "step": 186000 }, { "epoch": 0.6968231031419623, "grad_norm": 3.3277840614318848, "learning_rate": 7.294002826817298e-05, "loss": 2.2608, "step": 186500 }, { "epoch": 0.698691261595426, "grad_norm": 2.6658146381378174, "learning_rate": 7.280955879536435e-05, "loss": 2.2689, "step": 187000 }, { "epoch": 0.7005594200488897, "grad_norm": 2.736542224884033, "learning_rate": 7.267915443013911e-05, "loss": 2.2004, "step": 187500 }, { "epoch": 0.7024275785023534, "grad_norm": 2.440765619277954, "learning_rate": 7.254829363303503e-05, "loss": 2.2541, "step": 188000 }, { "epoch": 0.7042957369558172, "grad_norm": 2.6804561614990234, "learning_rate": 7.241723866627799e-05, "loss": 2.2647, "step": 188500 }, { "epoch": 0.7061638954092808, "grad_norm": 2.6702585220336914, "learning_rate": 7.228599065841891e-05, "loss": 2.2004, "step": 189000 }, { "epoch": 0.7080320538627445, "grad_norm": 2.5987019538879395, "learning_rate": 7.215481381028357e-05, "loss": 2.2509, "step": 189500 }, { "epoch": 0.7099002123162083, "grad_norm": 2.9680731296539307, "learning_rate": 7.20231834929401e-05, "loss": 2.2262, "step": 190000 }, { "epoch": 0.7117683707696719, "grad_norm": 3.8419201374053955, "learning_rate": 7.189136352781376e-05, "loss": 2.2313, "step": 190500 }, { "epoch": 0.7136365292231357, "grad_norm": 2.6179468631744385, "learning_rate": 7.175935505004304e-05, "loss": 2.2466, "step": 191000 }, { "epoch": 0.7155046876765994, "grad_norm": 1.9412791728973389, "learning_rate": 7.162742377434187e-05, "loss": 2.2336, "step": 191500 }, { "epoch": 0.717372846130063, "grad_norm": 2.312648057937622, "learning_rate": 7.149504205451939e-05, "loss": 2.2124, "step": 192000 }, { "epoch": 0.7192410045835268, "grad_norm": 2.4080445766448975, "learning_rate": 7.136247523488743e-05, "loss": 2.2103, "step": 192500 }, { "epoch": 0.7211091630369905, "grad_norm": 3.0859153270721436, "learning_rate": 7.122972445701587e-05, "loss": 2.1961, "step": 193000 }, { "epoch": 0.7229773214904541, "grad_norm": 3.438227415084839, "learning_rate": 7.10970569129335e-05, "loss": 2.2128, "step": 193500 }, { "epoch": 0.7248454799439179, "grad_norm": 2.6577913761138916, "learning_rate": 7.096394201181632e-05, "loss": 2.2254, "step": 194000 }, { "epoch": 0.7267136383973816, "grad_norm": 2.579580068588257, "learning_rate": 7.083064658434042e-05, "loss": 2.2562, "step": 194500 }, { "epoch": 0.7285817968508453, "grad_norm": 2.957392454147339, "learning_rate": 7.069717177834997e-05, "loss": 2.2762, "step": 195000 }, { "epoch": 0.730449955304309, "grad_norm": 1.9975017309188843, "learning_rate": 7.056378622641193e-05, "loss": 2.2385, "step": 195500 }, { "epoch": 0.7323181137577727, "grad_norm": 3.1538219451904297, "learning_rate": 7.042995646610036e-05, "loss": 2.2086, "step": 196000 }, { "epoch": 0.7341862722112364, "grad_norm": 2.2817578315734863, "learning_rate": 7.02959507777287e-05, "loss": 2.2153, "step": 196500 }, { "epoch": 0.7360544306647001, "grad_norm": 2.5474236011505127, "learning_rate": 7.016177031525738e-05, "loss": 2.2388, "step": 197000 }, { "epoch": 0.7379225891181638, "grad_norm": 2.5271482467651367, "learning_rate": 7.002795399479169e-05, "loss": 2.2344, "step": 197500 }, { "epoch": 0.7397907475716275, "grad_norm": 1.9711894989013672, "learning_rate": 6.989342813955246e-05, "loss": 2.1875, "step": 198000 }, { "epoch": 0.7416589060250912, "grad_norm": 2.832296133041382, "learning_rate": 6.97587309764484e-05, "loss": 2.2378, "step": 198500 }, { "epoch": 0.743527064478555, "grad_norm": 3.224106788635254, "learning_rate": 6.962386366539439e-05, "loss": 2.1749, "step": 199000 }, { "epoch": 0.7453952229320187, "grad_norm": 2.2426908016204834, "learning_rate": 6.948882736777054e-05, "loss": 2.1997, "step": 199500 }, { "epoch": 0.7472633813854823, "grad_norm": 2.7945656776428223, "learning_rate": 6.935362324641206e-05, "loss": 2.2217, "step": 200000 }, { "epoch": 0.7491315398389461, "grad_norm": 2.7567574977874756, "learning_rate": 6.921825246559942e-05, "loss": 2.2296, "step": 200500 }, { "epoch": 0.7509996982924098, "grad_norm": 2.5919723510742188, "learning_rate": 6.908298742798458e-05, "loss": 2.2364, "step": 201000 }, { "epoch": 0.7528678567458734, "grad_norm": 2.993880271911621, "learning_rate": 6.894728715432299e-05, "loss": 2.2065, "step": 201500 }, { "epoch": 0.7547360151993372, "grad_norm": 2.4301109313964844, "learning_rate": 6.881142372028077e-05, "loss": 2.2457, "step": 202000 }, { "epoch": 0.7566041736528009, "grad_norm": 2.623084783554077, "learning_rate": 6.867539829581595e-05, "loss": 2.1742, "step": 202500 }, { "epoch": 0.7584723321062646, "grad_norm": 3.4304981231689453, "learning_rate": 6.853921205228139e-05, "loss": 2.2292, "step": 203000 }, { "epoch": 0.7603404905597283, "grad_norm": 1.7889618873596191, "learning_rate": 6.84028661624149e-05, "loss": 2.217, "step": 203500 }, { "epoch": 0.762208649013192, "grad_norm": 2.954709053039551, "learning_rate": 6.8266361800329e-05, "loss": 2.2491, "step": 204000 }, { "epoch": 0.7640768074666557, "grad_norm": 2.892221212387085, "learning_rate": 6.812970014150086e-05, "loss": 2.2431, "step": 204500 }, { "epoch": 0.7659449659201194, "grad_norm": 1.9717577695846558, "learning_rate": 6.799315615334446e-05, "loss": 2.2397, "step": 205000 }, { "epoch": 0.7678131243735831, "grad_norm": 2.904269218444824, "learning_rate": 6.785618374157811e-05, "loss": 2.1972, "step": 205500 }, { "epoch": 0.7696812828270468, "grad_norm": 3.807295083999634, "learning_rate": 6.771933197025247e-05, "loss": 2.2292, "step": 206000 }, { "epoch": 0.7715494412805105, "grad_norm": 3.4538333415985107, "learning_rate": 6.758205351413722e-05, "loss": 2.1935, "step": 206500 }, { "epoch": 0.7734175997339743, "grad_norm": 2.769444227218628, "learning_rate": 6.744462365404948e-05, "loss": 2.1709, "step": 207000 }, { "epoch": 0.775285758187438, "grad_norm": 3.002584934234619, "learning_rate": 6.730704357343616e-05, "loss": 2.1863, "step": 207500 }, { "epoch": 0.7771539166409016, "grad_norm": 2.559108257293701, "learning_rate": 6.716959006322012e-05, "loss": 2.2118, "step": 208000 }, { "epoch": 0.7790220750943654, "grad_norm": 3.1521153450012207, "learning_rate": 6.703171339157552e-05, "loss": 2.19, "step": 208500 }, { "epoch": 0.780890233547829, "grad_norm": 2.7111008167266846, "learning_rate": 6.689369005509088e-05, "loss": 2.2044, "step": 209000 }, { "epoch": 0.7827583920012927, "grad_norm": 2.8580000400543213, "learning_rate": 6.675552124232371e-05, "loss": 2.2458, "step": 209500 }, { "epoch": 0.7846265504547565, "grad_norm": 2.7248494625091553, "learning_rate": 6.661720814308425e-05, "loss": 2.2096, "step": 210000 }, { "epoch": 0.7864947089082202, "grad_norm": 3.5847723484039307, "learning_rate": 6.647875194842521e-05, "loss": 2.2238, "step": 210500 }, { "epoch": 0.7883628673616839, "grad_norm": 3.013185977935791, "learning_rate": 6.634015385063155e-05, "loss": 2.2128, "step": 211000 }, { "epoch": 0.7902310258151476, "grad_norm": 3.160470962524414, "learning_rate": 6.620141504321021e-05, "loss": 2.2604, "step": 211500 }, { "epoch": 0.7920991842686113, "grad_norm": 3.009772300720215, "learning_rate": 6.606281461596562e-05, "loss": 2.2169, "step": 212000 }, { "epoch": 0.793967342722075, "grad_norm": 2.7089791297912598, "learning_rate": 6.592379825008977e-05, "loss": 2.1894, "step": 212500 }, { "epoch": 0.7958355011755387, "grad_norm": 2.2874131202697754, "learning_rate": 6.578492320297462e-05, "loss": 2.2472, "step": 213000 }, { "epoch": 0.7977036596290024, "grad_norm": 3.115208864212036, "learning_rate": 6.564563405749691e-05, "loss": 2.1696, "step": 213500 }, { "epoch": 0.7995718180824661, "grad_norm": 3.074309825897217, "learning_rate": 6.550621018309538e-05, "loss": 2.2022, "step": 214000 }, { "epoch": 0.8014399765359298, "grad_norm": 2.6160593032836914, "learning_rate": 6.536665278038796e-05, "loss": 2.2136, "step": 214500 }, { "epoch": 0.8033081349893936, "grad_norm": 2.875887155532837, "learning_rate": 6.522696305114238e-05, "loss": 2.222, "step": 215000 }, { "epoch": 0.8051762934428572, "grad_norm": 1.9582101106643677, "learning_rate": 6.508714219826595e-05, "loss": 2.1975, "step": 215500 }, { "epoch": 0.8070444518963209, "grad_norm": 3.11397647857666, "learning_rate": 6.494719142579506e-05, "loss": 2.2285, "step": 216000 }, { "epoch": 0.8089126103497847, "grad_norm": 2.7110836505889893, "learning_rate": 6.480711193888488e-05, "loss": 2.1638, "step": 216500 }, { "epoch": 0.8107807688032483, "grad_norm": 2.2085702419281006, "learning_rate": 6.4666904943799e-05, "loss": 2.2144, "step": 217000 }, { "epoch": 0.812648927256712, "grad_norm": 3.44262957572937, "learning_rate": 6.452657164789899e-05, "loss": 2.2248, "step": 217500 }, { "epoch": 0.8145170857101758, "grad_norm": 2.770791530609131, "learning_rate": 6.438639430044904e-05, "loss": 2.1861, "step": 218000 }, { "epoch": 0.8163852441636394, "grad_norm": 3.2068679332733154, "learning_rate": 6.424581227590346e-05, "loss": 2.1691, "step": 218500 }, { "epoch": 0.8182534026171032, "grad_norm": 3.264312744140625, "learning_rate": 6.410510757669032e-05, "loss": 2.159, "step": 219000 }, { "epoch": 0.8201215610705669, "grad_norm": 3.264051675796509, "learning_rate": 6.396428141445709e-05, "loss": 2.1775, "step": 219500 }, { "epoch": 0.8219897195240305, "grad_norm": 2.961418867111206, "learning_rate": 6.382333500189714e-05, "loss": 2.1851, "step": 220000 }, { "epoch": 0.8238578779774943, "grad_norm": 4.034390449523926, "learning_rate": 6.368226955273941e-05, "loss": 2.1552, "step": 220500 }, { "epoch": 0.825726036430958, "grad_norm": 2.0030012130737305, "learning_rate": 6.354136876505816e-05, "loss": 2.1762, "step": 221000 }, { "epoch": 0.8275941948844217, "grad_norm": 2.7552449703216553, "learning_rate": 6.340006911997954e-05, "loss": 2.1758, "step": 221500 }, { "epoch": 0.8294623533378854, "grad_norm": 2.4928476810455322, "learning_rate": 6.325865408316381e-05, "loss": 2.1951, "step": 222000 }, { "epoch": 0.8313305117913491, "grad_norm": 2.8218753337860107, "learning_rate": 6.311712487237538e-05, "loss": 2.1348, "step": 222500 }, { "epoch": 0.8331986702448129, "grad_norm": 3.4085326194763184, "learning_rate": 6.297548270636179e-05, "loss": 2.2058, "step": 223000 }, { "epoch": 0.8350668286982765, "grad_norm": 3.3644134998321533, "learning_rate": 6.283372880484332e-05, "loss": 2.1574, "step": 223500 }, { "epoch": 0.8369349871517402, "grad_norm": 3.0675761699676514, "learning_rate": 6.269186438850234e-05, "loss": 2.1725, "step": 224000 }, { "epoch": 0.838803145605204, "grad_norm": 2.6877012252807617, "learning_rate": 6.2549890678973e-05, "loss": 2.1889, "step": 224500 }, { "epoch": 0.8406713040586676, "grad_norm": 3.4169256687164307, "learning_rate": 6.240837743960651e-05, "loss": 2.1423, "step": 225000 }, { "epoch": 0.8425394625121313, "grad_norm": 3.0024383068084717, "learning_rate": 6.22661892373068e-05, "loss": 2.178, "step": 225500 }, { "epoch": 0.8444076209655951, "grad_norm": 3.079028606414795, "learning_rate": 6.212389540742632e-05, "loss": 2.2295, "step": 226000 }, { "epoch": 0.8462757794190587, "grad_norm": 2.90077805519104, "learning_rate": 6.198149717529692e-05, "loss": 2.1684, "step": 226500 }, { "epoch": 0.8481439378725225, "grad_norm": 3.053629159927368, "learning_rate": 6.18389957671496e-05, "loss": 2.1738, "step": 227000 }, { "epoch": 0.8500120963259862, "grad_norm": 3.0925843715667725, "learning_rate": 6.16963924101038e-05, "loss": 2.1551, "step": 227500 }, { "epoch": 0.8518802547794498, "grad_norm": 3.0221009254455566, "learning_rate": 6.155368833215677e-05, "loss": 2.1966, "step": 228000 }, { "epoch": 0.8537484132329136, "grad_norm": 2.5803329944610596, "learning_rate": 6.141088476217323e-05, "loss": 2.164, "step": 228500 }, { "epoch": 0.8556165716863773, "grad_norm": 3.4956555366516113, "learning_rate": 6.126826883078718e-05, "loss": 2.1776, "step": 229000 }, { "epoch": 0.8574847301398411, "grad_norm": 2.8954169750213623, "learning_rate": 6.112527015957583e-05, "loss": 2.1944, "step": 229500 }, { "epoch": 0.8593528885933047, "grad_norm": 3.2150614261627197, "learning_rate": 6.0982175685556475e-05, "loss": 2.1942, "step": 230000 }, { "epoch": 0.8612210470467684, "grad_norm": 2.8969147205352783, "learning_rate": 6.083898664095558e-05, "loss": 2.152, "step": 230500 }, { "epoch": 0.8630892055002322, "grad_norm": 2.898751974105835, "learning_rate": 6.069599091590918e-05, "loss": 2.1624, "step": 231000 }, { "epoch": 0.8649573639536958, "grad_norm": 3.5042660236358643, "learning_rate": 6.05529034527542e-05, "loss": 2.1428, "step": 231500 }, { "epoch": 0.8668255224071595, "grad_norm": 3.0192151069641113, "learning_rate": 6.040943845887397e-05, "loss": 2.1942, "step": 232000 }, { "epoch": 0.8686936808606233, "grad_norm": 3.0444955825805664, "learning_rate": 6.026588382641243e-05, "loss": 2.1533, "step": 232500 }, { "epoch": 0.8705618393140869, "grad_norm": 3.1138992309570312, "learning_rate": 6.012224079155855e-05, "loss": 2.1841, "step": 233000 }, { "epoch": 0.8724299977675507, "grad_norm": 2.3980443477630615, "learning_rate": 5.997879813783181e-05, "loss": 2.1724, "step": 233500 }, { "epoch": 0.8742981562210144, "grad_norm": 2.9543912410736084, "learning_rate": 5.9834982180414524e-05, "loss": 2.1502, "step": 234000 }, { "epoch": 0.876166314674478, "grad_norm": 2.555027961730957, "learning_rate": 5.969108153121932e-05, "loss": 2.1499, "step": 234500 }, { "epoch": 0.8780344731279418, "grad_norm": 2.4806180000305176, "learning_rate": 5.954709742941489e-05, "loss": 2.1733, "step": 235000 }, { "epoch": 0.8799026315814055, "grad_norm": 2.855769634246826, "learning_rate": 5.9403031114888505e-05, "loss": 2.1783, "step": 235500 }, { "epoch": 0.8817707900348691, "grad_norm": 2.85447359085083, "learning_rate": 5.9258883828235466e-05, "loss": 2.1684, "step": 236000 }, { "epoch": 0.8836389484883329, "grad_norm": 3.5129261016845703, "learning_rate": 5.911494534352925e-05, "loss": 2.1825, "step": 236500 }, { "epoch": 0.8855071069417966, "grad_norm": 3.9751412868499756, "learning_rate": 5.8970639992924826e-05, "loss": 2.1827, "step": 237000 }, { "epoch": 0.8873752653952603, "grad_norm": 3.1551120281219482, "learning_rate": 5.882625739363443e-05, "loss": 2.2232, "step": 237500 }, { "epoch": 0.889243423848724, "grad_norm": 3.2931878566741943, "learning_rate": 5.868179878897693e-05, "loss": 2.1291, "step": 238000 }, { "epoch": 0.8911115823021877, "grad_norm": 3.2662160396575928, "learning_rate": 5.853726542292572e-05, "loss": 2.1776, "step": 238500 }, { "epoch": 0.8929797407556515, "grad_norm": 2.764841079711914, "learning_rate": 5.8392658540097975e-05, "loss": 2.1069, "step": 239000 }, { "epoch": 0.8948478992091151, "grad_norm": 1.903836965560913, "learning_rate": 5.8247979385743945e-05, "loss": 2.1436, "step": 239500 }, { "epoch": 0.8967160576625788, "grad_norm": 2.859905481338501, "learning_rate": 5.8103229205736235e-05, "loss": 2.1426, "step": 240000 }, { "epoch": 0.8985842161160426, "grad_norm": 3.1984663009643555, "learning_rate": 5.79586989552882e-05, "loss": 2.1798, "step": 240500 }, { "epoch": 0.9004523745695062, "grad_norm": 2.157151222229004, "learning_rate": 5.781381059984584e-05, "loss": 2.1766, "step": 241000 }, { "epoch": 0.90232053302297, "grad_norm": 3.674839973449707, "learning_rate": 5.7668854957498444e-05, "loss": 2.1925, "step": 241500 }, { "epoch": 0.9041886914764337, "grad_norm": 2.9118549823760986, "learning_rate": 5.752383327649953e-05, "loss": 2.1655, "step": 242000 }, { "epoch": 0.9060568499298973, "grad_norm": 3.0006792545318604, "learning_rate": 5.737903704244284e-05, "loss": 2.1639, "step": 242500 }, { "epoch": 0.9079250083833611, "grad_norm": 3.3966879844665527, "learning_rate": 5.723388715699902e-05, "loss": 2.1106, "step": 243000 }, { "epoch": 0.9097931668368248, "grad_norm": 3.6091904640197754, "learning_rate": 5.708896546422721e-05, "loss": 2.1847, "step": 243500 }, { "epoch": 0.9116613252902884, "grad_norm": 2.7571775913238525, "learning_rate": 5.694369236403816e-05, "loss": 2.1453, "step": 244000 }, { "epoch": 0.9135294837437522, "grad_norm": 3.4625306129455566, "learning_rate": 5.6798359469775195e-05, "loss": 2.1599, "step": 244500 }, { "epoch": 0.9153976421972159, "grad_norm": 2.573812246322632, "learning_rate": 5.665296803294042e-05, "loss": 2.1393, "step": 245000 }, { "epoch": 0.9172658006506796, "grad_norm": 2.3979828357696533, "learning_rate": 5.650751930554011e-05, "loss": 2.1714, "step": 245500 }, { "epoch": 0.9191339591041433, "grad_norm": 3.1871445178985596, "learning_rate": 5.6362014540073884e-05, "loss": 2.1164, "step": 246000 }, { "epoch": 0.921002117557607, "grad_norm": 2.8169736862182617, "learning_rate": 5.6216454989523906e-05, "loss": 2.1343, "step": 246500 }, { "epoch": 0.9228702760110707, "grad_norm": 3.2970011234283447, "learning_rate": 5.607113318609965e-05, "loss": 2.1403, "step": 247000 }, { "epoch": 0.9247384344645344, "grad_norm": 2.7862350940704346, "learning_rate": 5.5925467929508655e-05, "loss": 2.148, "step": 247500 }, { "epoch": 0.9266065929179981, "grad_norm": 2.888575553894043, "learning_rate": 5.5779751647058663e-05, "loss": 2.184, "step": 248000 }, { "epoch": 0.9284747513714618, "grad_norm": 2.52675199508667, "learning_rate": 5.56339855935533e-05, "loss": 2.078, "step": 248500 }, { "epoch": 0.9303429098249255, "grad_norm": 2.9500951766967773, "learning_rate": 5.54881710242247e-05, "loss": 2.1206, "step": 249000 }, { "epoch": 0.9322110682783893, "grad_norm": 2.5412566661834717, "learning_rate": 5.5342309194722885e-05, "loss": 2.1395, "step": 249500 }, { "epoch": 0.934079226731853, "grad_norm": 2.3108468055725098, "learning_rate": 5.519640136110478e-05, "loss": 2.1498, "step": 250000 }, { "epoch": 0.9359473851853166, "grad_norm": 2.373042345046997, "learning_rate": 5.505044877982351e-05, "loss": 2.1532, "step": 250500 }, { "epoch": 0.9378155436387804, "grad_norm": 2.997445821762085, "learning_rate": 5.490474474242996e-05, "loss": 2.1451, "step": 251000 }, { "epoch": 0.939683702092244, "grad_norm": 2.837625741958618, "learning_rate": 5.4758706519924406e-05, "loss": 2.1425, "step": 251500 }, { "epoch": 0.9415518605457077, "grad_norm": 2.954401731491089, "learning_rate": 5.461262731886816e-05, "loss": 2.1568, "step": 252000 }, { "epoch": 0.9434200189991715, "grad_norm": 3.2825334072113037, "learning_rate": 5.446650839719003e-05, "loss": 2.15, "step": 252500 }, { "epoch": 0.9452881774526352, "grad_norm": 3.196861505508423, "learning_rate": 5.4320643365477844e-05, "loss": 2.1278, "step": 253000 }, { "epoch": 0.9471563359060989, "grad_norm": 2.7488534450531006, "learning_rate": 5.417444885085084e-05, "loss": 2.1859, "step": 253500 }, { "epoch": 0.9490244943595626, "grad_norm": 2.5847301483154297, "learning_rate": 5.4028218388879116e-05, "loss": 2.1445, "step": 254000 }, { "epoch": 0.9508926528130263, "grad_norm": 3.6500895023345947, "learning_rate": 5.388195323879396e-05, "loss": 2.1439, "step": 254500 }, { "epoch": 0.95276081126649, "grad_norm": 2.848147392272949, "learning_rate": 5.373594728980722e-05, "loss": 2.1709, "step": 255000 }, { "epoch": 0.9546289697199537, "grad_norm": 2.592301368713379, "learning_rate": 5.35899092980915e-05, "loss": 2.1306, "step": 255500 }, { "epoch": 0.9564971281734174, "grad_norm": 1.9539679288864136, "learning_rate": 5.344354776311128e-05, "loss": 2.115, "step": 256000 }, { "epoch": 0.9583652866268811, "grad_norm": 3.211258888244629, "learning_rate": 5.329715657477968e-05, "loss": 2.166, "step": 256500 }, { "epoch": 0.9602334450803448, "grad_norm": 2.754812240600586, "learning_rate": 5.31507369937121e-05, "loss": 2.1639, "step": 257000 }, { "epoch": 0.9621016035338086, "grad_norm": 2.349533796310425, "learning_rate": 5.300458320043379e-05, "loss": 2.155, "step": 257500 }, { "epoch": 0.9639697619872722, "grad_norm": 3.3088858127593994, "learning_rate": 5.285811066719044e-05, "loss": 2.1429, "step": 258000 }, { "epoch": 0.9658379204407359, "grad_norm": 3.420562505722046, "learning_rate": 5.2711613521958034e-05, "loss": 2.133, "step": 258500 }, { "epoch": 0.9677060788941997, "grad_norm": 2.4579176902770996, "learning_rate": 5.256509302626437e-05, "loss": 2.1483, "step": 259000 }, { "epoch": 0.9695742373476633, "grad_norm": 3.574404239654541, "learning_rate": 5.241855044183839e-05, "loss": 2.1599, "step": 259500 }, { "epoch": 0.971442395801127, "grad_norm": 2.763312816619873, "learning_rate": 5.227198703059918e-05, "loss": 2.1175, "step": 260000 }, { "epoch": 0.9733105542545908, "grad_norm": 3.4662206172943115, "learning_rate": 5.2125404054645224e-05, "loss": 2.1439, "step": 260500 }, { "epoch": 0.9751787127080545, "grad_norm": 2.4736666679382324, "learning_rate": 5.197880277624344e-05, "loss": 2.166, "step": 261000 }, { "epoch": 0.9770468711615182, "grad_norm": 2.448014974594116, "learning_rate": 5.1832184457818365e-05, "loss": 2.1184, "step": 261500 }, { "epoch": 0.9789150296149819, "grad_norm": 2.605496644973755, "learning_rate": 5.168584364503971e-05, "loss": 2.0694, "step": 262000 }, { "epoch": 0.9807831880684456, "grad_norm": 2.6576755046844482, "learning_rate": 5.153919506218703e-05, "loss": 2.1525, "step": 262500 }, { "epoch": 0.9826513465219093, "grad_norm": 3.0602567195892334, "learning_rate": 5.139253322489586e-05, "loss": 2.12, "step": 263000 }, { "epoch": 0.984519504975373, "grad_norm": 2.233271598815918, "learning_rate": 5.124585939611224e-05, "loss": 2.124, "step": 263500 }, { "epoch": 0.9863876634288367, "grad_norm": 3.0819501876831055, "learning_rate": 5.109946821786733e-05, "loss": 2.1361, "step": 264000 }, { "epoch": 0.9882558218823004, "grad_norm": 2.7308757305145264, "learning_rate": 5.0952774213009e-05, "loss": 2.1196, "step": 264500 }, { "epoch": 0.9901239803357641, "grad_norm": 2.309229612350464, "learning_rate": 5.080607200354588e-05, "loss": 2.071, "step": 265000 }, { "epoch": 0.9919921387892279, "grad_norm": 3.331204652786255, "learning_rate": 5.065965627716091e-05, "loss": 2.0675, "step": 265500 }, { "epoch": 0.9938602972426915, "grad_norm": 3.6821019649505615, "learning_rate": 5.051294145852407e-05, "loss": 2.1329, "step": 266000 }, { "epoch": 0.9957284556961552, "grad_norm": 1.9205609560012817, "learning_rate": 5.036622222280509e-05, "loss": 2.1563, "step": 266500 }, { "epoch": 0.997596614149619, "grad_norm": 3.6985223293304443, "learning_rate": 5.021949983344428e-05, "loss": 2.139, "step": 267000 }, { "epoch": 0.9994647726030826, "grad_norm": 3.8483798503875732, "learning_rate": 5.007277555390912e-05, "loss": 2.1531, "step": 267500 }, { "epoch": 1.0013329310565464, "grad_norm": 2.758868932723999, "learning_rate": 4.992605064768335e-05, "loss": 2.0257, "step": 268000 }, { "epoch": 1.00320108951001, "grad_norm": 2.7047057151794434, "learning_rate": 4.9779619825319616e-05, "loss": 1.9918, "step": 268500 }, { "epoch": 1.0050692479634737, "grad_norm": 3.4775989055633545, "learning_rate": 4.963289745111303e-05, "loss": 1.9841, "step": 269000 }, { "epoch": 1.0069374064169374, "grad_norm": 3.1174392700195312, "learning_rate": 4.9486178238129e-05, "loss": 1.9998, "step": 269500 }, { "epoch": 1.008805564870401, "grad_norm": 3.418029546737671, "learning_rate": 4.933946344980765e-05, "loss": 2.0305, "step": 270000 }, { "epoch": 1.010673723323865, "grad_norm": 4.21517276763916, "learning_rate": 4.919275434955098e-05, "loss": 1.9349, "step": 270500 }, { "epoch": 1.0125418817773286, "grad_norm": 3.2260196208953857, "learning_rate": 4.904605220071203e-05, "loss": 1.9659, "step": 271000 }, { "epoch": 1.0144100402307923, "grad_norm": 2.354206085205078, "learning_rate": 4.889935826658396e-05, "loss": 1.9459, "step": 271500 }, { "epoch": 1.016278198684256, "grad_norm": 2.399245262145996, "learning_rate": 4.8752967169003024e-05, "loss": 1.9669, "step": 272000 }, { "epoch": 1.0181463571377196, "grad_norm": 2.836991786956787, "learning_rate": 4.8606293431139685e-05, "loss": 1.9754, "step": 272500 }, { "epoch": 1.0200145155911835, "grad_norm": 2.369506597518921, "learning_rate": 4.845963169487281e-05, "loss": 1.9748, "step": 273000 }, { "epoch": 1.0218826740446472, "grad_norm": 4.3176140785217285, "learning_rate": 4.831298322314752e-05, "loss": 1.9874, "step": 273500 }, { "epoch": 1.0237508324981108, "grad_norm": 2.473726749420166, "learning_rate": 4.8166349278794803e-05, "loss": 1.9784, "step": 274000 }, { "epoch": 1.0256189909515745, "grad_norm": 3.3185558319091797, "learning_rate": 4.8019731124520506e-05, "loss": 2.0007, "step": 274500 }, { "epoch": 1.0274871494050382, "grad_norm": 3.276498317718506, "learning_rate": 4.787313002289445e-05, "loss": 1.9758, "step": 275000 }, { "epoch": 1.029355307858502, "grad_norm": 3.0989725589752197, "learning_rate": 4.772654723633967e-05, "loss": 2.0042, "step": 275500 }, { "epoch": 1.0312234663119657, "grad_norm": 2.4186153411865234, "learning_rate": 4.7580277133162835e-05, "loss": 2.0053, "step": 276000 }, { "epoch": 1.0330916247654294, "grad_norm": 2.4179837703704834, "learning_rate": 4.74340277836311e-05, "loss": 1.9908, "step": 276500 }, { "epoch": 1.034959783218893, "grad_norm": 3.3896212577819824, "learning_rate": 4.728750742427794e-05, "loss": 1.9604, "step": 277000 }, { "epoch": 1.0368279416723567, "grad_norm": 2.6385319232940674, "learning_rate": 4.714101042295578e-05, "loss": 1.9896, "step": 277500 }, { "epoch": 1.0386961001258204, "grad_norm": 3.6427805423736572, "learning_rate": 4.6994538041191235e-05, "loss": 2.0044, "step": 278000 }, { "epoch": 1.0405642585792843, "grad_norm": 3.0906810760498047, "learning_rate": 4.684809154029888e-05, "loss": 2.0074, "step": 278500 }, { "epoch": 1.042432417032748, "grad_norm": 3.357675313949585, "learning_rate": 4.67019649921625e-05, "loss": 2.0337, "step": 279000 }, { "epoch": 1.0443005754862116, "grad_norm": 3.163966655731201, "learning_rate": 4.655557397799212e-05, "loss": 1.9936, "step": 279500 }, { "epoch": 1.0461687339396752, "grad_norm": 2.073416233062744, "learning_rate": 4.640921262473603e-05, "loss": 1.9917, "step": 280000 }, { "epoch": 1.048036892393139, "grad_norm": 4.012736797332764, "learning_rate": 4.626288219275275e-05, "loss": 1.9811, "step": 280500 }, { "epoch": 1.0499050508466028, "grad_norm": 3.065397262573242, "learning_rate": 4.611658394213446e-05, "loss": 2.0052, "step": 281000 }, { "epoch": 1.0517732093000665, "grad_norm": 3.3266775608062744, "learning_rate": 4.597061162810362e-05, "loss": 1.997, "step": 281500 }, { "epoch": 1.0536413677535301, "grad_norm": 2.940035820007324, "learning_rate": 4.582438144871442e-05, "loss": 1.9267, "step": 282000 }, { "epoch": 1.0555095262069938, "grad_norm": 3.5627119541168213, "learning_rate": 4.567818722674258e-05, "loss": 1.973, "step": 282500 }, { "epoch": 1.0573776846604575, "grad_norm": 2.702580213546753, "learning_rate": 4.553203022110738e-05, "loss": 1.9818, "step": 283000 }, { "epoch": 1.0592458431139213, "grad_norm": 3.027751922607422, "learning_rate": 4.538591169040759e-05, "loss": 2.0195, "step": 283500 }, { "epoch": 1.061114001567385, "grad_norm": 2.598694086074829, "learning_rate": 4.5239832892910685e-05, "loss": 1.9988, "step": 284000 }, { "epoch": 1.0629821600208487, "grad_norm": 2.5287024974823, "learning_rate": 4.5093795086541985e-05, "loss": 1.9794, "step": 284500 }, { "epoch": 1.0648503184743123, "grad_norm": 2.937054395675659, "learning_rate": 4.494779952887383e-05, "loss": 1.9804, "step": 285000 }, { "epoch": 1.066718476927776, "grad_norm": 2.625366687774658, "learning_rate": 4.48021393369639e-05, "loss": 2.002, "step": 285500 }, { "epoch": 1.0685866353812399, "grad_norm": 2.97308349609375, "learning_rate": 4.465623195716817e-05, "loss": 1.974, "step": 286000 }, { "epoch": 1.0704547938347035, "grad_norm": 2.940298080444336, "learning_rate": 4.4510370594051275e-05, "loss": 1.9722, "step": 286500 }, { "epoch": 1.0723229522881672, "grad_norm": 2.5476973056793213, "learning_rate": 4.436455650366615e-05, "loss": 2.0061, "step": 287000 }, { "epoch": 1.0741911107416309, "grad_norm": 3.88171124458313, "learning_rate": 4.4218790941658633e-05, "loss": 1.9859, "step": 287500 }, { "epoch": 1.0760592691950945, "grad_norm": 2.958958864212036, "learning_rate": 4.407307516325668e-05, "loss": 1.9929, "step": 288000 }, { "epoch": 1.0779274276485582, "grad_norm": 3.2626969814300537, "learning_rate": 4.3927410423259555e-05, "loss": 2.0427, "step": 288500 }, { "epoch": 1.079795586102022, "grad_norm": 2.726310968399048, "learning_rate": 4.378208914789977e-05, "loss": 1.9826, "step": 289000 }, { "epoch": 1.0816637445554858, "grad_norm": 3.683236598968506, "learning_rate": 4.36365301389968e-05, "loss": 2.006, "step": 289500 }, { "epoch": 1.0835319030089494, "grad_norm": 3.4819111824035645, "learning_rate": 4.349102592770976e-05, "loss": 1.9865, "step": 290000 }, { "epoch": 1.085400061462413, "grad_norm": 3.417532444000244, "learning_rate": 4.334557776701607e-05, "loss": 1.9988, "step": 290500 }, { "epoch": 1.0872682199158767, "grad_norm": 2.9879865646362305, "learning_rate": 4.3200477633104895e-05, "loss": 1.9888, "step": 291000 }, { "epoch": 1.0891363783693406, "grad_norm": 2.8864903450012207, "learning_rate": 4.305514521222923e-05, "loss": 1.9602, "step": 291500 }, { "epoch": 1.0910045368228043, "grad_norm": 3.8783183097839355, "learning_rate": 4.290987259543744e-05, "loss": 2.0115, "step": 292000 }, { "epoch": 1.092872695276268, "grad_norm": 3.2339043617248535, "learning_rate": 4.2764661033712623e-05, "loss": 2.016, "step": 292500 }, { "epoch": 1.0947408537297316, "grad_norm": 3.942629337310791, "learning_rate": 4.261951177751206e-05, "loss": 1.9975, "step": 293000 }, { "epoch": 1.0966090121831953, "grad_norm": 5.084557056427002, "learning_rate": 4.2474426076756546e-05, "loss": 1.9484, "step": 293500 }, { "epoch": 1.098477170636659, "grad_norm": 3.621943473815918, "learning_rate": 4.2329405180819554e-05, "loss": 1.9364, "step": 294000 }, { "epoch": 1.1003453290901228, "grad_norm": 3.5090487003326416, "learning_rate": 4.2184450338516527e-05, "loss": 2.0112, "step": 294500 }, { "epoch": 1.1022134875435865, "grad_norm": 4.1997246742248535, "learning_rate": 4.204014221253661e-05, "loss": 1.9631, "step": 295000 }, { "epoch": 1.1040816459970502, "grad_norm": 3.7712690830230713, "learning_rate": 4.189532294497906e-05, "loss": 1.9428, "step": 295500 }, { "epoch": 1.1059498044505138, "grad_norm": 4.392169952392578, "learning_rate": 4.175057346905878e-05, "loss": 2.0024, "step": 296000 }, { "epoch": 1.1078179629039775, "grad_norm": 3.103431463241577, "learning_rate": 4.160589503125397e-05, "loss": 1.9671, "step": 296500 }, { "epoch": 1.1096861213574414, "grad_norm": 2.2490739822387695, "learning_rate": 4.1461288877431045e-05, "loss": 1.9978, "step": 297000 }, { "epoch": 1.111554279810905, "grad_norm": 3.9997470378875732, "learning_rate": 4.1317045243873654e-05, "loss": 1.9756, "step": 297500 }, { "epoch": 1.1134224382643687, "grad_norm": 3.8243391513824463, "learning_rate": 4.117258724232387e-05, "loss": 1.9927, "step": 298000 }, { "epoch": 1.1152905967178324, "grad_norm": 3.207801342010498, "learning_rate": 4.102820525609035e-05, "loss": 1.9807, "step": 298500 }, { "epoch": 1.117158755171296, "grad_norm": 2.981112480163574, "learning_rate": 4.08839005284867e-05, "loss": 1.9757, "step": 299000 }, { "epoch": 1.11902691362476, "grad_norm": 2.8603618144989014, "learning_rate": 4.0739674302161204e-05, "loss": 1.9882, "step": 299500 }, { "epoch": 1.1208950720782236, "grad_norm": 3.422062635421753, "learning_rate": 4.059552781908619e-05, "loss": 1.9883, "step": 300000 }, { "epoch": 1.1227632305316873, "grad_norm": 3.2499775886535645, "learning_rate": 4.045146232054726e-05, "loss": 1.9715, "step": 300500 }, { "epoch": 1.124631388985151, "grad_norm": 3.5448482036590576, "learning_rate": 4.030776693079458e-05, "loss": 1.9895, "step": 301000 }, { "epoch": 1.1264995474386146, "grad_norm": 3.52693510055542, "learning_rate": 4.016386695421753e-05, "loss": 1.9936, "step": 301500 }, { "epoch": 1.1283677058920785, "grad_norm": 3.247986078262329, "learning_rate": 4.002005167932884e-05, "loss": 1.9916, "step": 302000 }, { "epoch": 1.1302358643455421, "grad_norm": 3.287041425704956, "learning_rate": 3.987632234456198e-05, "loss": 1.971, "step": 302500 }, { "epoch": 1.1321040227990058, "grad_norm": 2.758507251739502, "learning_rate": 3.9732680187610403e-05, "loss": 2.0091, "step": 303000 }, { "epoch": 1.1339721812524695, "grad_norm": 2.9558610916137695, "learning_rate": 3.958912644541679e-05, "loss": 2.0046, "step": 303500 }, { "epoch": 1.1358403397059331, "grad_norm": 3.0163705348968506, "learning_rate": 3.944566235416254e-05, "loss": 1.9902, "step": 304000 }, { "epoch": 1.1377084981593968, "grad_norm": 2.4738314151763916, "learning_rate": 3.9302289149256985e-05, "loss": 1.969, "step": 304500 }, { "epoch": 1.1395766566128607, "grad_norm": 3.352306604385376, "learning_rate": 3.915929453473775e-05, "loss": 1.9639, "step": 305000 }, { "epoch": 1.1414448150663243, "grad_norm": 3.9805781841278076, "learning_rate": 3.9016106617675985e-05, "loss": 1.9703, "step": 305500 }, { "epoch": 1.143312973519788, "grad_norm": 2.410222291946411, "learning_rate": 3.8873013285987326e-05, "loss": 1.9836, "step": 306000 }, { "epoch": 1.1451811319732517, "grad_norm": 3.830815076828003, "learning_rate": 3.873030167047204e-05, "loss": 1.9474, "step": 306500 }, { "epoch": 1.1470492904267153, "grad_norm": 3.884229898452759, "learning_rate": 3.858740101002805e-05, "loss": 1.9912, "step": 307000 }, { "epoch": 1.1489174488801792, "grad_norm": 3.097529172897339, "learning_rate": 3.8444598626660855e-05, "loss": 1.9851, "step": 307500 }, { "epoch": 1.1507856073336429, "grad_norm": 3.3618969917297363, "learning_rate": 3.8301895750081664e-05, "loss": 1.9897, "step": 308000 }, { "epoch": 1.1526537657871065, "grad_norm": 2.846202850341797, "learning_rate": 3.8159293609144794e-05, "loss": 1.9649, "step": 308500 }, { "epoch": 1.1545219242405702, "grad_norm": 3.3975071907043457, "learning_rate": 3.801679343183709e-05, "loss": 1.9611, "step": 309000 }, { "epoch": 1.1563900826940339, "grad_norm": 3.390746831893921, "learning_rate": 3.787468113544101e-05, "loss": 1.9809, "step": 309500 }, { "epoch": 1.1582582411474975, "grad_norm": 3.883208990097046, "learning_rate": 3.773238835577244e-05, "loss": 1.9741, "step": 310000 }, { "epoch": 1.1601263996009614, "grad_norm": 2.655240535736084, "learning_rate": 3.7590201215933385e-05, "loss": 1.9929, "step": 310500 }, { "epoch": 1.161994558054425, "grad_norm": 3.561328649520874, "learning_rate": 3.7448120940337014e-05, "loss": 1.9941, "step": 311000 }, { "epoch": 1.1638627165078888, "grad_norm": 4.378994464874268, "learning_rate": 3.7306148752476284e-05, "loss": 1.9692, "step": 311500 }, { "epoch": 1.1657308749613524, "grad_norm": 2.515988826751709, "learning_rate": 3.716428587491332e-05, "loss": 1.9721, "step": 312000 }, { "epoch": 1.1675990334148163, "grad_norm": 2.2535147666931152, "learning_rate": 3.702253352926898e-05, "loss": 1.9904, "step": 312500 }, { "epoch": 1.16946719186828, "grad_norm": 3.65279483795166, "learning_rate": 3.688117610505848e-05, "loss": 1.8969, "step": 313000 }, { "epoch": 1.1713353503217436, "grad_norm": 3.5840914249420166, "learning_rate": 3.6739648257134945e-05, "loss": 1.9981, "step": 313500 }, { "epoch": 1.1732035087752073, "grad_norm": 4.6728973388671875, "learning_rate": 3.659823459780314e-05, "loss": 2.0034, "step": 314000 }, { "epoch": 1.175071667228671, "grad_norm": 3.8465287685394287, "learning_rate": 3.6456936344815585e-05, "loss": 1.9575, "step": 314500 }, { "epoch": 1.1769398256821346, "grad_norm": 3.005547046661377, "learning_rate": 3.631603696099265e-05, "loss": 1.9799, "step": 315000 }, { "epoch": 1.1788079841355985, "grad_norm": 3.0555107593536377, "learning_rate": 3.617497293307507e-05, "loss": 1.9681, "step": 315500 }, { "epoch": 1.1806761425890622, "grad_norm": 3.1861069202423096, "learning_rate": 3.6034027956326125e-05, "loss": 2.0004, "step": 316000 }, { "epoch": 1.1825443010425258, "grad_norm": 3.5906646251678467, "learning_rate": 3.589320324446236e-05, "loss": 1.984, "step": 316500 }, { "epoch": 1.1844124594959895, "grad_norm": 3.118577480316162, "learning_rate": 3.5752500010164694e-05, "loss": 2.0166, "step": 317000 }, { "epoch": 1.1862806179494532, "grad_norm": 3.639019727706909, "learning_rate": 3.561220050290951e-05, "loss": 1.9152, "step": 317500 }, { "epoch": 1.188148776402917, "grad_norm": 2.516979455947876, "learning_rate": 3.547174360858504e-05, "loss": 1.9838, "step": 318000 }, { "epoch": 1.1900169348563807, "grad_norm": 4.030247688293457, "learning_rate": 3.5331411821133284e-05, "loss": 1.9957, "step": 318500 }, { "epoch": 1.1918850933098444, "grad_norm": 2.944655656814575, "learning_rate": 3.519120634899048e-05, "loss": 1.9557, "step": 319000 }, { "epoch": 1.193753251763308, "grad_norm": 2.9035158157348633, "learning_rate": 3.505112839950505e-05, "loss": 1.9852, "step": 319500 }, { "epoch": 1.1956214102167717, "grad_norm": 4.2154364585876465, "learning_rate": 3.491117917892734e-05, "loss": 1.9863, "step": 320000 }, { "epoch": 1.1974895686702354, "grad_norm": 3.7261621952056885, "learning_rate": 3.4771359892399204e-05, "loss": 1.9478, "step": 320500 }, { "epoch": 1.1993577271236993, "grad_norm": 4.7101240158081055, "learning_rate": 3.463195098856492e-05, "loss": 1.9688, "step": 321000 }, { "epoch": 1.201225885577163, "grad_norm": 3.4447665214538574, "learning_rate": 3.44923949151937e-05, "loss": 1.9768, "step": 321500 }, { "epoch": 1.2030940440306266, "grad_norm": 2.6960058212280273, "learning_rate": 3.4352972382140294e-05, "loss": 1.9639, "step": 322000 }, { "epoch": 1.2049622024840903, "grad_norm": 3.2135891914367676, "learning_rate": 3.421368459001103e-05, "loss": 2.0298, "step": 322500 }, { "epoch": 1.206830360937554, "grad_norm": 3.953632116317749, "learning_rate": 3.4074532738252e-05, "loss": 2.0028, "step": 323000 }, { "epoch": 1.2086985193910178, "grad_norm": 3.091557025909424, "learning_rate": 3.393551802513865e-05, "loss": 1.9353, "step": 323500 }, { "epoch": 1.2105666778444815, "grad_norm": 3.2774996757507324, "learning_rate": 3.379664164776548e-05, "loss": 1.9976, "step": 324000 }, { "epoch": 1.2124348362979451, "grad_norm": 4.057534694671631, "learning_rate": 3.365790480203579e-05, "loss": 1.9577, "step": 324500 }, { "epoch": 1.2143029947514088, "grad_norm": 3.725080728530884, "learning_rate": 3.351958573365166e-05, "loss": 1.9619, "step": 325000 }, { "epoch": 1.2161711532048725, "grad_norm": 2.542310953140259, "learning_rate": 3.338140801561512e-05, "loss": 1.9413, "step": 325500 }, { "epoch": 1.2180393116583361, "grad_norm": 3.8798625469207764, "learning_rate": 3.324309635334674e-05, "loss": 1.9272, "step": 326000 }, { "epoch": 1.2199074701118, "grad_norm": 2.8388006687164307, "learning_rate": 3.310492898945492e-05, "loss": 1.9717, "step": 326500 }, { "epoch": 1.2217756285652637, "grad_norm": 3.845374822616577, "learning_rate": 3.296690711373742e-05, "loss": 1.9995, "step": 327000 }, { "epoch": 1.2236437870187273, "grad_norm": 3.3350958824157715, "learning_rate": 3.282903191473914e-05, "loss": 1.9505, "step": 327500 }, { "epoch": 1.225511945472191, "grad_norm": 3.514188289642334, "learning_rate": 3.2691304579741944e-05, "loss": 1.9493, "step": 328000 }, { "epoch": 1.2273801039256549, "grad_norm": 4.140675067901611, "learning_rate": 3.255372629475436e-05, "loss": 1.9381, "step": 328500 }, { "epoch": 1.2292482623791186, "grad_norm": 3.2821719646453857, "learning_rate": 3.241629824450141e-05, "loss": 1.9647, "step": 329000 }, { "epoch": 1.2311164208325822, "grad_norm": 3.671809434890747, "learning_rate": 3.227929601377734e-05, "loss": 1.948, "step": 329500 }, { "epoch": 1.2329845792860459, "grad_norm": 4.461349010467529, "learning_rate": 3.214244577120278e-05, "loss": 1.9533, "step": 330000 }, { "epoch": 1.2348527377395095, "grad_norm": 4.116054058074951, "learning_rate": 3.200547490304101e-05, "loss": 1.9278, "step": 330500 }, { "epoch": 1.2367208961929732, "grad_norm": 3.0734941959381104, "learning_rate": 3.1868658990759734e-05, "loss": 1.9038, "step": 331000 }, { "epoch": 1.238589054646437, "grad_norm": 4.233485698699951, "learning_rate": 3.173199921251894e-05, "loss": 1.9466, "step": 331500 }, { "epoch": 1.2404572130999008, "grad_norm": 3.6610071659088135, "learning_rate": 3.159549674513415e-05, "loss": 1.9437, "step": 332000 }, { "epoch": 1.2423253715533644, "grad_norm": 3.757662773132324, "learning_rate": 3.145915276406623e-05, "loss": 1.9695, "step": 332500 }, { "epoch": 1.244193530006828, "grad_norm": 4.0608062744140625, "learning_rate": 3.1322968443411296e-05, "loss": 1.9398, "step": 333000 }, { "epoch": 1.2460616884602917, "grad_norm": 3.5959203243255615, "learning_rate": 3.118694495589054e-05, "loss": 1.9154, "step": 333500 }, { "epoch": 1.2479298469137556, "grad_norm": 4.01427698135376, "learning_rate": 3.105135503334797e-05, "loss": 1.9268, "step": 334000 }, { "epoch": 1.2497980053672193, "grad_norm": 4.18043851852417, "learning_rate": 3.091565639719372e-05, "loss": 1.9349, "step": 334500 }, { "epoch": 1.251666163820683, "grad_norm": 3.132768154144287, "learning_rate": 3.0780122101651435e-05, "loss": 1.9476, "step": 335000 }, { "epoch": 1.2535343222741466, "grad_norm": 2.99275803565979, "learning_rate": 3.0644753313844755e-05, "loss": 1.9625, "step": 335500 }, { "epoch": 1.2554024807276103, "grad_norm": 3.58479380607605, "learning_rate": 3.0509551199472118e-05, "loss": 1.9545, "step": 336000 }, { "epoch": 1.257270639181074, "grad_norm": 3.13480544090271, "learning_rate": 3.0374786823074896e-05, "loss": 1.9398, "step": 336500 }, { "epoch": 1.2591387976345378, "grad_norm": 3.130760431289673, "learning_rate": 3.0239921207753986e-05, "loss": 1.9582, "step": 337000 }, { "epoch": 1.2610069560880015, "grad_norm": 3.4282748699188232, "learning_rate": 3.0105225751989453e-05, "loss": 1.9285, "step": 337500 }, { "epoch": 1.2628751145414652, "grad_norm": 3.996558666229248, "learning_rate": 2.9970701615681463e-05, "loss": 1.9397, "step": 338000 }, { "epoch": 1.2647432729949288, "grad_norm": 3.9144933223724365, "learning_rate": 2.9836349957254927e-05, "loss": 1.9361, "step": 338500 }, { "epoch": 1.2666114314483927, "grad_norm": 2.7201411724090576, "learning_rate": 2.9702171933649482e-05, "loss": 1.9221, "step": 339000 }, { "epoch": 1.2684795899018564, "grad_norm": 3.485480785369873, "learning_rate": 2.956843653156831e-05, "loss": 1.951, "step": 339500 }, { "epoch": 1.27034774835532, "grad_norm": 4.514249324798584, "learning_rate": 2.943460888939414e-05, "loss": 1.9556, "step": 340000 }, { "epoch": 1.2722159068087837, "grad_norm": 3.043680429458618, "learning_rate": 2.930095834154558e-05, "loss": 1.9673, "step": 340500 }, { "epoch": 1.2740840652622474, "grad_norm": 2.636143207550049, "learning_rate": 2.9167486038924823e-05, "loss": 1.9492, "step": 341000 }, { "epoch": 1.275952223715711, "grad_norm": 3.6190054416656494, "learning_rate": 2.9034193130899155e-05, "loss": 1.9648, "step": 341500 }, { "epoch": 1.2778203821691747, "grad_norm": 4.245516777038574, "learning_rate": 2.890108076529099e-05, "loss": 1.9589, "step": 342000 }, { "epoch": 1.2796885406226386, "grad_norm": 3.619927406311035, "learning_rate": 2.876841576763556e-05, "loss": 1.9439, "step": 342500 }, { "epoch": 1.2815566990761023, "grad_norm": 3.657912015914917, "learning_rate": 2.863566755729298e-05, "loss": 1.9564, "step": 343000 }, { "epoch": 1.283424857529566, "grad_norm": 3.4643499851226807, "learning_rate": 2.8503103321182943e-05, "loss": 1.9754, "step": 343500 }, { "epoch": 1.2852930159830296, "grad_norm": 4.774941444396973, "learning_rate": 2.8370724200853072e-05, "loss": 1.9406, "step": 344000 }, { "epoch": 1.2871611744364935, "grad_norm": 3.5722765922546387, "learning_rate": 2.8238531336256975e-05, "loss": 1.9708, "step": 344500 }, { "epoch": 1.2890293328899571, "grad_norm": 3.9576704502105713, "learning_rate": 2.8106525865744272e-05, "loss": 1.9503, "step": 345000 }, { "epoch": 1.2908974913434208, "grad_norm": 4.773796558380127, "learning_rate": 2.7974972371021873e-05, "loss": 1.967, "step": 345500 }, { "epoch": 1.2927656497968845, "grad_norm": 3.749734401702881, "learning_rate": 2.784334471679681e-05, "loss": 1.9484, "step": 346000 }, { "epoch": 1.2946338082503481, "grad_norm": 4.330195903778076, "learning_rate": 2.7711907859717524e-05, "loss": 1.9094, "step": 346500 }, { "epoch": 1.2965019667038118, "grad_norm": 3.0685718059539795, "learning_rate": 2.758066293162346e-05, "loss": 1.9195, "step": 347000 }, { "epoch": 1.2983701251572755, "grad_norm": 3.8571877479553223, "learning_rate": 2.7449611062701342e-05, "loss": 1.9457, "step": 347500 }, { "epoch": 1.3002382836107393, "grad_norm": 3.673949718475342, "learning_rate": 2.731875338147545e-05, "loss": 1.9046, "step": 348000 }, { "epoch": 1.302106442064203, "grad_norm": 3.5845327377319336, "learning_rate": 2.7188091014797774e-05, "loss": 1.9871, "step": 348500 }, { "epoch": 1.3039746005176667, "grad_norm": 5.045246124267578, "learning_rate": 2.7057885822898532e-05, "loss": 1.9445, "step": 349000 }, { "epoch": 1.3058427589711303, "grad_norm": 4.416993141174316, "learning_rate": 2.692761706288961e-05, "loss": 1.9242, "step": 349500 }, { "epoch": 1.3077109174245942, "grad_norm": 5.05975341796875, "learning_rate": 2.6797546985612997e-05, "loss": 1.9729, "step": 350000 }, { "epoch": 1.3095790758780579, "grad_norm": 3.4689128398895264, "learning_rate": 2.6667676711138423e-05, "loss": 1.9479, "step": 350500 }, { "epoch": 1.3114472343315215, "grad_norm": 3.177008628845215, "learning_rate": 2.6538266495259985e-05, "loss": 1.9456, "step": 351000 }, { "epoch": 1.3133153927849852, "grad_norm": 3.6939172744750977, "learning_rate": 2.6408798774518146e-05, "loss": 1.934, "step": 351500 }, { "epoch": 1.3151835512384489, "grad_norm": 4.592978477478027, "learning_rate": 2.6279534204197788e-05, "loss": 1.8931, "step": 352000 }, { "epoch": 1.3170517096919125, "grad_norm": 4.249555587768555, "learning_rate": 2.6150473897432166e-05, "loss": 1.9352, "step": 352500 }, { "epoch": 1.3189198681453764, "grad_norm": 3.4636592864990234, "learning_rate": 2.6021876469757334e-05, "loss": 1.9227, "step": 353000 }, { "epoch": 1.32078802659884, "grad_norm": 3.9055769443511963, "learning_rate": 2.5893227608380464e-05, "loss": 2.0114, "step": 353500 }, { "epoch": 1.3226561850523038, "grad_norm": 3.659078359603882, "learning_rate": 2.576478633715232e-05, "loss": 1.9675, "step": 354000 }, { "epoch": 1.3245243435057674, "grad_norm": 4.109720230102539, "learning_rate": 2.563655376211658e-05, "loss": 1.9515, "step": 354500 }, { "epoch": 1.3263925019592313, "grad_norm": 3.4679160118103027, "learning_rate": 2.550853098751974e-05, "loss": 1.965, "step": 355000 }, { "epoch": 1.328260660412695, "grad_norm": 3.3445444107055664, "learning_rate": 2.538097452833215e-05, "loss": 1.9422, "step": 355500 }, { "epoch": 1.3301288188661586, "grad_norm": 4.475471496582031, "learning_rate": 2.5253374235012317e-05, "loss": 1.9533, "step": 356000 }, { "epoch": 1.3319969773196223, "grad_norm": 3.064134359359741, "learning_rate": 2.5125987041797306e-05, "loss": 1.9263, "step": 356500 }, { "epoch": 1.333865135773086, "grad_norm": 3.313082218170166, "learning_rate": 2.4998814045653785e-05, "loss": 1.8802, "step": 357000 }, { "epoch": 1.3357332942265496, "grad_norm": 5.206328392028809, "learning_rate": 2.4872110041523282e-05, "loss": 1.8967, "step": 357500 }, { "epoch": 1.3376014526800133, "grad_norm": 4.334334373474121, "learning_rate": 2.4745368289174596e-05, "loss": 1.9429, "step": 358000 }, { "epoch": 1.3394696111334772, "grad_norm": 5.680240154266357, "learning_rate": 2.4618844011511794e-05, "loss": 1.9209, "step": 358500 }, { "epoch": 1.3413377695869408, "grad_norm": 3.261059284210205, "learning_rate": 2.449253829807073e-05, "loss": 1.9251, "step": 359000 }, { "epoch": 1.3432059280404045, "grad_norm": 3.2310187816619873, "learning_rate": 2.4366704188693773e-05, "loss": 1.9056, "step": 359500 }, { "epoch": 1.3450740864938682, "grad_norm": 4.145471096038818, "learning_rate": 2.424083842220842e-05, "loss": 1.926, "step": 360000 }, { "epoch": 1.346942244947332, "grad_norm": 4.704455852508545, "learning_rate": 2.411519447505653e-05, "loss": 1.9485, "step": 360500 }, { "epoch": 1.3488104034007957, "grad_norm": 3.9618282318115234, "learning_rate": 2.3989773429193175e-05, "loss": 1.9304, "step": 361000 }, { "epoch": 1.3506785618542594, "grad_norm": 3.921598434448242, "learning_rate": 2.3864576364654012e-05, "loss": 1.91, "step": 361500 }, { "epoch": 1.352546720307723, "grad_norm": 4.026153087615967, "learning_rate": 2.3739604359545953e-05, "loss": 1.9588, "step": 362000 }, { "epoch": 1.3544148787611867, "grad_norm": 3.6452534198760986, "learning_rate": 2.3615107755379164e-05, "loss": 1.9613, "step": 362500 }, { "epoch": 1.3562830372146504, "grad_norm": 3.757392406463623, "learning_rate": 2.349058864020204e-05, "loss": 1.9386, "step": 363000 }, { "epoch": 1.358151195668114, "grad_norm": 4.3105902671813965, "learning_rate": 2.3366297804968707e-05, "loss": 1.9171, "step": 363500 }, { "epoch": 1.360019354121578, "grad_norm": 4.3953938484191895, "learning_rate": 2.3242236319982296e-05, "loss": 1.9274, "step": 364000 }, { "epoch": 1.3618875125750416, "grad_norm": 3.9918718338012695, "learning_rate": 2.3118652685036857e-05, "loss": 1.9505, "step": 364500 }, { "epoch": 1.3637556710285053, "grad_norm": 4.170524597167969, "learning_rate": 2.2995052639511584e-05, "loss": 1.9666, "step": 365000 }, { "epoch": 1.365623829481969, "grad_norm": 2.33520245552063, "learning_rate": 2.2871685141129013e-05, "loss": 1.8909, "step": 365500 }, { "epoch": 1.3674919879354328, "grad_norm": 3.8575286865234375, "learning_rate": 2.2748551252241096e-05, "loss": 1.9036, "step": 366000 }, { "epoch": 1.3693601463888965, "grad_norm": 3.738067150115967, "learning_rate": 2.262589759672201e-05, "loss": 1.9242, "step": 366500 }, { "epoch": 1.3712283048423601, "grad_norm": 3.2097079753875732, "learning_rate": 2.2503233633312364e-05, "loss": 1.9669, "step": 367000 }, { "epoch": 1.3730964632958238, "grad_norm": 4.111919403076172, "learning_rate": 2.2380806452236224e-05, "loss": 1.9115, "step": 367500 }, { "epoch": 1.3749646217492875, "grad_norm": 3.6487059593200684, "learning_rate": 2.2258617107748202e-05, "loss": 1.9221, "step": 368000 }, { "epoch": 1.3768327802027511, "grad_norm": 3.9140658378601074, "learning_rate": 2.213666665205488e-05, "loss": 1.9077, "step": 368500 }, { "epoch": 1.378700938656215, "grad_norm": 4.236271858215332, "learning_rate": 2.2015199316183162e-05, "loss": 1.9248, "step": 369000 }, { "epoch": 1.3805690971096787, "grad_norm": 3.9722940921783447, "learning_rate": 2.189372930344269e-05, "loss": 1.9075, "step": 369500 }, { "epoch": 1.3824372555631423, "grad_norm": 3.9439289569854736, "learning_rate": 2.1772501321647675e-05, "loss": 1.9325, "step": 370000 }, { "epoch": 1.384305414016606, "grad_norm": 3.183210611343384, "learning_rate": 2.1651516414726137e-05, "loss": 1.9372, "step": 370500 }, { "epoch": 1.38617357247007, "grad_norm": 4.380889892578125, "learning_rate": 2.1530775624512915e-05, "loss": 1.9119, "step": 371000 }, { "epoch": 1.3880417309235336, "grad_norm": 3.137747049331665, "learning_rate": 2.1410520736652044e-05, "loss": 1.8852, "step": 371500 }, { "epoch": 1.3899098893769972, "grad_norm": 4.502001762390137, "learning_rate": 2.129027080352e-05, "loss": 1.9157, "step": 372000 }, { "epoch": 1.3917780478304609, "grad_norm": 3.3394224643707275, "learning_rate": 2.1170268097883096e-05, "loss": 1.9329, "step": 372500 }, { "epoch": 1.3936462062839245, "grad_norm": 3.0865299701690674, "learning_rate": 2.1050513653118137e-05, "loss": 1.9178, "step": 373000 }, { "epoch": 1.3955143647373882, "grad_norm": 4.535000324249268, "learning_rate": 2.0931247261291493e-05, "loss": 1.9163, "step": 373500 }, { "epoch": 1.3973825231908519, "grad_norm": 3.5877630710601807, "learning_rate": 2.0811991928172553e-05, "loss": 1.9437, "step": 374000 }, { "epoch": 1.3992506816443158, "grad_norm": 4.446563243865967, "learning_rate": 2.0692987941141717e-05, "loss": 1.9458, "step": 374500 }, { "epoch": 1.4011188400977794, "grad_norm": 3.427525758743286, "learning_rate": 2.0574236324975526e-05, "loss": 1.9163, "step": 375000 }, { "epoch": 1.402986998551243, "grad_norm": 4.324997901916504, "learning_rate": 2.0455974845157404e-05, "loss": 1.9447, "step": 375500 }, { "epoch": 1.4048551570047068, "grad_norm": 4.460984706878662, "learning_rate": 2.0337730526503722e-05, "loss": 1.8936, "step": 376000 }, { "epoch": 1.4067233154581706, "grad_norm": 3.0335512161254883, "learning_rate": 2.0219741637935503e-05, "loss": 1.9274, "step": 376500 }, { "epoch": 1.4085914739116343, "grad_norm": 3.983215808868408, "learning_rate": 2.010200919548798e-05, "loss": 1.9456, "step": 377000 }, { "epoch": 1.410459632365098, "grad_norm": 4.645228385925293, "learning_rate": 1.9984534212988126e-05, "loss": 1.8914, "step": 377500 }, { "epoch": 1.4123277908185616, "grad_norm": 4.4612250328063965, "learning_rate": 1.986755187644178e-05, "loss": 1.9379, "step": 378000 }, { "epoch": 1.4141959492720253, "grad_norm": 3.9466419219970703, "learning_rate": 1.9750594326473332e-05, "loss": 1.9053, "step": 378500 }, { "epoch": 1.416064107725489, "grad_norm": 3.384223461151123, "learning_rate": 1.9633897262584083e-05, "loss": 1.9777, "step": 379000 }, { "epoch": 1.4179322661789528, "grad_norm": 3.591265916824341, "learning_rate": 1.9517461689685075e-05, "loss": 1.9357, "step": 379500 }, { "epoch": 1.4198004246324165, "grad_norm": 4.8993730545043945, "learning_rate": 1.9401520693960035e-05, "loss": 1.9063, "step": 380000 }, { "epoch": 1.4216685830858802, "grad_norm": 4.398604869842529, "learning_rate": 1.9285610580773773e-05, "loss": 1.8615, "step": 380500 }, { "epoch": 1.4235367415393438, "grad_norm": 3.6538774967193604, "learning_rate": 1.916996495777159e-05, "loss": 1.9166, "step": 381000 }, { "epoch": 1.4254048999928077, "grad_norm": 3.730799436569214, "learning_rate": 1.905458482081028e-05, "loss": 1.8853, "step": 381500 }, { "epoch": 1.4272730584462714, "grad_norm": 5.199082851409912, "learning_rate": 1.8939701124169172e-05, "loss": 1.8736, "step": 382000 }, { "epoch": 1.429141216899735, "grad_norm": 4.507551670074463, "learning_rate": 1.8824854401777008e-05, "loss": 1.9045, "step": 382500 }, { "epoch": 1.4310093753531987, "grad_norm": 2.917692184448242, "learning_rate": 1.8710276137269065e-05, "loss": 1.8737, "step": 383000 }, { "epoch": 1.4328775338066624, "grad_norm": 4.9208221435546875, "learning_rate": 1.8595967317310803e-05, "loss": 1.8852, "step": 383500 }, { "epoch": 1.434745692260126, "grad_norm": 4.914313793182373, "learning_rate": 1.8481928926247323e-05, "loss": 1.9188, "step": 384000 }, { "epoch": 1.4366138507135897, "grad_norm": 4.2889556884765625, "learning_rate": 1.836838920853576e-05, "loss": 1.9626, "step": 384500 }, { "epoch": 1.4384820091670536, "grad_norm": 4.040252208709717, "learning_rate": 1.8254894073216665e-05, "loss": 1.9157, "step": 385000 }, { "epoch": 1.4403501676205173, "grad_norm": 4.800929546356201, "learning_rate": 1.8141672303869356e-05, "loss": 1.8893, "step": 385500 }, { "epoch": 1.442218326073981, "grad_norm": 3.5540807247161865, "learning_rate": 1.8028724875478063e-05, "loss": 1.9504, "step": 386000 }, { "epoch": 1.4440864845274446, "grad_norm": 3.3006908893585205, "learning_rate": 1.791627782948606e-05, "loss": 1.9409, "step": 386500 }, { "epoch": 1.4459546429809085, "grad_norm": 2.976499080657959, "learning_rate": 1.7803881444967192e-05, "loss": 1.9083, "step": 387000 }, { "epoch": 1.4478228014343721, "grad_norm": 4.687767505645752, "learning_rate": 1.7691762310215786e-05, "loss": 1.9419, "step": 387500 }, { "epoch": 1.4496909598878358, "grad_norm": 4.436933517456055, "learning_rate": 1.7579921390721e-05, "loss": 1.9205, "step": 388000 }, { "epoch": 1.4515591183412995, "grad_norm": 4.451811790466309, "learning_rate": 1.7468582493799596e-05, "loss": 1.9, "step": 388500 }, { "epoch": 1.4534272767947631, "grad_norm": 4.564020156860352, "learning_rate": 1.7357300330458897e-05, "loss": 1.8913, "step": 389000 }, { "epoch": 1.4552954352482268, "grad_norm": 3.211652994155884, "learning_rate": 1.724629926252035e-05, "loss": 1.8884, "step": 389500 }, { "epoch": 1.4571635937016905, "grad_norm": 4.224535942077637, "learning_rate": 1.7135580245845107e-05, "loss": 1.9185, "step": 390000 }, { "epoch": 1.4590317521551543, "grad_norm": 3.9640257358551025, "learning_rate": 1.7025364822818328e-05, "loss": 1.9193, "step": 390500 }, { "epoch": 1.460899910608618, "grad_norm": 3.1013686656951904, "learning_rate": 1.6915212197670978e-05, "loss": 1.9274, "step": 391000 }, { "epoch": 1.4627680690620817, "grad_norm": 5.020761966705322, "learning_rate": 1.68053444748701e-05, "loss": 1.8856, "step": 391500 }, { "epoch": 1.4646362275155453, "grad_norm": 3.306040048599243, "learning_rate": 1.6695762600517374e-05, "loss": 1.9403, "step": 392000 }, { "epoch": 1.4665043859690092, "grad_norm": 4.234299182891846, "learning_rate": 1.658668582157294e-05, "loss": 1.8777, "step": 392500 }, { "epoch": 1.468372544422473, "grad_norm": 6.068370342254639, "learning_rate": 1.6477677896163034e-05, "loss": 1.8937, "step": 393000 }, { "epoch": 1.4702407028759366, "grad_norm": 4.372175216674805, "learning_rate": 1.636895864082966e-05, "loss": 1.9034, "step": 393500 }, { "epoch": 1.4721088613294002, "grad_norm": 4.099493980407715, "learning_rate": 1.6260528991784696e-05, "loss": 1.9204, "step": 394000 }, { "epoch": 1.4739770197828639, "grad_norm": 3.7667877674102783, "learning_rate": 1.6152389882746138e-05, "loss": 1.9014, "step": 394500 }, { "epoch": 1.4758451782363275, "grad_norm": 2.797348976135254, "learning_rate": 1.60447576486997e-05, "loss": 1.9077, "step": 395000 }, { "epoch": 1.4777133366897914, "grad_norm": 4.806083679199219, "learning_rate": 1.593720182508714e-05, "loss": 1.9239, "step": 395500 }, { "epoch": 1.479581495143255, "grad_norm": 4.35167121887207, "learning_rate": 1.58299393257415e-05, "loss": 1.9147, "step": 396000 }, { "epoch": 1.4814496535967188, "grad_norm": 7.256587982177734, "learning_rate": 1.5722971074330122e-05, "loss": 1.9101, "step": 396500 }, { "epoch": 1.4833178120501824, "grad_norm": 4.269795894622803, "learning_rate": 1.5616511042961456e-05, "loss": 1.9253, "step": 397000 }, { "epoch": 1.4851859705036463, "grad_norm": 3.5930633544921875, "learning_rate": 1.551013345518685e-05, "loss": 1.9399, "step": 397500 }, { "epoch": 1.48705412895711, "grad_norm": 4.802802085876465, "learning_rate": 1.5404052869284143e-05, "loss": 1.924, "step": 398000 }, { "epoch": 1.4889222874105736, "grad_norm": 5.457955360412598, "learning_rate": 1.5298270198742908e-05, "loss": 1.925, "step": 398500 }, { "epoch": 1.4907904458640373, "grad_norm": 4.350592613220215, "learning_rate": 1.5192997023342925e-05, "loss": 1.9841, "step": 399000 }, { "epoch": 1.492658604317501, "grad_norm": 3.5578579902648926, "learning_rate": 1.5087812313349553e-05, "loss": 1.8914, "step": 399500 }, { "epoch": 1.4945267627709646, "grad_norm": 4.802867412567139, "learning_rate": 1.4982928241953386e-05, "loss": 1.8969, "step": 400000 }, { "epoch": 1.4963949212244283, "grad_norm": 4.002582550048828, "learning_rate": 1.4878345712340435e-05, "loss": 1.904, "step": 400500 }, { "epoch": 1.4982630796778922, "grad_norm": 4.3025665283203125, "learning_rate": 1.4774273882839745e-05, "loss": 1.916, "step": 401000 }, { "epoch": 1.5001312381313558, "grad_norm": 4.821669101715088, "learning_rate": 1.4670296528381727e-05, "loss": 1.8837, "step": 401500 }, { "epoch": 1.5019993965848195, "grad_norm": 3.655703067779541, "learning_rate": 1.456662340786592e-05, "loss": 1.95, "step": 402000 }, { "epoch": 1.5038675550382834, "grad_norm": 3.852405548095703, "learning_rate": 1.4463255414050487e-05, "loss": 1.8723, "step": 402500 }, { "epoch": 1.505735713491747, "grad_norm": 4.878715515136719, "learning_rate": 1.4360193437066122e-05, "loss": 1.8876, "step": 403000 }, { "epoch": 1.5076038719452107, "grad_norm": 4.768284320831299, "learning_rate": 1.4257643567674483e-05, "loss": 1.9061, "step": 403500 }, { "epoch": 1.5094720303986744, "grad_norm": 4.845045566558838, "learning_rate": 1.4155195667736094e-05, "loss": 1.8932, "step": 404000 }, { "epoch": 1.511340188852138, "grad_norm": 3.8661012649536133, "learning_rate": 1.4053056437417239e-05, "loss": 1.9518, "step": 404500 }, { "epoch": 1.5132083473056017, "grad_norm": 4.624420166015625, "learning_rate": 1.3951226756267382e-05, "loss": 1.8403, "step": 405000 }, { "epoch": 1.5150765057590654, "grad_norm": 3.6633214950561523, "learning_rate": 1.3849910229293806e-05, "loss": 1.8943, "step": 405500 }, { "epoch": 1.516944664212529, "grad_norm": 5.2839155197143555, "learning_rate": 1.3748701650989005e-05, "loss": 1.8692, "step": 406000 }, { "epoch": 1.518812822665993, "grad_norm": 3.8412556648254395, "learning_rate": 1.3647805242737227e-05, "loss": 1.8699, "step": 406500 }, { "epoch": 1.5206809811194566, "grad_norm": 3.3254265785217285, "learning_rate": 1.3547221873385652e-05, "loss": 1.8909, "step": 407000 }, { "epoch": 1.5225491395729203, "grad_norm": 3.2033207416534424, "learning_rate": 1.3446952409085728e-05, "loss": 1.8986, "step": 407500 }, { "epoch": 1.5244172980263841, "grad_norm": 4.760767459869385, "learning_rate": 1.334719730796591e-05, "loss": 1.8756, "step": 408000 }, { "epoch": 1.5262854564798478, "grad_norm": 4.965844631195068, "learning_rate": 1.3247557609288142e-05, "loss": 1.8743, "step": 408500 }, { "epoch": 1.5281536149333115, "grad_norm": 4.014163494110107, "learning_rate": 1.314823439615473e-05, "loss": 1.9219, "step": 409000 }, { "epoch": 1.5300217733867751, "grad_norm": 4.178042888641357, "learning_rate": 1.3049228523865536e-05, "loss": 1.881, "step": 409500 }, { "epoch": 1.5318899318402388, "grad_norm": 4.607501983642578, "learning_rate": 1.2950737902223226e-05, "loss": 1.9469, "step": 410000 }, { "epoch": 1.5337580902937025, "grad_norm": 4.652303695678711, "learning_rate": 1.2852368627651334e-05, "loss": 1.8881, "step": 410500 }, { "epoch": 1.5356262487471661, "grad_norm": 4.992543697357178, "learning_rate": 1.2754319241706458e-05, "loss": 1.9569, "step": 411000 }, { "epoch": 1.5374944072006298, "grad_norm": 3.5058271884918213, "learning_rate": 1.2656590588719214e-05, "loss": 1.9032, "step": 411500 }, { "epoch": 1.5393625656540937, "grad_norm": 3.973353147506714, "learning_rate": 1.2559183510258338e-05, "loss": 1.8669, "step": 412000 }, { "epoch": 1.5412307241075573, "grad_norm": 4.776645660400391, "learning_rate": 1.2462292692129003e-05, "loss": 1.8993, "step": 412500 }, { "epoch": 1.543098882561021, "grad_norm": 4.160543441772461, "learning_rate": 1.2365530629011917e-05, "loss": 1.9269, "step": 413000 }, { "epoch": 1.544967041014485, "grad_norm": 4.14699125289917, "learning_rate": 1.226909264681978e-05, "loss": 1.9139, "step": 413500 }, { "epoch": 1.5468351994679486, "grad_norm": 4.639766693115234, "learning_rate": 1.2172979576006998e-05, "loss": 1.8844, "step": 414000 }, { "epoch": 1.5487033579214122, "grad_norm": 3.771737575531006, "learning_rate": 1.207719224423004e-05, "loss": 1.8961, "step": 414500 }, { "epoch": 1.550571516374876, "grad_norm": 4.165931701660156, "learning_rate": 1.1981922071418567e-05, "loss": 1.891, "step": 415000 }, { "epoch": 1.5524396748283396, "grad_norm": 5.3882341384887695, "learning_rate": 1.1886788033865165e-05, "loss": 1.8854, "step": 415500 }, { "epoch": 1.5543078332818032, "grad_norm": 4.879900932312012, "learning_rate": 1.1791982199822898e-05, "loss": 1.8817, "step": 416000 }, { "epoch": 1.5561759917352669, "grad_norm": 4.769500732421875, "learning_rate": 1.169750538569126e-05, "loss": 1.9078, "step": 416500 }, { "epoch": 1.5580441501887305, "grad_norm": 5.184789657592773, "learning_rate": 1.1603546369284646e-05, "loss": 1.864, "step": 417000 }, { "epoch": 1.5599123086421944, "grad_norm": 3.5462260246276855, "learning_rate": 1.1509729370737072e-05, "loss": 1.9012, "step": 417500 }, { "epoch": 1.561780467095658, "grad_norm": 4.478038311004639, "learning_rate": 1.1416243822658057e-05, "loss": 1.8541, "step": 418000 }, { "epoch": 1.563648625549122, "grad_norm": 4.2772650718688965, "learning_rate": 1.1323090530077756e-05, "loss": 1.9176, "step": 418500 }, { "epoch": 1.5655167840025856, "grad_norm": 4.45164155960083, "learning_rate": 1.123045560271172e-05, "loss": 1.9191, "step": 419000 }, { "epoch": 1.5673849424560493, "grad_norm": 4.31321382522583, "learning_rate": 1.1137968556258127e-05, "loss": 1.9104, "step": 419500 }, { "epoch": 1.569253100909513, "grad_norm": 3.313171625137329, "learning_rate": 1.1045816161609301e-05, "loss": 1.8969, "step": 420000 }, { "epoch": 1.5711212593629766, "grad_norm": 5.630086898803711, "learning_rate": 1.0953999212315213e-05, "loss": 1.8921, "step": 420500 }, { "epoch": 1.5729894178164403, "grad_norm": 4.993584632873535, "learning_rate": 1.0862518499037283e-05, "loss": 1.8845, "step": 421000 }, { "epoch": 1.574857576269904, "grad_norm": 5.677700996398926, "learning_rate": 1.077155676004855e-05, "loss": 1.8988, "step": 421500 }, { "epoch": 1.5767257347233676, "grad_norm": 4.58486795425415, "learning_rate": 1.068075020279995e-05, "loss": 1.9101, "step": 422000 }, { "epoch": 1.5785938931768315, "grad_norm": 4.042180061340332, "learning_rate": 1.0590282234591004e-05, "loss": 1.9224, "step": 422500 }, { "epoch": 1.5804620516302952, "grad_norm": 3.4549098014831543, "learning_rate": 1.0500153634466675e-05, "loss": 1.8885, "step": 423000 }, { "epoch": 1.5823302100837588, "grad_norm": 4.782561302185059, "learning_rate": 1.0410544415482986e-05, "loss": 1.9126, "step": 423500 }, { "epoch": 1.5841983685372227, "grad_norm": 4.326170921325684, "learning_rate": 1.0321096194361922e-05, "loss": 1.8519, "step": 424000 }, { "epoch": 1.5860665269906864, "grad_norm": 4.411458492279053, "learning_rate": 1.0231989659361606e-05, "loss": 1.8756, "step": 424500 }, { "epoch": 1.58793468544415, "grad_norm": 4.059584140777588, "learning_rate": 1.0143225577803328e-05, "loss": 1.897, "step": 425000 }, { "epoch": 1.5898028438976137, "grad_norm": 4.62555456161499, "learning_rate": 1.0054981212748877e-05, "loss": 1.9044, "step": 425500 }, { "epoch": 1.5916710023510774, "grad_norm": 3.3062992095947266, "learning_rate": 9.966903639519581e-06, "loss": 1.8671, "step": 426000 }, { "epoch": 1.593539160804541, "grad_norm": 3.750192880630493, "learning_rate": 9.879170802462034e-06, "loss": 1.9024, "step": 426500 }, { "epoch": 1.5954073192580047, "grad_norm": 3.6934866905212402, "learning_rate": 9.791783457068221e-06, "loss": 1.8972, "step": 427000 }, { "epoch": 1.5972754777114684, "grad_norm": 4.577314376831055, "learning_rate": 9.704916092006999e-06, "loss": 1.9391, "step": 427500 }, { "epoch": 1.5991436361649323, "grad_norm": 4.8952226638793945, "learning_rate": 9.618221289776025e-06, "loss": 1.8756, "step": 428000 }, { "epoch": 1.601011794618396, "grad_norm": 5.817446231842041, "learning_rate": 9.531874226317888e-06, "loss": 1.8756, "step": 428500 }, { "epoch": 1.6028799530718596, "grad_norm": 3.9412033557891846, "learning_rate": 9.445875645191288e-06, "loss": 1.912, "step": 429000 }, { "epoch": 1.6047481115253235, "grad_norm": 4.50702428817749, "learning_rate": 9.360397236655304e-06, "loss": 1.8652, "step": 429500 }, { "epoch": 1.6066162699787871, "grad_norm": 4.587414741516113, "learning_rate": 9.27509713820291e-06, "loss": 1.9097, "step": 430000 }, { "epoch": 1.6084844284322508, "grad_norm": 6.312617301940918, "learning_rate": 9.190147733261234e-06, "loss": 1.8736, "step": 430500 }, { "epoch": 1.6103525868857145, "grad_norm": 5.86572790145874, "learning_rate": 9.105549753353348e-06, "loss": 1.8866, "step": 431000 }, { "epoch": 1.6122207453391781, "grad_norm": 4.819661617279053, "learning_rate": 9.021303926976055e-06, "loss": 1.8648, "step": 431500 }, { "epoch": 1.6140889037926418, "grad_norm": 4.977511882781982, "learning_rate": 8.937578412834564e-06, "loss": 1.8504, "step": 432000 }, { "epoch": 1.6159570622461055, "grad_norm": 3.8270606994628906, "learning_rate": 8.85403835895094e-06, "loss": 1.9031, "step": 432500 }, { "epoch": 1.6178252206995691, "grad_norm": 3.582000255584717, "learning_rate": 8.770852624432785e-06, "loss": 1.9016, "step": 433000 }, { "epoch": 1.619693379153033, "grad_norm": 4.828258037567139, "learning_rate": 8.688021925615658e-06, "loss": 1.9003, "step": 433500 }, { "epoch": 1.6215615376064967, "grad_norm": 4.899356842041016, "learning_rate": 8.60571157016748e-06, "loss": 1.902, "step": 434000 }, { "epoch": 1.6234296960599606, "grad_norm": 3.5516891479492188, "learning_rate": 8.523592365898686e-06, "loss": 1.8574, "step": 434500 }, { "epoch": 1.6252978545134242, "grad_norm": 4.53317928314209, "learning_rate": 8.441830326558064e-06, "loss": 1.8844, "step": 435000 }, { "epoch": 1.627166012966888, "grad_norm": 6.883234977722168, "learning_rate": 8.360426156221358e-06, "loss": 1.859, "step": 435500 }, { "epoch": 1.6290341714203516, "grad_norm": 5.441802024841309, "learning_rate": 8.279542288766052e-06, "loss": 1.9012, "step": 436000 }, { "epoch": 1.6309023298738152, "grad_norm": 3.1804521083831787, "learning_rate": 8.198855237101328e-06, "loss": 1.8847, "step": 436500 }, { "epoch": 1.632770488327279, "grad_norm": 4.132668972015381, "learning_rate": 8.118528146766863e-06, "loss": 1.8517, "step": 437000 }, { "epoch": 1.6346386467807426, "grad_norm": 4.795321464538574, "learning_rate": 8.038561709481684e-06, "loss": 1.9175, "step": 437500 }, { "epoch": 1.6365068052342062, "grad_norm": 4.67226505279541, "learning_rate": 7.959115462975215e-06, "loss": 1.857, "step": 438000 }, { "epoch": 1.63837496368767, "grad_norm": 5.205322742462158, "learning_rate": 7.879871669780554e-06, "loss": 1.8824, "step": 438500 }, { "epoch": 1.6402431221411338, "grad_norm": 5.369668960571289, "learning_rate": 7.800990584772722e-06, "loss": 1.876, "step": 439000 }, { "epoch": 1.6421112805945974, "grad_norm": 4.469278335571289, "learning_rate": 7.722472887218802e-06, "loss": 1.8871, "step": 439500 }, { "epoch": 1.6439794390480613, "grad_norm": 4.810849189758301, "learning_rate": 7.644319253256577e-06, "loss": 1.892, "step": 440000 }, { "epoch": 1.645847597501525, "grad_norm": 5.1172027587890625, "learning_rate": 7.5666855692307025e-06, "loss": 1.9003, "step": 440500 }, { "epoch": 1.6477157559549886, "grad_norm": 5.264705181121826, "learning_rate": 7.48926134684001e-06, "loss": 1.866, "step": 441000 }, { "epoch": 1.6495839144084523, "grad_norm": 3.7140793800354004, "learning_rate": 7.41220319629074e-06, "loss": 1.8958, "step": 441500 }, { "epoch": 1.651452072861916, "grad_norm": 4.509251117706299, "learning_rate": 7.335511781152121e-06, "loss": 1.8784, "step": 442000 }, { "epoch": 1.6533202313153796, "grad_norm": 4.2154388427734375, "learning_rate": 7.259340042775581e-06, "loss": 1.8476, "step": 442500 }, { "epoch": 1.6551883897688433, "grad_norm": 6.030950546264648, "learning_rate": 7.183383339768157e-06, "loss": 1.9157, "step": 443000 }, { "epoch": 1.657056548222307, "grad_norm": 4.760791301727295, "learning_rate": 7.107795342603074e-06, "loss": 1.8709, "step": 443500 }, { "epoch": 1.6589247066757709, "grad_norm": 4.554337978363037, "learning_rate": 7.032576702189675e-06, "loss": 1.8865, "step": 444000 }, { "epoch": 1.6607928651292345, "grad_norm": 5.714734077453613, "learning_rate": 6.9578773938351495e-06, "loss": 1.8687, "step": 444500 }, { "epoch": 1.6626610235826982, "grad_norm": 4.749231338500977, "learning_rate": 6.883398664985902e-06, "loss": 1.8953, "step": 445000 }, { "epoch": 1.664529182036162, "grad_norm": 2.8103106021881104, "learning_rate": 6.809291225230813e-06, "loss": 1.8854, "step": 445500 }, { "epoch": 1.6663973404896257, "grad_norm": 6.017327308654785, "learning_rate": 6.735555712729713e-06, "loss": 1.8829, "step": 446000 }, { "epoch": 1.6682654989430894, "grad_norm": 5.306553363800049, "learning_rate": 6.662339116102778e-06, "loss": 1.8542, "step": 446500 }, { "epoch": 1.670133657396553, "grad_norm": 5.078936576843262, "learning_rate": 6.5893486127564465e-06, "loss": 1.9077, "step": 447000 }, { "epoch": 1.6720018158500167, "grad_norm": 5.262309551239014, "learning_rate": 6.516731930651387e-06, "loss": 1.8863, "step": 447500 }, { "epoch": 1.6738699743034804, "grad_norm": 5.343240261077881, "learning_rate": 6.444489695110101e-06, "loss": 1.8784, "step": 448000 }, { "epoch": 1.675738132756944, "grad_norm": 4.112715244293213, "learning_rate": 6.372622528230676e-06, "loss": 1.8559, "step": 448500 }, { "epoch": 1.6776062912104077, "grad_norm": 3.1489148139953613, "learning_rate": 6.301273656494144e-06, "loss": 1.8633, "step": 449000 }, { "epoch": 1.6794744496638716, "grad_norm": 5.503724575042725, "learning_rate": 6.230157727089419e-06, "loss": 1.8898, "step": 449500 }, { "epoch": 1.6813426081173353, "grad_norm": 4.443988800048828, "learning_rate": 6.159418712018961e-06, "loss": 1.881, "step": 450000 }, { "epoch": 1.6832107665707992, "grad_norm": 3.3895161151885986, "learning_rate": 6.089057220436195e-06, "loss": 1.8802, "step": 450500 }, { "epoch": 1.6850789250242628, "grad_norm": 4.960055828094482, "learning_rate": 6.0192134471937224e-06, "loss": 1.8593, "step": 451000 }, { "epoch": 1.6869470834777265, "grad_norm": 4.596670150756836, "learning_rate": 5.949608058974171e-06, "loss": 1.8924, "step": 451500 }, { "epoch": 1.6888152419311901, "grad_norm": 3.810817003250122, "learning_rate": 5.8803820009804165e-06, "loss": 1.8412, "step": 452000 }, { "epoch": 1.6906834003846538, "grad_norm": 6.2422380447387695, "learning_rate": 5.8115358693374035e-06, "loss": 1.875, "step": 452500 }, { "epoch": 1.6925515588381175, "grad_norm": 4.921154499053955, "learning_rate": 5.7432068079726676e-06, "loss": 1.8729, "step": 453000 }, { "epoch": 1.6944197172915811, "grad_norm": 5.331964015960693, "learning_rate": 5.675121541510353e-06, "loss": 1.8726, "step": 453500 }, { "epoch": 1.6962878757450448, "grad_norm": 4.561686038970947, "learning_rate": 5.607417968953904e-06, "loss": 1.8597, "step": 454000 }, { "epoch": 1.6981560341985087, "grad_norm": 5.06734037399292, "learning_rate": 5.5400966733176905e-06, "loss": 1.8741, "step": 454500 }, { "epoch": 1.7000241926519724, "grad_norm": 6.29988956451416, "learning_rate": 5.473291728727564e-06, "loss": 1.9034, "step": 455000 }, { "epoch": 1.701892351105436, "grad_norm": 5.206850051879883, "learning_rate": 5.406735955363129e-06, "loss": 1.8556, "step": 455500 }, { "epoch": 1.7037605095589, "grad_norm": 3.8202433586120605, "learning_rate": 5.340564187047786e-06, "loss": 1.8677, "step": 456000 }, { "epoch": 1.7056286680123636, "grad_norm": 3.6107611656188965, "learning_rate": 5.2747769936051125e-06, "loss": 1.8593, "step": 456500 }, { "epoch": 1.7074968264658272, "grad_norm": 4.204036235809326, "learning_rate": 5.20937494154699e-06, "loss": 1.8571, "step": 457000 }, { "epoch": 1.709364984919291, "grad_norm": 5.234120845794678, "learning_rate": 5.1444882414578675e-06, "loss": 1.8433, "step": 457500 }, { "epoch": 1.7112331433727546, "grad_norm": 3.4716298580169678, "learning_rate": 5.079857385347997e-06, "loss": 1.8765, "step": 458000 }, { "epoch": 1.7131013018262182, "grad_norm": 5.14175271987915, "learning_rate": 5.015613349129866e-06, "loss": 1.9206, "step": 458500 }, { "epoch": 1.714969460279682, "grad_norm": 4.21678352355957, "learning_rate": 4.951756686026798e-06, "loss": 1.8835, "step": 459000 }, { "epoch": 1.7168376187331456, "grad_norm": 3.8663065433502197, "learning_rate": 4.888414495895577e-06, "loss": 1.8974, "step": 459500 }, { "epoch": 1.7187057771866094, "grad_norm": 4.44641637802124, "learning_rate": 4.825333447862485e-06, "loss": 1.8963, "step": 460000 }, { "epoch": 1.720573935640073, "grad_norm": 4.290149211883545, "learning_rate": 4.762641411497825e-06, "loss": 1.8818, "step": 460500 }, { "epoch": 1.722442094093537, "grad_norm": 3.1460719108581543, "learning_rate": 4.700338926660225e-06, "loss": 1.8916, "step": 461000 }, { "epoch": 1.7243102525470007, "grad_norm": 3.602639675140381, "learning_rate": 4.63842652985379e-06, "loss": 1.8656, "step": 461500 }, { "epoch": 1.7261784110004643, "grad_norm": 4.454497337341309, "learning_rate": 4.577027407582085e-06, "loss": 1.8377, "step": 462000 }, { "epoch": 1.728046569453928, "grad_norm": 4.91801118850708, "learning_rate": 4.5158960000806275e-06, "loss": 1.8708, "step": 462500 }, { "epoch": 1.7299147279073916, "grad_norm": 5.951587200164795, "learning_rate": 4.45515626889988e-06, "loss": 1.8598, "step": 463000 }, { "epoch": 1.7317828863608553, "grad_norm": 3.9829583168029785, "learning_rate": 4.394808737086631e-06, "loss": 1.8637, "step": 463500 }, { "epoch": 1.733651044814319, "grad_norm": 4.84136962890625, "learning_rate": 4.334973441658552e-06, "loss": 1.849, "step": 464000 }, { "epoch": 1.7355192032677826, "grad_norm": 5.9698991775512695, "learning_rate": 4.275411077223152e-06, "loss": 1.8716, "step": 464500 }, { "epoch": 1.7373873617212465, "grad_norm": 6.253756046295166, "learning_rate": 4.216242459991293e-06, "loss": 1.877, "step": 465000 }, { "epoch": 1.7392555201747102, "grad_norm": 4.6036152839660645, "learning_rate": 4.157468099480438e-06, "loss": 1.8532, "step": 465500 }, { "epoch": 1.7411236786281739, "grad_norm": 4.482430934906006, "learning_rate": 4.099204866700346e-06, "loss": 1.858, "step": 466000 }, { "epoch": 1.7429918370816377, "grad_norm": 4.4797749519348145, "learning_rate": 4.041219743568814e-06, "loss": 1.8436, "step": 466500 }, { "epoch": 1.7448599955351014, "grad_norm": 5.49769926071167, "learning_rate": 3.983630384327791e-06, "loss": 1.8767, "step": 467000 }, { "epoch": 1.746728153988565, "grad_norm": 5.328680038452148, "learning_rate": 3.9264372848953125e-06, "loss": 1.8929, "step": 467500 }, { "epoch": 1.7485963124420287, "grad_norm": 3.2703754901885986, "learning_rate": 3.869640937777136e-06, "loss": 1.7657, "step": 468000 }, { "epoch": 1.7504644708954924, "grad_norm": 4.710208892822266, "learning_rate": 3.813241832062481e-06, "loss": 1.868, "step": 468500 }, { "epoch": 1.752332629348956, "grad_norm": 3.9908735752105713, "learning_rate": 3.7572404534197746e-06, "loss": 1.9306, "step": 469000 }, { "epoch": 1.7542007878024197, "grad_norm": 5.898683071136475, "learning_rate": 3.701637284092546e-06, "loss": 1.8756, "step": 469500 }, { "epoch": 1.7560689462558834, "grad_norm": 5.575063705444336, "learning_rate": 3.6465428136502942e-06, "loss": 1.8415, "step": 470000 }, { "epoch": 1.7579371047093473, "grad_norm": 3.8220248222351074, "learning_rate": 3.591736697164866e-06, "loss": 1.8549, "step": 470500 }, { "epoch": 1.759805263162811, "grad_norm": 4.483773708343506, "learning_rate": 3.5373302151939625e-06, "loss": 1.8414, "step": 471000 }, { "epoch": 1.7616734216162746, "grad_norm": 5.593682289123535, "learning_rate": 3.4833238362470044e-06, "loss": 1.8729, "step": 471500 }, { "epoch": 1.7635415800697385, "grad_norm": 3.2169010639190674, "learning_rate": 3.4298248369353582e-06, "loss": 1.8556, "step": 472000 }, { "epoch": 1.7654097385232022, "grad_norm": 5.516305923461914, "learning_rate": 3.3766192532610986e-06, "loss": 1.8855, "step": 472500 }, { "epoch": 1.7672778969766658, "grad_norm": 5.06584358215332, "learning_rate": 3.3239203637443983e-06, "loss": 1.8967, "step": 473000 }, { "epoch": 1.7691460554301295, "grad_norm": 4.666677474975586, "learning_rate": 3.271517404347946e-06, "loss": 1.8351, "step": 473500 }, { "epoch": 1.7710142138835931, "grad_norm": 5.4451823234558105, "learning_rate": 3.2195168369637765e-06, "loss": 1.8405, "step": 474000 }, { "epoch": 1.7728823723370568, "grad_norm": 4.598884582519531, "learning_rate": 3.1679191093832883e-06, "loss": 1.8774, "step": 474500 }, { "epoch": 1.7747505307905205, "grad_norm": 5.018040657043457, "learning_rate": 3.1167246659289217e-06, "loss": 1.8544, "step": 475000 }, { "epoch": 1.7766186892439841, "grad_norm": 5.349071502685547, "learning_rate": 3.065933947450339e-06, "loss": 1.8779, "step": 475500 }, { "epoch": 1.778486847697448, "grad_norm": 4.253110408782959, "learning_rate": 3.015547391320589e-06, "loss": 1.8161, "step": 476000 }, { "epoch": 1.7803550061509117, "grad_norm": 3.6783599853515625, "learning_rate": 2.9655654314323655e-06, "loss": 1.8395, "step": 476500 }, { "epoch": 1.7822231646043756, "grad_norm": 4.650113582611084, "learning_rate": 2.916185998547194e-06, "loss": 1.8573, "step": 477000 }, { "epoch": 1.7840913230578392, "grad_norm": 4.785963535308838, "learning_rate": 2.8670128962200117e-06, "loss": 1.839, "step": 477500 }, { "epoch": 1.785959481511303, "grad_norm": 4.258472442626953, "learning_rate": 2.818245669206393e-06, "loss": 1.8937, "step": 478000 }, { "epoch": 1.7878276399647666, "grad_norm": 5.702148914337158, "learning_rate": 2.7698847374545255e-06, "loss": 1.8767, "step": 478500 }, { "epoch": 1.7896957984182302, "grad_norm": 5.909474849700928, "learning_rate": 2.7219305174139067e-06, "loss": 1.8927, "step": 479000 }, { "epoch": 1.791563956871694, "grad_norm": 4.348086357116699, "learning_rate": 2.6743834220317286e-06, "loss": 1.8478, "step": 479500 }, { "epoch": 1.7934321153251576, "grad_norm": 4.148903846740723, "learning_rate": 2.62724386074929e-06, "loss": 1.855, "step": 480000 }, { "epoch": 1.7953002737786212, "grad_norm": 4.32988977432251, "learning_rate": 2.580512239498528e-06, "loss": 1.8551, "step": 480500 }, { "epoch": 1.7971684322320851, "grad_norm": 4.866036415100098, "learning_rate": 2.534188960698475e-06, "loss": 1.8938, "step": 481000 }, { "epoch": 1.7990365906855488, "grad_norm": 4.053302764892578, "learning_rate": 2.4883658441394673e-06, "loss": 1.8759, "step": 481500 }, { "epoch": 1.8009047491390124, "grad_norm": 5.242681980133057, "learning_rate": 2.4428596247633885e-06, "loss": 1.8914, "step": 482000 }, { "epoch": 1.8027729075924763, "grad_norm": 5.018854141235352, "learning_rate": 2.3977629332031404e-06, "loss": 1.8592, "step": 482500 }, { "epoch": 1.80464106604594, "grad_norm": 4.828859329223633, "learning_rate": 2.3530761577989e-06, "loss": 1.8676, "step": 483000 }, { "epoch": 1.8065092244994037, "grad_norm": 3.3137731552124023, "learning_rate": 2.3088878265754845e-06, "loss": 1.8182, "step": 483500 }, { "epoch": 1.8083773829528673, "grad_norm": 6.416788101196289, "learning_rate": 2.2650212126383242e-06, "loss": 1.8656, "step": 484000 }, { "epoch": 1.810245541406331, "grad_norm": 4.340769290924072, "learning_rate": 2.2215656579332167e-06, "loss": 1.9075, "step": 484500 }, { "epoch": 1.8121136998597946, "grad_norm": 4.634076118469238, "learning_rate": 2.17852153666806e-06, "loss": 1.8799, "step": 485000 }, { "epoch": 1.8139818583132583, "grad_norm": 4.349535942077637, "learning_rate": 2.1359740729170296e-06, "loss": 1.8522, "step": 485500 }, { "epoch": 1.815850016766722, "grad_norm": 4.439642429351807, "learning_rate": 2.0937531022739987e-06, "loss": 1.8578, "step": 486000 }, { "epoch": 1.8177181752201859, "grad_norm": 4.639336585998535, "learning_rate": 2.051944665700545e-06, "loss": 1.883, "step": 486500 }, { "epoch": 1.8195863336736495, "grad_norm": 4.625245571136475, "learning_rate": 2.010549123220773e-06, "loss": 1.8886, "step": 487000 }, { "epoch": 1.8214544921271132, "grad_norm": 4.0239667892456055, "learning_rate": 1.9696483832278845e-06, "loss": 1.8653, "step": 487500 }, { "epoch": 1.823322650580577, "grad_norm": 4.363647937774658, "learning_rate": 1.92907886722582e-06, "loss": 1.8718, "step": 488000 }, { "epoch": 1.8251908090340407, "grad_norm": 4.025300025939941, "learning_rate": 1.8889233033491493e-06, "loss": 1.8352, "step": 488500 }, { "epoch": 1.8270589674875044, "grad_norm": 6.883707046508789, "learning_rate": 1.8491820373886358e-06, "loss": 1.9056, "step": 489000 }, { "epoch": 1.828927125940968, "grad_norm": 5.169373512268066, "learning_rate": 1.8098554115674292e-06, "loss": 1.8994, "step": 489500 }, { "epoch": 1.8307952843944317, "grad_norm": 5.691972255706787, "learning_rate": 1.7710985840431572e-06, "loss": 1.8602, "step": 490000 }, { "epoch": 1.8326634428478954, "grad_norm": 4.719027042388916, "learning_rate": 1.7326005889664986e-06, "loss": 1.8645, "step": 490500 }, { "epoch": 1.834531601301359, "grad_norm": 5.3066816329956055, "learning_rate": 1.6945182379445534e-06, "loss": 1.879, "step": 491000 }, { "epoch": 1.8363997597548227, "grad_norm": 5.338113307952881, "learning_rate": 1.6568518589150705e-06, "loss": 1.8811, "step": 491500 }, { "epoch": 1.8382679182082866, "grad_norm": 3.351616382598877, "learning_rate": 1.61960177623377e-06, "loss": 1.8459, "step": 492000 }, { "epoch": 1.8401360766617503, "grad_norm": 5.075439929962158, "learning_rate": 1.5827683106715008e-06, "loss": 1.8515, "step": 492500 }, { "epoch": 1.8420042351152142, "grad_norm": 4.089956283569336, "learning_rate": 1.5463517794115367e-06, "loss": 1.8624, "step": 493000 }, { "epoch": 1.8438723935686778, "grad_norm": 6.492163181304932, "learning_rate": 1.5103524960467908e-06, "loss": 1.8245, "step": 493500 }, { "epoch": 1.8457405520221415, "grad_norm": 6.452279567718506, "learning_rate": 1.4748415171010387e-06, "loss": 1.8406, "step": 494000 }, { "epoch": 1.8476087104756052, "grad_norm": 3.7838053703308105, "learning_rate": 1.4396768198986554e-06, "loss": 1.8508, "step": 494500 }, { "epoch": 1.8494768689290688, "grad_norm": 3.706258535385132, "learning_rate": 1.4049302891993631e-06, "loss": 1.8484, "step": 495000 }, { "epoch": 1.8513450273825325, "grad_norm": 4.734787940979004, "learning_rate": 1.3706022242152227e-06, "loss": 1.8616, "step": 495500 }, { "epoch": 1.8532131858359961, "grad_norm": 5.525266170501709, "learning_rate": 1.336760321043634e-06, "loss": 1.8696, "step": 496000 }, { "epoch": 1.8550813442894598, "grad_norm": 3.555717706680298, "learning_rate": 1.3032692323137307e-06, "loss": 1.8539, "step": 496500 }, { "epoch": 1.8569495027429237, "grad_norm": 4.906459331512451, "learning_rate": 1.2701974847307452e-06, "loss": 1.8555, "step": 497000 }, { "epoch": 1.8588176611963874, "grad_norm": 5.703590393066406, "learning_rate": 1.2375453630847134e-06, "loss": 1.8088, "step": 497500 }, { "epoch": 1.860685819649851, "grad_norm": 4.265283107757568, "learning_rate": 1.2053771937288626e-06, "loss": 1.8823, "step": 498000 }, { "epoch": 1.862553978103315, "grad_norm": 4.899601936340332, "learning_rate": 1.1735643232264836e-06, "loss": 1.8687, "step": 498500 }, { "epoch": 1.8644221365567786, "grad_norm": 4.975470542907715, "learning_rate": 1.1422342758236281e-06, "loss": 1.871, "step": 499000 }, { "epoch": 1.8662902950102422, "grad_norm": 4.806349754333496, "learning_rate": 1.1112617500700973e-06, "loss": 1.8244, "step": 499500 }, { "epoch": 1.868158453463706, "grad_norm": 5.105782508850098, "learning_rate": 1.0807102188935214e-06, "loss": 1.8867, "step": 500000 }, { "epoch": 1.8700266119171696, "grad_norm": 5.97845458984375, "learning_rate": 1.050579945381669e-06, "loss": 1.8339, "step": 500500 }, { "epoch": 1.8718947703706332, "grad_norm": 4.778586387634277, "learning_rate": 1.0208711889947376e-06, "loss": 1.8423, "step": 501000 }, { "epoch": 1.873762928824097, "grad_norm": 4.4693169593811035, "learning_rate": 9.915842055631286e-07, "loss": 1.8629, "step": 501500 }, { "epoch": 1.8756310872775606, "grad_norm": 5.0336222648620605, "learning_rate": 9.62719247285221e-07, "loss": 1.8386, "step": 502000 }, { "epoch": 1.8774992457310244, "grad_norm": 4.51587438583374, "learning_rate": 9.342765627252504e-07, "loss": 1.8566, "step": 502500 }, { "epoch": 1.879367404184488, "grad_norm": 4.207951068878174, "learning_rate": 9.062563968110948e-07, "loss": 1.8517, "step": 503000 }, { "epoch": 1.8812355626379518, "grad_norm": 3.8609273433685303, "learning_rate": 8.787137635712206e-07, "loss": 1.8727, "step": 503500 }, { "epoch": 1.8831037210914157, "grad_norm": 4.1626877784729, "learning_rate": 8.515385089467198e-07, "loss": 1.89, "step": 504000 }, { "epoch": 1.8849718795448793, "grad_norm": 3.9561331272125244, "learning_rate": 8.247864854485199e-07, "loss": 1.8863, "step": 504500 }, { "epoch": 1.886840037998343, "grad_norm": 4.846907138824463, "learning_rate": 7.98457923445789e-07, "loss": 1.8208, "step": 505000 }, { "epoch": 1.8887081964518067, "grad_norm": 4.7613911628723145, "learning_rate": 7.726044364189499e-07, "loss": 1.8515, "step": 505500 }, { "epoch": 1.8905763549052703, "grad_norm": 5.021259307861328, "learning_rate": 7.47122625883645e-07, "loss": 1.8398, "step": 506000 }, { "epoch": 1.892444513358734, "grad_norm": 6.04338264465332, "learning_rate": 7.220649456289641e-07, "loss": 1.8433, "step": 506500 }, { "epoch": 1.8943126718121976, "grad_norm": 4.8739094734191895, "learning_rate": 6.974316114336077e-07, "loss": 1.8352, "step": 507000 }, { "epoch": 1.8961808302656613, "grad_norm": 4.441490650177002, "learning_rate": 6.732708291258827e-07, "loss": 1.8887, "step": 507500 }, { "epoch": 1.8980489887191252, "grad_norm": 3.811279058456421, "learning_rate": 6.494859700278133e-07, "loss": 1.8689, "step": 508000 }, { "epoch": 1.8999171471725889, "grad_norm": 2.8529744148254395, "learning_rate": 6.26126081986883e-07, "loss": 1.9027, "step": 508500 }, { "epoch": 1.9017853056260527, "grad_norm": 4.631827354431152, "learning_rate": 6.031913661616207e-07, "loss": 1.848, "step": 509000 }, { "epoch": 1.9036534640795164, "grad_norm": 3.616713762283325, "learning_rate": 5.807266140930689e-07, "loss": 1.8911, "step": 509500 }, { "epoch": 1.90552162253298, "grad_norm": 5.187899112701416, "learning_rate": 5.586419802097898e-07, "loss": 1.8309, "step": 510000 }, { "epoch": 1.9073897809864437, "grad_norm": 5.249440670013428, "learning_rate": 5.369830996666103e-07, "loss": 1.8542, "step": 510500 }, { "epoch": 1.9092579394399074, "grad_norm": 5.117617607116699, "learning_rate": 5.157501589742042e-07, "loss": 1.8459, "step": 511000 }, { "epoch": 1.911126097893371, "grad_norm": 5.904655456542969, "learning_rate": 4.949433409753679e-07, "loss": 1.8495, "step": 511500 }, { "epoch": 1.9129942563468347, "grad_norm": 6.1428632736206055, "learning_rate": 4.7460316030914495e-07, "loss": 1.8274, "step": 512000 }, { "epoch": 1.9148624148002984, "grad_norm": 4.737666130065918, "learning_rate": 4.546482684189279e-07, "loss": 1.8814, "step": 512500 }, { "epoch": 1.9167305732537623, "grad_norm": 5.555963516235352, "learning_rate": 4.351200253877141e-07, "loss": 1.8644, "step": 513000 }, { "epoch": 1.918598731707226, "grad_norm": 4.281107425689697, "learning_rate": 4.160185993786592e-07, "loss": 1.8685, "step": 513500 }, { "epoch": 1.9204668901606896, "grad_norm": 4.849224090576172, "learning_rate": 3.973441548794699e-07, "loss": 1.8921, "step": 514000 }, { "epoch": 1.9223350486141535, "grad_norm": 5.799472332000732, "learning_rate": 3.791329209122674e-07, "loss": 1.8326, "step": 514500 }, { "epoch": 1.9242032070676172, "grad_norm": 5.754580020904541, "learning_rate": 3.613120634338663e-07, "loss": 1.8677, "step": 515000 }, { "epoch": 1.9260713655210808, "grad_norm": 4.404658317565918, "learning_rate": 3.4391865855858406e-07, "loss": 1.8637, "step": 515500 }, { "epoch": 1.9279395239745445, "grad_norm": 4.911507606506348, "learning_rate": 3.2695285606589856e-07, "loss": 1.85, "step": 516000 }, { "epoch": 1.9298076824280082, "grad_norm": 4.071664333343506, "learning_rate": 3.1044745117284056e-07, "loss": 1.8303, "step": 516500 }, { "epoch": 1.9316758408814718, "grad_norm": 5.3374223709106445, "learning_rate": 2.9433643213220284e-07, "loss": 1.8384, "step": 517000 }, { "epoch": 1.9335439993349355, "grad_norm": 5.541077613830566, "learning_rate": 2.7865344244054625e-07, "loss": 1.8562, "step": 517500 }, { "epoch": 1.9354121577883991, "grad_norm": 4.992559432983398, "learning_rate": 2.6339861714849144e-07, "loss": 1.8563, "step": 518000 }, { "epoch": 1.937280316241863, "grad_norm": 3.9907846450805664, "learning_rate": 2.486013131539955e-07, "loss": 1.8736, "step": 518500 }, { "epoch": 1.9391484746953267, "grad_norm": 3.9517438411712646, "learning_rate": 2.3420235009178893e-07, "loss": 1.859, "step": 519000 }, { "epoch": 1.9410166331487904, "grad_norm": 4.987946510314941, "learning_rate": 2.2023193420994125e-07, "loss": 1.8258, "step": 519500 }, { "epoch": 1.9428847916022542, "grad_norm": 4.550879955291748, "learning_rate": 2.0669018581160883e-07, "loss": 1.8678, "step": 520000 }, { "epoch": 1.944752950055718, "grad_norm": 3.339261293411255, "learning_rate": 1.936030194349736e-07, "loss": 1.8278, "step": 520500 }, { "epoch": 1.9466211085091816, "grad_norm": 5.5620951652526855, "learning_rate": 1.8091809424235495e-07, "loss": 1.8996, "step": 521000 }, { "epoch": 1.9484892669626452, "grad_norm": 3.614462375640869, "learning_rate": 1.6866217507570114e-07, "loss": 1.8478, "step": 521500 }, { "epoch": 1.950357425416109, "grad_norm": 4.48366117477417, "learning_rate": 1.5683536747416184e-07, "loss": 1.8555, "step": 522000 }, { "epoch": 1.9522255838695726, "grad_norm": 5.737336158752441, "learning_rate": 1.454601400492306e-07, "loss": 1.8463, "step": 522500 }, { "epoch": 1.9540937423230362, "grad_norm": 3.779061794281006, "learning_rate": 1.3449099869505266e-07, "loss": 1.8293, "step": 523000 }, { "epoch": 1.9559619007765, "grad_norm": 5.098133087158203, "learning_rate": 1.239512631635298e-07, "loss": 1.8594, "step": 523500 }, { "epoch": 1.9578300592299638, "grad_norm": 4.416299343109131, "learning_rate": 1.1384102421526654e-07, "loss": 1.8593, "step": 524000 }, { "epoch": 1.9596982176834274, "grad_norm": 3.656932830810547, "learning_rate": 1.0417930144245858e-07, "loss": 1.836, "step": 524500 }, { "epoch": 1.9615663761368913, "grad_norm": 5.132260322570801, "learning_rate": 9.492745373296808e-08, "loss": 1.8943, "step": 525000 }, { "epoch": 1.963434534590355, "grad_norm": 4.663350582122803, "learning_rate": 8.61053525388622e-08, "loss": 1.8534, "step": 525500 }, { "epoch": 1.9653026930438187, "grad_norm": 6.682803153991699, "learning_rate": 7.77130738297216e-08, "loss": 1.8735, "step": 526000 }, { "epoch": 1.9671708514972823, "grad_norm": 6.555516719818115, "learning_rate": 6.976618556056025e-08, "loss": 1.88, "step": 526500 }, { "epoch": 1.969039009950746, "grad_norm": 5.245980739593506, "learning_rate": 6.223290493156397e-08, "loss": 1.8565, "step": 527000 }, { "epoch": 1.9709071684042097, "grad_norm": 3.9505879878997803, "learning_rate": 5.512965235983658e-08, "loss": 1.8449, "step": 527500 }, { "epoch": 1.9727753268576733, "grad_norm": 6.470322132110596, "learning_rate": 4.8456489013481986e-08, "loss": 1.8588, "step": 528000 }, { "epoch": 1.974643485311137, "grad_norm": 5.629650592803955, "learning_rate": 4.221347235697226e-08, "loss": 1.8839, "step": 528500 }, { "epoch": 1.9765116437646009, "grad_norm": 3.961327075958252, "learning_rate": 3.6411852409129475e-08, "loss": 1.8824, "step": 529000 }, { "epoch": 1.9783798022180645, "grad_norm": 4.475338935852051, "learning_rate": 3.1028426160295554e-08, "loss": 1.8725, "step": 529500 }, { "epoch": 1.9802479606715282, "grad_norm": 6.577774524688721, "learning_rate": 2.607529667921771e-08, "loss": 1.8575, "step": 530000 }, { "epoch": 1.982116119124992, "grad_norm": 6.510643005371094, "learning_rate": 2.1552506618677248e-08, "loss": 1.8503, "step": 530500 }, { "epoch": 1.9839842775784557, "grad_norm": 4.923540115356445, "learning_rate": 1.746785020741437e-08, "loss": 1.8607, "step": 531000 }, { "epoch": 1.9858524360319194, "grad_norm": 4.267704486846924, "learning_rate": 1.3804991262938994e-08, "loss": 1.8248, "step": 531500 }, { "epoch": 1.987720594485383, "grad_norm": 5.18399715423584, "learning_rate": 1.0572577402029326e-08, "loss": 1.8468, "step": 532000 }, { "epoch": 1.9895887529388467, "grad_norm": 4.5753045082092285, "learning_rate": 7.770636459902836e-09, "loss": 1.8354, "step": 532500 }, { "epoch": 1.9914569113923104, "grad_norm": 4.492304801940918, "learning_rate": 5.403505802398234e-09, "loss": 1.8668, "step": 533000 }, { "epoch": 1.993325069845774, "grad_norm": 6.207240104675293, "learning_rate": 3.461718322739227e-09, "loss": 1.8532, "step": 533500 }, { "epoch": 1.9951932282992377, "grad_norm": 6.569146156311035, "learning_rate": 1.9504649954538156e-09, "loss": 1.8313, "step": 534000 }, { "epoch": 1.9970613867527016, "grad_norm": 3.274258852005005, "learning_rate": 8.69758834370904e-10, "loss": 1.9104, "step": 534500 }, { "epoch": 1.9989295452061653, "grad_norm": 4.226444721221924, "learning_rate": 2.2047974543304427e-10, "loss": 1.8681, "step": 535000 } ], "logging_steps": 500, "max_steps": 535286, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4321334103279616e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }