{ "best_metric": null, "best_model_checkpoint": null, "epoch": 35.0, "eval_steps": 500, "global_step": 175490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0997207818109294, "grad_norm": 2.346997022628784, "learning_rate": 4.5023932987634625e-05, "loss": 8.2424, "step": 500 }, { "epoch": 0.1994415636218588, "grad_norm": 2.3684158325195312, "learning_rate": 4.0037893897088155e-05, "loss": 7.6851, "step": 1000 }, { "epoch": 0.2991623454327882, "grad_norm": 3.409303665161133, "learning_rate": 3.5051854806541686e-05, "loss": 7.4872, "step": 1500 }, { "epoch": 0.3988831272437176, "grad_norm": 2.615360975265503, "learning_rate": 3.0065815715995216e-05, "loss": 7.344, "step": 2000 }, { "epoch": 0.49860390905464697, "grad_norm": 3.5242176055908203, "learning_rate": 2.5079776625448743e-05, "loss": 7.2749, "step": 2500 }, { "epoch": 0.5983246908655764, "grad_norm": 3.690262794494629, "learning_rate": 2.0093737534902273e-05, "loss": 7.1657, "step": 3000 }, { "epoch": 0.6980454726765057, "grad_norm": 2.940692663192749, "learning_rate": 1.5107698444355806e-05, "loss": 7.1298, "step": 3500 }, { "epoch": 0.7977662544874352, "grad_norm": 2.9132378101348877, "learning_rate": 1.0121659353809334e-05, "loss": 7.0938, "step": 4000 }, { "epoch": 0.8974870362983646, "grad_norm": 3.101921558380127, "learning_rate": 5.135620263262864e-06, "loss": 7.0715, "step": 4500 }, { "epoch": 0.9972078181092939, "grad_norm": 3.2258358001708984, "learning_rate": 1.495811727163941e-07, "loss": 7.0478, "step": 5000 }, { "epoch": 1.0969285999202234, "grad_norm": 3.2722208499908447, "learning_rate": 3.903270841643399e-05, "loss": 7.0374, "step": 5500 }, { "epoch": 1.1966493817311528, "grad_norm": 5.218217849731445, "learning_rate": 3.803550059832469e-05, "loss": 7.0289, "step": 6000 }, { "epoch": 1.2963701635420821, "grad_norm": 3.466571807861328, "learning_rate": 3.70382927802154e-05, "loss": 6.9595, "step": 6500 }, { "epoch": 1.3960909453530115, "grad_norm": 3.688443183898926, "learning_rate": 3.6041084962106106e-05, "loss": 6.9267, "step": 7000 }, { "epoch": 1.4958117271639408, "grad_norm": 3.0426700115203857, "learning_rate": 3.504387714399681e-05, "loss": 6.8954, "step": 7500 }, { "epoch": 1.5955325089748702, "grad_norm": 3.7769949436187744, "learning_rate": 3.404666932588751e-05, "loss": 6.8657, "step": 8000 }, { "epoch": 1.6952532907857998, "grad_norm": 3.0776305198669434, "learning_rate": 3.304946150777822e-05, "loss": 6.8285, "step": 8500 }, { "epoch": 1.7949740725967291, "grad_norm": 3.350515604019165, "learning_rate": 3.2052253689668926e-05, "loss": 6.7948, "step": 9000 }, { "epoch": 1.8946948544076585, "grad_norm": 3.393035411834717, "learning_rate": 3.1055045871559636e-05, "loss": 6.7725, "step": 9500 }, { "epoch": 1.994415636218588, "grad_norm": 3.438401222229004, "learning_rate": 3.0057838053450336e-05, "loss": 6.7484, "step": 10000 }, { "epoch": 2.0941364180295174, "grad_norm": 4.042023181915283, "learning_rate": 2.9060630235341047e-05, "loss": 6.6939, "step": 10500 }, { "epoch": 2.193857199840447, "grad_norm": 3.3481028079986572, "learning_rate": 2.8063422417231757e-05, "loss": 6.6854, "step": 11000 }, { "epoch": 2.293577981651376, "grad_norm": 3.266961097717285, "learning_rate": 2.706820901475868e-05, "loss": 6.6555, "step": 11500 }, { "epoch": 2.3932987634623055, "grad_norm": 3.215405225753784, "learning_rate": 2.607100119664938e-05, "loss": 6.6713, "step": 12000 }, { "epoch": 2.493019545273235, "grad_norm": 3.380500316619873, "learning_rate": 2.507379337854009e-05, "loss": 6.6581, "step": 12500 }, { "epoch": 2.5927403270841642, "grad_norm": 3.536166191101074, "learning_rate": 2.4076585560430796e-05, "loss": 6.5945, "step": 13000 }, { "epoch": 2.6924611088950936, "grad_norm": 3.9319474697113037, "learning_rate": 2.30793777423215e-05, "loss": 6.6057, "step": 13500 }, { "epoch": 2.792181890706023, "grad_norm": 4.334239482879639, "learning_rate": 2.2084164339848425e-05, "loss": 6.5818, "step": 14000 }, { "epoch": 2.8919026725169523, "grad_norm": 4.093286514282227, "learning_rate": 2.1086956521739132e-05, "loss": 6.5732, "step": 14500 }, { "epoch": 2.9916234543278817, "grad_norm": 4.026576995849609, "learning_rate": 2.008974870362984e-05, "loss": 6.5627, "step": 15000 }, { "epoch": 3.0913442361388115, "grad_norm": 3.7285637855529785, "learning_rate": 1.9092540885520542e-05, "loss": 6.5268, "step": 15500 }, { "epoch": 3.191065017949741, "grad_norm": 3.7349226474761963, "learning_rate": 1.809533306741125e-05, "loss": 6.5388, "step": 16000 }, { "epoch": 3.29078579976067, "grad_norm": 3.5330066680908203, "learning_rate": 1.7098125249301956e-05, "loss": 6.5141, "step": 16500 }, { "epoch": 3.3905065815715996, "grad_norm": 3.6961631774902344, "learning_rate": 1.6100917431192662e-05, "loss": 6.5013, "step": 17000 }, { "epoch": 3.490227363382529, "grad_norm": 3.413053274154663, "learning_rate": 1.5103709613083367e-05, "loss": 6.4932, "step": 17500 }, { "epoch": 3.5899481451934583, "grad_norm": 4.584457874298096, "learning_rate": 1.4108496210610292e-05, "loss": 6.4695, "step": 18000 }, { "epoch": 3.6896689270043876, "grad_norm": 3.3078787326812744, "learning_rate": 1.3111288392500998e-05, "loss": 6.4711, "step": 18500 }, { "epoch": 3.789389708815317, "grad_norm": 3.6679279804229736, "learning_rate": 1.2114080574391703e-05, "loss": 6.466, "step": 19000 }, { "epoch": 3.8891104906262464, "grad_norm": 4.358784198760986, "learning_rate": 1.1116872756282408e-05, "loss": 6.4568, "step": 19500 }, { "epoch": 3.988831272437176, "grad_norm": 4.014244556427002, "learning_rate": 1.0119664938173115e-05, "loss": 6.4536, "step": 20000 }, { "epoch": 4.0885520542481055, "grad_norm": 3.8396079540252686, "learning_rate": 9.122457120063822e-06, "loss": 6.443, "step": 20500 }, { "epoch": 4.188272836059035, "grad_norm": 3.850647449493408, "learning_rate": 8.125249301954529e-06, "loss": 6.4186, "step": 21000 }, { "epoch": 4.287993617869964, "grad_norm": 3.829951047897339, "learning_rate": 7.128041483845234e-06, "loss": 6.4178, "step": 21500 }, { "epoch": 4.387714399680894, "grad_norm": 3.5512278079986572, "learning_rate": 6.132828081372159e-06, "loss": 6.4055, "step": 22000 }, { "epoch": 4.487435181491823, "grad_norm": 3.568665027618408, "learning_rate": 5.135620263262864e-06, "loss": 6.4076, "step": 22500 }, { "epoch": 4.587155963302752, "grad_norm": 3.71463942527771, "learning_rate": 4.13841244515357e-06, "loss": 6.4086, "step": 23000 }, { "epoch": 4.686876745113682, "grad_norm": 3.9615983963012695, "learning_rate": 3.1412046270442757e-06, "loss": 6.4061, "step": 23500 }, { "epoch": 4.786597526924611, "grad_norm": 4.0287909507751465, "learning_rate": 2.1459912245712007e-06, "loss": 6.3772, "step": 24000 }, { "epoch": 4.88631830873554, "grad_norm": 4.012565612792969, "learning_rate": 1.1487834064619066e-06, "loss": 6.3956, "step": 24500 }, { "epoch": 4.98603909054647, "grad_norm": 4.36814022064209, "learning_rate": 1.515755883526127e-07, "loss": 6.3996, "step": 25000 }, { "epoch": 5.0, "step": 25070, "total_flos": 2.639861525017728e+16, "train_loss": 5.285465436767286, "train_runtime": 6500.188, "train_samples_per_second": 61.705, "train_steps_per_second": 3.857 }, { "epoch": 5.085759872357399, "grad_norm": 3.5418105125427246, "learning_rate": 4.946400079776626e-05, "loss": 6.5458, "step": 25500 }, { "epoch": 5.1854806541683285, "grad_norm": 4.323005676269531, "learning_rate": 4.884074591144795e-05, "loss": 6.5604, "step": 26000 }, { "epoch": 5.285201435979258, "grad_norm": 4.445618629455566, "learning_rate": 4.8217491025129644e-05, "loss": 6.5452, "step": 26500 }, { "epoch": 5.384922217790187, "grad_norm": 4.320890426635742, "learning_rate": 4.759423613881133e-05, "loss": 6.5239, "step": 27000 }, { "epoch": 5.484642999601117, "grad_norm": 3.8980209827423096, "learning_rate": 4.697098125249302e-05, "loss": 6.5278, "step": 27500 }, { "epoch": 5.584363781412046, "grad_norm": 4.074916362762451, "learning_rate": 4.6347726366174716e-05, "loss": 6.5044, "step": 28000 }, { "epoch": 5.684084563222975, "grad_norm": 4.465285778045654, "learning_rate": 4.572447147985641e-05, "loss": 6.472, "step": 28500 }, { "epoch": 5.783805345033905, "grad_norm": 4.351347923278809, "learning_rate": 4.5101216593538095e-05, "loss": 6.4504, "step": 29000 }, { "epoch": 5.883526126844835, "grad_norm": 4.14565372467041, "learning_rate": 4.447796170721978e-05, "loss": 6.4375, "step": 29500 }, { "epoch": 5.983246908655763, "grad_norm": 4.669959545135498, "learning_rate": 4.3854706820901474e-05, "loss": 6.4393, "step": 30000 }, { "epoch": 6.082967690466694, "grad_norm": 4.345717430114746, "learning_rate": 4.323145193458317e-05, "loss": 6.3808, "step": 30500 }, { "epoch": 6.182688472277623, "grad_norm": 4.040054798126221, "learning_rate": 4.260819704826486e-05, "loss": 6.3705, "step": 31000 }, { "epoch": 6.282409254088552, "grad_norm": 4.663171291351318, "learning_rate": 4.198618867171919e-05, "loss": 6.3803, "step": 31500 }, { "epoch": 6.382130035899482, "grad_norm": 4.45890474319458, "learning_rate": 4.136293378540088e-05, "loss": 6.3256, "step": 32000 }, { "epoch": 6.481850817710411, "grad_norm": 4.158110618591309, "learning_rate": 4.073967889908257e-05, "loss": 6.3351, "step": 32500 }, { "epoch": 6.58157159952134, "grad_norm": 4.460795879364014, "learning_rate": 4.0116424012764265e-05, "loss": 6.3137, "step": 33000 }, { "epoch": 6.68129238133227, "grad_norm": 4.767895221710205, "learning_rate": 3.949316912644596e-05, "loss": 6.2751, "step": 33500 }, { "epoch": 6.781013163143199, "grad_norm": 4.399994850158691, "learning_rate": 3.887116074990028e-05, "loss": 6.2345, "step": 34000 }, { "epoch": 6.8807339449541285, "grad_norm": 4.522914886474609, "learning_rate": 3.8247905863581976e-05, "loss": 6.218, "step": 34500 }, { "epoch": 6.980454726765058, "grad_norm": 4.697731018066406, "learning_rate": 3.762465097726366e-05, "loss": 6.1819, "step": 35000 }, { "epoch": 7.080175508575987, "grad_norm": 5.113608360290527, "learning_rate": 3.7001396090945355e-05, "loss": 6.1566, "step": 35500 }, { "epoch": 7.179896290386917, "grad_norm": 4.987142086029053, "learning_rate": 3.637814120462705e-05, "loss": 6.1504, "step": 36000 }, { "epoch": 7.279617072197846, "grad_norm": 4.797494888305664, "learning_rate": 3.5756132828081373e-05, "loss": 6.0915, "step": 36500 }, { "epoch": 7.379337854008775, "grad_norm": 5.114543437957764, "learning_rate": 3.5132877941763066e-05, "loss": 6.0859, "step": 37000 }, { "epoch": 7.479058635819705, "grad_norm": 5.5212721824646, "learning_rate": 3.450962305544476e-05, "loss": 6.0643, "step": 37500 }, { "epoch": 7.578779417630634, "grad_norm": 4.77981424331665, "learning_rate": 3.3886368169126446e-05, "loss": 6.038, "step": 38000 }, { "epoch": 7.678500199441563, "grad_norm": 5.6912760734558105, "learning_rate": 3.326311328280814e-05, "loss": 6.0327, "step": 38500 }, { "epoch": 7.778220981252493, "grad_norm": 5.021594524383545, "learning_rate": 3.2641104906262464e-05, "loss": 6.0089, "step": 39000 }, { "epoch": 7.877941763063422, "grad_norm": 4.9512410163879395, "learning_rate": 3.201785001994416e-05, "loss": 5.9914, "step": 39500 }, { "epoch": 7.9776625448743514, "grad_norm": 4.6659088134765625, "learning_rate": 3.139459513362585e-05, "loss": 5.9688, "step": 40000 }, { "epoch": 8.07738332668528, "grad_norm": 5.084179401397705, "learning_rate": 3.601552017986003e-05, "loss": 5.9368, "step": 40500 }, { "epoch": 8.177104108496211, "grad_norm": 5.475657939910889, "learning_rate": 3.556224389890126e-05, "loss": 5.9181, "step": 41000 }, { "epoch": 8.27682489030714, "grad_norm": 4.678411960601807, "learning_rate": 3.510896761794249e-05, "loss": 5.8795, "step": 41500 }, { "epoch": 8.37654567211807, "grad_norm": 5.502169132232666, "learning_rate": 3.465569133698372e-05, "loss": 5.8389, "step": 42000 }, { "epoch": 8.476266453928998, "grad_norm": 5.32131290435791, "learning_rate": 3.420241505602495e-05, "loss": 5.8329, "step": 42500 }, { "epoch": 8.575987235739928, "grad_norm": 5.6808552742004395, "learning_rate": 3.374913877506618e-05, "loss": 5.8001, "step": 43000 }, { "epoch": 8.675708017550857, "grad_norm": 4.988351821899414, "learning_rate": 3.329586249410741e-05, "loss": 5.7928, "step": 43500 }, { "epoch": 8.775428799361787, "grad_norm": 5.559896469116211, "learning_rate": 3.284258621314864e-05, "loss": 5.7488, "step": 44000 }, { "epoch": 8.875149581172716, "grad_norm": 6.084516525268555, "learning_rate": 3.238930993218987e-05, "loss": 5.7262, "step": 44500 }, { "epoch": 8.974870362983646, "grad_norm": 6.219081401824951, "learning_rate": 3.19360336512311e-05, "loss": 5.6925, "step": 45000 }, { "epoch": 9.074591144794574, "grad_norm": 6.170139789581299, "learning_rate": 3.1482757370272333e-05, "loss": 5.6491, "step": 45500 }, { "epoch": 9.174311926605505, "grad_norm": 5.830073356628418, "learning_rate": 3.102948108931356e-05, "loss": 5.6228, "step": 46000 }, { "epoch": 9.274032708416435, "grad_norm": 5.452333927154541, "learning_rate": 3.0577111360916706e-05, "loss": 5.5724, "step": 46500 }, { "epoch": 9.373753490227363, "grad_norm": 5.113864421844482, "learning_rate": 3.0123835079957935e-05, "loss": 5.5437, "step": 47000 }, { "epoch": 9.473474272038294, "grad_norm": 5.875530242919922, "learning_rate": 2.9670558798999164e-05, "loss": 5.525, "step": 47500 }, { "epoch": 9.573195053849222, "grad_norm": 5.342255592346191, "learning_rate": 2.9217282518040397e-05, "loss": 5.5145, "step": 48000 }, { "epoch": 9.672915835660152, "grad_norm": 6.1103644371032715, "learning_rate": 2.8764006237081626e-05, "loss": 5.4687, "step": 48500 }, { "epoch": 9.77263661747108, "grad_norm": 6.640170097351074, "learning_rate": 2.8310729956122855e-05, "loss": 5.4448, "step": 49000 }, { "epoch": 9.872357399282011, "grad_norm": 6.135842323303223, "learning_rate": 2.7858360227726005e-05, "loss": 5.4075, "step": 49500 }, { "epoch": 9.97207818109294, "grad_norm": 6.063602924346924, "learning_rate": 2.7405083946767234e-05, "loss": 5.374, "step": 50000 }, { "epoch": 10.07179896290387, "grad_norm": 6.689053535461426, "learning_rate": 2.6951807665808463e-05, "loss": 5.3459, "step": 50500 }, { "epoch": 10.171519744714798, "grad_norm": 6.488341331481934, "learning_rate": 2.6498531384849696e-05, "loss": 5.3185, "step": 51000 }, { "epoch": 10.271240526525728, "grad_norm": 6.589330673217773, "learning_rate": 2.6045255103890925e-05, "loss": 5.3019, "step": 51500 }, { "epoch": 10.370961308336657, "grad_norm": 6.61977481842041, "learning_rate": 2.5592885375494075e-05, "loss": 5.2792, "step": 52000 }, { "epoch": 10.470682090147587, "grad_norm": 6.396610736846924, "learning_rate": 2.5139609094535304e-05, "loss": 5.2347, "step": 52500 }, { "epoch": 10.570402871958516, "grad_norm": 7.000791549682617, "learning_rate": 2.4686332813576534e-05, "loss": 5.2252, "step": 53000 }, { "epoch": 10.670123653769446, "grad_norm": 6.714987277984619, "learning_rate": 2.4233056532617763e-05, "loss": 5.1965, "step": 53500 }, { "epoch": 10.769844435580374, "grad_norm": 7.012180805206299, "learning_rate": 2.3779780251658992e-05, "loss": 5.1769, "step": 54000 }, { "epoch": 10.869565217391305, "grad_norm": 6.85835599899292, "learning_rate": 2.332650397070022e-05, "loss": 5.1442, "step": 54500 }, { "epoch": 10.969285999202233, "grad_norm": 6.789878845214844, "learning_rate": 2.2873227689741453e-05, "loss": 5.1071, "step": 55000 }, { "epoch": 11.0, "step": 55154, "total_flos": 5.807695355039002e+16, "train_loss": 1.5156397336923944, "train_runtime": 4860.501, "train_samples_per_second": 181.547, "train_steps_per_second": 11.347 }, { "epoch": 11.069006781013163, "grad_norm": 7.099039077758789, "learning_rate": 4.976997739662279e-05, "loss": 5.2813, "step": 55500 }, { "epoch": 11.168727562824092, "grad_norm": 6.935009479522705, "learning_rate": 4.943757479058636e-05, "loss": 5.2781, "step": 56000 }, { "epoch": 11.268448344635022, "grad_norm": 8.239794731140137, "learning_rate": 4.910517218454993e-05, "loss": 5.2531, "step": 56500 }, { "epoch": 11.36816912644595, "grad_norm": 6.757853031158447, "learning_rate": 4.87727695785135e-05, "loss": 5.1861, "step": 57000 }, { "epoch": 11.46788990825688, "grad_norm": 7.666926383972168, "learning_rate": 4.844036697247707e-05, "loss": 5.1783, "step": 57500 }, { "epoch": 11.56761069006781, "grad_norm": 7.166041374206543, "learning_rate": 4.810796436644063e-05, "loss": 5.1202, "step": 58000 }, { "epoch": 11.66733147187874, "grad_norm": 7.543915748596191, "learning_rate": 4.77755617604042e-05, "loss": 5.0482, "step": 58500 }, { "epoch": 11.76705225368967, "grad_norm": 8.00036907196045, "learning_rate": 4.744315915436777e-05, "loss": 5.0167, "step": 59000 }, { "epoch": 11.866773035500598, "grad_norm": 6.7936272621154785, "learning_rate": 4.711075654833134e-05, "loss": 4.9823, "step": 59500 }, { "epoch": 11.966493817311529, "grad_norm": 7.003523826599121, "learning_rate": 4.677835394229491e-05, "loss": 4.9457, "step": 60000 }, { "epoch": 12.066214599122457, "grad_norm": 7.01780891418457, "learning_rate": 4.644595133625848e-05, "loss": 4.825, "step": 60500 }, { "epoch": 12.165935380933387, "grad_norm": 7.654853820800781, "learning_rate": 4.6113548730222045e-05, "loss": 4.7741, "step": 61000 }, { "epoch": 12.265656162744316, "grad_norm": 7.968235492706299, "learning_rate": 4.578181092939769e-05, "loss": 4.7404, "step": 61500 }, { "epoch": 12.365376944555246, "grad_norm": 7.112838268280029, "learning_rate": 4.544940832336126e-05, "loss": 4.6502, "step": 62000 }, { "epoch": 12.465097726366174, "grad_norm": 6.567187786102295, "learning_rate": 4.511700571732483e-05, "loss": 4.6277, "step": 62500 }, { "epoch": 12.564818508177105, "grad_norm": 6.989046096801758, "learning_rate": 4.478460311128839e-05, "loss": 4.5757, "step": 63000 }, { "epoch": 12.664539289988033, "grad_norm": 6.270955562591553, "learning_rate": 4.445220050525196e-05, "loss": 4.5394, "step": 63500 }, { "epoch": 12.764260071798963, "grad_norm": 6.227508544921875, "learning_rate": 4.412046270442761e-05, "loss": 4.4651, "step": 64000 }, { "epoch": 12.863980853609892, "grad_norm": 6.464995861053467, "learning_rate": 4.378806009839118e-05, "loss": 4.423, "step": 64500 }, { "epoch": 12.963701635420822, "grad_norm": 6.102914810180664, "learning_rate": 4.345565749235474e-05, "loss": 4.3969, "step": 65000 }, { "epoch": 13.06342241723175, "grad_norm": 6.3487067222595215, "learning_rate": 4.312325488631831e-05, "loss": 4.2689, "step": 65500 }, { "epoch": 13.16314319904268, "grad_norm": 6.235875129699707, "learning_rate": 4.279085228028188e-05, "loss": 4.2232, "step": 66000 }, { "epoch": 13.26286398085361, "grad_norm": 5.931600570678711, "learning_rate": 4.245844967424545e-05, "loss": 4.222, "step": 66500 }, { "epoch": 13.36258476266454, "grad_norm": 5.873235702514648, "learning_rate": 4.212604706820902e-05, "loss": 4.1722, "step": 67000 }, { "epoch": 13.462305544475468, "grad_norm": 6.30717134475708, "learning_rate": 4.179364446217259e-05, "loss": 4.1255, "step": 67500 }, { "epoch": 13.562026326286398, "grad_norm": 5.893185138702393, "learning_rate": 4.146190666134823e-05, "loss": 4.0975, "step": 68000 }, { "epoch": 13.661747108097327, "grad_norm": 6.775746822357178, "learning_rate": 4.113016886052387e-05, "loss": 4.0787, "step": 68500 }, { "epoch": 13.761467889908257, "grad_norm": 5.948095798492432, "learning_rate": 4.0797766254487435e-05, "loss": 4.0581, "step": 69000 }, { "epoch": 13.861188671719185, "grad_norm": 5.961909770965576, "learning_rate": 4.0465363648451005e-05, "loss": 4.0097, "step": 69500 }, { "epoch": 13.960909453530116, "grad_norm": 5.72122859954834, "learning_rate": 4.0132961042414575e-05, "loss": 3.9751, "step": 70000 }, { "epoch": 14.060630235341046, "grad_norm": 6.1757378578186035, "learning_rate": 3.980122324159022e-05, "loss": 3.9707, "step": 70500 }, { "epoch": 14.160351017151974, "grad_norm": 5.7611236572265625, "learning_rate": 3.946882063555378e-05, "loss": 3.9126, "step": 71000 }, { "epoch": 14.260071798962905, "grad_norm": 6.233034133911133, "learning_rate": 3.913641802951735e-05, "loss": 3.9005, "step": 71500 }, { "epoch": 14.359792580773833, "grad_norm": 6.282217979431152, "learning_rate": 3.880401542348092e-05, "loss": 3.8648, "step": 72000 }, { "epoch": 14.459513362584763, "grad_norm": 6.495648384094238, "learning_rate": 3.847161281744449e-05, "loss": 3.8567, "step": 72500 }, { "epoch": 14.559234144395692, "grad_norm": 6.3030195236206055, "learning_rate": 3.813921021140806e-05, "loss": 3.839, "step": 73000 }, { "epoch": 14.658954926206622, "grad_norm": 5.807531833648682, "learning_rate": 3.78074724105837e-05, "loss": 3.8156, "step": 73500 }, { "epoch": 14.75867570801755, "grad_norm": 5.283077716827393, "learning_rate": 3.747506980454727e-05, "loss": 3.8142, "step": 74000 }, { "epoch": 14.85839648982848, "grad_norm": 5.933303356170654, "learning_rate": 3.714266719851084e-05, "loss": 3.8109, "step": 74500 }, { "epoch": 14.95811727163941, "grad_norm": 6.217842102050781, "learning_rate": 3.681026459247441e-05, "loss": 3.7937, "step": 75000 }, { "epoch": 15.0, "step": 75210, "total_flos": 7.919584575053184e+16, "train_loss": 1.1771604976443562, "train_runtime": 4937.567, "train_samples_per_second": 243.7, "train_steps_per_second": 15.232 }, { "epoch": 15.05783805345034, "grad_norm": 6.581785202026367, "learning_rate": 4.985540486637415e-05, "loss": 3.8491, "step": 75500 }, { "epoch": 15.157558835261268, "grad_norm": 6.372396469116211, "learning_rate": 4.960610291184683e-05, "loss": 3.8838, "step": 76000 }, { "epoch": 15.257279617072198, "grad_norm": 6.738864421844482, "learning_rate": 4.935680095731951e-05, "loss": 3.8834, "step": 76500 }, { "epoch": 15.357000398883127, "grad_norm": 6.700061798095703, "learning_rate": 4.910749900279219e-05, "loss": 3.8559, "step": 77000 }, { "epoch": 15.456721180694057, "grad_norm": 6.3839497566223145, "learning_rate": 4.8858197048264857e-05, "loss": 3.8275, "step": 77500 }, { "epoch": 15.556441962504985, "grad_norm": 6.165511131286621, "learning_rate": 4.860889509373754e-05, "loss": 3.803, "step": 78000 }, { "epoch": 15.656162744315916, "grad_norm": 5.800929069519043, "learning_rate": 4.8359593139210215e-05, "loss": 3.8, "step": 78500 }, { "epoch": 15.755883526126844, "grad_norm": 6.714051246643066, "learning_rate": 4.811029118468289e-05, "loss": 3.797, "step": 79000 }, { "epoch": 15.855604307937774, "grad_norm": 6.74777889251709, "learning_rate": 4.786098923015557e-05, "loss": 3.7759, "step": 79500 }, { "epoch": 15.955325089748703, "grad_norm": 6.980929374694824, "learning_rate": 4.761168727562824e-05, "loss": 3.7445, "step": 80000 }, { "epoch": 16.05504587155963, "grad_norm": 6.54088020324707, "learning_rate": 4.736238532110092e-05, "loss": 3.6805, "step": 80500 }, { "epoch": 16.15476665337056, "grad_norm": 5.999478340148926, "learning_rate": 4.7113083366573594e-05, "loss": 3.6537, "step": 81000 }, { "epoch": 16.254487435181492, "grad_norm": 6.384885311126709, "learning_rate": 4.686378141204627e-05, "loss": 3.6522, "step": 81500 }, { "epoch": 16.354208216992422, "grad_norm": 6.624803066253662, "learning_rate": 4.661447945751895e-05, "loss": 3.6302, "step": 82000 }, { "epoch": 16.453928998803352, "grad_norm": 6.454346656799316, "learning_rate": 4.636567610690068e-05, "loss": 3.6179, "step": 82500 }, { "epoch": 16.55364978061428, "grad_norm": 6.266842365264893, "learning_rate": 4.611637415237336e-05, "loss": 3.6265, "step": 83000 }, { "epoch": 16.65337056242521, "grad_norm": 6.608065128326416, "learning_rate": 4.5867072197846036e-05, "loss": 3.6105, "step": 83500 }, { "epoch": 16.75309134423614, "grad_norm": 6.4489426612854, "learning_rate": 4.5617770243318705e-05, "loss": 3.5994, "step": 84000 }, { "epoch": 16.85281212604707, "grad_norm": 6.433938503265381, "learning_rate": 4.536896689270044e-05, "loss": 3.5648, "step": 84500 }, { "epoch": 16.952532907857996, "grad_norm": 7.4558610916137695, "learning_rate": 4.511966493817312e-05, "loss": 3.5746, "step": 85000 }, { "epoch": 17.052253689668927, "grad_norm": 5.742049217224121, "learning_rate": 4.4870362983645795e-05, "loss": 3.5378, "step": 85500 }, { "epoch": 17.151974471479857, "grad_norm": 6.346868515014648, "learning_rate": 4.462106102911847e-05, "loss": 3.505, "step": 86000 }, { "epoch": 17.251695253290787, "grad_norm": 6.252668857574463, "learning_rate": 4.4371759074591147e-05, "loss": 3.4787, "step": 86500 }, { "epoch": 17.351416035101714, "grad_norm": 6.237195014953613, "learning_rate": 4.412245712006383e-05, "loss": 3.4914, "step": 87000 }, { "epoch": 17.451136816912644, "grad_norm": 7.106077194213867, "learning_rate": 4.3873653769445554e-05, "loss": 3.4641, "step": 87500 }, { "epoch": 17.550857598723574, "grad_norm": 7.160710334777832, "learning_rate": 4.362435181491823e-05, "loss": 3.4419, "step": 88000 }, { "epoch": 17.650578380534505, "grad_norm": 7.160135746002197, "learning_rate": 4.337504986039091e-05, "loss": 3.4604, "step": 88500 }, { "epoch": 17.75029916234543, "grad_norm": 6.785101890563965, "learning_rate": 4.312574790586358e-05, "loss": 3.432, "step": 89000 }, { "epoch": 17.85001994415636, "grad_norm": 5.990314960479736, "learning_rate": 4.287644595133626e-05, "loss": 3.4045, "step": 89500 }, { "epoch": 17.949740725967292, "grad_norm": 6.434844493865967, "learning_rate": 4.2627642600717995e-05, "loss": 3.4236, "step": 90000 }, { "epoch": 18.049461507778222, "grad_norm": 6.7937774658203125, "learning_rate": 4.2378340646190664e-05, "loss": 3.3902, "step": 90500 }, { "epoch": 18.14918228958915, "grad_norm": 7.1783576011657715, "learning_rate": 4.212903869166335e-05, "loss": 3.3545, "step": 91000 }, { "epoch": 18.24890307140008, "grad_norm": 6.374876022338867, "learning_rate": 4.187973673713602e-05, "loss": 3.3451, "step": 91500 }, { "epoch": 18.34862385321101, "grad_norm": 6.49647331237793, "learning_rate": 4.163093338651775e-05, "loss": 3.3452, "step": 92000 }, { "epoch": 18.44834463502194, "grad_norm": 6.785512924194336, "learning_rate": 4.138163143199043e-05, "loss": 3.3102, "step": 92500 }, { "epoch": 18.54806541683287, "grad_norm": 6.842392921447754, "learning_rate": 4.1132329477463106e-05, "loss": 3.3376, "step": 93000 }, { "epoch": 18.647786198643796, "grad_norm": 7.126637935638428, "learning_rate": 4.088302752293578e-05, "loss": 3.3249, "step": 93500 }, { "epoch": 18.747506980454727, "grad_norm": 5.808903217315674, "learning_rate": 4.063372556840846e-05, "loss": 3.2808, "step": 94000 }, { "epoch": 18.847227762265657, "grad_norm": 6.2346954345703125, "learning_rate": 4.0385420821699245e-05, "loss": 3.3189, "step": 94500 }, { "epoch": 18.946948544076587, "grad_norm": 6.60822057723999, "learning_rate": 4.013611886717192e-05, "loss": 3.3143, "step": 95000 }, { "epoch": 19.046669325887514, "grad_norm": 6.471176624298096, "learning_rate": 3.9886816912644597e-05, "loss": 3.2855, "step": 95500 }, { "epoch": 19.146390107698444, "grad_norm": 6.365059852600098, "learning_rate": 3.963751495811727e-05, "loss": 3.2616, "step": 96000 }, { "epoch": 19.246110889509374, "grad_norm": 6.250296592712402, "learning_rate": 3.9388213003589955e-05, "loss": 3.226, "step": 96500 }, { "epoch": 19.345831671320305, "grad_norm": 6.003506660461426, "learning_rate": 3.9138911049062624e-05, "loss": 3.2352, "step": 97000 }, { "epoch": 19.44555245313123, "grad_norm": 5.75541353225708, "learning_rate": 3.88896090945353e-05, "loss": 3.2395, "step": 97500 }, { "epoch": 19.54527323494216, "grad_norm": 6.684996604919434, "learning_rate": 3.864030714000798e-05, "loss": 3.2272, "step": 98000 }, { "epoch": 19.644994016753092, "grad_norm": 5.906820297241211, "learning_rate": 3.839100518548066e-05, "loss": 3.2096, "step": 98500 }, { "epoch": 19.744714798564022, "grad_norm": 6.240872383117676, "learning_rate": 3.814220183486238e-05, "loss": 3.2016, "step": 99000 }, { "epoch": 19.84443558037495, "grad_norm": 6.751197338104248, "learning_rate": 3.7892899880335066e-05, "loss": 3.2141, "step": 99500 }, { "epoch": 19.94415636218588, "grad_norm": 6.535121917724609, "learning_rate": 3.764359792580774e-05, "loss": 3.1829, "step": 100000 }, { "epoch": 20.0, "step": 100280, "total_flos": 1.0559446100070912e+17, "train_loss": 0.8717835233465437, "train_runtime": 7516.252, "train_samples_per_second": 213.455, "train_steps_per_second": 13.342 }, { "epoch": 20.04387714399681, "grad_norm": 6.778732776641846, "learning_rate": 4.9912245712006384e-05, "loss": 3.2388, "step": 100500 }, { "epoch": 20.14359792580774, "grad_norm": 6.354984760284424, "learning_rate": 4.9712804148384526e-05, "loss": 3.2858, "step": 101000 }, { "epoch": 20.243318707618666, "grad_norm": 7.301539897918701, "learning_rate": 4.951336258476267e-05, "loss": 3.3016, "step": 101500 }, { "epoch": 20.343039489429597, "grad_norm": 7.8318772315979, "learning_rate": 4.931392102114081e-05, "loss": 3.2969, "step": 102000 }, { "epoch": 20.442760271240527, "grad_norm": 6.826496124267578, "learning_rate": 4.9114479457518946e-05, "loss": 3.315, "step": 102500 }, { "epoch": 20.542481053051457, "grad_norm": 6.47593879699707, "learning_rate": 4.8915037893897094e-05, "loss": 3.2395, "step": 103000 }, { "epoch": 20.642201834862384, "grad_norm": 6.942465782165527, "learning_rate": 4.871559633027523e-05, "loss": 3.2812, "step": 103500 }, { "epoch": 20.741922616673314, "grad_norm": 6.694247722625732, "learning_rate": 4.851615476665337e-05, "loss": 3.2757, "step": 104000 }, { "epoch": 20.841643398484244, "grad_norm": 6.374402046203613, "learning_rate": 4.8316713203031514e-05, "loss": 3.2517, "step": 104500 }, { "epoch": 20.941364180295174, "grad_norm": 7.804276943206787, "learning_rate": 4.8117271639409656e-05, "loss": 3.2417, "step": 105000 }, { "epoch": 21.041084962106105, "grad_norm": 7.735393524169922, "learning_rate": 4.791822895891504e-05, "loss": 3.2124, "step": 105500 }, { "epoch": 21.14080574391703, "grad_norm": 6.500980377197266, "learning_rate": 4.771878739529318e-05, "loss": 3.1786, "step": 106000 }, { "epoch": 21.24052652572796, "grad_norm": 6.206303119659424, "learning_rate": 4.751934583167133e-05, "loss": 3.188, "step": 106500 }, { "epoch": 21.340247307538892, "grad_norm": 7.221670150756836, "learning_rate": 4.731990426804946e-05, "loss": 3.1658, "step": 107000 }, { "epoch": 21.439968089349822, "grad_norm": 6.705102443695068, "learning_rate": 4.7120462704427605e-05, "loss": 3.1698, "step": 107500 }, { "epoch": 21.53968887116075, "grad_norm": 7.459311485290527, "learning_rate": 4.692102114080575e-05, "loss": 3.1263, "step": 108000 }, { "epoch": 21.63940965297168, "grad_norm": 6.276129245758057, "learning_rate": 4.6721978460311135e-05, "loss": 3.1438, "step": 108500 }, { "epoch": 21.73913043478261, "grad_norm": 6.849742412567139, "learning_rate": 4.652253689668927e-05, "loss": 3.1688, "step": 109000 }, { "epoch": 21.83885121659354, "grad_norm": 6.463535308837891, "learning_rate": 4.632309533306741e-05, "loss": 3.1261, "step": 109500 }, { "epoch": 21.938571998404466, "grad_norm": 6.4734063148498535, "learning_rate": 4.6123653769445554e-05, "loss": 3.1375, "step": 110000 }, { "epoch": 22.038292780215397, "grad_norm": 6.659780025482178, "learning_rate": 4.5924212205823696e-05, "loss": 3.1488, "step": 110500 }, { "epoch": 22.138013562026327, "grad_norm": 6.0405402183532715, "learning_rate": 4.572477064220184e-05, "loss": 3.0816, "step": 111000 }, { "epoch": 22.237734343837257, "grad_norm": 6.467530727386475, "learning_rate": 4.5525329078579974e-05, "loss": 3.0573, "step": 111500 }, { "epoch": 22.337455125648184, "grad_norm": 7.352579116821289, "learning_rate": 4.532588751495812e-05, "loss": 3.054, "step": 112000 }, { "epoch": 22.437175907459114, "grad_norm": 6.598001956939697, "learning_rate": 4.5126844834463503e-05, "loss": 3.0912, "step": 112500 }, { "epoch": 22.536896689270044, "grad_norm": 7.065674304962158, "learning_rate": 4.492780215396889e-05, "loss": 3.0784, "step": 113000 }, { "epoch": 22.636617471080974, "grad_norm": 6.545448303222656, "learning_rate": 4.472836059034703e-05, "loss": 3.0489, "step": 113500 }, { "epoch": 22.7363382528919, "grad_norm": 6.2428059577941895, "learning_rate": 4.4528919026725175e-05, "loss": 3.0412, "step": 114000 }, { "epoch": 22.83605903470283, "grad_norm": 6.4470367431640625, "learning_rate": 4.432947746310331e-05, "loss": 3.0359, "step": 114500 }, { "epoch": 22.93577981651376, "grad_norm": 6.093207836151123, "learning_rate": 4.413003589948145e-05, "loss": 3.0304, "step": 115000 }, { "epoch": 23.035500598324692, "grad_norm": 6.75270414352417, "learning_rate": 4.3930594335859595e-05, "loss": 3.02, "step": 115500 }, { "epoch": 23.13522138013562, "grad_norm": 6.7165374755859375, "learning_rate": 4.373115277223774e-05, "loss": 3.0069, "step": 116000 }, { "epoch": 23.23494216194655, "grad_norm": 5.961038589477539, "learning_rate": 4.353171120861588e-05, "loss": 2.9626, "step": 116500 }, { "epoch": 23.33466294375748, "grad_norm": 6.657290935516357, "learning_rate": 4.333266852812126e-05, "loss": 2.9839, "step": 117000 }, { "epoch": 23.43438372556841, "grad_norm": 6.603748798370361, "learning_rate": 4.31332269644994e-05, "loss": 2.984, "step": 117500 }, { "epoch": 23.53410450737934, "grad_norm": 6.49187707901001, "learning_rate": 4.2933785400877544e-05, "loss": 2.9815, "step": 118000 }, { "epoch": 23.633825289190266, "grad_norm": 7.08600378036499, "learning_rate": 4.273474272038293e-05, "loss": 2.9877, "step": 118500 }, { "epoch": 23.733546071001197, "grad_norm": 6.5724077224731445, "learning_rate": 4.253530115676107e-05, "loss": 2.965, "step": 119000 }, { "epoch": 23.833266852812127, "grad_norm": 6.058481693267822, "learning_rate": 4.233585959313921e-05, "loss": 2.9759, "step": 119500 }, { "epoch": 23.932987634623057, "grad_norm": 7.042490482330322, "learning_rate": 4.213641802951736e-05, "loss": 2.9619, "step": 120000 }, { "epoch": 24.032708416433984, "grad_norm": 6.764120578765869, "learning_rate": 4.193697646589549e-05, "loss": 2.9482, "step": 120500 }, { "epoch": 24.132429198244914, "grad_norm": 6.224752426147461, "learning_rate": 4.1737534902273635e-05, "loss": 2.9368, "step": 121000 }, { "epoch": 24.232149980055844, "grad_norm": 6.817770481109619, "learning_rate": 4.153809333865178e-05, "loss": 2.9325, "step": 121500 }, { "epoch": 24.331870761866774, "grad_norm": 6.26372766494751, "learning_rate": 4.133865177502992e-05, "loss": 2.8953, "step": 122000 }, { "epoch": 24.4315915436777, "grad_norm": 7.136674880981445, "learning_rate": 4.11396090945353e-05, "loss": 2.9019, "step": 122500 }, { "epoch": 24.53131232548863, "grad_norm": 6.46077299118042, "learning_rate": 4.094016753091344e-05, "loss": 2.9091, "step": 123000 }, { "epoch": 24.63103310729956, "grad_norm": 6.0465288162231445, "learning_rate": 4.0740725967291584e-05, "loss": 2.9107, "step": 123500 }, { "epoch": 24.730753889110492, "grad_norm": 6.354468822479248, "learning_rate": 4.0541284403669726e-05, "loss": 2.9206, "step": 124000 }, { "epoch": 24.83047467092142, "grad_norm": 6.679784297943115, "learning_rate": 4.0342241723175114e-05, "loss": 2.8901, "step": 124500 }, { "epoch": 24.93019545273235, "grad_norm": 6.418820858001709, "learning_rate": 4.014280015955325e-05, "loss": 2.8971, "step": 125000 }, { "epoch": 25.0, "step": 125350, "total_flos": 1.319930762508864e+17, "train_loss": 0.6150265672133651, "train_runtime": 7682.6614, "train_samples_per_second": 261.039, "train_steps_per_second": 16.316 }, { "epoch": 25.02991623454328, "grad_norm": 6.8266754150390625, "learning_rate": 4.9950139609094536e-05, "loss": 2.9041, "step": 125500 }, { "epoch": 25.12963701635421, "grad_norm": 7.047895431518555, "learning_rate": 4.9783938306076325e-05, "loss": 2.9501, "step": 126000 }, { "epoch": 25.229357798165136, "grad_norm": 6.489243507385254, "learning_rate": 4.9617737003058106e-05, "loss": 2.9795, "step": 126500 }, { "epoch": 25.329078579976066, "grad_norm": 6.933114528656006, "learning_rate": 4.9451535700039895e-05, "loss": 2.9906, "step": 127000 }, { "epoch": 25.428799361786997, "grad_norm": 7.721564769744873, "learning_rate": 4.9285334397021676e-05, "loss": 2.9822, "step": 127500 }, { "epoch": 25.528520143597927, "grad_norm": 7.604334831237793, "learning_rate": 4.911913309400346e-05, "loss": 2.9751, "step": 128000 }, { "epoch": 25.628240925408853, "grad_norm": 6.689730644226074, "learning_rate": 4.8952931790985246e-05, "loss": 2.9806, "step": 128500 }, { "epoch": 25.727961707219784, "grad_norm": 7.001711368560791, "learning_rate": 4.878673048796703e-05, "loss": 2.9701, "step": 129000 }, { "epoch": 25.827682489030714, "grad_norm": 6.627374649047852, "learning_rate": 4.862052918494881e-05, "loss": 2.982, "step": 129500 }, { "epoch": 25.927403270841644, "grad_norm": 6.500030517578125, "learning_rate": 4.8454660284536635e-05, "loss": 2.9497, "step": 130000 }, { "epoch": 26.027124052652574, "grad_norm": 6.908927917480469, "learning_rate": 4.828845898151842e-05, "loss": 2.9201, "step": 130500 }, { "epoch": 26.1268448344635, "grad_norm": 7.953597068786621, "learning_rate": 4.8122257678500205e-05, "loss": 2.8916, "step": 131000 }, { "epoch": 26.22656561627443, "grad_norm": 7.111712455749512, "learning_rate": 4.795605637548199e-05, "loss": 2.8983, "step": 131500 }, { "epoch": 26.32628639808536, "grad_norm": 7.099549293518066, "learning_rate": 4.778985507246377e-05, "loss": 2.8862, "step": 132000 }, { "epoch": 26.426007179896292, "grad_norm": 6.708031177520752, "learning_rate": 4.762365376944555e-05, "loss": 2.8828, "step": 132500 }, { "epoch": 26.52572796170722, "grad_norm": 6.638050079345703, "learning_rate": 4.745745246642734e-05, "loss": 2.9, "step": 133000 }, { "epoch": 26.62544874351815, "grad_norm": 6.474231243133545, "learning_rate": 4.729125116340912e-05, "loss": 2.8729, "step": 133500 }, { "epoch": 26.72516952532908, "grad_norm": 7.071346759796143, "learning_rate": 4.712538226299694e-05, "loss": 2.878, "step": 134000 }, { "epoch": 26.82489030714001, "grad_norm": 7.4629740715026855, "learning_rate": 4.695918095997873e-05, "loss": 2.8949, "step": 134500 }, { "epoch": 26.924611088950936, "grad_norm": 7.166282653808594, "learning_rate": 4.679331205956655e-05, "loss": 2.8834, "step": 135000 }, { "epoch": 27.024331870761866, "grad_norm": 7.213958263397217, "learning_rate": 4.6627110756548334e-05, "loss": 2.8722, "step": 135500 }, { "epoch": 27.124052652572797, "grad_norm": 6.917830467224121, "learning_rate": 4.6460909453530116e-05, "loss": 2.812, "step": 136000 }, { "epoch": 27.223773434383727, "grad_norm": 7.030029296875, "learning_rate": 4.62947081505119e-05, "loss": 2.7973, "step": 136500 }, { "epoch": 27.323494216194653, "grad_norm": 6.927401542663574, "learning_rate": 4.6128506847493686e-05, "loss": 2.8567, "step": 137000 }, { "epoch": 27.423214998005584, "grad_norm": 7.063901424407959, "learning_rate": 4.596230554447547e-05, "loss": 2.8119, "step": 137500 }, { "epoch": 27.522935779816514, "grad_norm": 6.619449138641357, "learning_rate": 4.5796104241457256e-05, "loss": 2.814, "step": 138000 }, { "epoch": 27.622656561627444, "grad_norm": 6.861698150634766, "learning_rate": 4.562990293843904e-05, "loss": 2.7966, "step": 138500 }, { "epoch": 27.72237734343837, "grad_norm": 5.698707580566406, "learning_rate": 4.5464034038026856e-05, "loss": 2.8274, "step": 139000 }, { "epoch": 27.8220981252493, "grad_norm": 6.638801574707031, "learning_rate": 4.5297832735008645e-05, "loss": 2.8111, "step": 139500 }, { "epoch": 27.92181890706023, "grad_norm": 7.414352893829346, "learning_rate": 4.5131631431990427e-05, "loss": 2.8219, "step": 140000 }, { "epoch": 28.02153968887116, "grad_norm": 7.000102519989014, "learning_rate": 4.4965430128972215e-05, "loss": 2.8059, "step": 140500 }, { "epoch": 28.121260470682092, "grad_norm": 7.648940563201904, "learning_rate": 4.4799561228560034e-05, "loss": 2.7801, "step": 141000 }, { "epoch": 28.22098125249302, "grad_norm": 6.238720417022705, "learning_rate": 4.4633359925541815e-05, "loss": 2.7611, "step": 141500 }, { "epoch": 28.32070203430395, "grad_norm": 7.083422660827637, "learning_rate": 4.4467491025129634e-05, "loss": 2.7476, "step": 142000 }, { "epoch": 28.42042281611488, "grad_norm": 7.1048760414123535, "learning_rate": 4.430128972211142e-05, "loss": 2.7601, "step": 142500 }, { "epoch": 28.52014359792581, "grad_norm": 6.950742244720459, "learning_rate": 4.4135088419093204e-05, "loss": 2.7615, "step": 143000 }, { "epoch": 28.619864379736736, "grad_norm": 7.063054084777832, "learning_rate": 4.396888711607499e-05, "loss": 2.7583, "step": 143500 }, { "epoch": 28.719585161547666, "grad_norm": 6.951484680175781, "learning_rate": 4.3802685813056774e-05, "loss": 2.748, "step": 144000 }, { "epoch": 28.819305943358597, "grad_norm": 7.212677955627441, "learning_rate": 4.363648451003856e-05, "loss": 2.7542, "step": 144500 }, { "epoch": 28.919026725169527, "grad_norm": 6.691658973693848, "learning_rate": 4.3470283207020344e-05, "loss": 2.753, "step": 145000 }, { "epoch": 29.018747506980453, "grad_norm": 7.1954874992370605, "learning_rate": 4.330408190400213e-05, "loss": 2.7332, "step": 145500 }, { "epoch": 29.118468288791384, "grad_norm": 6.654098987579346, "learning_rate": 4.313821300358995e-05, "loss": 2.7109, "step": 146000 }, { "epoch": 29.218189070602314, "grad_norm": 6.924403667449951, "learning_rate": 4.297201170057173e-05, "loss": 2.7076, "step": 146500 }, { "epoch": 29.317909852413244, "grad_norm": 7.731849193572998, "learning_rate": 4.280581039755352e-05, "loss": 2.6943, "step": 147000 }, { "epoch": 29.41763063422417, "grad_norm": 7.095526218414307, "learning_rate": 4.26396090945353e-05, "loss": 2.72, "step": 147500 }, { "epoch": 29.5173514160351, "grad_norm": 7.1939520835876465, "learning_rate": 4.247340779151709e-05, "loss": 2.6772, "step": 148000 }, { "epoch": 29.61707219784603, "grad_norm": 7.466503620147705, "learning_rate": 4.230753889110491e-05, "loss": 2.7193, "step": 148500 }, { "epoch": 29.71679297965696, "grad_norm": 6.902263164520264, "learning_rate": 4.214133758808669e-05, "loss": 2.716, "step": 149000 }, { "epoch": 29.81651376146789, "grad_norm": 7.366625785827637, "learning_rate": 4.197513628506848e-05, "loss": 2.7009, "step": 149500 }, { "epoch": 29.91623454327882, "grad_norm": 6.991941452026367, "learning_rate": 4.180893498205026e-05, "loss": 2.7202, "step": 150000 }, { "epoch": 30.0, "step": 150420, "total_flos": 1.5839169150106368e+17, "train_loss": 0.47119966579742084, "train_runtime": 6930.0607, "train_samples_per_second": 347.265, "train_steps_per_second": 21.705 }, { "epoch": 30.01595532508975, "grad_norm": 6.077478885650635, "learning_rate": 4.997720667844322e-05, "loss": 2.7286, "step": 150500 }, { "epoch": 30.11567610690068, "grad_norm": 6.566033363342285, "learning_rate": 4.983474841871332e-05, "loss": 2.7319, "step": 151000 }, { "epoch": 30.215396888711606, "grad_norm": 7.486234188079834, "learning_rate": 4.969229015898342e-05, "loss": 2.7899, "step": 151500 }, { "epoch": 30.315117670522536, "grad_norm": 7.640929222106934, "learning_rate": 4.954983189925352e-05, "loss": 2.7598, "step": 152000 }, { "epoch": 30.414838452333466, "grad_norm": 7.036547660827637, "learning_rate": 4.940737363952362e-05, "loss": 2.754, "step": 152500 }, { "epoch": 30.514559234144397, "grad_norm": 7.128058910369873, "learning_rate": 4.926491537979372e-05, "loss": 2.7888, "step": 153000 }, { "epoch": 30.614280015955327, "grad_norm": 7.1788249015808105, "learning_rate": 4.912245712006382e-05, "loss": 2.7662, "step": 153500 }, { "epoch": 30.714000797766253, "grad_norm": 7.081215858459473, "learning_rate": 4.897999886033392e-05, "loss": 2.7722, "step": 154000 }, { "epoch": 30.813721579577184, "grad_norm": 6.131695747375488, "learning_rate": 4.883754060060402e-05, "loss": 2.7464, "step": 154500 }, { "epoch": 30.913442361388114, "grad_norm": 6.66817569732666, "learning_rate": 4.869508234087412e-05, "loss": 2.7352, "step": 155000 }, { "epoch": 31.013163143199044, "grad_norm": 7.4430952072143555, "learning_rate": 4.8552908997663685e-05, "loss": 2.7503, "step": 155500 }, { "epoch": 31.11288392500997, "grad_norm": 7.984841346740723, "learning_rate": 4.8410450737933786e-05, "loss": 2.6821, "step": 156000 }, { "epoch": 31.2126047068209, "grad_norm": 7.386984348297119, "learning_rate": 4.8267992478203886e-05, "loss": 2.6916, "step": 156500 }, { "epoch": 31.31232548863183, "grad_norm": 6.3857951164245605, "learning_rate": 4.8125534218473987e-05, "loss": 2.6826, "step": 157000 }, { "epoch": 31.41204627044276, "grad_norm": 7.394888401031494, "learning_rate": 4.798307595874409e-05, "loss": 2.7099, "step": 157500 }, { "epoch": 31.51176705225369, "grad_norm": 7.39955997467041, "learning_rate": 4.784061769901419e-05, "loss": 2.7056, "step": 158000 }, { "epoch": 31.61148783406462, "grad_norm": 6.624033451080322, "learning_rate": 4.769844435580375e-05, "loss": 2.6903, "step": 158500 }, { "epoch": 31.71120861587555, "grad_norm": 6.656693458557129, "learning_rate": 4.755627101259331e-05, "loss": 2.6877, "step": 159000 }, { "epoch": 31.81092939768648, "grad_norm": 7.474542140960693, "learning_rate": 4.741381275286341e-05, "loss": 2.6965, "step": 159500 }, { "epoch": 31.910650179497406, "grad_norm": 7.388774394989014, "learning_rate": 4.727135449313351e-05, "loss": 2.7145, "step": 160000 }, { "epoch": 32.01037096130834, "grad_norm": 7.423541069030762, "learning_rate": 4.712889623340361e-05, "loss": 2.6943, "step": 160500 }, { "epoch": 32.11009174311926, "grad_norm": 6.063508033752441, "learning_rate": 4.698643797367371e-05, "loss": 2.6214, "step": 161000 }, { "epoch": 32.20981252493019, "grad_norm": 7.619082450866699, "learning_rate": 4.6843979713943814e-05, "loss": 2.6318, "step": 161500 }, { "epoch": 32.30953330674112, "grad_norm": 6.978066921234131, "learning_rate": 4.670152145421392e-05, "loss": 2.6327, "step": 162000 }, { "epoch": 32.40925408855205, "grad_norm": 6.166346073150635, "learning_rate": 4.655906319448402e-05, "loss": 2.6419, "step": 162500 }, { "epoch": 32.508974870362984, "grad_norm": 7.364738464355469, "learning_rate": 4.641660493475412e-05, "loss": 2.6356, "step": 163000 }, { "epoch": 32.608695652173914, "grad_norm": 7.476531982421875, "learning_rate": 4.627414667502422e-05, "loss": 2.6344, "step": 163500 }, { "epoch": 32.708416433984844, "grad_norm": 7.627068042755127, "learning_rate": 4.613168841529432e-05, "loss": 2.6434, "step": 164000 }, { "epoch": 32.808137215795774, "grad_norm": 7.334908962249756, "learning_rate": 4.598923015556442e-05, "loss": 2.663, "step": 164500 }, { "epoch": 32.907857997606705, "grad_norm": 6.580120086669922, "learning_rate": 4.5847341728873446e-05, "loss": 2.6406, "step": 165000 }, { "epoch": 33.00757877941763, "grad_norm": 6.953055381774902, "learning_rate": 4.570488346914355e-05, "loss": 2.6517, "step": 165500 }, { "epoch": 33.10729956122856, "grad_norm": 6.980926036834717, "learning_rate": 4.556242520941365e-05, "loss": 2.589, "step": 166000 }, { "epoch": 33.20702034303949, "grad_norm": 7.215412616729736, "learning_rate": 4.541996694968375e-05, "loss": 2.5831, "step": 166500 }, { "epoch": 33.30674112485042, "grad_norm": 7.203444004058838, "learning_rate": 4.527750868995385e-05, "loss": 2.5739, "step": 167000 }, { "epoch": 33.40646190666135, "grad_norm": 5.696502685546875, "learning_rate": 4.513505043022395e-05, "loss": 2.604, "step": 167500 }, { "epoch": 33.50618268847228, "grad_norm": 6.160342216491699, "learning_rate": 4.499259217049405e-05, "loss": 2.5848, "step": 168000 }, { "epoch": 33.60590347028321, "grad_norm": 6.758869171142578, "learning_rate": 4.485013391076415e-05, "loss": 2.6157, "step": 168500 }, { "epoch": 33.70562425209414, "grad_norm": 7.064002513885498, "learning_rate": 4.4708245484073166e-05, "loss": 2.5765, "step": 169000 }, { "epoch": 33.80534503390506, "grad_norm": 7.993391513824463, "learning_rate": 4.4565787224343267e-05, "loss": 2.6115, "step": 169500 }, { "epoch": 33.90506581571599, "grad_norm": 7.196022033691406, "learning_rate": 4.442332896461337e-05, "loss": 2.591, "step": 170000 }, { "epoch": 34.00478659752692, "grad_norm": 8.118667602539062, "learning_rate": 4.428115562140293e-05, "loss": 2.5833, "step": 170500 }, { "epoch": 34.10450737933785, "grad_norm": 7.465199947357178, "learning_rate": 4.413869736167303e-05, "loss": 2.5509, "step": 171000 }, { "epoch": 34.204228161148784, "grad_norm": 6.739304542541504, "learning_rate": 4.399623910194313e-05, "loss": 2.5357, "step": 171500 }, { "epoch": 34.303948942959714, "grad_norm": 6.758444786071777, "learning_rate": 4.385378084221323e-05, "loss": 2.567, "step": 172000 }, { "epoch": 34.403669724770644, "grad_norm": 6.511049270629883, "learning_rate": 4.371132258248333e-05, "loss": 2.5759, "step": 172500 }, { "epoch": 34.503390506581574, "grad_norm": 7.730967044830322, "learning_rate": 4.356886432275343e-05, "loss": 2.5494, "step": 173000 }, { "epoch": 34.6031112883925, "grad_norm": 6.543623924255371, "learning_rate": 4.342640606302353e-05, "loss": 2.5482, "step": 173500 }, { "epoch": 34.70283207020343, "grad_norm": 7.216828346252441, "learning_rate": 4.328394780329364e-05, "loss": 2.5593, "step": 174000 }, { "epoch": 34.80255285201436, "grad_norm": 6.891706943511963, "learning_rate": 4.3141774460083194e-05, "loss": 2.5409, "step": 174500 }, { "epoch": 34.90227363382529, "grad_norm": 7.4927778244018555, "learning_rate": 4.29993162003533e-05, "loss": 2.5673, "step": 175000 }, { "epoch": 35.0, "step": 175490, "total_flos": 1.8479030675124096e+17, "train_loss": 0.37831091759340585, "train_runtime": 6392.496, "train_samples_per_second": 439.213, "train_steps_per_second": 27.453 } ], "logging_steps": 500, "max_steps": 175490, "num_input_tokens_seen": 0, "num_train_epochs": 35, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8479030675124096e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }