{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 95491, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001047219109654313, "grad_norm": 6.445349216461182, "learning_rate": 5.238344683080147e-08, "loss": 1.2293, "step": 100 }, { "epoch": 0.002094438219308626, "grad_norm": 7.579877853393555, "learning_rate": 1.0476689366160294e-07, "loss": 1.2053, "step": 200 }, { "epoch": 0.003141657328962939, "grad_norm": 5.277140140533447, "learning_rate": 1.5715034049240438e-07, "loss": 1.1086, "step": 300 }, { "epoch": 0.004188876438617252, "grad_norm": 3.0632076263427734, "learning_rate": 2.0953378732320588e-07, "loss": 1.0615, "step": 400 }, { "epoch": 0.005236095548271565, "grad_norm": 8.091245651245117, "learning_rate": 2.6191723415400735e-07, "loss": 0.9659, "step": 500 }, { "epoch": 0.006283314657925878, "grad_norm": 2.5814743041992188, "learning_rate": 3.1430068098480877e-07, "loss": 0.9656, "step": 600 }, { "epoch": 0.007330533767580191, "grad_norm": 8.025248527526855, "learning_rate": 3.6668412781561024e-07, "loss": 0.9068, "step": 700 }, { "epoch": 0.008377752877234504, "grad_norm": 2.8023085594177246, "learning_rate": 4.1906757464641176e-07, "loss": 0.8278, "step": 800 }, { "epoch": 0.009424971986888816, "grad_norm": 2.9815306663513184, "learning_rate": 4.714510214772132e-07, "loss": 0.8097, "step": 900 }, { "epoch": 0.01047219109654313, "grad_norm": 4.450624465942383, "learning_rate": 5.238344683080147e-07, "loss": 0.8611, "step": 1000 }, { "epoch": 0.011519410206197442, "grad_norm": 2.9705615043640137, "learning_rate": 5.762179151388162e-07, "loss": 0.8217, "step": 1100 }, { "epoch": 0.012566629315851756, "grad_norm": 5.060612678527832, "learning_rate": 6.286013619696175e-07, "loss": 0.8326, "step": 1200 }, { "epoch": 0.01361384842550607, "grad_norm": 4.002683639526367, "learning_rate": 6.809848088004191e-07, "loss": 0.7742, "step": 1300 }, { "epoch": 0.014661067535160381, "grad_norm": 3.3899588584899902, "learning_rate": 7.333682556312205e-07, "loss": 0.7594, "step": 1400 }, { "epoch": 0.015708286644814693, "grad_norm": 4.091441631317139, "learning_rate": 7.857517024620219e-07, "loss": 0.7871, "step": 1500 }, { "epoch": 0.01675550575446901, "grad_norm": 3.302689790725708, "learning_rate": 8.381351492928235e-07, "loss": 0.721, "step": 1600 }, { "epoch": 0.01780272486412332, "grad_norm": 3.8457956314086914, "learning_rate": 8.905185961236249e-07, "loss": 0.6789, "step": 1700 }, { "epoch": 0.018849943973777632, "grad_norm": 3.763422727584839, "learning_rate": 9.429020429544264e-07, "loss": 0.7035, "step": 1800 }, { "epoch": 0.019897163083431948, "grad_norm": 2.3855648040771484, "learning_rate": 9.95285489785228e-07, "loss": 0.7331, "step": 1900 }, { "epoch": 0.02094438219308626, "grad_norm": 3.0932857990264893, "learning_rate": 9.999976668774249e-07, "loss": 0.7123, "step": 2000 }, { "epoch": 0.02199160130274057, "grad_norm": 2.939152956008911, "learning_rate": 9.999897217221058e-07, "loss": 0.6106, "step": 2100 }, { "epoch": 0.023038820412394884, "grad_norm": 2.148160934448242, "learning_rate": 9.999761418022958e-07, "loss": 0.6828, "step": 2200 }, { "epoch": 0.0240860395220492, "grad_norm": 2.302873134613037, "learning_rate": 9.999569272710377e-07, "loss": 0.6691, "step": 2300 }, { "epoch": 0.02513325863170351, "grad_norm": 4.346377372741699, "learning_rate": 9.999320783448744e-07, "loss": 0.6698, "step": 2400 }, { "epoch": 0.026180477741357823, "grad_norm": 2.157055616378784, "learning_rate": 9.999015953038474e-07, "loss": 0.6019, "step": 2500 }, { "epoch": 0.02722769685101214, "grad_norm": 2.7303714752197266, "learning_rate": 9.998654784914935e-07, "loss": 0.5972, "step": 2600 }, { "epoch": 0.02827491596066645, "grad_norm": 4.359681606292725, "learning_rate": 9.9982372831484e-07, "loss": 0.6381, "step": 2700 }, { "epoch": 0.029322135070320762, "grad_norm": 3.2993288040161133, "learning_rate": 9.997763452444018e-07, "loss": 0.6093, "step": 2800 }, { "epoch": 0.030369354179975078, "grad_norm": 3.061521053314209, "learning_rate": 9.99723329814175e-07, "loss": 0.6875, "step": 2900 }, { "epoch": 0.031416573289629386, "grad_norm": 2.3765642642974854, "learning_rate": 9.996646826216302e-07, "loss": 0.6031, "step": 3000 }, { "epoch": 0.0324637923992837, "grad_norm": 2.144615411758423, "learning_rate": 9.996004043277078e-07, "loss": 0.637, "step": 3100 }, { "epoch": 0.03351101150893802, "grad_norm": 3.2836430072784424, "learning_rate": 9.995304956568083e-07, "loss": 0.6425, "step": 3200 }, { "epoch": 0.034558230618592325, "grad_norm": 2.8710663318634033, "learning_rate": 9.99454957396786e-07, "loss": 0.6199, "step": 3300 }, { "epoch": 0.03560544972824664, "grad_norm": 2.5998404026031494, "learning_rate": 9.993737903989387e-07, "loss": 0.5903, "step": 3400 }, { "epoch": 0.036652668837900956, "grad_norm": 2.677945613861084, "learning_rate": 9.992869955779995e-07, "loss": 0.6473, "step": 3500 }, { "epoch": 0.037699887947555265, "grad_norm": 3.9936769008636475, "learning_rate": 9.991945739121251e-07, "loss": 0.5847, "step": 3600 }, { "epoch": 0.03874710705720958, "grad_norm": 2.839268207550049, "learning_rate": 9.990965264428851e-07, "loss": 0.5893, "step": 3700 }, { "epoch": 0.039794326166863896, "grad_norm": 2.4763646125793457, "learning_rate": 9.989928542752516e-07, "loss": 0.5865, "step": 3800 }, { "epoch": 0.040841545276518204, "grad_norm": 4.822995662689209, "learning_rate": 9.98883558577585e-07, "loss": 0.579, "step": 3900 }, { "epoch": 0.04188876438617252, "grad_norm": 2.6188089847564697, "learning_rate": 9.987686405816216e-07, "loss": 0.6065, "step": 4000 }, { "epoch": 0.042935983495826835, "grad_norm": 2.550874710083008, "learning_rate": 9.986481015824592e-07, "loss": 0.5911, "step": 4100 }, { "epoch": 0.04398320260548114, "grad_norm": 2.973268985748291, "learning_rate": 9.985219429385443e-07, "loss": 0.6216, "step": 4200 }, { "epoch": 0.04503042171513546, "grad_norm": 6.536316394805908, "learning_rate": 9.98390166071654e-07, "loss": 0.5904, "step": 4300 }, { "epoch": 0.04607764082478977, "grad_norm": 2.6079025268554688, "learning_rate": 9.982527724668825e-07, "loss": 0.5942, "step": 4400 }, { "epoch": 0.04712485993444408, "grad_norm": 2.2787749767303467, "learning_rate": 9.981097636726227e-07, "loss": 0.6174, "step": 4500 }, { "epoch": 0.0481720790440984, "grad_norm": 1.995902419090271, "learning_rate": 9.979611413005493e-07, "loss": 0.5698, "step": 4600 }, { "epoch": 0.04921929815375271, "grad_norm": 3.4670004844665527, "learning_rate": 9.97806907025601e-07, "loss": 0.5871, "step": 4700 }, { "epoch": 0.05026651726340702, "grad_norm": 2.329735279083252, "learning_rate": 9.97647062585961e-07, "loss": 0.6061, "step": 4800 }, { "epoch": 0.05131373637306134, "grad_norm": 2.4299092292785645, "learning_rate": 9.97481609783038e-07, "loss": 0.5944, "step": 4900 }, { "epoch": 0.052360955482715646, "grad_norm": 4.186954498291016, "learning_rate": 9.973105504814458e-07, "loss": 0.6131, "step": 5000 }, { "epoch": 0.05340817459236996, "grad_norm": 2.038557767868042, "learning_rate": 9.971338866089812e-07, "loss": 0.5668, "step": 5100 }, { "epoch": 0.05445539370202428, "grad_norm": 2.6505930423736572, "learning_rate": 9.96951620156604e-07, "loss": 0.5697, "step": 5200 }, { "epoch": 0.055502612811678585, "grad_norm": 3.494474411010742, "learning_rate": 9.967637531784138e-07, "loss": 0.6061, "step": 5300 }, { "epoch": 0.0565498319213329, "grad_norm": 1.573089599609375, "learning_rate": 9.965702877916262e-07, "loss": 0.5714, "step": 5400 }, { "epoch": 0.057597051030987216, "grad_norm": 3.103743553161621, "learning_rate": 9.963712261765495e-07, "loss": 0.6045, "step": 5500 }, { "epoch": 0.058644270140641525, "grad_norm": 2.182767152786255, "learning_rate": 9.96166570576561e-07, "loss": 0.6209, "step": 5600 }, { "epoch": 0.05969148925029584, "grad_norm": 2.818512439727783, "learning_rate": 9.959563232980801e-07, "loss": 0.5825, "step": 5700 }, { "epoch": 0.060738708359950155, "grad_norm": 6.24643611907959, "learning_rate": 9.957404867105435e-07, "loss": 0.5645, "step": 5800 }, { "epoch": 0.061785927469604464, "grad_norm": 2.866800308227539, "learning_rate": 9.955190632463774e-07, "loss": 0.5826, "step": 5900 }, { "epoch": 0.06283314657925877, "grad_norm": 1.9323431253433228, "learning_rate": 9.952920554009715e-07, "loss": 0.5706, "step": 6000 }, { "epoch": 0.06388036568891309, "grad_norm": 2.389801263809204, "learning_rate": 9.9505946573265e-07, "loss": 0.5888, "step": 6100 }, { "epoch": 0.0649275847985674, "grad_norm": 2.6937005519866943, "learning_rate": 9.948212968626429e-07, "loss": 0.5848, "step": 6200 }, { "epoch": 0.06597480390822172, "grad_norm": 3.2649362087249756, "learning_rate": 9.945775514750558e-07, "loss": 0.5746, "step": 6300 }, { "epoch": 0.06702202301787603, "grad_norm": 3.9703376293182373, "learning_rate": 9.943282323168416e-07, "loss": 0.5219, "step": 6400 }, { "epoch": 0.06806924212753035, "grad_norm": 3.0078823566436768, "learning_rate": 9.94073342197767e-07, "loss": 0.5867, "step": 6500 }, { "epoch": 0.06911646123718465, "grad_norm": 2.0793182849884033, "learning_rate": 9.938128839903829e-07, "loss": 0.5757, "step": 6600 }, { "epoch": 0.07016368034683897, "grad_norm": 1.7143627405166626, "learning_rate": 9.935468606299908e-07, "loss": 0.5753, "step": 6700 }, { "epoch": 0.07121089945649328, "grad_norm": 1.6375339031219482, "learning_rate": 9.932752751146102e-07, "loss": 0.5875, "step": 6800 }, { "epoch": 0.0722581185661476, "grad_norm": 3.0804569721221924, "learning_rate": 9.929981305049452e-07, "loss": 0.5399, "step": 6900 }, { "epoch": 0.07330533767580191, "grad_norm": 1.8709744215011597, "learning_rate": 9.92715429924349e-07, "loss": 0.5555, "step": 7000 }, { "epoch": 0.07435255678545621, "grad_norm": 2.213629722595215, "learning_rate": 9.924271765587897e-07, "loss": 0.5536, "step": 7100 }, { "epoch": 0.07539977589511053, "grad_norm": 1.5812900066375732, "learning_rate": 9.921333736568133e-07, "loss": 0.5973, "step": 7200 }, { "epoch": 0.07644699500476484, "grad_norm": 1.2580069303512573, "learning_rate": 9.918340245295086e-07, "loss": 0.549, "step": 7300 }, { "epoch": 0.07749421411441916, "grad_norm": 3.4917242527008057, "learning_rate": 9.915291325504685e-07, "loss": 0.5493, "step": 7400 }, { "epoch": 0.07854143322407348, "grad_norm": 3.6106157302856445, "learning_rate": 9.912187011557523e-07, "loss": 0.5367, "step": 7500 }, { "epoch": 0.07958865233372779, "grad_norm": 2.585413694381714, "learning_rate": 9.90902733843848e-07, "loss": 0.5242, "step": 7600 }, { "epoch": 0.08063587144338209, "grad_norm": 2.1417288780212402, "learning_rate": 9.905812341756314e-07, "loss": 0.5657, "step": 7700 }, { "epoch": 0.08168309055303641, "grad_norm": 2.6701626777648926, "learning_rate": 9.902542057743267e-07, "loss": 0.533, "step": 7800 }, { "epoch": 0.08273030966269072, "grad_norm": 2.7961204051971436, "learning_rate": 9.899216523254657e-07, "loss": 0.5833, "step": 7900 }, { "epoch": 0.08377752877234504, "grad_norm": 3.9673585891723633, "learning_rate": 9.895835775768464e-07, "loss": 0.5548, "step": 8000 }, { "epoch": 0.08482474788199935, "grad_norm": 2.384716272354126, "learning_rate": 9.892399853384903e-07, "loss": 0.5802, "step": 8100 }, { "epoch": 0.08587196699165367, "grad_norm": 2.7740979194641113, "learning_rate": 9.888908794825994e-07, "loss": 0.5565, "step": 8200 }, { "epoch": 0.08691918610130797, "grad_norm": 2.4571990966796875, "learning_rate": 9.885362639435133e-07, "loss": 0.5538, "step": 8300 }, { "epoch": 0.08796640521096229, "grad_norm": 2.063465118408203, "learning_rate": 9.88176142717664e-07, "loss": 0.603, "step": 8400 }, { "epoch": 0.0890136243206166, "grad_norm": 1.9801498651504517, "learning_rate": 9.878105198635321e-07, "loss": 0.5479, "step": 8500 }, { "epoch": 0.09006084343027092, "grad_norm": 2.044619083404541, "learning_rate": 9.87439399501599e-07, "loss": 0.5446, "step": 8600 }, { "epoch": 0.09110806253992523, "grad_norm": 2.573242664337158, "learning_rate": 9.87062785814303e-07, "loss": 0.5347, "step": 8700 }, { "epoch": 0.09215528164957953, "grad_norm": 2.520949125289917, "learning_rate": 9.866806830459898e-07, "loss": 0.5467, "step": 8800 }, { "epoch": 0.09320250075923385, "grad_norm": 2.924830913543701, "learning_rate": 9.86293095502866e-07, "loss": 0.5187, "step": 8900 }, { "epoch": 0.09424971986888817, "grad_norm": 2.2049362659454346, "learning_rate": 9.859000275529507e-07, "loss": 0.5549, "step": 9000 }, { "epoch": 0.09529693897854248, "grad_norm": 2.932223320007324, "learning_rate": 9.855014836260256e-07, "loss": 0.5723, "step": 9100 }, { "epoch": 0.0963441580881968, "grad_norm": 2.659306526184082, "learning_rate": 9.850974682135855e-07, "loss": 0.5471, "step": 9200 }, { "epoch": 0.09739137719785111, "grad_norm": 3.1078333854675293, "learning_rate": 9.84687985868787e-07, "loss": 0.5498, "step": 9300 }, { "epoch": 0.09843859630750541, "grad_norm": 2.73991322517395, "learning_rate": 9.842730412063984e-07, "loss": 0.5509, "step": 9400 }, { "epoch": 0.09948581541715973, "grad_norm": 2.288360595703125, "learning_rate": 9.83852638902747e-07, "loss": 0.5311, "step": 9500 }, { "epoch": 0.10053303452681404, "grad_norm": 2.391042947769165, "learning_rate": 9.834267836956652e-07, "loss": 0.569, "step": 9600 }, { "epoch": 0.10158025363646836, "grad_norm": 2.225496292114258, "learning_rate": 9.829954803844404e-07, "loss": 0.5432, "step": 9700 }, { "epoch": 0.10262747274612267, "grad_norm": 1.877164363861084, "learning_rate": 9.82558733829757e-07, "loss": 0.5795, "step": 9800 }, { "epoch": 0.10367469185577699, "grad_norm": 2.455549478530884, "learning_rate": 9.82116548953644e-07, "loss": 0.577, "step": 9900 }, { "epoch": 0.10472191096543129, "grad_norm": 3.1859889030456543, "learning_rate": 9.816689307394198e-07, "loss": 0.5742, "step": 10000 }, { "epoch": 0.10576913007508561, "grad_norm": 2.9405317306518555, "learning_rate": 9.812158842316341e-07, "loss": 0.5674, "step": 10100 }, { "epoch": 0.10681634918473992, "grad_norm": 2.1740851402282715, "learning_rate": 9.807574145360125e-07, "loss": 0.5219, "step": 10200 }, { "epoch": 0.10786356829439424, "grad_norm": 2.1551525592803955, "learning_rate": 9.80293526819399e-07, "loss": 0.5378, "step": 10300 }, { "epoch": 0.10891078740404855, "grad_norm": 1.479442834854126, "learning_rate": 9.798242263096968e-07, "loss": 0.5137, "step": 10400 }, { "epoch": 0.10995800651370287, "grad_norm": 2.2272469997406006, "learning_rate": 9.793495182958107e-07, "loss": 0.5469, "step": 10500 }, { "epoch": 0.11100522562335717, "grad_norm": 1.9610800743103027, "learning_rate": 9.78869408127586e-07, "loss": 0.5685, "step": 10600 }, { "epoch": 0.11205244473301149, "grad_norm": 2.2086081504821777, "learning_rate": 9.7838390121575e-07, "loss": 0.5505, "step": 10700 }, { "epoch": 0.1130996638426658, "grad_norm": 3.1201093196868896, "learning_rate": 9.778930030318488e-07, "loss": 0.5829, "step": 10800 }, { "epoch": 0.11414688295232012, "grad_norm": 2.6629204750061035, "learning_rate": 9.773967191081875e-07, "loss": 0.5925, "step": 10900 }, { "epoch": 0.11519410206197443, "grad_norm": 2.593073844909668, "learning_rate": 9.768950550377674e-07, "loss": 0.572, "step": 11000 }, { "epoch": 0.11624132117162873, "grad_norm": 4.5134687423706055, "learning_rate": 9.763880164742224e-07, "loss": 0.5106, "step": 11100 }, { "epoch": 0.11728854028128305, "grad_norm": 3.3710708618164062, "learning_rate": 9.758756091317557e-07, "loss": 0.567, "step": 11200 }, { "epoch": 0.11833575939093736, "grad_norm": 3.414686679840088, "learning_rate": 9.753578387850754e-07, "loss": 0.578, "step": 11300 }, { "epoch": 0.11938297850059168, "grad_norm": 2.6787045001983643, "learning_rate": 9.748347112693294e-07, "loss": 0.5587, "step": 11400 }, { "epoch": 0.120430197610246, "grad_norm": 2.505725860595703, "learning_rate": 9.743062324800395e-07, "loss": 0.5513, "step": 11500 }, { "epoch": 0.12147741671990031, "grad_norm": 2.5358970165252686, "learning_rate": 9.737724083730354e-07, "loss": 0.5378, "step": 11600 }, { "epoch": 0.12252463582955461, "grad_norm": 1.6748542785644531, "learning_rate": 9.732332449643868e-07, "loss": 0.5062, "step": 11700 }, { "epoch": 0.12357185493920893, "grad_norm": 2.4574966430664062, "learning_rate": 9.726887483303364e-07, "loss": 0.5721, "step": 11800 }, { "epoch": 0.12461907404886324, "grad_norm": 2.737337589263916, "learning_rate": 9.721389246072307e-07, "loss": 0.5963, "step": 11900 }, { "epoch": 0.12566629315851754, "grad_norm": 2.453996181488037, "learning_rate": 9.715837799914517e-07, "loss": 0.5917, "step": 12000 }, { "epoch": 0.12671351226817187, "grad_norm": 2.9003748893737793, "learning_rate": 9.710233207393463e-07, "loss": 0.5603, "step": 12100 }, { "epoch": 0.12776073137782618, "grad_norm": 2.409175395965576, "learning_rate": 9.704575531671562e-07, "loss": 0.568, "step": 12200 }, { "epoch": 0.1288079504874805, "grad_norm": 3.183899402618408, "learning_rate": 9.698864836509463e-07, "loss": 0.5702, "step": 12300 }, { "epoch": 0.1298551695971348, "grad_norm": 2.7574760913848877, "learning_rate": 9.693101186265336e-07, "loss": 0.5394, "step": 12400 }, { "epoch": 0.1309023887067891, "grad_norm": 2.9319100379943848, "learning_rate": 9.687284645894139e-07, "loss": 0.5504, "step": 12500 }, { "epoch": 0.13194960781644344, "grad_norm": 2.8977279663085938, "learning_rate": 9.681415280946887e-07, "loss": 0.611, "step": 12600 }, { "epoch": 0.13299682692609774, "grad_norm": 1.9469819068908691, "learning_rate": 9.675493157569922e-07, "loss": 0.5621, "step": 12700 }, { "epoch": 0.13404404603575207, "grad_norm": 2.0829553604125977, "learning_rate": 9.669518342504155e-07, "loss": 0.5305, "step": 12800 }, { "epoch": 0.13509126514540637, "grad_norm": 3.0171096324920654, "learning_rate": 9.663490903084324e-07, "loss": 0.5666, "step": 12900 }, { "epoch": 0.1361384842550607, "grad_norm": 3.0453896522521973, "learning_rate": 9.657410907238224e-07, "loss": 0.5332, "step": 13000 }, { "epoch": 0.137185703364715, "grad_norm": 2.2059998512268066, "learning_rate": 9.651278423485958e-07, "loss": 0.5859, "step": 13100 }, { "epoch": 0.1382329224743693, "grad_norm": 2.076673746109009, "learning_rate": 9.645093520939146e-07, "loss": 0.5048, "step": 13200 }, { "epoch": 0.13928014158402363, "grad_norm": 1.7987829446792603, "learning_rate": 9.638856269300163e-07, "loss": 0.5501, "step": 13300 }, { "epoch": 0.14032736069367793, "grad_norm": 3.1706273555755615, "learning_rate": 9.63256673886134e-07, "loss": 0.5389, "step": 13400 }, { "epoch": 0.14137457980333226, "grad_norm": 2.9992752075195312, "learning_rate": 9.626225000504177e-07, "loss": 0.5517, "step": 13500 }, { "epoch": 0.14242179891298656, "grad_norm": 1.2536182403564453, "learning_rate": 9.619831125698552e-07, "loss": 0.5304, "step": 13600 }, { "epoch": 0.14346901802264087, "grad_norm": 2.491206645965576, "learning_rate": 9.6133851865019e-07, "loss": 0.5001, "step": 13700 }, { "epoch": 0.1445162371322952, "grad_norm": 2.180227518081665, "learning_rate": 9.606887255558417e-07, "loss": 0.5149, "step": 13800 }, { "epoch": 0.1455634562419495, "grad_norm": 1.546883463859558, "learning_rate": 9.60033740609823e-07, "loss": 0.5566, "step": 13900 }, { "epoch": 0.14661067535160383, "grad_norm": 2.402559757232666, "learning_rate": 9.593735711936567e-07, "loss": 0.5343, "step": 14000 }, { "epoch": 0.14765789446125813, "grad_norm": 4.94249153137207, "learning_rate": 9.587082247472948e-07, "loss": 0.516, "step": 14100 }, { "epoch": 0.14870511357091243, "grad_norm": 1.760003924369812, "learning_rate": 9.580377087690324e-07, "loss": 0.5395, "step": 14200 }, { "epoch": 0.14975233268056676, "grad_norm": 2.1215927600860596, "learning_rate": 9.573620308154238e-07, "loss": 0.55, "step": 14300 }, { "epoch": 0.15079955179022106, "grad_norm": 2.929760217666626, "learning_rate": 9.566811985011981e-07, "loss": 0.5571, "step": 14400 }, { "epoch": 0.1518467708998754, "grad_norm": 2.7724721431732178, "learning_rate": 9.559952194991726e-07, "loss": 0.5712, "step": 14500 }, { "epoch": 0.1528939900095297, "grad_norm": 2.270812749862671, "learning_rate": 9.55304101540166e-07, "loss": 0.5355, "step": 14600 }, { "epoch": 0.15394120911918402, "grad_norm": 2.3572235107421875, "learning_rate": 9.546078524129127e-07, "loss": 0.5595, "step": 14700 }, { "epoch": 0.15498842822883832, "grad_norm": 1.5402534008026123, "learning_rate": 9.539064799639735e-07, "loss": 0.5561, "step": 14800 }, { "epoch": 0.15603564733849262, "grad_norm": 3.2286136150360107, "learning_rate": 9.531999920976481e-07, "loss": 0.4951, "step": 14900 }, { "epoch": 0.15708286644814695, "grad_norm": 1.4825396537780762, "learning_rate": 9.524883967758858e-07, "loss": 0.5099, "step": 15000 }, { "epoch": 0.15813008555780125, "grad_norm": 1.649629831314087, "learning_rate": 9.517717020181953e-07, "loss": 0.5694, "step": 15100 }, { "epoch": 0.15917730466745558, "grad_norm": 1.8996721506118774, "learning_rate": 9.510499159015553e-07, "loss": 0.5364, "step": 15200 }, { "epoch": 0.16022452377710988, "grad_norm": 3.648730993270874, "learning_rate": 9.50323046560322e-07, "loss": 0.5276, "step": 15300 }, { "epoch": 0.16127174288676419, "grad_norm": 2.633986473083496, "learning_rate": 9.495911021861396e-07, "loss": 0.5399, "step": 15400 }, { "epoch": 0.16231896199641851, "grad_norm": 1.8254631757736206, "learning_rate": 9.488540910278455e-07, "loss": 0.5484, "step": 15500 }, { "epoch": 0.16336618110607282, "grad_norm": 2.676395893096924, "learning_rate": 9.481120213913794e-07, "loss": 0.5741, "step": 15600 }, { "epoch": 0.16441340021572715, "grad_norm": 3.6794283390045166, "learning_rate": 9.47364901639688e-07, "loss": 0.5481, "step": 15700 }, { "epoch": 0.16546061932538145, "grad_norm": 1.8362795114517212, "learning_rate": 9.466127401926326e-07, "loss": 0.5704, "step": 15800 }, { "epoch": 0.16650783843503575, "grad_norm": 2.256762742996216, "learning_rate": 9.458555455268924e-07, "loss": 0.5159, "step": 15900 }, { "epoch": 0.16755505754469008, "grad_norm": 2.6386005878448486, "learning_rate": 9.450933261758702e-07, "loss": 0.4916, "step": 16000 }, { "epoch": 0.16860227665434438, "grad_norm": 2.635512113571167, "learning_rate": 9.443260907295955e-07, "loss": 0.508, "step": 16100 }, { "epoch": 0.1696494957639987, "grad_norm": 1.6727428436279297, "learning_rate": 9.435538478346282e-07, "loss": 0.5282, "step": 16200 }, { "epoch": 0.170696714873653, "grad_norm": 2.1256072521209717, "learning_rate": 9.42776606193961e-07, "loss": 0.5878, "step": 16300 }, { "epoch": 0.17174393398330734, "grad_norm": 2.557060956954956, "learning_rate": 9.419943745669209e-07, "loss": 0.5392, "step": 16400 }, { "epoch": 0.17279115309296164, "grad_norm": 2.912794828414917, "learning_rate": 9.412071617690713e-07, "loss": 0.5631, "step": 16500 }, { "epoch": 0.17383837220261594, "grad_norm": 2.380751132965088, "learning_rate": 9.40414976672112e-07, "loss": 0.5518, "step": 16600 }, { "epoch": 0.17488559131227027, "grad_norm": 2.5645503997802734, "learning_rate": 9.396178282037795e-07, "loss": 0.5377, "step": 16700 }, { "epoch": 0.17593281042192457, "grad_norm": 2.270052433013916, "learning_rate": 9.388157253477459e-07, "loss": 0.524, "step": 16800 }, { "epoch": 0.1769800295315789, "grad_norm": 2.3046374320983887, "learning_rate": 9.380086771435187e-07, "loss": 0.5224, "step": 16900 }, { "epoch": 0.1780272486412332, "grad_norm": 1.9633408784866333, "learning_rate": 9.371966926863381e-07, "loss": 0.5241, "step": 17000 }, { "epoch": 0.1790744677508875, "grad_norm": 2.206256628036499, "learning_rate": 9.363797811270743e-07, "loss": 0.5599, "step": 17100 }, { "epoch": 0.18012168686054184, "grad_norm": 2.883242607116699, "learning_rate": 9.355579516721251e-07, "loss": 0.5472, "step": 17200 }, { "epoch": 0.18116890597019614, "grad_norm": 3.9055755138397217, "learning_rate": 9.34731213583312e-07, "loss": 0.5463, "step": 17300 }, { "epoch": 0.18221612507985047, "grad_norm": 2.9254720211029053, "learning_rate": 9.338995761777751e-07, "loss": 0.5385, "step": 17400 }, { "epoch": 0.18326334418950477, "grad_norm": 2.070220947265625, "learning_rate": 9.33063048827869e-07, "loss": 0.597, "step": 17500 }, { "epoch": 0.18431056329915907, "grad_norm": 2.241502285003662, "learning_rate": 9.322216409610566e-07, "loss": 0.4954, "step": 17600 }, { "epoch": 0.1853577824088134, "grad_norm": 2.7689974308013916, "learning_rate": 9.313753620598035e-07, "loss": 0.5536, "step": 17700 }, { "epoch": 0.1864050015184677, "grad_norm": 2.5464389324188232, "learning_rate": 9.3052422166147e-07, "loss": 0.5342, "step": 17800 }, { "epoch": 0.18745222062812203, "grad_norm": 1.727013111114502, "learning_rate": 9.296682293582049e-07, "loss": 0.5383, "step": 17900 }, { "epoch": 0.18849943973777633, "grad_norm": 4.623219966888428, "learning_rate": 9.288073947968364e-07, "loss": 0.5305, "step": 18000 }, { "epoch": 0.18954665884743066, "grad_norm": 1.5261229276657104, "learning_rate": 9.27941727678764e-07, "loss": 0.5235, "step": 18100 }, { "epoch": 0.19059387795708496, "grad_norm": 1.9866268634796143, "learning_rate": 9.270712377598491e-07, "loss": 0.5217, "step": 18200 }, { "epoch": 0.19164109706673926, "grad_norm": 3.0393967628479004, "learning_rate": 9.261959348503046e-07, "loss": 0.5241, "step": 18300 }, { "epoch": 0.1926883161763936, "grad_norm": 2.8217124938964844, "learning_rate": 9.253158288145848e-07, "loss": 0.5713, "step": 18400 }, { "epoch": 0.1937355352860479, "grad_norm": 2.327930450439453, "learning_rate": 9.24430929571274e-07, "loss": 0.5191, "step": 18500 }, { "epoch": 0.19478275439570222, "grad_norm": 2.090432643890381, "learning_rate": 9.235412470929748e-07, "loss": 0.5285, "step": 18600 }, { "epoch": 0.19582997350535652, "grad_norm": 2.427619457244873, "learning_rate": 9.226467914061962e-07, "loss": 0.5157, "step": 18700 }, { "epoch": 0.19687719261501083, "grad_norm": 3.4102041721343994, "learning_rate": 9.217475725912391e-07, "loss": 0.52, "step": 18800 }, { "epoch": 0.19792441172466516, "grad_norm": 1.7967109680175781, "learning_rate": 9.208436007820848e-07, "loss": 0.514, "step": 18900 }, { "epoch": 0.19897163083431946, "grad_norm": 2.5887088775634766, "learning_rate": 9.19934886166279e-07, "loss": 0.4798, "step": 19000 }, { "epoch": 0.2000188499439738, "grad_norm": 2.08363676071167, "learning_rate": 9.190214389848181e-07, "loss": 0.5348, "step": 19100 }, { "epoch": 0.2010660690536281, "grad_norm": 2.4554569721221924, "learning_rate": 9.18103269532033e-07, "loss": 0.4976, "step": 19200 }, { "epoch": 0.20211328816328242, "grad_norm": 2.604750633239746, "learning_rate": 9.171803881554736e-07, "loss": 0.5048, "step": 19300 }, { "epoch": 0.20316050727293672, "grad_norm": 1.9831663370132446, "learning_rate": 9.162528052557925e-07, "loss": 0.5618, "step": 19400 }, { "epoch": 0.20420772638259102, "grad_norm": 2.6448137760162354, "learning_rate": 9.153205312866265e-07, "loss": 0.5382, "step": 19500 }, { "epoch": 0.20525494549224535, "grad_norm": 2.27817964553833, "learning_rate": 9.143835767544805e-07, "loss": 0.5189, "step": 19600 }, { "epoch": 0.20630216460189965, "grad_norm": 1.8295369148254395, "learning_rate": 9.134419522186075e-07, "loss": 0.5083, "step": 19700 }, { "epoch": 0.20734938371155398, "grad_norm": 3.7082695960998535, "learning_rate": 9.124956682908908e-07, "loss": 0.4839, "step": 19800 }, { "epoch": 0.20839660282120828, "grad_norm": 2.17672061920166, "learning_rate": 9.115447356357238e-07, "loss": 0.5203, "step": 19900 }, { "epoch": 0.20944382193086258, "grad_norm": 2.759127378463745, "learning_rate": 9.105891649698898e-07, "loss": 0.5339, "step": 20000 }, { "epoch": 0.2104910410405169, "grad_norm": 2.4461498260498047, "learning_rate": 9.096289670624416e-07, "loss": 0.5536, "step": 20100 }, { "epoch": 0.21153826015017121, "grad_norm": 2.8688385486602783, "learning_rate": 9.086641527345796e-07, "loss": 0.5266, "step": 20200 }, { "epoch": 0.21258547925982554, "grad_norm": 2.589167356491089, "learning_rate": 9.076947328595306e-07, "loss": 0.5031, "step": 20300 }, { "epoch": 0.21363269836947985, "grad_norm": 3.033956289291382, "learning_rate": 9.067207183624243e-07, "loss": 0.5288, "step": 20400 }, { "epoch": 0.21467991747913415, "grad_norm": 2.5122592449188232, "learning_rate": 9.057421202201714e-07, "loss": 0.5002, "step": 20500 }, { "epoch": 0.21572713658878848, "grad_norm": 2.099766731262207, "learning_rate": 9.047589494613381e-07, "loss": 0.5389, "step": 20600 }, { "epoch": 0.21677435569844278, "grad_norm": 2.65134596824646, "learning_rate": 9.037712171660241e-07, "loss": 0.5537, "step": 20700 }, { "epoch": 0.2178215748080971, "grad_norm": 2.301417589187622, "learning_rate": 9.027789344657357e-07, "loss": 0.5554, "step": 20800 }, { "epoch": 0.2188687939177514, "grad_norm": 2.6696295738220215, "learning_rate": 9.017821125432612e-07, "loss": 0.5191, "step": 20900 }, { "epoch": 0.21991601302740574, "grad_norm": 2.455559015274048, "learning_rate": 9.007807626325455e-07, "loss": 0.5053, "step": 21000 }, { "epoch": 0.22096323213706004, "grad_norm": 2.676161289215088, "learning_rate": 8.997748960185622e-07, "loss": 0.518, "step": 21100 }, { "epoch": 0.22201045124671434, "grad_norm": 2.6200263500213623, "learning_rate": 8.987645240371873e-07, "loss": 0.4884, "step": 21200 }, { "epoch": 0.22305767035636867, "grad_norm": 3.8255863189697266, "learning_rate": 8.977496580750712e-07, "loss": 0.5348, "step": 21300 }, { "epoch": 0.22410488946602297, "grad_norm": 2.0892577171325684, "learning_rate": 8.967303095695105e-07, "loss": 0.5178, "step": 21400 }, { "epoch": 0.2251521085756773, "grad_norm": 2.40419864654541, "learning_rate": 8.957064900083187e-07, "loss": 0.584, "step": 21500 }, { "epoch": 0.2261993276853316, "grad_norm": 3.042703628540039, "learning_rate": 8.946782109296973e-07, "loss": 0.5267, "step": 21600 }, { "epoch": 0.2272465467949859, "grad_norm": 1.6234790086746216, "learning_rate": 8.936454839221054e-07, "loss": 0.5217, "step": 21700 }, { "epoch": 0.22829376590464023, "grad_norm": 1.706650972366333, "learning_rate": 8.926083206241291e-07, "loss": 0.5242, "step": 21800 }, { "epoch": 0.22934098501429453, "grad_norm": 4.158198833465576, "learning_rate": 8.915667327243506e-07, "loss": 0.524, "step": 21900 }, { "epoch": 0.23038820412394886, "grad_norm": 2.2484548091888428, "learning_rate": 8.905207319612163e-07, "loss": 0.5347, "step": 22000 }, { "epoch": 0.23143542323360317, "grad_norm": 2.990169048309326, "learning_rate": 8.894703301229043e-07, "loss": 0.5408, "step": 22100 }, { "epoch": 0.23248264234325747, "grad_norm": 3.9766592979431152, "learning_rate": 8.884155390471919e-07, "loss": 0.5046, "step": 22200 }, { "epoch": 0.2335298614529118, "grad_norm": 2.5463485717773438, "learning_rate": 8.873563706213221e-07, "loss": 0.4881, "step": 22300 }, { "epoch": 0.2345770805625661, "grad_norm": 2.7277047634124756, "learning_rate": 8.862928367818696e-07, "loss": 0.5228, "step": 22400 }, { "epoch": 0.23562429967222043, "grad_norm": 1.9528217315673828, "learning_rate": 8.852249495146063e-07, "loss": 0.5056, "step": 22500 }, { "epoch": 0.23667151878187473, "grad_norm": 2.527414083480835, "learning_rate": 8.841527208543658e-07, "loss": 0.5186, "step": 22600 }, { "epoch": 0.23771873789152906, "grad_norm": 1.9525986909866333, "learning_rate": 8.830761628849087e-07, "loss": 0.5195, "step": 22700 }, { "epoch": 0.23876595700118336, "grad_norm": 1.6230095624923706, "learning_rate": 8.819952877387855e-07, "loss": 0.4834, "step": 22800 }, { "epoch": 0.23981317611083766, "grad_norm": 2.2290198802948, "learning_rate": 8.809101075972005e-07, "loss": 0.5207, "step": 22900 }, { "epoch": 0.240860395220492, "grad_norm": 3.419203996658325, "learning_rate": 8.798206346898743e-07, "loss": 0.5064, "step": 23000 }, { "epoch": 0.2419076143301463, "grad_norm": 2.360508441925049, "learning_rate": 8.787268812949054e-07, "loss": 0.5011, "step": 23100 }, { "epoch": 0.24295483343980062, "grad_norm": 1.8023535013198853, "learning_rate": 8.77628859738633e-07, "loss": 0.5099, "step": 23200 }, { "epoch": 0.24400205254945492, "grad_norm": 1.9575679302215576, "learning_rate": 8.765265823954972e-07, "loss": 0.5361, "step": 23300 }, { "epoch": 0.24504927165910922, "grad_norm": 1.5841313600540161, "learning_rate": 8.754200616879001e-07, "loss": 0.541, "step": 23400 }, { "epoch": 0.24609649076876355, "grad_norm": 2.8605728149414062, "learning_rate": 8.743093100860648e-07, "loss": 0.5541, "step": 23500 }, { "epoch": 0.24714370987841786, "grad_norm": 1.696733832359314, "learning_rate": 8.731943401078961e-07, "loss": 0.511, "step": 23600 }, { "epoch": 0.24819092898807218, "grad_norm": 2.1618356704711914, "learning_rate": 8.720751643188389e-07, "loss": 0.5066, "step": 23700 }, { "epoch": 0.2492381480977265, "grad_norm": 2.721067428588867, "learning_rate": 8.709517953317365e-07, "loss": 0.5398, "step": 23800 }, { "epoch": 0.2502853672073808, "grad_norm": 1.8457568883895874, "learning_rate": 8.698242458066882e-07, "loss": 0.4879, "step": 23900 }, { "epoch": 0.2513325863170351, "grad_norm": 2.435941696166992, "learning_rate": 8.686925284509077e-07, "loss": 0.531, "step": 24000 }, { "epoch": 0.2523798054266894, "grad_norm": 2.617920160293579, "learning_rate": 8.675566560185786e-07, "loss": 0.5189, "step": 24100 }, { "epoch": 0.25342702453634375, "grad_norm": 2.538632869720459, "learning_rate": 8.664166413107109e-07, "loss": 0.5433, "step": 24200 }, { "epoch": 0.2544742436459981, "grad_norm": 2.3944451808929443, "learning_rate": 8.65272497174998e-07, "loss": 0.5401, "step": 24300 }, { "epoch": 0.25552146275565235, "grad_norm": 3.6203765869140625, "learning_rate": 8.641242365056705e-07, "loss": 0.544, "step": 24400 }, { "epoch": 0.2565686818653067, "grad_norm": 2.866250991821289, "learning_rate": 8.629718722433507e-07, "loss": 0.5357, "step": 24500 }, { "epoch": 0.257615900974961, "grad_norm": 3.3872838020324707, "learning_rate": 8.618154173749088e-07, "loss": 0.5261, "step": 24600 }, { "epoch": 0.2586631200846153, "grad_norm": 2.269967794418335, "learning_rate": 8.606548849333138e-07, "loss": 0.5128, "step": 24700 }, { "epoch": 0.2597103391942696, "grad_norm": 2.1335697174072266, "learning_rate": 8.594902879974888e-07, "loss": 0.5645, "step": 24800 }, { "epoch": 0.26075755830392394, "grad_norm": 2.443239212036133, "learning_rate": 8.583216396921624e-07, "loss": 0.4806, "step": 24900 }, { "epoch": 0.2618047774135782, "grad_norm": 2.713833808898926, "learning_rate": 8.571489531877214e-07, "loss": 0.5271, "step": 25000 }, { "epoch": 0.26285199652323255, "grad_norm": 3.485182046890259, "learning_rate": 8.559722417000619e-07, "loss": 0.4962, "step": 25100 }, { "epoch": 0.2638992156328869, "grad_norm": 2.306403160095215, "learning_rate": 8.547915184904409e-07, "loss": 0.5122, "step": 25200 }, { "epoch": 0.2649464347425412, "grad_norm": 2.6151928901672363, "learning_rate": 8.536067968653261e-07, "loss": 0.5316, "step": 25300 }, { "epoch": 0.2659936538521955, "grad_norm": 2.3466389179229736, "learning_rate": 8.524180901762469e-07, "loss": 0.4991, "step": 25400 }, { "epoch": 0.2670408729618498, "grad_norm": 2.0926601886749268, "learning_rate": 8.512254118196429e-07, "loss": 0.5254, "step": 25500 }, { "epoch": 0.26808809207150414, "grad_norm": 1.9708478450775146, "learning_rate": 8.500287752367142e-07, "loss": 0.507, "step": 25600 }, { "epoch": 0.2691353111811584, "grad_norm": 2.028843879699707, "learning_rate": 8.48828193913268e-07, "loss": 0.5066, "step": 25700 }, { "epoch": 0.27018253029081274, "grad_norm": 2.9337289333343506, "learning_rate": 8.47623681379569e-07, "loss": 0.5023, "step": 25800 }, { "epoch": 0.27122974940046707, "grad_norm": 2.8608200550079346, "learning_rate": 8.464152512101848e-07, "loss": 0.5417, "step": 25900 }, { "epoch": 0.2722769685101214, "grad_norm": 3.0925405025482178, "learning_rate": 8.452029170238344e-07, "loss": 0.5415, "step": 26000 }, { "epoch": 0.27332418761977567, "grad_norm": 1.9558321237564087, "learning_rate": 8.439866924832338e-07, "loss": 0.519, "step": 26100 }, { "epoch": 0.27437140672943, "grad_norm": 1.5545213222503662, "learning_rate": 8.427665912949425e-07, "loss": 0.5441, "step": 26200 }, { "epoch": 0.27541862583908433, "grad_norm": 3.6202712059020996, "learning_rate": 8.415426272092089e-07, "loss": 0.5559, "step": 26300 }, { "epoch": 0.2764658449487386, "grad_norm": 1.8004056215286255, "learning_rate": 8.403148140198151e-07, "loss": 0.5034, "step": 26400 }, { "epoch": 0.27751306405839293, "grad_norm": 2.5597338676452637, "learning_rate": 8.390831655639223e-07, "loss": 0.5294, "step": 26500 }, { "epoch": 0.27856028316804726, "grad_norm": 2.014400005340576, "learning_rate": 8.378476957219134e-07, "loss": 0.5663, "step": 26600 }, { "epoch": 0.27960750227770154, "grad_norm": 2.069840669631958, "learning_rate": 8.366084184172377e-07, "loss": 0.5007, "step": 26700 }, { "epoch": 0.28065472138735587, "grad_norm": 5.621069431304932, "learning_rate": 8.353653476162543e-07, "loss": 0.5263, "step": 26800 }, { "epoch": 0.2817019404970102, "grad_norm": 3.1065540313720703, "learning_rate": 8.341184973280732e-07, "loss": 0.5048, "step": 26900 }, { "epoch": 0.2827491596066645, "grad_norm": 2.579742431640625, "learning_rate": 8.328678816043988e-07, "loss": 0.5272, "step": 27000 }, { "epoch": 0.2837963787163188, "grad_norm": 2.476778030395508, "learning_rate": 8.31613514539371e-07, "loss": 0.4944, "step": 27100 }, { "epoch": 0.2848435978259731, "grad_norm": 2.7026314735412598, "learning_rate": 8.303554102694065e-07, "loss": 0.5257, "step": 27200 }, { "epoch": 0.28589081693562746, "grad_norm": 2.1597368717193604, "learning_rate": 8.290935829730391e-07, "loss": 0.5282, "step": 27300 }, { "epoch": 0.28693803604528173, "grad_norm": 2.447305202484131, "learning_rate": 8.278280468707606e-07, "loss": 0.5295, "step": 27400 }, { "epoch": 0.28798525515493606, "grad_norm": 2.806995391845703, "learning_rate": 8.265588162248597e-07, "loss": 0.4933, "step": 27500 }, { "epoch": 0.2890324742645904, "grad_norm": 2.1765849590301514, "learning_rate": 8.252859053392622e-07, "loss": 0.5486, "step": 27600 }, { "epoch": 0.2900796933742447, "grad_norm": 2.122382640838623, "learning_rate": 8.240093285593692e-07, "loss": 0.5255, "step": 27700 }, { "epoch": 0.291126912483899, "grad_norm": 2.136657476425171, "learning_rate": 8.22729100271895e-07, "loss": 0.5214, "step": 27800 }, { "epoch": 0.2921741315935533, "grad_norm": 2.033987522125244, "learning_rate": 8.214452349047065e-07, "loss": 0.5065, "step": 27900 }, { "epoch": 0.29322135070320765, "grad_norm": 3.346703290939331, "learning_rate": 8.20157746926659e-07, "loss": 0.5349, "step": 28000 }, { "epoch": 0.2942685698128619, "grad_norm": 2.63242244720459, "learning_rate": 8.188666508474335e-07, "loss": 0.5264, "step": 28100 }, { "epoch": 0.29531578892251625, "grad_norm": 2.475911855697632, "learning_rate": 8.175719612173741e-07, "loss": 0.5186, "step": 28200 }, { "epoch": 0.2963630080321706, "grad_norm": 1.5967457294464111, "learning_rate": 8.162736926273231e-07, "loss": 0.5321, "step": 28300 }, { "epoch": 0.29741022714182486, "grad_norm": 1.6950793266296387, "learning_rate": 8.149718597084565e-07, "loss": 0.5028, "step": 28400 }, { "epoch": 0.2984574462514792, "grad_norm": 1.8821123838424683, "learning_rate": 8.136664771321198e-07, "loss": 0.5147, "step": 28500 }, { "epoch": 0.2995046653611335, "grad_norm": 3.8432750701904297, "learning_rate": 8.123575596096624e-07, "loss": 0.5055, "step": 28600 }, { "epoch": 0.30055188447078784, "grad_norm": 2.2065136432647705, "learning_rate": 8.110451218922711e-07, "loss": 0.4804, "step": 28700 }, { "epoch": 0.3015991035804421, "grad_norm": 3.215104103088379, "learning_rate": 8.097291787708052e-07, "loss": 0.508, "step": 28800 }, { "epoch": 0.30264632269009645, "grad_norm": 2.6659111976623535, "learning_rate": 8.084097450756286e-07, "loss": 0.5058, "step": 28900 }, { "epoch": 0.3036935417997508, "grad_norm": 3.1594624519348145, "learning_rate": 8.070868356764431e-07, "loss": 0.4819, "step": 29000 }, { "epoch": 0.30474076090940505, "grad_norm": 3.2502479553222656, "learning_rate": 8.05760465482121e-07, "loss": 0.5132, "step": 29100 }, { "epoch": 0.3057879800190594, "grad_norm": 2.3569111824035645, "learning_rate": 8.044306494405372e-07, "loss": 0.4989, "step": 29200 }, { "epoch": 0.3068351991287137, "grad_norm": 2.7516555786132812, "learning_rate": 8.030974025384e-07, "loss": 0.4982, "step": 29300 }, { "epoch": 0.30788241823836804, "grad_norm": 2.388401508331299, "learning_rate": 8.017607398010829e-07, "loss": 0.492, "step": 29400 }, { "epoch": 0.3089296373480223, "grad_norm": 2.49920392036438, "learning_rate": 8.004206762924548e-07, "loss": 0.4729, "step": 29500 }, { "epoch": 0.30997685645767664, "grad_norm": 2.528714179992676, "learning_rate": 7.99077227114711e-07, "loss": 0.5229, "step": 29600 }, { "epoch": 0.31102407556733097, "grad_norm": 2.0866329669952393, "learning_rate": 7.977304074082021e-07, "loss": 0.483, "step": 29700 }, { "epoch": 0.31207129467698524, "grad_norm": 3.1670796871185303, "learning_rate": 7.963802323512638e-07, "loss": 0.4816, "step": 29800 }, { "epoch": 0.3131185137866396, "grad_norm": 1.9715406894683838, "learning_rate": 7.950267171600458e-07, "loss": 0.4666, "step": 29900 }, { "epoch": 0.3141657328962939, "grad_norm": 1.6176679134368896, "learning_rate": 7.936698770883404e-07, "loss": 0.4886, "step": 30000 }, { "epoch": 0.3152129520059482, "grad_norm": 2.4239096641540527, "learning_rate": 7.923097274274103e-07, "loss": 0.5085, "step": 30100 }, { "epoch": 0.3162601711156025, "grad_norm": 1.8292428255081177, "learning_rate": 7.909462835058169e-07, "loss": 0.538, "step": 30200 }, { "epoch": 0.31730739022525684, "grad_norm": 2.2372076511383057, "learning_rate": 7.895795606892466e-07, "loss": 0.5099, "step": 30300 }, { "epoch": 0.31835460933491116, "grad_norm": 1.9392811059951782, "learning_rate": 7.882095743803386e-07, "loss": 0.4947, "step": 30400 }, { "epoch": 0.31940182844456544, "grad_norm": 2.645183801651001, "learning_rate": 7.868363400185106e-07, "loss": 0.5012, "step": 30500 }, { "epoch": 0.32044904755421977, "grad_norm": 3.2452821731567383, "learning_rate": 7.85459873079785e-07, "loss": 0.4696, "step": 30600 }, { "epoch": 0.3214962666638741, "grad_norm": 1.310027003288269, "learning_rate": 7.84080189076615e-07, "loss": 0.5183, "step": 30700 }, { "epoch": 0.32254348577352837, "grad_norm": 2.6369211673736572, "learning_rate": 7.826973035577091e-07, "loss": 0.5135, "step": 30800 }, { "epoch": 0.3235907048831827, "grad_norm": 2.9246723651885986, "learning_rate": 7.813112321078559e-07, "loss": 0.527, "step": 30900 }, { "epoch": 0.32463792399283703, "grad_norm": 3.309020519256592, "learning_rate": 7.799219903477489e-07, "loss": 0.5322, "step": 31000 }, { "epoch": 0.32568514310249136, "grad_norm": 2.4480512142181396, "learning_rate": 7.785295939338105e-07, "loss": 0.5234, "step": 31100 }, { "epoch": 0.32673236221214563, "grad_norm": 1.7909550666809082, "learning_rate": 7.771340585580149e-07, "loss": 0.4938, "step": 31200 }, { "epoch": 0.32777958132179996, "grad_norm": 2.6975667476654053, "learning_rate": 7.757353999477114e-07, "loss": 0.491, "step": 31300 }, { "epoch": 0.3288268004314543, "grad_norm": 2.4480390548706055, "learning_rate": 7.743336338654483e-07, "loss": 0.538, "step": 31400 }, { "epoch": 0.32987401954110857, "grad_norm": 1.8292025327682495, "learning_rate": 7.729287761087935e-07, "loss": 0.4906, "step": 31500 }, { "epoch": 0.3309212386507629, "grad_norm": 1.5502568483352661, "learning_rate": 7.715208425101576e-07, "loss": 0.459, "step": 31600 }, { "epoch": 0.3319684577604172, "grad_norm": 2.6698973178863525, "learning_rate": 7.701098489366156e-07, "loss": 0.5086, "step": 31700 }, { "epoch": 0.3330156768700715, "grad_norm": 2.4431324005126953, "learning_rate": 7.686958112897271e-07, "loss": 0.4843, "step": 31800 }, { "epoch": 0.3340628959797258, "grad_norm": 2.875575065612793, "learning_rate": 7.67278745505358e-07, "loss": 0.5171, "step": 31900 }, { "epoch": 0.33511011508938016, "grad_norm": 2.196960210800171, "learning_rate": 7.658586675535005e-07, "loss": 0.5026, "step": 32000 }, { "epoch": 0.3361573341990345, "grad_norm": 2.801039457321167, "learning_rate": 7.644355934380933e-07, "loss": 0.5175, "step": 32100 }, { "epoch": 0.33720455330868876, "grad_norm": 2.4252429008483887, "learning_rate": 7.630095391968407e-07, "loss": 0.492, "step": 32200 }, { "epoch": 0.3382517724183431, "grad_norm": 1.9080466032028198, "learning_rate": 7.615805209010334e-07, "loss": 0.5203, "step": 32300 }, { "epoch": 0.3392989915279974, "grad_norm": 1.8371050357818604, "learning_rate": 7.601485546553647e-07, "loss": 0.5028, "step": 32400 }, { "epoch": 0.3403462106376517, "grad_norm": 3.5394959449768066, "learning_rate": 7.587136565977522e-07, "loss": 0.5203, "step": 32500 }, { "epoch": 0.341393429747306, "grad_norm": 2.381826400756836, "learning_rate": 7.572758428991532e-07, "loss": 0.5254, "step": 32600 }, { "epoch": 0.34244064885696035, "grad_norm": 1.7615987062454224, "learning_rate": 7.55835129763384e-07, "loss": 0.5091, "step": 32700 }, { "epoch": 0.3434878679666147, "grad_norm": 2.329334020614624, "learning_rate": 7.543915334269365e-07, "loss": 0.5004, "step": 32800 }, { "epoch": 0.34453508707626895, "grad_norm": 2.9679040908813477, "learning_rate": 7.529450701587963e-07, "loss": 0.5114, "step": 32900 }, { "epoch": 0.3455823061859233, "grad_norm": 3.3162288665771484, "learning_rate": 7.514957562602582e-07, "loss": 0.5055, "step": 33000 }, { "epoch": 0.3466295252955776, "grad_norm": 2.0709986686706543, "learning_rate": 7.500436080647428e-07, "loss": 0.5574, "step": 33100 }, { "epoch": 0.3476767444052319, "grad_norm": 2.1400296688079834, "learning_rate": 7.485886419376126e-07, "loss": 0.5777, "step": 33200 }, { "epoch": 0.3487239635148862, "grad_norm": 2.4479362964630127, "learning_rate": 7.471308742759879e-07, "loss": 0.5378, "step": 33300 }, { "epoch": 0.34977118262454054, "grad_norm": 2.2012875080108643, "learning_rate": 7.456703215085609e-07, "loss": 0.4941, "step": 33400 }, { "epoch": 0.3508184017341948, "grad_norm": 2.5233943462371826, "learning_rate": 7.44207000095412e-07, "loss": 0.547, "step": 33500 }, { "epoch": 0.35186562084384915, "grad_norm": 2.050294876098633, "learning_rate": 7.427409265278235e-07, "loss": 0.5326, "step": 33600 }, { "epoch": 0.3529128399535035, "grad_norm": 1.9416810274124146, "learning_rate": 7.412721173280931e-07, "loss": 0.5373, "step": 33700 }, { "epoch": 0.3539600590631578, "grad_norm": 2.4550209045410156, "learning_rate": 7.398005890493493e-07, "loss": 0.5025, "step": 33800 }, { "epoch": 0.3550072781728121, "grad_norm": 2.1860315799713135, "learning_rate": 7.383263582753633e-07, "loss": 0.4961, "step": 33900 }, { "epoch": 0.3560544972824664, "grad_norm": 3.3393681049346924, "learning_rate": 7.368494416203632e-07, "loss": 0.5014, "step": 34000 }, { "epoch": 0.35710171639212074, "grad_norm": 2.2855758666992188, "learning_rate": 7.353698557288462e-07, "loss": 0.5179, "step": 34100 }, { "epoch": 0.358148935501775, "grad_norm": 2.719910144805908, "learning_rate": 7.338876172753913e-07, "loss": 0.5151, "step": 34200 }, { "epoch": 0.35919615461142934, "grad_norm": 2.3122212886810303, "learning_rate": 7.324027429644709e-07, "loss": 0.5075, "step": 34300 }, { "epoch": 0.36024337372108367, "grad_norm": 2.5901198387145996, "learning_rate": 7.309152495302631e-07, "loss": 0.5185, "step": 34400 }, { "epoch": 0.361290592830738, "grad_norm": 2.749903440475464, "learning_rate": 7.294251537364629e-07, "loss": 0.4728, "step": 34500 }, { "epoch": 0.3623378119403923, "grad_norm": 2.453977108001709, "learning_rate": 7.279324723760932e-07, "loss": 0.5197, "step": 34600 }, { "epoch": 0.3633850310500466, "grad_norm": 3.2406835556030273, "learning_rate": 7.264372222713157e-07, "loss": 0.4856, "step": 34700 }, { "epoch": 0.36443225015970093, "grad_norm": 2.1802427768707275, "learning_rate": 7.249394202732414e-07, "loss": 0.4996, "step": 34800 }, { "epoch": 0.3654794692693552, "grad_norm": 1.560670256614685, "learning_rate": 7.234390832617399e-07, "loss": 0.5032, "step": 34900 }, { "epoch": 0.36652668837900954, "grad_norm": 2.8153815269470215, "learning_rate": 7.219362281452504e-07, "loss": 0.4882, "step": 35000 }, { "epoch": 0.36757390748866386, "grad_norm": 3.205367088317871, "learning_rate": 7.204308718605906e-07, "loss": 0.5232, "step": 35100 }, { "epoch": 0.36862112659831814, "grad_norm": 1.6098523139953613, "learning_rate": 7.189230313727651e-07, "loss": 0.488, "step": 35200 }, { "epoch": 0.36966834570797247, "grad_norm": 2.2674808502197266, "learning_rate": 7.174127236747756e-07, "loss": 0.5026, "step": 35300 }, { "epoch": 0.3707155648176268, "grad_norm": 2.0923283100128174, "learning_rate": 7.158999657874283e-07, "loss": 0.5292, "step": 35400 }, { "epoch": 0.3717627839272811, "grad_norm": 2.078521251678467, "learning_rate": 7.143847747591423e-07, "loss": 0.5002, "step": 35500 }, { "epoch": 0.3728100030369354, "grad_norm": 2.299473285675049, "learning_rate": 7.128671676657579e-07, "loss": 0.5132, "step": 35600 }, { "epoch": 0.37385722214658973, "grad_norm": 1.3978760242462158, "learning_rate": 7.113471616103441e-07, "loss": 0.5182, "step": 35700 }, { "epoch": 0.37490444125624406, "grad_norm": 2.559293746948242, "learning_rate": 7.098247737230052e-07, "loss": 0.5202, "step": 35800 }, { "epoch": 0.37595166036589833, "grad_norm": 2.457498788833618, "learning_rate": 7.083000211606881e-07, "loss": 0.4946, "step": 35900 }, { "epoch": 0.37699887947555266, "grad_norm": 1.9849262237548828, "learning_rate": 7.067729211069892e-07, "loss": 0.4932, "step": 36000 }, { "epoch": 0.378046098585207, "grad_norm": 2.242328405380249, "learning_rate": 7.05243490771961e-07, "loss": 0.4853, "step": 36100 }, { "epoch": 0.3790933176948613, "grad_norm": 4.18756103515625, "learning_rate": 7.037117473919169e-07, "loss": 0.5271, "step": 36200 }, { "epoch": 0.3801405368045156, "grad_norm": 2.454249382019043, "learning_rate": 7.021777082292384e-07, "loss": 0.5208, "step": 36300 }, { "epoch": 0.3811877559141699, "grad_norm": 1.5989599227905273, "learning_rate": 7.006413905721796e-07, "loss": 0.5252, "step": 36400 }, { "epoch": 0.38223497502382425, "grad_norm": 3.1384224891662598, "learning_rate": 6.991028117346727e-07, "loss": 0.5231, "step": 36500 }, { "epoch": 0.3832821941334785, "grad_norm": 3.674887180328369, "learning_rate": 6.975619890561331e-07, "loss": 0.5338, "step": 36600 }, { "epoch": 0.38432941324313286, "grad_norm": 2.8714184761047363, "learning_rate": 6.960189399012635e-07, "loss": 0.4667, "step": 36700 }, { "epoch": 0.3853766323527872, "grad_norm": 2.0271899700164795, "learning_rate": 6.944736816598585e-07, "loss": 0.5439, "step": 36800 }, { "epoch": 0.38642385146244146, "grad_norm": 2.3302154541015625, "learning_rate": 6.929262317466087e-07, "loss": 0.5085, "step": 36900 }, { "epoch": 0.3874710705720958, "grad_norm": 1.89630126953125, "learning_rate": 6.913766076009042e-07, "loss": 0.489, "step": 37000 }, { "epoch": 0.3885182896817501, "grad_norm": 3.864342212677002, "learning_rate": 6.898248266866383e-07, "loss": 0.4782, "step": 37100 }, { "epoch": 0.38956550879140445, "grad_norm": 3.6760518550872803, "learning_rate": 6.882709064920104e-07, "loss": 0.5387, "step": 37200 }, { "epoch": 0.3906127279010587, "grad_norm": 2.225639581680298, "learning_rate": 6.867148645293292e-07, "loss": 0.5417, "step": 37300 }, { "epoch": 0.39165994701071305, "grad_norm": 1.6425765752792358, "learning_rate": 6.85156718334815e-07, "loss": 0.501, "step": 37400 }, { "epoch": 0.3927071661203674, "grad_norm": 2.095388650894165, "learning_rate": 6.835964854684027e-07, "loss": 0.5244, "step": 37500 }, { "epoch": 0.39375438523002165, "grad_norm": 1.9956177473068237, "learning_rate": 6.820341835135434e-07, "loss": 0.4862, "step": 37600 }, { "epoch": 0.394801604339676, "grad_norm": 2.3689606189727783, "learning_rate": 6.804698300770058e-07, "loss": 0.5174, "step": 37700 }, { "epoch": 0.3958488234493303, "grad_norm": 2.4154350757598877, "learning_rate": 6.789034427886788e-07, "loss": 0.5232, "step": 37800 }, { "epoch": 0.39689604255898464, "grad_norm": 2.841860055923462, "learning_rate": 6.773350393013725e-07, "loss": 0.4952, "step": 37900 }, { "epoch": 0.3979432616686389, "grad_norm": 1.6685402393341064, "learning_rate": 6.757646372906183e-07, "loss": 0.5136, "step": 38000 }, { "epoch": 0.39899048077829324, "grad_norm": 2.3947384357452393, "learning_rate": 6.741922544544716e-07, "loss": 0.4728, "step": 38100 }, { "epoch": 0.4000376998879476, "grad_norm": 1.9924613237380981, "learning_rate": 6.726179085133102e-07, "loss": 0.5101, "step": 38200 }, { "epoch": 0.40108491899760185, "grad_norm": 2.3830676078796387, "learning_rate": 6.710416172096361e-07, "loss": 0.489, "step": 38300 }, { "epoch": 0.4021321381072562, "grad_norm": 2.6001055240631104, "learning_rate": 6.69463398307875e-07, "loss": 0.5337, "step": 38400 }, { "epoch": 0.4031793572169105, "grad_norm": 2.329277753829956, "learning_rate": 6.678832695941763e-07, "loss": 0.469, "step": 38500 }, { "epoch": 0.40422657632656483, "grad_norm": 2.2831122875213623, "learning_rate": 6.663012488762123e-07, "loss": 0.5279, "step": 38600 }, { "epoch": 0.4052737954362191, "grad_norm": 2.813821315765381, "learning_rate": 6.647173539829778e-07, "loss": 0.4873, "step": 38700 }, { "epoch": 0.40632101454587344, "grad_norm": 2.3835694789886475, "learning_rate": 6.631316027645892e-07, "loss": 0.4991, "step": 38800 }, { "epoch": 0.40736823365552777, "grad_norm": 2.7960257530212402, "learning_rate": 6.615440130920833e-07, "loss": 0.5366, "step": 38900 }, { "epoch": 0.40841545276518204, "grad_norm": 1.9220885038375854, "learning_rate": 6.599546028572153e-07, "loss": 0.5111, "step": 39000 }, { "epoch": 0.40946267187483637, "grad_norm": 2.636683464050293, "learning_rate": 6.583633899722587e-07, "loss": 0.5058, "step": 39100 }, { "epoch": 0.4105098909844907, "grad_norm": 2.0583505630493164, "learning_rate": 6.567703923698013e-07, "loss": 0.4796, "step": 39200 }, { "epoch": 0.411557110094145, "grad_norm": 3.092818021774292, "learning_rate": 6.551756280025453e-07, "loss": 0.5181, "step": 39300 }, { "epoch": 0.4126043292037993, "grad_norm": 2.689857006072998, "learning_rate": 6.535791148431031e-07, "loss": 0.5424, "step": 39400 }, { "epoch": 0.41365154831345363, "grad_norm": 1.4727122783660889, "learning_rate": 6.519808708837958e-07, "loss": 0.5257, "step": 39500 }, { "epoch": 0.41469876742310796, "grad_norm": 2.4704394340515137, "learning_rate": 6.503809141364506e-07, "loss": 0.5043, "step": 39600 }, { "epoch": 0.41574598653276224, "grad_norm": 2.2205686569213867, "learning_rate": 6.487792626321969e-07, "loss": 0.4732, "step": 39700 }, { "epoch": 0.41679320564241656, "grad_norm": 4.539642333984375, "learning_rate": 6.471759344212637e-07, "loss": 0.5028, "step": 39800 }, { "epoch": 0.4178404247520709, "grad_norm": 3.22900652885437, "learning_rate": 6.455709475727764e-07, "loss": 0.4802, "step": 39900 }, { "epoch": 0.41888764386172517, "grad_norm": 1.7866666316986084, "learning_rate": 6.439643201745524e-07, "loss": 0.4677, "step": 40000 }, { "epoch": 0.4199348629713795, "grad_norm": 1.5298930406570435, "learning_rate": 6.423560703328981e-07, "loss": 0.4663, "step": 40100 }, { "epoch": 0.4209820820810338, "grad_norm": 2.7381436824798584, "learning_rate": 6.407462161724042e-07, "loss": 0.5032, "step": 40200 }, { "epoch": 0.42202930119068816, "grad_norm": 1.915801763534546, "learning_rate": 6.391347758357418e-07, "loss": 0.4876, "step": 40300 }, { "epoch": 0.42307652030034243, "grad_norm": 2.128645658493042, "learning_rate": 6.375217674834578e-07, "loss": 0.4947, "step": 40400 }, { "epoch": 0.42412373940999676, "grad_norm": 2.3809661865234375, "learning_rate": 6.359072092937702e-07, "loss": 0.5207, "step": 40500 }, { "epoch": 0.4251709585196511, "grad_norm": 2.089869976043701, "learning_rate": 6.342911194623636e-07, "loss": 0.5179, "step": 40600 }, { "epoch": 0.42621817762930536, "grad_norm": 2.531280040740967, "learning_rate": 6.326735162021832e-07, "loss": 0.5003, "step": 40700 }, { "epoch": 0.4272653967389597, "grad_norm": 1.5095371007919312, "learning_rate": 6.310544177432308e-07, "loss": 0.475, "step": 40800 }, { "epoch": 0.428312615848614, "grad_norm": 3.487618923187256, "learning_rate": 6.294338423323584e-07, "loss": 0.5382, "step": 40900 }, { "epoch": 0.4293598349582683, "grad_norm": 3.1474342346191406, "learning_rate": 6.27811808233063e-07, "loss": 0.5147, "step": 41000 }, { "epoch": 0.4304070540679226, "grad_norm": 3.5564653873443604, "learning_rate": 6.261883337252808e-07, "loss": 0.5062, "step": 41100 }, { "epoch": 0.43145427317757695, "grad_norm": 2.47421932220459, "learning_rate": 6.245634371051808e-07, "loss": 0.5364, "step": 41200 }, { "epoch": 0.4325014922872313, "grad_norm": 1.5858722925186157, "learning_rate": 6.22937136684959e-07, "loss": 0.5319, "step": 41300 }, { "epoch": 0.43354871139688556, "grad_norm": 2.9193403720855713, "learning_rate": 6.21309450792632e-07, "loss": 0.486, "step": 41400 }, { "epoch": 0.4345959305065399, "grad_norm": 1.9017012119293213, "learning_rate": 6.1968039777183e-07, "loss": 0.5445, "step": 41500 }, { "epoch": 0.4356431496161942, "grad_norm": 2.5207788944244385, "learning_rate": 6.180499959815908e-07, "loss": 0.5274, "step": 41600 }, { "epoch": 0.4366903687258485, "grad_norm": 2.239696979522705, "learning_rate": 6.164182637961521e-07, "loss": 0.5056, "step": 41700 }, { "epoch": 0.4377375878355028, "grad_norm": 2.565997838973999, "learning_rate": 6.147852196047455e-07, "loss": 0.508, "step": 41800 }, { "epoch": 0.43878480694515715, "grad_norm": 1.4207922220230103, "learning_rate": 6.131508818113878e-07, "loss": 0.4964, "step": 41900 }, { "epoch": 0.4398320260548115, "grad_norm": 2.6042516231536865, "learning_rate": 6.11515268834675e-07, "loss": 0.5008, "step": 42000 }, { "epoch": 0.44087924516446575, "grad_norm": 2.077496290206909, "learning_rate": 6.098783991075736e-07, "loss": 0.4964, "step": 42100 }, { "epoch": 0.4419264642741201, "grad_norm": 2.444882392883301, "learning_rate": 6.082402910772137e-07, "loss": 0.493, "step": 42200 }, { "epoch": 0.4429736833837744, "grad_norm": 3.973526954650879, "learning_rate": 6.066009632046809e-07, "loss": 0.5078, "step": 42300 }, { "epoch": 0.4440209024934287, "grad_norm": 2.283217430114746, "learning_rate": 6.049604339648078e-07, "loss": 0.4756, "step": 42400 }, { "epoch": 0.445068121603083, "grad_norm": 1.3749598264694214, "learning_rate": 6.033187218459665e-07, "loss": 0.494, "step": 42500 }, { "epoch": 0.44611534071273734, "grad_norm": 3.739201068878174, "learning_rate": 6.016758453498592e-07, "loss": 0.4977, "step": 42600 }, { "epoch": 0.4471625598223916, "grad_norm": 2.5676069259643555, "learning_rate": 6.00031822991311e-07, "loss": 0.4691, "step": 42700 }, { "epoch": 0.44820977893204594, "grad_norm": 2.269869089126587, "learning_rate": 5.983866732980607e-07, "loss": 0.5088, "step": 42800 }, { "epoch": 0.4492569980417003, "grad_norm": 1.8404080867767334, "learning_rate": 5.96740414810551e-07, "loss": 0.4666, "step": 42900 }, { "epoch": 0.4503042171513546, "grad_norm": 2.3597822189331055, "learning_rate": 5.950930660817214e-07, "loss": 0.4976, "step": 43000 }, { "epoch": 0.4513514362610089, "grad_norm": 1.5849223136901855, "learning_rate": 5.934446456767977e-07, "loss": 0.5176, "step": 43100 }, { "epoch": 0.4523986553706632, "grad_norm": 1.3389567136764526, "learning_rate": 5.917951721730834e-07, "loss": 0.5244, "step": 43200 }, { "epoch": 0.45344587448031753, "grad_norm": 2.6399717330932617, "learning_rate": 5.901446641597498e-07, "loss": 0.5227, "step": 43300 }, { "epoch": 0.4544930935899718, "grad_norm": 2.2782344818115234, "learning_rate": 5.884931402376274e-07, "loss": 0.5351, "step": 43400 }, { "epoch": 0.45554031269962614, "grad_norm": 4.411149024963379, "learning_rate": 5.868406190189955e-07, "loss": 0.4855, "step": 43500 }, { "epoch": 0.45658753180928047, "grad_norm": 2.243643045425415, "learning_rate": 5.851871191273726e-07, "loss": 0.5299, "step": 43600 }, { "epoch": 0.4576347509189348, "grad_norm": 2.678518533706665, "learning_rate": 5.835326591973068e-07, "loss": 0.5615, "step": 43700 }, { "epoch": 0.45868197002858907, "grad_norm": 2.2850341796875, "learning_rate": 5.818772578741654e-07, "loss": 0.5314, "step": 43800 }, { "epoch": 0.4597291891382434, "grad_norm": 2.199620246887207, "learning_rate": 5.802209338139253e-07, "loss": 0.4905, "step": 43900 }, { "epoch": 0.46077640824789773, "grad_norm": 2.532054901123047, "learning_rate": 5.785637056829619e-07, "loss": 0.5143, "step": 44000 }, { "epoch": 0.461823627357552, "grad_norm": 1.9873905181884766, "learning_rate": 5.769055921578399e-07, "loss": 0.5128, "step": 44100 }, { "epoch": 0.46287084646720633, "grad_norm": 2.033123254776001, "learning_rate": 5.752466119251018e-07, "loss": 0.5027, "step": 44200 }, { "epoch": 0.46391806557686066, "grad_norm": 1.890243649482727, "learning_rate": 5.735867836810575e-07, "loss": 0.4893, "step": 44300 }, { "epoch": 0.46496528468651493, "grad_norm": 2.7789084911346436, "learning_rate": 5.719261261315742e-07, "loss": 0.4804, "step": 44400 }, { "epoch": 0.46601250379616926, "grad_norm": 2.320241928100586, "learning_rate": 5.702646579918651e-07, "loss": 0.4727, "step": 44500 }, { "epoch": 0.4670597229058236, "grad_norm": 2.557783603668213, "learning_rate": 5.686023979862784e-07, "loss": 0.4802, "step": 44600 }, { "epoch": 0.4681069420154779, "grad_norm": 2.0354034900665283, "learning_rate": 5.669393648480861e-07, "loss": 0.4409, "step": 44700 }, { "epoch": 0.4691541611251322, "grad_norm": 2.6490516662597656, "learning_rate": 5.652755773192742e-07, "loss": 0.5116, "step": 44800 }, { "epoch": 0.4702013802347865, "grad_norm": 1.9367735385894775, "learning_rate": 5.636110541503299e-07, "loss": 0.51, "step": 44900 }, { "epoch": 0.47124859934444085, "grad_norm": 2.3540682792663574, "learning_rate": 5.619458141000305e-07, "loss": 0.5053, "step": 45000 }, { "epoch": 0.47229581845409513, "grad_norm": 2.308772325515747, "learning_rate": 5.602798759352328e-07, "loss": 0.4857, "step": 45100 }, { "epoch": 0.47334303756374946, "grad_norm": 2.775662899017334, "learning_rate": 5.586132584306617e-07, "loss": 0.5039, "step": 45200 }, { "epoch": 0.4743902566734038, "grad_norm": 2.4968132972717285, "learning_rate": 5.569459803686971e-07, "loss": 0.5047, "step": 45300 }, { "epoch": 0.4754374757830581, "grad_norm": 2.3723912239074707, "learning_rate": 5.552780605391637e-07, "loss": 0.5022, "step": 45400 }, { "epoch": 0.4764846948927124, "grad_norm": 2.080238103866577, "learning_rate": 5.53609517739119e-07, "loss": 0.5139, "step": 45500 }, { "epoch": 0.4775319140023667, "grad_norm": 2.763566732406616, "learning_rate": 5.519403707726409e-07, "loss": 0.5269, "step": 45600 }, { "epoch": 0.47857913311202105, "grad_norm": 2.2503960132598877, "learning_rate": 5.502706384506162e-07, "loss": 0.5049, "step": 45700 }, { "epoch": 0.4796263522216753, "grad_norm": 2.2146077156066895, "learning_rate": 5.486003395905284e-07, "loss": 0.5164, "step": 45800 }, { "epoch": 0.48067357133132965, "grad_norm": 2.077916145324707, "learning_rate": 5.46929493016246e-07, "loss": 0.5436, "step": 45900 }, { "epoch": 0.481720790440984, "grad_norm": 2.990812301635742, "learning_rate": 5.452581175578099e-07, "loss": 0.4996, "step": 46000 }, { "epoch": 0.48276800955063826, "grad_norm": 2.3420207500457764, "learning_rate": 5.435862320512216e-07, "loss": 0.4886, "step": 46100 }, { "epoch": 0.4838152286602926, "grad_norm": 2.182870864868164, "learning_rate": 5.419138553382303e-07, "loss": 0.5081, "step": 46200 }, { "epoch": 0.4848624477699469, "grad_norm": 2.5916247367858887, "learning_rate": 5.402410062661217e-07, "loss": 0.4863, "step": 46300 }, { "epoch": 0.48590966687960124, "grad_norm": 2.3160765171051025, "learning_rate": 5.38567703687504e-07, "loss": 0.55, "step": 46400 }, { "epoch": 0.4869568859892555, "grad_norm": 3.3683152198791504, "learning_rate": 5.368939664600971e-07, "loss": 0.4838, "step": 46500 }, { "epoch": 0.48800410509890985, "grad_norm": 1.8857132196426392, "learning_rate": 5.352198134465188e-07, "loss": 0.5053, "step": 46600 }, { "epoch": 0.4890513242085642, "grad_norm": 2.4393274784088135, "learning_rate": 5.335452635140728e-07, "loss": 0.53, "step": 46700 }, { "epoch": 0.49009854331821845, "grad_norm": 2.8095269203186035, "learning_rate": 5.318703355345361e-07, "loss": 0.4955, "step": 46800 }, { "epoch": 0.4911457624278728, "grad_norm": 3.766524076461792, "learning_rate": 5.301950483839461e-07, "loss": 0.5033, "step": 46900 }, { "epoch": 0.4921929815375271, "grad_norm": 3.614816665649414, "learning_rate": 5.285194209423881e-07, "loss": 0.516, "step": 47000 }, { "epoch": 0.49324020064718144, "grad_norm": 2.2229409217834473, "learning_rate": 5.268434720937823e-07, "loss": 0.5158, "step": 47100 }, { "epoch": 0.4942874197568357, "grad_norm": 2.4111645221710205, "learning_rate": 5.251672207256708e-07, "loss": 0.5265, "step": 47200 }, { "epoch": 0.49533463886649004, "grad_norm": 1.9818792343139648, "learning_rate": 5.234906857290057e-07, "loss": 0.5059, "step": 47300 }, { "epoch": 0.49638185797614437, "grad_norm": 1.8921643495559692, "learning_rate": 5.218138859979349e-07, "loss": 0.5281, "step": 47400 }, { "epoch": 0.49742907708579864, "grad_norm": 2.3685996532440186, "learning_rate": 5.201368404295899e-07, "loss": 0.5257, "step": 47500 }, { "epoch": 0.498476296195453, "grad_norm": 3.2099828720092773, "learning_rate": 5.184595679238732e-07, "loss": 0.4806, "step": 47600 }, { "epoch": 0.4995235153051073, "grad_norm": 2.328226089477539, "learning_rate": 5.167820873832445e-07, "loss": 0.5496, "step": 47700 }, { "epoch": 0.5005707344147616, "grad_norm": 2.010138988494873, "learning_rate": 5.151044177125077e-07, "loss": 0.5025, "step": 47800 }, { "epoch": 0.501617953524416, "grad_norm": 2.0107200145721436, "learning_rate": 5.134265778185984e-07, "loss": 0.4695, "step": 47900 }, { "epoch": 0.5026651726340702, "grad_norm": 3.73002552986145, "learning_rate": 5.117485866103707e-07, "loss": 0.5489, "step": 48000 }, { "epoch": 0.5037123917437245, "grad_norm": 1.203131914138794, "learning_rate": 5.100704629983842e-07, "loss": 0.4918, "step": 48100 }, { "epoch": 0.5047596108533788, "grad_norm": 2.464951276779175, "learning_rate": 5.083922258946899e-07, "loss": 0.526, "step": 48200 }, { "epoch": 0.5058068299630332, "grad_norm": 2.5923502445220947, "learning_rate": 5.067138942126185e-07, "loss": 0.5094, "step": 48300 }, { "epoch": 0.5068540490726875, "grad_norm": 2.553731918334961, "learning_rate": 5.050354868665663e-07, "loss": 0.5116, "step": 48400 }, { "epoch": 0.5079012681823418, "grad_norm": 2.171161413192749, "learning_rate": 5.033570227717823e-07, "loss": 0.5021, "step": 48500 }, { "epoch": 0.5089484872919962, "grad_norm": 1.9675207138061523, "learning_rate": 5.016785208441553e-07, "loss": 0.4759, "step": 48600 }, { "epoch": 0.5099957064016504, "grad_norm": 2.772975206375122, "learning_rate": 5e-07, "loss": 0.504, "step": 48700 }, { "epoch": 0.5110429255113047, "grad_norm": 1.8081309795379639, "learning_rate": 4.983214791558449e-07, "loss": 0.4884, "step": 48800 }, { "epoch": 0.512090144620959, "grad_norm": 2.1011574268341064, "learning_rate": 4.966429772282177e-07, "loss": 0.5411, "step": 48900 }, { "epoch": 0.5131373637306134, "grad_norm": 1.7532665729522705, "learning_rate": 4.949645131334338e-07, "loss": 0.5217, "step": 49000 }, { "epoch": 0.5141845828402677, "grad_norm": 1.9248243570327759, "learning_rate": 4.932861057873817e-07, "loss": 0.5161, "step": 49100 }, { "epoch": 0.515231801949922, "grad_norm": 2.180882692337036, "learning_rate": 4.916077741053101e-07, "loss": 0.4977, "step": 49200 }, { "epoch": 0.5162790210595763, "grad_norm": 2.663121223449707, "learning_rate": 4.899295370016159e-07, "loss": 0.4918, "step": 49300 }, { "epoch": 0.5173262401692306, "grad_norm": 1.928085446357727, "learning_rate": 4.882514133896293e-07, "loss": 0.4863, "step": 49400 }, { "epoch": 0.5183734592788849, "grad_norm": 2.9963412284851074, "learning_rate": 4.865734221814016e-07, "loss": 0.5015, "step": 49500 }, { "epoch": 0.5194206783885392, "grad_norm": 2.45681095123291, "learning_rate": 4.848955822874924e-07, "loss": 0.5285, "step": 49600 }, { "epoch": 0.5204678974981936, "grad_norm": 1.8462231159210205, "learning_rate": 4.832179126167556e-07, "loss": 0.467, "step": 49700 }, { "epoch": 0.5215151166078479, "grad_norm": 2.27242374420166, "learning_rate": 4.815404320761267e-07, "loss": 0.4681, "step": 49800 }, { "epoch": 0.5225623357175022, "grad_norm": 2.18723201751709, "learning_rate": 4.7986315957041e-07, "loss": 0.5005, "step": 49900 }, { "epoch": 0.5236095548271564, "grad_norm": 3.0114426612854004, "learning_rate": 4.781861140020652e-07, "loss": 0.4861, "step": 50000 }, { "epoch": 0.5246567739368108, "grad_norm": 2.07069730758667, "learning_rate": 4.765093142709943e-07, "loss": 0.4648, "step": 50100 }, { "epoch": 0.5257039930464651, "grad_norm": 2.2993671894073486, "learning_rate": 4.7483277927432924e-07, "loss": 0.4835, "step": 50200 }, { "epoch": 0.5267512121561194, "grad_norm": 2.224874258041382, "learning_rate": 4.731565279062179e-07, "loss": 0.4642, "step": 50300 }, { "epoch": 0.5277984312657737, "grad_norm": 1.7376128435134888, "learning_rate": 4.7148057905761187e-07, "loss": 0.4883, "step": 50400 }, { "epoch": 0.5288456503754281, "grad_norm": 3.3602840900421143, "learning_rate": 4.698049516160539e-07, "loss": 0.4762, "step": 50500 }, { "epoch": 0.5298928694850824, "grad_norm": 1.7802869081497192, "learning_rate": 4.681296644654639e-07, "loss": 0.5264, "step": 50600 }, { "epoch": 0.5309400885947366, "grad_norm": 1.8603919744491577, "learning_rate": 4.6645473648592716e-07, "loss": 0.4902, "step": 50700 }, { "epoch": 0.531987307704391, "grad_norm": 2.204157590866089, "learning_rate": 4.647801865534813e-07, "loss": 0.4835, "step": 50800 }, { "epoch": 0.5330345268140453, "grad_norm": 1.2694624662399292, "learning_rate": 4.63106033539903e-07, "loss": 0.5238, "step": 50900 }, { "epoch": 0.5340817459236996, "grad_norm": 2.0624773502349854, "learning_rate": 4.6143229631249596e-07, "loss": 0.5033, "step": 51000 }, { "epoch": 0.5351289650333539, "grad_norm": 1.9012243747711182, "learning_rate": 4.597589937338784e-07, "loss": 0.5076, "step": 51100 }, { "epoch": 0.5361761841430083, "grad_norm": 2.1069536209106445, "learning_rate": 4.580861446617698e-07, "loss": 0.5171, "step": 51200 }, { "epoch": 0.5372234032526626, "grad_norm": 1.5368138551712036, "learning_rate": 4.564137679487785e-07, "loss": 0.4803, "step": 51300 }, { "epoch": 0.5382706223623168, "grad_norm": 1.5406559705734253, "learning_rate": 4.5474188244219006e-07, "loss": 0.4839, "step": 51400 }, { "epoch": 0.5393178414719711, "grad_norm": 1.4071673154830933, "learning_rate": 4.530705069837542e-07, "loss": 0.4764, "step": 51500 }, { "epoch": 0.5403650605816255, "grad_norm": 2.699596643447876, "learning_rate": 4.513996604094716e-07, "loss": 0.5177, "step": 51600 }, { "epoch": 0.5414122796912798, "grad_norm": 1.542262315750122, "learning_rate": 4.497293615493838e-07, "loss": 0.508, "step": 51700 }, { "epoch": 0.5424594988009341, "grad_norm": 3.0482521057128906, "learning_rate": 4.480596292273592e-07, "loss": 0.5303, "step": 51800 }, { "epoch": 0.5435067179105885, "grad_norm": 2.214055061340332, "learning_rate": 4.463904822608809e-07, "loss": 0.4843, "step": 51900 }, { "epoch": 0.5445539370202428, "grad_norm": 2.4003210067749023, "learning_rate": 4.4472193946083634e-07, "loss": 0.5024, "step": 52000 }, { "epoch": 0.545601156129897, "grad_norm": 2.2942888736724854, "learning_rate": 4.430540196313031e-07, "loss": 0.5073, "step": 52100 }, { "epoch": 0.5466483752395513, "grad_norm": 2.4813528060913086, "learning_rate": 4.413867415693383e-07, "loss": 0.5114, "step": 52200 }, { "epoch": 0.5476955943492057, "grad_norm": 1.8171602487564087, "learning_rate": 4.3972012406476715e-07, "loss": 0.4714, "step": 52300 }, { "epoch": 0.54874281345886, "grad_norm": 2.677717924118042, "learning_rate": 4.3805418589996967e-07, "loss": 0.5277, "step": 52400 }, { "epoch": 0.5497900325685143, "grad_norm": 2.815244674682617, "learning_rate": 4.363889458496701e-07, "loss": 0.4969, "step": 52500 }, { "epoch": 0.5508372516781687, "grad_norm": 2.719905376434326, "learning_rate": 4.347244226807257e-07, "loss": 0.494, "step": 52600 }, { "epoch": 0.551884470787823, "grad_norm": 2.277196168899536, "learning_rate": 4.3306063515191384e-07, "loss": 0.4989, "step": 52700 }, { "epoch": 0.5529316898974772, "grad_norm": 2.747807741165161, "learning_rate": 4.3139760201372166e-07, "loss": 0.475, "step": 52800 }, { "epoch": 0.5539789090071315, "grad_norm": 2.1879899501800537, "learning_rate": 4.29735342008135e-07, "loss": 0.4727, "step": 52900 }, { "epoch": 0.5550261281167859, "grad_norm": 1.5891708135604858, "learning_rate": 4.280738738684259e-07, "loss": 0.5209, "step": 53000 }, { "epoch": 0.5560733472264402, "grad_norm": 2.6258082389831543, "learning_rate": 4.2641321631894256e-07, "loss": 0.5146, "step": 53100 }, { "epoch": 0.5571205663360945, "grad_norm": 2.106497287750244, "learning_rate": 4.2475338807489825e-07, "loss": 0.5072, "step": 53200 }, { "epoch": 0.5581677854457489, "grad_norm": 1.3520596027374268, "learning_rate": 4.2309440784216014e-07, "loss": 0.5007, "step": 53300 }, { "epoch": 0.5592150045554031, "grad_norm": 2.2585766315460205, "learning_rate": 4.21436294317038e-07, "loss": 0.5661, "step": 53400 }, { "epoch": 0.5602622236650574, "grad_norm": 2.4655063152313232, "learning_rate": 4.1977906618607473e-07, "loss": 0.5057, "step": 53500 }, { "epoch": 0.5613094427747117, "grad_norm": 1.7120404243469238, "learning_rate": 4.181227421258344e-07, "loss": 0.4762, "step": 53600 }, { "epoch": 0.5623566618843661, "grad_norm": 2.365668535232544, "learning_rate": 4.164673408026932e-07, "loss": 0.5015, "step": 53700 }, { "epoch": 0.5634038809940204, "grad_norm": 2.5297205448150635, "learning_rate": 4.148128808726274e-07, "loss": 0.4789, "step": 53800 }, { "epoch": 0.5644511001036747, "grad_norm": 2.997265577316284, "learning_rate": 4.131593809810044e-07, "loss": 0.4841, "step": 53900 }, { "epoch": 0.565498319213329, "grad_norm": 2.2408447265625, "learning_rate": 4.1150685976237253e-07, "loss": 0.5194, "step": 54000 }, { "epoch": 0.5665455383229833, "grad_norm": 1.8267594575881958, "learning_rate": 4.098553358402503e-07, "loss": 0.4978, "step": 54100 }, { "epoch": 0.5675927574326376, "grad_norm": 3.2854866981506348, "learning_rate": 4.0820482782691666e-07, "loss": 0.499, "step": 54200 }, { "epoch": 0.5686399765422919, "grad_norm": 2.401383638381958, "learning_rate": 4.0655535432320225e-07, "loss": 0.539, "step": 54300 }, { "epoch": 0.5696871956519463, "grad_norm": 2.3308005332946777, "learning_rate": 4.0490693391827867e-07, "loss": 0.527, "step": 54400 }, { "epoch": 0.5707344147616006, "grad_norm": 2.6808366775512695, "learning_rate": 4.0325958518944893e-07, "loss": 0.4965, "step": 54500 }, { "epoch": 0.5717816338712549, "grad_norm": 2.82200026512146, "learning_rate": 4.016133267019394e-07, "loss": 0.5051, "step": 54600 }, { "epoch": 0.5728288529809092, "grad_norm": 3.023541212081909, "learning_rate": 3.99968177008689e-07, "loss": 0.4623, "step": 54700 }, { "epoch": 0.5738760720905635, "grad_norm": 2.405120372772217, "learning_rate": 3.983241546501408e-07, "loss": 0.5096, "step": 54800 }, { "epoch": 0.5749232912002178, "grad_norm": 1.9728878736495972, "learning_rate": 3.9668127815403353e-07, "loss": 0.5405, "step": 54900 }, { "epoch": 0.5759705103098721, "grad_norm": 3.312455415725708, "learning_rate": 3.950395660351922e-07, "loss": 0.5245, "step": 55000 }, { "epoch": 0.5770177294195264, "grad_norm": 1.9875174760818481, "learning_rate": 3.93399036795319e-07, "loss": 0.4863, "step": 55100 }, { "epoch": 0.5780649485291808, "grad_norm": 2.295588731765747, "learning_rate": 3.917597089227863e-07, "loss": 0.4868, "step": 55200 }, { "epoch": 0.5791121676388351, "grad_norm": 2.505709409713745, "learning_rate": 3.901216008924265e-07, "loss": 0.4955, "step": 55300 }, { "epoch": 0.5801593867484894, "grad_norm": 2.177341938018799, "learning_rate": 3.88484731165325e-07, "loss": 0.5103, "step": 55400 }, { "epoch": 0.5812066058581437, "grad_norm": 1.426915168762207, "learning_rate": 3.868491181886122e-07, "loss": 0.5235, "step": 55500 }, { "epoch": 0.582253824967798, "grad_norm": 2.258373498916626, "learning_rate": 3.852147803952545e-07, "loss": 0.4983, "step": 55600 }, { "epoch": 0.5833010440774523, "grad_norm": 2.660693645477295, "learning_rate": 3.835817362038477e-07, "loss": 0.5127, "step": 55700 }, { "epoch": 0.5843482631871066, "grad_norm": 2.2097291946411133, "learning_rate": 3.8195000401840927e-07, "loss": 0.5034, "step": 55800 }, { "epoch": 0.585395482296761, "grad_norm": 2.2298669815063477, "learning_rate": 3.803196022281701e-07, "loss": 0.4971, "step": 55900 }, { "epoch": 0.5864427014064153, "grad_norm": 2.1946804523468018, "learning_rate": 3.78690549207368e-07, "loss": 0.4942, "step": 56000 }, { "epoch": 0.5874899205160696, "grad_norm": 3.2329068183898926, "learning_rate": 3.77062863315041e-07, "loss": 0.513, "step": 56100 }, { "epoch": 0.5885371396257238, "grad_norm": 1.839722752571106, "learning_rate": 3.7543656289481927e-07, "loss": 0.5546, "step": 56200 }, { "epoch": 0.5895843587353782, "grad_norm": 2.5834665298461914, "learning_rate": 3.7381166627471914e-07, "loss": 0.4821, "step": 56300 }, { "epoch": 0.5906315778450325, "grad_norm": 2.00166916847229, "learning_rate": 3.7218819176693693e-07, "loss": 0.5187, "step": 56400 }, { "epoch": 0.5916787969546868, "grad_norm": 3.0043110847473145, "learning_rate": 3.7056615766764174e-07, "loss": 0.5227, "step": 56500 }, { "epoch": 0.5927260160643412, "grad_norm": 1.637872576713562, "learning_rate": 3.6894558225676924e-07, "loss": 0.4611, "step": 56600 }, { "epoch": 0.5937732351739955, "grad_norm": 2.64483904838562, "learning_rate": 3.6732648379781683e-07, "loss": 0.4792, "step": 56700 }, { "epoch": 0.5948204542836497, "grad_norm": 1.7451013326644897, "learning_rate": 3.657088805376366e-07, "loss": 0.5322, "step": 56800 }, { "epoch": 0.595867673393304, "grad_norm": 2.465116500854492, "learning_rate": 3.640927907062297e-07, "loss": 0.4657, "step": 56900 }, { "epoch": 0.5969148925029584, "grad_norm": 3.788491725921631, "learning_rate": 3.624782325165421e-07, "loss": 0.4855, "step": 57000 }, { "epoch": 0.5979621116126127, "grad_norm": 2.519657850265503, "learning_rate": 3.6086522416425823e-07, "loss": 0.5125, "step": 57100 }, { "epoch": 0.599009330722267, "grad_norm": 1.8677030801773071, "learning_rate": 3.5925378382759577e-07, "loss": 0.498, "step": 57200 }, { "epoch": 0.6000565498319214, "grad_norm": 1.9577298164367676, "learning_rate": 3.57643929667102e-07, "loss": 0.4792, "step": 57300 }, { "epoch": 0.6011037689415757, "grad_norm": 2.364872932434082, "learning_rate": 3.560356798254477e-07, "loss": 0.4882, "step": 57400 }, { "epoch": 0.6021509880512299, "grad_norm": 2.4925103187561035, "learning_rate": 3.5442905242722365e-07, "loss": 0.4825, "step": 57500 }, { "epoch": 0.6031982071608842, "grad_norm": 2.7740890979766846, "learning_rate": 3.5282406557873635e-07, "loss": 0.5345, "step": 57600 }, { "epoch": 0.6042454262705386, "grad_norm": 1.0781739950180054, "learning_rate": 3.512207373678032e-07, "loss": 0.4665, "step": 57700 }, { "epoch": 0.6052926453801929, "grad_norm": 2.9016547203063965, "learning_rate": 3.496190858635494e-07, "loss": 0.4655, "step": 57800 }, { "epoch": 0.6063398644898472, "grad_norm": 0.917265772819519, "learning_rate": 3.480191291162041e-07, "loss": 0.4707, "step": 57900 }, { "epoch": 0.6073870835995016, "grad_norm": 1.5372905731201172, "learning_rate": 3.4642088515689695e-07, "loss": 0.4867, "step": 58000 }, { "epoch": 0.6084343027091559, "grad_norm": 1.8536443710327148, "learning_rate": 3.4482437199745463e-07, "loss": 0.4746, "step": 58100 }, { "epoch": 0.6094815218188101, "grad_norm": 2.8087878227233887, "learning_rate": 3.432296076301986e-07, "loss": 0.5529, "step": 58200 }, { "epoch": 0.6105287409284644, "grad_norm": 1.8362385034561157, "learning_rate": 3.416366100277414e-07, "loss": 0.4911, "step": 58300 }, { "epoch": 0.6115759600381188, "grad_norm": 1.9666386842727661, "learning_rate": 3.4004539714278457e-07, "loss": 0.4902, "step": 58400 }, { "epoch": 0.6126231791477731, "grad_norm": 1.745953917503357, "learning_rate": 3.3845598690791675e-07, "loss": 0.5204, "step": 58500 }, { "epoch": 0.6136703982574274, "grad_norm": 1.9354580640792847, "learning_rate": 3.368683972354108e-07, "loss": 0.4763, "step": 58600 }, { "epoch": 0.6147176173670817, "grad_norm": 2.232057809829712, "learning_rate": 3.3528264601702217e-07, "loss": 0.5116, "step": 58700 }, { "epoch": 0.6157648364767361, "grad_norm": 2.1513118743896484, "learning_rate": 3.336987511237877e-07, "loss": 0.539, "step": 58800 }, { "epoch": 0.6168120555863903, "grad_norm": 1.7164148092269897, "learning_rate": 3.321167304058238e-07, "loss": 0.4912, "step": 58900 }, { "epoch": 0.6178592746960446, "grad_norm": 2.390707015991211, "learning_rate": 3.305366016921249e-07, "loss": 0.5207, "step": 59000 }, { "epoch": 0.618906493805699, "grad_norm": 1.944360613822937, "learning_rate": 3.289583827903639e-07, "loss": 0.4786, "step": 59100 }, { "epoch": 0.6199537129153533, "grad_norm": 3.611234426498413, "learning_rate": 3.2738209148668996e-07, "loss": 0.5597, "step": 59200 }, { "epoch": 0.6210009320250076, "grad_norm": 2.125988245010376, "learning_rate": 3.2580774554552834e-07, "loss": 0.5064, "step": 59300 }, { "epoch": 0.6220481511346619, "grad_norm": 2.2751822471618652, "learning_rate": 3.242353627093817e-07, "loss": 0.4839, "step": 59400 }, { "epoch": 0.6230953702443163, "grad_norm": 2.4632444381713867, "learning_rate": 3.226649606986277e-07, "loss": 0.5085, "step": 59500 }, { "epoch": 0.6241425893539705, "grad_norm": 2.596140146255493, "learning_rate": 3.210965572113211e-07, "loss": 0.4834, "step": 59600 }, { "epoch": 0.6251898084636248, "grad_norm": 3.1402766704559326, "learning_rate": 3.195301699229943e-07, "loss": 0.4894, "step": 59700 }, { "epoch": 0.6262370275732791, "grad_norm": 1.3100465536117554, "learning_rate": 3.179658164864567e-07, "loss": 0.5371, "step": 59800 }, { "epoch": 0.6272842466829335, "grad_norm": 2.2746660709381104, "learning_rate": 3.164035145315971e-07, "loss": 0.4865, "step": 59900 }, { "epoch": 0.6283314657925878, "grad_norm": 2.2843546867370605, "learning_rate": 3.14843281665185e-07, "loss": 0.4958, "step": 60000 }, { "epoch": 0.6293786849022421, "grad_norm": 2.045327663421631, "learning_rate": 3.132851354706709e-07, "loss": 0.4747, "step": 60100 }, { "epoch": 0.6304259040118964, "grad_norm": 2.59464430809021, "learning_rate": 3.117290935079895e-07, "loss": 0.4927, "step": 60200 }, { "epoch": 0.6314731231215507, "grad_norm": 1.8439029455184937, "learning_rate": 3.1017517331336175e-07, "loss": 0.4829, "step": 60300 }, { "epoch": 0.632520342231205, "grad_norm": 2.155336618423462, "learning_rate": 3.0862339239909587e-07, "loss": 0.4764, "step": 60400 }, { "epoch": 0.6335675613408593, "grad_norm": 2.2298882007598877, "learning_rate": 3.070737682533913e-07, "loss": 0.5267, "step": 60500 }, { "epoch": 0.6346147804505137, "grad_norm": 1.9075183868408203, "learning_rate": 3.0552631834014153e-07, "loss": 0.5101, "step": 60600 }, { "epoch": 0.635661999560168, "grad_norm": 2.1493678092956543, "learning_rate": 3.039810600987367e-07, "loss": 0.455, "step": 60700 }, { "epoch": 0.6367092186698223, "grad_norm": 1.9552183151245117, "learning_rate": 3.024380109438669e-07, "loss": 0.511, "step": 60800 }, { "epoch": 0.6377564377794765, "grad_norm": 2.0828135013580322, "learning_rate": 3.0089718826532727e-07, "loss": 0.4816, "step": 60900 }, { "epoch": 0.6388036568891309, "grad_norm": 1.6887547969818115, "learning_rate": 2.9935860942782055e-07, "loss": 0.4874, "step": 61000 }, { "epoch": 0.6398508759987852, "grad_norm": 1.987060785293579, "learning_rate": 2.978222917707616e-07, "loss": 0.5237, "step": 61100 }, { "epoch": 0.6408980951084395, "grad_norm": 1.8471943140029907, "learning_rate": 2.9628825260808313e-07, "loss": 0.4864, "step": 61200 }, { "epoch": 0.6419453142180939, "grad_norm": 2.424875497817993, "learning_rate": 2.9475650922803907e-07, "loss": 0.4865, "step": 61300 }, { "epoch": 0.6429925333277482, "grad_norm": 1.9071121215820312, "learning_rate": 2.9322707889301066e-07, "loss": 0.5097, "step": 61400 }, { "epoch": 0.6440397524374025, "grad_norm": 1.9200624227523804, "learning_rate": 2.9169997883931205e-07, "loss": 0.4865, "step": 61500 }, { "epoch": 0.6450869715470567, "grad_norm": 1.8281010389328003, "learning_rate": 2.90175226276995e-07, "loss": 0.4923, "step": 61600 }, { "epoch": 0.6461341906567111, "grad_norm": 2.7019853591918945, "learning_rate": 2.886528383896559e-07, "loss": 0.4702, "step": 61700 }, { "epoch": 0.6471814097663654, "grad_norm": 1.542846918106079, "learning_rate": 2.87132832334242e-07, "loss": 0.5025, "step": 61800 }, { "epoch": 0.6482286288760197, "grad_norm": 3.2872512340545654, "learning_rate": 2.856152252408578e-07, "loss": 0.4896, "step": 61900 }, { "epoch": 0.6492758479856741, "grad_norm": 3.8048501014709473, "learning_rate": 2.841000342125719e-07, "loss": 0.4723, "step": 62000 }, { "epoch": 0.6503230670953284, "grad_norm": 2.0907108783721924, "learning_rate": 2.825872763252245e-07, "loss": 0.5326, "step": 62100 }, { "epoch": 0.6513702862049827, "grad_norm": 2.4722342491149902, "learning_rate": 2.81076968627235e-07, "loss": 0.4774, "step": 62200 }, { "epoch": 0.6524175053146369, "grad_norm": 2.449239492416382, "learning_rate": 2.7956912813940947e-07, "loss": 0.47, "step": 62300 }, { "epoch": 0.6534647244242913, "grad_norm": 2.0104002952575684, "learning_rate": 2.7806377185474953e-07, "loss": 0.5017, "step": 62400 }, { "epoch": 0.6545119435339456, "grad_norm": 2.3968191146850586, "learning_rate": 2.765609167382602e-07, "loss": 0.489, "step": 62500 }, { "epoch": 0.6555591626435999, "grad_norm": 2.0325634479522705, "learning_rate": 2.750605797267587e-07, "loss": 0.5153, "step": 62600 }, { "epoch": 0.6566063817532543, "grad_norm": 2.9563980102539062, "learning_rate": 2.7356277772868427e-07, "loss": 0.5121, "step": 62700 }, { "epoch": 0.6576536008629086, "grad_norm": 1.5260460376739502, "learning_rate": 2.7206752762390684e-07, "loss": 0.5009, "step": 62800 }, { "epoch": 0.6587008199725629, "grad_norm": 2.651346206665039, "learning_rate": 2.7057484626353717e-07, "loss": 0.4819, "step": 62900 }, { "epoch": 0.6597480390822171, "grad_norm": 2.392993927001953, "learning_rate": 2.69084750469737e-07, "loss": 0.4924, "step": 63000 }, { "epoch": 0.6607952581918715, "grad_norm": 2.065648078918457, "learning_rate": 2.6759725703552916e-07, "loss": 0.4576, "step": 63100 }, { "epoch": 0.6618424773015258, "grad_norm": 1.6166179180145264, "learning_rate": 2.661123827246088e-07, "loss": 0.5187, "step": 63200 }, { "epoch": 0.6628896964111801, "grad_norm": 2.0667145252227783, "learning_rate": 2.646301442711538e-07, "loss": 0.4963, "step": 63300 }, { "epoch": 0.6639369155208344, "grad_norm": 3.5013437271118164, "learning_rate": 2.6315055837963687e-07, "loss": 0.5027, "step": 63400 }, { "epoch": 0.6649841346304888, "grad_norm": 0.9413002133369446, "learning_rate": 2.616736417246368e-07, "loss": 0.4712, "step": 63500 }, { "epoch": 0.666031353740143, "grad_norm": 1.4072952270507812, "learning_rate": 2.601994109506508e-07, "loss": 0.4731, "step": 63600 }, { "epoch": 0.6670785728497973, "grad_norm": 2.4212138652801514, "learning_rate": 2.587278826719069e-07, "loss": 0.4828, "step": 63700 }, { "epoch": 0.6681257919594517, "grad_norm": 1.7635606527328491, "learning_rate": 2.5725907347217655e-07, "loss": 0.4863, "step": 63800 }, { "epoch": 0.669173011069106, "grad_norm": 2.0671000480651855, "learning_rate": 2.5579299990458785e-07, "loss": 0.4636, "step": 63900 }, { "epoch": 0.6702202301787603, "grad_norm": 2.378913402557373, "learning_rate": 2.5432967849143906e-07, "loss": 0.4766, "step": 64000 }, { "epoch": 0.6712674492884146, "grad_norm": 3.7450199127197266, "learning_rate": 2.528691257240122e-07, "loss": 0.5137, "step": 64100 }, { "epoch": 0.672314668398069, "grad_norm": 2.676037073135376, "learning_rate": 2.514113580623873e-07, "loss": 0.4933, "step": 64200 }, { "epoch": 0.6733618875077232, "grad_norm": 1.6275851726531982, "learning_rate": 2.499563919352572e-07, "loss": 0.5038, "step": 64300 }, { "epoch": 0.6744091066173775, "grad_norm": 2.475569009780884, "learning_rate": 2.485042437397418e-07, "loss": 0.4518, "step": 64400 }, { "epoch": 0.6754563257270318, "grad_norm": 3.2226366996765137, "learning_rate": 2.470549298412036e-07, "loss": 0.4634, "step": 64500 }, { "epoch": 0.6765035448366862, "grad_norm": 2.9092655181884766, "learning_rate": 2.456084665730634e-07, "loss": 0.4851, "step": 64600 }, { "epoch": 0.6775507639463405, "grad_norm": 1.9740290641784668, "learning_rate": 2.441648702366161e-07, "loss": 0.489, "step": 64700 }, { "epoch": 0.6785979830559948, "grad_norm": 2.2705118656158447, "learning_rate": 2.42724157100847e-07, "loss": 0.4918, "step": 64800 }, { "epoch": 0.6796452021656492, "grad_norm": 2.0279767513275146, "learning_rate": 2.4128634340224767e-07, "loss": 0.5309, "step": 64900 }, { "epoch": 0.6806924212753034, "grad_norm": 2.4952125549316406, "learning_rate": 2.3985144534463507e-07, "loss": 0.5253, "step": 65000 }, { "epoch": 0.6817396403849577, "grad_norm": 1.7526471614837646, "learning_rate": 2.3841947909896675e-07, "loss": 0.4919, "step": 65100 }, { "epoch": 0.682786859494612, "grad_norm": 2.78068208694458, "learning_rate": 2.369904608031591e-07, "loss": 0.4678, "step": 65200 }, { "epoch": 0.6838340786042664, "grad_norm": 1.9609248638153076, "learning_rate": 2.3556440656190675e-07, "loss": 0.5004, "step": 65300 }, { "epoch": 0.6848812977139207, "grad_norm": 1.8966784477233887, "learning_rate": 2.3414133244649965e-07, "loss": 0.4609, "step": 65400 }, { "epoch": 0.685928516823575, "grad_norm": 1.7883254289627075, "learning_rate": 2.3272125449464197e-07, "loss": 0.5053, "step": 65500 }, { "epoch": 0.6869757359332294, "grad_norm": 2.0737862586975098, "learning_rate": 2.3130418871027285e-07, "loss": 0.5126, "step": 65600 }, { "epoch": 0.6880229550428836, "grad_norm": 2.2858548164367676, "learning_rate": 2.2989015106338456e-07, "loss": 0.4954, "step": 65700 }, { "epoch": 0.6890701741525379, "grad_norm": 2.121546506881714, "learning_rate": 2.284791574898423e-07, "loss": 0.5017, "step": 65800 }, { "epoch": 0.6901173932621922, "grad_norm": 1.6191834211349487, "learning_rate": 2.270712238912067e-07, "loss": 0.4721, "step": 65900 }, { "epoch": 0.6911646123718466, "grad_norm": 2.482290506362915, "learning_rate": 2.2566636613455185e-07, "loss": 0.5003, "step": 66000 }, { "epoch": 0.6922118314815009, "grad_norm": 2.413865089416504, "learning_rate": 2.242646000522885e-07, "loss": 0.4864, "step": 66100 }, { "epoch": 0.6932590505911552, "grad_norm": 2.390326738357544, "learning_rate": 2.228659414419853e-07, "loss": 0.5155, "step": 66200 }, { "epoch": 0.6943062697008096, "grad_norm": 2.158834457397461, "learning_rate": 2.2147040606618956e-07, "loss": 0.4972, "step": 66300 }, { "epoch": 0.6953534888104638, "grad_norm": 2.767620086669922, "learning_rate": 2.2007800965225087e-07, "loss": 0.4651, "step": 66400 }, { "epoch": 0.6964007079201181, "grad_norm": 3.050821542739868, "learning_rate": 2.1868876789214418e-07, "loss": 0.5146, "step": 66500 }, { "epoch": 0.6974479270297724, "grad_norm": 2.7702839374542236, "learning_rate": 2.1730269644229104e-07, "loss": 0.5143, "step": 66600 }, { "epoch": 0.6984951461394268, "grad_norm": 2.543748140335083, "learning_rate": 2.159198109233849e-07, "loss": 0.5028, "step": 66700 }, { "epoch": 0.6995423652490811, "grad_norm": 3.739572048187256, "learning_rate": 2.1454012692021505e-07, "loss": 0.5471, "step": 66800 }, { "epoch": 0.7005895843587354, "grad_norm": 2.372471809387207, "learning_rate": 2.131636599814896e-07, "loss": 0.4978, "step": 66900 }, { "epoch": 0.7016368034683896, "grad_norm": 2.276508092880249, "learning_rate": 2.1179042561966154e-07, "loss": 0.5153, "step": 67000 }, { "epoch": 0.702684022578044, "grad_norm": 2.0715689659118652, "learning_rate": 2.1042043931075342e-07, "loss": 0.5127, "step": 67100 }, { "epoch": 0.7037312416876983, "grad_norm": 1.9307739734649658, "learning_rate": 2.0905371649418318e-07, "loss": 0.4746, "step": 67200 }, { "epoch": 0.7047784607973526, "grad_norm": 2.039501905441284, "learning_rate": 2.076902725725897e-07, "loss": 0.4952, "step": 67300 }, { "epoch": 0.705825679907007, "grad_norm": 2.397334575653076, "learning_rate": 2.063301229116597e-07, "loss": 0.4728, "step": 67400 }, { "epoch": 0.7068728990166613, "grad_norm": 3.5085904598236084, "learning_rate": 2.0497328283995425e-07, "loss": 0.5176, "step": 67500 }, { "epoch": 0.7079201181263156, "grad_norm": 2.772425651550293, "learning_rate": 2.0361976764873623e-07, "loss": 0.5159, "step": 67600 }, { "epoch": 0.7089673372359698, "grad_norm": 1.3938500881195068, "learning_rate": 2.0226959259179794e-07, "loss": 0.4949, "step": 67700 }, { "epoch": 0.7100145563456242, "grad_norm": 2.1697475910186768, "learning_rate": 2.0092277288528898e-07, "loss": 0.466, "step": 67800 }, { "epoch": 0.7110617754552785, "grad_norm": 1.512786865234375, "learning_rate": 1.995793237075452e-07, "loss": 0.5185, "step": 67900 }, { "epoch": 0.7121089945649328, "grad_norm": 1.7060164213180542, "learning_rate": 1.9823926019891724e-07, "loss": 0.4649, "step": 68000 }, { "epoch": 0.7131562136745871, "grad_norm": 2.2003238201141357, "learning_rate": 1.9690259746160005e-07, "loss": 0.4921, "step": 68100 }, { "epoch": 0.7142034327842415, "grad_norm": 2.538870096206665, "learning_rate": 1.9556935055946277e-07, "loss": 0.5164, "step": 68200 }, { "epoch": 0.7152506518938958, "grad_norm": 3.6677184104919434, "learning_rate": 1.9423953451787888e-07, "loss": 0.5299, "step": 68300 }, { "epoch": 0.71629787100355, "grad_norm": 1.810766339302063, "learning_rate": 1.929131643235569e-07, "loss": 0.4917, "step": 68400 }, { "epoch": 0.7173450901132044, "grad_norm": 1.973241925239563, "learning_rate": 1.9159025492437143e-07, "loss": 0.4827, "step": 68500 }, { "epoch": 0.7183923092228587, "grad_norm": 2.1515488624572754, "learning_rate": 1.9027082122919474e-07, "loss": 0.4748, "step": 68600 }, { "epoch": 0.719439528332513, "grad_norm": 1.521958827972412, "learning_rate": 1.8895487810772882e-07, "loss": 0.5087, "step": 68700 }, { "epoch": 0.7204867474421673, "grad_norm": 2.1833043098449707, "learning_rate": 1.876424403903376e-07, "loss": 0.4784, "step": 68800 }, { "epoch": 0.7215339665518217, "grad_norm": 2.8621373176574707, "learning_rate": 1.8633352286788011e-07, "loss": 0.5077, "step": 68900 }, { "epoch": 0.722581185661476, "grad_norm": 1.9079474210739136, "learning_rate": 1.8502814029154367e-07, "loss": 0.5052, "step": 69000 }, { "epoch": 0.7236284047711302, "grad_norm": 2.184054374694824, "learning_rate": 1.837263073726769e-07, "loss": 0.5109, "step": 69100 }, { "epoch": 0.7246756238807845, "grad_norm": 2.0883328914642334, "learning_rate": 1.824280387826258e-07, "loss": 0.4888, "step": 69200 }, { "epoch": 0.7257228429904389, "grad_norm": 2.368727207183838, "learning_rate": 1.8113334915256663e-07, "loss": 0.4963, "step": 69300 }, { "epoch": 0.7267700621000932, "grad_norm": 2.7945289611816406, "learning_rate": 1.7984225307334106e-07, "loss": 0.4927, "step": 69400 }, { "epoch": 0.7278172812097475, "grad_norm": 1.937376856803894, "learning_rate": 1.7855476509529337e-07, "loss": 0.4741, "step": 69500 }, { "epoch": 0.7288645003194019, "grad_norm": 3.4460761547088623, "learning_rate": 1.7727089972810505e-07, "loss": 0.569, "step": 69600 }, { "epoch": 0.7299117194290562, "grad_norm": 3.9340882301330566, "learning_rate": 1.7599067144063086e-07, "loss": 0.5028, "step": 69700 }, { "epoch": 0.7309589385387104, "grad_norm": 3.2756307125091553, "learning_rate": 1.7471409466073772e-07, "loss": 0.5238, "step": 69800 }, { "epoch": 0.7320061576483647, "grad_norm": 2.0363681316375732, "learning_rate": 1.7344118377514044e-07, "loss": 0.5528, "step": 69900 }, { "epoch": 0.7330533767580191, "grad_norm": 2.6508500576019287, "learning_rate": 1.7217195312923944e-07, "loss": 0.4733, "step": 70000 }, { "epoch": 0.7341005958676734, "grad_norm": 1.832088828086853, "learning_rate": 1.7090641702696102e-07, "loss": 0.4909, "step": 70100 }, { "epoch": 0.7351478149773277, "grad_norm": 2.644780158996582, "learning_rate": 1.6964458973059358e-07, "loss": 0.4928, "step": 70200 }, { "epoch": 0.7361950340869821, "grad_norm": 2.407883644104004, "learning_rate": 1.683864854606289e-07, "loss": 0.4497, "step": 70300 }, { "epoch": 0.7372422531966363, "grad_norm": 2.3634557723999023, "learning_rate": 1.6713211839560125e-07, "loss": 0.4738, "step": 70400 }, { "epoch": 0.7382894723062906, "grad_norm": 2.401092052459717, "learning_rate": 1.658815026719269e-07, "loss": 0.5084, "step": 70500 }, { "epoch": 0.7393366914159449, "grad_norm": 2.105447292327881, "learning_rate": 1.6463465238374568e-07, "loss": 0.4681, "step": 70600 }, { "epoch": 0.7403839105255993, "grad_norm": 2.5298540592193604, "learning_rate": 1.633915815827623e-07, "loss": 0.5149, "step": 70700 }, { "epoch": 0.7414311296352536, "grad_norm": 2.3362057209014893, "learning_rate": 1.621523042780868e-07, "loss": 0.5225, "step": 70800 }, { "epoch": 0.7424783487449079, "grad_norm": 3.7627904415130615, "learning_rate": 1.6091683443607767e-07, "loss": 0.4967, "step": 70900 }, { "epoch": 0.7435255678545623, "grad_norm": 2.4007790088653564, "learning_rate": 1.5968518598018483e-07, "loss": 0.4878, "step": 71000 }, { "epoch": 0.7445727869642165, "grad_norm": 2.1650781631469727, "learning_rate": 1.5845737279079118e-07, "loss": 0.502, "step": 71100 }, { "epoch": 0.7456200060738708, "grad_norm": 1.9574668407440186, "learning_rate": 1.5723340870505753e-07, "loss": 0.4843, "step": 71200 }, { "epoch": 0.7466672251835251, "grad_norm": 2.2389516830444336, "learning_rate": 1.5601330751676624e-07, "loss": 0.519, "step": 71300 }, { "epoch": 0.7477144442931795, "grad_norm": 1.7965580224990845, "learning_rate": 1.5479708297616567e-07, "loss": 0.4676, "step": 71400 }, { "epoch": 0.7487616634028338, "grad_norm": 2.057460069656372, "learning_rate": 1.5358474878981526e-07, "loss": 0.5106, "step": 71500 }, { "epoch": 0.7498088825124881, "grad_norm": 2.1372034549713135, "learning_rate": 1.5237631862043115e-07, "loss": 0.4786, "step": 71600 }, { "epoch": 0.7508561016221424, "grad_norm": 2.0700478553771973, "learning_rate": 1.5117180608673203e-07, "loss": 0.4855, "step": 71700 }, { "epoch": 0.7519033207317967, "grad_norm": 1.7832368612289429, "learning_rate": 1.4997122476328593e-07, "loss": 0.5188, "step": 71800 }, { "epoch": 0.752950539841451, "grad_norm": 3.6390135288238525, "learning_rate": 1.4877458818035705e-07, "loss": 0.5304, "step": 71900 }, { "epoch": 0.7539977589511053, "grad_norm": 3.022871732711792, "learning_rate": 1.4758190982375295e-07, "loss": 0.4648, "step": 72000 }, { "epoch": 0.7550449780607597, "grad_norm": 1.6055036783218384, "learning_rate": 1.463932031346739e-07, "loss": 0.5118, "step": 72100 }, { "epoch": 0.756092197170414, "grad_norm": 4.166171550750732, "learning_rate": 1.4520848150955912e-07, "loss": 0.4986, "step": 72200 }, { "epoch": 0.7571394162800683, "grad_norm": 3.3419265747070312, "learning_rate": 1.44027758299938e-07, "loss": 0.5049, "step": 72300 }, { "epoch": 0.7581866353897226, "grad_norm": 3.171034336090088, "learning_rate": 1.4285104681227854e-07, "loss": 0.5091, "step": 72400 }, { "epoch": 0.7592338544993769, "grad_norm": 2.6404178142547607, "learning_rate": 1.4167836030783752e-07, "loss": 0.5208, "step": 72500 }, { "epoch": 0.7602810736090312, "grad_norm": 2.8442752361297607, "learning_rate": 1.4050971200251115e-07, "loss": 0.475, "step": 72600 }, { "epoch": 0.7613282927186855, "grad_norm": 1.9694572687149048, "learning_rate": 1.3934511506668616e-07, "loss": 0.4477, "step": 72700 }, { "epoch": 0.7623755118283398, "grad_norm": 3.6044440269470215, "learning_rate": 1.3818458262509119e-07, "loss": 0.4972, "step": 72800 }, { "epoch": 0.7634227309379942, "grad_norm": 1.7680317163467407, "learning_rate": 1.3702812775664917e-07, "loss": 0.4964, "step": 72900 }, { "epoch": 0.7644699500476485, "grad_norm": 1.948326587677002, "learning_rate": 1.358757634943296e-07, "loss": 0.4733, "step": 73000 }, { "epoch": 0.7655171691573028, "grad_norm": 2.4567108154296875, "learning_rate": 1.3472750282500195e-07, "loss": 0.5247, "step": 73100 }, { "epoch": 0.766564388266957, "grad_norm": 1.3387149572372437, "learning_rate": 1.3358335868928906e-07, "loss": 0.4894, "step": 73200 }, { "epoch": 0.7676116073766114, "grad_norm": 1.793434977531433, "learning_rate": 1.3244334398142154e-07, "loss": 0.5103, "step": 73300 }, { "epoch": 0.7686588264862657, "grad_norm": 2.429433822631836, "learning_rate": 1.3130747154909227e-07, "loss": 0.5304, "step": 73400 }, { "epoch": 0.76970604559592, "grad_norm": 2.3653488159179688, "learning_rate": 1.3017575419331173e-07, "loss": 0.5092, "step": 73500 }, { "epoch": 0.7707532647055744, "grad_norm": 3.5659842491149902, "learning_rate": 1.2904820466826355e-07, "loss": 0.4835, "step": 73600 }, { "epoch": 0.7718004838152287, "grad_norm": 2.952862501144409, "learning_rate": 1.279248356811611e-07, "loss": 0.5015, "step": 73700 }, { "epoch": 0.7728477029248829, "grad_norm": 2.398303508758545, "learning_rate": 1.2680565989210385e-07, "loss": 0.4938, "step": 73800 }, { "epoch": 0.7738949220345372, "grad_norm": 2.317095994949341, "learning_rate": 1.2569068991393523e-07, "loss": 0.4617, "step": 73900 }, { "epoch": 0.7749421411441916, "grad_norm": 2.453432559967041, "learning_rate": 1.2457993831209989e-07, "loss": 0.5198, "step": 74000 }, { "epoch": 0.7759893602538459, "grad_norm": 1.8672329187393188, "learning_rate": 1.2347341760450263e-07, "loss": 0.4742, "step": 74100 }, { "epoch": 0.7770365793635002, "grad_norm": 3.076641798019409, "learning_rate": 1.223711402613669e-07, "loss": 0.4928, "step": 74200 }, { "epoch": 0.7780837984731546, "grad_norm": 2.7013864517211914, "learning_rate": 1.212731187050946e-07, "loss": 0.4565, "step": 74300 }, { "epoch": 0.7791310175828089, "grad_norm": 3.7489242553710938, "learning_rate": 1.2017936531012574e-07, "loss": 0.5017, "step": 74400 }, { "epoch": 0.7801782366924631, "grad_norm": 2.7046327590942383, "learning_rate": 1.1908989240279938e-07, "loss": 0.4551, "step": 74500 }, { "epoch": 0.7812254558021174, "grad_norm": 1.9993566274642944, "learning_rate": 1.1800471226121456e-07, "loss": 0.4742, "step": 74600 }, { "epoch": 0.7822726749117718, "grad_norm": 2.9598634243011475, "learning_rate": 1.1692383711509129e-07, "loss": 0.5121, "step": 74700 }, { "epoch": 0.7833198940214261, "grad_norm": 3.2795605659484863, "learning_rate": 1.158472791456342e-07, "loss": 0.5344, "step": 74800 }, { "epoch": 0.7843671131310804, "grad_norm": 1.8576877117156982, "learning_rate": 1.1477505048539387e-07, "loss": 0.4924, "step": 74900 }, { "epoch": 0.7854143322407348, "grad_norm": 1.8820946216583252, "learning_rate": 1.1370716321813029e-07, "loss": 0.4794, "step": 75000 }, { "epoch": 0.7864615513503891, "grad_norm": 3.3854475021362305, "learning_rate": 1.1264362937867784e-07, "loss": 0.4841, "step": 75100 }, { "epoch": 0.7875087704600433, "grad_norm": 3.2768609523773193, "learning_rate": 1.1158446095280821e-07, "loss": 0.4802, "step": 75200 }, { "epoch": 0.7885559895696976, "grad_norm": 2.02317476272583, "learning_rate": 1.1052966987709572e-07, "loss": 0.4762, "step": 75300 }, { "epoch": 0.789603208679352, "grad_norm": 2.08528208732605, "learning_rate": 1.0947926803878366e-07, "loss": 0.5083, "step": 75400 }, { "epoch": 0.7906504277890063, "grad_norm": 2.0258214473724365, "learning_rate": 1.0843326727564945e-07, "loss": 0.4927, "step": 75500 }, { "epoch": 0.7916976468986606, "grad_norm": 3.184265375137329, "learning_rate": 1.0739167937587079e-07, "loss": 0.5066, "step": 75600 }, { "epoch": 0.792744866008315, "grad_norm": 2.808084011077881, "learning_rate": 1.0635451607789469e-07, "loss": 0.5172, "step": 75700 }, { "epoch": 0.7937920851179693, "grad_norm": 2.172506332397461, "learning_rate": 1.0532178907030275e-07, "loss": 0.4797, "step": 75800 }, { "epoch": 0.7948393042276235, "grad_norm": 1.9276924133300781, "learning_rate": 1.0429350999168119e-07, "loss": 0.5057, "step": 75900 }, { "epoch": 0.7958865233372778, "grad_norm": 2.1610867977142334, "learning_rate": 1.0326969043048955e-07, "loss": 0.4964, "step": 76000 }, { "epoch": 0.7969337424469322, "grad_norm": 2.5907599925994873, "learning_rate": 1.0225034192492876e-07, "loss": 0.4886, "step": 76100 }, { "epoch": 0.7979809615565865, "grad_norm": 1.8623499870300293, "learning_rate": 1.0123547596281257e-07, "loss": 0.5151, "step": 76200 }, { "epoch": 0.7990281806662408, "grad_norm": 1.7319766283035278, "learning_rate": 1.0022510398143785e-07, "loss": 0.4983, "step": 76300 }, { "epoch": 0.8000753997758951, "grad_norm": 3.9193685054779053, "learning_rate": 9.921923736745452e-08, "loss": 0.5011, "step": 76400 }, { "epoch": 0.8011226188855495, "grad_norm": 1.8976281881332397, "learning_rate": 9.821788745673864e-08, "loss": 0.5036, "step": 76500 }, { "epoch": 0.8021698379952037, "grad_norm": 2.426635980606079, "learning_rate": 9.722106553426446e-08, "loss": 0.4993, "step": 76600 }, { "epoch": 0.803217057104858, "grad_norm": 1.929158329963684, "learning_rate": 9.622878283397596e-08, "loss": 0.515, "step": 76700 }, { "epoch": 0.8042642762145124, "grad_norm": 3.309342622756958, "learning_rate": 9.524105053866182e-08, "loss": 0.5395, "step": 76800 }, { "epoch": 0.8053114953241667, "grad_norm": 1.8991940021514893, "learning_rate": 9.425787977982869e-08, "loss": 0.5079, "step": 76900 }, { "epoch": 0.806358714433821, "grad_norm": 2.271533250808716, "learning_rate": 9.32792816375756e-08, "loss": 0.4579, "step": 77000 }, { "epoch": 0.8074059335434753, "grad_norm": 2.1554083824157715, "learning_rate": 9.230526714046944e-08, "loss": 0.4556, "step": 77100 }, { "epoch": 0.8084531526531297, "grad_norm": 1.8269262313842773, "learning_rate": 9.133584726542037e-08, "loss": 0.4883, "step": 77200 }, { "epoch": 0.8095003717627839, "grad_norm": 2.5304064750671387, "learning_rate": 9.037103293755849e-08, "loss": 0.4977, "step": 77300 }, { "epoch": 0.8105475908724382, "grad_norm": 2.8901185989379883, "learning_rate": 8.941083503011021e-08, "loss": 0.5063, "step": 77400 }, { "epoch": 0.8115948099820925, "grad_norm": 2.2524912357330322, "learning_rate": 8.845526436427625e-08, "loss": 0.5144, "step": 77500 }, { "epoch": 0.8126420290917469, "grad_norm": 2.046915292739868, "learning_rate": 8.750433170910915e-08, "loss": 0.4933, "step": 77600 }, { "epoch": 0.8136892482014012, "grad_norm": 2.644960641860962, "learning_rate": 8.655804778139247e-08, "loss": 0.4962, "step": 77700 }, { "epoch": 0.8147364673110555, "grad_norm": 2.299511432647705, "learning_rate": 8.561642324551954e-08, "loss": 0.4546, "step": 77800 }, { "epoch": 0.8157836864207098, "grad_norm": 2.5044310092926025, "learning_rate": 8.467946871337344e-08, "loss": 0.4768, "step": 77900 }, { "epoch": 0.8168309055303641, "grad_norm": 1.8609235286712646, "learning_rate": 8.374719474420749e-08, "loss": 0.4724, "step": 78000 }, { "epoch": 0.8178781246400184, "grad_norm": 1.9416966438293457, "learning_rate": 8.281961184452629e-08, "loss": 0.4956, "step": 78100 }, { "epoch": 0.8189253437496727, "grad_norm": 2.851625919342041, "learning_rate": 8.189673046796702e-08, "loss": 0.5068, "step": 78200 }, { "epoch": 0.8199725628593271, "grad_norm": 2.262005567550659, "learning_rate": 8.097856101518186e-08, "loss": 0.4846, "step": 78300 }, { "epoch": 0.8210197819689814, "grad_norm": 2.1528186798095703, "learning_rate": 8.00651138337209e-08, "loss": 0.4776, "step": 78400 }, { "epoch": 0.8220670010786357, "grad_norm": 2.505295991897583, "learning_rate": 7.915639921791511e-08, "loss": 0.5012, "step": 78500 }, { "epoch": 0.82311422018829, "grad_norm": 2.5964581966400146, "learning_rate": 7.825242740876081e-08, "loss": 0.5111, "step": 78600 }, { "epoch": 0.8241614392979443, "grad_norm": 2.3113765716552734, "learning_rate": 7.735320859380384e-08, "loss": 0.5262, "step": 78700 }, { "epoch": 0.8252086584075986, "grad_norm": 1.8016088008880615, "learning_rate": 7.645875290702519e-08, "loss": 0.4794, "step": 78800 }, { "epoch": 0.8262558775172529, "grad_norm": 2.7183265686035156, "learning_rate": 7.556907042872601e-08, "loss": 0.5013, "step": 78900 }, { "epoch": 0.8273030966269073, "grad_norm": 1.6194109916687012, "learning_rate": 7.46841711854152e-08, "loss": 0.4662, "step": 79000 }, { "epoch": 0.8283503157365616, "grad_norm": 1.8583705425262451, "learning_rate": 7.38040651496955e-08, "loss": 0.4602, "step": 79100 }, { "epoch": 0.8293975348462159, "grad_norm": 2.0989129543304443, "learning_rate": 7.292876224015082e-08, "loss": 0.4922, "step": 79200 }, { "epoch": 0.8304447539558701, "grad_norm": 2.0418784618377686, "learning_rate": 7.205827232123585e-08, "loss": 0.5032, "step": 79300 }, { "epoch": 0.8314919730655245, "grad_norm": 2.34555983543396, "learning_rate": 7.119260520316368e-08, "loss": 0.4912, "step": 79400 }, { "epoch": 0.8325391921751788, "grad_norm": 2.5016937255859375, "learning_rate": 7.033177064179507e-08, "loss": 0.4792, "step": 79500 }, { "epoch": 0.8335864112848331, "grad_norm": 2.4543182849884033, "learning_rate": 6.947577833852991e-08, "loss": 0.4713, "step": 79600 }, { "epoch": 0.8346336303944875, "grad_norm": 2.092000961303711, "learning_rate": 6.862463794019657e-08, "loss": 0.4607, "step": 79700 }, { "epoch": 0.8356808495041418, "grad_norm": 2.430490255355835, "learning_rate": 6.777835903894324e-08, "loss": 0.5018, "step": 79800 }, { "epoch": 0.8367280686137961, "grad_norm": 1.815276026725769, "learning_rate": 6.69369511721311e-08, "loss": 0.4967, "step": 79900 }, { "epoch": 0.8377752877234503, "grad_norm": 2.1097006797790527, "learning_rate": 6.610042382222497e-08, "loss": 0.4601, "step": 80000 }, { "epoch": 0.8388225068331047, "grad_norm": 3.367506504058838, "learning_rate": 6.526878641668798e-08, "loss": 0.4913, "step": 80100 }, { "epoch": 0.839869725942759, "grad_norm": 1.4861557483673096, "learning_rate": 6.444204832787486e-08, "loss": 0.485, "step": 80200 }, { "epoch": 0.8409169450524133, "grad_norm": 2.3718228340148926, "learning_rate": 6.362021887292578e-08, "loss": 0.4941, "step": 80300 }, { "epoch": 0.8419641641620677, "grad_norm": 2.2200145721435547, "learning_rate": 6.28033073136619e-08, "loss": 0.4928, "step": 80400 }, { "epoch": 0.843011383271722, "grad_norm": 2.4420855045318604, "learning_rate": 6.199132285648129e-08, "loss": 0.515, "step": 80500 }, { "epoch": 0.8440586023813763, "grad_norm": 2.225245714187622, "learning_rate": 6.118427465225418e-08, "loss": 0.5029, "step": 80600 }, { "epoch": 0.8451058214910305, "grad_norm": 2.7253527641296387, "learning_rate": 6.038217179622057e-08, "loss": 0.4898, "step": 80700 }, { "epoch": 0.8461530406006849, "grad_norm": 1.8062297105789185, "learning_rate": 5.958502332788806e-08, "loss": 0.5089, "step": 80800 }, { "epoch": 0.8472002597103392, "grad_norm": 3.0290756225585938, "learning_rate": 5.8792838230928734e-08, "loss": 0.4988, "step": 80900 }, { "epoch": 0.8482474788199935, "grad_norm": 2.042731523513794, "learning_rate": 5.800562543307913e-08, "loss": 0.493, "step": 81000 }, { "epoch": 0.8492946979296478, "grad_norm": 2.5578713417053223, "learning_rate": 5.722339380603908e-08, "loss": 0.475, "step": 81100 }, { "epoch": 0.8503419170393022, "grad_norm": 3.2866199016571045, "learning_rate": 5.6446152165371685e-08, "loss": 0.5102, "step": 81200 }, { "epoch": 0.8513891361489564, "grad_norm": 2.475862979888916, "learning_rate": 5.5673909270404495e-08, "loss": 0.4896, "step": 81300 }, { "epoch": 0.8524363552586107, "grad_norm": 4.128602027893066, "learning_rate": 5.490667382412978e-08, "loss": 0.4781, "step": 81400 }, { "epoch": 0.853483574368265, "grad_norm": 2.8154897689819336, "learning_rate": 5.414445447310745e-08, "loss": 0.5034, "step": 81500 }, { "epoch": 0.8545307934779194, "grad_norm": 2.5624399185180664, "learning_rate": 5.338725980736736e-08, "loss": 0.4997, "step": 81600 }, { "epoch": 0.8555780125875737, "grad_norm": 2.6771199703216553, "learning_rate": 5.263509836031193e-08, "loss": 0.5214, "step": 81700 }, { "epoch": 0.856625231697228, "grad_norm": 2.225013494491577, "learning_rate": 5.1887978608620596e-08, "loss": 0.4838, "step": 81800 }, { "epoch": 0.8576724508068824, "grad_norm": 2.8142294883728027, "learning_rate": 5.114590897215448e-08, "loss": 0.5037, "step": 81900 }, { "epoch": 0.8587196699165366, "grad_norm": 2.071779727935791, "learning_rate": 5.040889781386043e-08, "loss": 0.4689, "step": 82000 }, { "epoch": 0.8597668890261909, "grad_norm": 2.6963651180267334, "learning_rate": 4.9676953439677925e-08, "loss": 0.489, "step": 82100 }, { "epoch": 0.8608141081358452, "grad_norm": 2.4148457050323486, "learning_rate": 4.895008409844481e-08, "loss": 0.4816, "step": 82200 }, { "epoch": 0.8618613272454996, "grad_norm": 2.611649513244629, "learning_rate": 4.822829798180467e-08, "loss": 0.5531, "step": 82300 }, { "epoch": 0.8629085463551539, "grad_norm": 1.8031556606292725, "learning_rate": 4.751160322411418e-08, "loss": 0.454, "step": 82400 }, { "epoch": 0.8639557654648082, "grad_norm": 2.0377116203308105, "learning_rate": 4.680000790235178e-08, "loss": 0.5212, "step": 82500 }, { "epoch": 0.8650029845744626, "grad_norm": 1.7090651988983154, "learning_rate": 4.609352003602646e-08, "loss": 0.4721, "step": 82600 }, { "epoch": 0.8660502036841168, "grad_norm": 0.9355291724205017, "learning_rate": 4.5392147587087315e-08, "loss": 0.4535, "step": 82700 }, { "epoch": 0.8670974227937711, "grad_norm": 2.991403579711914, "learning_rate": 4.4695898459834016e-08, "loss": 0.5108, "step": 82800 }, { "epoch": 0.8681446419034254, "grad_norm": 2.0942938327789307, "learning_rate": 4.400478050082751e-08, "loss": 0.4919, "step": 82900 }, { "epoch": 0.8691918610130798, "grad_norm": 1.971248745918274, "learning_rate": 4.331880149880179e-08, "loss": 0.4981, "step": 83000 }, { "epoch": 0.8702390801227341, "grad_norm": 2.0472984313964844, "learning_rate": 4.263796918457613e-08, "loss": 0.4663, "step": 83100 }, { "epoch": 0.8712862992323884, "grad_norm": 2.9207637310028076, "learning_rate": 4.196229123096762e-08, "loss": 0.4723, "step": 83200 }, { "epoch": 0.8723335183420428, "grad_norm": 2.6545724868774414, "learning_rate": 4.129177525270511e-08, "loss": 0.5042, "step": 83300 }, { "epoch": 0.873380737451697, "grad_norm": 2.008007526397705, "learning_rate": 4.0626428806343205e-08, "loss": 0.4904, "step": 83400 }, { "epoch": 0.8744279565613513, "grad_norm": 1.2464555501937866, "learning_rate": 3.996625939017711e-08, "loss": 0.5248, "step": 83500 }, { "epoch": 0.8754751756710056, "grad_norm": 3.1436216831207275, "learning_rate": 3.9311274444158106e-08, "loss": 0.4924, "step": 83600 }, { "epoch": 0.87652239478066, "grad_norm": 3.0234928131103516, "learning_rate": 3.8661481349809786e-08, "loss": 0.493, "step": 83700 }, { "epoch": 0.8775696138903143, "grad_norm": 2.1175239086151123, "learning_rate": 3.8016887430144754e-08, "loss": 0.4933, "step": 83800 }, { "epoch": 0.8786168329999686, "grad_norm": 2.497673749923706, "learning_rate": 3.737749994958228e-08, "loss": 0.5146, "step": 83900 }, { "epoch": 0.879664052109623, "grad_norm": 1.5378285646438599, "learning_rate": 3.674332611386616e-08, "loss": 0.4628, "step": 84000 }, { "epoch": 0.8807112712192772, "grad_norm": 3.481321334838867, "learning_rate": 3.6114373069983885e-08, "loss": 0.513, "step": 84100 }, { "epoch": 0.8817584903289315, "grad_norm": 3.8998842239379883, "learning_rate": 3.549064790608536e-08, "loss": 0.5157, "step": 84200 }, { "epoch": 0.8828057094385858, "grad_norm": 4.254595756530762, "learning_rate": 3.487215765140422e-08, "loss": 0.503, "step": 84300 }, { "epoch": 0.8838529285482402, "grad_norm": 1.633023977279663, "learning_rate": 3.4258909276177584e-08, "loss": 0.4763, "step": 84400 }, { "epoch": 0.8849001476578945, "grad_norm": 2.1271402835845947, "learning_rate": 3.365090969156764e-08, "loss": 0.514, "step": 84500 }, { "epoch": 0.8859473667675488, "grad_norm": 2.325639009475708, "learning_rate": 3.304816574958441e-08, "loss": 0.5295, "step": 84600 }, { "epoch": 0.886994585877203, "grad_norm": 3.336534261703491, "learning_rate": 3.2450684243007786e-08, "loss": 0.498, "step": 84700 }, { "epoch": 0.8880418049868574, "grad_norm": 2.818937301635742, "learning_rate": 3.185847190531121e-08, "loss": 0.4621, "step": 84800 }, { "epoch": 0.8890890240965117, "grad_norm": 2.3609235286712646, "learning_rate": 3.1271535410586136e-08, "loss": 0.4536, "step": 84900 }, { "epoch": 0.890136243206166, "grad_norm": 2.134856939315796, "learning_rate": 3.06898813734664e-08, "loss": 0.4955, "step": 85000 }, { "epoch": 0.8911834623158204, "grad_norm": 2.349867105484009, "learning_rate": 3.011351634905357e-08, "loss": 0.5, "step": 85100 }, { "epoch": 0.8922306814254747, "grad_norm": 2.3223259449005127, "learning_rate": 2.9542446832843793e-08, "loss": 0.5176, "step": 85200 }, { "epoch": 0.893277900535129, "grad_norm": 2.8934836387634277, "learning_rate": 2.8976679260653613e-08, "loss": 0.5069, "step": 85300 }, { "epoch": 0.8943251196447832, "grad_norm": 2.5627784729003906, "learning_rate": 2.8416220008548152e-08, "loss": 0.5019, "step": 85400 }, { "epoch": 0.8953723387544376, "grad_norm": 4.0183796882629395, "learning_rate": 2.7861075392769275e-08, "loss": 0.4907, "step": 85500 }, { "epoch": 0.8964195578640919, "grad_norm": 2.2696878910064697, "learning_rate": 2.7311251669663692e-08, "loss": 0.4785, "step": 85600 }, { "epoch": 0.8974667769737462, "grad_norm": 2.5743296146392822, "learning_rate": 2.6766755035613155e-08, "loss": 0.4707, "step": 85700 }, { "epoch": 0.8985139960834005, "grad_norm": 2.059088945388794, "learning_rate": 2.622759162696464e-08, "loss": 0.5246, "step": 85800 }, { "epoch": 0.8995612151930549, "grad_norm": 1.2305697202682495, "learning_rate": 2.5693767519960496e-08, "loss": 0.4841, "step": 85900 }, { "epoch": 0.9006084343027092, "grad_norm": 3.181995153427124, "learning_rate": 2.5165288730670585e-08, "loss": 0.4882, "step": 86000 }, { "epoch": 0.9016556534123634, "grad_norm": 2.311540365219116, "learning_rate": 2.464216121492463e-08, "loss": 0.4918, "step": 86100 }, { "epoch": 0.9027028725220178, "grad_norm": 1.5216143131256104, "learning_rate": 2.412439086824436e-08, "loss": 0.4877, "step": 86200 }, { "epoch": 0.9037500916316721, "grad_norm": 1.816412091255188, "learning_rate": 2.361198352577759e-08, "loss": 0.495, "step": 86300 }, { "epoch": 0.9047973107413264, "grad_norm": 1.8467931747436523, "learning_rate": 2.310494496223253e-08, "loss": 0.517, "step": 86400 }, { "epoch": 0.9058445298509807, "grad_norm": 1.95524001121521, "learning_rate": 2.260328089181246e-08, "loss": 0.4702, "step": 86500 }, { "epoch": 0.9068917489606351, "grad_norm": 2.4727303981781006, "learning_rate": 2.210699696815127e-08, "loss": 0.498, "step": 86600 }, { "epoch": 0.9079389680702894, "grad_norm": 3.1941773891448975, "learning_rate": 2.1616098784250082e-08, "loss": 0.4655, "step": 86700 }, { "epoch": 0.9089861871799436, "grad_norm": 3.8430733680725098, "learning_rate": 2.1130591872413837e-08, "loss": 0.5178, "step": 86800 }, { "epoch": 0.910033406289598, "grad_norm": 1.787541151046753, "learning_rate": 2.0650481704189315e-08, "loss": 0.4858, "step": 86900 }, { "epoch": 0.9110806253992523, "grad_norm": 1.8147176504135132, "learning_rate": 2.017577369030321e-08, "loss": 0.4997, "step": 87000 }, { "epoch": 0.9121278445089066, "grad_norm": 2.207904100418091, "learning_rate": 1.9706473180601145e-08, "loss": 0.4998, "step": 87100 }, { "epoch": 0.9131750636185609, "grad_norm": 2.220478057861328, "learning_rate": 1.9242585463987548e-08, "loss": 0.4939, "step": 87200 }, { "epoch": 0.9142222827282153, "grad_norm": 2.459459066390991, "learning_rate": 1.878411576836597e-08, "loss": 0.5106, "step": 87300 }, { "epoch": 0.9152695018378696, "grad_norm": 1.8161354064941406, "learning_rate": 1.8331069260580147e-08, "loss": 0.4519, "step": 87400 }, { "epoch": 0.9163167209475238, "grad_norm": 2.2104363441467285, "learning_rate": 1.78834510463558e-08, "loss": 0.4841, "step": 87500 }, { "epoch": 0.9173639400571781, "grad_norm": 3.3614344596862793, "learning_rate": 1.744126617024305e-08, "loss": 0.4699, "step": 87600 }, { "epoch": 0.9184111591668325, "grad_norm": 1.9489402770996094, "learning_rate": 1.70045196155596e-08, "loss": 0.4884, "step": 87700 }, { "epoch": 0.9194583782764868, "grad_norm": 2.2660348415374756, "learning_rate": 1.6573216304334615e-08, "loss": 0.4971, "step": 87800 }, { "epoch": 0.9205055973861411, "grad_norm": 1.9117883443832397, "learning_rate": 1.6147361097253122e-08, "loss": 0.5133, "step": 87900 }, { "epoch": 0.9215528164957955, "grad_norm": 2.3087127208709717, "learning_rate": 1.5726958793601476e-08, "loss": 0.481, "step": 88000 }, { "epoch": 0.9226000356054497, "grad_norm": 2.1353018283843994, "learning_rate": 1.5312014131212914e-08, "loss": 0.4618, "step": 88100 }, { "epoch": 0.923647254715104, "grad_norm": 2.694920778274536, "learning_rate": 1.4902531786414542e-08, "loss": 0.4633, "step": 88200 }, { "epoch": 0.9246944738247583, "grad_norm": 2.070590019226074, "learning_rate": 1.4498516373974312e-08, "loss": 0.5069, "step": 88300 }, { "epoch": 0.9257416929344127, "grad_norm": 1.7129287719726562, "learning_rate": 1.4099972447049246e-08, "loss": 0.479, "step": 88400 }, { "epoch": 0.926788912044067, "grad_norm": 2.0258448123931885, "learning_rate": 1.3706904497133964e-08, "loss": 0.5026, "step": 88500 }, { "epoch": 0.9278361311537213, "grad_norm": 2.2771730422973633, "learning_rate": 1.331931695401034e-08, "loss": 0.4739, "step": 88600 }, { "epoch": 0.9288833502633757, "grad_norm": 2.1517481803894043, "learning_rate": 1.2937214185696988e-08, "loss": 0.5027, "step": 88700 }, { "epoch": 0.9299305693730299, "grad_norm": 2.0524544715881348, "learning_rate": 1.2560600498400852e-08, "loss": 0.459, "step": 88800 }, { "epoch": 0.9309777884826842, "grad_norm": 2.0591094493865967, "learning_rate": 1.2189480136467978e-08, "loss": 0.512, "step": 88900 }, { "epoch": 0.9320250075923385, "grad_norm": 1.7868990898132324, "learning_rate": 1.1823857282335869e-08, "loss": 0.4755, "step": 89000 }, { "epoch": 0.9330722267019929, "grad_norm": 2.4516055583953857, "learning_rate": 1.146373605648676e-08, "loss": 0.5004, "step": 89100 }, { "epoch": 0.9341194458116472, "grad_norm": 2.602165699005127, "learning_rate": 1.1109120517400704e-08, "loss": 0.5163, "step": 89200 }, { "epoch": 0.9351666649213015, "grad_norm": 4.763970851898193, "learning_rate": 1.076001466150972e-08, "loss": 0.5095, "step": 89300 }, { "epoch": 0.9362138840309558, "grad_norm": 2.463984966278076, "learning_rate": 1.0416422423153547e-08, "loss": 0.5034, "step": 89400 }, { "epoch": 0.9372611031406101, "grad_norm": 2.4041192531585693, "learning_rate": 1.0078347674534194e-08, "loss": 0.4741, "step": 89500 }, { "epoch": 0.9383083222502644, "grad_norm": 3.2481226921081543, "learning_rate": 9.745794225673288e-09, "loss": 0.5558, "step": 89600 }, { "epoch": 0.9393555413599187, "grad_norm": 2.0538644790649414, "learning_rate": 9.418765824368625e-09, "loss": 0.5126, "step": 89700 }, { "epoch": 0.940402760469573, "grad_norm": 3.1280417442321777, "learning_rate": 9.097266156151972e-09, "loss": 0.4813, "step": 89800 }, { "epoch": 0.9414499795792274, "grad_norm": 2.6181859970092773, "learning_rate": 8.781298844247608e-09, "loss": 0.4985, "step": 89900 }, { "epoch": 0.9424971986888817, "grad_norm": 2.8424460887908936, "learning_rate": 8.470867449531627e-09, "loss": 0.5032, "step": 90000 }, { "epoch": 0.943544417798536, "grad_norm": 1.8021912574768066, "learning_rate": 8.165975470491416e-09, "loss": 0.5082, "step": 90100 }, { "epoch": 0.9445916369081903, "grad_norm": 2.1348044872283936, "learning_rate": 7.866626343186577e-09, "loss": 0.4811, "step": 90200 }, { "epoch": 0.9456388560178446, "grad_norm": 1.665382981300354, "learning_rate": 7.572823441210353e-09, "loss": 0.5137, "step": 90300 }, { "epoch": 0.9466860751274989, "grad_norm": 1.782528281211853, "learning_rate": 7.284570075650864e-09, "loss": 0.4861, "step": 90400 }, { "epoch": 0.9477332942371532, "grad_norm": 2.0802054405212402, "learning_rate": 7.001869495054713e-09, "loss": 0.5201, "step": 90500 }, { "epoch": 0.9487805133468076, "grad_norm": 2.515943765640259, "learning_rate": 6.724724885389721e-09, "loss": 0.4863, "step": 90600 }, { "epoch": 0.9498277324564619, "grad_norm": 1.7922004461288452, "learning_rate": 6.4531393700092415e-09, "loss": 0.4858, "step": 90700 }, { "epoch": 0.9508749515661162, "grad_norm": 1.5402792692184448, "learning_rate": 6.187116009617188e-09, "loss": 0.5174, "step": 90800 }, { "epoch": 0.9519221706757705, "grad_norm": 2.370882987976074, "learning_rate": 5.926657802233004e-09, "loss": 0.5299, "step": 90900 }, { "epoch": 0.9529693897854248, "grad_norm": 2.1812610626220703, "learning_rate": 5.671767683158357e-09, "loss": 0.5078, "step": 91000 }, { "epoch": 0.9540166088950791, "grad_norm": 1.9076416492462158, "learning_rate": 5.422448524944057e-09, "loss": 0.4871, "step": 91100 }, { "epoch": 0.9550638280047334, "grad_norm": 2.5718798637390137, "learning_rate": 5.1787031373571326e-09, "loss": 0.5, "step": 91200 }, { "epoch": 0.9561110471143878, "grad_norm": 1.7200427055358887, "learning_rate": 4.940534267349861e-09, "loss": 0.4824, "step": 91300 }, { "epoch": 0.9571582662240421, "grad_norm": 2.0528995990753174, "learning_rate": 4.7079445990284015e-09, "loss": 0.4893, "step": 91400 }, { "epoch": 0.9582054853336963, "grad_norm": 2.170036554336548, "learning_rate": 4.4809367536226e-09, "loss": 0.5468, "step": 91500 }, { "epoch": 0.9592527044433506, "grad_norm": 2.4191830158233643, "learning_rate": 4.2595132894565625e-09, "loss": 0.496, "step": 91600 }, { "epoch": 0.960299923553005, "grad_norm": 3.8748281002044678, "learning_rate": 4.043676701919741e-09, "loss": 0.52, "step": 91700 }, { "epoch": 0.9613471426626593, "grad_norm": 2.9865217208862305, "learning_rate": 3.833429423438838e-09, "loss": 0.4729, "step": 91800 }, { "epoch": 0.9623943617723136, "grad_norm": 3.5876505374908447, "learning_rate": 3.628773823450337e-09, "loss": 0.4557, "step": 91900 }, { "epoch": 0.963441580881968, "grad_norm": 2.007694959640503, "learning_rate": 3.429712208373847e-09, "loss": 0.5197, "step": 92000 }, { "epoch": 0.9644887999916223, "grad_norm": 1.564520239830017, "learning_rate": 3.2362468215861306e-09, "loss": 0.4519, "step": 92100 }, { "epoch": 0.9655360191012765, "grad_norm": 2.6633753776550293, "learning_rate": 3.0483798433957876e-09, "loss": 0.5247, "step": 92200 }, { "epoch": 0.9665832382109308, "grad_norm": 2.7909083366394043, "learning_rate": 2.8661133910187206e-09, "loss": 0.4981, "step": 92300 }, { "epoch": 0.9676304573205852, "grad_norm": 2.7965500354766846, "learning_rate": 2.68944951855421e-09, "loss": 0.4982, "step": 92400 }, { "epoch": 0.9686776764302395, "grad_norm": 2.164356231689453, "learning_rate": 2.5183902169618187e-09, "loss": 0.4926, "step": 92500 }, { "epoch": 0.9697248955398938, "grad_norm": 2.378080368041992, "learning_rate": 2.352937414038969e-09, "loss": 0.4796, "step": 92600 }, { "epoch": 0.9707721146495482, "grad_norm": 2.3100953102111816, "learning_rate": 2.1930929743990136e-09, "loss": 0.511, "step": 92700 }, { "epoch": 0.9718193337592025, "grad_norm": 1.154026985168457, "learning_rate": 2.0388586994506964e-09, "loss": 0.5297, "step": 92800 }, { "epoch": 0.9728665528688567, "grad_norm": 2.432117462158203, "learning_rate": 1.8902363273772815e-09, "loss": 0.4869, "step": 92900 }, { "epoch": 0.973913771978511, "grad_norm": 2.1382997035980225, "learning_rate": 1.7472275331173459e-09, "loss": 0.5253, "step": 93000 }, { "epoch": 0.9749609910881654, "grad_norm": 2.517921209335327, "learning_rate": 1.609833928345794e-09, "loss": 0.4989, "step": 93100 }, { "epoch": 0.9760082101978197, "grad_norm": 2.1486592292785645, "learning_rate": 1.4780570614556508e-09, "loss": 0.5392, "step": 93200 }, { "epoch": 0.977055429307474, "grad_norm": 2.8666563034057617, "learning_rate": 1.3518984175406312e-09, "loss": 0.4899, "step": 93300 }, { "epoch": 0.9781026484171284, "grad_norm": 2.0608692169189453, "learning_rate": 1.231359418378486e-09, "loss": 0.5013, "step": 93400 }, { "epoch": 0.9791498675267827, "grad_norm": 2.5256223678588867, "learning_rate": 1.1164414224149598e-09, "loss": 0.506, "step": 93500 }, { "epoch": 0.9801970866364369, "grad_norm": 1.9714406728744507, "learning_rate": 1.0071457247482485e-09, "loss": 0.5306, "step": 93600 }, { "epoch": 0.9812443057460912, "grad_norm": 2.5823991298675537, "learning_rate": 9.034735571147312e-10, "loss": 0.4887, "step": 93700 }, { "epoch": 0.9822915248557456, "grad_norm": 2.48111891746521, "learning_rate": 8.054260878749275e-10, "loss": 0.5309, "step": 93800 }, { "epoch": 0.9833387439653999, "grad_norm": 3.824676752090454, "learning_rate": 7.130044220003962e-10, "loss": 0.4919, "step": 93900 }, { "epoch": 0.9843859630750542, "grad_norm": 2.073537588119507, "learning_rate": 6.26209601061134e-10, "loss": 0.4679, "step": 94000 }, { "epoch": 0.9854331821847085, "grad_norm": 2.32852840423584, "learning_rate": 5.450426032140298e-10, "loss": 0.4893, "step": 94100 }, { "epoch": 0.9864804012943629, "grad_norm": 3.0331838130950928, "learning_rate": 4.695043431917068e-10, "loss": 0.4837, "step": 94200 }, { "epoch": 0.9875276204040171, "grad_norm": 2.3463919162750244, "learning_rate": 3.995956722922522e-10, "loss": 0.4748, "step": 94300 }, { "epoch": 0.9885748395136714, "grad_norm": 3.0472140312194824, "learning_rate": 3.3531737836967054e-10, "loss": 0.5212, "step": 94400 }, { "epoch": 0.9896220586233258, "grad_norm": 1.4455373287200928, "learning_rate": 2.766701858250009e-10, "loss": 0.4858, "step": 94500 }, { "epoch": 0.9906692777329801, "grad_norm": 2.5533838272094727, "learning_rate": 2.2365475559799064e-10, "loss": 0.5016, "step": 94600 }, { "epoch": 0.9917164968426344, "grad_norm": 2.4406557083129883, "learning_rate": 1.762716851599344e-10, "loss": 0.4551, "step": 94700 }, { "epoch": 0.9927637159522887, "grad_norm": 2.5848546028137207, "learning_rate": 1.3452150850656872e-10, "loss": 0.4797, "step": 94800 }, { "epoch": 0.993810935061943, "grad_norm": 2.0372912883758545, "learning_rate": 9.84046961525209e-11, "loss": 0.4646, "step": 94900 }, { "epoch": 0.9948581541715973, "grad_norm": 2.8523876667022705, "learning_rate": 6.792165512553571e-11, "loss": 0.4876, "step": 95000 }, { "epoch": 0.9959053732812516, "grad_norm": 2.202986001968384, "learning_rate": 4.3072728962256774e-11, "loss": 0.5156, "step": 95100 }, { "epoch": 0.996952592390906, "grad_norm": 2.1548354625701904, "learning_rate": 2.3858197704063055e-11, "loss": 0.5241, "step": 95200 }, { "epoch": 0.9979998115005603, "grad_norm": 1.8615128993988037, "learning_rate": 1.0278277894182342e-11, "loss": 0.4658, "step": 95300 }, { "epoch": 0.9990470306102146, "grad_norm": 2.989764928817749, "learning_rate": 2.3331225750267137e-12, "loss": 0.5486, "step": 95400 } ], "logging_steps": 100, "max_steps": 95491, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.299285301826683e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }