{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4872267579668159, "eval_steps": 500, "global_step": 18500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.633658151171978e-05, "grad_norm": 1.25602126121521, "learning_rate": 4.999868317092442e-05, "loss": 2.5915, "step": 1 }, { "epoch": 5.267316302343956e-05, "grad_norm": 1.4423277378082275, "learning_rate": 4.999736634184883e-05, "loss": 1.4082, "step": 2 }, { "epoch": 7.900974453515933e-05, "grad_norm": null, "learning_rate": 4.999736634184883e-05, "loss": 2.7673, "step": 3 }, { "epoch": 0.00010534632604687912, "grad_norm": 1.9636341333389282, "learning_rate": 4.999604951277325e-05, "loss": 2.1738, "step": 4 }, { "epoch": 0.0001316829075585989, "grad_norm": null, "learning_rate": 4.999604951277325e-05, "loss": 1.7754, "step": 5 }, { "epoch": 0.00015801948907031866, "grad_norm": 2.365417242050171, "learning_rate": 4.999473268369766e-05, "loss": 0.8906, "step": 6 }, { "epoch": 0.00018435607058203845, "grad_norm": 3.35546612739563, "learning_rate": 4.9993415854622074e-05, "loss": 1.3068, "step": 7 }, { "epoch": 0.00021069265209375823, "grad_norm": 1.5731785297393799, "learning_rate": 4.999209902554648e-05, "loss": 1.8704, "step": 8 }, { "epoch": 0.00023702923360547802, "grad_norm": null, "learning_rate": 4.999209902554648e-05, "loss": 1.7605, "step": 9 }, { "epoch": 0.0002633658151171978, "grad_norm": 1.6602925062179565, "learning_rate": 4.99907821964709e-05, "loss": 1.4179, "step": 10 }, { "epoch": 0.00028970239662891756, "grad_norm": 1.1264127492904663, "learning_rate": 4.9989465367395314e-05, "loss": 2.0449, "step": 11 }, { "epoch": 0.0003160389781406373, "grad_norm": 1.6265169382095337, "learning_rate": 4.998814853831973e-05, "loss": 1.6321, "step": 12 }, { "epoch": 0.00034237555965235713, "grad_norm": 1.510681390762329, "learning_rate": 4.9986831709244145e-05, "loss": 2.4123, "step": 13 }, { "epoch": 0.0003687121411640769, 
"grad_norm": 1.0148299932479858, "learning_rate": 4.9985514880168554e-05, "loss": 1.9112, "step": 14 }, { "epoch": 0.0003950487226757967, "grad_norm": 1.2252342700958252, "learning_rate": 4.9984198051092976e-05, "loss": 1.7106, "step": 15 }, { "epoch": 0.00042138530418751646, "grad_norm": 2.102924346923828, "learning_rate": 4.9982881222017385e-05, "loss": 1.2634, "step": 16 }, { "epoch": 0.0004477218856992362, "grad_norm": 1.4299705028533936, "learning_rate": 4.99815643929418e-05, "loss": 2.1772, "step": 17 }, { "epoch": 0.00047405846721095603, "grad_norm": 1.1273866891860962, "learning_rate": 4.998024756386621e-05, "loss": 2.1013, "step": 18 }, { "epoch": 0.0005003950487226758, "grad_norm": 2.7761473655700684, "learning_rate": 4.9978930734790625e-05, "loss": 1.3722, "step": 19 }, { "epoch": 0.0005267316302343956, "grad_norm": 4.346610069274902, "learning_rate": 4.997761390571504e-05, "loss": 2.1313, "step": 20 }, { "epoch": 0.0005530682117461153, "grad_norm": 1.7366644144058228, "learning_rate": 4.9976297076639456e-05, "loss": 2.2213, "step": 21 }, { "epoch": 0.0005794047932578351, "grad_norm": 1.5845476388931274, "learning_rate": 4.997498024756387e-05, "loss": 1.8117, "step": 22 }, { "epoch": 0.0006057413747695549, "grad_norm": 4.5487775802612305, "learning_rate": 4.997366341848828e-05, "loss": 1.1827, "step": 23 }, { "epoch": 0.0006320779562812746, "grad_norm": 0.8642082214355469, "learning_rate": 4.9972346589412696e-05, "loss": 1.695, "step": 24 }, { "epoch": 0.0006584145377929945, "grad_norm": 1.0657001733779907, "learning_rate": 4.997102976033711e-05, "loss": 2.2334, "step": 25 }, { "epoch": 0.0006847511193047143, "grad_norm": 1.2822493314743042, "learning_rate": 4.996971293126153e-05, "loss": 1.9787, "step": 26 }, { "epoch": 0.0007110877008164341, "grad_norm": 1.2216416597366333, "learning_rate": 4.9968396102185936e-05, "loss": 2.0054, "step": 27 }, { "epoch": 0.0007374242823281538, "grad_norm": 1.1048316955566406, "learning_rate": 4.996707927311035e-05, 
"loss": 1.4634, "step": 28 }, { "epoch": 0.0007637608638398736, "grad_norm": 1.488633394241333, "learning_rate": 4.996576244403477e-05, "loss": 1.6665, "step": 29 }, { "epoch": 0.0007900974453515934, "grad_norm": 1.2460627555847168, "learning_rate": 4.996444561495918e-05, "loss": 1.8692, "step": 30 }, { "epoch": 0.0008164340268633131, "grad_norm": 2.1408908367156982, "learning_rate": 4.996312878588359e-05, "loss": 1.8691, "step": 31 }, { "epoch": 0.0008427706083750329, "grad_norm": 1.1469541788101196, "learning_rate": 4.996181195680801e-05, "loss": 1.669, "step": 32 }, { "epoch": 0.0008691071898867527, "grad_norm": 4.090087890625, "learning_rate": 4.996049512773242e-05, "loss": 1.0411, "step": 33 }, { "epoch": 0.0008954437713984724, "grad_norm": 2.047508716583252, "learning_rate": 4.995917829865684e-05, "loss": 2.0276, "step": 34 }, { "epoch": 0.0009217803529101923, "grad_norm": 1.4882947206497192, "learning_rate": 4.9957861469581254e-05, "loss": 1.3422, "step": 35 }, { "epoch": 0.0009481169344219121, "grad_norm": 2.8444130420684814, "learning_rate": 4.995654464050566e-05, "loss": 1.6189, "step": 36 }, { "epoch": 0.0009744535159336318, "grad_norm": 1.9858633279800415, "learning_rate": 4.995522781143008e-05, "loss": 1.7378, "step": 37 }, { "epoch": 0.0010007900974453516, "grad_norm": 2.514693260192871, "learning_rate": 4.995391098235449e-05, "loss": 0.9876, "step": 38 }, { "epoch": 0.0010271266789570713, "grad_norm": 1.6809574365615845, "learning_rate": 4.995259415327891e-05, "loss": 1.6963, "step": 39 }, { "epoch": 0.0010534632604687912, "grad_norm": 1.7100722789764404, "learning_rate": 4.995127732420332e-05, "loss": 1.9217, "step": 40 }, { "epoch": 0.001079799841980511, "grad_norm": 5.513702392578125, "learning_rate": 4.9949960495127734e-05, "loss": 1.5183, "step": 41 }, { "epoch": 0.0011061364234922306, "grad_norm": 2.2067952156066895, "learning_rate": 4.994864366605215e-05, "loss": 1.5994, "step": 42 }, { "epoch": 0.0011324730050039505, "grad_norm": 
1.6485905647277832, "learning_rate": 4.994732683697656e-05, "loss": 1.8023, "step": 43 }, { "epoch": 0.0011588095865156702, "grad_norm": 2.0764386653900146, "learning_rate": 4.994601000790098e-05, "loss": 2.0264, "step": 44 }, { "epoch": 0.00118514616802739, "grad_norm": 2.3295562267303467, "learning_rate": 4.994469317882539e-05, "loss": 1.438, "step": 45 }, { "epoch": 0.0012114827495391099, "grad_norm": 1.6137781143188477, "learning_rate": 4.9943376349749805e-05, "loss": 2.0789, "step": 46 }, { "epoch": 0.0012378193310508296, "grad_norm": 3.2873547077178955, "learning_rate": 4.9942059520674214e-05, "loss": 1.6047, "step": 47 }, { "epoch": 0.0012641559125625493, "grad_norm": 1.6551767587661743, "learning_rate": 4.9940742691598637e-05, "loss": 1.678, "step": 48 }, { "epoch": 0.0012904924940742692, "grad_norm": 2.1045918464660645, "learning_rate": 4.9939425862523045e-05, "loss": 2.122, "step": 49 }, { "epoch": 0.001316829075585989, "grad_norm": 1.585898995399475, "learning_rate": 4.993810903344746e-05, "loss": 1.3005, "step": 50 }, { "epoch": 0.0013431656570977088, "grad_norm": 2.065453052520752, "learning_rate": 4.9936792204371877e-05, "loss": 2.0565, "step": 51 }, { "epoch": 0.0013695022386094285, "grad_norm": 2.091850519180298, "learning_rate": 4.9935475375296285e-05, "loss": 2.1221, "step": 52 }, { "epoch": 0.0013958388201211482, "grad_norm": 1.811124563217163, "learning_rate": 4.993415854622071e-05, "loss": 2.0501, "step": 53 }, { "epoch": 0.0014221754016328682, "grad_norm": 2.2109591960906982, "learning_rate": 4.9932841717145117e-05, "loss": 1.5615, "step": 54 }, { "epoch": 0.0014485119831445879, "grad_norm": 1.7171549797058105, "learning_rate": 4.993152488806953e-05, "loss": 2.0356, "step": 55 }, { "epoch": 0.0014748485646563076, "grad_norm": 4.514270305633545, "learning_rate": 4.993020805899394e-05, "loss": 1.6096, "step": 56 }, { "epoch": 0.0015011851461680275, "grad_norm": 4.055466175079346, "learning_rate": 4.9928891229918357e-05, "loss": 1.5004, "step": 
57 }, { "epoch": 0.0015275217276797472, "grad_norm": null, "learning_rate": 4.9928891229918357e-05, "loss": 1.6247, "step": 58 }, { "epoch": 0.001553858309191467, "grad_norm": 1.5687782764434814, "learning_rate": 4.992757440084277e-05, "loss": 2.2262, "step": 59 }, { "epoch": 0.0015801948907031868, "grad_norm": 1.6692463159561157, "learning_rate": 4.992625757176719e-05, "loss": 1.7712, "step": 60 }, { "epoch": 0.0016065314722149065, "grad_norm": 3.837777853012085, "learning_rate": 4.99249407426916e-05, "loss": 1.3561, "step": 61 }, { "epoch": 0.0016328680537266262, "grad_norm": 4.855952739715576, "learning_rate": 4.992362391361601e-05, "loss": 0.9074, "step": 62 }, { "epoch": 0.0016592046352383461, "grad_norm": 3.2597079277038574, "learning_rate": 4.9922307084540435e-05, "loss": 2.1736, "step": 63 }, { "epoch": 0.0016855412167500658, "grad_norm": 2.073603391647339, "learning_rate": 4.9920990255464843e-05, "loss": 1.8285, "step": 64 }, { "epoch": 0.0017118777982617856, "grad_norm": 1.8987481594085693, "learning_rate": 4.991967342638926e-05, "loss": 1.7371, "step": 65 }, { "epoch": 0.0017382143797735055, "grad_norm": 2.7572085857391357, "learning_rate": 4.991835659731367e-05, "loss": 1.8387, "step": 66 }, { "epoch": 0.0017645509612852252, "grad_norm": 4.392783164978027, "learning_rate": 4.9917039768238083e-05, "loss": 2.2349, "step": 67 }, { "epoch": 0.0017908875427969449, "grad_norm": 3.262601613998413, "learning_rate": 4.99157229391625e-05, "loss": 1.9045, "step": 68 }, { "epoch": 0.0018172241243086648, "grad_norm": 4.613892078399658, "learning_rate": 4.9914406110086915e-05, "loss": 0.9683, "step": 69 }, { "epoch": 0.0018435607058203845, "grad_norm": 3.338005542755127, "learning_rate": 4.991308928101133e-05, "loss": 1.6835, "step": 70 }, { "epoch": 0.0018698972873321042, "grad_norm": 1.9964314699172974, "learning_rate": 4.991177245193574e-05, "loss": 1.5167, "step": 71 }, { "epoch": 0.0018962338688438241, "grad_norm": 2.4896624088287354, "learning_rate": 
4.9910455622860155e-05, "loss": 1.8422, "step": 72 }, { "epoch": 0.0019225704503555438, "grad_norm": 6.072330951690674, "learning_rate": 4.990913879378457e-05, "loss": 1.5565, "step": 73 }, { "epoch": 0.0019489070318672635, "grad_norm": 2.1096174716949463, "learning_rate": 4.9907821964708986e-05, "loss": 1.5544, "step": 74 }, { "epoch": 0.0019752436133789832, "grad_norm": 3.130415201187134, "learning_rate": 4.9906505135633395e-05, "loss": 2.1596, "step": 75 }, { "epoch": 0.002001580194890703, "grad_norm": 9.269598007202148, "learning_rate": 4.990518830655781e-05, "loss": 1.0918, "step": 76 }, { "epoch": 0.002027916776402423, "grad_norm": 2.547788381576538, "learning_rate": 4.9903871477482226e-05, "loss": 1.8652, "step": 77 }, { "epoch": 0.0020542533579141426, "grad_norm": 2.4005014896392822, "learning_rate": 4.990255464840664e-05, "loss": 1.8942, "step": 78 }, { "epoch": 0.0020805899394258625, "grad_norm": 1.9952259063720703, "learning_rate": 4.990123781933106e-05, "loss": 1.9271, "step": 79 }, { "epoch": 0.0021069265209375824, "grad_norm": 2.127342939376831, "learning_rate": 4.9899920990255466e-05, "loss": 1.7014, "step": 80 }, { "epoch": 0.002133263102449302, "grad_norm": 1.9199120998382568, "learning_rate": 4.989860416117988e-05, "loss": 1.9958, "step": 81 }, { "epoch": 0.002159599683961022, "grad_norm": 4.213884353637695, "learning_rate": 4.98972873321043e-05, "loss": 0.6952, "step": 82 }, { "epoch": 0.0021859362654727417, "grad_norm": 3.3724217414855957, "learning_rate": 4.989597050302871e-05, "loss": 1.3471, "step": 83 }, { "epoch": 0.0022122728469844612, "grad_norm": 1.9994274377822876, "learning_rate": 4.989465367395312e-05, "loss": 1.8571, "step": 84 }, { "epoch": 0.002238609428496181, "grad_norm": 3.1660397052764893, "learning_rate": 4.989333684487754e-05, "loss": 1.7784, "step": 85 }, { "epoch": 0.002264946010007901, "grad_norm": 2.1498875617980957, "learning_rate": 4.9892020015801946e-05, "loss": 1.8134, "step": 86 }, { "epoch": 0.0022912825915196206, 
"grad_norm": 3.4811267852783203, "learning_rate": 4.989070318672637e-05, "loss": 1.863, "step": 87 }, { "epoch": 0.0023176191730313405, "grad_norm": 4.082001686096191, "learning_rate": 4.988938635765078e-05, "loss": 0.6746, "step": 88 }, { "epoch": 0.0023439557545430604, "grad_norm": 2.0179851055145264, "learning_rate": 4.988806952857519e-05, "loss": 1.8533, "step": 89 }, { "epoch": 0.00237029233605478, "grad_norm": 2.6662609577178955, "learning_rate": 4.988675269949961e-05, "loss": 2.4785, "step": 90 }, { "epoch": 0.0023966289175665, "grad_norm": 2.4066240787506104, "learning_rate": 4.988543587042402e-05, "loss": 2.0299, "step": 91 }, { "epoch": 0.0024229654990782197, "grad_norm": 2.665332555770874, "learning_rate": 4.988411904134844e-05, "loss": 1.9629, "step": 92 }, { "epoch": 0.0024493020805899392, "grad_norm": 2.670659065246582, "learning_rate": 4.988280221227285e-05, "loss": 2.6515, "step": 93 }, { "epoch": 0.002475638662101659, "grad_norm": 2.028151035308838, "learning_rate": 4.9881485383197264e-05, "loss": 2.3114, "step": 94 }, { "epoch": 0.002501975243613379, "grad_norm": 1.8326789140701294, "learning_rate": 4.988016855412167e-05, "loss": 2.2585, "step": 95 }, { "epoch": 0.0025283118251250986, "grad_norm": 2.9877381324768066, "learning_rate": 4.9878851725046095e-05, "loss": 0.6073, "step": 96 }, { "epoch": 0.0025546484066368185, "grad_norm": 5.428811550140381, "learning_rate": 4.9877534895970504e-05, "loss": 2.1438, "step": 97 }, { "epoch": 0.0025809849881485384, "grad_norm": 6.648031234741211, "learning_rate": 4.987621806689492e-05, "loss": 2.4835, "step": 98 }, { "epoch": 0.002607321569660258, "grad_norm": 6.937345504760742, "learning_rate": 4.9874901237819335e-05, "loss": 1.4094, "step": 99 }, { "epoch": 0.002633658151171978, "grad_norm": 2.093762159347534, "learning_rate": 4.9873584408743744e-05, "loss": 1.5811, "step": 100 }, { "epoch": 0.0026599947326836977, "grad_norm": 5.363945007324219, "learning_rate": 4.9872267579668166e-05, "loss": 1.472, 
"step": 101 }, { "epoch": 0.0026863313141954176, "grad_norm": 2.478797674179077, "learning_rate": 4.9870950750592575e-05, "loss": 2.3101, "step": 102 }, { "epoch": 0.002712667895707137, "grad_norm": 6.754124641418457, "learning_rate": 4.986963392151699e-05, "loss": 2.2972, "step": 103 }, { "epoch": 0.002739004477218857, "grad_norm": 4.061160087585449, "learning_rate": 4.98683170924414e-05, "loss": 1.0694, "step": 104 }, { "epoch": 0.002765341058730577, "grad_norm": 3.0984303951263428, "learning_rate": 4.9867000263365815e-05, "loss": 1.7015, "step": 105 }, { "epoch": 0.0027916776402422965, "grad_norm": 4.187580108642578, "learning_rate": 4.986568343429023e-05, "loss": 2.1395, "step": 106 }, { "epoch": 0.0028180142217540164, "grad_norm": 3.4013631343841553, "learning_rate": 4.9864366605214646e-05, "loss": 2.0131, "step": 107 }, { "epoch": 0.0028443508032657363, "grad_norm": 2.3839163780212402, "learning_rate": 4.986304977613906e-05, "loss": 1.7257, "step": 108 }, { "epoch": 0.002870687384777456, "grad_norm": 3.7074756622314453, "learning_rate": 4.986173294706347e-05, "loss": 1.8269, "step": 109 }, { "epoch": 0.0028970239662891757, "grad_norm": 3.0593509674072266, "learning_rate": 4.9860416117987886e-05, "loss": 1.8013, "step": 110 }, { "epoch": 0.0029233605478008956, "grad_norm": 4.329958438873291, "learning_rate": 4.98590992889123e-05, "loss": 2.7713, "step": 111 }, { "epoch": 0.002949697129312615, "grad_norm": 2.1304702758789062, "learning_rate": 4.985778245983672e-05, "loss": 1.8745, "step": 112 }, { "epoch": 0.002976033710824335, "grad_norm": 2.2222111225128174, "learning_rate": 4.9856465630761126e-05, "loss": 1.8628, "step": 113 }, { "epoch": 0.003002370292336055, "grad_norm": 2.2765445709228516, "learning_rate": 4.985514880168554e-05, "loss": 2.5081, "step": 114 }, { "epoch": 0.0030287068738477745, "grad_norm": 2.0583174228668213, "learning_rate": 4.985383197260996e-05, "loss": 1.906, "step": 115 }, { "epoch": 0.0030550434553594944, "grad_norm": 
1.9065154790878296, "learning_rate": 4.985251514353437e-05, "loss": 1.6274, "step": 116 }, { "epoch": 0.0030813800368712143, "grad_norm": 2.5122358798980713, "learning_rate": 4.985119831445879e-05, "loss": 2.2167, "step": 117 }, { "epoch": 0.003107716618382934, "grad_norm": 2.5497119426727295, "learning_rate": 4.98498814853832e-05, "loss": 1.5221, "step": 118 }, { "epoch": 0.0031340531998946537, "grad_norm": 5.591714382171631, "learning_rate": 4.984856465630761e-05, "loss": 1.5908, "step": 119 }, { "epoch": 0.0031603897814063736, "grad_norm": 2.0516998767852783, "learning_rate": 4.984724782723203e-05, "loss": 2.1565, "step": 120 }, { "epoch": 0.003186726362918093, "grad_norm": 2.954334259033203, "learning_rate": 4.9845930998156444e-05, "loss": 1.9972, "step": 121 }, { "epoch": 0.003213062944429813, "grad_norm": 3.947854518890381, "learning_rate": 4.984461416908085e-05, "loss": 0.8273, "step": 122 }, { "epoch": 0.003239399525941533, "grad_norm": 2.1221768856048584, "learning_rate": 4.984329734000527e-05, "loss": 2.0991, "step": 123 }, { "epoch": 0.0032657361074532524, "grad_norm": 4.618890285491943, "learning_rate": 4.9841980510929684e-05, "loss": 0.8006, "step": 124 }, { "epoch": 0.0032920726889649724, "grad_norm": 2.2099010944366455, "learning_rate": 4.98406636818541e-05, "loss": 1.8216, "step": 125 }, { "epoch": 0.0033184092704766923, "grad_norm": 5.8832292556762695, "learning_rate": 4.9839346852778516e-05, "loss": 2.2214, "step": 126 }, { "epoch": 0.0033447458519884118, "grad_norm": 1.9785484075546265, "learning_rate": 4.9838030023702924e-05, "loss": 1.4308, "step": 127 }, { "epoch": 0.0033710824335001317, "grad_norm": 2.9469151496887207, "learning_rate": 4.983671319462734e-05, "loss": 2.159, "step": 128 }, { "epoch": 0.0033974190150118516, "grad_norm": 3.7625527381896973, "learning_rate": 4.9835396365551756e-05, "loss": 1.6932, "step": 129 }, { "epoch": 0.003423755596523571, "grad_norm": 3.0249016284942627, "learning_rate": 4.983407953647617e-05, "loss": 
1.8042, "step": 130 }, { "epoch": 0.003450092178035291, "grad_norm": 2.883127450942993, "learning_rate": 4.983276270740058e-05, "loss": 2.1843, "step": 131 }, { "epoch": 0.003476428759547011, "grad_norm": 3.135927200317383, "learning_rate": 4.9831445878324996e-05, "loss": 1.9215, "step": 132 }, { "epoch": 0.0035027653410587304, "grad_norm": 1.9830724000930786, "learning_rate": 4.9830129049249404e-05, "loss": 1.5282, "step": 133 }, { "epoch": 0.0035291019225704504, "grad_norm": 4.631160259246826, "learning_rate": 4.982881222017383e-05, "loss": 1.9417, "step": 134 }, { "epoch": 0.0035554385040821703, "grad_norm": 2.9444193840026855, "learning_rate": 4.9827495391098236e-05, "loss": 1.5634, "step": 135 }, { "epoch": 0.0035817750855938898, "grad_norm": 5.769080638885498, "learning_rate": 4.982617856202265e-05, "loss": 1.2814, "step": 136 }, { "epoch": 0.0036081116671056097, "grad_norm": 3.2159616947174072, "learning_rate": 4.982486173294707e-05, "loss": 2.2346, "step": 137 }, { "epoch": 0.0036344482486173296, "grad_norm": 3.8492844104766846, "learning_rate": 4.9823544903871476e-05, "loss": 0.8781, "step": 138 }, { "epoch": 0.003660784830129049, "grad_norm": 2.716857433319092, "learning_rate": 4.98222280747959e-05, "loss": 1.17, "step": 139 }, { "epoch": 0.003687121411640769, "grad_norm": 2.482445001602173, "learning_rate": 4.982091124572031e-05, "loss": 2.1708, "step": 140 }, { "epoch": 0.003713457993152489, "grad_norm": 5.520145893096924, "learning_rate": 4.981959441664472e-05, "loss": 0.9422, "step": 141 }, { "epoch": 0.0037397945746642084, "grad_norm": 2.650184392929077, "learning_rate": 4.981827758756913e-05, "loss": 1.5212, "step": 142 }, { "epoch": 0.0037661311561759283, "grad_norm": 3.017289161682129, "learning_rate": 4.981696075849355e-05, "loss": 2.0877, "step": 143 }, { "epoch": 0.0037924677376876483, "grad_norm": 5.6365838050842285, "learning_rate": 4.981564392941796e-05, "loss": 1.8011, "step": 144 }, { "epoch": 0.0038188043191993678, "grad_norm": 
1.9732463359832764, "learning_rate": 4.981432710034238e-05, "loss": 1.7964, "step": 145 }, { "epoch": 0.0038451409007110877, "grad_norm": 2.4963321685791016, "learning_rate": 4.9813010271266794e-05, "loss": 1.8117, "step": 146 }, { "epoch": 0.0038714774822228076, "grad_norm": 4.752861499786377, "learning_rate": 4.98116934421912e-05, "loss": 1.487, "step": 147 }, { "epoch": 0.003897814063734527, "grad_norm": 1.9943513870239258, "learning_rate": 4.9810376613115625e-05, "loss": 1.4832, "step": 148 }, { "epoch": 0.0039241506452462474, "grad_norm": 2.6101386547088623, "learning_rate": 4.9809059784040034e-05, "loss": 1.9399, "step": 149 }, { "epoch": 0.0039504872267579665, "grad_norm": 3.150451421737671, "learning_rate": 4.980774295496445e-05, "loss": 0.5693, "step": 150 }, { "epoch": 0.003976823808269686, "grad_norm": 2.680048704147339, "learning_rate": 4.980642612588886e-05, "loss": 1.704, "step": 151 }, { "epoch": 0.004003160389781406, "grad_norm": 3.1196019649505615, "learning_rate": 4.9805109296813274e-05, "loss": 1.541, "step": 152 }, { "epoch": 0.004029496971293126, "grad_norm": 3.6955935955047607, "learning_rate": 4.980379246773769e-05, "loss": 2.1527, "step": 153 }, { "epoch": 0.004055833552804846, "grad_norm": 5.005454063415527, "learning_rate": 4.9802475638662105e-05, "loss": 1.7925, "step": 154 }, { "epoch": 0.004082170134316566, "grad_norm": 2.632319927215576, "learning_rate": 4.980115880958652e-05, "loss": 2.8897, "step": 155 }, { "epoch": 0.004108506715828285, "grad_norm": 4.2010087966918945, "learning_rate": 4.979984198051093e-05, "loss": 0.841, "step": 156 }, { "epoch": 0.004134843297340005, "grad_norm": 2.79358172416687, "learning_rate": 4.9798525151435345e-05, "loss": 1.6884, "step": 157 }, { "epoch": 0.004161179878851725, "grad_norm": 3.154688596725464, "learning_rate": 4.979720832235976e-05, "loss": 1.0247, "step": 158 }, { "epoch": 0.004187516460363445, "grad_norm": 2.1128103733062744, "learning_rate": 4.9795891493284176e-05, "loss": 1.512, "step": 
159 }, { "epoch": 0.004213853041875165, "grad_norm": 2.2439041137695312, "learning_rate": 4.9794574664208585e-05, "loss": 1.8068, "step": 160 }, { "epoch": 0.004240189623386885, "grad_norm": 2.2295446395874023, "learning_rate": 4.9793257835133e-05, "loss": 1.9697, "step": 161 }, { "epoch": 0.004266526204898604, "grad_norm": 2.5257863998413086, "learning_rate": 4.9791941006057416e-05, "loss": 1.8954, "step": 162 }, { "epoch": 0.004292862786410324, "grad_norm": 3.872302532196045, "learning_rate": 4.979062417698183e-05, "loss": 1.0661, "step": 163 }, { "epoch": 0.004319199367922044, "grad_norm": 3.818669080734253, "learning_rate": 4.978930734790625e-05, "loss": 2.4765, "step": 164 }, { "epoch": 0.004345535949433764, "grad_norm": 2.772634267807007, "learning_rate": 4.9787990518830656e-05, "loss": 2.3597, "step": 165 }, { "epoch": 0.0043718725309454835, "grad_norm": 2.113095760345459, "learning_rate": 4.978667368975507e-05, "loss": 1.6092, "step": 166 }, { "epoch": 0.004398209112457203, "grad_norm": 2.2079544067382812, "learning_rate": 4.978535686067949e-05, "loss": 1.3975, "step": 167 }, { "epoch": 0.0044245456939689225, "grad_norm": 2.840900421142578, "learning_rate": 4.97840400316039e-05, "loss": 1.766, "step": 168 }, { "epoch": 0.004450882275480642, "grad_norm": 3.2003445625305176, "learning_rate": 4.978272320252831e-05, "loss": 2.0482, "step": 169 }, { "epoch": 0.004477218856992362, "grad_norm": 4.071146488189697, "learning_rate": 4.978140637345273e-05, "loss": 1.7155, "step": 170 }, { "epoch": 0.004503555438504082, "grad_norm": 2.204969644546509, "learning_rate": 4.978008954437714e-05, "loss": 1.5785, "step": 171 }, { "epoch": 0.004529892020015802, "grad_norm": 2.2772178649902344, "learning_rate": 4.977877271530156e-05, "loss": 2.2593, "step": 172 }, { "epoch": 0.004556228601527522, "grad_norm": 3.250837564468384, "learning_rate": 4.9777455886225974e-05, "loss": 1.6909, "step": 173 }, { "epoch": 0.004582565183039241, "grad_norm": 3.1715753078460693, 
"learning_rate": 4.977613905715038e-05, "loss": 1.6968, "step": 174 }, { "epoch": 0.004608901764550961, "grad_norm": 4.073447227478027, "learning_rate": 4.97748222280748e-05, "loss": 2.8818, "step": 175 }, { "epoch": 0.004635238346062681, "grad_norm": 2.430662155151367, "learning_rate": 4.9773505398999214e-05, "loss": 1.9849, "step": 176 }, { "epoch": 0.004661574927574401, "grad_norm": 2.2059905529022217, "learning_rate": 4.977218856992363e-05, "loss": 1.9293, "step": 177 }, { "epoch": 0.004687911509086121, "grad_norm": 4.93207311630249, "learning_rate": 4.977087174084804e-05, "loss": 1.2192, "step": 178 }, { "epoch": 0.004714248090597841, "grad_norm": 3.036472797393799, "learning_rate": 4.9769554911772454e-05, "loss": 1.6647, "step": 179 }, { "epoch": 0.00474058467210956, "grad_norm": 2.2520482540130615, "learning_rate": 4.976823808269687e-05, "loss": 1.8166, "step": 180 }, { "epoch": 0.00476692125362128, "grad_norm": 5.294158935546875, "learning_rate": 4.9766921253621285e-05, "loss": 1.2037, "step": 181 }, { "epoch": 0.004793257835133, "grad_norm": 3.03607439994812, "learning_rate": 4.97656044245457e-05, "loss": 1.8333, "step": 182 }, { "epoch": 0.0048195944166447196, "grad_norm": 2.839785575866699, "learning_rate": 4.976428759547011e-05, "loss": 2.0289, "step": 183 }, { "epoch": 0.0048459309981564395, "grad_norm": 2.876850128173828, "learning_rate": 4.9762970766394525e-05, "loss": 1.7722, "step": 184 }, { "epoch": 0.004872267579668159, "grad_norm": 3.0148491859436035, "learning_rate": 4.9761653937318934e-05, "loss": 1.7479, "step": 185 }, { "epoch": 0.0048986041611798785, "grad_norm": 3.0036590099334717, "learning_rate": 4.9760337108243357e-05, "loss": 1.5007, "step": 186 }, { "epoch": 0.004924940742691598, "grad_norm": 5.28188943862915, "learning_rate": 4.9759020279167765e-05, "loss": 0.8105, "step": 187 }, { "epoch": 0.004951277324203318, "grad_norm": 2.7707080841064453, "learning_rate": 4.975770345009218e-05, "loss": 1.8496, "step": 188 }, { "epoch": 
0.004977613905715038, "grad_norm": 2.996790647506714, "learning_rate": 4.975638662101659e-05, "loss": 2.58, "step": 189 }, { "epoch": 0.005003950487226758, "grad_norm": 2.5800158977508545, "learning_rate": 4.9755069791941005e-05, "loss": 1.6247, "step": 190 }, { "epoch": 0.005030287068738478, "grad_norm": 4.0949554443359375, "learning_rate": 4.975375296286542e-05, "loss": 1.8341, "step": 191 }, { "epoch": 0.005056623650250197, "grad_norm": 5.023255348205566, "learning_rate": 4.9752436133789837e-05, "loss": 2.1623, "step": 192 }, { "epoch": 0.005082960231761917, "grad_norm": 2.2133235931396484, "learning_rate": 4.975111930471425e-05, "loss": 1.6761, "step": 193 }, { "epoch": 0.005109296813273637, "grad_norm": 5.704970836639404, "learning_rate": 4.974980247563866e-05, "loss": 0.9858, "step": 194 }, { "epoch": 0.005135633394785357, "grad_norm": 1.9332650899887085, "learning_rate": 4.9748485646563083e-05, "loss": 1.2981, "step": 195 }, { "epoch": 0.005161969976297077, "grad_norm": 4.054973125457764, "learning_rate": 4.974716881748749e-05, "loss": 1.9202, "step": 196 }, { "epoch": 0.005188306557808797, "grad_norm": 3.442539691925049, "learning_rate": 4.974585198841191e-05, "loss": 1.8708, "step": 197 }, { "epoch": 0.005214643139320516, "grad_norm": 2.023343086242676, "learning_rate": 4.974453515933632e-05, "loss": 1.9631, "step": 198 }, { "epoch": 0.005240979720832236, "grad_norm": 3.7599754333496094, "learning_rate": 4.974321833026073e-05, "loss": 2.3993, "step": 199 }, { "epoch": 0.005267316302343956, "grad_norm": 2.6702961921691895, "learning_rate": 4.974190150118515e-05, "loss": 1.4371, "step": 200 }, { "epoch": 0.0052936528838556755, "grad_norm": 2.936985492706299, "learning_rate": 4.9740584672109563e-05, "loss": 1.654, "step": 201 }, { "epoch": 0.0053199894653673955, "grad_norm": 3.7769246101379395, "learning_rate": 4.973926784303398e-05, "loss": 2.0605, "step": 202 }, { "epoch": 0.005346326046879115, "grad_norm": 4.476908206939697, "learning_rate": 
4.973795101395839e-05, "loss": 1.8792, "step": 203 }, { "epoch": 0.005372662628390835, "grad_norm": 2.43996524810791, "learning_rate": 4.9736634184882803e-05, "loss": 1.5303, "step": 204 }, { "epoch": 0.005398999209902554, "grad_norm": 2.9250011444091797, "learning_rate": 4.973531735580722e-05, "loss": 2.1826, "step": 205 }, { "epoch": 0.005425335791414274, "grad_norm": 3.787264585494995, "learning_rate": 4.9734000526731635e-05, "loss": 1.1572, "step": 206 }, { "epoch": 0.005451672372925994, "grad_norm": 2.1952826976776123, "learning_rate": 4.9732683697656043e-05, "loss": 1.9259, "step": 207 }, { "epoch": 0.005478008954437714, "grad_norm": 2.5176422595977783, "learning_rate": 4.973136686858046e-05, "loss": 1.8724, "step": 208 }, { "epoch": 0.005504345535949434, "grad_norm": 3.8239634037017822, "learning_rate": 4.9730050039504875e-05, "loss": 1.3196, "step": 209 }, { "epoch": 0.005530682117461154, "grad_norm": 3.663858652114868, "learning_rate": 4.972873321042929e-05, "loss": 1.0881, "step": 210 }, { "epoch": 0.005557018698972873, "grad_norm": 5.210198879241943, "learning_rate": 4.9727416381353706e-05, "loss": 1.4144, "step": 211 }, { "epoch": 0.005583355280484593, "grad_norm": 5.618483066558838, "learning_rate": 4.9726099552278115e-05, "loss": 1.3746, "step": 212 }, { "epoch": 0.005609691861996313, "grad_norm": 2.5995712280273438, "learning_rate": 4.972478272320253e-05, "loss": 1.839, "step": 213 }, { "epoch": 0.005636028443508033, "grad_norm": 4.0986785888671875, "learning_rate": 4.9723465894126946e-05, "loss": 2.2452, "step": 214 }, { "epoch": 0.005662365025019753, "grad_norm": 7.395411968231201, "learning_rate": 4.972214906505136e-05, "loss": 2.079, "step": 215 }, { "epoch": 0.005688701606531473, "grad_norm": 4.094386100769043, "learning_rate": 4.972083223597577e-05, "loss": 2.1434, "step": 216 }, { "epoch": 0.005715038188043192, "grad_norm": 3.1119894981384277, "learning_rate": 4.9719515406900186e-05, "loss": 1.4083, "step": 217 }, { "epoch": 
0.005741374769554912, "grad_norm": 3.389418601989746, "learning_rate": 4.97181985778246e-05, "loss": 2.3632, "step": 218 }, { "epoch": 0.0057677113510666315, "grad_norm": 4.414575099945068, "learning_rate": 4.971688174874902e-05, "loss": 1.5154, "step": 219 }, { "epoch": 0.0057940479325783514, "grad_norm": 2.6311278343200684, "learning_rate": 4.971556491967343e-05, "loss": 1.9642, "step": 220 }, { "epoch": 0.005820384514090071, "grad_norm": 2.435178756713867, "learning_rate": 4.971424809059784e-05, "loss": 2.0197, "step": 221 }, { "epoch": 0.005846721095601791, "grad_norm": 6.44195556640625, "learning_rate": 4.971293126152226e-05, "loss": 1.4618, "step": 222 }, { "epoch": 0.00587305767711351, "grad_norm": 4.706697940826416, "learning_rate": 4.9711614432446666e-05, "loss": 1.185, "step": 223 }, { "epoch": 0.00589939425862523, "grad_norm": 2.524803876876831, "learning_rate": 4.971029760337109e-05, "loss": 2.1728, "step": 224 }, { "epoch": 0.00592573084013695, "grad_norm": 14.731494903564453, "learning_rate": 4.97089807742955e-05, "loss": 2.0406, "step": 225 }, { "epoch": 0.00595206742164867, "grad_norm": 3.8456811904907227, "learning_rate": 4.970766394521991e-05, "loss": 2.0363, "step": 226 }, { "epoch": 0.00597840400316039, "grad_norm": 5.158308982849121, "learning_rate": 4.970634711614433e-05, "loss": 1.2614, "step": 227 }, { "epoch": 0.00600474058467211, "grad_norm": 2.347411632537842, "learning_rate": 4.9705030287068744e-05, "loss": 1.5423, "step": 228 }, { "epoch": 0.006031077166183829, "grad_norm": 7.151821136474609, "learning_rate": 4.970371345799316e-05, "loss": 0.8542, "step": 229 }, { "epoch": 0.006057413747695549, "grad_norm": 6.39750337600708, "learning_rate": 4.970239662891757e-05, "loss": 0.706, "step": 230 }, { "epoch": 0.006083750329207269, "grad_norm": 3.0930991172790527, "learning_rate": 4.9701079799841984e-05, "loss": 1.631, "step": 231 }, { "epoch": 0.006110086910718989, "grad_norm": 3.5407567024230957, "learning_rate": 4.969976297076639e-05, 
"loss": 1.6488, "step": 232 }, { "epoch": 0.006136423492230709, "grad_norm": 4.05654239654541, "learning_rate": 4.9698446141690815e-05, "loss": 1.2322, "step": 233 }, { "epoch": 0.006162760073742429, "grad_norm": 3.4510419368743896, "learning_rate": 4.9697129312615224e-05, "loss": 1.9873, "step": 234 }, { "epoch": 0.006189096655254148, "grad_norm": 3.6991634368896484, "learning_rate": 4.969581248353964e-05, "loss": 2.6805, "step": 235 }, { "epoch": 0.006215433236765868, "grad_norm": 3.597952127456665, "learning_rate": 4.969449565446405e-05, "loss": 1.7013, "step": 236 }, { "epoch": 0.0062417698182775875, "grad_norm": 4.022460460662842, "learning_rate": 4.9693178825388464e-05, "loss": 1.2796, "step": 237 }, { "epoch": 0.006268106399789307, "grad_norm": 2.287949800491333, "learning_rate": 4.969186199631288e-05, "loss": 1.2856, "step": 238 }, { "epoch": 0.006294442981301027, "grad_norm": 3.069411516189575, "learning_rate": 4.9690545167237295e-05, "loss": 1.8174, "step": 239 }, { "epoch": 0.006320779562812747, "grad_norm": 4.457437038421631, "learning_rate": 4.968922833816171e-05, "loss": 1.6977, "step": 240 }, { "epoch": 0.006347116144324466, "grad_norm": 2.3935649394989014, "learning_rate": 4.968791150908612e-05, "loss": 2.2043, "step": 241 }, { "epoch": 0.006373452725836186, "grad_norm": 2.437356948852539, "learning_rate": 4.968659468001054e-05, "loss": 1.689, "step": 242 }, { "epoch": 0.006399789307347906, "grad_norm": 2.710400104522705, "learning_rate": 4.968527785093495e-05, "loss": 1.8618, "step": 243 }, { "epoch": 0.006426125888859626, "grad_norm": 6.757627487182617, "learning_rate": 4.9683961021859366e-05, "loss": 1.0029, "step": 244 }, { "epoch": 0.006452462470371346, "grad_norm": 3.6147267818450928, "learning_rate": 4.9682644192783775e-05, "loss": 2.1118, "step": 245 }, { "epoch": 0.006478799051883066, "grad_norm": 6.167456150054932, "learning_rate": 4.968132736370819e-05, "loss": 1.7851, "step": 246 }, { "epoch": 0.006505135633394785, "grad_norm": 
2.6882426738739014, "learning_rate": 4.9680010534632606e-05, "loss": 1.7571, "step": 247 }, { "epoch": 0.006531472214906505, "grad_norm": 3.0543911457061768, "learning_rate": 4.967869370555702e-05, "loss": 1.9994, "step": 248 }, { "epoch": 0.006557808796418225, "grad_norm": 2.950988531112671, "learning_rate": 4.967737687648144e-05, "loss": 1.7448, "step": 249 }, { "epoch": 0.006584145377929945, "grad_norm": 2.491550922393799, "learning_rate": 4.9676060047405846e-05, "loss": 1.6253, "step": 250 }, { "epoch": 0.006610481959441665, "grad_norm": 2.45288348197937, "learning_rate": 4.967474321833026e-05, "loss": 1.7883, "step": 251 }, { "epoch": 0.006636818540953385, "grad_norm": 2.80439829826355, "learning_rate": 4.967342638925468e-05, "loss": 1.5507, "step": 252 }, { "epoch": 0.006663155122465104, "grad_norm": 3.8137927055358887, "learning_rate": 4.967210956017909e-05, "loss": 1.7122, "step": 253 }, { "epoch": 0.0066894917039768236, "grad_norm": 3.2373127937316895, "learning_rate": 4.96707927311035e-05, "loss": 0.5187, "step": 254 }, { "epoch": 0.0067158282854885435, "grad_norm": 3.006848096847534, "learning_rate": 4.966947590202792e-05, "loss": 2.3407, "step": 255 }, { "epoch": 0.006742164867000263, "grad_norm": 2.6437928676605225, "learning_rate": 4.966815907295233e-05, "loss": 1.3933, "step": 256 }, { "epoch": 0.006768501448511983, "grad_norm": 2.1438775062561035, "learning_rate": 4.966684224387675e-05, "loss": 1.3451, "step": 257 }, { "epoch": 0.006794838030023703, "grad_norm": 3.741879463195801, "learning_rate": 4.9665525414801164e-05, "loss": 1.9437, "step": 258 }, { "epoch": 0.006821174611535422, "grad_norm": 3.7661421298980713, "learning_rate": 4.966420858572557e-05, "loss": 1.866, "step": 259 }, { "epoch": 0.006847511193047142, "grad_norm": 3.1512551307678223, "learning_rate": 4.966289175664999e-05, "loss": 1.2263, "step": 260 }, { "epoch": 0.006873847774558862, "grad_norm": 5.102208137512207, "learning_rate": 4.9661574927574404e-05, "loss": 1.7225, "step": 
261 }, { "epoch": 0.006900184356070582, "grad_norm": 2.772230386734009, "learning_rate": 4.966025809849882e-05, "loss": 2.1089, "step": 262 }, { "epoch": 0.006926520937582302, "grad_norm": 4.984233379364014, "learning_rate": 4.965894126942323e-05, "loss": 0.9502, "step": 263 }, { "epoch": 0.006952857519094022, "grad_norm": 3.336632490158081, "learning_rate": 4.9657624440347644e-05, "loss": 1.5431, "step": 264 }, { "epoch": 0.006979194100605741, "grad_norm": 4.93807315826416, "learning_rate": 4.965630761127206e-05, "loss": 0.5891, "step": 265 }, { "epoch": 0.007005530682117461, "grad_norm": 3.4254891872406006, "learning_rate": 4.9654990782196476e-05, "loss": 2.1058, "step": 266 }, { "epoch": 0.007031867263629181, "grad_norm": 2.414081573486328, "learning_rate": 4.965367395312089e-05, "loss": 1.9151, "step": 267 }, { "epoch": 0.007058203845140901, "grad_norm": 4.4041876792907715, "learning_rate": 4.96523571240453e-05, "loss": 0.6274, "step": 268 }, { "epoch": 0.007084540426652621, "grad_norm": 3.4176602363586426, "learning_rate": 4.9651040294969716e-05, "loss": 1.4568, "step": 269 }, { "epoch": 0.0071108770081643406, "grad_norm": 4.991017818450928, "learning_rate": 4.9649723465894124e-05, "loss": 1.3682, "step": 270 }, { "epoch": 0.0071372135896760605, "grad_norm": 2.882906675338745, "learning_rate": 4.964840663681855e-05, "loss": 1.9827, "step": 271 }, { "epoch": 0.0071635501711877795, "grad_norm": 2.41365385055542, "learning_rate": 4.9647089807742956e-05, "loss": 2.269, "step": 272 }, { "epoch": 0.0071898867526994995, "grad_norm": 3.56182599067688, "learning_rate": 4.964577297866737e-05, "loss": 1.4444, "step": 273 }, { "epoch": 0.007216223334211219, "grad_norm": 2.774643659591675, "learning_rate": 4.964445614959179e-05, "loss": 2.2455, "step": 274 }, { "epoch": 0.007242559915722939, "grad_norm": 6.077582836151123, "learning_rate": 4.96431393205162e-05, "loss": 2.4372, "step": 275 }, { "epoch": 0.007268896497234659, "grad_norm": 2.978055477142334, "learning_rate": 
4.964182249144062e-05, "loss": 1.6934, "step": 276 }, { "epoch": 0.007295233078746379, "grad_norm": 3.786829710006714, "learning_rate": 4.964050566236503e-05, "loss": 1.9201, "step": 277 }, { "epoch": 0.007321569660258098, "grad_norm": 3.4524993896484375, "learning_rate": 4.963918883328944e-05, "loss": 1.4841, "step": 278 }, { "epoch": 0.007347906241769818, "grad_norm": 4.23499059677124, "learning_rate": 4.963787200421385e-05, "loss": 1.6539, "step": 279 }, { "epoch": 0.007374242823281538, "grad_norm": 2.3960883617401123, "learning_rate": 4.9636555175138274e-05, "loss": 2.087, "step": 280 }, { "epoch": 0.007400579404793258, "grad_norm": 4.268434524536133, "learning_rate": 4.963523834606268e-05, "loss": 1.4911, "step": 281 }, { "epoch": 0.007426915986304978, "grad_norm": 2.243865728378296, "learning_rate": 4.96339215169871e-05, "loss": 1.5874, "step": 282 }, { "epoch": 0.007453252567816698, "grad_norm": 2.7072665691375732, "learning_rate": 4.963260468791151e-05, "loss": 1.8963, "step": 283 }, { "epoch": 0.007479589149328417, "grad_norm": 3.8264856338500977, "learning_rate": 4.963128785883592e-05, "loss": 0.6007, "step": 284 }, { "epoch": 0.007505925730840137, "grad_norm": 3.1545920372009277, "learning_rate": 4.9629971029760345e-05, "loss": 1.6745, "step": 285 }, { "epoch": 0.007532262312351857, "grad_norm": 3.4124999046325684, "learning_rate": 4.9628654200684754e-05, "loss": 1.4972, "step": 286 }, { "epoch": 0.007558598893863577, "grad_norm": 2.751809597015381, "learning_rate": 4.962733737160917e-05, "loss": 1.8819, "step": 287 }, { "epoch": 0.0075849354753752965, "grad_norm": 4.699490070343018, "learning_rate": 4.962602054253358e-05, "loss": 1.8979, "step": 288 }, { "epoch": 0.0076112720568870165, "grad_norm": 2.683763027191162, "learning_rate": 4.9624703713457994e-05, "loss": 1.862, "step": 289 }, { "epoch": 0.0076376086383987355, "grad_norm": 4.744758605957031, "learning_rate": 4.962338688438241e-05, "loss": 1.603, "step": 290 }, { "epoch": 0.007663945219910455, 
"grad_norm": 3.8711984157562256, "learning_rate": 4.9622070055306825e-05, "loss": 1.6996, "step": 291 }, { "epoch": 0.007690281801422175, "grad_norm": 3.0812060832977295, "learning_rate": 4.9620753226231234e-05, "loss": 2.4187, "step": 292 }, { "epoch": 0.007716618382933895, "grad_norm": 3.9074389934539795, "learning_rate": 4.961943639715565e-05, "loss": 2.0815, "step": 293 }, { "epoch": 0.007742954964445615, "grad_norm": 2.894277811050415, "learning_rate": 4.9618119568080065e-05, "loss": 2.0652, "step": 294 }, { "epoch": 0.007769291545957335, "grad_norm": 3.9091920852661133, "learning_rate": 4.961680273900448e-05, "loss": 1.7737, "step": 295 }, { "epoch": 0.007795628127469054, "grad_norm": 1.9550001621246338, "learning_rate": 4.9615485909928896e-05, "loss": 1.3876, "step": 296 }, { "epoch": 0.007821964708980774, "grad_norm": 3.390186309814453, "learning_rate": 4.9614169080853305e-05, "loss": 2.3364, "step": 297 }, { "epoch": 0.007848301290492495, "grad_norm": 3.1299211978912354, "learning_rate": 4.961285225177772e-05, "loss": 2.1138, "step": 298 }, { "epoch": 0.007874637872004214, "grad_norm": 7.753682613372803, "learning_rate": 4.9611535422702136e-05, "loss": 0.6388, "step": 299 }, { "epoch": 0.007900974453515933, "grad_norm": 3.2066988945007324, "learning_rate": 4.961021859362655e-05, "loss": 2.3243, "step": 300 }, { "epoch": 0.007927311035027654, "grad_norm": 2.5140504837036133, "learning_rate": 4.960890176455096e-05, "loss": 1.968, "step": 301 }, { "epoch": 0.007953647616539373, "grad_norm": 9.608623504638672, "learning_rate": 4.9607584935475376e-05, "loss": 1.0477, "step": 302 }, { "epoch": 0.007979984198051094, "grad_norm": 4.970414638519287, "learning_rate": 4.960626810639979e-05, "loss": 2.2033, "step": 303 }, { "epoch": 0.008006320779562813, "grad_norm": 4.394804000854492, "learning_rate": 4.960495127732421e-05, "loss": 1.423, "step": 304 }, { "epoch": 0.008032657361074532, "grad_norm": 4.038106441497803, "learning_rate": 4.960363444824862e-05, "loss": 
1.6763, "step": 305 }, { "epoch": 0.008058993942586253, "grad_norm": 2.361182451248169, "learning_rate": 4.960231761917303e-05, "loss": 1.8892, "step": 306 }, { "epoch": 0.008085330524097972, "grad_norm": 2.8867974281311035, "learning_rate": 4.960100079009745e-05, "loss": 1.0601, "step": 307 }, { "epoch": 0.008111667105609692, "grad_norm": 3.2779343128204346, "learning_rate": 4.959968396102186e-05, "loss": 1.7609, "step": 308 }, { "epoch": 0.008138003687121411, "grad_norm": 3.8762402534484863, "learning_rate": 4.959836713194628e-05, "loss": 0.6232, "step": 309 }, { "epoch": 0.008164340268633132, "grad_norm": 2.778155565261841, "learning_rate": 4.959705030287069e-05, "loss": 1.9539, "step": 310 }, { "epoch": 0.008190676850144851, "grad_norm": 4.9701056480407715, "learning_rate": 4.95957334737951e-05, "loss": 1.8396, "step": 311 }, { "epoch": 0.00821701343165657, "grad_norm": 4.125095844268799, "learning_rate": 4.959441664471952e-05, "loss": 0.4815, "step": 312 }, { "epoch": 0.008243350013168291, "grad_norm": 5.6718292236328125, "learning_rate": 4.9593099815643934e-05, "loss": 2.0331, "step": 313 }, { "epoch": 0.00826968659468001, "grad_norm": 6.395870208740234, "learning_rate": 4.959178298656835e-05, "loss": 1.3781, "step": 314 }, { "epoch": 0.008296023176191731, "grad_norm": 2.874675989151001, "learning_rate": 4.959046615749276e-05, "loss": 2.1396, "step": 315 }, { "epoch": 0.00832235975770345, "grad_norm": 2.8582258224487305, "learning_rate": 4.9589149328417174e-05, "loss": 1.6284, "step": 316 }, { "epoch": 0.008348696339215169, "grad_norm": 3.6091315746307373, "learning_rate": 4.958783249934158e-05, "loss": 1.5799, "step": 317 }, { "epoch": 0.00837503292072689, "grad_norm": 3.2684743404388428, "learning_rate": 4.9586515670266005e-05, "loss": 1.8535, "step": 318 }, { "epoch": 0.008401369502238609, "grad_norm": 2.686674118041992, "learning_rate": 4.9585198841190414e-05, "loss": 1.7055, "step": 319 }, { "epoch": 0.00842770608375033, "grad_norm": 2.14872670173645, 
"learning_rate": 4.958388201211483e-05, "loss": 1.6215, "step": 320 }, { "epoch": 0.008454042665262049, "grad_norm": 2.932607650756836, "learning_rate": 4.9582565183039245e-05, "loss": 1.7547, "step": 321 }, { "epoch": 0.00848037924677377, "grad_norm": 2.8945400714874268, "learning_rate": 4.9581248353963654e-05, "loss": 1.7845, "step": 322 }, { "epoch": 0.008506715828285489, "grad_norm": 3.505058526992798, "learning_rate": 4.957993152488808e-05, "loss": 2.6208, "step": 323 }, { "epoch": 0.008533052409797208, "grad_norm": 2.4234344959259033, "learning_rate": 4.9578614695812485e-05, "loss": 2.0702, "step": 324 }, { "epoch": 0.008559388991308928, "grad_norm": 2.21173095703125, "learning_rate": 4.95772978667369e-05, "loss": 1.2613, "step": 325 }, { "epoch": 0.008585725572820647, "grad_norm": 2.6429340839385986, "learning_rate": 4.957598103766131e-05, "loss": 1.7807, "step": 326 }, { "epoch": 0.008612062154332368, "grad_norm": 3.20620059967041, "learning_rate": 4.957466420858573e-05, "loss": 2.1573, "step": 327 }, { "epoch": 0.008638398735844087, "grad_norm": 2.7805612087249756, "learning_rate": 4.957334737951014e-05, "loss": 1.8478, "step": 328 }, { "epoch": 0.008664735317355806, "grad_norm": 5.594062328338623, "learning_rate": 4.957203055043456e-05, "loss": 2.3134, "step": 329 }, { "epoch": 0.008691071898867527, "grad_norm": 3.3740992546081543, "learning_rate": 4.957071372135897e-05, "loss": 1.9, "step": 330 }, { "epoch": 0.008717408480379246, "grad_norm": 4.023157596588135, "learning_rate": 4.956939689228338e-05, "loss": 2.2293, "step": 331 }, { "epoch": 0.008743745061890967, "grad_norm": 4.384700775146484, "learning_rate": 4.9568080063207803e-05, "loss": 2.7289, "step": 332 }, { "epoch": 0.008770081643402686, "grad_norm": 3.8390510082244873, "learning_rate": 4.956676323413221e-05, "loss": 1.9188, "step": 333 }, { "epoch": 0.008796418224914407, "grad_norm": 6.378127574920654, "learning_rate": 4.956544640505663e-05, "loss": 1.3166, "step": 334 }, { "epoch": 
0.008822754806426126, "grad_norm": 5.20752477645874, "learning_rate": 4.956412957598104e-05, "loss": 1.0243, "step": 335 }, { "epoch": 0.008849091387937845, "grad_norm": 4.862656116485596, "learning_rate": 4.956281274690545e-05, "loss": 2.7114, "step": 336 }, { "epoch": 0.008875427969449566, "grad_norm": 4.407208442687988, "learning_rate": 4.956149591782987e-05, "loss": 2.0917, "step": 337 }, { "epoch": 0.008901764550961285, "grad_norm": 4.253674030303955, "learning_rate": 4.9560179088754283e-05, "loss": 1.4495, "step": 338 }, { "epoch": 0.008928101132473006, "grad_norm": 4.106147766113281, "learning_rate": 4.955886225967869e-05, "loss": 1.1252, "step": 339 }, { "epoch": 0.008954437713984725, "grad_norm": 3.6390883922576904, "learning_rate": 4.955754543060311e-05, "loss": 2.0238, "step": 340 }, { "epoch": 0.008980774295496445, "grad_norm": 2.9469077587127686, "learning_rate": 4.9556228601527524e-05, "loss": 2.4644, "step": 341 }, { "epoch": 0.009007110877008164, "grad_norm": 2.9715445041656494, "learning_rate": 4.955491177245194e-05, "loss": 1.2653, "step": 342 }, { "epoch": 0.009033447458519884, "grad_norm": 2.7931387424468994, "learning_rate": 4.9553594943376355e-05, "loss": 2.2082, "step": 343 }, { "epoch": 0.009059784040031604, "grad_norm": 4.04490327835083, "learning_rate": 4.9552278114300764e-05, "loss": 1.8727, "step": 344 }, { "epoch": 0.009086120621543323, "grad_norm": 2.391226053237915, "learning_rate": 4.955096128522518e-05, "loss": 1.5419, "step": 345 }, { "epoch": 0.009112457203055044, "grad_norm": 4.560610771179199, "learning_rate": 4.9549644456149595e-05, "loss": 1.168, "step": 346 }, { "epoch": 0.009138793784566763, "grad_norm": 2.3792755603790283, "learning_rate": 4.954832762707401e-05, "loss": 1.6401, "step": 347 }, { "epoch": 0.009165130366078482, "grad_norm": 2.6933135986328125, "learning_rate": 4.954701079799842e-05, "loss": 1.8095, "step": 348 }, { "epoch": 0.009191466947590203, "grad_norm": 2.3354787826538086, "learning_rate": 
4.9545693968922835e-05, "loss": 2.004, "step": 349 }, { "epoch": 0.009217803529101922, "grad_norm": 4.505835056304932, "learning_rate": 4.954437713984725e-05, "loss": 1.4959, "step": 350 }, { "epoch": 0.009244140110613643, "grad_norm": 9.2621431350708, "learning_rate": 4.9543060310771666e-05, "loss": 1.2403, "step": 351 }, { "epoch": 0.009270476692125362, "grad_norm": 2.8335564136505127, "learning_rate": 4.954174348169608e-05, "loss": 1.5436, "step": 352 }, { "epoch": 0.009296813273637083, "grad_norm": 2.834834337234497, "learning_rate": 4.954042665262049e-05, "loss": 2.4434, "step": 353 }, { "epoch": 0.009323149855148802, "grad_norm": 2.5483241081237793, "learning_rate": 4.9539109823544906e-05, "loss": 1.8674, "step": 354 }, { "epoch": 0.00934948643666052, "grad_norm": 4.1416916847229, "learning_rate": 4.9537792994469315e-05, "loss": 1.6774, "step": 355 }, { "epoch": 0.009375823018172242, "grad_norm": 3.8100359439849854, "learning_rate": 4.953647616539374e-05, "loss": 1.3106, "step": 356 }, { "epoch": 0.00940215959968396, "grad_norm": 3.3069844245910645, "learning_rate": 4.9535159336318146e-05, "loss": 0.7217, "step": 357 }, { "epoch": 0.009428496181195681, "grad_norm": 3.8136091232299805, "learning_rate": 4.953384250724256e-05, "loss": 0.8796, "step": 358 }, { "epoch": 0.0094548327627074, "grad_norm": 3.232954263687134, "learning_rate": 4.953252567816698e-05, "loss": 2.0419, "step": 359 }, { "epoch": 0.00948116934421912, "grad_norm": 4.396567344665527, "learning_rate": 4.953120884909139e-05, "loss": 1.7555, "step": 360 }, { "epoch": 0.00950750592573084, "grad_norm": 3.195117712020874, "learning_rate": 4.952989202001581e-05, "loss": 2.1575, "step": 361 }, { "epoch": 0.00953384250724256, "grad_norm": 4.96207857131958, "learning_rate": 4.952857519094022e-05, "loss": 1.5802, "step": 362 }, { "epoch": 0.00956017908875428, "grad_norm": 3.737401008605957, "learning_rate": 4.952725836186463e-05, "loss": 1.8285, "step": 363 }, { "epoch": 0.009586515670266, "grad_norm": 
2.9919257164001465, "learning_rate": 4.952594153278904e-05, "loss": 2.0902, "step": 364 }, { "epoch": 0.00961285225177772, "grad_norm": 5.612448692321777, "learning_rate": 4.9524624703713464e-05, "loss": 2.0094, "step": 365 }, { "epoch": 0.009639188833289439, "grad_norm": 2.9793035984039307, "learning_rate": 4.952330787463787e-05, "loss": 1.1435, "step": 366 }, { "epoch": 0.009665525414801158, "grad_norm": 7.405142307281494, "learning_rate": 4.952199104556229e-05, "loss": 0.7576, "step": 367 }, { "epoch": 0.009691861996312879, "grad_norm": 5.179115295410156, "learning_rate": 4.9520674216486704e-05, "loss": 1.5322, "step": 368 }, { "epoch": 0.009718198577824598, "grad_norm": 3.821916341781616, "learning_rate": 4.951935738741111e-05, "loss": 1.9987, "step": 369 }, { "epoch": 0.009744535159336319, "grad_norm": 3.794161796569824, "learning_rate": 4.9518040558335535e-05, "loss": 2.6062, "step": 370 }, { "epoch": 0.009770871740848038, "grad_norm": 4.306283473968506, "learning_rate": 4.9516723729259944e-05, "loss": 1.7716, "step": 371 }, { "epoch": 0.009797208322359757, "grad_norm": 3.607884645462036, "learning_rate": 4.951540690018436e-05, "loss": 0.8448, "step": 372 }, { "epoch": 0.009823544903871478, "grad_norm": 3.094651222229004, "learning_rate": 4.951409007110877e-05, "loss": 2.1413, "step": 373 }, { "epoch": 0.009849881485383197, "grad_norm": 6.7893829345703125, "learning_rate": 4.951277324203319e-05, "loss": 1.5486, "step": 374 }, { "epoch": 0.009876218066894918, "grad_norm": 3.5733284950256348, "learning_rate": 4.95114564129576e-05, "loss": 2.0736, "step": 375 }, { "epoch": 0.009902554648406637, "grad_norm": 3.4923906326293945, "learning_rate": 4.9510139583882015e-05, "loss": 1.6066, "step": 376 }, { "epoch": 0.009928891229918357, "grad_norm": 2.3045380115509033, "learning_rate": 4.950882275480643e-05, "loss": 1.8924, "step": 377 }, { "epoch": 0.009955227811430076, "grad_norm": 2.6986491680145264, "learning_rate": 4.950750592573084e-05, "loss": 2.1344, "step": 
378 }, { "epoch": 0.009981564392941795, "grad_norm": 6.939801216125488, "learning_rate": 4.950618909665526e-05, "loss": 1.8125, "step": 379 }, { "epoch": 0.010007900974453516, "grad_norm": 2.977921485900879, "learning_rate": 4.950487226757967e-05, "loss": 2.3517, "step": 380 }, { "epoch": 0.010034237555965235, "grad_norm": 5.064753532409668, "learning_rate": 4.9503555438504086e-05, "loss": 1.9185, "step": 381 }, { "epoch": 0.010060574137476956, "grad_norm": 3.0815107822418213, "learning_rate": 4.9502238609428495e-05, "loss": 1.2712, "step": 382 }, { "epoch": 0.010086910718988675, "grad_norm": 3.1293399333953857, "learning_rate": 4.950092178035291e-05, "loss": 1.7528, "step": 383 }, { "epoch": 0.010113247300500394, "grad_norm": 2.841268539428711, "learning_rate": 4.9499604951277326e-05, "loss": 2.5219, "step": 384 }, { "epoch": 0.010139583882012115, "grad_norm": 2.7860684394836426, "learning_rate": 4.949828812220174e-05, "loss": 1.7577, "step": 385 }, { "epoch": 0.010165920463523834, "grad_norm": 5.101641654968262, "learning_rate": 4.949697129312615e-05, "loss": 1.5768, "step": 386 }, { "epoch": 0.010192257045035555, "grad_norm": 2.6400580406188965, "learning_rate": 4.9495654464050566e-05, "loss": 1.6106, "step": 387 }, { "epoch": 0.010218593626547274, "grad_norm": 3.29453182220459, "learning_rate": 4.949433763497498e-05, "loss": 2.3339, "step": 388 }, { "epoch": 0.010244930208058995, "grad_norm": 3.2891499996185303, "learning_rate": 4.94930208058994e-05, "loss": 2.093, "step": 389 }, { "epoch": 0.010271266789570714, "grad_norm": 3.197697162628174, "learning_rate": 4.949170397682381e-05, "loss": 1.8503, "step": 390 }, { "epoch": 0.010297603371082433, "grad_norm": 3.106626510620117, "learning_rate": 4.949038714774822e-05, "loss": 1.5676, "step": 391 }, { "epoch": 0.010323939952594154, "grad_norm": 3.074042320251465, "learning_rate": 4.948907031867264e-05, "loss": 2.0652, "step": 392 }, { "epoch": 0.010350276534105873, "grad_norm": 5.854307174682617, "learning_rate": 
4.948775348959705e-05, "loss": 1.3206, "step": 393 }, { "epoch": 0.010376613115617593, "grad_norm": 4.576664447784424, "learning_rate": 4.948643666052147e-05, "loss": 1.5156, "step": 394 }, { "epoch": 0.010402949697129312, "grad_norm": 2.5585691928863525, "learning_rate": 4.948511983144588e-05, "loss": 1.7142, "step": 395 }, { "epoch": 0.010429286278641032, "grad_norm": 7.966944217681885, "learning_rate": 4.948380300237029e-05, "loss": 1.8537, "step": 396 }, { "epoch": 0.010455622860152752, "grad_norm": 2.7815499305725098, "learning_rate": 4.948248617329471e-05, "loss": 2.3155, "step": 397 }, { "epoch": 0.010481959441664471, "grad_norm": 2.548046827316284, "learning_rate": 4.9481169344219124e-05, "loss": 1.4609, "step": 398 }, { "epoch": 0.010508296023176192, "grad_norm": 3.1576316356658936, "learning_rate": 4.947985251514354e-05, "loss": 2.3327, "step": 399 }, { "epoch": 0.010534632604687911, "grad_norm": 4.465269088745117, "learning_rate": 4.947853568606795e-05, "loss": 1.3708, "step": 400 }, { "epoch": 0.010560969186199632, "grad_norm": 3.72763729095459, "learning_rate": 4.9477218856992364e-05, "loss": 2.9096, "step": 401 }, { "epoch": 0.010587305767711351, "grad_norm": 2.7007408142089844, "learning_rate": 4.947590202791677e-05, "loss": 1.7649, "step": 402 }, { "epoch": 0.01061364234922307, "grad_norm": 3.6093926429748535, "learning_rate": 4.9474585198841196e-05, "loss": 2.3405, "step": 403 }, { "epoch": 0.010639978930734791, "grad_norm": 3.598163366317749, "learning_rate": 4.9473268369765605e-05, "loss": 2.168, "step": 404 }, { "epoch": 0.01066631551224651, "grad_norm": 6.245233058929443, "learning_rate": 4.947195154069002e-05, "loss": 2.1714, "step": 405 }, { "epoch": 0.01069265209375823, "grad_norm": 3.7399916648864746, "learning_rate": 4.9470634711614436e-05, "loss": 2.121, "step": 406 }, { "epoch": 0.01071898867526995, "grad_norm": 3.036510467529297, "learning_rate": 4.946931788253885e-05, "loss": 1.5228, "step": 407 }, { "epoch": 0.01074532525678167, 
"grad_norm": 4.524996757507324, "learning_rate": 4.946800105346327e-05, "loss": 2.098, "step": 408 }, { "epoch": 0.01077166183829339, "grad_norm": 3.4973762035369873, "learning_rate": 4.9466684224387676e-05, "loss": 2.0426, "step": 409 }, { "epoch": 0.010797998419805109, "grad_norm": 3.5525729656219482, "learning_rate": 4.946536739531209e-05, "loss": 1.6071, "step": 410 }, { "epoch": 0.01082433500131683, "grad_norm": 8.208961486816406, "learning_rate": 4.94640505662365e-05, "loss": 1.7934, "step": 411 }, { "epoch": 0.010850671582828549, "grad_norm": 3.2176403999328613, "learning_rate": 4.946273373716092e-05, "loss": 1.9724, "step": 412 }, { "epoch": 0.01087700816434027, "grad_norm": 3.4763741493225098, "learning_rate": 4.946141690808533e-05, "loss": 0.4135, "step": 413 }, { "epoch": 0.010903344745851988, "grad_norm": 4.857624053955078, "learning_rate": 4.946010007900975e-05, "loss": 1.4289, "step": 414 }, { "epoch": 0.010929681327363707, "grad_norm": 3.0980494022369385, "learning_rate": 4.945878324993416e-05, "loss": 1.68, "step": 415 }, { "epoch": 0.010956017908875428, "grad_norm": 2.611757755279541, "learning_rate": 4.945746642085857e-05, "loss": 1.987, "step": 416 }, { "epoch": 0.010982354490387147, "grad_norm": 3.18953537940979, "learning_rate": 4.9456149591782994e-05, "loss": 1.9162, "step": 417 }, { "epoch": 0.011008691071898868, "grad_norm": 1.9519106149673462, "learning_rate": 4.94548327627074e-05, "loss": 1.7891, "step": 418 }, { "epoch": 0.011035027653410587, "grad_norm": 2.3198137283325195, "learning_rate": 4.945351593363182e-05, "loss": 1.6574, "step": 419 }, { "epoch": 0.011061364234922308, "grad_norm": 2.448354482650757, "learning_rate": 4.945219910455623e-05, "loss": 1.4682, "step": 420 }, { "epoch": 0.011087700816434027, "grad_norm": 3.6776375770568848, "learning_rate": 4.945088227548064e-05, "loss": 1.6907, "step": 421 }, { "epoch": 0.011114037397945746, "grad_norm": 5.139711380004883, "learning_rate": 4.944956544640506e-05, "loss": 2.0815, "step": 
422 }, { "epoch": 0.011140373979457467, "grad_norm": 3.2158284187316895, "learning_rate": 4.9448248617329474e-05, "loss": 2.2924, "step": 423 }, { "epoch": 0.011166710560969186, "grad_norm": 3.731276035308838, "learning_rate": 4.944693178825389e-05, "loss": 1.5524, "step": 424 }, { "epoch": 0.011193047142480907, "grad_norm": 2.7019474506378174, "learning_rate": 4.94456149591783e-05, "loss": 2.0697, "step": 425 }, { "epoch": 0.011219383723992626, "grad_norm": 4.75086784362793, "learning_rate": 4.944429813010272e-05, "loss": 1.8361, "step": 426 }, { "epoch": 0.011245720305504345, "grad_norm": 2.408961057662964, "learning_rate": 4.944298130102713e-05, "loss": 1.6186, "step": 427 }, { "epoch": 0.011272056887016066, "grad_norm": 2.6383657455444336, "learning_rate": 4.9441664471951545e-05, "loss": 2.0564, "step": 428 }, { "epoch": 0.011298393468527785, "grad_norm": 2.8923819065093994, "learning_rate": 4.9440347642875954e-05, "loss": 1.1521, "step": 429 }, { "epoch": 0.011324730050039505, "grad_norm": 4.4877424240112305, "learning_rate": 4.943903081380037e-05, "loss": 1.5687, "step": 430 }, { "epoch": 0.011351066631551224, "grad_norm": 3.236896514892578, "learning_rate": 4.9437713984724785e-05, "loss": 1.7857, "step": 431 }, { "epoch": 0.011377403213062945, "grad_norm": 2.7711963653564453, "learning_rate": 4.94363971556492e-05, "loss": 2.5365, "step": 432 }, { "epoch": 0.011403739794574664, "grad_norm": 2.926581859588623, "learning_rate": 4.9435080326573616e-05, "loss": 2.3504, "step": 433 }, { "epoch": 0.011430076376086383, "grad_norm": 2.491097927093506, "learning_rate": 4.9433763497498025e-05, "loss": 1.8617, "step": 434 }, { "epoch": 0.011456412957598104, "grad_norm": 2.979182243347168, "learning_rate": 4.943244666842244e-05, "loss": 1.6826, "step": 435 }, { "epoch": 0.011482749539109823, "grad_norm": 8.16455364227295, "learning_rate": 4.9431129839346856e-05, "loss": 1.5242, "step": 436 }, { "epoch": 0.011509086120621544, "grad_norm": 3.306053400039673, 
"learning_rate": 4.942981301027127e-05, "loss": 2.1168, "step": 437 }, { "epoch": 0.011535422702133263, "grad_norm": 2.775508403778076, "learning_rate": 4.942849618119568e-05, "loss": 2.3886, "step": 438 }, { "epoch": 0.011561759283644982, "grad_norm": 3.597965717315674, "learning_rate": 4.9427179352120096e-05, "loss": 1.5943, "step": 439 }, { "epoch": 0.011588095865156703, "grad_norm": 4.4641289710998535, "learning_rate": 4.942586252304451e-05, "loss": 1.3134, "step": 440 }, { "epoch": 0.011614432446668422, "grad_norm": 2.697011709213257, "learning_rate": 4.942454569396893e-05, "loss": 1.9472, "step": 441 }, { "epoch": 0.011640769028180143, "grad_norm": 3.3379714488983154, "learning_rate": 4.9423228864893336e-05, "loss": 2.0254, "step": 442 }, { "epoch": 0.011667105609691862, "grad_norm": 4.314624786376953, "learning_rate": 4.942191203581775e-05, "loss": 1.5491, "step": 443 }, { "epoch": 0.011693442191203583, "grad_norm": 4.256104469299316, "learning_rate": 4.942059520674217e-05, "loss": 1.7194, "step": 444 }, { "epoch": 0.011719778772715302, "grad_norm": 4.3630852699279785, "learning_rate": 4.941927837766658e-05, "loss": 2.1208, "step": 445 }, { "epoch": 0.01174611535422702, "grad_norm": 3.4363515377044678, "learning_rate": 4.9417961548591e-05, "loss": 1.9275, "step": 446 }, { "epoch": 0.011772451935738741, "grad_norm": 2.999098062515259, "learning_rate": 4.941664471951541e-05, "loss": 1.8565, "step": 447 }, { "epoch": 0.01179878851725046, "grad_norm": 4.614634037017822, "learning_rate": 4.941532789043982e-05, "loss": 1.7001, "step": 448 }, { "epoch": 0.011825125098762181, "grad_norm": 3.698587417602539, "learning_rate": 4.941401106136423e-05, "loss": 0.8016, "step": 449 }, { "epoch": 0.0118514616802739, "grad_norm": 6.296628475189209, "learning_rate": 4.9412694232288654e-05, "loss": 1.8436, "step": 450 }, { "epoch": 0.01187779826178562, "grad_norm": 2.57922101020813, "learning_rate": 4.941137740321306e-05, "loss": 1.5764, "step": 451 }, { "epoch": 
0.01190413484329734, "grad_norm": 2.7335686683654785, "learning_rate": 4.941006057413748e-05, "loss": 1.8235, "step": 452 }, { "epoch": 0.01193047142480906, "grad_norm": 2.7058944702148438, "learning_rate": 4.9408743745061894e-05, "loss": 2.3749, "step": 453 }, { "epoch": 0.01195680800632078, "grad_norm": 2.696030378341675, "learning_rate": 4.94074269159863e-05, "loss": 1.5161, "step": 454 }, { "epoch": 0.011983144587832499, "grad_norm": 3.3722379207611084, "learning_rate": 4.9406110086910725e-05, "loss": 1.9797, "step": 455 }, { "epoch": 0.01200948116934422, "grad_norm": 3.0010156631469727, "learning_rate": 4.9404793257835134e-05, "loss": 1.4081, "step": 456 }, { "epoch": 0.012035817750855939, "grad_norm": 4.13242769241333, "learning_rate": 4.940347642875955e-05, "loss": 2.4764, "step": 457 }, { "epoch": 0.012062154332367658, "grad_norm": 4.92889928817749, "learning_rate": 4.940215959968396e-05, "loss": 1.9282, "step": 458 }, { "epoch": 0.012088490913879379, "grad_norm": 5.652385711669922, "learning_rate": 4.940084277060838e-05, "loss": 1.3044, "step": 459 }, { "epoch": 0.012114827495391098, "grad_norm": 3.8656020164489746, "learning_rate": 4.939952594153279e-05, "loss": 1.5397, "step": 460 }, { "epoch": 0.012141164076902819, "grad_norm": 2.3113582134246826, "learning_rate": 4.9398209112457205e-05, "loss": 1.9664, "step": 461 }, { "epoch": 0.012167500658414538, "grad_norm": 2.551178216934204, "learning_rate": 4.939689228338162e-05, "loss": 0.413, "step": 462 }, { "epoch": 0.012193837239926257, "grad_norm": 2.828598737716675, "learning_rate": 4.939557545430603e-05, "loss": 1.9058, "step": 463 }, { "epoch": 0.012220173821437978, "grad_norm": 5.021803379058838, "learning_rate": 4.939425862523045e-05, "loss": 0.8865, "step": 464 }, { "epoch": 0.012246510402949697, "grad_norm": 4.49089241027832, "learning_rate": 4.939294179615486e-05, "loss": 1.7977, "step": 465 }, { "epoch": 0.012272846984461417, "grad_norm": 2.908942937850952, "learning_rate": 4.939162496707928e-05, 
"loss": 1.941, "step": 466 }, { "epoch": 0.012299183565973136, "grad_norm": 3.2470862865448, "learning_rate": 4.9390308138003686e-05, "loss": 2.221, "step": 467 }, { "epoch": 0.012325520147484857, "grad_norm": 3.354400396347046, "learning_rate": 4.93889913089281e-05, "loss": 1.1864, "step": 468 }, { "epoch": 0.012351856728996576, "grad_norm": 2.356088638305664, "learning_rate": 4.938767447985252e-05, "loss": 1.8456, "step": 469 }, { "epoch": 0.012378193310508295, "grad_norm": 2.8001549243927, "learning_rate": 4.938635765077693e-05, "loss": 1.9366, "step": 470 }, { "epoch": 0.012404529892020016, "grad_norm": 5.683740615844727, "learning_rate": 4.938504082170135e-05, "loss": 0.8475, "step": 471 }, { "epoch": 0.012430866473531735, "grad_norm": 3.8350062370300293, "learning_rate": 4.938372399262576e-05, "loss": 2.0938, "step": 472 }, { "epoch": 0.012457203055043456, "grad_norm": 4.309392929077148, "learning_rate": 4.938240716355018e-05, "loss": 1.1875, "step": 473 }, { "epoch": 0.012483539636555175, "grad_norm": 2.567596912384033, "learning_rate": 4.938109033447459e-05, "loss": 1.8618, "step": 474 }, { "epoch": 0.012509876218066896, "grad_norm": 4.895605564117432, "learning_rate": 4.9379773505399004e-05, "loss": 1.7448, "step": 475 }, { "epoch": 0.012536212799578615, "grad_norm": 2.557440757751465, "learning_rate": 4.937845667632341e-05, "loss": 1.2474, "step": 476 }, { "epoch": 0.012562549381090334, "grad_norm": 2.6986734867095947, "learning_rate": 4.937713984724783e-05, "loss": 1.8171, "step": 477 }, { "epoch": 0.012588885962602055, "grad_norm": 3.2066280841827393, "learning_rate": 4.9375823018172244e-05, "loss": 1.8473, "step": 478 }, { "epoch": 0.012615222544113774, "grad_norm": 3.2558040618896484, "learning_rate": 4.937450618909666e-05, "loss": 1.7312, "step": 479 }, { "epoch": 0.012641559125625495, "grad_norm": 2.3930885791778564, "learning_rate": 4.9373189360021075e-05, "loss": 1.7677, "step": 480 }, { "epoch": 0.012667895707137214, "grad_norm": 
3.230320453643799, "learning_rate": 4.9371872530945484e-05, "loss": 1.7896, "step": 481 }, { "epoch": 0.012694232288648933, "grad_norm": 4.14604377746582, "learning_rate": 4.93705557018699e-05, "loss": 1.4854, "step": 482 }, { "epoch": 0.012720568870160653, "grad_norm": 3.058246612548828, "learning_rate": 4.9369238872794315e-05, "loss": 1.3328, "step": 483 }, { "epoch": 0.012746905451672372, "grad_norm": 4.122186183929443, "learning_rate": 4.936792204371873e-05, "loss": 1.7708, "step": 484 }, { "epoch": 0.012773242033184093, "grad_norm": 3.2511589527130127, "learning_rate": 4.936660521464314e-05, "loss": 2.2072, "step": 485 }, { "epoch": 0.012799578614695812, "grad_norm": 2.4089932441711426, "learning_rate": 4.9365288385567555e-05, "loss": 2.111, "step": 486 }, { "epoch": 0.012825915196207533, "grad_norm": 4.144657135009766, "learning_rate": 4.936397155649197e-05, "loss": 2.2266, "step": 487 }, { "epoch": 0.012852251777719252, "grad_norm": 3.3680617809295654, "learning_rate": 4.9362654727416386e-05, "loss": 2.0625, "step": 488 }, { "epoch": 0.012878588359230971, "grad_norm": 5.534539699554443, "learning_rate": 4.9361337898340795e-05, "loss": 1.2348, "step": 489 }, { "epoch": 0.012904924940742692, "grad_norm": 11.291756629943848, "learning_rate": 4.936002106926521e-05, "loss": 2.3942, "step": 490 }, { "epoch": 0.012931261522254411, "grad_norm": 10.59220027923584, "learning_rate": 4.9358704240189626e-05, "loss": 1.3182, "step": 491 }, { "epoch": 0.012957598103766132, "grad_norm": 4.226751327514648, "learning_rate": 4.935738741111404e-05, "loss": 2.6083, "step": 492 }, { "epoch": 0.012983934685277851, "grad_norm": 2.6384074687957764, "learning_rate": 4.935607058203846e-05, "loss": 0.3831, "step": 493 }, { "epoch": 0.01301027126678957, "grad_norm": 3.1196553707122803, "learning_rate": 4.9354753752962866e-05, "loss": 1.7627, "step": 494 }, { "epoch": 0.01303660784830129, "grad_norm": 8.276817321777344, "learning_rate": 4.935343692388728e-05, "loss": 1.3917, "step": 495 
}, { "epoch": 0.01306294442981301, "grad_norm": 3.178133726119995, "learning_rate": 4.935212009481169e-05, "loss": 1.6477, "step": 496 }, { "epoch": 0.01308928101132473, "grad_norm": 3.6952528953552246, "learning_rate": 4.935080326573611e-05, "loss": 0.7113, "step": 497 }, { "epoch": 0.01311561759283645, "grad_norm": 3.7713797092437744, "learning_rate": 4.934948643666052e-05, "loss": 2.0973, "step": 498 }, { "epoch": 0.01314195417434817, "grad_norm": 3.135864496231079, "learning_rate": 4.934816960758494e-05, "loss": 1.3988, "step": 499 }, { "epoch": 0.01316829075585989, "grad_norm": 3.300065040588379, "learning_rate": 4.934685277850935e-05, "loss": 1.7863, "step": 500 }, { "epoch": 0.013194627337371609, "grad_norm": 2.766282796859741, "learning_rate": 4.934553594943376e-05, "loss": 1.2625, "step": 501 }, { "epoch": 0.01322096391888333, "grad_norm": 7.230177879333496, "learning_rate": 4.9344219120358184e-05, "loss": 1.3097, "step": 502 }, { "epoch": 0.013247300500395048, "grad_norm": 3.0049235820770264, "learning_rate": 4.934290229128259e-05, "loss": 2.1888, "step": 503 }, { "epoch": 0.01327363708190677, "grad_norm": 2.8356151580810547, "learning_rate": 4.934158546220701e-05, "loss": 1.573, "step": 504 }, { "epoch": 0.013299973663418488, "grad_norm": 3.1539864540100098, "learning_rate": 4.934026863313142e-05, "loss": 0.6297, "step": 505 }, { "epoch": 0.013326310244930207, "grad_norm": 6.145493984222412, "learning_rate": 4.933895180405584e-05, "loss": 1.6996, "step": 506 }, { "epoch": 0.013352646826441928, "grad_norm": 2.7827095985412598, "learning_rate": 4.933763497498025e-05, "loss": 2.1537, "step": 507 }, { "epoch": 0.013378983407953647, "grad_norm": 5.041752815246582, "learning_rate": 4.9336318145904664e-05, "loss": 1.9504, "step": 508 }, { "epoch": 0.013405319989465368, "grad_norm": 2.551994562149048, "learning_rate": 4.933500131682908e-05, "loss": 1.9604, "step": 509 }, { "epoch": 0.013431656570977087, "grad_norm": 7.829330921173096, "learning_rate": 
4.933368448775349e-05, "loss": 1.1279, "step": 510 }, { "epoch": 0.013457993152488808, "grad_norm": 5.9089789390563965, "learning_rate": 4.933236765867791e-05, "loss": 1.7507, "step": 511 }, { "epoch": 0.013484329734000527, "grad_norm": 2.3757100105285645, "learning_rate": 4.933105082960232e-05, "loss": 2.0618, "step": 512 }, { "epoch": 0.013510666315512246, "grad_norm": 5.670836925506592, "learning_rate": 4.9329734000526735e-05, "loss": 0.9599, "step": 513 }, { "epoch": 0.013537002897023967, "grad_norm": 5.219981670379639, "learning_rate": 4.9328417171451144e-05, "loss": 1.4321, "step": 514 }, { "epoch": 0.013563339478535686, "grad_norm": 2.96686053276062, "learning_rate": 4.932710034237556e-05, "loss": 1.7634, "step": 515 }, { "epoch": 0.013589676060047406, "grad_norm": 3.4565460681915283, "learning_rate": 4.9325783513299975e-05, "loss": 1.6794, "step": 516 }, { "epoch": 0.013616012641559126, "grad_norm": 3.0890915393829346, "learning_rate": 4.932446668422439e-05, "loss": 1.9017, "step": 517 }, { "epoch": 0.013642349223070845, "grad_norm": 3.332601308822632, "learning_rate": 4.9323149855148806e-05, "loss": 1.7231, "step": 518 }, { "epoch": 0.013668685804582565, "grad_norm": 4.089598655700684, "learning_rate": 4.9321833026073215e-05, "loss": 1.1871, "step": 519 }, { "epoch": 0.013695022386094284, "grad_norm": 2.75116229057312, "learning_rate": 4.932051619699764e-05, "loss": 1.5234, "step": 520 }, { "epoch": 0.013721358967606005, "grad_norm": 3.515044927597046, "learning_rate": 4.9319199367922046e-05, "loss": 1.8234, "step": 521 }, { "epoch": 0.013747695549117724, "grad_norm": 5.515471935272217, "learning_rate": 4.931788253884646e-05, "loss": 1.3104, "step": 522 }, { "epoch": 0.013774032130629445, "grad_norm": 2.4399476051330566, "learning_rate": 4.931656570977087e-05, "loss": 2.2612, "step": 523 }, { "epoch": 0.013800368712141164, "grad_norm": 3.2772269248962402, "learning_rate": 4.9315248880695287e-05, "loss": 2.083, "step": 524 }, { "epoch": 
0.013826705293652883, "grad_norm": 2.3090224266052246, "learning_rate": 4.93139320516197e-05, "loss": 2.3116, "step": 525 }, { "epoch": 0.013853041875164604, "grad_norm": 2.9195845127105713, "learning_rate": 4.931261522254412e-05, "loss": 0.931, "step": 526 }, { "epoch": 0.013879378456676323, "grad_norm": 3.011740207672119, "learning_rate": 4.931129839346853e-05, "loss": 2.0923, "step": 527 }, { "epoch": 0.013905715038188044, "grad_norm": 2.624511241912842, "learning_rate": 4.930998156439294e-05, "loss": 1.6522, "step": 528 }, { "epoch": 0.013932051619699763, "grad_norm": 6.0027756690979, "learning_rate": 4.930866473531736e-05, "loss": 1.2117, "step": 529 }, { "epoch": 0.013958388201211482, "grad_norm": 3.0832457542419434, "learning_rate": 4.930734790624177e-05, "loss": 2.8007, "step": 530 }, { "epoch": 0.013984724782723203, "grad_norm": 3.3799095153808594, "learning_rate": 4.930603107716619e-05, "loss": 2.4809, "step": 531 }, { "epoch": 0.014011061364234922, "grad_norm": 3.8424103260040283, "learning_rate": 4.93047142480906e-05, "loss": 1.6528, "step": 532 }, { "epoch": 0.014037397945746643, "grad_norm": 9.509858131408691, "learning_rate": 4.930339741901501e-05, "loss": 2.5631, "step": 533 }, { "epoch": 0.014063734527258362, "grad_norm": 4.948150157928467, "learning_rate": 4.930208058993943e-05, "loss": 1.0504, "step": 534 }, { "epoch": 0.014090071108770082, "grad_norm": 3.894061803817749, "learning_rate": 4.9300763760863845e-05, "loss": 1.9647, "step": 535 }, { "epoch": 0.014116407690281801, "grad_norm": 2.8031275272369385, "learning_rate": 4.929944693178826e-05, "loss": 1.6106, "step": 536 }, { "epoch": 0.01414274427179352, "grad_norm": 2.3836934566497803, "learning_rate": 4.929813010271267e-05, "loss": 2.0547, "step": 537 }, { "epoch": 0.014169080853305241, "grad_norm": 4.073704242706299, "learning_rate": 4.9296813273637085e-05, "loss": 0.9845, "step": 538 }, { "epoch": 0.01419541743481696, "grad_norm": 2.5212810039520264, "learning_rate": 4.92954964445615e-05, 
"loss": 1.879, "step": 539 }, { "epoch": 0.014221754016328681, "grad_norm": 2.55657696723938, "learning_rate": 4.9294179615485916e-05, "loss": 1.8492, "step": 540 }, { "epoch": 0.0142480905978404, "grad_norm": 2.444204092025757, "learning_rate": 4.9292862786410325e-05, "loss": 1.7488, "step": 541 }, { "epoch": 0.014274427179352121, "grad_norm": 2.6518898010253906, "learning_rate": 4.929154595733474e-05, "loss": 1.8359, "step": 542 }, { "epoch": 0.01430076376086384, "grad_norm": 7.798391342163086, "learning_rate": 4.929022912825915e-05, "loss": 1.2831, "step": 543 }, { "epoch": 0.014327100342375559, "grad_norm": 3.268341302871704, "learning_rate": 4.928891229918357e-05, "loss": 0.7458, "step": 544 }, { "epoch": 0.01435343692388728, "grad_norm": 2.8737120628356934, "learning_rate": 4.928759547010798e-05, "loss": 1.043, "step": 545 }, { "epoch": 0.014379773505398999, "grad_norm": 2.81797194480896, "learning_rate": 4.9286278641032396e-05, "loss": 2.3844, "step": 546 }, { "epoch": 0.01440611008691072, "grad_norm": 2.3775641918182373, "learning_rate": 4.928496181195681e-05, "loss": 2.0042, "step": 547 }, { "epoch": 0.014432446668422439, "grad_norm": 2.2898850440979004, "learning_rate": 4.928364498288122e-05, "loss": 1.5706, "step": 548 }, { "epoch": 0.014458783249934158, "grad_norm": 3.3940882682800293, "learning_rate": 4.928232815380564e-05, "loss": 1.8806, "step": 549 }, { "epoch": 0.014485119831445879, "grad_norm": 4.997721195220947, "learning_rate": 4.928101132473005e-05, "loss": 2.2222, "step": 550 }, { "epoch": 0.014511456412957598, "grad_norm": 2.8004939556121826, "learning_rate": 4.927969449565447e-05, "loss": 1.9449, "step": 551 }, { "epoch": 0.014537792994469318, "grad_norm": 3.135756731033325, "learning_rate": 4.9278377666578876e-05, "loss": 1.8732, "step": 552 }, { "epoch": 0.014564129575981037, "grad_norm": 2.623967170715332, "learning_rate": 4.92770608375033e-05, "loss": 2.3925, "step": 553 }, { "epoch": 0.014590466157492758, "grad_norm": 
3.6214091777801514, "learning_rate": 4.927574400842771e-05, "loss": 1.5284, "step": 554 }, { "epoch": 0.014616802739004477, "grad_norm": 5.104559898376465, "learning_rate": 4.927442717935212e-05, "loss": 1.6069, "step": 555 }, { "epoch": 0.014643139320516196, "grad_norm": 6.233405113220215, "learning_rate": 4.927311035027654e-05, "loss": 1.5233, "step": 556 }, { "epoch": 0.014669475902027917, "grad_norm": 4.184660911560059, "learning_rate": 4.927179352120095e-05, "loss": 0.8444, "step": 557 }, { "epoch": 0.014695812483539636, "grad_norm": 4.182737827301025, "learning_rate": 4.927047669212537e-05, "loss": 2.0278, "step": 558 }, { "epoch": 0.014722149065051357, "grad_norm": 3.9111337661743164, "learning_rate": 4.926915986304978e-05, "loss": 1.534, "step": 559 }, { "epoch": 0.014748485646563076, "grad_norm": 3.3704192638397217, "learning_rate": 4.9267843033974194e-05, "loss": 2.1861, "step": 560 }, { "epoch": 0.014774822228074795, "grad_norm": 2.4734256267547607, "learning_rate": 4.92665262048986e-05, "loss": 2.3121, "step": 561 }, { "epoch": 0.014801158809586516, "grad_norm": 3.1234734058380127, "learning_rate": 4.926520937582302e-05, "loss": 1.4346, "step": 562 }, { "epoch": 0.014827495391098235, "grad_norm": 5.726578712463379, "learning_rate": 4.9263892546747434e-05, "loss": 1.6069, "step": 563 }, { "epoch": 0.014853831972609956, "grad_norm": 3.6641364097595215, "learning_rate": 4.926257571767185e-05, "loss": 1.6828, "step": 564 }, { "epoch": 0.014880168554121675, "grad_norm": 2.650238037109375, "learning_rate": 4.9261258888596265e-05, "loss": 1.6859, "step": 565 }, { "epoch": 0.014906505135633396, "grad_norm": 3.7750728130340576, "learning_rate": 4.9259942059520674e-05, "loss": 1.4728, "step": 566 }, { "epoch": 0.014932841717145115, "grad_norm": 4.534443378448486, "learning_rate": 4.925862523044509e-05, "loss": 1.7134, "step": 567 }, { "epoch": 0.014959178298656834, "grad_norm": 2.571948528289795, "learning_rate": 4.9257308401369505e-05, "loss": 0.7603, "step": 
568 }, { "epoch": 0.014985514880168554, "grad_norm": 3.9538166522979736, "learning_rate": 4.925599157229392e-05, "loss": 0.983, "step": 569 }, { "epoch": 0.015011851461680274, "grad_norm": 3.185822010040283, "learning_rate": 4.925467474321833e-05, "loss": 2.4089, "step": 570 }, { "epoch": 0.015038188043191994, "grad_norm": 3.3399198055267334, "learning_rate": 4.9253357914142745e-05, "loss": 2.342, "step": 571 }, { "epoch": 0.015064524624703713, "grad_norm": 4.784788131713867, "learning_rate": 4.925204108506716e-05, "loss": 1.4173, "step": 572 }, { "epoch": 0.015090861206215432, "grad_norm": 2.2170331478118896, "learning_rate": 4.9250724255991576e-05, "loss": 2.0383, "step": 573 }, { "epoch": 0.015117197787727153, "grad_norm": 4.919084072113037, "learning_rate": 4.924940742691599e-05, "loss": 1.6311, "step": 574 }, { "epoch": 0.015143534369238872, "grad_norm": 3.2493388652801514, "learning_rate": 4.92480905978404e-05, "loss": 1.2793, "step": 575 }, { "epoch": 0.015169870950750593, "grad_norm": 4.027344703674316, "learning_rate": 4.9246773768764816e-05, "loss": 1.116, "step": 576 }, { "epoch": 0.015196207532262312, "grad_norm": 3.5965209007263184, "learning_rate": 4.924545693968923e-05, "loss": 1.8381, "step": 577 }, { "epoch": 0.015222544113774033, "grad_norm": 3.134528636932373, "learning_rate": 4.924414011061365e-05, "loss": 2.4347, "step": 578 }, { "epoch": 0.015248880695285752, "grad_norm": 4.555936336517334, "learning_rate": 4.9242823281538056e-05, "loss": 1.6732, "step": 579 }, { "epoch": 0.015275217276797471, "grad_norm": 3.238778591156006, "learning_rate": 4.924150645246247e-05, "loss": 1.7157, "step": 580 }, { "epoch": 0.015301553858309192, "grad_norm": 3.3418161869049072, "learning_rate": 4.924018962338689e-05, "loss": 2.3644, "step": 581 }, { "epoch": 0.01532789043982091, "grad_norm": 2.421600818634033, "learning_rate": 4.92388727943113e-05, "loss": 1.5561, "step": 582 }, { "epoch": 0.015354227021332632, "grad_norm": 2.887904405593872, "learning_rate": 
4.923755596523572e-05, "loss": 2.036, "step": 583 }, { "epoch": 0.01538056360284435, "grad_norm": 5.282592296600342, "learning_rate": 4.923623913616013e-05, "loss": 0.9187, "step": 584 }, { "epoch": 0.01540690018435607, "grad_norm": 5.465512752532959, "learning_rate": 4.923492230708454e-05, "loss": 1.6847, "step": 585 }, { "epoch": 0.01543323676586779, "grad_norm": 3.950766086578369, "learning_rate": 4.923360547800896e-05, "loss": 1.1451, "step": 586 }, { "epoch": 0.01545957334737951, "grad_norm": 3.5993056297302246, "learning_rate": 4.9232288648933374e-05, "loss": 1.4428, "step": 587 }, { "epoch": 0.01548590992889123, "grad_norm": 2.4341535568237305, "learning_rate": 4.923097181985778e-05, "loss": 2.0307, "step": 588 }, { "epoch": 0.01551224651040295, "grad_norm": 5.462024688720703, "learning_rate": 4.92296549907822e-05, "loss": 2.4178, "step": 589 }, { "epoch": 0.01553858309191467, "grad_norm": 2.402330160140991, "learning_rate": 4.922833816170661e-05, "loss": 1.8485, "step": 590 }, { "epoch": 0.01556491967342639, "grad_norm": 2.640392541885376, "learning_rate": 4.922702133263103e-05, "loss": 1.4291, "step": 591 }, { "epoch": 0.015591256254938108, "grad_norm": 2.1738691329956055, "learning_rate": 4.922570450355544e-05, "loss": 1.9312, "step": 592 }, { "epoch": 0.015617592836449829, "grad_norm": 2.7656919956207275, "learning_rate": 4.9224387674479854e-05, "loss": 1.912, "step": 593 }, { "epoch": 0.015643929417961548, "grad_norm": 4.148161888122559, "learning_rate": 4.922307084540427e-05, "loss": 1.0477, "step": 594 }, { "epoch": 0.015670265999473267, "grad_norm": 3.2078702449798584, "learning_rate": 4.922175401632868e-05, "loss": 2.0759, "step": 595 }, { "epoch": 0.01569660258098499, "grad_norm": 4.229988098144531, "learning_rate": 4.92204371872531e-05, "loss": 1.3012, "step": 596 }, { "epoch": 0.01572293916249671, "grad_norm": 9.312508583068848, "learning_rate": 4.921912035817751e-05, "loss": 1.6295, "step": 597 }, { "epoch": 0.015749275744008428, "grad_norm": 
4.225595474243164, "learning_rate": 4.9217803529101926e-05, "loss": 0.9376, "step": 598 }, { "epoch": 0.015775612325520147, "grad_norm": 4.413676738739014, "learning_rate": 4.9216486700026334e-05, "loss": 1.6213, "step": 599 }, { "epoch": 0.015801948907031866, "grad_norm": 2.6570732593536377, "learning_rate": 4.921516987095075e-05, "loss": 0.8403, "step": 600 }, { "epoch": 0.01582828548854359, "grad_norm": 2.103667736053467, "learning_rate": 4.9213853041875166e-05, "loss": 2.0725, "step": 601 }, { "epoch": 0.015854622070055308, "grad_norm": 4.095400333404541, "learning_rate": 4.921253621279958e-05, "loss": 2.4883, "step": 602 }, { "epoch": 0.015880958651567027, "grad_norm": 4.9196672439575195, "learning_rate": 4.9211219383724e-05, "loss": 1.3018, "step": 603 }, { "epoch": 0.015907295233078746, "grad_norm": 4.2212653160095215, "learning_rate": 4.9209902554648406e-05, "loss": 1.6165, "step": 604 }, { "epoch": 0.015933631814590465, "grad_norm": 2.3772711753845215, "learning_rate": 4.920858572557283e-05, "loss": 1.5943, "step": 605 }, { "epoch": 0.015959968396102187, "grad_norm": 3.723097085952759, "learning_rate": 4.920726889649724e-05, "loss": 1.5913, "step": 606 }, { "epoch": 0.015986304977613906, "grad_norm": 3.3020520210266113, "learning_rate": 4.920595206742165e-05, "loss": 1.8639, "step": 607 }, { "epoch": 0.016012641559125625, "grad_norm": 3.7389028072357178, "learning_rate": 4.920463523834606e-05, "loss": 1.9168, "step": 608 }, { "epoch": 0.016038978140637344, "grad_norm": 5.393535614013672, "learning_rate": 4.920331840927048e-05, "loss": 1.916, "step": 609 }, { "epoch": 0.016065314722149063, "grad_norm": 6.769744873046875, "learning_rate": 4.920200158019489e-05, "loss": 1.4624, "step": 610 }, { "epoch": 0.016091651303660786, "grad_norm": 5.085109710693359, "learning_rate": 4.920068475111931e-05, "loss": 1.7557, "step": 611 }, { "epoch": 0.016117987885172505, "grad_norm": 2.350942611694336, "learning_rate": 4.9199367922043724e-05, "loss": 1.8141, "step": 612 
}, { "epoch": 0.016144324466684224, "grad_norm": 5.252973556518555, "learning_rate": 4.919805109296813e-05, "loss": 1.4946, "step": 613 }, { "epoch": 0.016170661048195943, "grad_norm": 5.195660591125488, "learning_rate": 4.919673426389255e-05, "loss": 1.6677, "step": 614 }, { "epoch": 0.016196997629707666, "grad_norm": 2.2531235218048096, "learning_rate": 4.9195417434816964e-05, "loss": 1.6768, "step": 615 }, { "epoch": 0.016223334211219385, "grad_norm": 2.1095333099365234, "learning_rate": 4.919410060574138e-05, "loss": 0.4763, "step": 616 }, { "epoch": 0.016249670792731104, "grad_norm": 6.1692023277282715, "learning_rate": 4.919278377666579e-05, "loss": 1.9035, "step": 617 }, { "epoch": 0.016276007374242823, "grad_norm": 3.5493061542510986, "learning_rate": 4.9191466947590204e-05, "loss": 1.9656, "step": 618 }, { "epoch": 0.016302343955754542, "grad_norm": 2.4924559593200684, "learning_rate": 4.919015011851462e-05, "loss": 1.7477, "step": 619 }, { "epoch": 0.016328680537266264, "grad_norm": 2.7681593894958496, "learning_rate": 4.9188833289439035e-05, "loss": 1.2489, "step": 620 }, { "epoch": 0.016355017118777983, "grad_norm": 2.5957839488983154, "learning_rate": 4.918751646036345e-05, "loss": 1.3904, "step": 621 }, { "epoch": 0.016381353700289703, "grad_norm": 4.6176228523254395, "learning_rate": 4.918619963128786e-05, "loss": 1.8987, "step": 622 }, { "epoch": 0.01640769028180142, "grad_norm": 2.1113476753234863, "learning_rate": 4.9184882802212275e-05, "loss": 1.2665, "step": 623 }, { "epoch": 0.01643402686331314, "grad_norm": 6.761231422424316, "learning_rate": 4.918356597313669e-05, "loss": 1.9924, "step": 624 }, { "epoch": 0.016460363444824863, "grad_norm": 4.207162857055664, "learning_rate": 4.9182249144061106e-05, "loss": 1.1325, "step": 625 }, { "epoch": 0.016486700026336582, "grad_norm": 5.167304039001465, "learning_rate": 4.9180932314985515e-05, "loss": 1.551, "step": 626 }, { "epoch": 0.0165130366078483, "grad_norm": 2.5618858337402344, "learning_rate": 
4.917961548590993e-05, "loss": 1.9741, "step": 627 }, { "epoch": 0.01653937318936002, "grad_norm": 2.5403671264648438, "learning_rate": 4.9178298656834346e-05, "loss": 2.673, "step": 628 }, { "epoch": 0.01656570977087174, "grad_norm": 4.460215091705322, "learning_rate": 4.917698182775876e-05, "loss": 1.5169, "step": 629 }, { "epoch": 0.016592046352383462, "grad_norm": 2.9325146675109863, "learning_rate": 4.917566499868318e-05, "loss": 0.7922, "step": 630 }, { "epoch": 0.01661838293389518, "grad_norm": 3.34602427482605, "learning_rate": 4.9174348169607586e-05, "loss": 1.6785, "step": 631 }, { "epoch": 0.0166447195154069, "grad_norm": 2.8243367671966553, "learning_rate": 4.9173031340532e-05, "loss": 2.3495, "step": 632 }, { "epoch": 0.01667105609691862, "grad_norm": 2.8493475914001465, "learning_rate": 4.917171451145641e-05, "loss": 1.6853, "step": 633 }, { "epoch": 0.016697392678430338, "grad_norm": 4.395769119262695, "learning_rate": 4.917039768238083e-05, "loss": 0.831, "step": 634 }, { "epoch": 0.01672372925994206, "grad_norm": 3.789914846420288, "learning_rate": 4.916908085330524e-05, "loss": 2.1424, "step": 635 }, { "epoch": 0.01675006584145378, "grad_norm": 5.724221706390381, "learning_rate": 4.916776402422966e-05, "loss": 1.2813, "step": 636 }, { "epoch": 0.0167764024229655, "grad_norm": 3.548966884613037, "learning_rate": 4.916644719515407e-05, "loss": 2.5177, "step": 637 }, { "epoch": 0.016802739004477218, "grad_norm": 2.3427090644836426, "learning_rate": 4.916513036607849e-05, "loss": 1.5009, "step": 638 }, { "epoch": 0.01682907558598894, "grad_norm": 2.6451728343963623, "learning_rate": 4.9163813537002904e-05, "loss": 2.0077, "step": 639 }, { "epoch": 0.01685541216750066, "grad_norm": 3.45279860496521, "learning_rate": 4.916249670792731e-05, "loss": 1.0719, "step": 640 }, { "epoch": 0.01688174874901238, "grad_norm": 5.000061988830566, "learning_rate": 4.916117987885173e-05, "loss": 1.411, "step": 641 }, { "epoch": 0.016908085330524097, "grad_norm": 
3.6413791179656982, "learning_rate": 4.915986304977614e-05, "loss": 0.9491, "step": 642 }, { "epoch": 0.016934421912035817, "grad_norm": 6.532812595367432, "learning_rate": 4.915854622070056e-05, "loss": 0.8769, "step": 643 }, { "epoch": 0.01696075849354754, "grad_norm": 2.975863218307495, "learning_rate": 4.915722939162497e-05, "loss": 1.6162, "step": 644 }, { "epoch": 0.016987095075059258, "grad_norm": 2.6892645359039307, "learning_rate": 4.9155912562549384e-05, "loss": 1.9541, "step": 645 }, { "epoch": 0.017013431656570977, "grad_norm": 7.905243396759033, "learning_rate": 4.915459573347379e-05, "loss": 2.6824, "step": 646 }, { "epoch": 0.017039768238082696, "grad_norm": 2.5596559047698975, "learning_rate": 4.915327890439821e-05, "loss": 1.8071, "step": 647 }, { "epoch": 0.017066104819594415, "grad_norm": 2.167295455932617, "learning_rate": 4.9151962075322624e-05, "loss": 1.6388, "step": 648 }, { "epoch": 0.017092441401106138, "grad_norm": 4.472445964813232, "learning_rate": 4.915064524624704e-05, "loss": 0.4865, "step": 649 }, { "epoch": 0.017118777982617857, "grad_norm": 2.858304262161255, "learning_rate": 4.9149328417171455e-05, "loss": 2.0971, "step": 650 }, { "epoch": 0.017145114564129576, "grad_norm": 2.73447585105896, "learning_rate": 4.9148011588095864e-05, "loss": 0.6435, "step": 651 }, { "epoch": 0.017171451145641295, "grad_norm": 3.694622039794922, "learning_rate": 4.9146694759020287e-05, "loss": 1.6498, "step": 652 }, { "epoch": 0.017197787727153014, "grad_norm": 2.649141311645508, "learning_rate": 4.9145377929944695e-05, "loss": 2.7969, "step": 653 }, { "epoch": 0.017224124308664737, "grad_norm": 2.7107765674591064, "learning_rate": 4.914406110086911e-05, "loss": 1.9972, "step": 654 }, { "epoch": 0.017250460890176456, "grad_norm": 6.076614856719971, "learning_rate": 4.914274427179352e-05, "loss": 0.6942, "step": 655 }, { "epoch": 0.017276797471688175, "grad_norm": 3.9620161056518555, "learning_rate": 4.9141427442717935e-05, "loss": 2.1558, "step": 
656 }, { "epoch": 0.017303134053199894, "grad_norm": 6.982167720794678, "learning_rate": 4.914011061364235e-05, "loss": 1.713, "step": 657 }, { "epoch": 0.017329470634711613, "grad_norm": 6.892648696899414, "learning_rate": 4.9138793784566767e-05, "loss": 1.5932, "step": 658 }, { "epoch": 0.017355807216223335, "grad_norm": 4.177656650543213, "learning_rate": 4.913747695549118e-05, "loss": 1.1373, "step": 659 }, { "epoch": 0.017382143797735054, "grad_norm": 3.7838337421417236, "learning_rate": 4.913616012641559e-05, "loss": 1.3375, "step": 660 }, { "epoch": 0.017408480379246773, "grad_norm": 2.8625714778900146, "learning_rate": 4.9134843297340007e-05, "loss": 1.4474, "step": 661 }, { "epoch": 0.017434816960758492, "grad_norm": 2.763807773590088, "learning_rate": 4.913352646826442e-05, "loss": 2.0358, "step": 662 }, { "epoch": 0.017461153542270215, "grad_norm": 4.135056972503662, "learning_rate": 4.913220963918884e-05, "loss": 1.5602, "step": 663 }, { "epoch": 0.017487490123781934, "grad_norm": 2.3488876819610596, "learning_rate": 4.9130892810113247e-05, "loss": 1.8612, "step": 664 }, { "epoch": 0.017513826705293653, "grad_norm": 3.223292112350464, "learning_rate": 4.912957598103766e-05, "loss": 1.9454, "step": 665 }, { "epoch": 0.017540163286805372, "grad_norm": 9.494622230529785, "learning_rate": 4.912825915196208e-05, "loss": 0.6274, "step": 666 }, { "epoch": 0.01756649986831709, "grad_norm": 2.9268500804901123, "learning_rate": 4.912694232288649e-05, "loss": 2.4056, "step": 667 }, { "epoch": 0.017592836449828814, "grad_norm": 2.4697117805480957, "learning_rate": 4.912562549381091e-05, "loss": 2.1795, "step": 668 }, { "epoch": 0.017619173031340533, "grad_norm": 3.0642313957214355, "learning_rate": 4.912430866473532e-05, "loss": 1.8113, "step": 669 }, { "epoch": 0.017645509612852252, "grad_norm": 2.972248077392578, "learning_rate": 4.912299183565973e-05, "loss": 1.6875, "step": 670 }, { "epoch": 0.01767184619436397, "grad_norm": 2.488313913345337, "learning_rate": 
4.912167500658415e-05, "loss": 2.3337, "step": 671 }, { "epoch": 0.01769818277587569, "grad_norm": 3.433907985687256, "learning_rate": 4.9120358177508565e-05, "loss": 1.2048, "step": 672 }, { "epoch": 0.017724519357387412, "grad_norm": 3.9574859142303467, "learning_rate": 4.9119041348432973e-05, "loss": 1.9056, "step": 673 }, { "epoch": 0.01775085593889913, "grad_norm": 2.6303348541259766, "learning_rate": 4.911772451935739e-05, "loss": 1.4656, "step": 674 }, { "epoch": 0.01777719252041085, "grad_norm": 4.3974714279174805, "learning_rate": 4.9116407690281805e-05, "loss": 2.0426, "step": 675 }, { "epoch": 0.01780352910192257, "grad_norm": 3.659027099609375, "learning_rate": 4.911509086120622e-05, "loss": 2.0193, "step": 676 }, { "epoch": 0.01782986568343429, "grad_norm": 3.02225923538208, "learning_rate": 4.9113774032130636e-05, "loss": 1.5299, "step": 677 }, { "epoch": 0.01785620226494601, "grad_norm": 2.1771230697631836, "learning_rate": 4.9112457203055045e-05, "loss": 2.0673, "step": 678 }, { "epoch": 0.01788253884645773, "grad_norm": 3.230940580368042, "learning_rate": 4.911114037397946e-05, "loss": 1.759, "step": 679 }, { "epoch": 0.01790887542796945, "grad_norm": 3.779658794403076, "learning_rate": 4.910982354490387e-05, "loss": 1.4368, "step": 680 }, { "epoch": 0.01793521200948117, "grad_norm": 3.737619638442993, "learning_rate": 4.910850671582829e-05, "loss": 1.7809, "step": 681 }, { "epoch": 0.01796154859099289, "grad_norm": 3.0055973529815674, "learning_rate": 4.91071898867527e-05, "loss": 1.9685, "step": 682 }, { "epoch": 0.01798788517250461, "grad_norm": 4.044579982757568, "learning_rate": 4.9105873057677116e-05, "loss": 1.2692, "step": 683 }, { "epoch": 0.01801422175401633, "grad_norm": 2.2516746520996094, "learning_rate": 4.910455622860153e-05, "loss": 2.1363, "step": 684 }, { "epoch": 0.018040558335528048, "grad_norm": 3.4395387172698975, "learning_rate": 4.910323939952595e-05, "loss": 1.7178, "step": 685 }, { "epoch": 0.018066894917039767, 
"grad_norm": 2.8364453315734863, "learning_rate": 4.910192257045036e-05, "loss": 1.5717, "step": 686 }, { "epoch": 0.01809323149855149, "grad_norm": 2.7511515617370605, "learning_rate": 4.910060574137477e-05, "loss": 2.0651, "step": 687 }, { "epoch": 0.01811956808006321, "grad_norm": 4.945553779602051, "learning_rate": 4.909928891229919e-05, "loss": 2.0403, "step": 688 }, { "epoch": 0.018145904661574928, "grad_norm": 3.2428159713745117, "learning_rate": 4.9097972083223596e-05, "loss": 1.2601, "step": 689 }, { "epoch": 0.018172241243086647, "grad_norm": 2.8357133865356445, "learning_rate": 4.909665525414802e-05, "loss": 1.9201, "step": 690 }, { "epoch": 0.018198577824598366, "grad_norm": 3.0857584476470947, "learning_rate": 4.909533842507243e-05, "loss": 2.613, "step": 691 }, { "epoch": 0.01822491440611009, "grad_norm": 3.888237953186035, "learning_rate": 4.909402159599684e-05, "loss": 1.3062, "step": 692 }, { "epoch": 0.018251250987621807, "grad_norm": 3.859483003616333, "learning_rate": 4.909270476692125e-05, "loss": 1.2223, "step": 693 }, { "epoch": 0.018277587569133526, "grad_norm": 2.710242986679077, "learning_rate": 4.909138793784567e-05, "loss": 0.5843, "step": 694 }, { "epoch": 0.018303924150645245, "grad_norm": 4.768449783325195, "learning_rate": 4.909007110877008e-05, "loss": 1.3698, "step": 695 }, { "epoch": 0.018330260732156965, "grad_norm": 4.321253299713135, "learning_rate": 4.90887542796945e-05, "loss": 0.5832, "step": 696 }, { "epoch": 0.018356597313668687, "grad_norm": 2.6788434982299805, "learning_rate": 4.9087437450618914e-05, "loss": 2.3529, "step": 697 }, { "epoch": 0.018382933895180406, "grad_norm": 4.308788299560547, "learning_rate": 4.908612062154332e-05, "loss": 2.8389, "step": 698 }, { "epoch": 0.018409270476692125, "grad_norm": 3.0056040287017822, "learning_rate": 4.908480379246774e-05, "loss": 1.7889, "step": 699 }, { "epoch": 0.018435607058203844, "grad_norm": 2.3902392387390137, "learning_rate": 4.9083486963392154e-05, "loss": 1.9402, 
"step": 700 }, { "epoch": 0.018461943639715563, "grad_norm": 2.8440802097320557, "learning_rate": 4.908217013431657e-05, "loss": 1.4952, "step": 701 }, { "epoch": 0.018488280221227286, "grad_norm": 3.7777483463287354, "learning_rate": 4.908085330524098e-05, "loss": 0.997, "step": 702 }, { "epoch": 0.018514616802739005, "grad_norm": 4.508836269378662, "learning_rate": 4.9079536476165394e-05, "loss": 1.8724, "step": 703 }, { "epoch": 0.018540953384250724, "grad_norm": 4.095557689666748, "learning_rate": 4.907821964708981e-05, "loss": 1.6751, "step": 704 }, { "epoch": 0.018567289965762443, "grad_norm": 3.1924331188201904, "learning_rate": 4.9076902818014225e-05, "loss": 2.0006, "step": 705 }, { "epoch": 0.018593626547274165, "grad_norm": 4.989907264709473, "learning_rate": 4.907558598893864e-05, "loss": 1.7909, "step": 706 }, { "epoch": 0.018619963128785885, "grad_norm": 2.632371187210083, "learning_rate": 4.907426915986305e-05, "loss": 2.3802, "step": 707 }, { "epoch": 0.018646299710297604, "grad_norm": 3.216073513031006, "learning_rate": 4.9072952330787465e-05, "loss": 1.894, "step": 708 }, { "epoch": 0.018672636291809323, "grad_norm": 3.9569344520568848, "learning_rate": 4.907163550171188e-05, "loss": 1.0289, "step": 709 }, { "epoch": 0.01869897287332104, "grad_norm": 2.3757307529449463, "learning_rate": 4.9070318672636296e-05, "loss": 1.5536, "step": 710 }, { "epoch": 0.018725309454832764, "grad_norm": 4.2826714515686035, "learning_rate": 4.9069001843560705e-05, "loss": 2.2823, "step": 711 }, { "epoch": 0.018751646036344483, "grad_norm": 3.4693210124969482, "learning_rate": 4.906768501448512e-05, "loss": 1.5999, "step": 712 }, { "epoch": 0.018777982617856202, "grad_norm": 3.102646589279175, "learning_rate": 4.9066368185409536e-05, "loss": 0.3172, "step": 713 }, { "epoch": 0.01880431919936792, "grad_norm": 4.541385650634766, "learning_rate": 4.906505135633395e-05, "loss": 1.5117, "step": 714 }, { "epoch": 0.01883065578087964, "grad_norm": 2.585900068283081, 
"learning_rate": 4.906373452725837e-05, "loss": 1.6201, "step": 715 }, { "epoch": 0.018856992362391363, "grad_norm": 2.852032423019409, "learning_rate": 4.9062417698182776e-05, "loss": 2.2742, "step": 716 }, { "epoch": 0.018883328943903082, "grad_norm": 2.8967807292938232, "learning_rate": 4.906110086910719e-05, "loss": 1.8815, "step": 717 }, { "epoch": 0.0189096655254148, "grad_norm": 7.6833038330078125, "learning_rate": 4.905978404003161e-05, "loss": 2.237, "step": 718 }, { "epoch": 0.01893600210692652, "grad_norm": 3.026221990585327, "learning_rate": 4.905846721095602e-05, "loss": 1.4727, "step": 719 }, { "epoch": 0.01896233868843824, "grad_norm": 7.7613725662231445, "learning_rate": 4.905715038188043e-05, "loss": 1.9785, "step": 720 }, { "epoch": 0.01898867526994996, "grad_norm": 23.023826599121094, "learning_rate": 4.905583355280485e-05, "loss": 1.1939, "step": 721 }, { "epoch": 0.01901501185146168, "grad_norm": 3.0799660682678223, "learning_rate": 4.905451672372926e-05, "loss": 2.0895, "step": 722 }, { "epoch": 0.0190413484329734, "grad_norm": 4.517638683319092, "learning_rate": 4.905319989465368e-05, "loss": 2.0233, "step": 723 }, { "epoch": 0.01906768501448512, "grad_norm": 6.13850212097168, "learning_rate": 4.9051883065578094e-05, "loss": 1.5015, "step": 724 }, { "epoch": 0.019094021595996838, "grad_norm": 4.092638969421387, "learning_rate": 4.90505662365025e-05, "loss": 1.8314, "step": 725 }, { "epoch": 0.01912035817750856, "grad_norm": 2.1235930919647217, "learning_rate": 4.904924940742692e-05, "loss": 2.2646, "step": 726 }, { "epoch": 0.01914669475902028, "grad_norm": 2.9502296447753906, "learning_rate": 4.904793257835133e-05, "loss": 0.6506, "step": 727 }, { "epoch": 0.019173031340532, "grad_norm": 3.1551878452301025, "learning_rate": 4.904661574927575e-05, "loss": 1.7719, "step": 728 }, { "epoch": 0.019199367922043718, "grad_norm": 2.6724605560302734, "learning_rate": 4.904529892020016e-05, "loss": 1.792, "step": 729 }, { "epoch": 0.01922570450355544, 
"grad_norm": 2.949495553970337, "learning_rate": 4.9043982091124574e-05, "loss": 1.4856, "step": 730 }, { "epoch": 0.01925204108506716, "grad_norm": 2.914775848388672, "learning_rate": 4.904266526204899e-05, "loss": 1.5858, "step": 731 }, { "epoch": 0.019278377666578878, "grad_norm": 18.118562698364258, "learning_rate": 4.90413484329734e-05, "loss": 2.7145, "step": 732 }, { "epoch": 0.019304714248090597, "grad_norm": 2.8546435832977295, "learning_rate": 4.904003160389782e-05, "loss": 1.5072, "step": 733 }, { "epoch": 0.019331050829602316, "grad_norm": 2.6304736137390137, "learning_rate": 4.903871477482223e-05, "loss": 2.0668, "step": 734 }, { "epoch": 0.01935738741111404, "grad_norm": 2.861255407333374, "learning_rate": 4.9037397945746646e-05, "loss": 1.0789, "step": 735 }, { "epoch": 0.019383723992625758, "grad_norm": 3.318653106689453, "learning_rate": 4.9036081116671054e-05, "loss": 1.6155, "step": 736 }, { "epoch": 0.019410060574137477, "grad_norm": 2.467630386352539, "learning_rate": 4.903476428759548e-05, "loss": 1.9833, "step": 737 }, { "epoch": 0.019436397155649196, "grad_norm": 2.600247859954834, "learning_rate": 4.9033447458519886e-05, "loss": 1.5517, "step": 738 }, { "epoch": 0.019462733737160915, "grad_norm": 6.090360641479492, "learning_rate": 4.90321306294443e-05, "loss": 1.1927, "step": 739 }, { "epoch": 0.019489070318672638, "grad_norm": 2.7624895572662354, "learning_rate": 4.903081380036872e-05, "loss": 1.5016, "step": 740 }, { "epoch": 0.019515406900184357, "grad_norm": 3.2997429370880127, "learning_rate": 4.9029496971293126e-05, "loss": 1.8181, "step": 741 }, { "epoch": 0.019541743481696076, "grad_norm": 2.4257280826568604, "learning_rate": 4.902818014221755e-05, "loss": 1.5971, "step": 742 }, { "epoch": 0.019568080063207795, "grad_norm": 5.636271953582764, "learning_rate": 4.902686331314196e-05, "loss": 2.5826, "step": 743 }, { "epoch": 0.019594416644719514, "grad_norm": 4.720231056213379, "learning_rate": 4.902554648406637e-05, "loss": 0.7437, 
"step": 744 }, { "epoch": 0.019620753226231236, "grad_norm": 2.4664206504821777, "learning_rate": 4.902422965499078e-05, "loss": 2.8211, "step": 745 }, { "epoch": 0.019647089807742955, "grad_norm": 3.704176664352417, "learning_rate": 4.90229128259152e-05, "loss": 1.4692, "step": 746 }, { "epoch": 0.019673426389254674, "grad_norm": 3.1931588649749756, "learning_rate": 4.902159599683961e-05, "loss": 1.7259, "step": 747 }, { "epoch": 0.019699762970766393, "grad_norm": 3.2398006916046143, "learning_rate": 4.902027916776403e-05, "loss": 2.1453, "step": 748 }, { "epoch": 0.019726099552278116, "grad_norm": 2.211707830429077, "learning_rate": 4.901896233868844e-05, "loss": 1.4725, "step": 749 }, { "epoch": 0.019752436133789835, "grad_norm": 5.693408966064453, "learning_rate": 4.901764550961285e-05, "loss": 2.1259, "step": 750 }, { "epoch": 0.019778772715301554, "grad_norm": 4.840297698974609, "learning_rate": 4.901632868053727e-05, "loss": 1.5723, "step": 751 }, { "epoch": 0.019805109296813273, "grad_norm": 2.4746127128601074, "learning_rate": 4.9015011851461684e-05, "loss": 2.0466, "step": 752 }, { "epoch": 0.019831445878324992, "grad_norm": 3.711047410964966, "learning_rate": 4.90136950223861e-05, "loss": 0.8246, "step": 753 }, { "epoch": 0.019857782459836715, "grad_norm": 3.0407891273498535, "learning_rate": 4.901237819331051e-05, "loss": 2.0245, "step": 754 }, { "epoch": 0.019884119041348434, "grad_norm": 4.244345188140869, "learning_rate": 4.9011061364234924e-05, "loss": 1.5863, "step": 755 }, { "epoch": 0.019910455622860153, "grad_norm": 3.2105660438537598, "learning_rate": 4.900974453515934e-05, "loss": 1.7174, "step": 756 }, { "epoch": 0.019936792204371872, "grad_norm": 3.018162727355957, "learning_rate": 4.9008427706083755e-05, "loss": 1.5234, "step": 757 }, { "epoch": 0.01996312878588359, "grad_norm": 5.20952033996582, "learning_rate": 4.9007110877008164e-05, "loss": 1.5424, "step": 758 }, { "epoch": 0.019989465367395313, "grad_norm": 4.107244968414307, 
"learning_rate": 4.900579404793258e-05, "loss": 2.334, "step": 759 }, { "epoch": 0.020015801948907033, "grad_norm": 3.29659104347229, "learning_rate": 4.9004477218856995e-05, "loss": 1.7371, "step": 760 }, { "epoch": 0.02004213853041875, "grad_norm": 2.391526699066162, "learning_rate": 4.900316038978141e-05, "loss": 1.4182, "step": 761 }, { "epoch": 0.02006847511193047, "grad_norm": 3.2081947326660156, "learning_rate": 4.9001843560705826e-05, "loss": 2.1381, "step": 762 }, { "epoch": 0.02009481169344219, "grad_norm": 5.759424209594727, "learning_rate": 4.9000526731630235e-05, "loss": 1.0129, "step": 763 }, { "epoch": 0.020121148274953912, "grad_norm": 2.460491895675659, "learning_rate": 4.899920990255465e-05, "loss": 2.0876, "step": 764 }, { "epoch": 0.02014748485646563, "grad_norm": 2.915313482284546, "learning_rate": 4.899789307347906e-05, "loss": 2.4913, "step": 765 }, { "epoch": 0.02017382143797735, "grad_norm": 2.943474531173706, "learning_rate": 4.899657624440348e-05, "loss": 1.2272, "step": 766 }, { "epoch": 0.02020015801948907, "grad_norm": 3.699199676513672, "learning_rate": 4.899525941532789e-05, "loss": 0.9989, "step": 767 }, { "epoch": 0.02022649460100079, "grad_norm": 2.4398043155670166, "learning_rate": 4.8993942586252306e-05, "loss": 1.7294, "step": 768 }, { "epoch": 0.02025283118251251, "grad_norm": 2.853638172149658, "learning_rate": 4.899262575717672e-05, "loss": 1.5957, "step": 769 }, { "epoch": 0.02027916776402423, "grad_norm": 2.8390755653381348, "learning_rate": 4.899130892810114e-05, "loss": 1.4267, "step": 770 }, { "epoch": 0.02030550434553595, "grad_norm": 2.1481523513793945, "learning_rate": 4.898999209902555e-05, "loss": 2.0185, "step": 771 }, { "epoch": 0.020331840927047668, "grad_norm": 2.8952174186706543, "learning_rate": 4.898867526994996e-05, "loss": 2.0734, "step": 772 }, { "epoch": 0.02035817750855939, "grad_norm": 4.551745891571045, "learning_rate": 4.898735844087438e-05, "loss": 1.5128, "step": 773 }, { "epoch": 
0.02038451409007111, "grad_norm": 2.6201367378234863, "learning_rate": 4.8986041611798786e-05, "loss": 1.4021, "step": 774 }, { "epoch": 0.02041085067158283, "grad_norm": 4.821719169616699, "learning_rate": 4.898472478272321e-05, "loss": 2.5525, "step": 775 }, { "epoch": 0.020437187253094548, "grad_norm": 2.4338624477386475, "learning_rate": 4.898340795364762e-05, "loss": 1.7702, "step": 776 }, { "epoch": 0.020463523834606267, "grad_norm": 2.9143950939178467, "learning_rate": 4.898209112457203e-05, "loss": 1.8072, "step": 777 }, { "epoch": 0.02048986041611799, "grad_norm": 4.292383193969727, "learning_rate": 4.898077429549645e-05, "loss": 2.1108, "step": 778 }, { "epoch": 0.02051619699762971, "grad_norm": 3.321052312850952, "learning_rate": 4.897945746642086e-05, "loss": 1.9431, "step": 779 }, { "epoch": 0.020542533579141427, "grad_norm": 3.4524850845336914, "learning_rate": 4.897814063734528e-05, "loss": 1.8244, "step": 780 }, { "epoch": 0.020568870160653147, "grad_norm": 3.016355514526367, "learning_rate": 4.897682380826969e-05, "loss": 1.8159, "step": 781 }, { "epoch": 0.020595206742164866, "grad_norm": 9.822920799255371, "learning_rate": 4.8975506979194104e-05, "loss": 2.6316, "step": 782 }, { "epoch": 0.020621543323676588, "grad_norm": 3.655283212661743, "learning_rate": 4.897419015011851e-05, "loss": 1.9178, "step": 783 }, { "epoch": 0.020647879905188307, "grad_norm": 3.4776971340179443, "learning_rate": 4.8972873321042935e-05, "loss": 0.8009, "step": 784 }, { "epoch": 0.020674216486700026, "grad_norm": 5.085859298706055, "learning_rate": 4.8971556491967344e-05, "loss": 1.2723, "step": 785 }, { "epoch": 0.020700553068211745, "grad_norm": 2.645732879638672, "learning_rate": 4.897023966289176e-05, "loss": 1.8745, "step": 786 }, { "epoch": 0.020726889649723464, "grad_norm": 2.9395639896392822, "learning_rate": 4.8968922833816175e-05, "loss": 1.4894, "step": 787 }, { "epoch": 0.020753226231235187, "grad_norm": 7.118697166442871, "learning_rate": 
4.8967606004740584e-05, "loss": 1.4846, "step": 788 }, { "epoch": 0.020779562812746906, "grad_norm": 3.920042037963867, "learning_rate": 4.8966289175665007e-05, "loss": 1.4626, "step": 789 }, { "epoch": 0.020805899394258625, "grad_norm": 4.029362678527832, "learning_rate": 4.8964972346589415e-05, "loss": 2.1308, "step": 790 }, { "epoch": 0.020832235975770344, "grad_norm": 2.5646917819976807, "learning_rate": 4.896365551751383e-05, "loss": 0.8227, "step": 791 }, { "epoch": 0.020858572557282063, "grad_norm": 4.099377155303955, "learning_rate": 4.896233868843824e-05, "loss": 1.7736, "step": 792 }, { "epoch": 0.020884909138793786, "grad_norm": 3.346822500228882, "learning_rate": 4.8961021859362655e-05, "loss": 2.4753, "step": 793 }, { "epoch": 0.020911245720305505, "grad_norm": 3.1177546977996826, "learning_rate": 4.895970503028707e-05, "loss": 1.2981, "step": 794 }, { "epoch": 0.020937582301817224, "grad_norm": 3.4955661296844482, "learning_rate": 4.8958388201211487e-05, "loss": 1.9598, "step": 795 }, { "epoch": 0.020963918883328943, "grad_norm": 3.7434422969818115, "learning_rate": 4.8957071372135895e-05, "loss": 1.9448, "step": 796 }, { "epoch": 0.020990255464840665, "grad_norm": 4.377191543579102, "learning_rate": 4.895575454306031e-05, "loss": 1.5413, "step": 797 }, { "epoch": 0.021016592046352384, "grad_norm": 3.4647278785705566, "learning_rate": 4.8954437713984727e-05, "loss": 1.4311, "step": 798 }, { "epoch": 0.021042928627864103, "grad_norm": 3.394336700439453, "learning_rate": 4.895312088490914e-05, "loss": 1.3913, "step": 799 }, { "epoch": 0.021069265209375822, "grad_norm": 3.9088380336761475, "learning_rate": 4.895180405583356e-05, "loss": 1.6976, "step": 800 }, { "epoch": 0.02109560179088754, "grad_norm": 2.859862804412842, "learning_rate": 4.895048722675797e-05, "loss": 1.5352, "step": 801 }, { "epoch": 0.021121938372399264, "grad_norm": 2.3032243251800537, "learning_rate": 4.894917039768238e-05, "loss": 2.3919, "step": 802 }, { "epoch": 
0.021148274953910983, "grad_norm": 4.2017130851745605, "learning_rate": 4.89478535686068e-05, "loss": 1.2697, "step": 803 }, { "epoch": 0.021174611535422702, "grad_norm": 3.5838441848754883, "learning_rate": 4.8946536739531213e-05, "loss": 1.6274, "step": 804 }, { "epoch": 0.02120094811693442, "grad_norm": 3.4935455322265625, "learning_rate": 4.894521991045562e-05, "loss": 2.0953, "step": 805 }, { "epoch": 0.02122728469844614, "grad_norm": 4.362836837768555, "learning_rate": 4.894390308138004e-05, "loss": 1.5424, "step": 806 }, { "epoch": 0.021253621279957863, "grad_norm": 7.931687831878662, "learning_rate": 4.8942586252304453e-05, "loss": 1.2894, "step": 807 }, { "epoch": 0.021279957861469582, "grad_norm": 2.71730375289917, "learning_rate": 4.894126942322887e-05, "loss": 2.1413, "step": 808 }, { "epoch": 0.0213062944429813, "grad_norm": 2.596869707107544, "learning_rate": 4.8939952594153285e-05, "loss": 1.5955, "step": 809 }, { "epoch": 0.02133263102449302, "grad_norm": 7.385660648345947, "learning_rate": 4.8938635765077693e-05, "loss": 1.9132, "step": 810 }, { "epoch": 0.02135896760600474, "grad_norm": 2.570417881011963, "learning_rate": 4.893731893600211e-05, "loss": 1.4441, "step": 811 }, { "epoch": 0.02138530418751646, "grad_norm": 2.4142816066741943, "learning_rate": 4.893600210692652e-05, "loss": 1.8909, "step": 812 }, { "epoch": 0.02141164076902818, "grad_norm": 3.1070990562438965, "learning_rate": 4.893468527785094e-05, "loss": 1.5658, "step": 813 }, { "epoch": 0.0214379773505399, "grad_norm": 2.599963665008545, "learning_rate": 4.893336844877535e-05, "loss": 1.5471, "step": 814 }, { "epoch": 0.02146431393205162, "grad_norm": 5.090512275695801, "learning_rate": 4.8932051619699765e-05, "loss": 1.8339, "step": 815 }, { "epoch": 0.02149065051356334, "grad_norm": 4.4836907386779785, "learning_rate": 4.893073479062418e-05, "loss": 2.67, "step": 816 }, { "epoch": 0.02151698709507506, "grad_norm": 4.416508197784424, "learning_rate": 4.8929417961548596e-05, 
"loss": 1.3435, "step": 817 }, { "epoch": 0.02154332367658678, "grad_norm": 3.6350066661834717, "learning_rate": 4.892810113247301e-05, "loss": 2.2413, "step": 818 }, { "epoch": 0.0215696602580985, "grad_norm": 4.3631768226623535, "learning_rate": 4.892678430339742e-05, "loss": 1.1512, "step": 819 }, { "epoch": 0.021595996839610217, "grad_norm": 3.968855381011963, "learning_rate": 4.8925467474321836e-05, "loss": 1.4528, "step": 820 }, { "epoch": 0.02162233342112194, "grad_norm": 3.9882712364196777, "learning_rate": 4.8924150645246245e-05, "loss": 1.3471, "step": 821 }, { "epoch": 0.02164867000263366, "grad_norm": 4.810586929321289, "learning_rate": 4.892283381617067e-05, "loss": 1.8159, "step": 822 }, { "epoch": 0.021675006584145378, "grad_norm": 4.532617092132568, "learning_rate": 4.8921516987095076e-05, "loss": 2.188, "step": 823 }, { "epoch": 0.021701343165657097, "grad_norm": 4.6967339515686035, "learning_rate": 4.892020015801949e-05, "loss": 1.9399, "step": 824 }, { "epoch": 0.021727679747168816, "grad_norm": 6.225020408630371, "learning_rate": 4.891888332894391e-05, "loss": 1.5084, "step": 825 }, { "epoch": 0.02175401632868054, "grad_norm": 2.7183237075805664, "learning_rate": 4.8917566499868316e-05, "loss": 1.7584, "step": 826 }, { "epoch": 0.021780352910192258, "grad_norm": 3.1652307510375977, "learning_rate": 4.891624967079274e-05, "loss": 2.3683, "step": 827 }, { "epoch": 0.021806689491703977, "grad_norm": 3.4707798957824707, "learning_rate": 4.891493284171715e-05, "loss": 2.1044, "step": 828 }, { "epoch": 0.021833026073215696, "grad_norm": 5.230329990386963, "learning_rate": 4.891361601264156e-05, "loss": 2.9414, "step": 829 }, { "epoch": 0.021859362654727415, "grad_norm": 3.530433177947998, "learning_rate": 4.891229918356597e-05, "loss": 1.5021, "step": 830 }, { "epoch": 0.021885699236239137, "grad_norm": 2.7317306995391846, "learning_rate": 4.8910982354490394e-05, "loss": 2.3066, "step": 831 }, { "epoch": 0.021912035817750856, "grad_norm": 
2.825913667678833, "learning_rate": 4.89096655254148e-05, "loss": 1.836, "step": 832 }, { "epoch": 0.021938372399262576, "grad_norm": 4.459882736206055, "learning_rate": 4.890834869633922e-05, "loss": 1.3156, "step": 833 }, { "epoch": 0.021964708980774295, "grad_norm": 3.0017170906066895, "learning_rate": 4.8907031867263634e-05, "loss": 1.903, "step": 834 }, { "epoch": 0.021991045562286014, "grad_norm": 2.3982746601104736, "learning_rate": 4.890571503818804e-05, "loss": 2.0684, "step": 835 }, { "epoch": 0.022017382143797736, "grad_norm": 2.809511661529541, "learning_rate": 4.8904398209112465e-05, "loss": 0.8368, "step": 836 }, { "epoch": 0.022043718725309455, "grad_norm": 5.6450581550598145, "learning_rate": 4.8903081380036874e-05, "loss": 1.9314, "step": 837 }, { "epoch": 0.022070055306821174, "grad_norm": 3.1256988048553467, "learning_rate": 4.890176455096129e-05, "loss": 1.7095, "step": 838 }, { "epoch": 0.022096391888332893, "grad_norm": 7.690213680267334, "learning_rate": 4.89004477218857e-05, "loss": 1.7726, "step": 839 }, { "epoch": 0.022122728469844616, "grad_norm": 2.41968035697937, "learning_rate": 4.8899130892810114e-05, "loss": 2.0054, "step": 840 }, { "epoch": 0.022149065051356335, "grad_norm": 3.7235939502716064, "learning_rate": 4.889781406373453e-05, "loss": 1.6424, "step": 841 }, { "epoch": 0.022175401632868054, "grad_norm": 4.181987285614014, "learning_rate": 4.8896497234658945e-05, "loss": 1.497, "step": 842 }, { "epoch": 0.022201738214379773, "grad_norm": 4.181909561157227, "learning_rate": 4.8895180405583354e-05, "loss": 2.0824, "step": 843 }, { "epoch": 0.022228074795891492, "grad_norm": 4.448643684387207, "learning_rate": 4.889386357650777e-05, "loss": 1.926, "step": 844 }, { "epoch": 0.022254411377403215, "grad_norm": 2.937082052230835, "learning_rate": 4.8892546747432185e-05, "loss": 2.2453, "step": 845 }, { "epoch": 0.022280747958914934, "grad_norm": 3.1048316955566406, "learning_rate": 4.88912299183566e-05, "loss": 2.1669, "step": 846 }, 
{ "epoch": 0.022307084540426653, "grad_norm": 3.082481861114502, "learning_rate": 4.8889913089281016e-05, "loss": 1.7621, "step": 847 }, { "epoch": 0.02233342112193837, "grad_norm": 3.7697834968566895, "learning_rate": 4.8888596260205425e-05, "loss": 1.3528, "step": 848 }, { "epoch": 0.02235975770345009, "grad_norm": 7.551332473754883, "learning_rate": 4.888727943112984e-05, "loss": 1.1198, "step": 849 }, { "epoch": 0.022386094284961813, "grad_norm": 3.309126615524292, "learning_rate": 4.8885962602054256e-05, "loss": 2.1807, "step": 850 }, { "epoch": 0.022412430866473532, "grad_norm": 4.420648574829102, "learning_rate": 4.888464577297867e-05, "loss": 1.4397, "step": 851 }, { "epoch": 0.02243876744798525, "grad_norm": 3.5129315853118896, "learning_rate": 4.888332894390308e-05, "loss": 0.6918, "step": 852 }, { "epoch": 0.02246510402949697, "grad_norm": 3.1707139015197754, "learning_rate": 4.8882012114827496e-05, "loss": 0.9939, "step": 853 }, { "epoch": 0.02249144061100869, "grad_norm": 3.777784824371338, "learning_rate": 4.888069528575191e-05, "loss": 2.0356, "step": 854 }, { "epoch": 0.022517777192520412, "grad_norm": 5.508716106414795, "learning_rate": 4.887937845667633e-05, "loss": 2.249, "step": 855 }, { "epoch": 0.02254411377403213, "grad_norm": 6.5840864181518555, "learning_rate": 4.887806162760074e-05, "loss": 1.6953, "step": 856 }, { "epoch": 0.02257045035554385, "grad_norm": 4.6469340324401855, "learning_rate": 4.887674479852515e-05, "loss": 1.1643, "step": 857 }, { "epoch": 0.02259678693705557, "grad_norm": 3.110201835632324, "learning_rate": 4.887542796944957e-05, "loss": 2.6273, "step": 858 }, { "epoch": 0.022623123518567288, "grad_norm": 2.776529550552368, "learning_rate": 4.8874111140373976e-05, "loss": 1.2361, "step": 859 }, { "epoch": 0.02264946010007901, "grad_norm": 4.608916759490967, "learning_rate": 4.88727943112984e-05, "loss": 1.5542, "step": 860 }, { "epoch": 0.02267579668159073, "grad_norm": 4.610592842102051, "learning_rate": 
4.887147748222281e-05, "loss": 1.5604, "step": 861 }, { "epoch": 0.02270213326310245, "grad_norm": 2.4414761066436768, "learning_rate": 4.887016065314722e-05, "loss": 1.8716, "step": 862 }, { "epoch": 0.022728469844614168, "grad_norm": 4.946755886077881, "learning_rate": 4.886884382407164e-05, "loss": 1.2964, "step": 863 }, { "epoch": 0.02275480642612589, "grad_norm": 3.669976234436035, "learning_rate": 4.8867526994996054e-05, "loss": 1.6591, "step": 864 }, { "epoch": 0.02278114300763761, "grad_norm": 2.9368083477020264, "learning_rate": 4.886621016592047e-05, "loss": 2.0373, "step": 865 }, { "epoch": 0.02280747958914933, "grad_norm": 2.7219672203063965, "learning_rate": 4.886489333684488e-05, "loss": 1.6246, "step": 866 }, { "epoch": 0.022833816170661048, "grad_norm": 5.060786247253418, "learning_rate": 4.8863576507769294e-05, "loss": 2.0179, "step": 867 }, { "epoch": 0.022860152752172767, "grad_norm": 5.938994407653809, "learning_rate": 4.88622596786937e-05, "loss": 1.6806, "step": 868 }, { "epoch": 0.02288648933368449, "grad_norm": 3.05311918258667, "learning_rate": 4.8860942849618126e-05, "loss": 2.0176, "step": 869 }, { "epoch": 0.022912825915196208, "grad_norm": 3.7118985652923584, "learning_rate": 4.8859626020542534e-05, "loss": 1.1776, "step": 870 }, { "epoch": 0.022939162496707927, "grad_norm": 2.4207539558410645, "learning_rate": 4.885830919146695e-05, "loss": 1.9499, "step": 871 }, { "epoch": 0.022965499078219646, "grad_norm": 2.7396440505981445, "learning_rate": 4.8856992362391366e-05, "loss": 0.7162, "step": 872 }, { "epoch": 0.022991835659731365, "grad_norm": 3.2223732471466064, "learning_rate": 4.8855675533315774e-05, "loss": 1.4836, "step": 873 }, { "epoch": 0.023018172241243088, "grad_norm": 3.0279431343078613, "learning_rate": 4.88543587042402e-05, "loss": 1.8642, "step": 874 }, { "epoch": 0.023044508822754807, "grad_norm": 3.1469645500183105, "learning_rate": 4.8853041875164606e-05, "loss": 1.6819, "step": 875 }, { "epoch": 0.023070845404266526, 
"grad_norm": 4.9359450340271, "learning_rate": 4.885172504608902e-05, "loss": 1.52, "step": 876 }, { "epoch": 0.023097181985778245, "grad_norm": 5.223402500152588, "learning_rate": 4.885040821701343e-05, "loss": 0.8004, "step": 877 }, { "epoch": 0.023123518567289964, "grad_norm": 3.0356979370117188, "learning_rate": 4.8849091387937846e-05, "loss": 0.2715, "step": 878 }, { "epoch": 0.023149855148801687, "grad_norm": 2.9502663612365723, "learning_rate": 4.884777455886226e-05, "loss": 1.8226, "step": 879 }, { "epoch": 0.023176191730313406, "grad_norm": 3.033193349838257, "learning_rate": 4.884645772978668e-05, "loss": 1.9013, "step": 880 }, { "epoch": 0.023202528311825125, "grad_norm": 4.26116418838501, "learning_rate": 4.884514090071109e-05, "loss": 1.8828, "step": 881 }, { "epoch": 0.023228864893336844, "grad_norm": 2.8875274658203125, "learning_rate": 4.88438240716355e-05, "loss": 1.5716, "step": 882 }, { "epoch": 0.023255201474848566, "grad_norm": 4.26268196105957, "learning_rate": 4.8842507242559924e-05, "loss": 1.6109, "step": 883 }, { "epoch": 0.023281538056360285, "grad_norm": 2.3405840396881104, "learning_rate": 4.884119041348433e-05, "loss": 3.0292, "step": 884 }, { "epoch": 0.023307874637872004, "grad_norm": 2.4629173278808594, "learning_rate": 4.883987358440875e-05, "loss": 1.7301, "step": 885 }, { "epoch": 0.023334211219383724, "grad_norm": 2.497011184692383, "learning_rate": 4.883855675533316e-05, "loss": 2.2863, "step": 886 }, { "epoch": 0.023360547800895443, "grad_norm": 2.809114694595337, "learning_rate": 4.883723992625757e-05, "loss": 1.8682, "step": 887 }, { "epoch": 0.023386884382407165, "grad_norm": 2.7719645500183105, "learning_rate": 4.883592309718199e-05, "loss": 2.3763, "step": 888 }, { "epoch": 0.023413220963918884, "grad_norm": 2.340416669845581, "learning_rate": 4.8834606268106404e-05, "loss": 1.559, "step": 889 }, { "epoch": 0.023439557545430603, "grad_norm": 5.768289566040039, "learning_rate": 4.883328943903082e-05, "loss": 1.3442, 
"step": 890 }, { "epoch": 0.023465894126942322, "grad_norm": 3.9083545207977295, "learning_rate": 4.883197260995523e-05, "loss": 1.5086, "step": 891 }, { "epoch": 0.02349223070845404, "grad_norm": 2.550734043121338, "learning_rate": 4.8830655780879644e-05, "loss": 1.7015, "step": 892 }, { "epoch": 0.023518567289965764, "grad_norm": 2.9778783321380615, "learning_rate": 4.882933895180406e-05, "loss": 1.2605, "step": 893 }, { "epoch": 0.023544903871477483, "grad_norm": 2.945540189743042, "learning_rate": 4.8828022122728475e-05, "loss": 1.8471, "step": 894 }, { "epoch": 0.023571240452989202, "grad_norm": 3.3573033809661865, "learning_rate": 4.8826705293652884e-05, "loss": 1.4592, "step": 895 }, { "epoch": 0.02359757703450092, "grad_norm": 8.201980590820312, "learning_rate": 4.88253884645773e-05, "loss": 2.0542, "step": 896 }, { "epoch": 0.02362391361601264, "grad_norm": 2.681422710418701, "learning_rate": 4.8824071635501715e-05, "loss": 1.8801, "step": 897 }, { "epoch": 0.023650250197524363, "grad_norm": 2.5939226150512695, "learning_rate": 4.882275480642613e-05, "loss": 2.2178, "step": 898 }, { "epoch": 0.02367658677903608, "grad_norm": 4.881478786468506, "learning_rate": 4.882143797735054e-05, "loss": 2.1732, "step": 899 }, { "epoch": 0.0237029233605478, "grad_norm": 3.984933614730835, "learning_rate": 4.8820121148274955e-05, "loss": 1.7548, "step": 900 }, { "epoch": 0.02372925994205952, "grad_norm": 3.310772180557251, "learning_rate": 4.881880431919937e-05, "loss": 0.8692, "step": 901 }, { "epoch": 0.02375559652357124, "grad_norm": 5.751379489898682, "learning_rate": 4.8817487490123786e-05, "loss": 1.0979, "step": 902 }, { "epoch": 0.02378193310508296, "grad_norm": 4.656223297119141, "learning_rate": 4.88161706610482e-05, "loss": 0.528, "step": 903 }, { "epoch": 0.02380826968659468, "grad_norm": 3.370290756225586, "learning_rate": 4.881485383197261e-05, "loss": 2.2476, "step": 904 }, { "epoch": 0.0238346062681064, "grad_norm": 6.551933765411377, "learning_rate": 
4.8813537002897026e-05, "loss": 1.2507, "step": 905 }, { "epoch": 0.02386094284961812, "grad_norm": 3.8577675819396973, "learning_rate": 4.8812220173821435e-05, "loss": 1.8234, "step": 906 }, { "epoch": 0.02388727943112984, "grad_norm": 2.82926344871521, "learning_rate": 4.881090334474586e-05, "loss": 1.7066, "step": 907 }, { "epoch": 0.02391361601264156, "grad_norm": 2.4536755084991455, "learning_rate": 4.8809586515670266e-05, "loss": 1.5696, "step": 908 }, { "epoch": 0.02393995259415328, "grad_norm": 3.422820568084717, "learning_rate": 4.880826968659468e-05, "loss": 0.8928, "step": 909 }, { "epoch": 0.023966289175664998, "grad_norm": 2.8297548294067383, "learning_rate": 4.88069528575191e-05, "loss": 0.3006, "step": 910 }, { "epoch": 0.023992625757176717, "grad_norm": 5.321094036102295, "learning_rate": 4.8805636028443506e-05, "loss": 2.1824, "step": 911 }, { "epoch": 0.02401896233868844, "grad_norm": 2.4695184230804443, "learning_rate": 4.880431919936793e-05, "loss": 1.9922, "step": 912 }, { "epoch": 0.02404529892020016, "grad_norm": 4.551111221313477, "learning_rate": 4.880300237029234e-05, "loss": 2.2532, "step": 913 }, { "epoch": 0.024071635501711878, "grad_norm": 3.917362689971924, "learning_rate": 4.880168554121675e-05, "loss": 1.858, "step": 914 }, { "epoch": 0.024097972083223597, "grad_norm": 2.7697012424468994, "learning_rate": 4.880036871214116e-05, "loss": 1.8184, "step": 915 }, { "epoch": 0.024124308664735316, "grad_norm": 2.403337240219116, "learning_rate": 4.8799051883065584e-05, "loss": 1.4952, "step": 916 }, { "epoch": 0.02415064524624704, "grad_norm": 3.2855231761932373, "learning_rate": 4.879773505398999e-05, "loss": 2.4549, "step": 917 }, { "epoch": 0.024176981827758758, "grad_norm": 5.837979316711426, "learning_rate": 4.879641822491441e-05, "loss": 1.776, "step": 918 }, { "epoch": 0.024203318409270477, "grad_norm": 3.956073760986328, "learning_rate": 4.8795101395838824e-05, "loss": 1.4027, "step": 919 }, { "epoch": 0.024229654990782196, 
"grad_norm": 3.2240607738494873, "learning_rate": 4.879378456676323e-05, "loss": 2.0036, "step": 920 }, { "epoch": 0.024255991572293915, "grad_norm": 3.176830530166626, "learning_rate": 4.8792467737687655e-05, "loss": 2.0702, "step": 921 }, { "epoch": 0.024282328153805637, "grad_norm": 3.58725905418396, "learning_rate": 4.8791150908612064e-05, "loss": 1.8628, "step": 922 }, { "epoch": 0.024308664735317356, "grad_norm": 5.654665946960449, "learning_rate": 4.878983407953648e-05, "loss": 1.5798, "step": 923 }, { "epoch": 0.024335001316829075, "grad_norm": 7.691973686218262, "learning_rate": 4.878851725046089e-05, "loss": 1.8847, "step": 924 }, { "epoch": 0.024361337898340794, "grad_norm": 2.611201524734497, "learning_rate": 4.8787200421385304e-05, "loss": 2.0848, "step": 925 }, { "epoch": 0.024387674479852513, "grad_norm": 2.717235565185547, "learning_rate": 4.878588359230972e-05, "loss": 1.8219, "step": 926 }, { "epoch": 0.024414011061364236, "grad_norm": 3.0322349071502686, "learning_rate": 4.8784566763234135e-05, "loss": 1.9138, "step": 927 }, { "epoch": 0.024440347642875955, "grad_norm": 2.383579730987549, "learning_rate": 4.878324993415855e-05, "loss": 2.0439, "step": 928 }, { "epoch": 0.024466684224387674, "grad_norm": 3.828047752380371, "learning_rate": 4.878193310508296e-05, "loss": 1.931, "step": 929 }, { "epoch": 0.024493020805899393, "grad_norm": 3.5176422595977783, "learning_rate": 4.878061627600738e-05, "loss": 1.4164, "step": 930 }, { "epoch": 0.024519357387411116, "grad_norm": 2.933224678039551, "learning_rate": 4.877929944693179e-05, "loss": 2.2594, "step": 931 }, { "epoch": 0.024545693968922835, "grad_norm": 7.70943546295166, "learning_rate": 4.877798261785621e-05, "loss": 1.0859, "step": 932 }, { "epoch": 0.024572030550434554, "grad_norm": 2.605595111846924, "learning_rate": 4.8776665788780615e-05, "loss": 1.6807, "step": 933 }, { "epoch": 0.024598367131946273, "grad_norm": 3.604996681213379, "learning_rate": 4.877534895970503e-05, "loss": 1.4888, 
"step": 934 }, { "epoch": 0.024624703713457992, "grad_norm": 2.6662745475769043, "learning_rate": 4.877403213062945e-05, "loss": 1.8947, "step": 935 }, { "epoch": 0.024651040294969714, "grad_norm": 3.2520012855529785, "learning_rate": 4.877271530155386e-05, "loss": 1.7753, "step": 936 }, { "epoch": 0.024677376876481433, "grad_norm": 4.311880111694336, "learning_rate": 4.877139847247828e-05, "loss": 2.1511, "step": 937 }, { "epoch": 0.024703713457993152, "grad_norm": 2.115872859954834, "learning_rate": 4.877008164340269e-05, "loss": 1.6899, "step": 938 }, { "epoch": 0.02473005003950487, "grad_norm": 4.769501209259033, "learning_rate": 4.87687648143271e-05, "loss": 2.4844, "step": 939 }, { "epoch": 0.02475638662101659, "grad_norm": 3.7659108638763428, "learning_rate": 4.876744798525152e-05, "loss": 2.0844, "step": 940 }, { "epoch": 0.024782723202528313, "grad_norm": 2.8463897705078125, "learning_rate": 4.8766131156175933e-05, "loss": 1.9853, "step": 941 }, { "epoch": 0.024809059784040032, "grad_norm": 2.3476593494415283, "learning_rate": 4.876481432710034e-05, "loss": 2.2732, "step": 942 }, { "epoch": 0.02483539636555175, "grad_norm": 3.762220859527588, "learning_rate": 4.876349749802476e-05, "loss": 1.3429, "step": 943 }, { "epoch": 0.02486173294706347, "grad_norm": 3.1902225017547607, "learning_rate": 4.876218066894917e-05, "loss": 0.9517, "step": 944 }, { "epoch": 0.02488806952857519, "grad_norm": 2.5766420364379883, "learning_rate": 4.876086383987359e-05, "loss": 2.0754, "step": 945 }, { "epoch": 0.024914406110086912, "grad_norm": 3.0132415294647217, "learning_rate": 4.8759547010798e-05, "loss": 1.9503, "step": 946 }, { "epoch": 0.02494074269159863, "grad_norm": 4.317477703094482, "learning_rate": 4.8758230181722413e-05, "loss": 1.7543, "step": 947 }, { "epoch": 0.02496707927311035, "grad_norm": 2.949892997741699, "learning_rate": 4.875691335264683e-05, "loss": 0.9036, "step": 948 }, { "epoch": 0.02499341585462207, "grad_norm": 3.8822638988494873, 
"learning_rate": 4.8755596523571245e-05, "loss": 0.9061, "step": 949 }, { "epoch": 0.02501975243613379, "grad_norm": 3.780977487564087, "learning_rate": 4.875427969449566e-05, "loss": 1.4815, "step": 950 }, { "epoch": 0.02504608901764551, "grad_norm": 2.225395441055298, "learning_rate": 4.875296286542007e-05, "loss": 1.3013, "step": 951 }, { "epoch": 0.02507242559915723, "grad_norm": 3.3168656826019287, "learning_rate": 4.8751646036344485e-05, "loss": 1.5468, "step": 952 }, { "epoch": 0.02509876218066895, "grad_norm": 2.268721580505371, "learning_rate": 4.8750329207268894e-05, "loss": 1.5199, "step": 953 }, { "epoch": 0.025125098762180668, "grad_norm": 2.3924734592437744, "learning_rate": 4.8749012378193316e-05, "loss": 1.9441, "step": 954 }, { "epoch": 0.02515143534369239, "grad_norm": 2.3553285598754883, "learning_rate": 4.8747695549117725e-05, "loss": 1.9051, "step": 955 }, { "epoch": 0.02517777192520411, "grad_norm": 2.352037191390991, "learning_rate": 4.874637872004214e-05, "loss": 1.7318, "step": 956 }, { "epoch": 0.02520410850671583, "grad_norm": 2.3553106784820557, "learning_rate": 4.8745061890966556e-05, "loss": 1.4107, "step": 957 }, { "epoch": 0.025230445088227547, "grad_norm": 4.828738689422607, "learning_rate": 4.8743745061890965e-05, "loss": 1.0074, "step": 958 }, { "epoch": 0.025256781669739266, "grad_norm": 3.7099461555480957, "learning_rate": 4.874242823281539e-05, "loss": 2.2068, "step": 959 }, { "epoch": 0.02528311825125099, "grad_norm": 2.071575164794922, "learning_rate": 4.8741111403739796e-05, "loss": 1.8892, "step": 960 }, { "epoch": 0.025309454832762708, "grad_norm": 6.233997344970703, "learning_rate": 4.873979457466421e-05, "loss": 1.186, "step": 961 }, { "epoch": 0.025335791414274427, "grad_norm": 5.112436294555664, "learning_rate": 4.873847774558862e-05, "loss": 1.8068, "step": 962 }, { "epoch": 0.025362127995786146, "grad_norm": 4.417271614074707, "learning_rate": 4.873716091651304e-05, "loss": 2.0767, "step": 963 }, { "epoch": 
0.025388464577297865, "grad_norm": 5.198521137237549, "learning_rate": 4.873584408743745e-05, "loss": 2.3583, "step": 964 }, { "epoch": 0.025414801158809588, "grad_norm": 2.9751198291778564, "learning_rate": 4.873452725836187e-05, "loss": 1.74, "step": 965 }, { "epoch": 0.025441137740321307, "grad_norm": 3.3860409259796143, "learning_rate": 4.873321042928628e-05, "loss": 1.3247, "step": 966 }, { "epoch": 0.025467474321833026, "grad_norm": 4.9931817054748535, "learning_rate": 4.873189360021069e-05, "loss": 1.3301, "step": 967 }, { "epoch": 0.025493810903344745, "grad_norm": 7.048219203948975, "learning_rate": 4.8730576771135114e-05, "loss": 1.6089, "step": 968 }, { "epoch": 0.025520147484856464, "grad_norm": 3.70931339263916, "learning_rate": 4.872925994205952e-05, "loss": 1.8571, "step": 969 }, { "epoch": 0.025546484066368186, "grad_norm": 4.149421691894531, "learning_rate": 4.872794311298394e-05, "loss": 1.6011, "step": 970 }, { "epoch": 0.025572820647879906, "grad_norm": 2.326721429824829, "learning_rate": 4.872662628390835e-05, "loss": 2.4341, "step": 971 }, { "epoch": 0.025599157229391625, "grad_norm": 2.405155897140503, "learning_rate": 4.872530945483276e-05, "loss": 1.7249, "step": 972 }, { "epoch": 0.025625493810903344, "grad_norm": 4.748103141784668, "learning_rate": 4.872399262575718e-05, "loss": 1.9832, "step": 973 }, { "epoch": 0.025651830392415066, "grad_norm": 3.56463360786438, "learning_rate": 4.8722675796681594e-05, "loss": 1.6517, "step": 974 }, { "epoch": 0.025678166973926785, "grad_norm": 3.736293315887451, "learning_rate": 4.872135896760601e-05, "loss": 0.6747, "step": 975 }, { "epoch": 0.025704503555438504, "grad_norm": 2.9828007221221924, "learning_rate": 4.872004213853042e-05, "loss": 1.765, "step": 976 }, { "epoch": 0.025730840136950223, "grad_norm": 8.252352714538574, "learning_rate": 4.8718725309454834e-05, "loss": 1.6381, "step": 977 }, { "epoch": 0.025757176718461942, "grad_norm": 4.802109241485596, "learning_rate": 4.871740848037925e-05, 
"loss": 1.2956, "step": 978 }, { "epoch": 0.025783513299973665, "grad_norm": 4.678725242614746, "learning_rate": 4.8716091651303665e-05, "loss": 1.8981, "step": 979 }, { "epoch": 0.025809849881485384, "grad_norm": 2.763256072998047, "learning_rate": 4.8714774822228074e-05, "loss": 0.8677, "step": 980 }, { "epoch": 0.025836186462997103, "grad_norm": 3.9678878784179688, "learning_rate": 4.871345799315249e-05, "loss": 1.1441, "step": 981 }, { "epoch": 0.025862523044508822, "grad_norm": 4.467024803161621, "learning_rate": 4.8712141164076905e-05, "loss": 1.3829, "step": 982 }, { "epoch": 0.02588885962602054, "grad_norm": 2.9426639080047607, "learning_rate": 4.871082433500132e-05, "loss": 1.5174, "step": 983 }, { "epoch": 0.025915196207532264, "grad_norm": 2.438995838165283, "learning_rate": 4.8709507505925736e-05, "loss": 1.7923, "step": 984 }, { "epoch": 0.025941532789043983, "grad_norm": 2.7595555782318115, "learning_rate": 4.8708190676850145e-05, "loss": 2.3838, "step": 985 }, { "epoch": 0.025967869370555702, "grad_norm": 3.703192949295044, "learning_rate": 4.870687384777456e-05, "loss": 1.6882, "step": 986 }, { "epoch": 0.02599420595206742, "grad_norm": 2.6279776096343994, "learning_rate": 4.8705557018698976e-05, "loss": 1.0209, "step": 987 }, { "epoch": 0.02602054253357914, "grad_norm": 3.901630163192749, "learning_rate": 4.870424018962339e-05, "loss": 1.2321, "step": 988 }, { "epoch": 0.026046879115090862, "grad_norm": 3.1039302349090576, "learning_rate": 4.87029233605478e-05, "loss": 0.7191, "step": 989 }, { "epoch": 0.02607321569660258, "grad_norm": 5.460488319396973, "learning_rate": 4.8701606531472216e-05, "loss": 2.5224, "step": 990 }, { "epoch": 0.0260995522781143, "grad_norm": 4.716031551361084, "learning_rate": 4.870028970239663e-05, "loss": 1.3963, "step": 991 }, { "epoch": 0.02612588885962602, "grad_norm": 7.198510646820068, "learning_rate": 4.869897287332105e-05, "loss": 1.13, "step": 992 }, { "epoch": 0.02615222544113774, "grad_norm": 
3.0804200172424316, "learning_rate": 4.869765604424546e-05, "loss": 0.5474, "step": 993 }, { "epoch": 0.02617856202264946, "grad_norm": 2.0527169704437256, "learning_rate": 4.869633921516987e-05, "loss": 1.741, "step": 994 }, { "epoch": 0.02620489860416118, "grad_norm": 2.9041318893432617, "learning_rate": 4.869502238609429e-05, "loss": 1.4097, "step": 995 }, { "epoch": 0.0262312351856729, "grad_norm": 2.4357457160949707, "learning_rate": 4.86937055570187e-05, "loss": 1.7655, "step": 996 }, { "epoch": 0.02625757176718462, "grad_norm": 2.678795337677002, "learning_rate": 4.869238872794312e-05, "loss": 1.855, "step": 997 }, { "epoch": 0.02628390834869634, "grad_norm": 2.4673001766204834, "learning_rate": 4.869107189886753e-05, "loss": 1.6506, "step": 998 }, { "epoch": 0.02631024493020806, "grad_norm": 2.4799981117248535, "learning_rate": 4.868975506979194e-05, "loss": 1.8883, "step": 999 }, { "epoch": 0.02633658151171978, "grad_norm": 2.46384859085083, "learning_rate": 4.868843824071635e-05, "loss": 1.5749, "step": 1000 }, { "epoch": 0.026362918093231498, "grad_norm": 3.096233606338501, "learning_rate": 4.8687121411640774e-05, "loss": 1.5581, "step": 1001 }, { "epoch": 0.026389254674743217, "grad_norm": 2.272268295288086, "learning_rate": 4.868580458256518e-05, "loss": 1.3949, "step": 1002 }, { "epoch": 0.02641559125625494, "grad_norm": 3.7771518230438232, "learning_rate": 4.86844877534896e-05, "loss": 1.4578, "step": 1003 }, { "epoch": 0.02644192783776666, "grad_norm": 4.052642822265625, "learning_rate": 4.8683170924414014e-05, "loss": 1.7712, "step": 1004 }, { "epoch": 0.026468264419278378, "grad_norm": 2.4844229221343994, "learning_rate": 4.868185409533842e-05, "loss": 2.2215, "step": 1005 }, { "epoch": 0.026494601000790097, "grad_norm": 4.846006870269775, "learning_rate": 4.8680537266262846e-05, "loss": 0.8832, "step": 1006 }, { "epoch": 0.026520937582301816, "grad_norm": 2.821293354034424, "learning_rate": 4.8679220437187254e-05, "loss": 1.6234, "step": 1007 }, 
{ "epoch": 0.02654727416381354, "grad_norm": 2.3138506412506104, "learning_rate": 4.867790360811167e-05, "loss": 1.7553, "step": 1008 }, { "epoch": 0.026573610745325257, "grad_norm": 2.4005398750305176, "learning_rate": 4.867658677903608e-05, "loss": 1.8875, "step": 1009 }, { "epoch": 0.026599947326836976, "grad_norm": 7.436602592468262, "learning_rate": 4.8675269949960495e-05, "loss": 2.3947, "step": 1010 }, { "epoch": 0.026626283908348695, "grad_norm": 3.919088840484619, "learning_rate": 4.867395312088491e-05, "loss": 1.2254, "step": 1011 }, { "epoch": 0.026652620489860415, "grad_norm": 7.609036922454834, "learning_rate": 4.8672636291809326e-05, "loss": 1.5265, "step": 1012 }, { "epoch": 0.026678957071372137, "grad_norm": 2.81717586517334, "learning_rate": 4.867131946273374e-05, "loss": 1.7644, "step": 1013 }, { "epoch": 0.026705293652883856, "grad_norm": 4.284199237823486, "learning_rate": 4.867000263365815e-05, "loss": 2.0247, "step": 1014 }, { "epoch": 0.026731630234395575, "grad_norm": 6.873014450073242, "learning_rate": 4.866868580458257e-05, "loss": 1.9102, "step": 1015 }, { "epoch": 0.026757966815907294, "grad_norm": 3.0735270977020264, "learning_rate": 4.866736897550698e-05, "loss": 1.7031, "step": 1016 }, { "epoch": 0.026784303397419017, "grad_norm": 4.263140678405762, "learning_rate": 4.86660521464314e-05, "loss": 1.1439, "step": 1017 }, { "epoch": 0.026810639978930736, "grad_norm": 2.453805923461914, "learning_rate": 4.8664735317355806e-05, "loss": 2.4004, "step": 1018 }, { "epoch": 0.026836976560442455, "grad_norm": 3.2226572036743164, "learning_rate": 4.866341848828022e-05, "loss": 1.6197, "step": 1019 }, { "epoch": 0.026863313141954174, "grad_norm": 2.62640643119812, "learning_rate": 4.866210165920464e-05, "loss": 1.7825, "step": 1020 }, { "epoch": 0.026889649723465893, "grad_norm": 2.3195648193359375, "learning_rate": 4.866078483012905e-05, "loss": 1.6573, "step": 1021 }, { "epoch": 0.026915986304977615, "grad_norm": 2.2187161445617676, 
"learning_rate": 4.865946800105347e-05, "loss": 1.6928, "step": 1022 }, { "epoch": 0.026942322886489335, "grad_norm": 2.3962841033935547, "learning_rate": 4.865815117197788e-05, "loss": 2.4874, "step": 1023 }, { "epoch": 0.026968659468001054, "grad_norm": 3.23622465133667, "learning_rate": 4.865683434290229e-05, "loss": 0.4497, "step": 1024 }, { "epoch": 0.026994996049512773, "grad_norm": 2.3920416831970215, "learning_rate": 4.865551751382671e-05, "loss": 1.6916, "step": 1025 }, { "epoch": 0.02702133263102449, "grad_norm": 2.71406626701355, "learning_rate": 4.8654200684751124e-05, "loss": 1.749, "step": 1026 }, { "epoch": 0.027047669212536214, "grad_norm": 2.2783379554748535, "learning_rate": 4.865288385567553e-05, "loss": 1.7853, "step": 1027 }, { "epoch": 0.027074005794047933, "grad_norm": 2.941490650177002, "learning_rate": 4.865156702659995e-05, "loss": 1.2299, "step": 1028 }, { "epoch": 0.027100342375559652, "grad_norm": 3.930819272994995, "learning_rate": 4.8650250197524364e-05, "loss": 0.9998, "step": 1029 }, { "epoch": 0.02712667895707137, "grad_norm": 4.769101619720459, "learning_rate": 4.864893336844878e-05, "loss": 1.6989, "step": 1030 }, { "epoch": 0.02715301553858309, "grad_norm": 3.156647205352783, "learning_rate": 4.8647616539373195e-05, "loss": 1.8619, "step": 1031 }, { "epoch": 0.027179352120094813, "grad_norm": 2.7352776527404785, "learning_rate": 4.8646299710297604e-05, "loss": 0.7755, "step": 1032 }, { "epoch": 0.027205688701606532, "grad_norm": 6.724462032318115, "learning_rate": 4.864498288122202e-05, "loss": 0.9797, "step": 1033 }, { "epoch": 0.02723202528311825, "grad_norm": 3.0356597900390625, "learning_rate": 4.8643666052146435e-05, "loss": 1.4286, "step": 1034 }, { "epoch": 0.02725836186462997, "grad_norm": 4.173969268798828, "learning_rate": 4.864234922307085e-05, "loss": 1.1673, "step": 1035 }, { "epoch": 0.02728469844614169, "grad_norm": 2.5415663719177246, "learning_rate": 4.864103239399526e-05, "loss": 1.5362, "step": 1036 }, { 
"epoch": 0.02731103502765341, "grad_norm": 3.3292665481567383, "learning_rate": 4.8639715564919675e-05, "loss": 2.4023, "step": 1037 }, { "epoch": 0.02733737160916513, "grad_norm": 3.314760208129883, "learning_rate": 4.863839873584409e-05, "loss": 1.8555, "step": 1038 }, { "epoch": 0.02736370819067685, "grad_norm": 3.2847180366516113, "learning_rate": 4.8637081906768506e-05, "loss": 0.5167, "step": 1039 }, { "epoch": 0.02739004477218857, "grad_norm": 2.7714438438415527, "learning_rate": 4.863576507769292e-05, "loss": 1.9907, "step": 1040 }, { "epoch": 0.02741638135370029, "grad_norm": 2.5430490970611572, "learning_rate": 4.863444824861733e-05, "loss": 2.2006, "step": 1041 }, { "epoch": 0.02744271793521201, "grad_norm": 2.117619514465332, "learning_rate": 4.8633131419541746e-05, "loss": 1.8754, "step": 1042 }, { "epoch": 0.02746905451672373, "grad_norm": 3.6804146766662598, "learning_rate": 4.8631814590466155e-05, "loss": 1.7422, "step": 1043 }, { "epoch": 0.02749539109823545, "grad_norm": 2.070969820022583, "learning_rate": 4.863049776139058e-05, "loss": 1.903, "step": 1044 }, { "epoch": 0.027521727679747168, "grad_norm": 2.4523069858551025, "learning_rate": 4.8629180932314986e-05, "loss": 1.9201, "step": 1045 }, { "epoch": 0.02754806426125889, "grad_norm": 3.284054756164551, "learning_rate": 4.86278641032394e-05, "loss": 1.6978, "step": 1046 }, { "epoch": 0.02757440084277061, "grad_norm": 3.6890952587127686, "learning_rate": 4.862654727416381e-05, "loss": 1.5353, "step": 1047 }, { "epoch": 0.027600737424282328, "grad_norm": 5.518612861633301, "learning_rate": 4.862523044508823e-05, "loss": 2.5851, "step": 1048 }, { "epoch": 0.027627074005794047, "grad_norm": 2.9064016342163086, "learning_rate": 4.862391361601264e-05, "loss": 1.6828, "step": 1049 }, { "epoch": 0.027653410587305766, "grad_norm": 4.6123738288879395, "learning_rate": 4.862259678693706e-05, "loss": 0.8455, "step": 1050 }, { "epoch": 0.02767974716881749, "grad_norm": 9.094766616821289, "learning_rate": 
4.862127995786147e-05, "loss": 2.7074, "step": 1051 }, { "epoch": 0.027706083750329208, "grad_norm": 3.067086696624756, "learning_rate": 4.861996312878588e-05, "loss": 1.8295, "step": 1052 }, { "epoch": 0.027732420331840927, "grad_norm": 3.5218935012817383, "learning_rate": 4.8618646299710304e-05, "loss": 1.7841, "step": 1053 }, { "epoch": 0.027758756913352646, "grad_norm": 2.4527480602264404, "learning_rate": 4.861732947063471e-05, "loss": 1.5477, "step": 1054 }, { "epoch": 0.027785093494864365, "grad_norm": 5.567698955535889, "learning_rate": 4.861601264155913e-05, "loss": 1.481, "step": 1055 }, { "epoch": 0.027811430076376088, "grad_norm": 2.672114849090576, "learning_rate": 4.861469581248354e-05, "loss": 1.8227, "step": 1056 }, { "epoch": 0.027837766657887807, "grad_norm": 2.226153612136841, "learning_rate": 4.861337898340795e-05, "loss": 1.7298, "step": 1057 }, { "epoch": 0.027864103239399526, "grad_norm": 2.4429683685302734, "learning_rate": 4.861206215433237e-05, "loss": 1.373, "step": 1058 }, { "epoch": 0.027890439820911245, "grad_norm": 2.6967573165893555, "learning_rate": 4.8610745325256784e-05, "loss": 1.9188, "step": 1059 }, { "epoch": 0.027916776402422964, "grad_norm": 4.453019142150879, "learning_rate": 4.86094284961812e-05, "loss": 1.6254, "step": 1060 }, { "epoch": 0.027943112983934686, "grad_norm": 4.1539626121521, "learning_rate": 4.860811166710561e-05, "loss": 2.2024, "step": 1061 }, { "epoch": 0.027969449565446405, "grad_norm": 4.406345367431641, "learning_rate": 4.860679483803003e-05, "loss": 1.5992, "step": 1062 }, { "epoch": 0.027995786146958124, "grad_norm": 2.6692306995391846, "learning_rate": 4.860547800895444e-05, "loss": 2.7249, "step": 1063 }, { "epoch": 0.028022122728469843, "grad_norm": 5.10105037689209, "learning_rate": 4.8604161179878855e-05, "loss": 1.7338, "step": 1064 }, { "epoch": 0.028048459309981566, "grad_norm": 3.1361806392669678, "learning_rate": 4.8602844350803264e-05, "loss": 1.9357, "step": 1065 }, { "epoch": 
0.028074795891493285, "grad_norm": 2.493241310119629, "learning_rate": 4.860152752172768e-05, "loss": 2.2922, "step": 1066 }, { "epoch": 0.028101132473005004, "grad_norm": 3.7938878536224365, "learning_rate": 4.8600210692652095e-05, "loss": 1.9997, "step": 1067 }, { "epoch": 0.028127469054516723, "grad_norm": 2.4619712829589844, "learning_rate": 4.859889386357651e-05, "loss": 2.0023, "step": 1068 }, { "epoch": 0.028153805636028442, "grad_norm": 3.2266292572021484, "learning_rate": 4.859757703450093e-05, "loss": 1.4799, "step": 1069 }, { "epoch": 0.028180142217540165, "grad_norm": 2.6764299869537354, "learning_rate": 4.8596260205425335e-05, "loss": 2.2143, "step": 1070 }, { "epoch": 0.028206478799051884, "grad_norm": 2.3708853721618652, "learning_rate": 4.859494337634975e-05, "loss": 2.6099, "step": 1071 }, { "epoch": 0.028232815380563603, "grad_norm": 4.766274929046631, "learning_rate": 4.859362654727417e-05, "loss": 1.9012, "step": 1072 }, { "epoch": 0.028259151962075322, "grad_norm": 4.662174701690674, "learning_rate": 4.859230971819858e-05, "loss": 1.8046, "step": 1073 }, { "epoch": 0.02828548854358704, "grad_norm": 5.352644443511963, "learning_rate": 4.859099288912299e-05, "loss": 1.6289, "step": 1074 }, { "epoch": 0.028311825125098763, "grad_norm": 3.8625576496124268, "learning_rate": 4.858967606004741e-05, "loss": 2.3011, "step": 1075 }, { "epoch": 0.028338161706610483, "grad_norm": 2.4151413440704346, "learning_rate": 4.858835923097182e-05, "loss": 1.2742, "step": 1076 }, { "epoch": 0.0283644982881222, "grad_norm": 2.240936756134033, "learning_rate": 4.858704240189624e-05, "loss": 2.1133, "step": 1077 }, { "epoch": 0.02839083486963392, "grad_norm": 4.756962299346924, "learning_rate": 4.8585725572820654e-05, "loss": 2.4781, "step": 1078 }, { "epoch": 0.02841717145114564, "grad_norm": 2.3123276233673096, "learning_rate": 4.858440874374506e-05, "loss": 0.6893, "step": 1079 }, { "epoch": 0.028443508032657362, "grad_norm": 2.4670345783233643, "learning_rate": 
4.858309191466948e-05, "loss": 1.8863, "step": 1080 }, { "epoch": 0.02846984461416908, "grad_norm": 4.072238445281982, "learning_rate": 4.8581775085593894e-05, "loss": 1.6139, "step": 1081 }, { "epoch": 0.0284961811956808, "grad_norm": 2.351463556289673, "learning_rate": 4.858045825651831e-05, "loss": 1.8861, "step": 1082 }, { "epoch": 0.02852251777719252, "grad_norm": 3.012080430984497, "learning_rate": 4.857914142744272e-05, "loss": 2.1996, "step": 1083 }, { "epoch": 0.028548854358704242, "grad_norm": 2.5738027095794678, "learning_rate": 4.8577824598367134e-05, "loss": 1.7826, "step": 1084 }, { "epoch": 0.02857519094021596, "grad_norm": 5.832880973815918, "learning_rate": 4.857650776929155e-05, "loss": 1.1905, "step": 1085 }, { "epoch": 0.02860152752172768, "grad_norm": 2.4658961296081543, "learning_rate": 4.8575190940215965e-05, "loss": 1.8326, "step": 1086 }, { "epoch": 0.0286278641032394, "grad_norm": 2.3752379417419434, "learning_rate": 4.857387411114038e-05, "loss": 2.2214, "step": 1087 }, { "epoch": 0.028654200684751118, "grad_norm": 2.201660633087158, "learning_rate": 4.857255728206479e-05, "loss": 2.3061, "step": 1088 }, { "epoch": 0.02868053726626284, "grad_norm": 4.142340660095215, "learning_rate": 4.8571240452989205e-05, "loss": 1.2818, "step": 1089 }, { "epoch": 0.02870687384777456, "grad_norm": 2.3261709213256836, "learning_rate": 4.8569923623913614e-05, "loss": 1.4621, "step": 1090 }, { "epoch": 0.02873321042928628, "grad_norm": 3.5733439922332764, "learning_rate": 4.8568606794838036e-05, "loss": 1.6464, "step": 1091 }, { "epoch": 0.028759547010797998, "grad_norm": 3.020409345626831, "learning_rate": 4.8567289965762445e-05, "loss": 2.7091, "step": 1092 }, { "epoch": 0.028785883592309717, "grad_norm": 2.2784829139709473, "learning_rate": 4.856597313668686e-05, "loss": 1.9396, "step": 1093 }, { "epoch": 0.02881222017382144, "grad_norm": 2.57547664642334, "learning_rate": 4.8564656307611276e-05, "loss": 1.3221, "step": 1094 }, { "epoch": 
0.02883855675533316, "grad_norm": 5.381950378417969, "learning_rate": 4.856333947853569e-05, "loss": 1.6726, "step": 1095 }, { "epoch": 0.028864893336844877, "grad_norm": 1.9736328125, "learning_rate": 4.856202264946011e-05, "loss": 1.8708, "step": 1096 }, { "epoch": 0.028891229918356597, "grad_norm": 4.961425304412842, "learning_rate": 4.8560705820384516e-05, "loss": 2.0083, "step": 1097 }, { "epoch": 0.028917566499868316, "grad_norm": 3.0586788654327393, "learning_rate": 4.855938899130893e-05, "loss": 1.7768, "step": 1098 }, { "epoch": 0.028943903081380038, "grad_norm": 3.3477911949157715, "learning_rate": 4.855807216223334e-05, "loss": 1.9127, "step": 1099 }, { "epoch": 0.028970239662891757, "grad_norm": 4.469985485076904, "learning_rate": 4.855675533315776e-05, "loss": 1.5644, "step": 1100 }, { "epoch": 0.028996576244403476, "grad_norm": 3.75849986076355, "learning_rate": 4.855543850408217e-05, "loss": 1.6602, "step": 1101 }, { "epoch": 0.029022912825915195, "grad_norm": 2.754058361053467, "learning_rate": 4.855412167500659e-05, "loss": 1.6427, "step": 1102 }, { "epoch": 0.029049249407426914, "grad_norm": 2.0229575634002686, "learning_rate": 4.8552804845930996e-05, "loss": 1.745, "step": 1103 }, { "epoch": 0.029075585988938637, "grad_norm": 3.46441912651062, "learning_rate": 4.855148801685541e-05, "loss": 2.6647, "step": 1104 }, { "epoch": 0.029101922570450356, "grad_norm": 7.25486421585083, "learning_rate": 4.855017118777983e-05, "loss": 1.9896, "step": 1105 }, { "epoch": 0.029128259151962075, "grad_norm": 3.0288054943084717, "learning_rate": 4.854885435870424e-05, "loss": 2.3027, "step": 1106 }, { "epoch": 0.029154595733473794, "grad_norm": 4.595308303833008, "learning_rate": 4.854753752962866e-05, "loss": 1.6661, "step": 1107 }, { "epoch": 0.029180932314985517, "grad_norm": 2.545994997024536, "learning_rate": 4.854622070055307e-05, "loss": 0.7444, "step": 1108 }, { "epoch": 0.029207268896497236, "grad_norm": 2.2626681327819824, "learning_rate": 
4.854490387147749e-05, "loss": 2.0572, "step": 1109 }, { "epoch": 0.029233605478008955, "grad_norm": 2.315185070037842, "learning_rate": 4.85435870424019e-05, "loss": 1.3084, "step": 1110 }, { "epoch": 0.029259942059520674, "grad_norm": 3.43516206741333, "learning_rate": 4.8542270213326314e-05, "loss": 0.4376, "step": 1111 }, { "epoch": 0.029286278641032393, "grad_norm": 2.599492073059082, "learning_rate": 4.854095338425072e-05, "loss": 1.4326, "step": 1112 }, { "epoch": 0.029312615222544115, "grad_norm": 2.722194194793701, "learning_rate": 4.853963655517514e-05, "loss": 1.9376, "step": 1113 }, { "epoch": 0.029338951804055834, "grad_norm": 3.498096466064453, "learning_rate": 4.8538319726099554e-05, "loss": 1.2945, "step": 1114 }, { "epoch": 0.029365288385567553, "grad_norm": 2.3955607414245605, "learning_rate": 4.853700289702397e-05, "loss": 1.42, "step": 1115 }, { "epoch": 0.029391624967079272, "grad_norm": 3.262183666229248, "learning_rate": 4.8535686067948385e-05, "loss": 1.5349, "step": 1116 }, { "epoch": 0.02941796154859099, "grad_norm": 4.325577259063721, "learning_rate": 4.8534369238872794e-05, "loss": 2.2147, "step": 1117 }, { "epoch": 0.029444298130102714, "grad_norm": 3.8473992347717285, "learning_rate": 4.853305240979721e-05, "loss": 2.2715, "step": 1118 }, { "epoch": 0.029470634711614433, "grad_norm": 3.3051633834838867, "learning_rate": 4.8531735580721625e-05, "loss": 1.3309, "step": 1119 }, { "epoch": 0.029496971293126152, "grad_norm": 3.3608505725860596, "learning_rate": 4.853041875164604e-05, "loss": 0.877, "step": 1120 }, { "epoch": 0.02952330787463787, "grad_norm": 5.844008922576904, "learning_rate": 4.852910192257045e-05, "loss": 1.8371, "step": 1121 }, { "epoch": 0.02954964445614959, "grad_norm": 10.57100772857666, "learning_rate": 4.8527785093494865e-05, "loss": 2.5075, "step": 1122 }, { "epoch": 0.029575981037661313, "grad_norm": 5.153822898864746, "learning_rate": 4.852646826441928e-05, "loss": 1.6994, "step": 1123 }, { "epoch": 
0.029602317619173032, "grad_norm": 3.8108110427856445, "learning_rate": 4.8525151435343696e-05, "loss": 1.8838, "step": 1124 }, { "epoch": 0.02962865420068475, "grad_norm": 3.1336042881011963, "learning_rate": 4.852383460626811e-05, "loss": 1.9083, "step": 1125 }, { "epoch": 0.02965499078219647, "grad_norm": 3.490872383117676, "learning_rate": 4.852251777719252e-05, "loss": 1.912, "step": 1126 }, { "epoch": 0.02968132736370819, "grad_norm": 2.1635420322418213, "learning_rate": 4.8521200948116936e-05, "loss": 0.6838, "step": 1127 }, { "epoch": 0.02970766394521991, "grad_norm": 2.9938266277313232, "learning_rate": 4.851988411904135e-05, "loss": 2.2292, "step": 1128 }, { "epoch": 0.02973400052673163, "grad_norm": 4.7393670082092285, "learning_rate": 4.851856728996577e-05, "loss": 1.3348, "step": 1129 }, { "epoch": 0.02976033710824335, "grad_norm": 2.9201533794403076, "learning_rate": 4.8517250460890176e-05, "loss": 1.4847, "step": 1130 }, { "epoch": 0.02978667368975507, "grad_norm": 5.068320274353027, "learning_rate": 4.851593363181459e-05, "loss": 1.5796, "step": 1131 }, { "epoch": 0.02981301027126679, "grad_norm": 3.0481390953063965, "learning_rate": 4.851461680273901e-05, "loss": 0.413, "step": 1132 }, { "epoch": 0.02983934685277851, "grad_norm": 3.275697708129883, "learning_rate": 4.851329997366342e-05, "loss": 1.8815, "step": 1133 }, { "epoch": 0.02986568343429023, "grad_norm": 2.432833671569824, "learning_rate": 4.851198314458784e-05, "loss": 2.0992, "step": 1134 }, { "epoch": 0.02989202001580195, "grad_norm": 2.92842960357666, "learning_rate": 4.851066631551225e-05, "loss": 0.7407, "step": 1135 }, { "epoch": 0.029918356597313667, "grad_norm": 2.2532742023468018, "learning_rate": 4.850934948643666e-05, "loss": 2.0612, "step": 1136 }, { "epoch": 0.02994469317882539, "grad_norm": 2.632624387741089, "learning_rate": 4.850803265736107e-05, "loss": 1.5505, "step": 1137 }, { "epoch": 0.02997102976033711, "grad_norm": 4.293117523193359, "learning_rate": 
4.8506715828285494e-05, "loss": 1.9591, "step": 1138 }, { "epoch": 0.029997366341848828, "grad_norm": 9.684476852416992, "learning_rate": 4.85053989992099e-05, "loss": 0.8484, "step": 1139 }, { "epoch": 0.030023702923360547, "grad_norm": 2.6307811737060547, "learning_rate": 4.850408217013432e-05, "loss": 1.5203, "step": 1140 }, { "epoch": 0.030050039504872266, "grad_norm": 2.824821710586548, "learning_rate": 4.8502765341058735e-05, "loss": 1.9105, "step": 1141 }, { "epoch": 0.03007637608638399, "grad_norm": 2.490757942199707, "learning_rate": 4.850144851198315e-05, "loss": 2.4798, "step": 1142 }, { "epoch": 0.030102712667895708, "grad_norm": 2.523568630218506, "learning_rate": 4.8500131682907566e-05, "loss": 1.7346, "step": 1143 }, { "epoch": 0.030129049249407427, "grad_norm": 2.3697028160095215, "learning_rate": 4.8498814853831975e-05, "loss": 1.7498, "step": 1144 }, { "epoch": 0.030155385830919146, "grad_norm": 2.4945552349090576, "learning_rate": 4.849749802475639e-05, "loss": 1.8484, "step": 1145 }, { "epoch": 0.030181722412430865, "grad_norm": 5.43267297744751, "learning_rate": 4.84961811956808e-05, "loss": 2.0347, "step": 1146 }, { "epoch": 0.030208058993942587, "grad_norm": 3.8459794521331787, "learning_rate": 4.849486436660522e-05, "loss": 1.9875, "step": 1147 }, { "epoch": 0.030234395575454306, "grad_norm": 3.389763593673706, "learning_rate": 4.849354753752963e-05, "loss": 1.36, "step": 1148 }, { "epoch": 0.030260732156966026, "grad_norm": 3.047062873840332, "learning_rate": 4.8492230708454046e-05, "loss": 2.0154, "step": 1149 }, { "epoch": 0.030287068738477745, "grad_norm": 1.9687374830245972, "learning_rate": 4.8490913879378455e-05, "loss": 2.14, "step": 1150 }, { "epoch": 0.030313405319989467, "grad_norm": 5.142848014831543, "learning_rate": 4.848959705030287e-05, "loss": 1.3017, "step": 1151 }, { "epoch": 0.030339741901501186, "grad_norm": 5.077940940856934, "learning_rate": 4.8488280221227286e-05, "loss": 1.7044, "step": 1152 }, { "epoch": 
0.030366078483012905, "grad_norm": 2.410503387451172, "learning_rate": 4.84869633921517e-05, "loss": 1.8168, "step": 1153 }, { "epoch": 0.030392415064524624, "grad_norm": 2.8993778228759766, "learning_rate": 4.848564656307612e-05, "loss": 2.3745, "step": 1154 }, { "epoch": 0.030418751646036343, "grad_norm": 2.937490224838257, "learning_rate": 4.8484329734000526e-05, "loss": 1.3934, "step": 1155 }, { "epoch": 0.030445088227548066, "grad_norm": 3.265104055404663, "learning_rate": 4.848301290492494e-05, "loss": 1.978, "step": 1156 }, { "epoch": 0.030471424809059785, "grad_norm": 2.479564905166626, "learning_rate": 4.848169607584936e-05, "loss": 0.8255, "step": 1157 }, { "epoch": 0.030497761390571504, "grad_norm": 2.08847975730896, "learning_rate": 4.848037924677377e-05, "loss": 1.8818, "step": 1158 }, { "epoch": 0.030524097972083223, "grad_norm": 4.553527355194092, "learning_rate": 4.847906241769818e-05, "loss": 0.8432, "step": 1159 }, { "epoch": 0.030550434553594942, "grad_norm": 2.920544385910034, "learning_rate": 4.84777455886226e-05, "loss": 1.516, "step": 1160 }, { "epoch": 0.030576771135106665, "grad_norm": 2.890082359313965, "learning_rate": 4.847642875954701e-05, "loss": 1.5008, "step": 1161 }, { "epoch": 0.030603107716618384, "grad_norm": 6.989933013916016, "learning_rate": 4.847511193047143e-05, "loss": 0.8749, "step": 1162 }, { "epoch": 0.030629444298130103, "grad_norm": 4.9764509201049805, "learning_rate": 4.8473795101395844e-05, "loss": 1.455, "step": 1163 }, { "epoch": 0.03065578087964182, "grad_norm": 4.515769004821777, "learning_rate": 4.847247827232025e-05, "loss": 1.1419, "step": 1164 }, { "epoch": 0.03068211746115354, "grad_norm": 2.244635820388794, "learning_rate": 4.847116144324467e-05, "loss": 2.5669, "step": 1165 }, { "epoch": 0.030708454042665263, "grad_norm": 5.076201915740967, "learning_rate": 4.8469844614169084e-05, "loss": 1.9085, "step": 1166 }, { "epoch": 0.030734790624176982, "grad_norm": 2.9700934886932373, "learning_rate": 
4.84685277850935e-05, "loss": 1.81, "step": 1167 }, { "epoch": 0.0307611272056887, "grad_norm": 5.795651435852051, "learning_rate": 4.846721095601791e-05, "loss": 1.3432, "step": 1168 }, { "epoch": 0.03078746378720042, "grad_norm": 2.369987726211548, "learning_rate": 4.8465894126942324e-05, "loss": 1.9682, "step": 1169 }, { "epoch": 0.03081380036871214, "grad_norm": 3.435899496078491, "learning_rate": 4.846457729786674e-05, "loss": 2.5022, "step": 1170 }, { "epoch": 0.030840136950223862, "grad_norm": 3.1760265827178955, "learning_rate": 4.8463260468791155e-05, "loss": 2.0571, "step": 1171 }, { "epoch": 0.03086647353173558, "grad_norm": 2.5435879230499268, "learning_rate": 4.846194363971557e-05, "loss": 1.8374, "step": 1172 }, { "epoch": 0.0308928101132473, "grad_norm": 2.7065792083740234, "learning_rate": 4.846062681063998e-05, "loss": 2.1015, "step": 1173 }, { "epoch": 0.03091914669475902, "grad_norm": 1.9133766889572144, "learning_rate": 4.8459309981564395e-05, "loss": 1.6722, "step": 1174 }, { "epoch": 0.03094548327627074, "grad_norm": 2.596497058868408, "learning_rate": 4.845799315248881e-05, "loss": 1.9452, "step": 1175 }, { "epoch": 0.03097181985778246, "grad_norm": 5.125361442565918, "learning_rate": 4.8456676323413226e-05, "loss": 1.1212, "step": 1176 }, { "epoch": 0.03099815643929418, "grad_norm": 2.5910115242004395, "learning_rate": 4.8455359494337635e-05, "loss": 1.9922, "step": 1177 }, { "epoch": 0.0310244930208059, "grad_norm": 4.699963092803955, "learning_rate": 4.845404266526205e-05, "loss": 1.7486, "step": 1178 }, { "epoch": 0.031050829602317618, "grad_norm": 3.3821754455566406, "learning_rate": 4.8452725836186466e-05, "loss": 1.8035, "step": 1179 }, { "epoch": 0.03107716618382934, "grad_norm": 2.596687078475952, "learning_rate": 4.845140900711088e-05, "loss": 1.3562, "step": 1180 }, { "epoch": 0.03110350276534106, "grad_norm": 3.078029155731201, "learning_rate": 4.84500921780353e-05, "loss": 1.5201, "step": 1181 }, { "epoch": 0.03112983934685278, 
"grad_norm": 5.427302360534668, "learning_rate": 4.8448775348959706e-05, "loss": 2.043, "step": 1182 }, { "epoch": 0.031156175928364498, "grad_norm": 5.404221534729004, "learning_rate": 4.844745851988412e-05, "loss": 2.0277, "step": 1183 }, { "epoch": 0.031182512509876217, "grad_norm": 2.2742812633514404, "learning_rate": 4.844614169080853e-05, "loss": 2.0188, "step": 1184 }, { "epoch": 0.03120884909138794, "grad_norm": 3.477735996246338, "learning_rate": 4.844482486173295e-05, "loss": 0.3864, "step": 1185 }, { "epoch": 0.031235185672899658, "grad_norm": 2.8333590030670166, "learning_rate": 4.844350803265736e-05, "loss": 1.3481, "step": 1186 }, { "epoch": 0.03126152225441138, "grad_norm": 3.7202811241149902, "learning_rate": 4.844219120358178e-05, "loss": 1.4413, "step": 1187 }, { "epoch": 0.031287858835923096, "grad_norm": 3.836300849914551, "learning_rate": 4.844087437450619e-05, "loss": 2.4155, "step": 1188 }, { "epoch": 0.03131419541743482, "grad_norm": 2.1705095767974854, "learning_rate": 4.84395575454306e-05, "loss": 1.5203, "step": 1189 }, { "epoch": 0.031340531998946534, "grad_norm": 9.435272216796875, "learning_rate": 4.8438240716355024e-05, "loss": 2.1017, "step": 1190 }, { "epoch": 0.03136686858045826, "grad_norm": 2.069197177886963, "learning_rate": 4.843692388727943e-05, "loss": 1.6487, "step": 1191 }, { "epoch": 0.03139320516196998, "grad_norm": 4.131947040557861, "learning_rate": 4.843560705820385e-05, "loss": 1.4584, "step": 1192 }, { "epoch": 0.031419541743481695, "grad_norm": 2.280083417892456, "learning_rate": 4.843429022912826e-05, "loss": 1.2831, "step": 1193 }, { "epoch": 0.03144587832499342, "grad_norm": 2.114161729812622, "learning_rate": 4.843297340005268e-05, "loss": 1.3618, "step": 1194 }, { "epoch": 0.03147221490650513, "grad_norm": 2.3315417766571045, "learning_rate": 4.843165657097709e-05, "loss": 1.4538, "step": 1195 }, { "epoch": 0.031498551488016856, "grad_norm": 2.4717631340026855, "learning_rate": 4.8430339741901504e-05, "loss": 
1.127, "step": 1196 }, { "epoch": 0.03152488806952858, "grad_norm": 3.3255231380462646, "learning_rate": 4.842902291282592e-05, "loss": 0.7941, "step": 1197 }, { "epoch": 0.031551224651040294, "grad_norm": 2.67148494720459, "learning_rate": 4.842770608375033e-05, "loss": 2.1661, "step": 1198 }, { "epoch": 0.031577561232552016, "grad_norm": 4.5087714195251465, "learning_rate": 4.842638925467475e-05, "loss": 2.2192, "step": 1199 }, { "epoch": 0.03160389781406373, "grad_norm": 2.4082632064819336, "learning_rate": 4.842507242559916e-05, "loss": 2.1281, "step": 1200 }, { "epoch": 0.031630234395575454, "grad_norm": 2.3811023235321045, "learning_rate": 4.8423755596523576e-05, "loss": 1.706, "step": 1201 }, { "epoch": 0.03165657097708718, "grad_norm": 4.598793029785156, "learning_rate": 4.8422438767447984e-05, "loss": 1.8155, "step": 1202 }, { "epoch": 0.03168290755859889, "grad_norm": 2.2454915046691895, "learning_rate": 4.84211219383724e-05, "loss": 1.6846, "step": 1203 }, { "epoch": 0.031709244140110615, "grad_norm": 6.481629371643066, "learning_rate": 4.8419805109296816e-05, "loss": 2.5402, "step": 1204 }, { "epoch": 0.03173558072162233, "grad_norm": 3.875046730041504, "learning_rate": 4.841848828022123e-05, "loss": 1.0837, "step": 1205 }, { "epoch": 0.03176191730313405, "grad_norm": 2.3516602516174316, "learning_rate": 4.841717145114564e-05, "loss": 1.891, "step": 1206 }, { "epoch": 0.031788253884645776, "grad_norm": 2.502894639968872, "learning_rate": 4.8415854622070056e-05, "loss": 1.9226, "step": 1207 }, { "epoch": 0.03181459046615749, "grad_norm": 2.2583303451538086, "learning_rate": 4.841453779299447e-05, "loss": 1.9457, "step": 1208 }, { "epoch": 0.031840927047669214, "grad_norm": 2.4761016368865967, "learning_rate": 4.841322096391889e-05, "loss": 1.5265, "step": 1209 }, { "epoch": 0.03186726362918093, "grad_norm": 2.4672107696533203, "learning_rate": 4.84119041348433e-05, "loss": 2.1244, "step": 1210 }, { "epoch": 0.03189360021069265, "grad_norm": 
3.0499157905578613, "learning_rate": 4.841058730576771e-05, "loss": 0.3298, "step": 1211 }, { "epoch": 0.031919936792204374, "grad_norm": 2.463407516479492, "learning_rate": 4.840927047669213e-05, "loss": 1.7443, "step": 1212 }, { "epoch": 0.03194627337371609, "grad_norm": 5.298512935638428, "learning_rate": 4.840795364761654e-05, "loss": 1.075, "step": 1213 }, { "epoch": 0.03197260995522781, "grad_norm": 2.7082419395446777, "learning_rate": 4.840663681854096e-05, "loss": 1.8387, "step": 1214 }, { "epoch": 0.03199894653673953, "grad_norm": 3.170705556869507, "learning_rate": 4.840531998946537e-05, "loss": 0.6658, "step": 1215 }, { "epoch": 0.03202528311825125, "grad_norm": 3.09491229057312, "learning_rate": 4.840400316038978e-05, "loss": 1.971, "step": 1216 }, { "epoch": 0.03205161969976297, "grad_norm": 3.932175874710083, "learning_rate": 4.84026863313142e-05, "loss": 1.4447, "step": 1217 }, { "epoch": 0.03207795628127469, "grad_norm": 2.247992515563965, "learning_rate": 4.8401369502238614e-05, "loss": 1.7858, "step": 1218 }, { "epoch": 0.03210429286278641, "grad_norm": 2.677718162536621, "learning_rate": 4.840005267316303e-05, "loss": 1.7789, "step": 1219 }, { "epoch": 0.03213062944429813, "grad_norm": 2.337888240814209, "learning_rate": 4.839873584408744e-05, "loss": 1.9844, "step": 1220 }, { "epoch": 0.03215696602580985, "grad_norm": 2.4834091663360596, "learning_rate": 4.8397419015011854e-05, "loss": 1.7121, "step": 1221 }, { "epoch": 0.03218330260732157, "grad_norm": 2.4524142742156982, "learning_rate": 4.839610218593626e-05, "loss": 2.3675, "step": 1222 }, { "epoch": 0.03220963918883329, "grad_norm": 2.556180953979492, "learning_rate": 4.8394785356860685e-05, "loss": 1.9163, "step": 1223 }, { "epoch": 0.03223597577034501, "grad_norm": 5.075287342071533, "learning_rate": 4.8393468527785094e-05, "loss": 1.3693, "step": 1224 }, { "epoch": 0.032262312351856726, "grad_norm": 2.355077028274536, "learning_rate": 4.839215169870951e-05, "loss": 1.6558, "step": 1225 
}, { "epoch": 0.03228864893336845, "grad_norm": 2.9545958042144775, "learning_rate": 4.8390834869633925e-05, "loss": 0.5404, "step": 1226 }, { "epoch": 0.03231498551488017, "grad_norm": 2.1071887016296387, "learning_rate": 4.838951804055834e-05, "loss": 1.8877, "step": 1227 }, { "epoch": 0.032341322096391886, "grad_norm": 6.698026180267334, "learning_rate": 4.8388201211482756e-05, "loss": 1.1915, "step": 1228 }, { "epoch": 0.03236765867790361, "grad_norm": 2.412713050842285, "learning_rate": 4.8386884382407165e-05, "loss": 1.2396, "step": 1229 }, { "epoch": 0.03239399525941533, "grad_norm": 2.383208990097046, "learning_rate": 4.838556755333158e-05, "loss": 1.9404, "step": 1230 }, { "epoch": 0.03242033184092705, "grad_norm": 2.873124122619629, "learning_rate": 4.838425072425599e-05, "loss": 2.3502, "step": 1231 }, { "epoch": 0.03244666842243877, "grad_norm": 4.161839962005615, "learning_rate": 4.838293389518041e-05, "loss": 1.9853, "step": 1232 }, { "epoch": 0.032473005003950485, "grad_norm": 2.328953742980957, "learning_rate": 4.838161706610482e-05, "loss": 2.03, "step": 1233 }, { "epoch": 0.03249934158546221, "grad_norm": 3.1227409839630127, "learning_rate": 4.8380300237029236e-05, "loss": 0.5901, "step": 1234 }, { "epoch": 0.03252567816697393, "grad_norm": 2.4840292930603027, "learning_rate": 4.837898340795365e-05, "loss": 2.0666, "step": 1235 }, { "epoch": 0.032552014748485646, "grad_norm": 2.3030710220336914, "learning_rate": 4.837766657887806e-05, "loss": 1.6181, "step": 1236 }, { "epoch": 0.03257835132999737, "grad_norm": 4.652442455291748, "learning_rate": 4.837634974980248e-05, "loss": 1.3754, "step": 1237 }, { "epoch": 0.032604687911509084, "grad_norm": 2.7813150882720947, "learning_rate": 4.837503292072689e-05, "loss": 2.5307, "step": 1238 }, { "epoch": 0.032631024493020806, "grad_norm": 2.065514326095581, "learning_rate": 4.837371609165131e-05, "loss": 1.521, "step": 1239 }, { "epoch": 0.03265736107453253, "grad_norm": 4.542300224304199, "learning_rate": 
4.8372399262575716e-05, "loss": 1.3511, "step": 1240 }, { "epoch": 0.032683697656044244, "grad_norm": 3.3543527126312256, "learning_rate": 4.837108243350014e-05, "loss": 2.1861, "step": 1241 }, { "epoch": 0.03271003423755597, "grad_norm": 3.2510035037994385, "learning_rate": 4.836976560442455e-05, "loss": 1.738, "step": 1242 }, { "epoch": 0.03273637081906768, "grad_norm": 3.5997731685638428, "learning_rate": 4.836844877534896e-05, "loss": 1.1752, "step": 1243 }, { "epoch": 0.032762707400579405, "grad_norm": 2.349310874938965, "learning_rate": 4.836713194627338e-05, "loss": 1.907, "step": 1244 }, { "epoch": 0.03278904398209113, "grad_norm": 3.2032058238983154, "learning_rate": 4.836581511719779e-05, "loss": 1.8664, "step": 1245 }, { "epoch": 0.03281538056360284, "grad_norm": 4.305539608001709, "learning_rate": 4.836449828812221e-05, "loss": 0.6863, "step": 1246 }, { "epoch": 0.032841717145114566, "grad_norm": 6.19760799407959, "learning_rate": 4.836318145904662e-05, "loss": 0.8177, "step": 1247 }, { "epoch": 0.03286805372662628, "grad_norm": 3.9096808433532715, "learning_rate": 4.8361864629971034e-05, "loss": 2.1843, "step": 1248 }, { "epoch": 0.032894390308138004, "grad_norm": 3.2951722145080566, "learning_rate": 4.836054780089544e-05, "loss": 1.9124, "step": 1249 }, { "epoch": 0.032920726889649726, "grad_norm": 3.348398208618164, "learning_rate": 4.835923097181986e-05, "loss": 1.6557, "step": 1250 }, { "epoch": 0.03294706347116144, "grad_norm": 2.1526827812194824, "learning_rate": 4.8357914142744274e-05, "loss": 1.8642, "step": 1251 }, { "epoch": 0.032973400052673164, "grad_norm": 6.212658882141113, "learning_rate": 4.835659731366869e-05, "loss": 1.4845, "step": 1252 }, { "epoch": 0.03299973663418488, "grad_norm": 2.499819278717041, "learning_rate": 4.83552804845931e-05, "loss": 2.1482, "step": 1253 }, { "epoch": 0.0330260732156966, "grad_norm": 2.5231502056121826, "learning_rate": 4.8353963655517514e-05, "loss": 1.3152, "step": 1254 }, { "epoch": 
0.033052409797208325, "grad_norm": 4.560873985290527, "learning_rate": 4.835264682644193e-05, "loss": 1.0126, "step": 1255 }, { "epoch": 0.03307874637872004, "grad_norm": 6.148919582366943, "learning_rate": 4.8351329997366345e-05, "loss": 2.2314, "step": 1256 }, { "epoch": 0.03310508296023176, "grad_norm": 8.156359672546387, "learning_rate": 4.835001316829076e-05, "loss": 0.9903, "step": 1257 }, { "epoch": 0.03313141954174348, "grad_norm": 2.1040937900543213, "learning_rate": 4.834869633921517e-05, "loss": 2.2143, "step": 1258 }, { "epoch": 0.0331577561232552, "grad_norm": 2.2730515003204346, "learning_rate": 4.8347379510139585e-05, "loss": 1.8334, "step": 1259 }, { "epoch": 0.033184092704766924, "grad_norm": 2.8352062702178955, "learning_rate": 4.8346062681064e-05, "loss": 1.7796, "step": 1260 }, { "epoch": 0.03321042928627864, "grad_norm": 4.116631984710693, "learning_rate": 4.8344745851988417e-05, "loss": 1.8516, "step": 1261 }, { "epoch": 0.03323676586779036, "grad_norm": 2.0427281856536865, "learning_rate": 4.8343429022912825e-05, "loss": 2.4238, "step": 1262 }, { "epoch": 0.03326310244930208, "grad_norm": 3.4862911701202393, "learning_rate": 4.834211219383724e-05, "loss": 1.7011, "step": 1263 }, { "epoch": 0.0332894390308138, "grad_norm": 3.1244916915893555, "learning_rate": 4.8340795364761657e-05, "loss": 1.1916, "step": 1264 }, { "epoch": 0.03331577561232552, "grad_norm": 2.486405849456787, "learning_rate": 4.833947853568607e-05, "loss": 2.1259, "step": 1265 }, { "epoch": 0.03334211219383724, "grad_norm": 6.2503767013549805, "learning_rate": 4.833816170661049e-05, "loss": 0.8626, "step": 1266 }, { "epoch": 0.03336844877534896, "grad_norm": 8.085247993469238, "learning_rate": 4.8336844877534897e-05, "loss": 1.1699, "step": 1267 }, { "epoch": 0.033394785356860676, "grad_norm": 2.33953857421875, "learning_rate": 4.833552804845931e-05, "loss": 2.0922, "step": 1268 }, { "epoch": 0.0334211219383724, "grad_norm": 5.278698921203613, "learning_rate": 
4.833421121938372e-05, "loss": 0.9735, "step": 1269 }, { "epoch": 0.03344745851988412, "grad_norm": 3.065584421157837, "learning_rate": 4.833289439030814e-05, "loss": 1.7886, "step": 1270 }, { "epoch": 0.03347379510139584, "grad_norm": 3.733517646789551, "learning_rate": 4.833157756123255e-05, "loss": 1.2978, "step": 1271 }, { "epoch": 0.03350013168290756, "grad_norm": 1.9371833801269531, "learning_rate": 4.833026073215697e-05, "loss": 1.7297, "step": 1272 }, { "epoch": 0.033526468264419275, "grad_norm": 2.1965014934539795, "learning_rate": 4.832894390308138e-05, "loss": 1.7929, "step": 1273 }, { "epoch": 0.033552804845931, "grad_norm": 2.186753511428833, "learning_rate": 4.83276270740058e-05, "loss": 2.1285, "step": 1274 }, { "epoch": 0.03357914142744272, "grad_norm": 2.922542095184326, "learning_rate": 4.8326310244930215e-05, "loss": 1.6997, "step": 1275 }, { "epoch": 0.033605478008954436, "grad_norm": 3.0036470890045166, "learning_rate": 4.832499341585462e-05, "loss": 1.7925, "step": 1276 }, { "epoch": 0.03363181459046616, "grad_norm": 7.521142959594727, "learning_rate": 4.832367658677904e-05, "loss": 2.1324, "step": 1277 }, { "epoch": 0.03365815117197788, "grad_norm": 4.4400739669799805, "learning_rate": 4.832235975770345e-05, "loss": 1.2237, "step": 1278 }, { "epoch": 0.033684487753489596, "grad_norm": 2.646451234817505, "learning_rate": 4.832104292862787e-05, "loss": 1.3697, "step": 1279 }, { "epoch": 0.03371082433500132, "grad_norm": 2.9623894691467285, "learning_rate": 4.831972609955228e-05, "loss": 2.2205, "step": 1280 }, { "epoch": 0.033737160916513034, "grad_norm": 4.186872482299805, "learning_rate": 4.8318409270476695e-05, "loss": 1.6858, "step": 1281 }, { "epoch": 0.03376349749802476, "grad_norm": 3.4387664794921875, "learning_rate": 4.831709244140111e-05, "loss": 1.0801, "step": 1282 }, { "epoch": 0.03378983407953648, "grad_norm": 4.123729705810547, "learning_rate": 4.831577561232552e-05, "loss": 1.5339, "step": 1283 }, { "epoch": 
0.033816170661048195, "grad_norm": 2.3596715927124023, "learning_rate": 4.831445878324994e-05, "loss": 1.7717, "step": 1284 }, { "epoch": 0.03384250724255992, "grad_norm": 4.8778276443481445, "learning_rate": 4.831314195417435e-05, "loss": 1.4071, "step": 1285 }, { "epoch": 0.03386884382407163, "grad_norm": 2.4648845195770264, "learning_rate": 4.8311825125098766e-05, "loss": 2.5404, "step": 1286 }, { "epoch": 0.033895180405583356, "grad_norm": 2.552497625350952, "learning_rate": 4.8310508296023175e-05, "loss": 2.2106, "step": 1287 }, { "epoch": 0.03392151698709508, "grad_norm": 8.436152458190918, "learning_rate": 4.830919146694759e-05, "loss": 1.8232, "step": 1288 }, { "epoch": 0.033947853568606794, "grad_norm": 2.4023749828338623, "learning_rate": 4.8307874637872006e-05, "loss": 1.6959, "step": 1289 }, { "epoch": 0.033974190150118516, "grad_norm": 8.067594528198242, "learning_rate": 4.830655780879642e-05, "loss": 1.52, "step": 1290 }, { "epoch": 0.03400052673163023, "grad_norm": 6.440790176391602, "learning_rate": 4.830524097972084e-05, "loss": 1.4087, "step": 1291 }, { "epoch": 0.034026863313141954, "grad_norm": 6.596911430358887, "learning_rate": 4.8303924150645246e-05, "loss": 1.3801, "step": 1292 }, { "epoch": 0.03405319989465368, "grad_norm": 2.277406930923462, "learning_rate": 4.830260732156967e-05, "loss": 1.5915, "step": 1293 }, { "epoch": 0.03407953647616539, "grad_norm": 2.351935863494873, "learning_rate": 4.830129049249408e-05, "loss": 2.3419, "step": 1294 }, { "epoch": 0.034105873057677115, "grad_norm": 3.372990846633911, "learning_rate": 4.829997366341849e-05, "loss": 0.3246, "step": 1295 }, { "epoch": 0.03413220963918883, "grad_norm": 4.0242156982421875, "learning_rate": 4.82986568343429e-05, "loss": 1.3814, "step": 1296 }, { "epoch": 0.03415854622070055, "grad_norm": 2.2410614490509033, "learning_rate": 4.829734000526732e-05, "loss": 1.9321, "step": 1297 }, { "epoch": 0.034184882802212276, "grad_norm": 2.1041512489318848, "learning_rate": 
4.829602317619173e-05, "loss": 1.1096, "step": 1298 }, { "epoch": 0.03421121938372399, "grad_norm": 11.442770004272461, "learning_rate": 4.829470634711615e-05, "loss": 1.6356, "step": 1299 }, { "epoch": 0.034237555965235714, "grad_norm": 2.5029757022857666, "learning_rate": 4.829338951804056e-05, "loss": 2.2546, "step": 1300 }, { "epoch": 0.03426389254674743, "grad_norm": 3.0751092433929443, "learning_rate": 4.829207268896497e-05, "loss": 2.0434, "step": 1301 }, { "epoch": 0.03429022912825915, "grad_norm": 3.357970714569092, "learning_rate": 4.829075585988939e-05, "loss": 1.894, "step": 1302 }, { "epoch": 0.034316565709770874, "grad_norm": 2.2058346271514893, "learning_rate": 4.8289439030813804e-05, "loss": 1.9861, "step": 1303 }, { "epoch": 0.03434290229128259, "grad_norm": 3.02363657951355, "learning_rate": 4.828812220173822e-05, "loss": 0.3299, "step": 1304 }, { "epoch": 0.03436923887279431, "grad_norm": 4.684982776641846, "learning_rate": 4.828680537266263e-05, "loss": 1.0779, "step": 1305 }, { "epoch": 0.03439557545430603, "grad_norm": 5.264890193939209, "learning_rate": 4.8285488543587044e-05, "loss": 0.9333, "step": 1306 }, { "epoch": 0.03442191203581775, "grad_norm": 3.341413974761963, "learning_rate": 4.828417171451146e-05, "loss": 1.3823, "step": 1307 }, { "epoch": 0.03444824861732947, "grad_norm": 3.4844167232513428, "learning_rate": 4.8282854885435875e-05, "loss": 1.4234, "step": 1308 }, { "epoch": 0.03447458519884119, "grad_norm": 3.4124138355255127, "learning_rate": 4.8281538056360284e-05, "loss": 1.9774, "step": 1309 }, { "epoch": 0.03450092178035291, "grad_norm": 2.2452385425567627, "learning_rate": 4.82802212272847e-05, "loss": 2.1438, "step": 1310 }, { "epoch": 0.03452725836186463, "grad_norm": 3.668550729751587, "learning_rate": 4.8278904398209115e-05, "loss": 1.36, "step": 1311 }, { "epoch": 0.03455359494337635, "grad_norm": 2.3201591968536377, "learning_rate": 4.827758756913353e-05, "loss": 1.6325, "step": 1312 }, { "epoch": 
0.03457993152488807, "grad_norm": 2.956763505935669, "learning_rate": 4.8276270740057946e-05, "loss": 2.9976, "step": 1313 }, { "epoch": 0.03460626810639979, "grad_norm": 2.1899678707122803, "learning_rate": 4.8274953910982355e-05, "loss": 2.1028, "step": 1314 }, { "epoch": 0.03463260468791151, "grad_norm": 3.3504998683929443, "learning_rate": 4.827363708190677e-05, "loss": 0.732, "step": 1315 }, { "epoch": 0.034658941269423225, "grad_norm": 2.741833448410034, "learning_rate": 4.827232025283118e-05, "loss": 1.6869, "step": 1316 }, { "epoch": 0.03468527785093495, "grad_norm": 2.77390193939209, "learning_rate": 4.82710034237556e-05, "loss": 1.678, "step": 1317 }, { "epoch": 0.03471161443244667, "grad_norm": 5.46265983581543, "learning_rate": 4.826968659468001e-05, "loss": 1.5652, "step": 1318 }, { "epoch": 0.034737951013958386, "grad_norm": 6.403779983520508, "learning_rate": 4.8268369765604426e-05, "loss": 0.9487, "step": 1319 }, { "epoch": 0.03476428759547011, "grad_norm": 2.5105652809143066, "learning_rate": 4.826705293652884e-05, "loss": 2.2179, "step": 1320 }, { "epoch": 0.03479062417698183, "grad_norm": 2.5746607780456543, "learning_rate": 4.826573610745325e-05, "loss": 2.0969, "step": 1321 }, { "epoch": 0.03481696075849355, "grad_norm": 4.139590263366699, "learning_rate": 4.826441927837767e-05, "loss": 1.494, "step": 1322 }, { "epoch": 0.03484329734000527, "grad_norm": 2.3788700103759766, "learning_rate": 4.826310244930208e-05, "loss": 1.5353, "step": 1323 }, { "epoch": 0.034869633921516985, "grad_norm": 1.9991793632507324, "learning_rate": 4.82617856202265e-05, "loss": 1.8536, "step": 1324 }, { "epoch": 0.03489597050302871, "grad_norm": 3.0393738746643066, "learning_rate": 4.8260468791150906e-05, "loss": 0.8268, "step": 1325 }, { "epoch": 0.03492230708454043, "grad_norm": 3.374997138977051, "learning_rate": 4.825915196207533e-05, "loss": 1.8157, "step": 1326 }, { "epoch": 0.034948643666052145, "grad_norm": 6.295700550079346, "learning_rate": 
4.825783513299974e-05, "loss": 2.6546, "step": 1327 }, { "epoch": 0.03497498024756387, "grad_norm": 3.399216651916504, "learning_rate": 4.825651830392415e-05, "loss": 1.482, "step": 1328 }, { "epoch": 0.035001316829075584, "grad_norm": 2.5268938541412354, "learning_rate": 4.825520147484857e-05, "loss": 2.2271, "step": 1329 }, { "epoch": 0.035027653410587306, "grad_norm": 4.140182971954346, "learning_rate": 4.825388464577298e-05, "loss": 1.6582, "step": 1330 }, { "epoch": 0.03505398999209903, "grad_norm": 2.9830386638641357, "learning_rate": 4.82525678166974e-05, "loss": 2.0572, "step": 1331 }, { "epoch": 0.035080326573610744, "grad_norm": 5.903008460998535, "learning_rate": 4.825125098762181e-05, "loss": 0.4726, "step": 1332 }, { "epoch": 0.03510666315512247, "grad_norm": 4.696227550506592, "learning_rate": 4.8249934158546224e-05, "loss": 1.269, "step": 1333 }, { "epoch": 0.03513299973663418, "grad_norm": 4.289698123931885, "learning_rate": 4.824861732947063e-05, "loss": 0.7344, "step": 1334 }, { "epoch": 0.035159336318145905, "grad_norm": 2.677140235900879, "learning_rate": 4.824730050039505e-05, "loss": 1.0939, "step": 1335 }, { "epoch": 0.03518567289965763, "grad_norm": 2.237262487411499, "learning_rate": 4.8245983671319464e-05, "loss": 2.1044, "step": 1336 }, { "epoch": 0.03521200948116934, "grad_norm": 2.694110155105591, "learning_rate": 4.824466684224388e-05, "loss": 0.5662, "step": 1337 }, { "epoch": 0.035238346062681065, "grad_norm": 2.9993741512298584, "learning_rate": 4.8243350013168296e-05, "loss": 0.7674, "step": 1338 }, { "epoch": 0.03526468264419278, "grad_norm": 1.9640650749206543, "learning_rate": 4.8242033184092704e-05, "loss": 2.2223, "step": 1339 }, { "epoch": 0.035291019225704504, "grad_norm": 2.2978978157043457, "learning_rate": 4.824071635501713e-05, "loss": 2.1875, "step": 1340 }, { "epoch": 0.035317355807216226, "grad_norm": 3.9391028881073, "learning_rate": 4.8239399525941536e-05, "loss": 1.8525, "step": 1341 }, { "epoch": 
0.03534369238872794, "grad_norm": 3.5753555297851562, "learning_rate": 4.823808269686595e-05, "loss": 2.3247, "step": 1342 }, { "epoch": 0.035370028970239664, "grad_norm": 3.101198673248291, "learning_rate": 4.823676586779036e-05, "loss": 2.4757, "step": 1343 }, { "epoch": 0.03539636555175138, "grad_norm": 3.337867021560669, "learning_rate": 4.8235449038714776e-05, "loss": 1.4835, "step": 1344 }, { "epoch": 0.0354227021332631, "grad_norm": 2.6907167434692383, "learning_rate": 4.823413220963919e-05, "loss": 1.7228, "step": 1345 }, { "epoch": 0.035449038714774825, "grad_norm": 3.3301050662994385, "learning_rate": 4.823281538056361e-05, "loss": 1.9144, "step": 1346 }, { "epoch": 0.03547537529628654, "grad_norm": 3.553663492202759, "learning_rate": 4.823149855148802e-05, "loss": 2.2447, "step": 1347 }, { "epoch": 0.03550171187779826, "grad_norm": 4.311988353729248, "learning_rate": 4.823018172241243e-05, "loss": 1.6596, "step": 1348 }, { "epoch": 0.03552804845930998, "grad_norm": 3.39910888671875, "learning_rate": 4.822886489333685e-05, "loss": 1.9699, "step": 1349 }, { "epoch": 0.0355543850408217, "grad_norm": 2.2859280109405518, "learning_rate": 4.822754806426126e-05, "loss": 1.8481, "step": 1350 }, { "epoch": 0.035580721622333424, "grad_norm": 4.299882888793945, "learning_rate": 4.822623123518568e-05, "loss": 1.5176, "step": 1351 }, { "epoch": 0.03560705820384514, "grad_norm": 4.792234420776367, "learning_rate": 4.822491440611009e-05, "loss": 1.6701, "step": 1352 }, { "epoch": 0.03563339478535686, "grad_norm": 3.010335683822632, "learning_rate": 4.82235975770345e-05, "loss": 0.8111, "step": 1353 }, { "epoch": 0.03565973136686858, "grad_norm": 2.2211203575134277, "learning_rate": 4.822228074795891e-05, "loss": 1.9087, "step": 1354 }, { "epoch": 0.0356860679483803, "grad_norm": 2.451058864593506, "learning_rate": 4.8220963918883334e-05, "loss": 1.545, "step": 1355 }, { "epoch": 0.03571240452989202, "grad_norm": 2.579103946685791, "learning_rate": 
4.821964708980774e-05, "loss": 1.796, "step": 1356 }, { "epoch": 0.03573874111140374, "grad_norm": 4.573684215545654, "learning_rate": 4.821833026073216e-05, "loss": 1.0103, "step": 1357 }, { "epoch": 0.03576507769291546, "grad_norm": 2.26727294921875, "learning_rate": 4.8217013431656574e-05, "loss": 1.5844, "step": 1358 }, { "epoch": 0.035791414274427176, "grad_norm": 3.4333019256591797, "learning_rate": 4.821569660258099e-05, "loss": 1.7426, "step": 1359 }, { "epoch": 0.0358177508559389, "grad_norm": 7.677754878997803, "learning_rate": 4.8214379773505405e-05, "loss": 1.8325, "step": 1360 }, { "epoch": 0.03584408743745062, "grad_norm": 3.0336837768554688, "learning_rate": 4.8213062944429814e-05, "loss": 2.039, "step": 1361 }, { "epoch": 0.03587042401896234, "grad_norm": 3.4892420768737793, "learning_rate": 4.821174611535423e-05, "loss": 1.5181, "step": 1362 }, { "epoch": 0.03589676060047406, "grad_norm": 3.1478965282440186, "learning_rate": 4.821042928627864e-05, "loss": 1.4686, "step": 1363 }, { "epoch": 0.03592309718198578, "grad_norm": 2.5794179439544678, "learning_rate": 4.820911245720306e-05, "loss": 0.4452, "step": 1364 }, { "epoch": 0.0359494337634975, "grad_norm": 2.2024765014648438, "learning_rate": 4.820779562812747e-05, "loss": 1.4755, "step": 1365 }, { "epoch": 0.03597577034500922, "grad_norm": 2.669931173324585, "learning_rate": 4.8206478799051885e-05, "loss": 1.8739, "step": 1366 }, { "epoch": 0.036002106926520935, "grad_norm": 3.913017511367798, "learning_rate": 4.82051619699763e-05, "loss": 1.9402, "step": 1367 }, { "epoch": 0.03602844350803266, "grad_norm": 3.415797472000122, "learning_rate": 4.820384514090071e-05, "loss": 1.0633, "step": 1368 }, { "epoch": 0.03605478008954438, "grad_norm": 3.1644015312194824, "learning_rate": 4.820252831182513e-05, "loss": 1.5442, "step": 1369 }, { "epoch": 0.036081116671056096, "grad_norm": 3.9022562503814697, "learning_rate": 4.820121148274954e-05, "loss": 1.7415, "step": 1370 }, { "epoch": 0.03610745325256782, 
"grad_norm": 3.3646533489227295, "learning_rate": 4.8199894653673956e-05, "loss": 2.3507, "step": 1371 }, { "epoch": 0.036133789834079534, "grad_norm": 2.9992384910583496, "learning_rate": 4.8198577824598365e-05, "loss": 1.7051, "step": 1372 }, { "epoch": 0.03616012641559126, "grad_norm": 3.279506206512451, "learning_rate": 4.819726099552279e-05, "loss": 1.307, "step": 1373 }, { "epoch": 0.03618646299710298, "grad_norm": 3.8295230865478516, "learning_rate": 4.8195944166447196e-05, "loss": 1.657, "step": 1374 }, { "epoch": 0.036212799578614695, "grad_norm": 4.198893070220947, "learning_rate": 4.819462733737161e-05, "loss": 0.9144, "step": 1375 }, { "epoch": 0.03623913616012642, "grad_norm": 2.8750100135803223, "learning_rate": 4.819331050829603e-05, "loss": 2.0248, "step": 1376 }, { "epoch": 0.03626547274163813, "grad_norm": 2.485234260559082, "learning_rate": 4.8191993679220436e-05, "loss": 1.6745, "step": 1377 }, { "epoch": 0.036291809323149855, "grad_norm": 2.555755376815796, "learning_rate": 4.819067685014486e-05, "loss": 1.0228, "step": 1378 }, { "epoch": 0.03631814590466158, "grad_norm": 3.703505754470825, "learning_rate": 4.818936002106927e-05, "loss": 0.6707, "step": 1379 }, { "epoch": 0.03634448248617329, "grad_norm": 2.726428508758545, "learning_rate": 4.818804319199368e-05, "loss": 1.5401, "step": 1380 }, { "epoch": 0.036370819067685016, "grad_norm": 3.7582430839538574, "learning_rate": 4.818672636291809e-05, "loss": 1.1392, "step": 1381 }, { "epoch": 0.03639715564919673, "grad_norm": 2.7370986938476562, "learning_rate": 4.818540953384251e-05, "loss": 2.1486, "step": 1382 }, { "epoch": 0.036423492230708454, "grad_norm": 3.982513189315796, "learning_rate": 4.818409270476692e-05, "loss": 2.6714, "step": 1383 }, { "epoch": 0.03644982881222018, "grad_norm": 3.420853614807129, "learning_rate": 4.818277587569134e-05, "loss": 0.4569, "step": 1384 }, { "epoch": 0.03647616539373189, "grad_norm": 2.9245407581329346, "learning_rate": 4.8181459046615754e-05, "loss": 
2.3995, "step": 1385 }, { "epoch": 0.036502501975243615, "grad_norm": 8.815753936767578, "learning_rate": 4.818014221754016e-05, "loss": 1.1951, "step": 1386 }, { "epoch": 0.03652883855675533, "grad_norm": 5.456750392913818, "learning_rate": 4.8178825388464585e-05, "loss": 1.5686, "step": 1387 }, { "epoch": 0.03655517513826705, "grad_norm": 3.259455919265747, "learning_rate": 4.8177508559388994e-05, "loss": 1.5534, "step": 1388 }, { "epoch": 0.036581511719778775, "grad_norm": 2.732182741165161, "learning_rate": 4.817619173031341e-05, "loss": 1.7391, "step": 1389 }, { "epoch": 0.03660784830129049, "grad_norm": 3.078794479370117, "learning_rate": 4.817487490123782e-05, "loss": 2.286, "step": 1390 }, { "epoch": 0.03663418488280221, "grad_norm": 4.257495880126953, "learning_rate": 4.8173558072162234e-05, "loss": 1.0608, "step": 1391 }, { "epoch": 0.03666052146431393, "grad_norm": 3.027405261993408, "learning_rate": 4.817224124308665e-05, "loss": 1.176, "step": 1392 }, { "epoch": 0.03668685804582565, "grad_norm": 3.272449493408203, "learning_rate": 4.8170924414011065e-05, "loss": 1.7577, "step": 1393 }, { "epoch": 0.036713194627337374, "grad_norm": 7.281032562255859, "learning_rate": 4.816960758493548e-05, "loss": 2.452, "step": 1394 }, { "epoch": 0.03673953120884909, "grad_norm": 2.2292678356170654, "learning_rate": 4.816829075585989e-05, "loss": 1.9123, "step": 1395 }, { "epoch": 0.03676586779036081, "grad_norm": 4.264851093292236, "learning_rate": 4.8166973926784305e-05, "loss": 0.8762, "step": 1396 }, { "epoch": 0.03679220437187253, "grad_norm": 3.4080307483673096, "learning_rate": 4.816565709770872e-05, "loss": 1.0579, "step": 1397 }, { "epoch": 0.03681854095338425, "grad_norm": 2.9511606693267822, "learning_rate": 4.8164340268633137e-05, "loss": 1.876, "step": 1398 }, { "epoch": 0.03684487753489597, "grad_norm": 3.078354597091675, "learning_rate": 4.8163023439557545e-05, "loss": 2.0897, "step": 1399 }, { "epoch": 0.03687121411640769, "grad_norm": 
2.9722883701324463, "learning_rate": 4.816170661048196e-05, "loss": 1.5776, "step": 1400 }, { "epoch": 0.03689755069791941, "grad_norm": 4.6948723793029785, "learning_rate": 4.816038978140637e-05, "loss": 1.19, "step": 1401 }, { "epoch": 0.036923887279431127, "grad_norm": 2.6448488235473633, "learning_rate": 4.815907295233079e-05, "loss": 1.4374, "step": 1402 }, { "epoch": 0.03695022386094285, "grad_norm": 2.951441526412964, "learning_rate": 4.81577561232552e-05, "loss": 2.1182, "step": 1403 }, { "epoch": 0.03697656044245457, "grad_norm": 2.4465129375457764, "learning_rate": 4.8156439294179617e-05, "loss": 1.7781, "step": 1404 }, { "epoch": 0.03700289702396629, "grad_norm": 2.9098169803619385, "learning_rate": 4.815512246510403e-05, "loss": 1.9132, "step": 1405 }, { "epoch": 0.03702923360547801, "grad_norm": 2.493217945098877, "learning_rate": 4.815380563602845e-05, "loss": 2.0592, "step": 1406 }, { "epoch": 0.037055570186989725, "grad_norm": 3.7359561920166016, "learning_rate": 4.815248880695286e-05, "loss": 1.5837, "step": 1407 }, { "epoch": 0.03708190676850145, "grad_norm": 2.9856550693511963, "learning_rate": 4.815117197787727e-05, "loss": 2.203, "step": 1408 }, { "epoch": 0.03710824335001317, "grad_norm": 2.2589473724365234, "learning_rate": 4.814985514880169e-05, "loss": 1.9073, "step": 1409 }, { "epoch": 0.037134579931524886, "grad_norm": 3.434154748916626, "learning_rate": 4.81485383197261e-05, "loss": 1.8538, "step": 1410 }, { "epoch": 0.03716091651303661, "grad_norm": 2.994636058807373, "learning_rate": 4.814722149065052e-05, "loss": 0.6366, "step": 1411 }, { "epoch": 0.03718725309454833, "grad_norm": 2.82177734375, "learning_rate": 4.814590466157493e-05, "loss": 1.4591, "step": 1412 }, { "epoch": 0.037213589676060047, "grad_norm": 4.66945219039917, "learning_rate": 4.8144587832499343e-05, "loss": 2.239, "step": 1413 }, { "epoch": 0.03723992625757177, "grad_norm": 3.21661639213562, "learning_rate": 4.814327100342376e-05, "loss": 1.3781, "step": 1414 }, { 
"epoch": 0.037266262839083485, "grad_norm": 2.4987006187438965, "learning_rate": 4.814195417434817e-05, "loss": 1.7669, "step": 1415 }, { "epoch": 0.03729259942059521, "grad_norm": 2.091668128967285, "learning_rate": 4.814063734527259e-05, "loss": 1.5082, "step": 1416 }, { "epoch": 0.03731893600210693, "grad_norm": 2.086125373840332, "learning_rate": 4.8139320516197e-05, "loss": 1.6896, "step": 1417 }, { "epoch": 0.037345272583618645, "grad_norm": 2.369629144668579, "learning_rate": 4.8138003687121415e-05, "loss": 2.1499, "step": 1418 }, { "epoch": 0.03737160916513037, "grad_norm": 2.1372570991516113, "learning_rate": 4.8136686858045823e-05, "loss": 1.6387, "step": 1419 }, { "epoch": 0.03739794574664208, "grad_norm": 2.225848436355591, "learning_rate": 4.8135370028970246e-05, "loss": 2.3318, "step": 1420 }, { "epoch": 0.037424282328153806, "grad_norm": 4.289445877075195, "learning_rate": 4.8134053199894655e-05, "loss": 1.773, "step": 1421 }, { "epoch": 0.03745061890966553, "grad_norm": 5.045720100402832, "learning_rate": 4.813273637081907e-05, "loss": 1.1577, "step": 1422 }, { "epoch": 0.037476955491177244, "grad_norm": 2.802020311355591, "learning_rate": 4.8131419541743486e-05, "loss": 1.5965, "step": 1423 }, { "epoch": 0.037503292072688967, "grad_norm": 3.742877244949341, "learning_rate": 4.8130102712667895e-05, "loss": 1.2814, "step": 1424 }, { "epoch": 0.03752962865420068, "grad_norm": 4.184816360473633, "learning_rate": 4.812878588359232e-05, "loss": 1.8437, "step": 1425 }, { "epoch": 0.037555965235712405, "grad_norm": 5.02267599105835, "learning_rate": 4.8127469054516726e-05, "loss": 0.6091, "step": 1426 }, { "epoch": 0.03758230181722413, "grad_norm": 5.021767616271973, "learning_rate": 4.812615222544114e-05, "loss": 0.8637, "step": 1427 }, { "epoch": 0.03760863839873584, "grad_norm": 6.322299480438232, "learning_rate": 4.812483539636555e-05, "loss": 1.9106, "step": 1428 }, { "epoch": 0.037634974980247565, "grad_norm": 4.426939487457275, "learning_rate": 
4.8123518567289966e-05, "loss": 0.513, "step": 1429 }, { "epoch": 0.03766131156175928, "grad_norm": 3.189406394958496, "learning_rate": 4.812220173821438e-05, "loss": 2.2815, "step": 1430 }, { "epoch": 0.037687648143271, "grad_norm": 4.226385593414307, "learning_rate": 4.81208849091388e-05, "loss": 1.8733, "step": 1431 }, { "epoch": 0.037713984724782726, "grad_norm": 2.647603988647461, "learning_rate": 4.811956808006321e-05, "loss": 2.4634, "step": 1432 }, { "epoch": 0.03774032130629444, "grad_norm": 2.241933584213257, "learning_rate": 4.811825125098762e-05, "loss": 1.9525, "step": 1433 }, { "epoch": 0.037766657887806164, "grad_norm": 3.8084769248962402, "learning_rate": 4.811693442191204e-05, "loss": 1.5282, "step": 1434 }, { "epoch": 0.03779299446931788, "grad_norm": 2.568134307861328, "learning_rate": 4.811561759283645e-05, "loss": 1.8027, "step": 1435 }, { "epoch": 0.0378193310508296, "grad_norm": 1.9084701538085938, "learning_rate": 4.811430076376087e-05, "loss": 1.7684, "step": 1436 }, { "epoch": 0.037845667632341325, "grad_norm": 4.7902679443359375, "learning_rate": 4.811298393468528e-05, "loss": 1.4196, "step": 1437 }, { "epoch": 0.03787200421385304, "grad_norm": 2.587052822113037, "learning_rate": 4.811166710560969e-05, "loss": 1.7431, "step": 1438 }, { "epoch": 0.03789834079536476, "grad_norm": 3.3005499839782715, "learning_rate": 4.811035027653411e-05, "loss": 1.8728, "step": 1439 }, { "epoch": 0.03792467737687648, "grad_norm": 3.298290252685547, "learning_rate": 4.8109033447458524e-05, "loss": 1.9283, "step": 1440 }, { "epoch": 0.0379510139583882, "grad_norm": 2.685931921005249, "learning_rate": 4.810771661838294e-05, "loss": 1.651, "step": 1441 }, { "epoch": 0.03797735053989992, "grad_norm": 1.9227145910263062, "learning_rate": 4.810639978930735e-05, "loss": 1.8952, "step": 1442 }, { "epoch": 0.03800368712141164, "grad_norm": 2.178524971008301, "learning_rate": 4.8105082960231764e-05, "loss": 1.5836, "step": 1443 }, { "epoch": 0.03803002370292336, 
"grad_norm": 3.530819892883301, "learning_rate": 4.810376613115618e-05, "loss": 1.6536, "step": 1444 }, { "epoch": 0.03805636028443508, "grad_norm": 4.413656234741211, "learning_rate": 4.8102449302080595e-05, "loss": 1.7439, "step": 1445 }, { "epoch": 0.0380826968659468, "grad_norm": 2.618088960647583, "learning_rate": 4.8101132473005004e-05, "loss": 1.6252, "step": 1446 }, { "epoch": 0.03810903344745852, "grad_norm": 6.07904052734375, "learning_rate": 4.809981564392942e-05, "loss": 2.3006, "step": 1447 }, { "epoch": 0.03813537002897024, "grad_norm": 4.130184650421143, "learning_rate": 4.8098498814853835e-05, "loss": 1.7968, "step": 1448 }, { "epoch": 0.03816170661048196, "grad_norm": 2.354788064956665, "learning_rate": 4.809718198577825e-05, "loss": 1.7768, "step": 1449 }, { "epoch": 0.038188043191993676, "grad_norm": 4.8974409103393555, "learning_rate": 4.8095865156702666e-05, "loss": 2.2463, "step": 1450 }, { "epoch": 0.0382143797735054, "grad_norm": 2.6150875091552734, "learning_rate": 4.8094548327627075e-05, "loss": 1.1499, "step": 1451 }, { "epoch": 0.03824071635501712, "grad_norm": 2.972714900970459, "learning_rate": 4.809323149855149e-05, "loss": 0.6017, "step": 1452 }, { "epoch": 0.038267052936528836, "grad_norm": 2.5725221633911133, "learning_rate": 4.8091914669475906e-05, "loss": 1.7313, "step": 1453 }, { "epoch": 0.03829338951804056, "grad_norm": 3.1481668949127197, "learning_rate": 4.809059784040032e-05, "loss": 2.4714, "step": 1454 }, { "epoch": 0.03831972609955228, "grad_norm": 3.2930145263671875, "learning_rate": 4.808928101132473e-05, "loss": 1.4067, "step": 1455 }, { "epoch": 0.038346062681064, "grad_norm": 2.3901965618133545, "learning_rate": 4.8087964182249146e-05, "loss": 2.0908, "step": 1456 }, { "epoch": 0.03837239926257572, "grad_norm": 2.4288535118103027, "learning_rate": 4.8086647353173555e-05, "loss": 1.7124, "step": 1457 }, { "epoch": 0.038398735844087435, "grad_norm": 2.4708545207977295, "learning_rate": 4.808533052409798e-05, "loss": 
1.7156, "step": 1458 }, { "epoch": 0.03842507242559916, "grad_norm": 4.000490665435791, "learning_rate": 4.8084013695022386e-05, "loss": 0.8025, "step": 1459 }, { "epoch": 0.03845140900711088, "grad_norm": 2.33880615234375, "learning_rate": 4.80826968659468e-05, "loss": 2.1641, "step": 1460 }, { "epoch": 0.038477745588622596, "grad_norm": 4.5710673332214355, "learning_rate": 4.808138003687122e-05, "loss": 1.4686, "step": 1461 }, { "epoch": 0.03850408217013432, "grad_norm": 7.145897388458252, "learning_rate": 4.8080063207795626e-05, "loss": 0.7443, "step": 1462 }, { "epoch": 0.038530418751646034, "grad_norm": 2.459108352661133, "learning_rate": 4.807874637872005e-05, "loss": 1.6073, "step": 1463 }, { "epoch": 0.038556755333157756, "grad_norm": 3.707882881164551, "learning_rate": 4.807742954964446e-05, "loss": 1.2412, "step": 1464 }, { "epoch": 0.03858309191466948, "grad_norm": 5.278344631195068, "learning_rate": 4.807611272056887e-05, "loss": 1.1009, "step": 1465 }, { "epoch": 0.038609428496181195, "grad_norm": 2.403775930404663, "learning_rate": 4.807479589149328e-05, "loss": 1.7357, "step": 1466 }, { "epoch": 0.03863576507769292, "grad_norm": 4.183655738830566, "learning_rate": 4.80734790624177e-05, "loss": 1.335, "step": 1467 }, { "epoch": 0.03866210165920463, "grad_norm": 2.9094831943511963, "learning_rate": 4.807216223334211e-05, "loss": 0.8503, "step": 1468 }, { "epoch": 0.038688438240716355, "grad_norm": 3.1184732913970947, "learning_rate": 4.807084540426653e-05, "loss": 0.6333, "step": 1469 }, { "epoch": 0.03871477482222808, "grad_norm": 2.4614548683166504, "learning_rate": 4.8069528575190944e-05, "loss": 1.896, "step": 1470 }, { "epoch": 0.03874111140373979, "grad_norm": 2.6103456020355225, "learning_rate": 4.806821174611535e-05, "loss": 1.2525, "step": 1471 }, { "epoch": 0.038767447985251516, "grad_norm": 3.9586451053619385, "learning_rate": 4.8066894917039776e-05, "loss": 0.6833, "step": 1472 }, { "epoch": 0.03879378456676323, "grad_norm": 
2.902448892593384, "learning_rate": 4.8065578087964184e-05, "loss": 1.4491, "step": 1473 }, { "epoch": 0.038820121148274954, "grad_norm": 2.5038821697235107, "learning_rate": 4.80642612588886e-05, "loss": 2.2628, "step": 1474 }, { "epoch": 0.038846457729786676, "grad_norm": 3.35122013092041, "learning_rate": 4.806294442981301e-05, "loss": 2.0252, "step": 1475 }, { "epoch": 0.03887279431129839, "grad_norm": 6.653318881988525, "learning_rate": 4.8061627600737424e-05, "loss": 1.5859, "step": 1476 }, { "epoch": 0.038899130892810115, "grad_norm": 2.248802423477173, "learning_rate": 4.806031077166184e-05, "loss": 1.708, "step": 1477 }, { "epoch": 0.03892546747432183, "grad_norm": 4.171624183654785, "learning_rate": 4.8058993942586256e-05, "loss": 1.8427, "step": 1478 }, { "epoch": 0.03895180405583355, "grad_norm": 6.605432987213135, "learning_rate": 4.805767711351067e-05, "loss": 1.4973, "step": 1479 }, { "epoch": 0.038978140637345275, "grad_norm": 4.893253326416016, "learning_rate": 4.805636028443508e-05, "loss": 1.768, "step": 1480 }, { "epoch": 0.03900447721885699, "grad_norm": 2.6387462615966797, "learning_rate": 4.8055043455359496e-05, "loss": 1.2058, "step": 1481 }, { "epoch": 0.03903081380036871, "grad_norm": 2.1615798473358154, "learning_rate": 4.805372662628391e-05, "loss": 2.7933, "step": 1482 }, { "epoch": 0.03905715038188043, "grad_norm": 4.006496906280518, "learning_rate": 4.805240979720833e-05, "loss": 1.5641, "step": 1483 }, { "epoch": 0.03908348696339215, "grad_norm": 3.03674578666687, "learning_rate": 4.8051092968132736e-05, "loss": 1.5177, "step": 1484 }, { "epoch": 0.039109823544903874, "grad_norm": 3.581326723098755, "learning_rate": 4.804977613905715e-05, "loss": 1.3614, "step": 1485 }, { "epoch": 0.03913616012641559, "grad_norm": 2.2554171085357666, "learning_rate": 4.804845930998157e-05, "loss": 2.1849, "step": 1486 }, { "epoch": 0.03916249670792731, "grad_norm": 2.508643388748169, "learning_rate": 4.804714248090598e-05, "loss": 0.3424, "step": 
1487 }, { "epoch": 0.03918883328943903, "grad_norm": 2.149822235107422, "learning_rate": 4.80458256518304e-05, "loss": 1.5553, "step": 1488 }, { "epoch": 0.03921516987095075, "grad_norm": 2.9578163623809814, "learning_rate": 4.804450882275481e-05, "loss": 1.2616, "step": 1489 }, { "epoch": 0.03924150645246247, "grad_norm": 3.0891036987304688, "learning_rate": 4.804319199367922e-05, "loss": 0.5437, "step": 1490 }, { "epoch": 0.03926784303397419, "grad_norm": 3.854132652282715, "learning_rate": 4.804187516460364e-05, "loss": 1.0159, "step": 1491 }, { "epoch": 0.03929417961548591, "grad_norm": 2.0185046195983887, "learning_rate": 4.8040558335528054e-05, "loss": 1.455, "step": 1492 }, { "epoch": 0.039320516196997626, "grad_norm": 2.6549127101898193, "learning_rate": 4.803924150645246e-05, "loss": 1.2516, "step": 1493 }, { "epoch": 0.03934685277850935, "grad_norm": 2.0132808685302734, "learning_rate": 4.803792467737688e-05, "loss": 2.1761, "step": 1494 }, { "epoch": 0.03937318936002107, "grad_norm": 3.6217565536499023, "learning_rate": 4.8036607848301294e-05, "loss": 1.6783, "step": 1495 }, { "epoch": 0.03939952594153279, "grad_norm": 2.7061996459960938, "learning_rate": 4.803529101922571e-05, "loss": 1.9596, "step": 1496 }, { "epoch": 0.03942586252304451, "grad_norm": 3.6776225566864014, "learning_rate": 4.8033974190150125e-05, "loss": 2.2301, "step": 1497 }, { "epoch": 0.03945219910455623, "grad_norm": 2.7040247917175293, "learning_rate": 4.8032657361074534e-05, "loss": 1.9844, "step": 1498 }, { "epoch": 0.03947853568606795, "grad_norm": 5.953075885772705, "learning_rate": 4.803134053199895e-05, "loss": 1.189, "step": 1499 }, { "epoch": 0.03950487226757967, "grad_norm": 4.968349933624268, "learning_rate": 4.803002370292336e-05, "loss": 1.6651, "step": 1500 }, { "epoch": 0.039531208849091386, "grad_norm": 2.8154876232147217, "learning_rate": 4.802870687384778e-05, "loss": 2.0657, "step": 1501 }, { "epoch": 0.03955754543060311, "grad_norm": 5.4353203773498535, 
"learning_rate": 4.802739004477219e-05, "loss": 1.0015, "step": 1502 }, { "epoch": 0.03958388201211483, "grad_norm": 6.000146389007568, "learning_rate": 4.8026073215696605e-05, "loss": 1.6951, "step": 1503 }, { "epoch": 0.039610218593626546, "grad_norm": 3.825589656829834, "learning_rate": 4.8024756386621014e-05, "loss": 1.8409, "step": 1504 }, { "epoch": 0.03963655517513827, "grad_norm": 3.282968044281006, "learning_rate": 4.8023439557545436e-05, "loss": 0.5316, "step": 1505 }, { "epoch": 0.039662891756649984, "grad_norm": 5.311993598937988, "learning_rate": 4.8022122728469845e-05, "loss": 1.5813, "step": 1506 }, { "epoch": 0.03968922833816171, "grad_norm": 2.694779872894287, "learning_rate": 4.802080589939426e-05, "loss": 2.5022, "step": 1507 }, { "epoch": 0.03971556491967343, "grad_norm": 3.5496184825897217, "learning_rate": 4.8019489070318676e-05, "loss": 0.5585, "step": 1508 }, { "epoch": 0.039741901501185145, "grad_norm": 4.04374885559082, "learning_rate": 4.8018172241243085e-05, "loss": 1.0659, "step": 1509 }, { "epoch": 0.03976823808269687, "grad_norm": 3.6027233600616455, "learning_rate": 4.801685541216751e-05, "loss": 2.686, "step": 1510 }, { "epoch": 0.03979457466420858, "grad_norm": 3.1707308292388916, "learning_rate": 4.8015538583091916e-05, "loss": 1.6032, "step": 1511 }, { "epoch": 0.039820911245720306, "grad_norm": 2.2083845138549805, "learning_rate": 4.801422175401633e-05, "loss": 2.2039, "step": 1512 }, { "epoch": 0.03984724782723203, "grad_norm": 2.095097541809082, "learning_rate": 4.801290492494074e-05, "loss": 2.2174, "step": 1513 }, { "epoch": 0.039873584408743744, "grad_norm": 4.769643306732178, "learning_rate": 4.8011588095865156e-05, "loss": 1.6258, "step": 1514 }, { "epoch": 0.039899920990255466, "grad_norm": 2.147710084915161, "learning_rate": 4.801027126678957e-05, "loss": 1.9945, "step": 1515 }, { "epoch": 0.03992625757176718, "grad_norm": 2.94934344291687, "learning_rate": 4.800895443771399e-05, "loss": 1.8538, "step": 1516 }, { 
"epoch": 0.039952594153278904, "grad_norm": 3.88328218460083, "learning_rate": 4.80076376086384e-05, "loss": 2.2841, "step": 1517 }, { "epoch": 0.03997893073479063, "grad_norm": 2.9859044551849365, "learning_rate": 4.800632077956281e-05, "loss": 0.4016, "step": 1518 }, { "epoch": 0.04000526731630234, "grad_norm": 2.992422342300415, "learning_rate": 4.8005003950487234e-05, "loss": 0.351, "step": 1519 }, { "epoch": 0.040031603897814065, "grad_norm": 4.386923313140869, "learning_rate": 4.800368712141164e-05, "loss": 1.9155, "step": 1520 }, { "epoch": 0.04005794047932578, "grad_norm": 8.80577564239502, "learning_rate": 4.800237029233606e-05, "loss": 1.6105, "step": 1521 }, { "epoch": 0.0400842770608375, "grad_norm": 3.1761035919189453, "learning_rate": 4.800105346326047e-05, "loss": 1.8264, "step": 1522 }, { "epoch": 0.040110613642349226, "grad_norm": 2.6750943660736084, "learning_rate": 4.799973663418488e-05, "loss": 1.7529, "step": 1523 }, { "epoch": 0.04013695022386094, "grad_norm": 2.4789552688598633, "learning_rate": 4.79984198051093e-05, "loss": 2.0503, "step": 1524 }, { "epoch": 0.040163286805372664, "grad_norm": 3.246100664138794, "learning_rate": 4.7997102976033714e-05, "loss": 0.3434, "step": 1525 }, { "epoch": 0.04018962338688438, "grad_norm": 2.3975906372070312, "learning_rate": 4.799578614695813e-05, "loss": 2.0262, "step": 1526 }, { "epoch": 0.0402159599683961, "grad_norm": 3.271538257598877, "learning_rate": 4.799446931788254e-05, "loss": 1.6912, "step": 1527 }, { "epoch": 0.040242296549907824, "grad_norm": 2.627981424331665, "learning_rate": 4.7993152488806954e-05, "loss": 2.229, "step": 1528 }, { "epoch": 0.04026863313141954, "grad_norm": 4.80177640914917, "learning_rate": 4.799183565973137e-05, "loss": 1.9914, "step": 1529 }, { "epoch": 0.04029496971293126, "grad_norm": 4.581860065460205, "learning_rate": 4.7990518830655785e-05, "loss": 0.5321, "step": 1530 }, { "epoch": 0.04032130629444298, "grad_norm": 4.275857448577881, "learning_rate": 
4.7989202001580194e-05, "loss": 1.0698, "step": 1531 }, { "epoch": 0.0403476428759547, "grad_norm": 3.4956071376800537, "learning_rate": 4.798788517250461e-05, "loss": 1.1492, "step": 1532 }, { "epoch": 0.04037397945746642, "grad_norm": 3.086482524871826, "learning_rate": 4.7986568343429025e-05, "loss": 1.7993, "step": 1533 }, { "epoch": 0.04040031603897814, "grad_norm": 3.3483619689941406, "learning_rate": 4.798525151435344e-05, "loss": 1.977, "step": 1534 }, { "epoch": 0.04042665262048986, "grad_norm": 2.910607099533081, "learning_rate": 4.7983934685277857e-05, "loss": 1.6934, "step": 1535 }, { "epoch": 0.04045298920200158, "grad_norm": 4.313564777374268, "learning_rate": 4.7982617856202265e-05, "loss": 1.9829, "step": 1536 }, { "epoch": 0.0404793257835133, "grad_norm": 3.657001495361328, "learning_rate": 4.798130102712668e-05, "loss": 2.2356, "step": 1537 }, { "epoch": 0.04050566236502502, "grad_norm": 2.117579460144043, "learning_rate": 4.79799841980511e-05, "loss": 1.7355, "step": 1538 }, { "epoch": 0.04053199894653674, "grad_norm": 2.5381734371185303, "learning_rate": 4.797866736897551e-05, "loss": 2.1239, "step": 1539 }, { "epoch": 0.04055833552804846, "grad_norm": 2.609988212585449, "learning_rate": 4.797735053989992e-05, "loss": 1.7085, "step": 1540 }, { "epoch": 0.040584672109560176, "grad_norm": 2.351916551589966, "learning_rate": 4.797603371082434e-05, "loss": 1.9941, "step": 1541 }, { "epoch": 0.0406110086910719, "grad_norm": 4.297755241394043, "learning_rate": 4.797471688174875e-05, "loss": 1.2975, "step": 1542 }, { "epoch": 0.04063734527258362, "grad_norm": 3.406371593475342, "learning_rate": 4.797340005267317e-05, "loss": 0.6757, "step": 1543 }, { "epoch": 0.040663681854095336, "grad_norm": 2.7242140769958496, "learning_rate": 4.7972083223597583e-05, "loss": 1.4838, "step": 1544 }, { "epoch": 0.04069001843560706, "grad_norm": 5.119045734405518, "learning_rate": 4.797076639452199e-05, "loss": 1.1646, "step": 1545 }, { "epoch": 0.04071635501711878, 
"grad_norm": 2.1951379776000977, "learning_rate": 4.796944956544641e-05, "loss": 1.6532, "step": 1546 }, { "epoch": 0.0407426915986305, "grad_norm": 2.7918548583984375, "learning_rate": 4.796813273637082e-05, "loss": 1.8081, "step": 1547 }, { "epoch": 0.04076902818014222, "grad_norm": 3.408937931060791, "learning_rate": 4.796681590729524e-05, "loss": 1.7224, "step": 1548 }, { "epoch": 0.040795364761653935, "grad_norm": 13.387161254882812, "learning_rate": 4.796549907821965e-05, "loss": 1.2584, "step": 1549 }, { "epoch": 0.04082170134316566, "grad_norm": 2.8224565982818604, "learning_rate": 4.7964182249144063e-05, "loss": 2.2906, "step": 1550 }, { "epoch": 0.04084803792467738, "grad_norm": 1.834579348564148, "learning_rate": 4.796286542006848e-05, "loss": 2.7615, "step": 1551 }, { "epoch": 0.040874374506189096, "grad_norm": 3.280954599380493, "learning_rate": 4.7961548590992895e-05, "loss": 1.973, "step": 1552 }, { "epoch": 0.04090071108770082, "grad_norm": 2.621882438659668, "learning_rate": 4.796023176191731e-05, "loss": 2.1953, "step": 1553 }, { "epoch": 0.040927047669212534, "grad_norm": 2.110182046890259, "learning_rate": 4.795891493284172e-05, "loss": 1.7225, "step": 1554 }, { "epoch": 0.040953384250724256, "grad_norm": 3.0083208084106445, "learning_rate": 4.7957598103766135e-05, "loss": 0.3899, "step": 1555 }, { "epoch": 0.04097972083223598, "grad_norm": 3.3830409049987793, "learning_rate": 4.7956281274690543e-05, "loss": 1.9343, "step": 1556 }, { "epoch": 0.041006057413747694, "grad_norm": 2.7673492431640625, "learning_rate": 4.7954964445614966e-05, "loss": 2.0526, "step": 1557 }, { "epoch": 0.04103239399525942, "grad_norm": 2.359412431716919, "learning_rate": 4.7953647616539375e-05, "loss": 1.9374, "step": 1558 }, { "epoch": 0.04105873057677113, "grad_norm": 2.9968972206115723, "learning_rate": 4.795233078746379e-05, "loss": 2.271, "step": 1559 }, { "epoch": 0.041085067158282855, "grad_norm": 3.117983341217041, "learning_rate": 4.79510139583882e-05, "loss": 
1.6007, "step": 1560 }, { "epoch": 0.04111140373979458, "grad_norm": 5.773384094238281, "learning_rate": 4.7949697129312615e-05, "loss": 1.176, "step": 1561 }, { "epoch": 0.04113774032130629, "grad_norm": 2.4290173053741455, "learning_rate": 4.794838030023703e-05, "loss": 1.1747, "step": 1562 }, { "epoch": 0.041164076902818016, "grad_norm": 3.631800651550293, "learning_rate": 4.7947063471161446e-05, "loss": 1.7565, "step": 1563 }, { "epoch": 0.04119041348432973, "grad_norm": 4.197127342224121, "learning_rate": 4.794574664208586e-05, "loss": 1.7623, "step": 1564 }, { "epoch": 0.041216750065841454, "grad_norm": 3.4556450843811035, "learning_rate": 4.794442981301027e-05, "loss": 1.9954, "step": 1565 }, { "epoch": 0.041243086647353176, "grad_norm": 2.8269834518432617, "learning_rate": 4.7943112983934686e-05, "loss": 2.2221, "step": 1566 }, { "epoch": 0.04126942322886489, "grad_norm": 1.908091425895691, "learning_rate": 4.79417961548591e-05, "loss": 1.8832, "step": 1567 }, { "epoch": 0.041295759810376614, "grad_norm": 4.320079326629639, "learning_rate": 4.794047932578352e-05, "loss": 1.6265, "step": 1568 }, { "epoch": 0.04132209639188833, "grad_norm": 3.9409115314483643, "learning_rate": 4.7939162496707926e-05, "loss": 1.2814, "step": 1569 }, { "epoch": 0.04134843297340005, "grad_norm": 3.6473047733306885, "learning_rate": 4.793784566763234e-05, "loss": 2.4627, "step": 1570 }, { "epoch": 0.041374769554911775, "grad_norm": 2.154637575149536, "learning_rate": 4.793652883855676e-05, "loss": 0.5109, "step": 1571 }, { "epoch": 0.04140110613642349, "grad_norm": 2.0151844024658203, "learning_rate": 4.793521200948117e-05, "loss": 1.7914, "step": 1572 }, { "epoch": 0.04142744271793521, "grad_norm": 3.6110353469848633, "learning_rate": 4.793389518040559e-05, "loss": 1.1011, "step": 1573 }, { "epoch": 0.04145377929944693, "grad_norm": 2.5403449535369873, "learning_rate": 4.793257835133e-05, "loss": 1.3453, "step": 1574 }, { "epoch": 0.04148011588095865, "grad_norm": 
3.1963884830474854, "learning_rate": 4.793126152225441e-05, "loss": 0.7116, "step": 1575 }, { "epoch": 0.041506452462470374, "grad_norm": 2.1681923866271973, "learning_rate": 4.792994469317883e-05, "loss": 1.6642, "step": 1576 }, { "epoch": 0.04153278904398209, "grad_norm": 2.9085628986358643, "learning_rate": 4.7928627864103244e-05, "loss": 1.6519, "step": 1577 }, { "epoch": 0.04155912562549381, "grad_norm": 2.453711986541748, "learning_rate": 4.792731103502765e-05, "loss": 1.5192, "step": 1578 }, { "epoch": 0.04158546220700553, "grad_norm": 5.075864791870117, "learning_rate": 4.792599420595207e-05, "loss": 1.5274, "step": 1579 }, { "epoch": 0.04161179878851725, "grad_norm": 3.4300835132598877, "learning_rate": 4.7924677376876484e-05, "loss": 1.7227, "step": 1580 }, { "epoch": 0.04163813537002897, "grad_norm": 2.6349151134490967, "learning_rate": 4.79233605478009e-05, "loss": 1.8592, "step": 1581 }, { "epoch": 0.04166447195154069, "grad_norm": 2.22281813621521, "learning_rate": 4.7922043718725315e-05, "loss": 0.9179, "step": 1582 }, { "epoch": 0.04169080853305241, "grad_norm": 2.7420451641082764, "learning_rate": 4.7920726889649724e-05, "loss": 2.0062, "step": 1583 }, { "epoch": 0.041717145114564126, "grad_norm": 2.932339668273926, "learning_rate": 4.791941006057414e-05, "loss": 2.4226, "step": 1584 }, { "epoch": 0.04174348169607585, "grad_norm": 2.3704586029052734, "learning_rate": 4.7918093231498555e-05, "loss": 2.0831, "step": 1585 }, { "epoch": 0.04176981827758757, "grad_norm": 3.4765636920928955, "learning_rate": 4.791677640242297e-05, "loss": 1.7034, "step": 1586 }, { "epoch": 0.04179615485909929, "grad_norm": 4.112740993499756, "learning_rate": 4.791545957334738e-05, "loss": 0.7328, "step": 1587 }, { "epoch": 0.04182249144061101, "grad_norm": 2.626997232437134, "learning_rate": 4.7914142744271795e-05, "loss": 2.0492, "step": 1588 }, { "epoch": 0.04184882802212273, "grad_norm": 6.23048734664917, "learning_rate": 4.791282591519621e-05, "loss": 1.3451, "step": 
1589 }, { "epoch": 0.04187516460363445, "grad_norm": 4.811365604400635, "learning_rate": 4.7911509086120626e-05, "loss": 1.1983, "step": 1590 }, { "epoch": 0.04190150118514617, "grad_norm": 2.2768898010253906, "learning_rate": 4.791019225704504e-05, "loss": 2.4662, "step": 1591 }, { "epoch": 0.041927837766657886, "grad_norm": 2.5405027866363525, "learning_rate": 4.790887542796945e-05, "loss": 2.5304, "step": 1592 }, { "epoch": 0.04195417434816961, "grad_norm": 2.4795734882354736, "learning_rate": 4.7907558598893866e-05, "loss": 2.1591, "step": 1593 }, { "epoch": 0.04198051092968133, "grad_norm": 2.5752766132354736, "learning_rate": 4.7906241769818275e-05, "loss": 2.2811, "step": 1594 }, { "epoch": 0.042006847511193046, "grad_norm": 2.638512134552002, "learning_rate": 4.79049249407427e-05, "loss": 1.8319, "step": 1595 }, { "epoch": 0.04203318409270477, "grad_norm": 3.4777679443359375, "learning_rate": 4.7903608111667106e-05, "loss": 2.272, "step": 1596 }, { "epoch": 0.042059520674216484, "grad_norm": 20.70643424987793, "learning_rate": 4.790229128259152e-05, "loss": 1.9126, "step": 1597 }, { "epoch": 0.04208585725572821, "grad_norm": 3.7559597492218018, "learning_rate": 4.790097445351594e-05, "loss": 0.4809, "step": 1598 }, { "epoch": 0.04211219383723993, "grad_norm": 3.2905445098876953, "learning_rate": 4.7899657624440346e-05, "loss": 1.1684, "step": 1599 }, { "epoch": 0.042138530418751645, "grad_norm": 3.089296817779541, "learning_rate": 4.789834079536477e-05, "loss": 2.2523, "step": 1600 }, { "epoch": 0.04216486700026337, "grad_norm": 2.502190589904785, "learning_rate": 4.789702396628918e-05, "loss": 1.8332, "step": 1601 }, { "epoch": 0.04219120358177508, "grad_norm": 2.6924796104431152, "learning_rate": 4.789570713721359e-05, "loss": 1.8899, "step": 1602 }, { "epoch": 0.042217540163286806, "grad_norm": 8.66147232055664, "learning_rate": 4.7894390308138e-05, "loss": 1.1298, "step": 1603 }, { "epoch": 0.04224387674479853, "grad_norm": 3.2306127548217773, 
"learning_rate": 4.7893073479062424e-05, "loss": 2.0096, "step": 1604 }, { "epoch": 0.042270213326310244, "grad_norm": 2.53001070022583, "learning_rate": 4.789175664998683e-05, "loss": 1.6158, "step": 1605 }, { "epoch": 0.042296549907821966, "grad_norm": 2.410015344619751, "learning_rate": 4.789043982091125e-05, "loss": 2.1817, "step": 1606 }, { "epoch": 0.04232288648933368, "grad_norm": 3.794081926345825, "learning_rate": 4.788912299183566e-05, "loss": 1.2384, "step": 1607 }, { "epoch": 0.042349223070845404, "grad_norm": 2.8532073497772217, "learning_rate": 4.788780616276007e-05, "loss": 1.6316, "step": 1608 }, { "epoch": 0.04237555965235713, "grad_norm": 3.2820916175842285, "learning_rate": 4.788648933368449e-05, "loss": 1.4657, "step": 1609 }, { "epoch": 0.04240189623386884, "grad_norm": 2.917919397354126, "learning_rate": 4.7885172504608904e-05, "loss": 1.5911, "step": 1610 }, { "epoch": 0.042428232815380565, "grad_norm": 2.8589375019073486, "learning_rate": 4.788385567553332e-05, "loss": 1.4094, "step": 1611 }, { "epoch": 0.04245456939689228, "grad_norm": 2.2336888313293457, "learning_rate": 4.788253884645773e-05, "loss": 1.4805, "step": 1612 }, { "epoch": 0.042480905978404, "grad_norm": 1.9819060564041138, "learning_rate": 4.7881222017382144e-05, "loss": 1.9223, "step": 1613 }, { "epoch": 0.042507242559915726, "grad_norm": 3.8112151622772217, "learning_rate": 4.787990518830656e-05, "loss": 1.6173, "step": 1614 }, { "epoch": 0.04253357914142744, "grad_norm": 4.478504657745361, "learning_rate": 4.7878588359230976e-05, "loss": 2.2684, "step": 1615 }, { "epoch": 0.042559915722939164, "grad_norm": 5.168664932250977, "learning_rate": 4.7877271530155384e-05, "loss": 0.9267, "step": 1616 }, { "epoch": 0.04258625230445088, "grad_norm": 2.1900522708892822, "learning_rate": 4.78759547010798e-05, "loss": 1.5993, "step": 1617 }, { "epoch": 0.0426125888859626, "grad_norm": 2.6770801544189453, "learning_rate": 4.7874637872004216e-05, "loss": 1.8449, "step": 1618 }, { 
"epoch": 0.042638925467474324, "grad_norm": 3.2104368209838867, "learning_rate": 4.787332104292863e-05, "loss": 1.2252, "step": 1619 }, { "epoch": 0.04266526204898604, "grad_norm": 2.747957706451416, "learning_rate": 4.787200421385305e-05, "loss": 2.3908, "step": 1620 }, { "epoch": 0.04269159863049776, "grad_norm": 6.097873687744141, "learning_rate": 4.7870687384777456e-05, "loss": 1.9629, "step": 1621 }, { "epoch": 0.04271793521200948, "grad_norm": 2.296062469482422, "learning_rate": 4.786937055570187e-05, "loss": 2.1889, "step": 1622 }, { "epoch": 0.0427442717935212, "grad_norm": 3.2836878299713135, "learning_rate": 4.786805372662629e-05, "loss": 0.7493, "step": 1623 }, { "epoch": 0.04277060837503292, "grad_norm": 2.845104455947876, "learning_rate": 4.78667368975507e-05, "loss": 2.0775, "step": 1624 }, { "epoch": 0.04279694495654464, "grad_norm": 2.390986680984497, "learning_rate": 4.786542006847511e-05, "loss": 1.8683, "step": 1625 }, { "epoch": 0.04282328153805636, "grad_norm": 4.470747470855713, "learning_rate": 4.786410323939953e-05, "loss": 1.2525, "step": 1626 }, { "epoch": 0.04284961811956808, "grad_norm": 2.0850131511688232, "learning_rate": 4.786278641032394e-05, "loss": 2.0149, "step": 1627 }, { "epoch": 0.0428759547010798, "grad_norm": 2.494018077850342, "learning_rate": 4.786146958124836e-05, "loss": 1.3731, "step": 1628 }, { "epoch": 0.04290229128259152, "grad_norm": 3.647768974304199, "learning_rate": 4.7860152752172774e-05, "loss": 1.2123, "step": 1629 }, { "epoch": 0.04292862786410324, "grad_norm": 2.676943302154541, "learning_rate": 4.785883592309718e-05, "loss": 1.5728, "step": 1630 }, { "epoch": 0.04295496444561496, "grad_norm": 1.8378205299377441, "learning_rate": 4.78575190940216e-05, "loss": 0.6929, "step": 1631 }, { "epoch": 0.04298130102712668, "grad_norm": 3.5674049854278564, "learning_rate": 4.785620226494601e-05, "loss": 1.3228, "step": 1632 }, { "epoch": 0.0430076376086384, "grad_norm": 2.0570549964904785, "learning_rate": 
4.785488543587043e-05, "loss": 1.7695, "step": 1633 }, { "epoch": 0.04303397419015012, "grad_norm": 4.9037370681762695, "learning_rate": 4.785356860679484e-05, "loss": 1.747, "step": 1634 }, { "epoch": 0.043060310771661836, "grad_norm": 3.769916534423828, "learning_rate": 4.7852251777719254e-05, "loss": 1.6977, "step": 1635 }, { "epoch": 0.04308664735317356, "grad_norm": 2.68953800201416, "learning_rate": 4.785093494864367e-05, "loss": 0.4149, "step": 1636 }, { "epoch": 0.04311298393468528, "grad_norm": 4.373473644256592, "learning_rate": 4.7849618119568085e-05, "loss": 1.1404, "step": 1637 }, { "epoch": 0.043139320516197, "grad_norm": 7.387909412384033, "learning_rate": 4.78483012904925e-05, "loss": 1.8211, "step": 1638 }, { "epoch": 0.04316565709770872, "grad_norm": 3.6866443157196045, "learning_rate": 4.784698446141691e-05, "loss": 1.7778, "step": 1639 }, { "epoch": 0.043191993679220435, "grad_norm": 3.591341018676758, "learning_rate": 4.7845667632341325e-05, "loss": 1.6675, "step": 1640 }, { "epoch": 0.04321833026073216, "grad_norm": 3.440986394882202, "learning_rate": 4.7844350803265734e-05, "loss": 1.5045, "step": 1641 }, { "epoch": 0.04324466684224388, "grad_norm": 3.4958741664886475, "learning_rate": 4.7843033974190156e-05, "loss": 2.2951, "step": 1642 }, { "epoch": 0.043271003423755595, "grad_norm": 2.1908135414123535, "learning_rate": 4.7841717145114565e-05, "loss": 1.3439, "step": 1643 }, { "epoch": 0.04329734000526732, "grad_norm": 2.8393232822418213, "learning_rate": 4.784040031603898e-05, "loss": 1.4972, "step": 1644 }, { "epoch": 0.043323676586779034, "grad_norm": 2.38371205329895, "learning_rate": 4.7839083486963396e-05, "loss": 1.8012, "step": 1645 }, { "epoch": 0.043350013168290756, "grad_norm": 2.383183002471924, "learning_rate": 4.7837766657887805e-05, "loss": 1.8169, "step": 1646 }, { "epoch": 0.04337634974980248, "grad_norm": 4.216830730438232, "learning_rate": 4.783644982881223e-05, "loss": 2.4733, "step": 1647 }, { "epoch": 
0.043402686331314194, "grad_norm": 3.641187906265259, "learning_rate": 4.7835132999736636e-05, "loss": 2.337, "step": 1648 }, { "epoch": 0.04342902291282592, "grad_norm": 3.450556993484497, "learning_rate": 4.783381617066105e-05, "loss": 0.6939, "step": 1649 }, { "epoch": 0.04345535949433763, "grad_norm": 3.0492610931396484, "learning_rate": 4.783249934158546e-05, "loss": 0.5371, "step": 1650 }, { "epoch": 0.043481696075849355, "grad_norm": 4.064547061920166, "learning_rate": 4.783118251250988e-05, "loss": 0.7245, "step": 1651 }, { "epoch": 0.04350803265736108, "grad_norm": 3.24357271194458, "learning_rate": 4.782986568343429e-05, "loss": 1.5819, "step": 1652 }, { "epoch": 0.04353436923887279, "grad_norm": 2.13814640045166, "learning_rate": 4.782854885435871e-05, "loss": 1.7686, "step": 1653 }, { "epoch": 0.043560705820384515, "grad_norm": 2.536398410797119, "learning_rate": 4.782723202528312e-05, "loss": 1.4857, "step": 1654 }, { "epoch": 0.04358704240189623, "grad_norm": 4.551061630249023, "learning_rate": 4.782591519620753e-05, "loss": 1.4803, "step": 1655 }, { "epoch": 0.043613378983407954, "grad_norm": 2.0763628482818604, "learning_rate": 4.7824598367131954e-05, "loss": 2.0751, "step": 1656 }, { "epoch": 0.043639715564919676, "grad_norm": 1.9243184328079224, "learning_rate": 4.782328153805636e-05, "loss": 1.5623, "step": 1657 }, { "epoch": 0.04366605214643139, "grad_norm": 3.0155081748962402, "learning_rate": 4.782196470898078e-05, "loss": 2.1019, "step": 1658 }, { "epoch": 0.043692388727943114, "grad_norm": 3.214911460876465, "learning_rate": 4.782064787990519e-05, "loss": 1.6881, "step": 1659 }, { "epoch": 0.04371872530945483, "grad_norm": 2.7685279846191406, "learning_rate": 4.78193310508296e-05, "loss": 2.7239, "step": 1660 }, { "epoch": 0.04374506189096655, "grad_norm": 4.537632942199707, "learning_rate": 4.781801422175402e-05, "loss": 1.7678, "step": 1661 }, { "epoch": 0.043771398472478275, "grad_norm": 3.539085626602173, "learning_rate": 
4.7816697392678434e-05, "loss": 1.7179, "step": 1662 }, { "epoch": 0.04379773505398999, "grad_norm": 3.8898887634277344, "learning_rate": 4.781538056360284e-05, "loss": 0.8136, "step": 1663 }, { "epoch": 0.04382407163550171, "grad_norm": 6.164101600646973, "learning_rate": 4.781406373452726e-05, "loss": 2.2738, "step": 1664 }, { "epoch": 0.04385040821701343, "grad_norm": 3.5677146911621094, "learning_rate": 4.7812746905451674e-05, "loss": 1.68, "step": 1665 }, { "epoch": 0.04387674479852515, "grad_norm": 2.4857919216156006, "learning_rate": 4.781143007637609e-05, "loss": 0.4742, "step": 1666 }, { "epoch": 0.043903081380036874, "grad_norm": 2.818009376525879, "learning_rate": 4.7810113247300505e-05, "loss": 1.6659, "step": 1667 }, { "epoch": 0.04392941796154859, "grad_norm": 2.3224613666534424, "learning_rate": 4.7808796418224914e-05, "loss": 1.6821, "step": 1668 }, { "epoch": 0.04395575454306031, "grad_norm": 8.352319717407227, "learning_rate": 4.780747958914933e-05, "loss": 1.7879, "step": 1669 }, { "epoch": 0.04398209112457203, "grad_norm": 3.1340174674987793, "learning_rate": 4.7806162760073745e-05, "loss": 1.6865, "step": 1670 }, { "epoch": 0.04400842770608375, "grad_norm": 3.5655996799468994, "learning_rate": 4.780484593099816e-05, "loss": 1.8297, "step": 1671 }, { "epoch": 0.04403476428759547, "grad_norm": 2.270786762237549, "learning_rate": 4.780352910192257e-05, "loss": 2.0561, "step": 1672 }, { "epoch": 0.04406110086910719, "grad_norm": 3.1496076583862305, "learning_rate": 4.7802212272846985e-05, "loss": 1.2317, "step": 1673 }, { "epoch": 0.04408743745061891, "grad_norm": 6.753971576690674, "learning_rate": 4.78008954437714e-05, "loss": 1.1494, "step": 1674 }, { "epoch": 0.044113774032130626, "grad_norm": 3.142165422439575, "learning_rate": 4.779957861469582e-05, "loss": 1.7838, "step": 1675 }, { "epoch": 0.04414011061364235, "grad_norm": 6.98069429397583, "learning_rate": 4.779826178562023e-05, "loss": 2.4283, "step": 1676 }, { "epoch": 
0.04416644719515407, "grad_norm": 3.2393054962158203, "learning_rate": 4.779694495654464e-05, "loss": 1.7799, "step": 1677 }, { "epoch": 0.04419278377666579, "grad_norm": 3.4538471698760986, "learning_rate": 4.779562812746906e-05, "loss": 2.1511, "step": 1678 }, { "epoch": 0.04421912035817751, "grad_norm": 2.1494596004486084, "learning_rate": 4.7794311298393466e-05, "loss": 1.5318, "step": 1679 }, { "epoch": 0.04424545693968923, "grad_norm": 3.0062994956970215, "learning_rate": 4.779299446931789e-05, "loss": 2.2842, "step": 1680 }, { "epoch": 0.04427179352120095, "grad_norm": 2.5928919315338135, "learning_rate": 4.77916776402423e-05, "loss": 2.4077, "step": 1681 }, { "epoch": 0.04429813010271267, "grad_norm": 3.13175368309021, "learning_rate": 4.779036081116671e-05, "loss": 2.0801, "step": 1682 }, { "epoch": 0.044324466684224385, "grad_norm": 2.3422467708587646, "learning_rate": 4.778904398209113e-05, "loss": 2.3598, "step": 1683 }, { "epoch": 0.04435080326573611, "grad_norm": 3.7321741580963135, "learning_rate": 4.7787727153015543e-05, "loss": 1.4985, "step": 1684 }, { "epoch": 0.04437713984724783, "grad_norm": 3.111586570739746, "learning_rate": 4.778641032393996e-05, "loss": 0.7241, "step": 1685 }, { "epoch": 0.044403476428759546, "grad_norm": 3.2628390789031982, "learning_rate": 4.778509349486437e-05, "loss": 0.9041, "step": 1686 }, { "epoch": 0.04442981301027127, "grad_norm": 2.896920680999756, "learning_rate": 4.7783776665788784e-05, "loss": 2.3112, "step": 1687 }, { "epoch": 0.044456149591782984, "grad_norm": 2.1524009704589844, "learning_rate": 4.778245983671319e-05, "loss": 1.6757, "step": 1688 }, { "epoch": 0.04448248617329471, "grad_norm": 4.034472465515137, "learning_rate": 4.7781143007637615e-05, "loss": 1.7498, "step": 1689 }, { "epoch": 0.04450882275480643, "grad_norm": 2.1806130409240723, "learning_rate": 4.7779826178562024e-05, "loss": 1.2612, "step": 1690 }, { "epoch": 0.044535159336318145, "grad_norm": 2.5430376529693604, "learning_rate": 
4.777850934948644e-05, "loss": 1.7788, "step": 1691 }, { "epoch": 0.04456149591782987, "grad_norm": 2.6140480041503906, "learning_rate": 4.7777192520410855e-05, "loss": 0.7482, "step": 1692 }, { "epoch": 0.04458783249934158, "grad_norm": 3.0158402919769287, "learning_rate": 4.7775875691335264e-05, "loss": 1.2215, "step": 1693 }, { "epoch": 0.044614169080853305, "grad_norm": 1.6862404346466064, "learning_rate": 4.7774558862259686e-05, "loss": 1.801, "step": 1694 }, { "epoch": 0.04464050566236503, "grad_norm": 7.785438537597656, "learning_rate": 4.7773242033184095e-05, "loss": 0.9266, "step": 1695 }, { "epoch": 0.04466684224387674, "grad_norm": 4.126392841339111, "learning_rate": 4.777192520410851e-05, "loss": 0.8319, "step": 1696 }, { "epoch": 0.044693178825388466, "grad_norm": 2.515692949295044, "learning_rate": 4.777060837503292e-05, "loss": 1.9474, "step": 1697 }, { "epoch": 0.04471951540690018, "grad_norm": 3.4862043857574463, "learning_rate": 4.776929154595734e-05, "loss": 0.9102, "step": 1698 }, { "epoch": 0.044745851988411904, "grad_norm": 5.722166061401367, "learning_rate": 4.776797471688175e-05, "loss": 2.1326, "step": 1699 }, { "epoch": 0.04477218856992363, "grad_norm": 2.5878779888153076, "learning_rate": 4.7766657887806166e-05, "loss": 1.6677, "step": 1700 }, { "epoch": 0.04479852515143534, "grad_norm": 3.3507132530212402, "learning_rate": 4.776534105873058e-05, "loss": 1.5343, "step": 1701 }, { "epoch": 0.044824861732947065, "grad_norm": 2.779021978378296, "learning_rate": 4.776402422965499e-05, "loss": 1.5199, "step": 1702 }, { "epoch": 0.04485119831445878, "grad_norm": 3.0135865211486816, "learning_rate": 4.776270740057941e-05, "loss": 0.6944, "step": 1703 }, { "epoch": 0.0448775348959705, "grad_norm": 3.047968626022339, "learning_rate": 4.776139057150382e-05, "loss": 1.4053, "step": 1704 }, { "epoch": 0.044903871477482225, "grad_norm": 3.3515453338623047, "learning_rate": 4.776007374242824e-05, "loss": 1.9602, "step": 1705 }, { "epoch": 
0.04493020805899394, "grad_norm": 7.582807540893555, "learning_rate": 4.7758756913352646e-05, "loss": 1.8265, "step": 1706 }, { "epoch": 0.04495654464050566, "grad_norm": 2.6268017292022705, "learning_rate": 4.775744008427706e-05, "loss": 1.9812, "step": 1707 }, { "epoch": 0.04498288122201738, "grad_norm": 2.0065243244171143, "learning_rate": 4.775612325520148e-05, "loss": 1.7819, "step": 1708 }, { "epoch": 0.0450092178035291, "grad_norm": 4.183550834655762, "learning_rate": 4.775480642612589e-05, "loss": 1.3637, "step": 1709 }, { "epoch": 0.045035554385040824, "grad_norm": 6.974421977996826, "learning_rate": 4.77534895970503e-05, "loss": 1.2507, "step": 1710 }, { "epoch": 0.04506189096655254, "grad_norm": 2.9028408527374268, "learning_rate": 4.775217276797472e-05, "loss": 1.9522, "step": 1711 }, { "epoch": 0.04508822754806426, "grad_norm": 2.099782943725586, "learning_rate": 4.775085593889913e-05, "loss": 1.7498, "step": 1712 }, { "epoch": 0.04511456412957598, "grad_norm": 3.4039838314056396, "learning_rate": 4.774953910982355e-05, "loss": 1.1261, "step": 1713 }, { "epoch": 0.0451409007110877, "grad_norm": 3.148505449295044, "learning_rate": 4.7748222280747964e-05, "loss": 2.0884, "step": 1714 }, { "epoch": 0.04516723729259942, "grad_norm": 4.304432392120361, "learning_rate": 4.774690545167237e-05, "loss": 1.6005, "step": 1715 }, { "epoch": 0.04519357387411114, "grad_norm": 2.3728487491607666, "learning_rate": 4.774558862259679e-05, "loss": 1.8499, "step": 1716 }, { "epoch": 0.04521991045562286, "grad_norm": 2.966421604156494, "learning_rate": 4.7744271793521204e-05, "loss": 1.3817, "step": 1717 }, { "epoch": 0.045246247037134577, "grad_norm": 2.327455759048462, "learning_rate": 4.774295496444562e-05, "loss": 1.7435, "step": 1718 }, { "epoch": 0.0452725836186463, "grad_norm": 2.3098363876342773, "learning_rate": 4.774163813537003e-05, "loss": 1.8852, "step": 1719 }, { "epoch": 0.04529892020015802, "grad_norm": 2.8407859802246094, "learning_rate": 
4.7740321306294444e-05, "loss": 2.2121, "step": 1720 }, { "epoch": 0.04532525678166974, "grad_norm": 2.419339418411255, "learning_rate": 4.773900447721886e-05, "loss": 1.8823, "step": 1721 }, { "epoch": 0.04535159336318146, "grad_norm": 2.6375315189361572, "learning_rate": 4.7737687648143275e-05, "loss": 1.795, "step": 1722 }, { "epoch": 0.04537792994469318, "grad_norm": 2.089695692062378, "learning_rate": 4.773637081906769e-05, "loss": 1.5587, "step": 1723 }, { "epoch": 0.0454042665262049, "grad_norm": 2.309267044067383, "learning_rate": 4.77350539899921e-05, "loss": 1.6003, "step": 1724 }, { "epoch": 0.04543060310771662, "grad_norm": 2.6133782863616943, "learning_rate": 4.7733737160916515e-05, "loss": 1.8045, "step": 1725 }, { "epoch": 0.045456939689228336, "grad_norm": 2.815037727355957, "learning_rate": 4.7732420331840924e-05, "loss": 1.9914, "step": 1726 }, { "epoch": 0.04548327627074006, "grad_norm": 4.499743461608887, "learning_rate": 4.7731103502765346e-05, "loss": 1.8553, "step": 1727 }, { "epoch": 0.04550961285225178, "grad_norm": 2.416105270385742, "learning_rate": 4.7729786673689755e-05, "loss": 2.1857, "step": 1728 }, { "epoch": 0.045535949433763497, "grad_norm": 2.986762523651123, "learning_rate": 4.772846984461417e-05, "loss": 1.8378, "step": 1729 }, { "epoch": 0.04556228601527522, "grad_norm": 3.844364881515503, "learning_rate": 4.7727153015538586e-05, "loss": 2.5077, "step": 1730 }, { "epoch": 0.045588622596786935, "grad_norm": 2.905626058578491, "learning_rate": 4.7725836186463e-05, "loss": 1.5804, "step": 1731 }, { "epoch": 0.04561495917829866, "grad_norm": 3.7784745693206787, "learning_rate": 4.772451935738742e-05, "loss": 1.1766, "step": 1732 }, { "epoch": 0.04564129575981038, "grad_norm": 2.265690565109253, "learning_rate": 4.7723202528311826e-05, "loss": 1.9701, "step": 1733 }, { "epoch": 0.045667632341322095, "grad_norm": 4.532913684844971, "learning_rate": 4.772188569923624e-05, "loss": 1.8431, "step": 1734 }, { "epoch": 
0.04569396892283382, "grad_norm": 1.9257761240005493, "learning_rate": 4.772056887016065e-05, "loss": 1.9792, "step": 1735 }, { "epoch": 0.04572030550434553, "grad_norm": 2.6322133541107178, "learning_rate": 4.771925204108507e-05, "loss": 1.5659, "step": 1736 }, { "epoch": 0.045746642085857256, "grad_norm": 2.7895565032958984, "learning_rate": 4.771793521200948e-05, "loss": 1.7882, "step": 1737 }, { "epoch": 0.04577297866736898, "grad_norm": 2.231600046157837, "learning_rate": 4.77166183829339e-05, "loss": 1.8305, "step": 1738 }, { "epoch": 0.045799315248880694, "grad_norm": 3.5905938148498535, "learning_rate": 4.771530155385831e-05, "loss": 1.2803, "step": 1739 }, { "epoch": 0.045825651830392417, "grad_norm": 1.8099578619003296, "learning_rate": 4.771398472478272e-05, "loss": 0.4731, "step": 1740 }, { "epoch": 0.04585198841190413, "grad_norm": 3.470010280609131, "learning_rate": 4.7712667895707144e-05, "loss": 2.1015, "step": 1741 }, { "epoch": 0.045878324993415855, "grad_norm": 3.340970516204834, "learning_rate": 4.771135106663155e-05, "loss": 1.8298, "step": 1742 }, { "epoch": 0.04590466157492758, "grad_norm": 2.5090110301971436, "learning_rate": 4.771003423755597e-05, "loss": 2.4194, "step": 1743 }, { "epoch": 0.04593099815643929, "grad_norm": 2.6823501586914062, "learning_rate": 4.770871740848038e-05, "loss": 0.5867, "step": 1744 }, { "epoch": 0.045957334737951015, "grad_norm": 2.7022886276245117, "learning_rate": 4.770740057940479e-05, "loss": 1.8058, "step": 1745 }, { "epoch": 0.04598367131946273, "grad_norm": 2.5525381565093994, "learning_rate": 4.770608375032921e-05, "loss": 2.1268, "step": 1746 }, { "epoch": 0.04601000790097445, "grad_norm": 2.399473190307617, "learning_rate": 4.7704766921253625e-05, "loss": 2.1018, "step": 1747 }, { "epoch": 0.046036344482486176, "grad_norm": 3.300828456878662, "learning_rate": 4.770345009217804e-05, "loss": 1.9747, "step": 1748 }, { "epoch": 0.04606268106399789, "grad_norm": 3.060251235961914, "learning_rate": 
4.770213326310245e-05, "loss": 1.6118, "step": 1749 }, { "epoch": 0.046089017645509614, "grad_norm": 4.1365227699279785, "learning_rate": 4.770081643402687e-05, "loss": 1.4013, "step": 1750 }, { "epoch": 0.04611535422702133, "grad_norm": 2.2801895141601562, "learning_rate": 4.769949960495128e-05, "loss": 1.9977, "step": 1751 }, { "epoch": 0.04614169080853305, "grad_norm": 2.8356292247772217, "learning_rate": 4.7698182775875696e-05, "loss": 1.7156, "step": 1752 }, { "epoch": 0.046168027390044775, "grad_norm": 4.43156623840332, "learning_rate": 4.7696865946800105e-05, "loss": 1.5122, "step": 1753 }, { "epoch": 0.04619436397155649, "grad_norm": 1.935263752937317, "learning_rate": 4.769554911772452e-05, "loss": 1.9238, "step": 1754 }, { "epoch": 0.04622070055306821, "grad_norm": 2.5804927349090576, "learning_rate": 4.7694232288648936e-05, "loss": 1.6656, "step": 1755 }, { "epoch": 0.04624703713457993, "grad_norm": 2.096917152404785, "learning_rate": 4.769291545957335e-05, "loss": 2.3052, "step": 1756 }, { "epoch": 0.04627337371609165, "grad_norm": 3.051651954650879, "learning_rate": 4.769159863049777e-05, "loss": 2.1008, "step": 1757 }, { "epoch": 0.04629971029760337, "grad_norm": 2.965366840362549, "learning_rate": 4.7690281801422176e-05, "loss": 1.3892, "step": 1758 }, { "epoch": 0.04632604687911509, "grad_norm": 2.560187578201294, "learning_rate": 4.768896497234659e-05, "loss": 1.3878, "step": 1759 }, { "epoch": 0.04635238346062681, "grad_norm": 3.776427984237671, "learning_rate": 4.768764814327101e-05, "loss": 2.7926, "step": 1760 }, { "epoch": 0.04637872004213853, "grad_norm": 3.302241802215576, "learning_rate": 4.768633131419542e-05, "loss": 1.3911, "step": 1761 }, { "epoch": 0.04640505662365025, "grad_norm": 2.7292490005493164, "learning_rate": 4.768501448511983e-05, "loss": 1.3793, "step": 1762 }, { "epoch": 0.04643139320516197, "grad_norm": 5.511817932128906, "learning_rate": 4.768369765604425e-05, "loss": 1.7398, "step": 1763 }, { "epoch": 
0.04645772978667369, "grad_norm": 3.4631576538085938, "learning_rate": 4.768238082696866e-05, "loss": 1.5239, "step": 1764 }, { "epoch": 0.04648406636818541, "grad_norm": 1.8677030801773071, "learning_rate": 4.768106399789308e-05, "loss": 1.3443, "step": 1765 }, { "epoch": 0.04651040294969713, "grad_norm": 4.334285736083984, "learning_rate": 4.767974716881749e-05, "loss": 1.2896, "step": 1766 }, { "epoch": 0.04653673953120885, "grad_norm": 2.226372718811035, "learning_rate": 4.76784303397419e-05, "loss": 1.3587, "step": 1767 }, { "epoch": 0.04656307611272057, "grad_norm": 2.619678497314453, "learning_rate": 4.767711351066632e-05, "loss": 1.237, "step": 1768 }, { "epoch": 0.046589412694232286, "grad_norm": 3.8238658905029297, "learning_rate": 4.7675796681590734e-05, "loss": 1.3064, "step": 1769 }, { "epoch": 0.04661574927574401, "grad_norm": 4.150592803955078, "learning_rate": 4.767447985251515e-05, "loss": 1.5192, "step": 1770 }, { "epoch": 0.04664208585725573, "grad_norm": 3.409359931945801, "learning_rate": 4.767316302343956e-05, "loss": 1.2098, "step": 1771 }, { "epoch": 0.04666842243876745, "grad_norm": 2.423478126525879, "learning_rate": 4.7671846194363974e-05, "loss": 2.569, "step": 1772 }, { "epoch": 0.04669475902027917, "grad_norm": 3.326936960220337, "learning_rate": 4.767052936528838e-05, "loss": 2.1252, "step": 1773 }, { "epoch": 0.046721095601790885, "grad_norm": 2.609152317047119, "learning_rate": 4.7669212536212805e-05, "loss": 1.7812, "step": 1774 }, { "epoch": 0.04674743218330261, "grad_norm": 9.348191261291504, "learning_rate": 4.7667895707137214e-05, "loss": 0.5157, "step": 1775 }, { "epoch": 0.04677376876481433, "grad_norm": 3.1011111736297607, "learning_rate": 4.766657887806163e-05, "loss": 2.4706, "step": 1776 }, { "epoch": 0.046800105346326046, "grad_norm": 3.4174981117248535, "learning_rate": 4.7665262048986045e-05, "loss": 1.6857, "step": 1777 }, { "epoch": 0.04682644192783777, "grad_norm": 5.005181789398193, "learning_rate": 
4.7663945219910454e-05, "loss": 1.5254, "step": 1778 }, { "epoch": 0.046852778509349484, "grad_norm": 2.7576558589935303, "learning_rate": 4.7662628390834876e-05, "loss": 1.1868, "step": 1779 }, { "epoch": 0.046879115090861206, "grad_norm": 3.931638240814209, "learning_rate": 4.7661311561759285e-05, "loss": 2.2186, "step": 1780 }, { "epoch": 0.04690545167237293, "grad_norm": 1.7449153661727905, "learning_rate": 4.76599947326837e-05, "loss": 2.1682, "step": 1781 }, { "epoch": 0.046931788253884645, "grad_norm": 2.6958789825439453, "learning_rate": 4.765867790360811e-05, "loss": 2.0851, "step": 1782 }, { "epoch": 0.04695812483539637, "grad_norm": 4.813263893127441, "learning_rate": 4.765736107453253e-05, "loss": 1.0686, "step": 1783 }, { "epoch": 0.04698446141690808, "grad_norm": 2.0777738094329834, "learning_rate": 4.765604424545694e-05, "loss": 2.0219, "step": 1784 }, { "epoch": 0.047010797998419805, "grad_norm": 3.966855525970459, "learning_rate": 4.7654727416381356e-05, "loss": 1.7394, "step": 1785 }, { "epoch": 0.04703713457993153, "grad_norm": 2.7082858085632324, "learning_rate": 4.765341058730577e-05, "loss": 1.7643, "step": 1786 }, { "epoch": 0.04706347116144324, "grad_norm": 2.816910982131958, "learning_rate": 4.765209375823018e-05, "loss": 1.9363, "step": 1787 }, { "epoch": 0.047089807742954966, "grad_norm": 2.363173246383667, "learning_rate": 4.76507769291546e-05, "loss": 1.8564, "step": 1788 }, { "epoch": 0.04711614432446668, "grad_norm": 2.4330899715423584, "learning_rate": 4.764946010007901e-05, "loss": 2.0441, "step": 1789 }, { "epoch": 0.047142480905978404, "grad_norm": 2.9492557048797607, "learning_rate": 4.764814327100343e-05, "loss": 2.008, "step": 1790 }, { "epoch": 0.047168817487490126, "grad_norm": 6.210226058959961, "learning_rate": 4.7646826441927836e-05, "loss": 1.1207, "step": 1791 }, { "epoch": 0.04719515406900184, "grad_norm": 2.4410698413848877, "learning_rate": 4.764550961285225e-05, "loss": 1.7368, "step": 1792 }, { "epoch": 
0.047221490650513565, "grad_norm": 2.2780191898345947, "learning_rate": 4.764419278377667e-05, "loss": 1.9566, "step": 1793 }, { "epoch": 0.04724782723202528, "grad_norm": 3.4953715801239014, "learning_rate": 4.764287595470108e-05, "loss": 0.9462, "step": 1794 }, { "epoch": 0.047274163813537, "grad_norm": 2.6264867782592773, "learning_rate": 4.76415591256255e-05, "loss": 1.8387, "step": 1795 }, { "epoch": 0.047300500395048725, "grad_norm": 2.1491644382476807, "learning_rate": 4.764024229654991e-05, "loss": 1.5921, "step": 1796 }, { "epoch": 0.04732683697656044, "grad_norm": 3.6468658447265625, "learning_rate": 4.763892546747433e-05, "loss": 1.2979, "step": 1797 }, { "epoch": 0.04735317355807216, "grad_norm": 2.1967475414276123, "learning_rate": 4.763760863839874e-05, "loss": 1.4175, "step": 1798 }, { "epoch": 0.04737951013958388, "grad_norm": 2.6574835777282715, "learning_rate": 4.7636291809323154e-05, "loss": 2.0071, "step": 1799 }, { "epoch": 0.0474058467210956, "grad_norm": 1.8592238426208496, "learning_rate": 4.763497498024756e-05, "loss": 2.7643, "step": 1800 }, { "epoch": 0.047432183302607324, "grad_norm": 3.7676124572753906, "learning_rate": 4.763365815117198e-05, "loss": 0.4047, "step": 1801 }, { "epoch": 0.04745851988411904, "grad_norm": 3.908045768737793, "learning_rate": 4.7632341322096394e-05, "loss": 1.8999, "step": 1802 }, { "epoch": 0.04748485646563076, "grad_norm": 2.474395513534546, "learning_rate": 4.763102449302081e-05, "loss": 2.4039, "step": 1803 }, { "epoch": 0.04751119304714248, "grad_norm": 2.8622822761535645, "learning_rate": 4.7629707663945225e-05, "loss": 0.5707, "step": 1804 }, { "epoch": 0.0475375296286542, "grad_norm": 2.638190507888794, "learning_rate": 4.7628390834869634e-05, "loss": 1.6038, "step": 1805 }, { "epoch": 0.04756386621016592, "grad_norm": 2.4967191219329834, "learning_rate": 4.762707400579405e-05, "loss": 1.9041, "step": 1806 }, { "epoch": 0.04759020279167764, "grad_norm": 3.1558327674865723, "learning_rate": 
4.7625757176718465e-05, "loss": 0.7622, "step": 1807 }, { "epoch": 0.04761653937318936, "grad_norm": 3.3613836765289307, "learning_rate": 4.762444034764288e-05, "loss": 1.114, "step": 1808 }, { "epoch": 0.047642875954701076, "grad_norm": 3.9828338623046875, "learning_rate": 4.762312351856729e-05, "loss": 2.1822, "step": 1809 }, { "epoch": 0.0476692125362128, "grad_norm": 4.770702362060547, "learning_rate": 4.7621806689491706e-05, "loss": 0.9249, "step": 1810 }, { "epoch": 0.04769554911772452, "grad_norm": 2.768343448638916, "learning_rate": 4.7620489860416114e-05, "loss": 1.7084, "step": 1811 }, { "epoch": 0.04772188569923624, "grad_norm": 2.334273338317871, "learning_rate": 4.761917303134054e-05, "loss": 1.744, "step": 1812 }, { "epoch": 0.04774822228074796, "grad_norm": 2.6795191764831543, "learning_rate": 4.7617856202264946e-05, "loss": 1.3036, "step": 1813 }, { "epoch": 0.04777455886225968, "grad_norm": 3.734348773956299, "learning_rate": 4.761653937318936e-05, "loss": 2.1771, "step": 1814 }, { "epoch": 0.0478008954437714, "grad_norm": 3.245972156524658, "learning_rate": 4.761522254411378e-05, "loss": 1.9377, "step": 1815 }, { "epoch": 0.04782723202528312, "grad_norm": 2.143364906311035, "learning_rate": 4.761390571503819e-05, "loss": 2.0788, "step": 1816 }, { "epoch": 0.047853568606794836, "grad_norm": 2.610452890396118, "learning_rate": 4.761258888596261e-05, "loss": 0.83, "step": 1817 }, { "epoch": 0.04787990518830656, "grad_norm": 2.242518424987793, "learning_rate": 4.761127205688702e-05, "loss": 1.9233, "step": 1818 }, { "epoch": 0.04790624176981828, "grad_norm": 2.1876494884490967, "learning_rate": 4.760995522781143e-05, "loss": 2.1151, "step": 1819 }, { "epoch": 0.047932578351329996, "grad_norm": 3.099712371826172, "learning_rate": 4.760863839873584e-05, "loss": 0.5995, "step": 1820 }, { "epoch": 0.04795891493284172, "grad_norm": 3.743948459625244, "learning_rate": 4.7607321569660264e-05, "loss": 2.0935, "step": 1821 }, { "epoch": 0.047985251514353434, 
"grad_norm": 2.697472333908081, "learning_rate": 4.760600474058467e-05, "loss": 2.4041, "step": 1822 }, { "epoch": 0.04801158809586516, "grad_norm": 5.2074785232543945, "learning_rate": 4.760468791150909e-05, "loss": 1.6589, "step": 1823 }, { "epoch": 0.04803792467737688, "grad_norm": 3.097689390182495, "learning_rate": 4.7603371082433504e-05, "loss": 2.1798, "step": 1824 }, { "epoch": 0.048064261258888595, "grad_norm": 2.648519277572632, "learning_rate": 4.760205425335791e-05, "loss": 1.5074, "step": 1825 }, { "epoch": 0.04809059784040032, "grad_norm": 3.4516360759735107, "learning_rate": 4.7600737424282335e-05, "loss": 1.97, "step": 1826 }, { "epoch": 0.04811693442191203, "grad_norm": 3.2515652179718018, "learning_rate": 4.7599420595206744e-05, "loss": 2.2869, "step": 1827 }, { "epoch": 0.048143271003423756, "grad_norm": 2.2592008113861084, "learning_rate": 4.759810376613116e-05, "loss": 2.0299, "step": 1828 }, { "epoch": 0.04816960758493548, "grad_norm": 3.1698997020721436, "learning_rate": 4.759678693705557e-05, "loss": 1.6406, "step": 1829 }, { "epoch": 0.048195944166447194, "grad_norm": 2.9967458248138428, "learning_rate": 4.759547010797999e-05, "loss": 2.2723, "step": 1830 }, { "epoch": 0.048222280747958916, "grad_norm": 2.8788657188415527, "learning_rate": 4.75941532789044e-05, "loss": 0.3471, "step": 1831 }, { "epoch": 0.04824861732947063, "grad_norm": 3.3488869667053223, "learning_rate": 4.7592836449828815e-05, "loss": 2.5425, "step": 1832 }, { "epoch": 0.048274953910982354, "grad_norm": 2.062664031982422, "learning_rate": 4.759151962075323e-05, "loss": 1.9583, "step": 1833 }, { "epoch": 0.04830129049249408, "grad_norm": 2.0781054496765137, "learning_rate": 4.759020279167764e-05, "loss": 1.4181, "step": 1834 }, { "epoch": 0.04832762707400579, "grad_norm": 3.9546961784362793, "learning_rate": 4.758888596260206e-05, "loss": 1.2198, "step": 1835 }, { "epoch": 0.048353963655517515, "grad_norm": 9.188541412353516, "learning_rate": 4.758756913352647e-05, 
"loss": 2.2232, "step": 1836 }, { "epoch": 0.04838030023702923, "grad_norm": 4.707973003387451, "learning_rate": 4.7586252304450886e-05, "loss": 1.7777, "step": 1837 }, { "epoch": 0.04840663681854095, "grad_norm": 2.385406494140625, "learning_rate": 4.7584935475375295e-05, "loss": 1.7226, "step": 1838 }, { "epoch": 0.048432973400052676, "grad_norm": 2.3620717525482178, "learning_rate": 4.758361864629971e-05, "loss": 1.9293, "step": 1839 }, { "epoch": 0.04845930998156439, "grad_norm": 2.837907075881958, "learning_rate": 4.7582301817224126e-05, "loss": 0.9535, "step": 1840 }, { "epoch": 0.048485646563076114, "grad_norm": 2.6598939895629883, "learning_rate": 4.758098498814854e-05, "loss": 0.6699, "step": 1841 }, { "epoch": 0.04851198314458783, "grad_norm": 2.891472339630127, "learning_rate": 4.757966815907296e-05, "loss": 1.4541, "step": 1842 }, { "epoch": 0.04853831972609955, "grad_norm": 3.0237386226654053, "learning_rate": 4.7578351329997366e-05, "loss": 2.0308, "step": 1843 }, { "epoch": 0.048564656307611274, "grad_norm": 3.1423702239990234, "learning_rate": 4.757703450092178e-05, "loss": 2.256, "step": 1844 }, { "epoch": 0.04859099288912299, "grad_norm": 2.3241753578186035, "learning_rate": 4.75757176718462e-05, "loss": 2.286, "step": 1845 }, { "epoch": 0.04861732947063471, "grad_norm": 3.8985023498535156, "learning_rate": 4.757440084277061e-05, "loss": 1.4076, "step": 1846 }, { "epoch": 0.04864366605214643, "grad_norm": 3.129281997680664, "learning_rate": 4.757308401369502e-05, "loss": 2.116, "step": 1847 }, { "epoch": 0.04867000263365815, "grad_norm": 1.8293781280517578, "learning_rate": 4.757176718461944e-05, "loss": 1.856, "step": 1848 }, { "epoch": 0.04869633921516987, "grad_norm": 3.197397232055664, "learning_rate": 4.757045035554385e-05, "loss": 0.9616, "step": 1849 }, { "epoch": 0.04872267579668159, "grad_norm": 7.709231853485107, "learning_rate": 4.756913352646827e-05, "loss": 2.0307, "step": 1850 }, { "epoch": 0.04874901237819331, "grad_norm": 
2.38158917427063, "learning_rate": 4.7567816697392684e-05, "loss": 2.0916, "step": 1851 }, { "epoch": 0.04877534895970503, "grad_norm": 5.8824567794799805, "learning_rate": 4.756649986831709e-05, "loss": 0.9896, "step": 1852 }, { "epoch": 0.04880168554121675, "grad_norm": 2.287475824356079, "learning_rate": 4.756518303924151e-05, "loss": 1.4894, "step": 1853 }, { "epoch": 0.04882802212272847, "grad_norm": 3.0922799110412598, "learning_rate": 4.7563866210165924e-05, "loss": 2.0622, "step": 1854 }, { "epoch": 0.04885435870424019, "grad_norm": 3.4541170597076416, "learning_rate": 4.756254938109034e-05, "loss": 1.5996, "step": 1855 }, { "epoch": 0.04888069528575191, "grad_norm": 3.240755558013916, "learning_rate": 4.756123255201475e-05, "loss": 1.6617, "step": 1856 }, { "epoch": 0.04890703186726363, "grad_norm": 6.699325084686279, "learning_rate": 4.7559915722939164e-05, "loss": 1.6213, "step": 1857 }, { "epoch": 0.04893336844877535, "grad_norm": 2.5335021018981934, "learning_rate": 4.755859889386357e-05, "loss": 1.7242, "step": 1858 }, { "epoch": 0.04895970503028707, "grad_norm": 4.607972621917725, "learning_rate": 4.7557282064787995e-05, "loss": 1.5549, "step": 1859 }, { "epoch": 0.048986041611798786, "grad_norm": 2.0614113807678223, "learning_rate": 4.7555965235712404e-05, "loss": 1.8949, "step": 1860 }, { "epoch": 0.04901237819331051, "grad_norm": 5.062224388122559, "learning_rate": 4.755464840663682e-05, "loss": 1.2759, "step": 1861 }, { "epoch": 0.04903871477482223, "grad_norm": 5.996825218200684, "learning_rate": 4.7553331577561235e-05, "loss": 2.8841, "step": 1862 }, { "epoch": 0.04906505135633395, "grad_norm": 2.899484634399414, "learning_rate": 4.755201474848565e-05, "loss": 2.3065, "step": 1863 }, { "epoch": 0.04909138793784567, "grad_norm": 5.001921653747559, "learning_rate": 4.7550697919410066e-05, "loss": 2.2021, "step": 1864 }, { "epoch": 0.049117724519357385, "grad_norm": 5.116119861602783, "learning_rate": 4.7549381090334475e-05, "loss": 1.6329, 
"step": 1865 }, { "epoch": 0.04914406110086911, "grad_norm": 3.9319283962249756, "learning_rate": 4.754806426125889e-05, "loss": 1.4503, "step": 1866 }, { "epoch": 0.04917039768238083, "grad_norm": 2.820613384246826, "learning_rate": 4.75467474321833e-05, "loss": 1.9275, "step": 1867 }, { "epoch": 0.049196734263892546, "grad_norm": 2.5694661140441895, "learning_rate": 4.754543060310772e-05, "loss": 3.0441, "step": 1868 }, { "epoch": 0.04922307084540427, "grad_norm": 2.513005018234253, "learning_rate": 4.754411377403213e-05, "loss": 1.9219, "step": 1869 }, { "epoch": 0.049249407426915984, "grad_norm": 3.107499837875366, "learning_rate": 4.7542796944956547e-05, "loss": 0.9544, "step": 1870 }, { "epoch": 0.049275744008427706, "grad_norm": 2.8746564388275146, "learning_rate": 4.754148011588096e-05, "loss": 1.4186, "step": 1871 }, { "epoch": 0.04930208058993943, "grad_norm": 7.512135982513428, "learning_rate": 4.754016328680537e-05, "loss": 1.6365, "step": 1872 }, { "epoch": 0.049328417171451144, "grad_norm": 2.2588987350463867, "learning_rate": 4.753884645772979e-05, "loss": 1.4779, "step": 1873 }, { "epoch": 0.04935475375296287, "grad_norm": 2.7753746509552, "learning_rate": 4.75375296286542e-05, "loss": 1.6723, "step": 1874 }, { "epoch": 0.04938109033447458, "grad_norm": 5.017754554748535, "learning_rate": 4.753621279957862e-05, "loss": 1.3852, "step": 1875 }, { "epoch": 0.049407426915986305, "grad_norm": 2.4810543060302734, "learning_rate": 4.7534895970503027e-05, "loss": 1.6004, "step": 1876 }, { "epoch": 0.04943376349749803, "grad_norm": 5.922138690948486, "learning_rate": 4.753357914142744e-05, "loss": 2.0072, "step": 1877 }, { "epoch": 0.04946010007900974, "grad_norm": 2.1590514183044434, "learning_rate": 4.753226231235186e-05, "loss": 1.1749, "step": 1878 }, { "epoch": 0.049486436660521466, "grad_norm": 2.207684278488159, "learning_rate": 4.753094548327627e-05, "loss": 2.7352, "step": 1879 }, { "epoch": 0.04951277324203318, "grad_norm": 2.755525588989258, 
"learning_rate": 4.752962865420069e-05, "loss": 1.8859, "step": 1880 }, { "epoch": 0.049539109823544904, "grad_norm": 2.835995674133301, "learning_rate": 4.75283118251251e-05, "loss": 1.9658, "step": 1881 }, { "epoch": 0.049565446405056626, "grad_norm": 2.135390281677246, "learning_rate": 4.752699499604952e-05, "loss": 2.206, "step": 1882 }, { "epoch": 0.04959178298656834, "grad_norm": 2.1092138290405273, "learning_rate": 4.752567816697393e-05, "loss": 1.7261, "step": 1883 }, { "epoch": 0.049618119568080064, "grad_norm": 2.323204517364502, "learning_rate": 4.7524361337898345e-05, "loss": 1.7719, "step": 1884 }, { "epoch": 0.04964445614959178, "grad_norm": 2.8365225791931152, "learning_rate": 4.752304450882275e-05, "loss": 1.8774, "step": 1885 }, { "epoch": 0.0496707927311035, "grad_norm": 2.5356972217559814, "learning_rate": 4.752172767974717e-05, "loss": 2.5013, "step": 1886 }, { "epoch": 0.049697129312615225, "grad_norm": 3.487934112548828, "learning_rate": 4.7520410850671585e-05, "loss": 1.7786, "step": 1887 }, { "epoch": 0.04972346589412694, "grad_norm": 2.3606793880462646, "learning_rate": 4.7519094021596e-05, "loss": 1.9574, "step": 1888 }, { "epoch": 0.04974980247563866, "grad_norm": 2.667154312133789, "learning_rate": 4.7517777192520416e-05, "loss": 1.5387, "step": 1889 }, { "epoch": 0.04977613905715038, "grad_norm": 2.8656506538391113, "learning_rate": 4.7516460363444825e-05, "loss": 0.3605, "step": 1890 }, { "epoch": 0.0498024756386621, "grad_norm": 3.4387459754943848, "learning_rate": 4.751514353436924e-05, "loss": 1.4456, "step": 1891 }, { "epoch": 0.049828812220173824, "grad_norm": 2.3970303535461426, "learning_rate": 4.7513826705293656e-05, "loss": 2.17, "step": 1892 }, { "epoch": 0.04985514880168554, "grad_norm": 1.9291952848434448, "learning_rate": 4.751250987621807e-05, "loss": 1.9023, "step": 1893 }, { "epoch": 0.04988148538319726, "grad_norm": 3.236123561859131, "learning_rate": 4.751119304714248e-05, "loss": 2.0105, "step": 1894 }, { "epoch": 
0.04990782196470898, "grad_norm": 2.9148108959198, "learning_rate": 4.7509876218066896e-05, "loss": 2.5134, "step": 1895 }, { "epoch": 0.0499341585462207, "grad_norm": 2.144341230392456, "learning_rate": 4.750855938899131e-05, "loss": 1.3602, "step": 1896 }, { "epoch": 0.04996049512773242, "grad_norm": 2.2980000972747803, "learning_rate": 4.750724255991573e-05, "loss": 2.0295, "step": 1897 }, { "epoch": 0.04998683170924414, "grad_norm": 4.49240255355835, "learning_rate": 4.750592573084014e-05, "loss": 1.6687, "step": 1898 }, { "epoch": 0.05001316829075586, "grad_norm": 6.845585346221924, "learning_rate": 4.750460890176455e-05, "loss": 1.5149, "step": 1899 }, { "epoch": 0.05003950487226758, "grad_norm": 2.1115126609802246, "learning_rate": 4.750329207268897e-05, "loss": 1.8899, "step": 1900 }, { "epoch": 0.0500658414537793, "grad_norm": 3.358717918395996, "learning_rate": 4.750197524361338e-05, "loss": 1.9376, "step": 1901 }, { "epoch": 0.05009217803529102, "grad_norm": 2.899951457977295, "learning_rate": 4.75006584145378e-05, "loss": 1.5477, "step": 1902 }, { "epoch": 0.05011851461680274, "grad_norm": 3.117509126663208, "learning_rate": 4.749934158546221e-05, "loss": 1.6119, "step": 1903 }, { "epoch": 0.05014485119831446, "grad_norm": 2.152562379837036, "learning_rate": 4.749802475638662e-05, "loss": 1.6742, "step": 1904 }, { "epoch": 0.05017118777982618, "grad_norm": 6.079864978790283, "learning_rate": 4.749670792731104e-05, "loss": 1.015, "step": 1905 }, { "epoch": 0.0501975243613379, "grad_norm": 2.2239325046539307, "learning_rate": 4.7495391098235454e-05, "loss": 1.7363, "step": 1906 }, { "epoch": 0.05022386094284962, "grad_norm": 2.863947868347168, "learning_rate": 4.749407426915987e-05, "loss": 1.8702, "step": 1907 }, { "epoch": 0.050250197524361336, "grad_norm": 2.382564067840576, "learning_rate": 4.749275744008428e-05, "loss": 2.1574, "step": 1908 }, { "epoch": 0.05027653410587306, "grad_norm": 4.304083347320557, "learning_rate": 4.7491440611008694e-05, 
"loss": 1.6574, "step": 1909 }, { "epoch": 0.05030287068738478, "grad_norm": 2.6413674354553223, "learning_rate": 4.74901237819331e-05, "loss": 1.942, "step": 1910 }, { "epoch": 0.050329207268896496, "grad_norm": 4.041344165802002, "learning_rate": 4.7488806952857525e-05, "loss": 1.8458, "step": 1911 }, { "epoch": 0.05035554385040822, "grad_norm": 2.3719329833984375, "learning_rate": 4.7487490123781934e-05, "loss": 1.7017, "step": 1912 }, { "epoch": 0.050381880431919934, "grad_norm": 2.3702516555786133, "learning_rate": 4.748617329470635e-05, "loss": 1.9608, "step": 1913 }, { "epoch": 0.05040821701343166, "grad_norm": 1.9186686277389526, "learning_rate": 4.748485646563076e-05, "loss": 1.6874, "step": 1914 }, { "epoch": 0.05043455359494338, "grad_norm": 5.602931976318359, "learning_rate": 4.748353963655518e-05, "loss": 1.7823, "step": 1915 }, { "epoch": 0.050460890176455095, "grad_norm": 2.4267005920410156, "learning_rate": 4.748222280747959e-05, "loss": 1.386, "step": 1916 }, { "epoch": 0.05048722675796682, "grad_norm": 2.46811580657959, "learning_rate": 4.7480905978404005e-05, "loss": 1.0042, "step": 1917 }, { "epoch": 0.05051356333947853, "grad_norm": 4.005268573760986, "learning_rate": 4.747958914932842e-05, "loss": 0.7043, "step": 1918 }, { "epoch": 0.050539899920990256, "grad_norm": 2.318715810775757, "learning_rate": 4.747827232025283e-05, "loss": 1.4512, "step": 1919 }, { "epoch": 0.05056623650250198, "grad_norm": 2.486562728881836, "learning_rate": 4.747695549117725e-05, "loss": 1.8038, "step": 1920 }, { "epoch": 0.050592573084013694, "grad_norm": 3.297780990600586, "learning_rate": 4.747563866210166e-05, "loss": 1.6142, "step": 1921 }, { "epoch": 0.050618909665525416, "grad_norm": 3.328735589981079, "learning_rate": 4.7474321833026076e-05, "loss": 2.6925, "step": 1922 }, { "epoch": 0.05064524624703713, "grad_norm": 3.1211743354797363, "learning_rate": 4.7473005003950485e-05, "loss": 2.216, "step": 1923 }, { "epoch": 0.050671582828548854, "grad_norm": 
3.6800456047058105, "learning_rate": 4.74716881748749e-05, "loss": 2.0336, "step": 1924 }, { "epoch": 0.05069791941006058, "grad_norm": 7.806819915771484, "learning_rate": 4.7470371345799316e-05, "loss": 2.03, "step": 1925 }, { "epoch": 0.05072425599157229, "grad_norm": 2.605405330657959, "learning_rate": 4.746905451672373e-05, "loss": 1.9685, "step": 1926 }, { "epoch": 0.050750592573084015, "grad_norm": 2.262094497680664, "learning_rate": 4.746773768764815e-05, "loss": 1.8916, "step": 1927 }, { "epoch": 0.05077692915459573, "grad_norm": 2.370267391204834, "learning_rate": 4.7466420858572556e-05, "loss": 1.4207, "step": 1928 }, { "epoch": 0.05080326573610745, "grad_norm": 3.779275417327881, "learning_rate": 4.746510402949698e-05, "loss": 2.0838, "step": 1929 }, { "epoch": 0.050829602317619176, "grad_norm": 2.3945186138153076, "learning_rate": 4.746378720042139e-05, "loss": 1.3614, "step": 1930 }, { "epoch": 0.05085593889913089, "grad_norm": 2.312326669692993, "learning_rate": 4.74624703713458e-05, "loss": 1.7729, "step": 1931 }, { "epoch": 0.050882275480642614, "grad_norm": 3.108835458755493, "learning_rate": 4.746115354227021e-05, "loss": 1.3388, "step": 1932 }, { "epoch": 0.05090861206215433, "grad_norm": 3.0566723346710205, "learning_rate": 4.745983671319463e-05, "loss": 1.6849, "step": 1933 }, { "epoch": 0.05093494864366605, "grad_norm": 2.147061347961426, "learning_rate": 4.745851988411904e-05, "loss": 1.4168, "step": 1934 }, { "epoch": 0.050961285225177774, "grad_norm": 2.6942696571350098, "learning_rate": 4.745720305504346e-05, "loss": 1.9201, "step": 1935 }, { "epoch": 0.05098762180668949, "grad_norm": 2.424778461456299, "learning_rate": 4.7455886225967874e-05, "loss": 1.5221, "step": 1936 }, { "epoch": 0.05101395838820121, "grad_norm": 3.8146185874938965, "learning_rate": 4.745456939689228e-05, "loss": 1.5397, "step": 1937 }, { "epoch": 0.05104029496971293, "grad_norm": 2.4015159606933594, "learning_rate": 4.74532525678167e-05, "loss": 1.7502, "step": 1938 
}, { "epoch": 0.05106663155122465, "grad_norm": 1.926942229270935, "learning_rate": 4.7451935738741114e-05, "loss": 1.7521, "step": 1939 }, { "epoch": 0.05109296813273637, "grad_norm": 1.8515033721923828, "learning_rate": 4.745061890966553e-05, "loss": 1.8109, "step": 1940 }, { "epoch": 0.05111930471424809, "grad_norm": 2.4684648513793945, "learning_rate": 4.744930208058994e-05, "loss": 1.9839, "step": 1941 }, { "epoch": 0.05114564129575981, "grad_norm": 3.5624656677246094, "learning_rate": 4.7447985251514354e-05, "loss": 2.0394, "step": 1942 }, { "epoch": 0.05117197787727153, "grad_norm": 3.420684576034546, "learning_rate": 4.744666842243877e-05, "loss": 2.0959, "step": 1943 }, { "epoch": 0.05119831445878325, "grad_norm": 1.8533002138137817, "learning_rate": 4.7445351593363186e-05, "loss": 2.078, "step": 1944 }, { "epoch": 0.05122465104029497, "grad_norm": 2.8681881427764893, "learning_rate": 4.74440347642876e-05, "loss": 0.4574, "step": 1945 }, { "epoch": 0.05125098762180669, "grad_norm": 3.17950177192688, "learning_rate": 4.744271793521201e-05, "loss": 1.1603, "step": 1946 }, { "epoch": 0.05127732420331841, "grad_norm": 2.905369758605957, "learning_rate": 4.7441401106136426e-05, "loss": 1.7011, "step": 1947 }, { "epoch": 0.05130366078483013, "grad_norm": 2.4327666759490967, "learning_rate": 4.744008427706084e-05, "loss": 1.4245, "step": 1948 }, { "epoch": 0.05132999736634185, "grad_norm": 2.5221340656280518, "learning_rate": 4.743876744798526e-05, "loss": 1.6163, "step": 1949 }, { "epoch": 0.05135633394785357, "grad_norm": 2.8383095264434814, "learning_rate": 4.7437450618909666e-05, "loss": 0.8223, "step": 1950 }, { "epoch": 0.051382670529365286, "grad_norm": 3.493562698364258, "learning_rate": 4.743613378983408e-05, "loss": 1.3465, "step": 1951 }, { "epoch": 0.05140900711087701, "grad_norm": 3.039860486984253, "learning_rate": 4.74348169607585e-05, "loss": 1.5341, "step": 1952 }, { "epoch": 0.05143534369238873, "grad_norm": 19.08000373840332, "learning_rate": 
4.743350013168291e-05, "loss": 3.6486, "step": 1953 }, { "epoch": 0.05146168027390045, "grad_norm": 3.1299734115600586, "learning_rate": 4.743218330260733e-05, "loss": 1.8175, "step": 1954 }, { "epoch": 0.05148801685541217, "grad_norm": 1.8462104797363281, "learning_rate": 4.743086647353174e-05, "loss": 1.7258, "step": 1955 }, { "epoch": 0.051514353436923885, "grad_norm": 2.8475687503814697, "learning_rate": 4.742954964445615e-05, "loss": 1.458, "step": 1956 }, { "epoch": 0.05154069001843561, "grad_norm": 2.0529065132141113, "learning_rate": 4.742823281538056e-05, "loss": 1.8204, "step": 1957 }, { "epoch": 0.05156702659994733, "grad_norm": 2.9291958808898926, "learning_rate": 4.7426915986304984e-05, "loss": 1.7926, "step": 1958 }, { "epoch": 0.051593363181459045, "grad_norm": 2.488250970840454, "learning_rate": 4.742559915722939e-05, "loss": 2.1954, "step": 1959 }, { "epoch": 0.05161969976297077, "grad_norm": 3.042243480682373, "learning_rate": 4.742428232815381e-05, "loss": 0.3938, "step": 1960 }, { "epoch": 0.051646036344482484, "grad_norm": 3.34202241897583, "learning_rate": 4.742296549907822e-05, "loss": 1.6101, "step": 1961 }, { "epoch": 0.051672372925994206, "grad_norm": 2.0528404712677, "learning_rate": 4.742164867000264e-05, "loss": 2.2328, "step": 1962 }, { "epoch": 0.05169870950750593, "grad_norm": 2.2398762702941895, "learning_rate": 4.742033184092705e-05, "loss": 1.4511, "step": 1963 }, { "epoch": 0.051725046089017644, "grad_norm": 2.497408628463745, "learning_rate": 4.7419015011851464e-05, "loss": 2.3698, "step": 1964 }, { "epoch": 0.05175138267052937, "grad_norm": 1.7545833587646484, "learning_rate": 4.741769818277588e-05, "loss": 2.3217, "step": 1965 }, { "epoch": 0.05177771925204108, "grad_norm": 1.993113398551941, "learning_rate": 4.741638135370029e-05, "loss": 2.019, "step": 1966 }, { "epoch": 0.051804055833552805, "grad_norm": 3.2278997898101807, "learning_rate": 4.741506452462471e-05, "loss": 1.867, "step": 1967 }, { "epoch": 
0.05183039241506453, "grad_norm": 2.4407131671905518, "learning_rate": 4.741374769554912e-05, "loss": 1.8683, "step": 1968 }, { "epoch": 0.05185672899657624, "grad_norm": 2.2846662998199463, "learning_rate": 4.7412430866473535e-05, "loss": 2.114, "step": 1969 }, { "epoch": 0.051883065578087965, "grad_norm": 2.683457851409912, "learning_rate": 4.7411114037397944e-05, "loss": 1.9092, "step": 1970 }, { "epoch": 0.05190940215959968, "grad_norm": 2.1620802879333496, "learning_rate": 4.740979720832236e-05, "loss": 1.622, "step": 1971 }, { "epoch": 0.051935738741111404, "grad_norm": 2.3451576232910156, "learning_rate": 4.7408480379246775e-05, "loss": 2.3576, "step": 1972 }, { "epoch": 0.051962075322623126, "grad_norm": 2.0542962551116943, "learning_rate": 4.740716355017119e-05, "loss": 1.7956, "step": 1973 }, { "epoch": 0.05198841190413484, "grad_norm": 4.009820938110352, "learning_rate": 4.7405846721095606e-05, "loss": 0.915, "step": 1974 }, { "epoch": 0.052014748485646564, "grad_norm": 2.7401866912841797, "learning_rate": 4.7404529892020015e-05, "loss": 2.1814, "step": 1975 }, { "epoch": 0.05204108506715828, "grad_norm": 2.3345112800598145, "learning_rate": 4.740321306294443e-05, "loss": 2.1897, "step": 1976 }, { "epoch": 0.05206742164867, "grad_norm": 2.5021276473999023, "learning_rate": 4.7401896233868846e-05, "loss": 0.9051, "step": 1977 }, { "epoch": 0.052093758230181725, "grad_norm": 2.211122512817383, "learning_rate": 4.740057940479326e-05, "loss": 1.7051, "step": 1978 }, { "epoch": 0.05212009481169344, "grad_norm": 2.5293128490448, "learning_rate": 4.739926257571767e-05, "loss": 1.7382, "step": 1979 }, { "epoch": 0.05214643139320516, "grad_norm": 3.504066228866577, "learning_rate": 4.7397945746642086e-05, "loss": 0.5729, "step": 1980 }, { "epoch": 0.05217276797471688, "grad_norm": 2.8518624305725098, "learning_rate": 4.73966289175665e-05, "loss": 0.8239, "step": 1981 }, { "epoch": 0.0521991045562286, "grad_norm": 5.398329257965088, "learning_rate": 
4.739531208849092e-05, "loss": 1.1326, "step": 1982 }, { "epoch": 0.052225441137740324, "grad_norm": 2.0230722427368164, "learning_rate": 4.739399525941533e-05, "loss": 1.6693, "step": 1983 }, { "epoch": 0.05225177771925204, "grad_norm": 2.5795300006866455, "learning_rate": 4.739267843033974e-05, "loss": 1.5578, "step": 1984 }, { "epoch": 0.05227811430076376, "grad_norm": 2.0967042446136475, "learning_rate": 4.739136160126416e-05, "loss": 2.1262, "step": 1985 }, { "epoch": 0.05230445088227548, "grad_norm": 4.75216007232666, "learning_rate": 4.739004477218857e-05, "loss": 1.7363, "step": 1986 }, { "epoch": 0.0523307874637872, "grad_norm": 1.9590355157852173, "learning_rate": 4.738872794311299e-05, "loss": 2.3006, "step": 1987 }, { "epoch": 0.05235712404529892, "grad_norm": 2.3280680179595947, "learning_rate": 4.73874111140374e-05, "loss": 0.8837, "step": 1988 }, { "epoch": 0.05238346062681064, "grad_norm": 2.8242263793945312, "learning_rate": 4.738609428496181e-05, "loss": 1.6547, "step": 1989 }, { "epoch": 0.05240979720832236, "grad_norm": 3.7524397373199463, "learning_rate": 4.738477745588623e-05, "loss": 1.3506, "step": 1990 }, { "epoch": 0.05243613378983408, "grad_norm": 1.93019437789917, "learning_rate": 4.7383460626810644e-05, "loss": 1.6627, "step": 1991 }, { "epoch": 0.0524624703713458, "grad_norm": 2.211174488067627, "learning_rate": 4.738214379773506e-05, "loss": 2.0234, "step": 1992 }, { "epoch": 0.05248880695285752, "grad_norm": 3.0913209915161133, "learning_rate": 4.738082696865947e-05, "loss": 2.3623, "step": 1993 }, { "epoch": 0.05251514353436924, "grad_norm": 2.308661937713623, "learning_rate": 4.7379510139583884e-05, "loss": 1.3141, "step": 1994 }, { "epoch": 0.05254148011588096, "grad_norm": 2.420914888381958, "learning_rate": 4.73781933105083e-05, "loss": 1.675, "step": 1995 }, { "epoch": 0.05256781669739268, "grad_norm": 2.018430471420288, "learning_rate": 4.7376876481432715e-05, "loss": 1.2073, "step": 1996 }, { "epoch": 0.0525941532789044, 
"grad_norm": 2.0899672508239746, "learning_rate": 4.7375559652357124e-05, "loss": 2.0052, "step": 1997 }, { "epoch": 0.05262048986041612, "grad_norm": 2.986414909362793, "learning_rate": 4.737424282328154e-05, "loss": 1.9322, "step": 1998 }, { "epoch": 0.052646826441927835, "grad_norm": 2.661581516265869, "learning_rate": 4.7372925994205955e-05, "loss": 1.9804, "step": 1999 }, { "epoch": 0.05267316302343956, "grad_norm": 2.6744041442871094, "learning_rate": 4.737160916513037e-05, "loss": 0.6653, "step": 2000 }, { "epoch": 0.05269949960495128, "grad_norm": 3.131136178970337, "learning_rate": 4.7370292336054787e-05, "loss": 0.6623, "step": 2001 }, { "epoch": 0.052725836186462996, "grad_norm": 2.4772188663482666, "learning_rate": 4.7368975506979195e-05, "loss": 1.4164, "step": 2002 }, { "epoch": 0.05275217276797472, "grad_norm": 3.378445863723755, "learning_rate": 4.736765867790361e-05, "loss": 1.4511, "step": 2003 }, { "epoch": 0.052778509349486434, "grad_norm": 2.7198100090026855, "learning_rate": 4.736634184882802e-05, "loss": 0.6776, "step": 2004 }, { "epoch": 0.05280484593099816, "grad_norm": 2.3264641761779785, "learning_rate": 4.736502501975244e-05, "loss": 1.6766, "step": 2005 }, { "epoch": 0.05283118251250988, "grad_norm": 2.581003189086914, "learning_rate": 4.736370819067685e-05, "loss": 2.2525, "step": 2006 }, { "epoch": 0.052857519094021595, "grad_norm": 2.278221845626831, "learning_rate": 4.7362391361601267e-05, "loss": 1.8756, "step": 2007 }, { "epoch": 0.05288385567553332, "grad_norm": 2.241326093673706, "learning_rate": 4.736107453252568e-05, "loss": 2.9003, "step": 2008 }, { "epoch": 0.05291019225704503, "grad_norm": 2.409233570098877, "learning_rate": 4.73597577034501e-05, "loss": 1.9883, "step": 2009 }, { "epoch": 0.052936528838556755, "grad_norm": 3.8274152278900146, "learning_rate": 4.735844087437451e-05, "loss": 2.2167, "step": 2010 }, { "epoch": 0.05296286542006848, "grad_norm": 6.1324143409729, "learning_rate": 4.735712404529892e-05, "loss": 
0.7901, "step": 2011 }, { "epoch": 0.05298920200158019, "grad_norm": 4.438776969909668, "learning_rate": 4.735580721622334e-05, "loss": 1.5326, "step": 2012 }, { "epoch": 0.053015538583091916, "grad_norm": 4.440764904022217, "learning_rate": 4.7354490387147747e-05, "loss": 0.9479, "step": 2013 }, { "epoch": 0.05304187516460363, "grad_norm": 4.195770740509033, "learning_rate": 4.735317355807217e-05, "loss": 2.1979, "step": 2014 }, { "epoch": 0.053068211746115354, "grad_norm": 2.461336851119995, "learning_rate": 4.735185672899658e-05, "loss": 2.2412, "step": 2015 }, { "epoch": 0.05309454832762708, "grad_norm": 2.2120697498321533, "learning_rate": 4.735053989992099e-05, "loss": 1.4593, "step": 2016 }, { "epoch": 0.05312088490913879, "grad_norm": 1.7244751453399658, "learning_rate": 4.73492230708454e-05, "loss": 2.0892, "step": 2017 }, { "epoch": 0.053147221490650515, "grad_norm": 2.106907844543457, "learning_rate": 4.734790624176982e-05, "loss": 1.7483, "step": 2018 }, { "epoch": 0.05317355807216223, "grad_norm": 1.8575800657272339, "learning_rate": 4.7346589412694233e-05, "loss": 1.9242, "step": 2019 }, { "epoch": 0.05319989465367395, "grad_norm": 2.1366355419158936, "learning_rate": 4.734527258361865e-05, "loss": 1.9495, "step": 2020 }, { "epoch": 0.053226231235185675, "grad_norm": 2.7665510177612305, "learning_rate": 4.7343955754543065e-05, "loss": 1.6444, "step": 2021 }, { "epoch": 0.05325256781669739, "grad_norm": 3.159970283508301, "learning_rate": 4.7342638925467473e-05, "loss": 1.0339, "step": 2022 }, { "epoch": 0.05327890439820911, "grad_norm": 4.321499347686768, "learning_rate": 4.734132209639189e-05, "loss": 0.8682, "step": 2023 }, { "epoch": 0.05330524097972083, "grad_norm": 2.1694865226745605, "learning_rate": 4.7340005267316305e-05, "loss": 1.4608, "step": 2024 }, { "epoch": 0.05333157756123255, "grad_norm": 2.515545129776001, "learning_rate": 4.733868843824072e-05, "loss": 1.7315, "step": 2025 }, { "epoch": 0.053357914142744274, "grad_norm": 
3.302790880203247, "learning_rate": 4.733737160916513e-05, "loss": 1.6774, "step": 2026 }, { "epoch": 0.05338425072425599, "grad_norm": 2.1500535011291504, "learning_rate": 4.7336054780089545e-05, "loss": 2.338, "step": 2027 }, { "epoch": 0.05341058730576771, "grad_norm": 2.0290355682373047, "learning_rate": 4.733473795101396e-05, "loss": 1.7395, "step": 2028 }, { "epoch": 0.05343692388727943, "grad_norm": 4.227078914642334, "learning_rate": 4.7333421121938376e-05, "loss": 1.0584, "step": 2029 }, { "epoch": 0.05346326046879115, "grad_norm": 2.465289831161499, "learning_rate": 4.733210429286279e-05, "loss": 2.0544, "step": 2030 }, { "epoch": 0.05348959705030287, "grad_norm": 2.5843186378479004, "learning_rate": 4.73307874637872e-05, "loss": 1.5605, "step": 2031 }, { "epoch": 0.05351593363181459, "grad_norm": 2.026824474334717, "learning_rate": 4.7329470634711616e-05, "loss": 1.2874, "step": 2032 }, { "epoch": 0.05354227021332631, "grad_norm": 2.224200963973999, "learning_rate": 4.732815380563603e-05, "loss": 2.2449, "step": 2033 }, { "epoch": 0.05356860679483803, "grad_norm": 2.0412447452545166, "learning_rate": 4.732683697656045e-05, "loss": 1.8891, "step": 2034 }, { "epoch": 0.05359494337634975, "grad_norm": 5.431974411010742, "learning_rate": 4.7325520147484856e-05, "loss": 0.7076, "step": 2035 }, { "epoch": 0.05362127995786147, "grad_norm": 2.5169289112091064, "learning_rate": 4.732420331840927e-05, "loss": 1.9156, "step": 2036 }, { "epoch": 0.05364761653937319, "grad_norm": 4.09473180770874, "learning_rate": 4.732288648933369e-05, "loss": 2.0082, "step": 2037 }, { "epoch": 0.05367395312088491, "grad_norm": 1.8645721673965454, "learning_rate": 4.73215696602581e-05, "loss": 2.0827, "step": 2038 }, { "epoch": 0.05370028970239663, "grad_norm": 1.8461860418319702, "learning_rate": 4.732025283118252e-05, "loss": 2.6198, "step": 2039 }, { "epoch": 0.05372662628390835, "grad_norm": 2.1248743534088135, "learning_rate": 4.731893600210693e-05, "loss": 2.2577, "step": 2040 
}, { "epoch": 0.05375296286542007, "grad_norm": 3.238090753555298, "learning_rate": 4.731761917303134e-05, "loss": 1.6984, "step": 2041 }, { "epoch": 0.053779299446931786, "grad_norm": 6.792541980743408, "learning_rate": 4.731630234395576e-05, "loss": 0.805, "step": 2042 }, { "epoch": 0.05380563602844351, "grad_norm": 3.9255001544952393, "learning_rate": 4.7314985514880174e-05, "loss": 1.999, "step": 2043 }, { "epoch": 0.05383197260995523, "grad_norm": 2.77415132522583, "learning_rate": 4.731366868580458e-05, "loss": 1.7732, "step": 2044 }, { "epoch": 0.053858309191466947, "grad_norm": 2.2490503787994385, "learning_rate": 4.7312351856729e-05, "loss": 1.3486, "step": 2045 }, { "epoch": 0.05388464577297867, "grad_norm": 2.318188428878784, "learning_rate": 4.7311035027653414e-05, "loss": 1.5477, "step": 2046 }, { "epoch": 0.053910982354490385, "grad_norm": 2.8028390407562256, "learning_rate": 4.730971819857783e-05, "loss": 1.3602, "step": 2047 }, { "epoch": 0.05393731893600211, "grad_norm": 4.531374454498291, "learning_rate": 4.7308401369502245e-05, "loss": 0.584, "step": 2048 }, { "epoch": 0.05396365551751383, "grad_norm": 2.0498664379119873, "learning_rate": 4.7307084540426654e-05, "loss": 1.6595, "step": 2049 }, { "epoch": 0.053989992099025545, "grad_norm": 1.9256951808929443, "learning_rate": 4.730576771135107e-05, "loss": 1.7819, "step": 2050 }, { "epoch": 0.05401632868053727, "grad_norm": 2.812506675720215, "learning_rate": 4.730445088227548e-05, "loss": 1.5771, "step": 2051 }, { "epoch": 0.05404266526204898, "grad_norm": 4.5180840492248535, "learning_rate": 4.73031340531999e-05, "loss": 0.9664, "step": 2052 }, { "epoch": 0.054069001843560706, "grad_norm": 4.770064353942871, "learning_rate": 4.730181722412431e-05, "loss": 2.0111, "step": 2053 }, { "epoch": 0.05409533842507243, "grad_norm": 2.6490061283111572, "learning_rate": 4.7300500395048725e-05, "loss": 2.3096, "step": 2054 }, { "epoch": 0.054121675006584144, "grad_norm": 1.717597246170044, "learning_rate": 
4.729918356597314e-05, "loss": 1.7607, "step": 2055 }, { "epoch": 0.054148011588095867, "grad_norm": 2.0759642124176025, "learning_rate": 4.729786673689755e-05, "loss": 0.8702, "step": 2056 }, { "epoch": 0.05417434816960758, "grad_norm": 3.1635491847991943, "learning_rate": 4.729654990782197e-05, "loss": 1.9108, "step": 2057 }, { "epoch": 0.054200684751119305, "grad_norm": 2.7228267192840576, "learning_rate": 4.729523307874638e-05, "loss": 2.328, "step": 2058 }, { "epoch": 0.05422702133263103, "grad_norm": 3.286054849624634, "learning_rate": 4.7293916249670796e-05, "loss": 1.3617, "step": 2059 }, { "epoch": 0.05425335791414274, "grad_norm": 2.2653841972351074, "learning_rate": 4.7292599420595205e-05, "loss": 1.937, "step": 2060 }, { "epoch": 0.054279694495654465, "grad_norm": 2.1899664402008057, "learning_rate": 4.729128259151963e-05, "loss": 1.411, "step": 2061 }, { "epoch": 0.05430603107716618, "grad_norm": 2.935560941696167, "learning_rate": 4.7289965762444036e-05, "loss": 1.1944, "step": 2062 }, { "epoch": 0.0543323676586779, "grad_norm": 2.700507879257202, "learning_rate": 4.728864893336845e-05, "loss": 1.7689, "step": 2063 }, { "epoch": 0.054358704240189626, "grad_norm": 3.2642228603363037, "learning_rate": 4.728733210429286e-05, "loss": 0.4289, "step": 2064 }, { "epoch": 0.05438504082170134, "grad_norm": 8.299562454223633, "learning_rate": 4.7286015275217276e-05, "loss": 1.7906, "step": 2065 }, { "epoch": 0.054411377403213064, "grad_norm": 4.752680778503418, "learning_rate": 4.728469844614169e-05, "loss": 1.2543, "step": 2066 }, { "epoch": 0.05443771398472478, "grad_norm": 2.2282509803771973, "learning_rate": 4.728338161706611e-05, "loss": 1.5959, "step": 2067 }, { "epoch": 0.0544640505662365, "grad_norm": 2.411721706390381, "learning_rate": 4.728206478799052e-05, "loss": 2.1507, "step": 2068 }, { "epoch": 0.054490387147748225, "grad_norm": 2.560208559036255, "learning_rate": 4.728074795891493e-05, "loss": 1.7338, "step": 2069 }, { "epoch": 
0.05451672372925994, "grad_norm": 2.463653564453125, "learning_rate": 4.727943112983935e-05, "loss": 1.6155, "step": 2070 }, { "epoch": 0.05454306031077166, "grad_norm": 3.0527713298797607, "learning_rate": 4.727811430076376e-05, "loss": 2.5102, "step": 2071 }, { "epoch": 0.05456939689228338, "grad_norm": 2.5716476440429688, "learning_rate": 4.727679747168818e-05, "loss": 1.1894, "step": 2072 }, { "epoch": 0.0545957334737951, "grad_norm": 1.9859488010406494, "learning_rate": 4.727548064261259e-05, "loss": 1.3319, "step": 2073 }, { "epoch": 0.05462207005530682, "grad_norm": 3.26849627494812, "learning_rate": 4.7274163813537e-05, "loss": 0.9321, "step": 2074 }, { "epoch": 0.05464840663681854, "grad_norm": 2.9846725463867188, "learning_rate": 4.727284698446142e-05, "loss": 1.77, "step": 2075 }, { "epoch": 0.05467474321833026, "grad_norm": 3.5807576179504395, "learning_rate": 4.7271530155385834e-05, "loss": 1.042, "step": 2076 }, { "epoch": 0.05470107979984198, "grad_norm": 2.4289557933807373, "learning_rate": 4.727021332631025e-05, "loss": 1.2542, "step": 2077 }, { "epoch": 0.0547274163813537, "grad_norm": 2.07059645652771, "learning_rate": 4.726889649723466e-05, "loss": 1.8483, "step": 2078 }, { "epoch": 0.05475375296286542, "grad_norm": 2.3563039302825928, "learning_rate": 4.7267579668159074e-05, "loss": 1.5422, "step": 2079 }, { "epoch": 0.05478008954437714, "grad_norm": 2.043539047241211, "learning_rate": 4.726626283908349e-05, "loss": 2.4442, "step": 2080 }, { "epoch": 0.05480642612588886, "grad_norm": 2.965324640274048, "learning_rate": 4.7264946010007906e-05, "loss": 0.4117, "step": 2081 }, { "epoch": 0.05483276270740058, "grad_norm": 4.158634185791016, "learning_rate": 4.7263629180932314e-05, "loss": 1.752, "step": 2082 }, { "epoch": 0.0548590992889123, "grad_norm": 2.371791124343872, "learning_rate": 4.726231235185673e-05, "loss": 2.4796, "step": 2083 }, { "epoch": 0.05488543587042402, "grad_norm": 4.616940021514893, "learning_rate": 4.7260995522781146e-05, 
"loss": 1.6888, "step": 2084 }, { "epoch": 0.054911772451935736, "grad_norm": 2.3304145336151123, "learning_rate": 4.725967869370556e-05, "loss": 0.5402, "step": 2085 }, { "epoch": 0.05493810903344746, "grad_norm": 2.3909904956817627, "learning_rate": 4.725836186462998e-05, "loss": 1.8232, "step": 2086 }, { "epoch": 0.05496444561495918, "grad_norm": 2.0792007446289062, "learning_rate": 4.7257045035554386e-05, "loss": 1.6262, "step": 2087 }, { "epoch": 0.0549907821964709, "grad_norm": 2.2291672229766846, "learning_rate": 4.72557282064788e-05, "loss": 1.4469, "step": 2088 }, { "epoch": 0.05501711877798262, "grad_norm": 2.22428560256958, "learning_rate": 4.725441137740321e-05, "loss": 1.6023, "step": 2089 }, { "epoch": 0.055043455359494335, "grad_norm": 2.5402305126190186, "learning_rate": 4.725309454832763e-05, "loss": 1.342, "step": 2090 }, { "epoch": 0.05506979194100606, "grad_norm": 2.6844735145568848, "learning_rate": 4.725177771925204e-05, "loss": 2.3668, "step": 2091 }, { "epoch": 0.05509612852251778, "grad_norm": 1.9301756620407104, "learning_rate": 4.725046089017646e-05, "loss": 1.5754, "step": 2092 }, { "epoch": 0.055122465104029496, "grad_norm": 2.6414220333099365, "learning_rate": 4.724914406110087e-05, "loss": 0.6245, "step": 2093 }, { "epoch": 0.05514880168554122, "grad_norm": 2.5184977054595947, "learning_rate": 4.724782723202529e-05, "loss": 1.8511, "step": 2094 }, { "epoch": 0.055175138267052934, "grad_norm": 2.051305055618286, "learning_rate": 4.7246510402949704e-05, "loss": 1.6211, "step": 2095 }, { "epoch": 0.055201474848564656, "grad_norm": 3.3867344856262207, "learning_rate": 4.724519357387411e-05, "loss": 1.7756, "step": 2096 }, { "epoch": 0.05522781143007638, "grad_norm": 4.963647365570068, "learning_rate": 4.724387674479853e-05, "loss": 2.0092, "step": 2097 }, { "epoch": 0.055254148011588095, "grad_norm": 7.482678413391113, "learning_rate": 4.724255991572294e-05, "loss": 0.9619, "step": 2098 }, { "epoch": 0.05528048459309982, "grad_norm": 
2.995453119277954, "learning_rate": 4.724124308664736e-05, "loss": 2.2678, "step": 2099 }, { "epoch": 0.05530682117461153, "grad_norm": 16.927141189575195, "learning_rate": 4.723992625757177e-05, "loss": 1.1171, "step": 2100 }, { "epoch": 0.055333157756123255, "grad_norm": 5.2754621505737305, "learning_rate": 4.7238609428496184e-05, "loss": 1.3992, "step": 2101 }, { "epoch": 0.05535949433763498, "grad_norm": 2.401787281036377, "learning_rate": 4.72372925994206e-05, "loss": 1.8031, "step": 2102 }, { "epoch": 0.05538583091914669, "grad_norm": 2.4323031902313232, "learning_rate": 4.723597577034501e-05, "loss": 1.491, "step": 2103 }, { "epoch": 0.055412167500658416, "grad_norm": 2.329332113265991, "learning_rate": 4.723465894126943e-05, "loss": 2.2786, "step": 2104 }, { "epoch": 0.05543850408217013, "grad_norm": 3.1811001300811768, "learning_rate": 4.723334211219384e-05, "loss": 1.8405, "step": 2105 }, { "epoch": 0.055464840663681854, "grad_norm": 1.9039262533187866, "learning_rate": 4.7232025283118255e-05, "loss": 1.8215, "step": 2106 }, { "epoch": 0.055491177245193576, "grad_norm": 6.489499092102051, "learning_rate": 4.7230708454042664e-05, "loss": 1.3855, "step": 2107 }, { "epoch": 0.05551751382670529, "grad_norm": 1.8143821954727173, "learning_rate": 4.7229391624967086e-05, "loss": 0.4841, "step": 2108 }, { "epoch": 0.055543850408217015, "grad_norm": 3.48903751373291, "learning_rate": 4.7228074795891495e-05, "loss": 0.7245, "step": 2109 }, { "epoch": 0.05557018698972873, "grad_norm": 2.996108293533325, "learning_rate": 4.722675796681591e-05, "loss": 1.8393, "step": 2110 }, { "epoch": 0.05559652357124045, "grad_norm": 3.148120880126953, "learning_rate": 4.7225441137740326e-05, "loss": 1.9642, "step": 2111 }, { "epoch": 0.055622860152752175, "grad_norm": 3.929673433303833, "learning_rate": 4.7224124308664735e-05, "loss": 0.6346, "step": 2112 }, { "epoch": 0.05564919673426389, "grad_norm": 2.003108501434326, "learning_rate": 4.722280747958916e-05, "loss": 1.8192, 
"step": 2113 }, { "epoch": 0.05567553331577561, "grad_norm": 5.596811771392822, "learning_rate": 4.7221490650513566e-05, "loss": 1.0959, "step": 2114 }, { "epoch": 0.05570186989728733, "grad_norm": 2.6867105960845947, "learning_rate": 4.722017382143798e-05, "loss": 1.9643, "step": 2115 }, { "epoch": 0.05572820647879905, "grad_norm": 2.384495258331299, "learning_rate": 4.721885699236239e-05, "loss": 2.4373, "step": 2116 }, { "epoch": 0.055754543060310774, "grad_norm": 2.1462979316711426, "learning_rate": 4.7217540163286806e-05, "loss": 1.8085, "step": 2117 }, { "epoch": 0.05578087964182249, "grad_norm": 2.061133623123169, "learning_rate": 4.721622333421122e-05, "loss": 2.0714, "step": 2118 }, { "epoch": 0.05580721622333421, "grad_norm": 2.018908739089966, "learning_rate": 4.721490650513564e-05, "loss": 0.5484, "step": 2119 }, { "epoch": 0.05583355280484593, "grad_norm": 5.552015781402588, "learning_rate": 4.7213589676060046e-05, "loss": 1.4557, "step": 2120 }, { "epoch": 0.05585988938635765, "grad_norm": 2.1100313663482666, "learning_rate": 4.721227284698446e-05, "loss": 2.0805, "step": 2121 }, { "epoch": 0.05588622596786937, "grad_norm": 2.795180320739746, "learning_rate": 4.721095601790888e-05, "loss": 1.7733, "step": 2122 }, { "epoch": 0.05591256254938109, "grad_norm": 2.8228659629821777, "learning_rate": 4.720963918883329e-05, "loss": 1.2784, "step": 2123 }, { "epoch": 0.05593889913089281, "grad_norm": 1.9794082641601562, "learning_rate": 4.720832235975771e-05, "loss": 1.956, "step": 2124 }, { "epoch": 0.05596523571240453, "grad_norm": 5.0670976638793945, "learning_rate": 4.720700553068212e-05, "loss": 1.3835, "step": 2125 }, { "epoch": 0.05599157229391625, "grad_norm": 2.25516414642334, "learning_rate": 4.720568870160653e-05, "loss": 1.8704, "step": 2126 }, { "epoch": 0.05601790887542797, "grad_norm": 1.8798249959945679, "learning_rate": 4.720437187253095e-05, "loss": 1.6445, "step": 2127 }, { "epoch": 0.05604424545693969, "grad_norm": 3.3134806156158447, 
"learning_rate": 4.7203055043455364e-05, "loss": 1.9723, "step": 2128 }, { "epoch": 0.05607058203845141, "grad_norm": 2.083958625793457, "learning_rate": 4.720173821437977e-05, "loss": 1.4712, "step": 2129 }, { "epoch": 0.05609691861996313, "grad_norm": 3.3280749320983887, "learning_rate": 4.720042138530419e-05, "loss": 1.3424, "step": 2130 }, { "epoch": 0.05612325520147485, "grad_norm": 3.359832525253296, "learning_rate": 4.7199104556228604e-05, "loss": 1.7044, "step": 2131 }, { "epoch": 0.05614959178298657, "grad_norm": 2.6637461185455322, "learning_rate": 4.719778772715302e-05, "loss": 1.3722, "step": 2132 }, { "epoch": 0.056175928364498286, "grad_norm": 5.161062240600586, "learning_rate": 4.7196470898077435e-05, "loss": 1.555, "step": 2133 }, { "epoch": 0.05620226494601001, "grad_norm": 2.719771385192871, "learning_rate": 4.7195154069001844e-05, "loss": 2.2629, "step": 2134 }, { "epoch": 0.05622860152752173, "grad_norm": 2.229858875274658, "learning_rate": 4.719383723992626e-05, "loss": 2.4039, "step": 2135 }, { "epoch": 0.056254938109033446, "grad_norm": 2.868281364440918, "learning_rate": 4.719252041085067e-05, "loss": 1.7693, "step": 2136 }, { "epoch": 0.05628127469054517, "grad_norm": 3.25990629196167, "learning_rate": 4.719120358177509e-05, "loss": 1.8586, "step": 2137 }, { "epoch": 0.056307611272056884, "grad_norm": 4.00319242477417, "learning_rate": 4.71898867526995e-05, "loss": 0.9102, "step": 2138 }, { "epoch": 0.05633394785356861, "grad_norm": 3.3860199451446533, "learning_rate": 4.7188569923623915e-05, "loss": 1.4904, "step": 2139 }, { "epoch": 0.05636028443508033, "grad_norm": 3.3914263248443604, "learning_rate": 4.718725309454833e-05, "loss": 1.7373, "step": 2140 }, { "epoch": 0.056386621016592045, "grad_norm": 2.3849058151245117, "learning_rate": 4.7185936265472747e-05, "loss": 2.0921, "step": 2141 }, { "epoch": 0.05641295759810377, "grad_norm": 2.0900299549102783, "learning_rate": 4.718461943639716e-05, "loss": 2.193, "step": 2142 }, { "epoch": 
0.05643929417961548, "grad_norm": 2.992194414138794, "learning_rate": 4.718330260732157e-05, "loss": 1.1533, "step": 2143 }, { "epoch": 0.056465630761127206, "grad_norm": 2.0007081031799316, "learning_rate": 4.7181985778245987e-05, "loss": 1.4728, "step": 2144 }, { "epoch": 0.05649196734263893, "grad_norm": 3.823934316635132, "learning_rate": 4.7180668949170395e-05, "loss": 0.9316, "step": 2145 }, { "epoch": 0.056518303924150644, "grad_norm": 2.3916842937469482, "learning_rate": 4.717935212009482e-05, "loss": 1.9451, "step": 2146 }, { "epoch": 0.056544640505662366, "grad_norm": 4.011995792388916, "learning_rate": 4.717803529101923e-05, "loss": 1.6147, "step": 2147 }, { "epoch": 0.05657097708717408, "grad_norm": 2.2616333961486816, "learning_rate": 4.717671846194364e-05, "loss": 1.6991, "step": 2148 }, { "epoch": 0.056597313668685804, "grad_norm": 5.333600044250488, "learning_rate": 4.717540163286806e-05, "loss": 2.1645, "step": 2149 }, { "epoch": 0.05662365025019753, "grad_norm": 4.291018009185791, "learning_rate": 4.717408480379247e-05, "loss": 2.3565, "step": 2150 }, { "epoch": 0.05664998683170924, "grad_norm": 2.2324442863464355, "learning_rate": 4.717276797471689e-05, "loss": 2.1599, "step": 2151 }, { "epoch": 0.056676323413220965, "grad_norm": 3.4782960414886475, "learning_rate": 4.71714511456413e-05, "loss": 0.9414, "step": 2152 }, { "epoch": 0.05670265999473268, "grad_norm": 5.408863067626953, "learning_rate": 4.7170134316565713e-05, "loss": 1.6579, "step": 2153 }, { "epoch": 0.0567289965762444, "grad_norm": 3.2574551105499268, "learning_rate": 4.716881748749012e-05, "loss": 2.3205, "step": 2154 }, { "epoch": 0.056755333157756126, "grad_norm": 2.595188856124878, "learning_rate": 4.716750065841454e-05, "loss": 0.7521, "step": 2155 }, { "epoch": 0.05678166973926784, "grad_norm": 5.192615509033203, "learning_rate": 4.7166183829338953e-05, "loss": 1.6112, "step": 2156 }, { "epoch": 0.056808006320779564, "grad_norm": 2.759761095046997, "learning_rate": 
4.716486700026337e-05, "loss": 1.6369, "step": 2157 }, { "epoch": 0.05683434290229128, "grad_norm": 3.5821824073791504, "learning_rate": 4.7163550171187785e-05, "loss": 2.138, "step": 2158 }, { "epoch": 0.056860679483803, "grad_norm": 2.3638978004455566, "learning_rate": 4.7162233342112193e-05, "loss": 1.6788, "step": 2159 }, { "epoch": 0.056887016065314724, "grad_norm": 2.5548527240753174, "learning_rate": 4.7160916513036616e-05, "loss": 1.9895, "step": 2160 }, { "epoch": 0.05691335264682644, "grad_norm": 3.196124792098999, "learning_rate": 4.7159599683961025e-05, "loss": 1.556, "step": 2161 }, { "epoch": 0.05693968922833816, "grad_norm": 2.3067831993103027, "learning_rate": 4.715828285488544e-05, "loss": 2.3408, "step": 2162 }, { "epoch": 0.05696602580984988, "grad_norm": 3.1644434928894043, "learning_rate": 4.715696602580985e-05, "loss": 1.891, "step": 2163 }, { "epoch": 0.0569923623913616, "grad_norm": 2.0824637413024902, "learning_rate": 4.7155649196734265e-05, "loss": 1.6948, "step": 2164 }, { "epoch": 0.05701869897287332, "grad_norm": 4.186317443847656, "learning_rate": 4.715433236765868e-05, "loss": 0.397, "step": 2165 }, { "epoch": 0.05704503555438504, "grad_norm": 3.7899069786071777, "learning_rate": 4.7153015538583096e-05, "loss": 0.8298, "step": 2166 }, { "epoch": 0.05707137213589676, "grad_norm": 4.509664535522461, "learning_rate": 4.7151698709507505e-05, "loss": 1.593, "step": 2167 }, { "epoch": 0.057097708717408484, "grad_norm": 3.0293211936950684, "learning_rate": 4.715038188043192e-05, "loss": 1.2093, "step": 2168 }, { "epoch": 0.0571240452989202, "grad_norm": 3.6300442218780518, "learning_rate": 4.7149065051356336e-05, "loss": 2.1209, "step": 2169 }, { "epoch": 0.05715038188043192, "grad_norm": 1.6079756021499634, "learning_rate": 4.714774822228075e-05, "loss": 1.7361, "step": 2170 }, { "epoch": 0.05717671846194364, "grad_norm": 2.393828868865967, "learning_rate": 4.714643139320517e-05, "loss": 1.1658, "step": 2171 }, { "epoch": 
0.05720305504345536, "grad_norm": 2.2417051792144775, "learning_rate": 4.7145114564129576e-05, "loss": 2.2137, "step": 2172 }, { "epoch": 0.05722939162496708, "grad_norm": 3.120640516281128, "learning_rate": 4.714379773505399e-05, "loss": 1.5996, "step": 2173 }, { "epoch": 0.0572557282064788, "grad_norm": 5.414018630981445, "learning_rate": 4.714248090597841e-05, "loss": 1.99, "step": 2174 }, { "epoch": 0.05728206478799052, "grad_norm": 2.18473219871521, "learning_rate": 4.714116407690282e-05, "loss": 1.4559, "step": 2175 }, { "epoch": 0.057308401369502236, "grad_norm": 2.0574967861175537, "learning_rate": 4.713984724782723e-05, "loss": 1.3193, "step": 2176 }, { "epoch": 0.05733473795101396, "grad_norm": 2.525825262069702, "learning_rate": 4.713853041875165e-05, "loss": 0.5651, "step": 2177 }, { "epoch": 0.05736107453252568, "grad_norm": 2.9111571311950684, "learning_rate": 4.713721358967606e-05, "loss": 1.3808, "step": 2178 }, { "epoch": 0.0573874111140374, "grad_norm": 10.607233047485352, "learning_rate": 4.713589676060048e-05, "loss": 2.1026, "step": 2179 }, { "epoch": 0.05741374769554912, "grad_norm": 2.3342490196228027, "learning_rate": 4.7134579931524894e-05, "loss": 1.5054, "step": 2180 }, { "epoch": 0.057440084277060835, "grad_norm": 5.707728862762451, "learning_rate": 4.71332631024493e-05, "loss": 1.3972, "step": 2181 }, { "epoch": 0.05746642085857256, "grad_norm": 2.1899068355560303, "learning_rate": 4.713194627337372e-05, "loss": 1.7015, "step": 2182 }, { "epoch": 0.05749275744008428, "grad_norm": 2.359449863433838, "learning_rate": 4.713062944429813e-05, "loss": 1.6338, "step": 2183 }, { "epoch": 0.057519094021595996, "grad_norm": 1.9393079280853271, "learning_rate": 4.712931261522255e-05, "loss": 2.5717, "step": 2184 }, { "epoch": 0.05754543060310772, "grad_norm": 5.754360675811768, "learning_rate": 4.712799578614696e-05, "loss": 1.4453, "step": 2185 }, { "epoch": 0.057571767184619434, "grad_norm": 7.439517021179199, "learning_rate": 
4.7126678957071374e-05, "loss": 1.2166, "step": 2186 }, { "epoch": 0.057598103766131156, "grad_norm": 3.0107805728912354, "learning_rate": 4.712536212799579e-05, "loss": 2.4133, "step": 2187 }, { "epoch": 0.05762444034764288, "grad_norm": 2.320903778076172, "learning_rate": 4.71240452989202e-05, "loss": 1.4147, "step": 2188 }, { "epoch": 0.057650776929154594, "grad_norm": 2.8712406158447266, "learning_rate": 4.712272846984462e-05, "loss": 1.2879, "step": 2189 }, { "epoch": 0.05767711351066632, "grad_norm": 2.044370651245117, "learning_rate": 4.712141164076903e-05, "loss": 1.1372, "step": 2190 }, { "epoch": 0.05770345009217803, "grad_norm": 4.43475341796875, "learning_rate": 4.7120094811693445e-05, "loss": 0.8103, "step": 2191 }, { "epoch": 0.057729786673689755, "grad_norm": 1.9338033199310303, "learning_rate": 4.7118777982617854e-05, "loss": 1.3003, "step": 2192 }, { "epoch": 0.05775612325520148, "grad_norm": 2.5325772762298584, "learning_rate": 4.7117461153542276e-05, "loss": 1.4003, "step": 2193 }, { "epoch": 0.05778245983671319, "grad_norm": 3.6060450077056885, "learning_rate": 4.7116144324466685e-05, "loss": 1.9385, "step": 2194 }, { "epoch": 0.057808796418224916, "grad_norm": 2.596099615097046, "learning_rate": 4.71148274953911e-05, "loss": 1.5216, "step": 2195 }, { "epoch": 0.05783513299973663, "grad_norm": 2.9077229499816895, "learning_rate": 4.7113510666315516e-05, "loss": 1.9273, "step": 2196 }, { "epoch": 0.057861469581248354, "grad_norm": 2.57293701171875, "learning_rate": 4.7112193837239925e-05, "loss": 1.8177, "step": 2197 }, { "epoch": 0.057887806162760076, "grad_norm": 4.707583427429199, "learning_rate": 4.711087700816435e-05, "loss": 1.2391, "step": 2198 }, { "epoch": 0.05791414274427179, "grad_norm": 3.1155073642730713, "learning_rate": 4.7109560179088756e-05, "loss": 1.5555, "step": 2199 }, { "epoch": 0.057940479325783514, "grad_norm": 2.756563425064087, "learning_rate": 4.710824335001317e-05, "loss": 1.2731, "step": 2200 }, { "epoch": 
0.05796681590729523, "grad_norm": 3.910980224609375, "learning_rate": 4.710692652093758e-05, "loss": 1.1219, "step": 2201 }, { "epoch": 0.05799315248880695, "grad_norm": 3.0736021995544434, "learning_rate": 4.7105609691861996e-05, "loss": 2.3444, "step": 2202 }, { "epoch": 0.058019489070318675, "grad_norm": 2.265638828277588, "learning_rate": 4.710429286278641e-05, "loss": 2.2269, "step": 2203 }, { "epoch": 0.05804582565183039, "grad_norm": 3.4562597274780273, "learning_rate": 4.710297603371083e-05, "loss": 1.4524, "step": 2204 }, { "epoch": 0.05807216223334211, "grad_norm": 2.9995017051696777, "learning_rate": 4.710165920463524e-05, "loss": 1.5533, "step": 2205 }, { "epoch": 0.05809849881485383, "grad_norm": 4.540975570678711, "learning_rate": 4.710034237555965e-05, "loss": 1.4094, "step": 2206 }, { "epoch": 0.05812483539636555, "grad_norm": 1.7497889995574951, "learning_rate": 4.7099025546484074e-05, "loss": 1.9038, "step": 2207 }, { "epoch": 0.058151171977877274, "grad_norm": 2.1031877994537354, "learning_rate": 4.709770871740848e-05, "loss": 2.0104, "step": 2208 }, { "epoch": 0.05817750855938899, "grad_norm": 2.4106247425079346, "learning_rate": 4.70963918883329e-05, "loss": 2.0538, "step": 2209 }, { "epoch": 0.05820384514090071, "grad_norm": 1.9706648588180542, "learning_rate": 4.709507505925731e-05, "loss": 1.5769, "step": 2210 }, { "epoch": 0.05823018172241243, "grad_norm": 2.4568169116973877, "learning_rate": 4.709375823018172e-05, "loss": 0.731, "step": 2211 }, { "epoch": 0.05825651830392415, "grad_norm": 4.356393814086914, "learning_rate": 4.709244140110614e-05, "loss": 1.8031, "step": 2212 }, { "epoch": 0.05828285488543587, "grad_norm": 1.876869559288025, "learning_rate": 4.7091124572030554e-05, "loss": 1.6414, "step": 2213 }, { "epoch": 0.05830919146694759, "grad_norm": 2.18074631690979, "learning_rate": 4.708980774295497e-05, "loss": 2.3957, "step": 2214 }, { "epoch": 0.05833552804845931, "grad_norm": 2.849780321121216, "learning_rate": 
4.708849091387938e-05, "loss": 2.2857, "step": 2215 }, { "epoch": 0.05836186462997103, "grad_norm": 2.6972780227661133, "learning_rate": 4.7087174084803794e-05, "loss": 2.3248, "step": 2216 }, { "epoch": 0.05838820121148275, "grad_norm": 1.7781732082366943, "learning_rate": 4.708585725572821e-05, "loss": 1.1056, "step": 2217 }, { "epoch": 0.05841453779299447, "grad_norm": 2.488926649093628, "learning_rate": 4.7084540426652626e-05, "loss": 1.7901, "step": 2218 }, { "epoch": 0.05844087437450619, "grad_norm": 2.4412338733673096, "learning_rate": 4.7083223597577034e-05, "loss": 2.3414, "step": 2219 }, { "epoch": 0.05846721095601791, "grad_norm": 2.6597626209259033, "learning_rate": 4.708190676850145e-05, "loss": 1.9127, "step": 2220 }, { "epoch": 0.05849354753752963, "grad_norm": 3.2455780506134033, "learning_rate": 4.708058993942586e-05, "loss": 1.54, "step": 2221 }, { "epoch": 0.05851988411904135, "grad_norm": 1.9604851007461548, "learning_rate": 4.707927311035028e-05, "loss": 2.1296, "step": 2222 }, { "epoch": 0.05854622070055307, "grad_norm": 1.9545475244522095, "learning_rate": 4.707795628127469e-05, "loss": 1.8768, "step": 2223 }, { "epoch": 0.058572557282064786, "grad_norm": 4.185596466064453, "learning_rate": 4.7076639452199106e-05, "loss": 1.928, "step": 2224 }, { "epoch": 0.05859889386357651, "grad_norm": 2.5845375061035156, "learning_rate": 4.707532262312352e-05, "loss": 2.0505, "step": 2225 }, { "epoch": 0.05862523044508823, "grad_norm": 2.424807548522949, "learning_rate": 4.707400579404794e-05, "loss": 1.1109, "step": 2226 }, { "epoch": 0.058651567026599946, "grad_norm": 2.557514190673828, "learning_rate": 4.707268896497235e-05, "loss": 2.0236, "step": 2227 }, { "epoch": 0.05867790360811167, "grad_norm": 2.9080758094787598, "learning_rate": 4.707137213589676e-05, "loss": 1.4031, "step": 2228 }, { "epoch": 0.058704240189623384, "grad_norm": 3.306455135345459, "learning_rate": 4.707005530682118e-05, "loss": 0.3484, "step": 2229 }, { "epoch": 
0.05873057677113511, "grad_norm": 3.528730630874634, "learning_rate": 4.7068738477745586e-05, "loss": 1.573, "step": 2230 }, { "epoch": 0.05875691335264683, "grad_norm": 2.327526330947876, "learning_rate": 4.706742164867001e-05, "loss": 1.4864, "step": 2231 }, { "epoch": 0.058783249934158545, "grad_norm": 2.211899518966675, "learning_rate": 4.706610481959442e-05, "loss": 2.1208, "step": 2232 }, { "epoch": 0.05880958651567027, "grad_norm": 3.8888766765594482, "learning_rate": 4.706478799051883e-05, "loss": 1.3574, "step": 2233 }, { "epoch": 0.05883592309718198, "grad_norm": 1.8692330121994019, "learning_rate": 4.706347116144325e-05, "loss": 1.9271, "step": 2234 }, { "epoch": 0.058862259678693706, "grad_norm": 3.9838359355926514, "learning_rate": 4.706215433236766e-05, "loss": 2.0212, "step": 2235 }, { "epoch": 0.05888859626020543, "grad_norm": 2.807814598083496, "learning_rate": 4.706083750329208e-05, "loss": 1.8291, "step": 2236 }, { "epoch": 0.058914932841717144, "grad_norm": 2.5180716514587402, "learning_rate": 4.705952067421649e-05, "loss": 1.3584, "step": 2237 }, { "epoch": 0.058941269423228866, "grad_norm": 4.121596336364746, "learning_rate": 4.7058203845140904e-05, "loss": 2.1318, "step": 2238 }, { "epoch": 0.05896760600474058, "grad_norm": 2.9912428855895996, "learning_rate": 4.705688701606531e-05, "loss": 1.5819, "step": 2239 }, { "epoch": 0.058993942586252304, "grad_norm": 2.5563719272613525, "learning_rate": 4.7055570186989735e-05, "loss": 2.1908, "step": 2240 }, { "epoch": 0.05902027916776403, "grad_norm": 2.503976821899414, "learning_rate": 4.7054253357914144e-05, "loss": 2.4987, "step": 2241 }, { "epoch": 0.05904661574927574, "grad_norm": 1.8304951190948486, "learning_rate": 4.705293652883856e-05, "loss": 1.6273, "step": 2242 }, { "epoch": 0.059072952330787465, "grad_norm": 4.101771354675293, "learning_rate": 4.7051619699762975e-05, "loss": 1.5418, "step": 2243 }, { "epoch": 0.05909928891229918, "grad_norm": 2.231008529663086, "learning_rate": 
4.7050302870687384e-05, "loss": 1.3712, "step": 2244 }, { "epoch": 0.0591256254938109, "grad_norm": 2.1481590270996094, "learning_rate": 4.7048986041611806e-05, "loss": 1.4567, "step": 2245 }, { "epoch": 0.059151962075322626, "grad_norm": 2.145627975463867, "learning_rate": 4.7047669212536215e-05, "loss": 2.6714, "step": 2246 }, { "epoch": 0.05917829865683434, "grad_norm": 2.559617280960083, "learning_rate": 4.704635238346063e-05, "loss": 1.5486, "step": 2247 }, { "epoch": 0.059204635238346064, "grad_norm": 2.089916467666626, "learning_rate": 4.704503555438504e-05, "loss": 1.7973, "step": 2248 }, { "epoch": 0.05923097181985778, "grad_norm": 1.8132014274597168, "learning_rate": 4.7043718725309455e-05, "loss": 1.8544, "step": 2249 }, { "epoch": 0.0592573084013695, "grad_norm": 2.802475929260254, "learning_rate": 4.704240189623387e-05, "loss": 1.7482, "step": 2250 }, { "epoch": 0.059283644982881224, "grad_norm": 5.057201862335205, "learning_rate": 4.7041085067158286e-05, "loss": 0.9728, "step": 2251 }, { "epoch": 0.05930998156439294, "grad_norm": 2.865506410598755, "learning_rate": 4.70397682380827e-05, "loss": 2.0518, "step": 2252 }, { "epoch": 0.05933631814590466, "grad_norm": 2.898247003555298, "learning_rate": 4.703845140900711e-05, "loss": 1.0441, "step": 2253 }, { "epoch": 0.05936265472741638, "grad_norm": 2.4292194843292236, "learning_rate": 4.7037134579931526e-05, "loss": 0.4388, "step": 2254 }, { "epoch": 0.0593889913089281, "grad_norm": 8.985533714294434, "learning_rate": 4.703581775085594e-05, "loss": 1.8704, "step": 2255 }, { "epoch": 0.05941532789043982, "grad_norm": 2.1336731910705566, "learning_rate": 4.703450092178036e-05, "loss": 0.6001, "step": 2256 }, { "epoch": 0.05944166447195154, "grad_norm": 2.3788089752197266, "learning_rate": 4.7033184092704766e-05, "loss": 1.8919, "step": 2257 }, { "epoch": 0.05946800105346326, "grad_norm": 2.526772975921631, "learning_rate": 4.703186726362918e-05, "loss": 1.7069, "step": 2258 }, { "epoch": 
0.059494337634974984, "grad_norm": 2.207334280014038, "learning_rate": 4.70305504345536e-05, "loss": 1.4619, "step": 2259 }, { "epoch": 0.0595206742164867, "grad_norm": 3.2663016319274902, "learning_rate": 4.702923360547801e-05, "loss": 1.6889, "step": 2260 }, { "epoch": 0.05954701079799842, "grad_norm": 3.2809529304504395, "learning_rate": 4.702791677640243e-05, "loss": 2.2813, "step": 2261 }, { "epoch": 0.05957334737951014, "grad_norm": 3.178739547729492, "learning_rate": 4.702659994732684e-05, "loss": 1.358, "step": 2262 }, { "epoch": 0.05959968396102186, "grad_norm": 2.3345863819122314, "learning_rate": 4.702528311825125e-05, "loss": 2.1037, "step": 2263 }, { "epoch": 0.05962602054253358, "grad_norm": 4.884681224822998, "learning_rate": 4.702396628917567e-05, "loss": 0.7522, "step": 2264 }, { "epoch": 0.0596523571240453, "grad_norm": 3.6622402667999268, "learning_rate": 4.7022649460100084e-05, "loss": 1.4466, "step": 2265 }, { "epoch": 0.05967869370555702, "grad_norm": 2.3584275245666504, "learning_rate": 4.702133263102449e-05, "loss": 1.9784, "step": 2266 }, { "epoch": 0.059705030287068736, "grad_norm": 2.529788017272949, "learning_rate": 4.702001580194891e-05, "loss": 1.7532, "step": 2267 }, { "epoch": 0.05973136686858046, "grad_norm": 4.265054225921631, "learning_rate": 4.701869897287332e-05, "loss": 1.5829, "step": 2268 }, { "epoch": 0.05975770345009218, "grad_norm": 1.83254075050354, "learning_rate": 4.701738214379774e-05, "loss": 1.7224, "step": 2269 }, { "epoch": 0.0597840400316039, "grad_norm": 2.8131589889526367, "learning_rate": 4.701606531472215e-05, "loss": 1.5929, "step": 2270 }, { "epoch": 0.05981037661311562, "grad_norm": 3.6532633304595947, "learning_rate": 4.7014748485646564e-05, "loss": 1.2478, "step": 2271 }, { "epoch": 0.059836713194627335, "grad_norm": 2.2759995460510254, "learning_rate": 4.701343165657098e-05, "loss": 1.5491, "step": 2272 }, { "epoch": 0.05986304977613906, "grad_norm": 2.3043057918548584, "learning_rate": 
4.7012114827495395e-05, "loss": 1.8527, "step": 2273 }, { "epoch": 0.05988938635765078, "grad_norm": 2.1862733364105225, "learning_rate": 4.701079799841981e-05, "loss": 1.7428, "step": 2274 }, { "epoch": 0.059915722939162495, "grad_norm": 3.3454487323760986, "learning_rate": 4.700948116934422e-05, "loss": 1.7384, "step": 2275 }, { "epoch": 0.05994205952067422, "grad_norm": 3.5950281620025635, "learning_rate": 4.7008164340268635e-05, "loss": 1.4028, "step": 2276 }, { "epoch": 0.059968396102185934, "grad_norm": 4.540416717529297, "learning_rate": 4.7006847511193044e-05, "loss": 1.705, "step": 2277 }, { "epoch": 0.059994732683697656, "grad_norm": 2.2978692054748535, "learning_rate": 4.700553068211747e-05, "loss": 2.5032, "step": 2278 }, { "epoch": 0.06002106926520938, "grad_norm": 1.6918925046920776, "learning_rate": 4.7004213853041875e-05, "loss": 1.5561, "step": 2279 }, { "epoch": 0.060047405846721094, "grad_norm": 2.237168312072754, "learning_rate": 4.700289702396629e-05, "loss": 1.4813, "step": 2280 }, { "epoch": 0.06007374242823282, "grad_norm": 3.1560251712799072, "learning_rate": 4.700158019489071e-05, "loss": 2.2517, "step": 2281 }, { "epoch": 0.06010007900974453, "grad_norm": 3.299541473388672, "learning_rate": 4.7000263365815115e-05, "loss": 2.9007, "step": 2282 }, { "epoch": 0.060126415591256255, "grad_norm": 2.6424081325531006, "learning_rate": 4.699894653673954e-05, "loss": 1.9763, "step": 2283 }, { "epoch": 0.06015275217276798, "grad_norm": 2.60740065574646, "learning_rate": 4.699762970766395e-05, "loss": 2.1791, "step": 2284 }, { "epoch": 0.06017908875427969, "grad_norm": 3.41214656829834, "learning_rate": 4.699631287858836e-05, "loss": 1.596, "step": 2285 }, { "epoch": 0.060205425335791415, "grad_norm": 2.762927532196045, "learning_rate": 4.699499604951277e-05, "loss": 1.9012, "step": 2286 }, { "epoch": 0.06023176191730313, "grad_norm": 2.6899235248565674, "learning_rate": 4.699367922043719e-05, "loss": 1.2745, "step": 2287 }, { "epoch": 
0.060258098498814854, "grad_norm": 1.9002881050109863, "learning_rate": 4.69923623913616e-05, "loss": 1.752, "step": 2288 }, { "epoch": 0.060284435080326576, "grad_norm": 4.1252899169921875, "learning_rate": 4.699104556228602e-05, "loss": 1.3271, "step": 2289 }, { "epoch": 0.06031077166183829, "grad_norm": 2.194669723510742, "learning_rate": 4.6989728733210433e-05, "loss": 1.9517, "step": 2290 }, { "epoch": 0.060337108243350014, "grad_norm": 2.555772066116333, "learning_rate": 4.698841190413484e-05, "loss": 1.4758, "step": 2291 }, { "epoch": 0.06036344482486173, "grad_norm": 2.6042699813842773, "learning_rate": 4.6987095075059265e-05, "loss": 1.7974, "step": 2292 }, { "epoch": 0.06038978140637345, "grad_norm": 2.769167184829712, "learning_rate": 4.6985778245983673e-05, "loss": 1.5591, "step": 2293 }, { "epoch": 0.060416117987885175, "grad_norm": 5.088955402374268, "learning_rate": 4.698446141690809e-05, "loss": 1.0187, "step": 2294 }, { "epoch": 0.06044245456939689, "grad_norm": 2.3629088401794434, "learning_rate": 4.69831445878325e-05, "loss": 1.4315, "step": 2295 }, { "epoch": 0.06046879115090861, "grad_norm": 2.3658950328826904, "learning_rate": 4.6981827758756914e-05, "loss": 1.578, "step": 2296 }, { "epoch": 0.06049512773242033, "grad_norm": 1.7849318981170654, "learning_rate": 4.698051092968133e-05, "loss": 2.0656, "step": 2297 }, { "epoch": 0.06052146431393205, "grad_norm": 3.287752866744995, "learning_rate": 4.6979194100605745e-05, "loss": 1.6277, "step": 2298 }, { "epoch": 0.060547800895443774, "grad_norm": 2.5434162616729736, "learning_rate": 4.697787727153016e-05, "loss": 1.6112, "step": 2299 }, { "epoch": 0.06057413747695549, "grad_norm": 3.7894043922424316, "learning_rate": 4.697656044245457e-05, "loss": 2.4612, "step": 2300 }, { "epoch": 0.06060047405846721, "grad_norm": 2.033865451812744, "learning_rate": 4.6975243613378985e-05, "loss": 1.8314, "step": 2301 }, { "epoch": 0.060626810639978934, "grad_norm": 2.481954574584961, "learning_rate": 
4.69739267843034e-05, "loss": 1.2539, "step": 2302 }, { "epoch": 0.06065314722149065, "grad_norm": 3.2213282585144043, "learning_rate": 4.6972609955227816e-05, "loss": 0.858, "step": 2303 }, { "epoch": 0.06067948380300237, "grad_norm": 3.107637882232666, "learning_rate": 4.6971293126152225e-05, "loss": 1.984, "step": 2304 }, { "epoch": 0.06070582038451409, "grad_norm": 2.3892407417297363, "learning_rate": 4.696997629707664e-05, "loss": 1.4793, "step": 2305 }, { "epoch": 0.06073215696602581, "grad_norm": 2.00102162361145, "learning_rate": 4.6968659468001056e-05, "loss": 1.6171, "step": 2306 }, { "epoch": 0.06075849354753753, "grad_norm": 1.9140921831130981, "learning_rate": 4.696734263892547e-05, "loss": 1.5184, "step": 2307 }, { "epoch": 0.06078483012904925, "grad_norm": 2.6027801036834717, "learning_rate": 4.696602580984989e-05, "loss": 1.4163, "step": 2308 }, { "epoch": 0.06081116671056097, "grad_norm": 3.8211586475372314, "learning_rate": 4.6964708980774296e-05, "loss": 1.7563, "step": 2309 }, { "epoch": 0.06083750329207269, "grad_norm": 3.426889657974243, "learning_rate": 4.696339215169871e-05, "loss": 1.4254, "step": 2310 }, { "epoch": 0.06086383987358441, "grad_norm": 2.7382829189300537, "learning_rate": 4.696207532262313e-05, "loss": 1.942, "step": 2311 }, { "epoch": 0.06089017645509613, "grad_norm": 4.564896106719971, "learning_rate": 4.696075849354754e-05, "loss": 1.7901, "step": 2312 }, { "epoch": 0.06091651303660785, "grad_norm": 2.068636178970337, "learning_rate": 4.695944166447195e-05, "loss": 1.6466, "step": 2313 }, { "epoch": 0.06094284961811957, "grad_norm": 2.961036443710327, "learning_rate": 4.695812483539637e-05, "loss": 0.681, "step": 2314 }, { "epoch": 0.060969186199631285, "grad_norm": 2.3746185302734375, "learning_rate": 4.6956808006320776e-05, "loss": 1.4592, "step": 2315 }, { "epoch": 0.06099552278114301, "grad_norm": 2.8096494674682617, "learning_rate": 4.69554911772452e-05, "loss": 1.7898, "step": 2316 }, { "epoch": 0.06102185936265473, 
"grad_norm": 2.3663113117218018, "learning_rate": 4.6954174348169614e-05, "loss": 1.999, "step": 2317 }, { "epoch": 0.061048195944166446, "grad_norm": 2.512408971786499, "learning_rate": 4.695285751909402e-05, "loss": 2.1202, "step": 2318 }, { "epoch": 0.06107453252567817, "grad_norm": 2.44903302192688, "learning_rate": 4.695154069001844e-05, "loss": 2.0955, "step": 2319 }, { "epoch": 0.061100869107189884, "grad_norm": 5.071829319000244, "learning_rate": 4.6950223860942854e-05, "loss": 1.3134, "step": 2320 }, { "epoch": 0.06112720568870161, "grad_norm": 2.7499380111694336, "learning_rate": 4.694890703186727e-05, "loss": 2.114, "step": 2321 }, { "epoch": 0.06115354227021333, "grad_norm": 3.3781466484069824, "learning_rate": 4.694759020279168e-05, "loss": 1.6296, "step": 2322 }, { "epoch": 0.061179878851725045, "grad_norm": 1.9659886360168457, "learning_rate": 4.6946273373716094e-05, "loss": 2.3236, "step": 2323 }, { "epoch": 0.06120621543323677, "grad_norm": 2.6231226921081543, "learning_rate": 4.69449565446405e-05, "loss": 2.2305, "step": 2324 }, { "epoch": 0.06123255201474848, "grad_norm": 2.6958329677581787, "learning_rate": 4.6943639715564925e-05, "loss": 1.122, "step": 2325 }, { "epoch": 0.061258888596260205, "grad_norm": 1.8875097036361694, "learning_rate": 4.6942322886489334e-05, "loss": 1.9176, "step": 2326 }, { "epoch": 0.06128522517777193, "grad_norm": 2.3418688774108887, "learning_rate": 4.694100605741375e-05, "loss": 1.4749, "step": 2327 }, { "epoch": 0.06131156175928364, "grad_norm": 4.613044261932373, "learning_rate": 4.6939689228338165e-05, "loss": 0.6705, "step": 2328 }, { "epoch": 0.061337898340795366, "grad_norm": 2.5555670261383057, "learning_rate": 4.6938372399262574e-05, "loss": 1.8351, "step": 2329 }, { "epoch": 0.06136423492230708, "grad_norm": 2.5072150230407715, "learning_rate": 4.6937055570186996e-05, "loss": 1.9891, "step": 2330 }, { "epoch": 0.061390571503818804, "grad_norm": 2.2406907081604004, "learning_rate": 4.6935738741111405e-05, 
"loss": 1.8428, "step": 2331 }, { "epoch": 0.06141690808533053, "grad_norm": 2.5359418392181396, "learning_rate": 4.693442191203582e-05, "loss": 1.0713, "step": 2332 }, { "epoch": 0.06144324466684224, "grad_norm": 3.020839214324951, "learning_rate": 4.693310508296023e-05, "loss": 1.6319, "step": 2333 }, { "epoch": 0.061469581248353965, "grad_norm": 2.4607772827148438, "learning_rate": 4.6931788253884645e-05, "loss": 2.2703, "step": 2334 }, { "epoch": 0.06149591782986568, "grad_norm": 2.916142463684082, "learning_rate": 4.693047142480906e-05, "loss": 1.6846, "step": 2335 }, { "epoch": 0.0615222544113774, "grad_norm": 2.408010721206665, "learning_rate": 4.6929154595733476e-05, "loss": 1.0948, "step": 2336 }, { "epoch": 0.061548590992889125, "grad_norm": 2.9166905879974365, "learning_rate": 4.692783776665789e-05, "loss": 1.607, "step": 2337 }, { "epoch": 0.06157492757440084, "grad_norm": 5.828845024108887, "learning_rate": 4.69265209375823e-05, "loss": 1.6196, "step": 2338 }, { "epoch": 0.06160126415591256, "grad_norm": 4.621794700622559, "learning_rate": 4.692520410850672e-05, "loss": 1.149, "step": 2339 }, { "epoch": 0.06162760073742428, "grad_norm": 3.325955390930176, "learning_rate": 4.692388727943113e-05, "loss": 1.0291, "step": 2340 }, { "epoch": 0.061653937318936, "grad_norm": 2.8587429523468018, "learning_rate": 4.692257045035555e-05, "loss": 2.1007, "step": 2341 }, { "epoch": 0.061680273900447724, "grad_norm": 3.480433464050293, "learning_rate": 4.6921253621279956e-05, "loss": 2.3088, "step": 2342 }, { "epoch": 0.06170661048195944, "grad_norm": 1.83161199092865, "learning_rate": 4.691993679220437e-05, "loss": 1.5615, "step": 2343 }, { "epoch": 0.06173294706347116, "grad_norm": 3.0452425479888916, "learning_rate": 4.691861996312879e-05, "loss": 1.9265, "step": 2344 }, { "epoch": 0.06175928364498288, "grad_norm": 3.0918824672698975, "learning_rate": 4.69173031340532e-05, "loss": 0.9461, "step": 2345 }, { "epoch": 0.0617856202264946, "grad_norm": 
1.8512251377105713, "learning_rate": 4.691598630497762e-05, "loss": 1.2078, "step": 2346 }, { "epoch": 0.06181195680800632, "grad_norm": 3.598766326904297, "learning_rate": 4.691466947590203e-05, "loss": 1.4373, "step": 2347 }, { "epoch": 0.06183829338951804, "grad_norm": 2.05191707611084, "learning_rate": 4.691335264682644e-05, "loss": 1.6134, "step": 2348 }, { "epoch": 0.06186462997102976, "grad_norm": 2.3048503398895264, "learning_rate": 4.691203581775086e-05, "loss": 1.6072, "step": 2349 }, { "epoch": 0.06189096655254148, "grad_norm": 2.4440760612487793, "learning_rate": 4.6910718988675274e-05, "loss": 1.5599, "step": 2350 }, { "epoch": 0.0619173031340532, "grad_norm": 2.9048287868499756, "learning_rate": 4.690940215959968e-05, "loss": 2.4745, "step": 2351 }, { "epoch": 0.06194363971556492, "grad_norm": 3.846914529800415, "learning_rate": 4.69080853305241e-05, "loss": 0.8357, "step": 2352 }, { "epoch": 0.06196997629707664, "grad_norm": 2.4925453662872314, "learning_rate": 4.6906768501448514e-05, "loss": 2.2837, "step": 2353 }, { "epoch": 0.06199631287858836, "grad_norm": 8.011929512023926, "learning_rate": 4.690545167237293e-05, "loss": 1.6929, "step": 2354 }, { "epoch": 0.06202264946010008, "grad_norm": 2.592222213745117, "learning_rate": 4.6904134843297346e-05, "loss": 1.5417, "step": 2355 }, { "epoch": 0.0620489860416118, "grad_norm": 4.365649700164795, "learning_rate": 4.6902818014221755e-05, "loss": 1.6431, "step": 2356 }, { "epoch": 0.06207532262312352, "grad_norm": 2.6333377361297607, "learning_rate": 4.690150118514617e-05, "loss": 1.4739, "step": 2357 }, { "epoch": 0.062101659204635236, "grad_norm": 2.52740740776062, "learning_rate": 4.6900184356070586e-05, "loss": 2.8303, "step": 2358 }, { "epoch": 0.06212799578614696, "grad_norm": 2.5063893795013428, "learning_rate": 4.6898867526995e-05, "loss": 1.8748, "step": 2359 }, { "epoch": 0.06215433236765868, "grad_norm": 2.152168035507202, "learning_rate": 4.689755069791941e-05, "loss": 1.6331, "step": 2360 
}, { "epoch": 0.062180668949170396, "grad_norm": 2.199742317199707, "learning_rate": 4.6896233868843826e-05, "loss": 2.1367, "step": 2361 }, { "epoch": 0.06220700553068212, "grad_norm": 6.430613994598389, "learning_rate": 4.689491703976824e-05, "loss": 1.0867, "step": 2362 }, { "epoch": 0.062233342112193835, "grad_norm": 2.8499956130981445, "learning_rate": 4.689360021069266e-05, "loss": 1.3986, "step": 2363 }, { "epoch": 0.06225967869370556, "grad_norm": 1.7732125520706177, "learning_rate": 4.689228338161707e-05, "loss": 1.6242, "step": 2364 }, { "epoch": 0.06228601527521728, "grad_norm": 2.204768657684326, "learning_rate": 4.689096655254148e-05, "loss": 2.108, "step": 2365 }, { "epoch": 0.062312351856728995, "grad_norm": 3.954988956451416, "learning_rate": 4.68896497234659e-05, "loss": 1.5833, "step": 2366 }, { "epoch": 0.06233868843824072, "grad_norm": 2.904905319213867, "learning_rate": 4.6888332894390306e-05, "loss": 2.2774, "step": 2367 }, { "epoch": 0.06236502501975243, "grad_norm": 1.716963291168213, "learning_rate": 4.688701606531473e-05, "loss": 1.7282, "step": 2368 }, { "epoch": 0.062391361601264156, "grad_norm": 3.9335405826568604, "learning_rate": 4.688569923623914e-05, "loss": 2.0746, "step": 2369 }, { "epoch": 0.06241769818277588, "grad_norm": 2.6345374584198, "learning_rate": 4.688438240716355e-05, "loss": 1.9933, "step": 2370 }, { "epoch": 0.062444034764287594, "grad_norm": 2.059537887573242, "learning_rate": 4.688306557808796e-05, "loss": 1.6789, "step": 2371 }, { "epoch": 0.062470371345799316, "grad_norm": 3.3977279663085938, "learning_rate": 4.6881748749012384e-05, "loss": 1.5188, "step": 2372 }, { "epoch": 0.06249670792731103, "grad_norm": 2.193197011947632, "learning_rate": 4.688043191993679e-05, "loss": 1.9147, "step": 2373 }, { "epoch": 0.06252304450882276, "grad_norm": 2.059917449951172, "learning_rate": 4.687911509086121e-05, "loss": 2.5271, "step": 2374 }, { "epoch": 0.06254938109033448, "grad_norm": 2.6211464405059814, "learning_rate": 
4.6877798261785624e-05, "loss": 1.31, "step": 2375 }, { "epoch": 0.06257571767184619, "grad_norm": 5.8871169090271, "learning_rate": 4.687648143271003e-05, "loss": 0.9713, "step": 2376 }, { "epoch": 0.06260205425335791, "grad_norm": 4.014262676239014, "learning_rate": 4.6875164603634455e-05, "loss": 1.2031, "step": 2377 }, { "epoch": 0.06262839083486964, "grad_norm": 2.2473819255828857, "learning_rate": 4.6873847774558864e-05, "loss": 2.0133, "step": 2378 }, { "epoch": 0.06265472741638135, "grad_norm": 5.412081718444824, "learning_rate": 4.687253094548328e-05, "loss": 1.2997, "step": 2379 }, { "epoch": 0.06268106399789307, "grad_norm": 2.1226892471313477, "learning_rate": 4.687121411640769e-05, "loss": 1.6275, "step": 2380 }, { "epoch": 0.0627074005794048, "grad_norm": 3.127661943435669, "learning_rate": 4.6869897287332104e-05, "loss": 1.4595, "step": 2381 }, { "epoch": 0.06273373716091651, "grad_norm": 2.2085540294647217, "learning_rate": 4.686858045825652e-05, "loss": 1.445, "step": 2382 }, { "epoch": 0.06276007374242823, "grad_norm": 2.1729025840759277, "learning_rate": 4.6867263629180935e-05, "loss": 1.5951, "step": 2383 }, { "epoch": 0.06278641032393996, "grad_norm": 1.9573615789413452, "learning_rate": 4.686594680010535e-05, "loss": 1.9353, "step": 2384 }, { "epoch": 0.06281274690545167, "grad_norm": 2.1518776416778564, "learning_rate": 4.686462997102976e-05, "loss": 2.1263, "step": 2385 }, { "epoch": 0.06283908348696339, "grad_norm": 2.1036746501922607, "learning_rate": 4.686331314195418e-05, "loss": 1.6969, "step": 2386 }, { "epoch": 0.0628654200684751, "grad_norm": 2.245342493057251, "learning_rate": 4.686199631287859e-05, "loss": 1.7591, "step": 2387 }, { "epoch": 0.06289175664998684, "grad_norm": 2.0827269554138184, "learning_rate": 4.6860679483803006e-05, "loss": 0.5328, "step": 2388 }, { "epoch": 0.06291809323149855, "grad_norm": 2.7235958576202393, "learning_rate": 4.6859362654727415e-05, "loss": 1.3614, "step": 2389 }, { "epoch": 0.06294442981301027, 
"grad_norm": 7.018543243408203, "learning_rate": 4.685804582565183e-05, "loss": 1.3847, "step": 2390 }, { "epoch": 0.062970766394522, "grad_norm": 5.000580310821533, "learning_rate": 4.6856728996576246e-05, "loss": 1.9968, "step": 2391 }, { "epoch": 0.06299710297603371, "grad_norm": 5.311912536621094, "learning_rate": 4.685541216750066e-05, "loss": 1.8817, "step": 2392 }, { "epoch": 0.06302343955754543, "grad_norm": 3.7038862705230713, "learning_rate": 4.685409533842508e-05, "loss": 1.4629, "step": 2393 }, { "epoch": 0.06304977613905716, "grad_norm": 2.9747352600097656, "learning_rate": 4.6852778509349486e-05, "loss": 1.8721, "step": 2394 }, { "epoch": 0.06307611272056887, "grad_norm": 1.9817137718200684, "learning_rate": 4.68514616802739e-05, "loss": 1.8767, "step": 2395 }, { "epoch": 0.06310244930208059, "grad_norm": 2.0077948570251465, "learning_rate": 4.685014485119832e-05, "loss": 2.0026, "step": 2396 }, { "epoch": 0.0631287858835923, "grad_norm": 2.7527992725372314, "learning_rate": 4.684882802212273e-05, "loss": 1.8603, "step": 2397 }, { "epoch": 0.06315512246510403, "grad_norm": 4.314809799194336, "learning_rate": 4.684751119304714e-05, "loss": 0.5443, "step": 2398 }, { "epoch": 0.06318145904661575, "grad_norm": 3.1604745388031006, "learning_rate": 4.684619436397156e-05, "loss": 1.3714, "step": 2399 }, { "epoch": 0.06320779562812746, "grad_norm": 2.6723835468292236, "learning_rate": 4.684487753489597e-05, "loss": 0.4497, "step": 2400 }, { "epoch": 0.0632341322096392, "grad_norm": 2.9711673259735107, "learning_rate": 4.684356070582039e-05, "loss": 1.8005, "step": 2401 }, { "epoch": 0.06326046879115091, "grad_norm": 3.8346498012542725, "learning_rate": 4.6842243876744804e-05, "loss": 1.554, "step": 2402 }, { "epoch": 0.06328680537266262, "grad_norm": 2.6312010288238525, "learning_rate": 4.684092704766921e-05, "loss": 1.906, "step": 2403 }, { "epoch": 0.06331314195417435, "grad_norm": 2.83207368850708, "learning_rate": 4.683961021859363e-05, "loss": 2.0679, 
"step": 2404 }, { "epoch": 0.06333947853568607, "grad_norm": 7.46080207824707, "learning_rate": 4.6838293389518044e-05, "loss": 1.9119, "step": 2405 }, { "epoch": 0.06336581511719779, "grad_norm": 3.579887866973877, "learning_rate": 4.683697656044246e-05, "loss": 0.4994, "step": 2406 }, { "epoch": 0.0633921516987095, "grad_norm": 2.598342180252075, "learning_rate": 4.683565973136687e-05, "loss": 0.7125, "step": 2407 }, { "epoch": 0.06341848828022123, "grad_norm": 4.512175559997559, "learning_rate": 4.6834342902291284e-05, "loss": 1.9072, "step": 2408 }, { "epoch": 0.06344482486173295, "grad_norm": 5.32249641418457, "learning_rate": 4.68330260732157e-05, "loss": 2.4061, "step": 2409 }, { "epoch": 0.06347116144324466, "grad_norm": 3.2500739097595215, "learning_rate": 4.6831709244140115e-05, "loss": 1.7072, "step": 2410 }, { "epoch": 0.06349749802475639, "grad_norm": 1.9146647453308105, "learning_rate": 4.683039241506453e-05, "loss": 2.4875, "step": 2411 }, { "epoch": 0.0635238346062681, "grad_norm": 4.5336198806762695, "learning_rate": 4.682907558598894e-05, "loss": 0.5381, "step": 2412 }, { "epoch": 0.06355017118777982, "grad_norm": 2.675198793411255, "learning_rate": 4.6827758756913355e-05, "loss": 2.0764, "step": 2413 }, { "epoch": 0.06357650776929155, "grad_norm": 2.0779879093170166, "learning_rate": 4.6826441927837764e-05, "loss": 1.3311, "step": 2414 }, { "epoch": 0.06360284435080327, "grad_norm": 8.918055534362793, "learning_rate": 4.682512509876219e-05, "loss": 1.4941, "step": 2415 }, { "epoch": 0.06362918093231498, "grad_norm": 2.119518756866455, "learning_rate": 4.6823808269686596e-05, "loss": 1.5165, "step": 2416 }, { "epoch": 0.06365551751382671, "grad_norm": 4.731598377227783, "learning_rate": 4.682249144061101e-05, "loss": 1.2126, "step": 2417 }, { "epoch": 0.06368185409533843, "grad_norm": 2.4830589294433594, "learning_rate": 4.682117461153542e-05, "loss": 1.4372, "step": 2418 }, { "epoch": 0.06370819067685014, "grad_norm": 1.9942201375961304, 
"learning_rate": 4.681985778245984e-05, "loss": 1.8839, "step": 2419 }, { "epoch": 0.06373452725836186, "grad_norm": 2.2299985885620117, "learning_rate": 4.681854095338425e-05, "loss": 1.9067, "step": 2420 }, { "epoch": 0.06376086383987359, "grad_norm": 3.289201021194458, "learning_rate": 4.681722412430867e-05, "loss": 1.9577, "step": 2421 }, { "epoch": 0.0637872004213853, "grad_norm": 5.261205196380615, "learning_rate": 4.681590729523308e-05, "loss": 2.384, "step": 2422 }, { "epoch": 0.06381353700289702, "grad_norm": 2.5124638080596924, "learning_rate": 4.681459046615749e-05, "loss": 1.8541, "step": 2423 }, { "epoch": 0.06383987358440875, "grad_norm": 1.994523286819458, "learning_rate": 4.6813273637081914e-05, "loss": 1.584, "step": 2424 }, { "epoch": 0.06386621016592046, "grad_norm": 1.8511320352554321, "learning_rate": 4.681195680800632e-05, "loss": 1.5149, "step": 2425 }, { "epoch": 0.06389254674743218, "grad_norm": 2.3349974155426025, "learning_rate": 4.681063997893074e-05, "loss": 2.6602, "step": 2426 }, { "epoch": 0.06391888332894391, "grad_norm": 3.3603076934814453, "learning_rate": 4.680932314985515e-05, "loss": 1.4353, "step": 2427 }, { "epoch": 0.06394521991045563, "grad_norm": 2.5830607414245605, "learning_rate": 4.680800632077956e-05, "loss": 0.5467, "step": 2428 }, { "epoch": 0.06397155649196734, "grad_norm": 1.9991215467453003, "learning_rate": 4.680668949170398e-05, "loss": 2.1647, "step": 2429 }, { "epoch": 0.06399789307347906, "grad_norm": 4.537223815917969, "learning_rate": 4.6805372662628394e-05, "loss": 0.7771, "step": 2430 }, { "epoch": 0.06402422965499079, "grad_norm": 2.468398094177246, "learning_rate": 4.680405583355281e-05, "loss": 1.8903, "step": 2431 }, { "epoch": 0.0640505662365025, "grad_norm": 2.197237968444824, "learning_rate": 4.680273900447722e-05, "loss": 2.1243, "step": 2432 }, { "epoch": 0.06407690281801422, "grad_norm": 3.1796863079071045, "learning_rate": 4.6801422175401634e-05, "loss": 0.3634, "step": 2433 }, { "epoch": 
0.06410323939952595, "grad_norm": 3.5290427207946777, "learning_rate": 4.680010534632605e-05, "loss": 1.8702, "step": 2434 }, { "epoch": 0.06412957598103766, "grad_norm": 2.695802927017212, "learning_rate": 4.6798788517250465e-05, "loss": 2.3979, "step": 2435 }, { "epoch": 0.06415591256254938, "grad_norm": 3.778914213180542, "learning_rate": 4.6797471688174874e-05, "loss": 2.0953, "step": 2436 }, { "epoch": 0.06418224914406111, "grad_norm": 2.339081048965454, "learning_rate": 4.679615485909929e-05, "loss": 1.6206, "step": 2437 }, { "epoch": 0.06420858572557282, "grad_norm": 2.2991528511047363, "learning_rate": 4.6794838030023705e-05, "loss": 2.1085, "step": 2438 }, { "epoch": 0.06423492230708454, "grad_norm": 3.41226863861084, "learning_rate": 4.679352120094812e-05, "loss": 1.7824, "step": 2439 }, { "epoch": 0.06426125888859625, "grad_norm": 3.640415668487549, "learning_rate": 4.6792204371872536e-05, "loss": 1.7638, "step": 2440 }, { "epoch": 0.06428759547010798, "grad_norm": 2.1721041202545166, "learning_rate": 4.6790887542796945e-05, "loss": 1.9222, "step": 2441 }, { "epoch": 0.0643139320516197, "grad_norm": 3.03312611579895, "learning_rate": 4.678957071372136e-05, "loss": 1.763, "step": 2442 }, { "epoch": 0.06434026863313141, "grad_norm": 2.873274803161621, "learning_rate": 4.6788253884645776e-05, "loss": 1.17, "step": 2443 }, { "epoch": 0.06436660521464314, "grad_norm": 1.8585783243179321, "learning_rate": 4.678693705557019e-05, "loss": 2.0744, "step": 2444 }, { "epoch": 0.06439294179615486, "grad_norm": 2.5648512840270996, "learning_rate": 4.67856202264946e-05, "loss": 1.8095, "step": 2445 }, { "epoch": 0.06441927837766658, "grad_norm": 3.6596877574920654, "learning_rate": 4.6784303397419016e-05, "loss": 1.8555, "step": 2446 }, { "epoch": 0.0644456149591783, "grad_norm": 4.499950885772705, "learning_rate": 4.678298656834343e-05, "loss": 2.3845, "step": 2447 }, { "epoch": 0.06447195154069002, "grad_norm": 1.8886054754257202, "learning_rate": 
4.678166973926785e-05, "loss": 1.4694, "step": 2448 }, { "epoch": 0.06449828812220174, "grad_norm": 3.0235955715179443, "learning_rate": 4.678035291019226e-05, "loss": 1.8034, "step": 2449 }, { "epoch": 0.06452462470371345, "grad_norm": 4.964101314544678, "learning_rate": 4.677903608111667e-05, "loss": 1.4596, "step": 2450 }, { "epoch": 0.06455096128522518, "grad_norm": 3.664073944091797, "learning_rate": 4.677771925204109e-05, "loss": 0.5822, "step": 2451 }, { "epoch": 0.0645772978667369, "grad_norm": 4.64607572555542, "learning_rate": 4.67764024229655e-05, "loss": 1.8566, "step": 2452 }, { "epoch": 0.06460363444824861, "grad_norm": 2.4599313735961914, "learning_rate": 4.677508559388992e-05, "loss": 2.019, "step": 2453 }, { "epoch": 0.06462997102976034, "grad_norm": 2.585855722427368, "learning_rate": 4.677376876481433e-05, "loss": 1.8816, "step": 2454 }, { "epoch": 0.06465630761127206, "grad_norm": 4.79041862487793, "learning_rate": 4.677245193573874e-05, "loss": 1.0569, "step": 2455 }, { "epoch": 0.06468264419278377, "grad_norm": 7.284574508666992, "learning_rate": 4.677113510666316e-05, "loss": 1.7071, "step": 2456 }, { "epoch": 0.0647089807742955, "grad_norm": 2.1668169498443604, "learning_rate": 4.6769818277587574e-05, "loss": 1.3356, "step": 2457 }, { "epoch": 0.06473531735580722, "grad_norm": 1.8819416761398315, "learning_rate": 4.676850144851199e-05, "loss": 1.8233, "step": 2458 }, { "epoch": 0.06476165393731893, "grad_norm": 3.402418613433838, "learning_rate": 4.67671846194364e-05, "loss": 1.613, "step": 2459 }, { "epoch": 0.06478799051883066, "grad_norm": 2.380220890045166, "learning_rate": 4.6765867790360814e-05, "loss": 1.8139, "step": 2460 }, { "epoch": 0.06481432710034238, "grad_norm": 3.6850509643554688, "learning_rate": 4.676455096128522e-05, "loss": 1.77, "step": 2461 }, { "epoch": 0.0648406636818541, "grad_norm": 2.4299466609954834, "learning_rate": 4.6763234132209645e-05, "loss": 1.8125, "step": 2462 }, { "epoch": 0.06486700026336581, 
"grad_norm": 1.9561840295791626, "learning_rate": 4.6761917303134054e-05, "loss": 2.0749, "step": 2463 }, { "epoch": 0.06489333684487754, "grad_norm": 4.448574542999268, "learning_rate": 4.676060047405847e-05, "loss": 1.0838, "step": 2464 }, { "epoch": 0.06491967342638925, "grad_norm": 3.413048028945923, "learning_rate": 4.6759283644982885e-05, "loss": 0.5598, "step": 2465 }, { "epoch": 0.06494601000790097, "grad_norm": 2.1648099422454834, "learning_rate": 4.6757966815907294e-05, "loss": 2.1142, "step": 2466 }, { "epoch": 0.0649723465894127, "grad_norm": 4.810667991638184, "learning_rate": 4.6756649986831716e-05, "loss": 2.2293, "step": 2467 }, { "epoch": 0.06499868317092442, "grad_norm": 3.0840086936950684, "learning_rate": 4.6755333157756125e-05, "loss": 1.5708, "step": 2468 }, { "epoch": 0.06502501975243613, "grad_norm": 4.743466377258301, "learning_rate": 4.675401632868054e-05, "loss": 1.9965, "step": 2469 }, { "epoch": 0.06505135633394786, "grad_norm": 2.19016695022583, "learning_rate": 4.675269949960495e-05, "loss": 2.3954, "step": 2470 }, { "epoch": 0.06507769291545958, "grad_norm": 3.0504465103149414, "learning_rate": 4.675138267052937e-05, "loss": 1.7875, "step": 2471 }, { "epoch": 0.06510402949697129, "grad_norm": 2.3120687007904053, "learning_rate": 4.675006584145378e-05, "loss": 1.3664, "step": 2472 }, { "epoch": 0.065130366078483, "grad_norm": 3.733536958694458, "learning_rate": 4.6748749012378196e-05, "loss": 1.4547, "step": 2473 }, { "epoch": 0.06515670265999474, "grad_norm": 2.8666889667510986, "learning_rate": 4.6747432183302605e-05, "loss": 1.45, "step": 2474 }, { "epoch": 0.06518303924150645, "grad_norm": 1.8116925954818726, "learning_rate": 4.674611535422702e-05, "loss": 1.8247, "step": 2475 }, { "epoch": 0.06520937582301817, "grad_norm": 2.427353858947754, "learning_rate": 4.6744798525151436e-05, "loss": 0.5902, "step": 2476 }, { "epoch": 0.0652357124045299, "grad_norm": 3.56376314163208, "learning_rate": 4.674348169607585e-05, "loss": 1.092, 
"step": 2477 }, { "epoch": 0.06526204898604161, "grad_norm": 4.646085739135742, "learning_rate": 4.674216486700027e-05, "loss": 1.2398, "step": 2478 }, { "epoch": 0.06528838556755333, "grad_norm": 5.491124153137207, "learning_rate": 4.6740848037924677e-05, "loss": 1.3864, "step": 2479 }, { "epoch": 0.06531472214906506, "grad_norm": 3.0339837074279785, "learning_rate": 4.673953120884909e-05, "loss": 2.0601, "step": 2480 }, { "epoch": 0.06534105873057677, "grad_norm": 3.239531993865967, "learning_rate": 4.673821437977351e-05, "loss": 0.4445, "step": 2481 }, { "epoch": 0.06536739531208849, "grad_norm": 2.132255792617798, "learning_rate": 4.673689755069792e-05, "loss": 1.8743, "step": 2482 }, { "epoch": 0.0653937318936002, "grad_norm": 1.9474587440490723, "learning_rate": 4.673558072162233e-05, "loss": 2.0281, "step": 2483 }, { "epoch": 0.06542006847511193, "grad_norm": 2.3764939308166504, "learning_rate": 4.673426389254675e-05, "loss": 1.3453, "step": 2484 }, { "epoch": 0.06544640505662365, "grad_norm": 7.971607208251953, "learning_rate": 4.673294706347116e-05, "loss": 2.0164, "step": 2485 }, { "epoch": 0.06547274163813536, "grad_norm": 4.840243339538574, "learning_rate": 4.673163023439558e-05, "loss": 1.0984, "step": 2486 }, { "epoch": 0.0654990782196471, "grad_norm": 5.740271091461182, "learning_rate": 4.6730313405319995e-05, "loss": 0.5724, "step": 2487 }, { "epoch": 0.06552541480115881, "grad_norm": 2.0132150650024414, "learning_rate": 4.67289965762444e-05, "loss": 1.471, "step": 2488 }, { "epoch": 0.06555175138267053, "grad_norm": 3.446458578109741, "learning_rate": 4.672767974716882e-05, "loss": 1.7057, "step": 2489 }, { "epoch": 0.06557808796418226, "grad_norm": 2.107990264892578, "learning_rate": 4.6726362918093235e-05, "loss": 1.2397, "step": 2490 }, { "epoch": 0.06560442454569397, "grad_norm": 6.227726459503174, "learning_rate": 4.672504608901765e-05, "loss": 1.8721, "step": 2491 }, { "epoch": 0.06563076112720569, "grad_norm": 3.4746124744415283, 
"learning_rate": 4.672372925994206e-05, "loss": 2.0732, "step": 2492 }, { "epoch": 0.0656570977087174, "grad_norm": 3.332350254058838, "learning_rate": 4.6722412430866475e-05, "loss": 1.6787, "step": 2493 }, { "epoch": 0.06568343429022913, "grad_norm": 2.8294148445129395, "learning_rate": 4.672109560179089e-05, "loss": 1.6272, "step": 2494 }, { "epoch": 0.06570977087174085, "grad_norm": 3.3581392765045166, "learning_rate": 4.6719778772715306e-05, "loss": 2.2576, "step": 2495 }, { "epoch": 0.06573610745325256, "grad_norm": 4.122766494750977, "learning_rate": 4.671846194363972e-05, "loss": 1.5237, "step": 2496 }, { "epoch": 0.06576244403476429, "grad_norm": 2.0318691730499268, "learning_rate": 4.671714511456413e-05, "loss": 1.8989, "step": 2497 }, { "epoch": 0.06578878061627601, "grad_norm": 2.39886474609375, "learning_rate": 4.6715828285488546e-05, "loss": 1.6113, "step": 2498 }, { "epoch": 0.06581511719778772, "grad_norm": 2.074373960494995, "learning_rate": 4.6714511456412955e-05, "loss": 1.0614, "step": 2499 }, { "epoch": 0.06584145377929945, "grad_norm": 1.8505825996398926, "learning_rate": 4.671319462733738e-05, "loss": 0.5445, "step": 2500 }, { "epoch": 0.06586779036081117, "grad_norm": 2.2921855449676514, "learning_rate": 4.6711877798261786e-05, "loss": 1.5375, "step": 2501 }, { "epoch": 0.06589412694232288, "grad_norm": 1.9849905967712402, "learning_rate": 4.67105609691862e-05, "loss": 1.5585, "step": 2502 }, { "epoch": 0.06592046352383461, "grad_norm": 2.228806734085083, "learning_rate": 4.670924414011062e-05, "loss": 1.8496, "step": 2503 }, { "epoch": 0.06594680010534633, "grad_norm": 3.650649070739746, "learning_rate": 4.670792731103503e-05, "loss": 0.5406, "step": 2504 }, { "epoch": 0.06597313668685804, "grad_norm": 2.1462583541870117, "learning_rate": 4.670661048195945e-05, "loss": 1.9369, "step": 2505 }, { "epoch": 0.06599947326836976, "grad_norm": 2.357234239578247, "learning_rate": 4.670529365288386e-05, "loss": 2.3181, "step": 2506 }, { "epoch": 
0.06602580984988149, "grad_norm": 3.277623414993286, "learning_rate": 4.670397682380827e-05, "loss": 2.0376, "step": 2507 }, { "epoch": 0.0660521464313932, "grad_norm": 2.1119625568389893, "learning_rate": 4.670265999473268e-05, "loss": 1.1413, "step": 2508 }, { "epoch": 0.06607848301290492, "grad_norm": 5.124713897705078, "learning_rate": 4.6701343165657104e-05, "loss": 1.632, "step": 2509 }, { "epoch": 0.06610481959441665, "grad_norm": 1.9271942377090454, "learning_rate": 4.670002633658151e-05, "loss": 1.2184, "step": 2510 }, { "epoch": 0.06613115617592837, "grad_norm": 2.0628840923309326, "learning_rate": 4.669870950750593e-05, "loss": 1.9178, "step": 2511 }, { "epoch": 0.06615749275744008, "grad_norm": 1.8928604125976562, "learning_rate": 4.6697392678430344e-05, "loss": 2.0566, "step": 2512 }, { "epoch": 0.06618382933895181, "grad_norm": 1.9438021183013916, "learning_rate": 4.669607584935475e-05, "loss": 2.5018, "step": 2513 }, { "epoch": 0.06621016592046353, "grad_norm": 2.5813844203948975, "learning_rate": 4.6694759020279175e-05, "loss": 2.4215, "step": 2514 }, { "epoch": 0.06623650250197524, "grad_norm": 1.6554195880889893, "learning_rate": 4.6693442191203584e-05, "loss": 2.1316, "step": 2515 }, { "epoch": 0.06626283908348696, "grad_norm": 2.4325571060180664, "learning_rate": 4.6692125362128e-05, "loss": 1.8311, "step": 2516 }, { "epoch": 0.06628917566499869, "grad_norm": 4.826909065246582, "learning_rate": 4.669080853305241e-05, "loss": 1.0181, "step": 2517 }, { "epoch": 0.0663155122465104, "grad_norm": 5.109097480773926, "learning_rate": 4.668949170397683e-05, "loss": 2.0126, "step": 2518 }, { "epoch": 0.06634184882802212, "grad_norm": 3.1182119846343994, "learning_rate": 4.668817487490124e-05, "loss": 1.213, "step": 2519 }, { "epoch": 0.06636818540953385, "grad_norm": 1.836631417274475, "learning_rate": 4.6686858045825655e-05, "loss": 2.2042, "step": 2520 }, { "epoch": 0.06639452199104556, "grad_norm": 4.009081840515137, "learning_rate": 
4.6685541216750064e-05, "loss": 2.114, "step": 2521 }, { "epoch": 0.06642085857255728, "grad_norm": 6.382277488708496, "learning_rate": 4.668422438767448e-05, "loss": 2.1955, "step": 2522 }, { "epoch": 0.06644719515406901, "grad_norm": 3.5269582271575928, "learning_rate": 4.6682907558598895e-05, "loss": 0.5381, "step": 2523 }, { "epoch": 0.06647353173558072, "grad_norm": 2.0849244594573975, "learning_rate": 4.668159072952331e-05, "loss": 0.4977, "step": 2524 }, { "epoch": 0.06649986831709244, "grad_norm": 1.678020715713501, "learning_rate": 4.6680273900447726e-05, "loss": 1.4492, "step": 2525 }, { "epoch": 0.06652620489860415, "grad_norm": 2.257838249206543, "learning_rate": 4.6678957071372135e-05, "loss": 1.4787, "step": 2526 }, { "epoch": 0.06655254148011588, "grad_norm": 2.6151444911956787, "learning_rate": 4.667764024229655e-05, "loss": 0.8137, "step": 2527 }, { "epoch": 0.0665788780616276, "grad_norm": 2.042978048324585, "learning_rate": 4.6676323413220966e-05, "loss": 1.5399, "step": 2528 }, { "epoch": 0.06660521464313932, "grad_norm": 4.034358978271484, "learning_rate": 4.667500658414538e-05, "loss": 0.6036, "step": 2529 }, { "epoch": 0.06663155122465104, "grad_norm": 4.255539894104004, "learning_rate": 4.667368975506979e-05, "loss": 1.634, "step": 2530 }, { "epoch": 0.06665788780616276, "grad_norm": 2.1822259426116943, "learning_rate": 4.6672372925994206e-05, "loss": 2.0315, "step": 2531 }, { "epoch": 0.06668422438767448, "grad_norm": 4.216916084289551, "learning_rate": 4.667105609691862e-05, "loss": 1.98, "step": 2532 }, { "epoch": 0.0667105609691862, "grad_norm": 2.0080924034118652, "learning_rate": 4.666973926784304e-05, "loss": 1.5931, "step": 2533 }, { "epoch": 0.06673689755069792, "grad_norm": 1.8816441297531128, "learning_rate": 4.666842243876745e-05, "loss": 1.5876, "step": 2534 }, { "epoch": 0.06676323413220964, "grad_norm": 2.8267483711242676, "learning_rate": 4.666710560969186e-05, "loss": 1.3382, "step": 2535 }, { "epoch": 0.06678957071372135, 
"grad_norm": 3.6327762603759766, "learning_rate": 4.666578878061628e-05, "loss": 1.9714, "step": 2536 }, { "epoch": 0.06681590729523308, "grad_norm": 1.8365795612335205, "learning_rate": 4.666447195154069e-05, "loss": 1.6223, "step": 2537 }, { "epoch": 0.0668422438767448, "grad_norm": 3.036679983139038, "learning_rate": 4.666315512246511e-05, "loss": 1.7841, "step": 2538 }, { "epoch": 0.06686858045825651, "grad_norm": 2.0765795707702637, "learning_rate": 4.666183829338952e-05, "loss": 2.0547, "step": 2539 }, { "epoch": 0.06689491703976824, "grad_norm": 2.1875293254852295, "learning_rate": 4.666052146431393e-05, "loss": 2.1577, "step": 2540 }, { "epoch": 0.06692125362127996, "grad_norm": 2.086848258972168, "learning_rate": 4.665920463523835e-05, "loss": 2.0517, "step": 2541 }, { "epoch": 0.06694759020279167, "grad_norm": 2.2084317207336426, "learning_rate": 4.6657887806162764e-05, "loss": 2.7073, "step": 2542 }, { "epoch": 0.0669739267843034, "grad_norm": 2.1722753047943115, "learning_rate": 4.665657097708718e-05, "loss": 1.4848, "step": 2543 }, { "epoch": 0.06700026336581512, "grad_norm": 2.3003392219543457, "learning_rate": 4.665525414801159e-05, "loss": 2.2607, "step": 2544 }, { "epoch": 0.06702659994732683, "grad_norm": 3.7220823764801025, "learning_rate": 4.6653937318936004e-05, "loss": 1.4799, "step": 2545 }, { "epoch": 0.06705293652883855, "grad_norm": 2.39377498626709, "learning_rate": 4.665262048986041e-05, "loss": 1.3127, "step": 2546 }, { "epoch": 0.06707927311035028, "grad_norm": 2.2810888290405273, "learning_rate": 4.6651303660784836e-05, "loss": 1.6819, "step": 2547 }, { "epoch": 0.067105609691862, "grad_norm": 9.56706714630127, "learning_rate": 4.6649986831709244e-05, "loss": 1.1164, "step": 2548 }, { "epoch": 0.06713194627337371, "grad_norm": 2.0614781379699707, "learning_rate": 4.664867000263366e-05, "loss": 0.5602, "step": 2549 }, { "epoch": 0.06715828285488544, "grad_norm": 3.2973594665527344, "learning_rate": 4.6647353173558076e-05, "loss": 
1.2567, "step": 2550 }, { "epoch": 0.06718461943639716, "grad_norm": 1.8161203861236572, "learning_rate": 4.664603634448249e-05, "loss": 1.7685, "step": 2551 }, { "epoch": 0.06721095601790887, "grad_norm": 1.6718690395355225, "learning_rate": 4.664471951540691e-05, "loss": 1.1297, "step": 2552 }, { "epoch": 0.0672372925994206, "grad_norm": 1.7231602668762207, "learning_rate": 4.6643402686331316e-05, "loss": 0.9104, "step": 2553 }, { "epoch": 0.06726362918093232, "grad_norm": 3.418454170227051, "learning_rate": 4.664208585725573e-05, "loss": 0.9618, "step": 2554 }, { "epoch": 0.06728996576244403, "grad_norm": 1.7077583074569702, "learning_rate": 4.664076902818014e-05, "loss": 1.8292, "step": 2555 }, { "epoch": 0.06731630234395576, "grad_norm": 1.8046586513519287, "learning_rate": 4.663945219910456e-05, "loss": 1.6216, "step": 2556 }, { "epoch": 0.06734263892546748, "grad_norm": 3.2594175338745117, "learning_rate": 4.663813537002897e-05, "loss": 1.4472, "step": 2557 }, { "epoch": 0.06736897550697919, "grad_norm": 2.9247069358825684, "learning_rate": 4.663681854095339e-05, "loss": 2.147, "step": 2558 }, { "epoch": 0.06739531208849091, "grad_norm": 2.0607879161834717, "learning_rate": 4.66355017118778e-05, "loss": 1.6401, "step": 2559 }, { "epoch": 0.06742164867000264, "grad_norm": 1.7980475425720215, "learning_rate": 4.663418488280221e-05, "loss": 1.8501, "step": 2560 }, { "epoch": 0.06744798525151435, "grad_norm": 1.9060691595077515, "learning_rate": 4.6632868053726634e-05, "loss": 1.8517, "step": 2561 }, { "epoch": 0.06747432183302607, "grad_norm": 5.874664306640625, "learning_rate": 4.663155122465104e-05, "loss": 1.4543, "step": 2562 }, { "epoch": 0.0675006584145378, "grad_norm": 5.053830623626709, "learning_rate": 4.663023439557546e-05, "loss": 1.3172, "step": 2563 }, { "epoch": 0.06752699499604951, "grad_norm": 1.8805735111236572, "learning_rate": 4.662891756649987e-05, "loss": 1.6243, "step": 2564 }, { "epoch": 0.06755333157756123, "grad_norm": 
2.795335292816162, "learning_rate": 4.662760073742428e-05, "loss": 1.7764, "step": 2565 }, { "epoch": 0.06757966815907296, "grad_norm": 2.3032498359680176, "learning_rate": 4.66262839083487e-05, "loss": 2.2232, "step": 2566 }, { "epoch": 0.06760600474058467, "grad_norm": 3.9962282180786133, "learning_rate": 4.6624967079273114e-05, "loss": 1.4959, "step": 2567 }, { "epoch": 0.06763234132209639, "grad_norm": 2.8505661487579346, "learning_rate": 4.662365025019753e-05, "loss": 1.1346, "step": 2568 }, { "epoch": 0.0676586779036081, "grad_norm": 2.709254503250122, "learning_rate": 4.662233342112194e-05, "loss": 2.3283, "step": 2569 }, { "epoch": 0.06768501448511983, "grad_norm": 1.8439650535583496, "learning_rate": 4.662101659204636e-05, "loss": 1.8535, "step": 2570 }, { "epoch": 0.06771135106663155, "grad_norm": 2.210843086242676, "learning_rate": 4.661969976297077e-05, "loss": 1.4266, "step": 2571 }, { "epoch": 0.06773768764814327, "grad_norm": 2.4437777996063232, "learning_rate": 4.6618382933895185e-05, "loss": 1.8741, "step": 2572 }, { "epoch": 0.067764024229655, "grad_norm": 2.7891852855682373, "learning_rate": 4.6617066104819594e-05, "loss": 1.0567, "step": 2573 }, { "epoch": 0.06779036081116671, "grad_norm": 3.1497902870178223, "learning_rate": 4.661574927574401e-05, "loss": 1.6176, "step": 2574 }, { "epoch": 0.06781669739267843, "grad_norm": 1.909374475479126, "learning_rate": 4.6614432446668425e-05, "loss": 1.6587, "step": 2575 }, { "epoch": 0.06784303397419016, "grad_norm": 2.338366985321045, "learning_rate": 4.661311561759284e-05, "loss": 1.6408, "step": 2576 }, { "epoch": 0.06786937055570187, "grad_norm": 3.6487927436828613, "learning_rate": 4.661179878851725e-05, "loss": 1.5463, "step": 2577 }, { "epoch": 0.06789570713721359, "grad_norm": 2.0124223232269287, "learning_rate": 4.6610481959441665e-05, "loss": 1.5813, "step": 2578 }, { "epoch": 0.0679220437187253, "grad_norm": 1.8438575267791748, "learning_rate": 4.660916513036608e-05, "loss": 0.7456, "step": 
2579 }, { "epoch": 0.06794838030023703, "grad_norm": 3.658520221710205, "learning_rate": 4.6607848301290496e-05, "loss": 2.2583, "step": 2580 }, { "epoch": 0.06797471688174875, "grad_norm": 2.3209245204925537, "learning_rate": 4.660653147221491e-05, "loss": 2.1204, "step": 2581 }, { "epoch": 0.06800105346326046, "grad_norm": 2.0832419395446777, "learning_rate": 4.660521464313932e-05, "loss": 1.6178, "step": 2582 }, { "epoch": 0.06802739004477219, "grad_norm": 1.811886191368103, "learning_rate": 4.6603897814063736e-05, "loss": 1.7885, "step": 2583 }, { "epoch": 0.06805372662628391, "grad_norm": 1.7254831790924072, "learning_rate": 4.660258098498815e-05, "loss": 1.3446, "step": 2584 }, { "epoch": 0.06808006320779562, "grad_norm": 4.650494575500488, "learning_rate": 4.660126415591257e-05, "loss": 0.8956, "step": 2585 }, { "epoch": 0.06810639978930735, "grad_norm": 2.0037341117858887, "learning_rate": 4.6599947326836976e-05, "loss": 1.9698, "step": 2586 }, { "epoch": 0.06813273637081907, "grad_norm": 3.3112096786499023, "learning_rate": 4.659863049776139e-05, "loss": 1.4204, "step": 2587 }, { "epoch": 0.06815907295233078, "grad_norm": 2.6080074310302734, "learning_rate": 4.659731366868581e-05, "loss": 0.8499, "step": 2588 }, { "epoch": 0.0681854095338425, "grad_norm": 2.7312350273132324, "learning_rate": 4.659599683961022e-05, "loss": 1.3241, "step": 2589 }, { "epoch": 0.06821174611535423, "grad_norm": 1.6900057792663574, "learning_rate": 4.659468001053464e-05, "loss": 1.97, "step": 2590 }, { "epoch": 0.06823808269686595, "grad_norm": 3.0792837142944336, "learning_rate": 4.659336318145905e-05, "loss": 2.3653, "step": 2591 }, { "epoch": 0.06826441927837766, "grad_norm": 4.47657585144043, "learning_rate": 4.659204635238346e-05, "loss": 1.6617, "step": 2592 }, { "epoch": 0.06829075585988939, "grad_norm": 2.885293960571289, "learning_rate": 4.659072952330787e-05, "loss": 1.5072, "step": 2593 }, { "epoch": 0.0683170924414011, "grad_norm": 5.959609031677246, "learning_rate": 
4.6589412694232294e-05, "loss": 1.2388, "step": 2594 }, { "epoch": 0.06834342902291282, "grad_norm": 4.799948215484619, "learning_rate": 4.65880958651567e-05, "loss": 1.683, "step": 2595 }, { "epoch": 0.06836976560442455, "grad_norm": 2.2544190883636475, "learning_rate": 4.658677903608112e-05, "loss": 1.7779, "step": 2596 }, { "epoch": 0.06839610218593627, "grad_norm": 1.947739839553833, "learning_rate": 4.6585462207005534e-05, "loss": 1.8944, "step": 2597 }, { "epoch": 0.06842243876744798, "grad_norm": 2.574916362762451, "learning_rate": 4.658414537792994e-05, "loss": 1.8165, "step": 2598 }, { "epoch": 0.06844877534895971, "grad_norm": 3.0508999824523926, "learning_rate": 4.6582828548854365e-05, "loss": 1.5944, "step": 2599 }, { "epoch": 0.06847511193047143, "grad_norm": 5.070839881896973, "learning_rate": 4.6581511719778774e-05, "loss": 1.2167, "step": 2600 }, { "epoch": 0.06850144851198314, "grad_norm": 2.3829562664031982, "learning_rate": 4.658019489070319e-05, "loss": 0.651, "step": 2601 }, { "epoch": 0.06852778509349486, "grad_norm": 4.177533149719238, "learning_rate": 4.65788780616276e-05, "loss": 2.3382, "step": 2602 }, { "epoch": 0.06855412167500659, "grad_norm": 2.7740416526794434, "learning_rate": 4.657756123255202e-05, "loss": 2.0598, "step": 2603 }, { "epoch": 0.0685804582565183, "grad_norm": 3.516300916671753, "learning_rate": 4.657624440347643e-05, "loss": 0.2323, "step": 2604 }, { "epoch": 0.06860679483803002, "grad_norm": 3.4978506565093994, "learning_rate": 4.6574927574400845e-05, "loss": 2.2463, "step": 2605 }, { "epoch": 0.06863313141954175, "grad_norm": 2.4125983715057373, "learning_rate": 4.657361074532526e-05, "loss": 1.8078, "step": 2606 }, { "epoch": 0.06865946800105346, "grad_norm": 2.26993727684021, "learning_rate": 4.657229391624967e-05, "loss": 1.6311, "step": 2607 }, { "epoch": 0.06868580458256518, "grad_norm": 3.9862186908721924, "learning_rate": 4.657097708717409e-05, "loss": 1.7732, "step": 2608 }, { "epoch": 0.06871214116407691, 
"grad_norm": 2.0518388748168945, "learning_rate": 4.65696602580985e-05, "loss": 1.4654, "step": 2609 }, { "epoch": 0.06873847774558862, "grad_norm": 3.12658953666687, "learning_rate": 4.6568343429022917e-05, "loss": 0.5777, "step": 2610 }, { "epoch": 0.06876481432710034, "grad_norm": 8.929038047790527, "learning_rate": 4.6567026599947325e-05, "loss": 1.638, "step": 2611 }, { "epoch": 0.06879115090861206, "grad_norm": 3.825570583343506, "learning_rate": 4.656570977087174e-05, "loss": 1.5552, "step": 2612 }, { "epoch": 0.06881748749012379, "grad_norm": 2.1644906997680664, "learning_rate": 4.6564392941796157e-05, "loss": 2.1256, "step": 2613 }, { "epoch": 0.0688438240716355, "grad_norm": 2.153041124343872, "learning_rate": 4.656307611272057e-05, "loss": 1.6542, "step": 2614 }, { "epoch": 0.06887016065314722, "grad_norm": 2.481976270675659, "learning_rate": 4.656175928364499e-05, "loss": 1.1864, "step": 2615 }, { "epoch": 0.06889649723465895, "grad_norm": 2.1133615970611572, "learning_rate": 4.6560442454569397e-05, "loss": 1.4337, "step": 2616 }, { "epoch": 0.06892283381617066, "grad_norm": 2.5127944946289062, "learning_rate": 4.655912562549382e-05, "loss": 1.8533, "step": 2617 }, { "epoch": 0.06894917039768238, "grad_norm": 2.4987242221832275, "learning_rate": 4.655780879641823e-05, "loss": 1.8171, "step": 2618 }, { "epoch": 0.0689755069791941, "grad_norm": 2.4039416313171387, "learning_rate": 4.655649196734264e-05, "loss": 2.0467, "step": 2619 }, { "epoch": 0.06900184356070582, "grad_norm": 3.5297069549560547, "learning_rate": 4.655517513826705e-05, "loss": 1.1857, "step": 2620 }, { "epoch": 0.06902818014221754, "grad_norm": 2.375171661376953, "learning_rate": 4.655385830919147e-05, "loss": 2.0219, "step": 2621 }, { "epoch": 0.06905451672372925, "grad_norm": 1.766218662261963, "learning_rate": 4.655254148011588e-05, "loss": 1.7518, "step": 2622 }, { "epoch": 0.06908085330524098, "grad_norm": 3.488804578781128, "learning_rate": 4.65512246510403e-05, "loss": 1.6097, 
"step": 2623 }, { "epoch": 0.0691071898867527, "grad_norm": 2.894591808319092, "learning_rate": 4.654990782196471e-05, "loss": 2.0309, "step": 2624 }, { "epoch": 0.06913352646826441, "grad_norm": 7.223591327667236, "learning_rate": 4.6548590992889123e-05, "loss": 0.6836, "step": 2625 }, { "epoch": 0.06915986304977614, "grad_norm": 2.455810070037842, "learning_rate": 4.654727416381354e-05, "loss": 2.0535, "step": 2626 }, { "epoch": 0.06918619963128786, "grad_norm": 1.8866004943847656, "learning_rate": 4.6545957334737955e-05, "loss": 1.6238, "step": 2627 }, { "epoch": 0.06921253621279957, "grad_norm": 2.6919102668762207, "learning_rate": 4.654464050566237e-05, "loss": 1.6533, "step": 2628 }, { "epoch": 0.0692388727943113, "grad_norm": 2.480489730834961, "learning_rate": 4.654332367658678e-05, "loss": 2.1289, "step": 2629 }, { "epoch": 0.06926520937582302, "grad_norm": 4.822482585906982, "learning_rate": 4.6542006847511195e-05, "loss": 1.734, "step": 2630 }, { "epoch": 0.06929154595733474, "grad_norm": 4.469937324523926, "learning_rate": 4.654069001843561e-05, "loss": 1.2581, "step": 2631 }, { "epoch": 0.06931788253884645, "grad_norm": 2.7554965019226074, "learning_rate": 4.6539373189360026e-05, "loss": 2.1281, "step": 2632 }, { "epoch": 0.06934421912035818, "grad_norm": 2.2402126789093018, "learning_rate": 4.6538056360284435e-05, "loss": 1.9077, "step": 2633 }, { "epoch": 0.0693705557018699, "grad_norm": 2.3627853393554688, "learning_rate": 4.653673953120885e-05, "loss": 2.1034, "step": 2634 }, { "epoch": 0.06939689228338161, "grad_norm": 2.813047409057617, "learning_rate": 4.6535422702133266e-05, "loss": 0.725, "step": 2635 }, { "epoch": 0.06942322886489334, "grad_norm": 2.6547837257385254, "learning_rate": 4.653410587305768e-05, "loss": 1.9301, "step": 2636 }, { "epoch": 0.06944956544640506, "grad_norm": 3.0400991439819336, "learning_rate": 4.65327890439821e-05, "loss": 2.0376, "step": 2637 }, { "epoch": 0.06947590202791677, "grad_norm": 2.4060683250427246, 
"learning_rate": 4.6531472214906506e-05, "loss": 1.9381, "step": 2638 }, { "epoch": 0.0695022386094285, "grad_norm": 4.608154773712158, "learning_rate": 4.653015538583092e-05, "loss": 1.8165, "step": 2639 }, { "epoch": 0.06952857519094022, "grad_norm": 2.51985764503479, "learning_rate": 4.652883855675533e-05, "loss": 0.2848, "step": 2640 }, { "epoch": 0.06955491177245193, "grad_norm": 2.6939659118652344, "learning_rate": 4.652752172767975e-05, "loss": 0.4254, "step": 2641 }, { "epoch": 0.06958124835396366, "grad_norm": 3.266824960708618, "learning_rate": 4.652620489860416e-05, "loss": 2.2181, "step": 2642 }, { "epoch": 0.06960758493547538, "grad_norm": 4.517200469970703, "learning_rate": 4.652488806952858e-05, "loss": 0.746, "step": 2643 }, { "epoch": 0.0696339215169871, "grad_norm": 1.7529770135879517, "learning_rate": 4.652357124045299e-05, "loss": 1.9167, "step": 2644 }, { "epoch": 0.06966025809849881, "grad_norm": 2.715451717376709, "learning_rate": 4.65222544113774e-05, "loss": 1.5291, "step": 2645 }, { "epoch": 0.06968659468001054, "grad_norm": 2.0796825885772705, "learning_rate": 4.6520937582301824e-05, "loss": 1.8641, "step": 2646 }, { "epoch": 0.06971293126152225, "grad_norm": 4.067246913909912, "learning_rate": 4.651962075322623e-05, "loss": 1.1368, "step": 2647 }, { "epoch": 0.06973926784303397, "grad_norm": 1.6699625253677368, "learning_rate": 4.651830392415065e-05, "loss": 1.742, "step": 2648 }, { "epoch": 0.0697656044245457, "grad_norm": 2.325200080871582, "learning_rate": 4.651698709507506e-05, "loss": 2.3476, "step": 2649 }, { "epoch": 0.06979194100605741, "grad_norm": 2.1421377658843994, "learning_rate": 4.651567026599948e-05, "loss": 1.7371, "step": 2650 }, { "epoch": 0.06981827758756913, "grad_norm": 2.5462629795074463, "learning_rate": 4.651435343692389e-05, "loss": 1.7869, "step": 2651 }, { "epoch": 0.06984461416908086, "grad_norm": 1.9661940336227417, "learning_rate": 4.6513036607848304e-05, "loss": 1.966, "step": 2652 }, { "epoch": 
0.06987095075059258, "grad_norm": 4.2822370529174805, "learning_rate": 4.651171977877272e-05, "loss": 1.8279, "step": 2653 }, { "epoch": 0.06989728733210429, "grad_norm": 1.9207484722137451, "learning_rate": 4.651040294969713e-05, "loss": 1.3293, "step": 2654 }, { "epoch": 0.069923623913616, "grad_norm": 2.5592851638793945, "learning_rate": 4.650908612062155e-05, "loss": 1.9382, "step": 2655 }, { "epoch": 0.06994996049512774, "grad_norm": 2.754074811935425, "learning_rate": 4.650776929154596e-05, "loss": 2.0085, "step": 2656 }, { "epoch": 0.06997629707663945, "grad_norm": 3.550414800643921, "learning_rate": 4.6506452462470375e-05, "loss": 1.9523, "step": 2657 }, { "epoch": 0.07000263365815117, "grad_norm": 2.1844401359558105, "learning_rate": 4.6505135633394784e-05, "loss": 1.7098, "step": 2658 }, { "epoch": 0.0700289702396629, "grad_norm": 3.489062547683716, "learning_rate": 4.65038188043192e-05, "loss": 1.4202, "step": 2659 }, { "epoch": 0.07005530682117461, "grad_norm": 2.4555776119232178, "learning_rate": 4.6502501975243615e-05, "loss": 2.1021, "step": 2660 }, { "epoch": 0.07008164340268633, "grad_norm": 2.9414126873016357, "learning_rate": 4.650118514616803e-05, "loss": 0.2593, "step": 2661 }, { "epoch": 0.07010797998419806, "grad_norm": 2.705049991607666, "learning_rate": 4.6499868317092446e-05, "loss": 1.1382, "step": 2662 }, { "epoch": 0.07013431656570977, "grad_norm": 3.071199655532837, "learning_rate": 4.6498551488016855e-05, "loss": 0.5684, "step": 2663 }, { "epoch": 0.07016065314722149, "grad_norm": 2.3718481063842773, "learning_rate": 4.649723465894128e-05, "loss": 1.8246, "step": 2664 }, { "epoch": 0.0701869897287332, "grad_norm": 3.0356109142303467, "learning_rate": 4.6495917829865686e-05, "loss": 1.1527, "step": 2665 }, { "epoch": 0.07021332631024493, "grad_norm": 1.8045742511749268, "learning_rate": 4.64946010007901e-05, "loss": 1.7838, "step": 2666 }, { "epoch": 0.07023966289175665, "grad_norm": 2.5278451442718506, "learning_rate": 
4.649328417171451e-05, "loss": 1.235, "step": 2667 }, { "epoch": 0.07026599947326836, "grad_norm": 3.5595812797546387, "learning_rate": 4.6491967342638926e-05, "loss": 2.2291, "step": 2668 }, { "epoch": 0.0702923360547801, "grad_norm": 2.0275182723999023, "learning_rate": 4.649065051356334e-05, "loss": 1.5879, "step": 2669 }, { "epoch": 0.07031867263629181, "grad_norm": 3.090912342071533, "learning_rate": 4.648933368448776e-05, "loss": 1.6005, "step": 2670 }, { "epoch": 0.07034500921780353, "grad_norm": 1.8806706666946411, "learning_rate": 4.648801685541217e-05, "loss": 1.8741, "step": 2671 }, { "epoch": 0.07037134579931525, "grad_norm": 4.31226110458374, "learning_rate": 4.648670002633658e-05, "loss": 2.1137, "step": 2672 }, { "epoch": 0.07039768238082697, "grad_norm": 1.993150234222412, "learning_rate": 4.6485383197261e-05, "loss": 2.1648, "step": 2673 }, { "epoch": 0.07042401896233869, "grad_norm": 2.3159568309783936, "learning_rate": 4.648406636818541e-05, "loss": 1.5296, "step": 2674 }, { "epoch": 0.0704503555438504, "grad_norm": 4.529603958129883, "learning_rate": 4.648274953910983e-05, "loss": 1.8003, "step": 2675 }, { "epoch": 0.07047669212536213, "grad_norm": 2.405383586883545, "learning_rate": 4.648143271003424e-05, "loss": 1.782, "step": 2676 }, { "epoch": 0.07050302870687385, "grad_norm": 2.07312273979187, "learning_rate": 4.648011588095865e-05, "loss": 1.6537, "step": 2677 }, { "epoch": 0.07052936528838556, "grad_norm": 3.462106227874756, "learning_rate": 4.647879905188306e-05, "loss": 1.8495, "step": 2678 }, { "epoch": 0.07055570186989729, "grad_norm": 3.344675302505493, "learning_rate": 4.6477482222807484e-05, "loss": 1.4403, "step": 2679 }, { "epoch": 0.07058203845140901, "grad_norm": 2.6176297664642334, "learning_rate": 4.647616539373189e-05, "loss": 2.0217, "step": 2680 }, { "epoch": 0.07060837503292072, "grad_norm": 2.351436138153076, "learning_rate": 4.647484856465631e-05, "loss": 1.8672, "step": 2681 }, { "epoch": 0.07063471161443245, 
"grad_norm": 4.086192607879639, "learning_rate": 4.6473531735580724e-05, "loss": 1.7592, "step": 2682 }, { "epoch": 0.07066104819594417, "grad_norm": 3.0787124633789062, "learning_rate": 4.647221490650514e-05, "loss": 1.8843, "step": 2683 }, { "epoch": 0.07068738477745588, "grad_norm": 2.002134084701538, "learning_rate": 4.6470898077429556e-05, "loss": 1.7586, "step": 2684 }, { "epoch": 0.07071372135896761, "grad_norm": 4.480076313018799, "learning_rate": 4.6469581248353964e-05, "loss": 1.7915, "step": 2685 }, { "epoch": 0.07074005794047933, "grad_norm": 7.935029983520508, "learning_rate": 4.646826441927838e-05, "loss": 2.3479, "step": 2686 }, { "epoch": 0.07076639452199104, "grad_norm": 2.092501640319824, "learning_rate": 4.646694759020279e-05, "loss": 1.5596, "step": 2687 }, { "epoch": 0.07079273110350276, "grad_norm": 2.9171571731567383, "learning_rate": 4.646563076112721e-05, "loss": 2.1102, "step": 2688 }, { "epoch": 0.07081906768501449, "grad_norm": 2.918426036834717, "learning_rate": 4.646431393205162e-05, "loss": 1.9477, "step": 2689 }, { "epoch": 0.0708454042665262, "grad_norm": 2.7173399925231934, "learning_rate": 4.6462997102976036e-05, "loss": 0.8485, "step": 2690 }, { "epoch": 0.07087174084803792, "grad_norm": 2.0940628051757812, "learning_rate": 4.646168027390045e-05, "loss": 1.8431, "step": 2691 }, { "epoch": 0.07089807742954965, "grad_norm": 3.8046703338623047, "learning_rate": 4.646036344482486e-05, "loss": 1.626, "step": 2692 }, { "epoch": 0.07092441401106137, "grad_norm": 3.061880350112915, "learning_rate": 4.645904661574928e-05, "loss": 1.9849, "step": 2693 }, { "epoch": 0.07095075059257308, "grad_norm": 3.367072343826294, "learning_rate": 4.645772978667369e-05, "loss": 2.0503, "step": 2694 }, { "epoch": 0.07097708717408481, "grad_norm": 4.6529645919799805, "learning_rate": 4.645641295759811e-05, "loss": 1.0493, "step": 2695 }, { "epoch": 0.07100342375559653, "grad_norm": 5.167924880981445, "learning_rate": 4.6455096128522516e-05, "loss": 
1.4928, "step": 2696 }, { "epoch": 0.07102976033710824, "grad_norm": 3.35506534576416, "learning_rate": 4.645377929944694e-05, "loss": 1.5682, "step": 2697 }, { "epoch": 0.07105609691861996, "grad_norm": 2.238064765930176, "learning_rate": 4.645246247037135e-05, "loss": 1.8461, "step": 2698 }, { "epoch": 0.07108243350013169, "grad_norm": 1.931767463684082, "learning_rate": 4.645114564129576e-05, "loss": 1.3575, "step": 2699 }, { "epoch": 0.0711087700816434, "grad_norm": 3.225132465362549, "learning_rate": 4.644982881222018e-05, "loss": 2.0281, "step": 2700 }, { "epoch": 0.07113510666315512, "grad_norm": 2.8764572143554688, "learning_rate": 4.644851198314459e-05, "loss": 2.5044, "step": 2701 }, { "epoch": 0.07116144324466685, "grad_norm": 1.9680535793304443, "learning_rate": 4.644719515406901e-05, "loss": 1.55, "step": 2702 }, { "epoch": 0.07118777982617856, "grad_norm": 2.0848116874694824, "learning_rate": 4.644587832499342e-05, "loss": 1.5599, "step": 2703 }, { "epoch": 0.07121411640769028, "grad_norm": 4.546876907348633, "learning_rate": 4.6444561495917834e-05, "loss": 1.7025, "step": 2704 }, { "epoch": 0.07124045298920201, "grad_norm": 4.038058280944824, "learning_rate": 4.644324466684224e-05, "loss": 1.5236, "step": 2705 }, { "epoch": 0.07126678957071372, "grad_norm": 4.274616718292236, "learning_rate": 4.644192783776666e-05, "loss": 1.3206, "step": 2706 }, { "epoch": 0.07129312615222544, "grad_norm": 2.615851879119873, "learning_rate": 4.6440611008691074e-05, "loss": 1.705, "step": 2707 }, { "epoch": 0.07131946273373715, "grad_norm": 3.249099016189575, "learning_rate": 4.643929417961549e-05, "loss": 2.0197, "step": 2708 }, { "epoch": 0.07134579931524888, "grad_norm": 5.246429920196533, "learning_rate": 4.6437977350539905e-05, "loss": 2.0912, "step": 2709 }, { "epoch": 0.0713721358967606, "grad_norm": 1.8831321001052856, "learning_rate": 4.6436660521464314e-05, "loss": 1.8726, "step": 2710 }, { "epoch": 0.07139847247827232, "grad_norm": 3.412057399749756, 
"learning_rate": 4.643534369238873e-05, "loss": 1.4548, "step": 2711 }, { "epoch": 0.07142480905978404, "grad_norm": 1.715803623199463, "learning_rate": 4.6434026863313145e-05, "loss": 1.7469, "step": 2712 }, { "epoch": 0.07145114564129576, "grad_norm": 1.728883147239685, "learning_rate": 4.643271003423756e-05, "loss": 1.5368, "step": 2713 }, { "epoch": 0.07147748222280748, "grad_norm": 3.7357983589172363, "learning_rate": 4.643139320516197e-05, "loss": 1.9267, "step": 2714 }, { "epoch": 0.0715038188043192, "grad_norm": 4.025263786315918, "learning_rate": 4.6430076376086385e-05, "loss": 1.8416, "step": 2715 }, { "epoch": 0.07153015538583092, "grad_norm": 2.205474853515625, "learning_rate": 4.64287595470108e-05, "loss": 1.3173, "step": 2716 }, { "epoch": 0.07155649196734264, "grad_norm": 2.4038355350494385, "learning_rate": 4.6427442717935216e-05, "loss": 1.8354, "step": 2717 }, { "epoch": 0.07158282854885435, "grad_norm": 2.608717918395996, "learning_rate": 4.642612588885963e-05, "loss": 1.8021, "step": 2718 }, { "epoch": 0.07160916513036608, "grad_norm": 1.6370832920074463, "learning_rate": 4.642480905978404e-05, "loss": 1.5468, "step": 2719 }, { "epoch": 0.0716355017118778, "grad_norm": 2.068415641784668, "learning_rate": 4.6423492230708456e-05, "loss": 1.5858, "step": 2720 }, { "epoch": 0.07166183829338951, "grad_norm": 2.628811836242676, "learning_rate": 4.642217540163287e-05, "loss": 2.2224, "step": 2721 }, { "epoch": 0.07168817487490124, "grad_norm": 3.618770122528076, "learning_rate": 4.642085857255729e-05, "loss": 1.7167, "step": 2722 }, { "epoch": 0.07171451145641296, "grad_norm": 4.247048377990723, "learning_rate": 4.6419541743481696e-05, "loss": 1.224, "step": 2723 }, { "epoch": 0.07174084803792467, "grad_norm": 2.9350998401641846, "learning_rate": 4.641822491440611e-05, "loss": 2.1076, "step": 2724 }, { "epoch": 0.0717671846194364, "grad_norm": 3.7061140537261963, "learning_rate": 4.641690808533052e-05, "loss": 1.7952, "step": 2725 }, { "epoch": 
0.07179352120094812, "grad_norm": 2.312399387359619, "learning_rate": 4.641559125625494e-05, "loss": 1.2502, "step": 2726 }, { "epoch": 0.07181985778245983, "grad_norm": 2.017063617706299, "learning_rate": 4.641427442717935e-05, "loss": 1.7595, "step": 2727 }, { "epoch": 0.07184619436397156, "grad_norm": 2.7324509620666504, "learning_rate": 4.641295759810377e-05, "loss": 1.33, "step": 2728 }, { "epoch": 0.07187253094548328, "grad_norm": 3.6948230266571045, "learning_rate": 4.641164076902818e-05, "loss": 2.3593, "step": 2729 }, { "epoch": 0.071898867526995, "grad_norm": 1.7741047143936157, "learning_rate": 4.64103239399526e-05, "loss": 1.5545, "step": 2730 }, { "epoch": 0.07192520410850671, "grad_norm": 5.119868278503418, "learning_rate": 4.6409007110877014e-05, "loss": 1.3571, "step": 2731 }, { "epoch": 0.07195154069001844, "grad_norm": 2.158773183822632, "learning_rate": 4.640769028180142e-05, "loss": 1.581, "step": 2732 }, { "epoch": 0.07197787727153016, "grad_norm": 2.9639108180999756, "learning_rate": 4.640637345272584e-05, "loss": 1.7442, "step": 2733 }, { "epoch": 0.07200421385304187, "grad_norm": 4.275511741638184, "learning_rate": 4.640505662365025e-05, "loss": 1.7204, "step": 2734 }, { "epoch": 0.0720305504345536, "grad_norm": 2.5602612495422363, "learning_rate": 4.640373979457467e-05, "loss": 2.0076, "step": 2735 }, { "epoch": 0.07205688701606532, "grad_norm": 2.8891618251800537, "learning_rate": 4.640242296549908e-05, "loss": 1.8932, "step": 2736 }, { "epoch": 0.07208322359757703, "grad_norm": 2.761899948120117, "learning_rate": 4.6401106136423494e-05, "loss": 1.7741, "step": 2737 }, { "epoch": 0.07210956017908876, "grad_norm": 4.940716743469238, "learning_rate": 4.639978930734791e-05, "loss": 1.1045, "step": 2738 }, { "epoch": 0.07213589676060048, "grad_norm": 2.88822340965271, "learning_rate": 4.639847247827232e-05, "loss": 0.7925, "step": 2739 }, { "epoch": 0.07216223334211219, "grad_norm": 2.7456512451171875, "learning_rate": 4.639715564919674e-05, 
"loss": 2.0357, "step": 2740 }, { "epoch": 0.07218856992362391, "grad_norm": 3.2925772666931152, "learning_rate": 4.639583882012115e-05, "loss": 1.6522, "step": 2741 }, { "epoch": 0.07221490650513564, "grad_norm": 2.1087934970855713, "learning_rate": 4.6394521991045565e-05, "loss": 1.4956, "step": 2742 }, { "epoch": 0.07224124308664735, "grad_norm": 2.858607292175293, "learning_rate": 4.6393205161969974e-05, "loss": 1.7281, "step": 2743 }, { "epoch": 0.07226757966815907, "grad_norm": 4.906359672546387, "learning_rate": 4.639188833289439e-05, "loss": 1.8161, "step": 2744 }, { "epoch": 0.0722939162496708, "grad_norm": 3.8215255737304688, "learning_rate": 4.6390571503818805e-05, "loss": 1.9722, "step": 2745 }, { "epoch": 0.07232025283118251, "grad_norm": 9.646496772766113, "learning_rate": 4.638925467474322e-05, "loss": 1.0756, "step": 2746 }, { "epoch": 0.07234658941269423, "grad_norm": 4.426901340484619, "learning_rate": 4.6387937845667637e-05, "loss": 0.9286, "step": 2747 }, { "epoch": 0.07237292599420596, "grad_norm": 4.849145412445068, "learning_rate": 4.6386621016592045e-05, "loss": 1.1735, "step": 2748 }, { "epoch": 0.07239926257571767, "grad_norm": NaN, "learning_rate": 4.6386621016592045e-05, "loss": 2.466, "step": 2749 }, { "epoch": 0.07242559915722939, "grad_norm": 2.6882951259613037, "learning_rate": 4.638530418751647e-05, "loss": 2.6353, "step": 2750 }, { "epoch": 0.0724519357387411, "grad_norm": 1.9634631872177124, "learning_rate": 4.6383987358440877e-05, "loss": 1.5359, "step": 2751 }, { "epoch": 0.07247827232025283, "grad_norm": 3.8971569538116455, "learning_rate": 4.638267052936529e-05, "loss": 1.5967, "step": 2752 }, { "epoch": 0.07250460890176455, "grad_norm": 4.099257469177246, "learning_rate": 4.63813537002897e-05, "loss": 1.8788, "step": 2753 }, { "epoch": 0.07253094548327627, "grad_norm": 1.8790581226348877, "learning_rate": 4.6380036871214117e-05, "loss": 2.2948, "step": 2754 }, { "epoch": 0.072557282064788, "grad_norm": 2.6746301651000977, 
"learning_rate": 4.637872004213853e-05, "loss": 1.6662, "step": 2755 }, { "epoch": 0.07258361864629971, "grad_norm": 2.729245185852051, "learning_rate": 4.637740321306295e-05, "loss": 2.0442, "step": 2756 }, { "epoch": 0.07260995522781143, "grad_norm": 1.7360540628433228, "learning_rate": 4.6376086383987363e-05, "loss": 2.1775, "step": 2757 }, { "epoch": 0.07263629180932316, "grad_norm": 1.9921828508377075, "learning_rate": 4.637476955491177e-05, "loss": 1.5739, "step": 2758 }, { "epoch": 0.07266262839083487, "grad_norm": 4.548732757568359, "learning_rate": 4.637345272583619e-05, "loss": 0.9905, "step": 2759 }, { "epoch": 0.07268896497234659, "grad_norm": 2.191924810409546, "learning_rate": 4.6372135896760603e-05, "loss": 1.5967, "step": 2760 }, { "epoch": 0.0727153015538583, "grad_norm": 2.550980806350708, "learning_rate": 4.637081906768502e-05, "loss": 2.0663, "step": 2761 }, { "epoch": 0.07274163813537003, "grad_norm": 2.215832233428955, "learning_rate": 4.636950223860943e-05, "loss": 1.0186, "step": 2762 }, { "epoch": 0.07276797471688175, "grad_norm": 1.8944953680038452, "learning_rate": 4.6368185409533843e-05, "loss": 1.9387, "step": 2763 }, { "epoch": 0.07279431129839346, "grad_norm": 2.056403160095215, "learning_rate": 4.636686858045826e-05, "loss": 2.1449, "step": 2764 }, { "epoch": 0.07282064787990519, "grad_norm": 2.469348192214966, "learning_rate": 4.6365551751382675e-05, "loss": 1.9617, "step": 2765 }, { "epoch": 0.07284698446141691, "grad_norm": 5.035886764526367, "learning_rate": 4.636423492230709e-05, "loss": 1.4871, "step": 2766 }, { "epoch": 0.07287332104292862, "grad_norm": 2.6207799911499023, "learning_rate": 4.63629180932315e-05, "loss": 1.7257, "step": 2767 }, { "epoch": 0.07289965762444035, "grad_norm": 2.999866008758545, "learning_rate": 4.6361601264155915e-05, "loss": 1.291, "step": 2768 }, { "epoch": 0.07292599420595207, "grad_norm": 4.365527629852295, "learning_rate": 4.636028443508033e-05, "loss": 1.7727, "step": 2769 }, { "epoch": 
0.07295233078746378, "grad_norm": 1.7593533992767334, "learning_rate": 4.6358967606004746e-05, "loss": 1.4748, "step": 2770 }, { "epoch": 0.07297866736897551, "grad_norm": 4.576137065887451, "learning_rate": 4.6357650776929155e-05, "loss": 1.8213, "step": 2771 }, { "epoch": 0.07300500395048723, "grad_norm": 2.4472999572753906, "learning_rate": 4.635633394785357e-05, "loss": 2.0751, "step": 2772 }, { "epoch": 0.07303134053199895, "grad_norm": 1.9276094436645508, "learning_rate": 4.6355017118777986e-05, "loss": 1.4565, "step": 2773 }, { "epoch": 0.07305767711351066, "grad_norm": 3.5174710750579834, "learning_rate": 4.63537002897024e-05, "loss": 1.4744, "step": 2774 }, { "epoch": 0.07308401369502239, "grad_norm": 3.8854033946990967, "learning_rate": 4.635238346062682e-05, "loss": 0.8084, "step": 2775 }, { "epoch": 0.0731103502765341, "grad_norm": 2.2220587730407715, "learning_rate": 4.6351066631551226e-05, "loss": 0.8171, "step": 2776 }, { "epoch": 0.07313668685804582, "grad_norm": 2.2211291790008545, "learning_rate": 4.634974980247564e-05, "loss": 1.754, "step": 2777 }, { "epoch": 0.07316302343955755, "grad_norm": 3.8703322410583496, "learning_rate": 4.634843297340005e-05, "loss": 1.7574, "step": 2778 }, { "epoch": 0.07318936002106927, "grad_norm": 2.7567102909088135, "learning_rate": 4.634711614432447e-05, "loss": 0.5344, "step": 2779 }, { "epoch": 0.07321569660258098, "grad_norm": 2.733025550842285, "learning_rate": 4.634579931524888e-05, "loss": 0.7457, "step": 2780 }, { "epoch": 0.07324203318409271, "grad_norm": 2.9197328090667725, "learning_rate": 4.63444824861733e-05, "loss": 1.8035, "step": 2781 }, { "epoch": 0.07326836976560443, "grad_norm": 4.82131814956665, "learning_rate": 4.6343165657097706e-05, "loss": 1.3844, "step": 2782 }, { "epoch": 0.07329470634711614, "grad_norm": 2.0215184688568115, "learning_rate": 4.634184882802213e-05, "loss": 1.7277, "step": 2783 }, { "epoch": 0.07332104292862786, "grad_norm": 1.888427495956421, "learning_rate": 
4.634053199894654e-05, "loss": 2.1028, "step": 2784 }, { "epoch": 0.07334737951013959, "grad_norm": 2.3030498027801514, "learning_rate": 4.633921516987095e-05, "loss": 1.8546, "step": 2785 }, { "epoch": 0.0733737160916513, "grad_norm": 2.647611379623413, "learning_rate": 4.633789834079537e-05, "loss": 1.9227, "step": 2786 }, { "epoch": 0.07340005267316302, "grad_norm": 2.333958148956299, "learning_rate": 4.633658151171978e-05, "loss": 1.9955, "step": 2787 }, { "epoch": 0.07342638925467475, "grad_norm": 3.4081473350524902, "learning_rate": 4.63352646826442e-05, "loss": 1.4516, "step": 2788 }, { "epoch": 0.07345272583618646, "grad_norm": 1.7533105611801147, "learning_rate": 4.633394785356861e-05, "loss": 1.5543, "step": 2789 }, { "epoch": 0.07347906241769818, "grad_norm": 3.1015400886535645, "learning_rate": 4.6332631024493024e-05, "loss": 0.3429, "step": 2790 }, { "epoch": 0.07350539899920991, "grad_norm": 1.7616078853607178, "learning_rate": 4.633131419541743e-05, "loss": 2.0438, "step": 2791 }, { "epoch": 0.07353173558072162, "grad_norm": 2.3825430870056152, "learning_rate": 4.632999736634185e-05, "loss": 1.4313, "step": 2792 }, { "epoch": 0.07355807216223334, "grad_norm": 3.4073896408081055, "learning_rate": 4.6328680537266264e-05, "loss": 1.5878, "step": 2793 }, { "epoch": 0.07358440874374506, "grad_norm": 2.3406107425689697, "learning_rate": 4.632736370819068e-05, "loss": 2.2283, "step": 2794 }, { "epoch": 0.07361074532525679, "grad_norm": 2.7205867767333984, "learning_rate": 4.6326046879115095e-05, "loss": 1.7144, "step": 2795 }, { "epoch": 0.0736370819067685, "grad_norm": 6.849981784820557, "learning_rate": 4.6324730050039504e-05, "loss": 1.2354, "step": 2796 }, { "epoch": 0.07366341848828022, "grad_norm": 2.2369143962860107, "learning_rate": 4.6323413220963926e-05, "loss": 1.9458, "step": 2797 }, { "epoch": 0.07368975506979195, "grad_norm": 2.0094168186187744, "learning_rate": 4.6322096391888335e-05, "loss": 1.8407, "step": 2798 }, { "epoch": 
0.07371609165130366, "grad_norm": 2.328657627105713, "learning_rate": 4.632077956281275e-05, "loss": 1.1801, "step": 2799 }, { "epoch": 0.07374242823281538, "grad_norm": 2.2383835315704346, "learning_rate": 4.631946273373716e-05, "loss": 1.5971, "step": 2800 }, { "epoch": 0.0737687648143271, "grad_norm": 2.6518330574035645, "learning_rate": 4.6318145904661575e-05, "loss": 1.8028, "step": 2801 }, { "epoch": 0.07379510139583882, "grad_norm": 3.24520206451416, "learning_rate": 4.631682907558599e-05, "loss": 1.3934, "step": 2802 }, { "epoch": 0.07382143797735054, "grad_norm": 2.1288161277770996, "learning_rate": 4.6315512246510406e-05, "loss": 1.6207, "step": 2803 }, { "epoch": 0.07384777455886225, "grad_norm": 2.2702548503875732, "learning_rate": 4.631419541743482e-05, "loss": 1.5697, "step": 2804 }, { "epoch": 0.07387411114037398, "grad_norm": 1.9476163387298584, "learning_rate": 4.631287858835923e-05, "loss": 1.6175, "step": 2805 }, { "epoch": 0.0739004477218857, "grad_norm": 2.1629433631896973, "learning_rate": 4.6311561759283646e-05, "loss": 1.8719, "step": 2806 }, { "epoch": 0.07392678430339741, "grad_norm": 2.0354788303375244, "learning_rate": 4.631024493020806e-05, "loss": 2.6282, "step": 2807 }, { "epoch": 0.07395312088490914, "grad_norm": 3.1469314098358154, "learning_rate": 4.630892810113248e-05, "loss": 2.3025, "step": 2808 }, { "epoch": 0.07397945746642086, "grad_norm": 2.7114975452423096, "learning_rate": 4.6307611272056886e-05, "loss": 2.1133, "step": 2809 }, { "epoch": 0.07400579404793257, "grad_norm": 2.010967254638672, "learning_rate": 4.63062944429813e-05, "loss": 1.8658, "step": 2810 }, { "epoch": 0.0740321306294443, "grad_norm": 4.655788421630859, "learning_rate": 4.630497761390572e-05, "loss": 0.6612, "step": 2811 }, { "epoch": 0.07405846721095602, "grad_norm": 2.373643636703491, "learning_rate": 4.630366078483013e-05, "loss": 1.4071, "step": 2812 }, { "epoch": 0.07408480379246773, "grad_norm": 1.9067589044570923, "learning_rate": 
4.630234395575455e-05, "loss": 2.3246, "step": 2813 }, { "epoch": 0.07411114037397945, "grad_norm": 1.7788095474243164, "learning_rate": 4.630102712667896e-05, "loss": 0.5815, "step": 2814 }, { "epoch": 0.07413747695549118, "grad_norm": 2.4540064334869385, "learning_rate": 4.629971029760337e-05, "loss": 2.1468, "step": 2815 }, { "epoch": 0.0741638135370029, "grad_norm": 3.333746910095215, "learning_rate": 4.629839346852779e-05, "loss": 1.8047, "step": 2816 }, { "epoch": 0.07419015011851461, "grad_norm": 3.1054818630218506, "learning_rate": 4.6297076639452204e-05, "loss": 1.43, "step": 2817 }, { "epoch": 0.07421648670002634, "grad_norm": 1.972234845161438, "learning_rate": 4.629575981037661e-05, "loss": 1.26, "step": 2818 }, { "epoch": 0.07424282328153806, "grad_norm": 2.5473155975341797, "learning_rate": 4.629444298130103e-05, "loss": 1.6786, "step": 2819 }, { "epoch": 0.07426915986304977, "grad_norm": 2.884748697280884, "learning_rate": 4.6293126152225444e-05, "loss": 1.4978, "step": 2820 }, { "epoch": 0.0742954964445615, "grad_norm": 2.6371817588806152, "learning_rate": 4.629180932314986e-05, "loss": 1.6876, "step": 2821 }, { "epoch": 0.07432183302607322, "grad_norm": 3.1315529346466064, "learning_rate": 4.6290492494074276e-05, "loss": 2.1545, "step": 2822 }, { "epoch": 0.07434816960758493, "grad_norm": 3.07360577583313, "learning_rate": 4.6289175664998684e-05, "loss": 1.89, "step": 2823 }, { "epoch": 0.07437450618909666, "grad_norm": 2.0975546836853027, "learning_rate": 4.62878588359231e-05, "loss": 0.5298, "step": 2824 }, { "epoch": 0.07440084277060838, "grad_norm": 2.6233761310577393, "learning_rate": 4.628654200684751e-05, "loss": 1.6488, "step": 2825 }, { "epoch": 0.07442717935212009, "grad_norm": 1.8994566202163696, "learning_rate": 4.628522517777193e-05, "loss": 1.9329, "step": 2826 }, { "epoch": 0.07445351593363181, "grad_norm": 1.836676836013794, "learning_rate": 4.628390834869634e-05, "loss": 1.715, "step": 2827 }, { "epoch": 0.07447985251514354, 
"grad_norm": 1.7107295989990234, "learning_rate": 4.6282591519620756e-05, "loss": 2.1464, "step": 2828 }, { "epoch": 0.07450618909665525, "grad_norm": 3.448990821838379, "learning_rate": 4.6281274690545164e-05, "loss": 1.5147, "step": 2829 }, { "epoch": 0.07453252567816697, "grad_norm": 2.2580504417419434, "learning_rate": 4.627995786146959e-05, "loss": 2.1167, "step": 2830 }, { "epoch": 0.0745588622596787, "grad_norm": 2.84763240814209, "learning_rate": 4.6278641032393996e-05, "loss": 1.2402, "step": 2831 }, { "epoch": 0.07458519884119041, "grad_norm": 2.063436508178711, "learning_rate": 4.627732420331841e-05, "loss": 2.049, "step": 2832 }, { "epoch": 0.07461153542270213, "grad_norm": 4.745668411254883, "learning_rate": 4.627600737424283e-05, "loss": 1.9608, "step": 2833 }, { "epoch": 0.07463787200421386, "grad_norm": 3.4372541904449463, "learning_rate": 4.6274690545167236e-05, "loss": 1.491, "step": 2834 }, { "epoch": 0.07466420858572557, "grad_norm": 2.8895318508148193, "learning_rate": 4.627337371609166e-05, "loss": 1.6019, "step": 2835 }, { "epoch": 0.07469054516723729, "grad_norm": 4.08427619934082, "learning_rate": 4.627205688701607e-05, "loss": 1.1404, "step": 2836 }, { "epoch": 0.074716881748749, "grad_norm": 2.6643011569976807, "learning_rate": 4.627074005794048e-05, "loss": 1.5322, "step": 2837 }, { "epoch": 0.07474321833026074, "grad_norm": 9.928352355957031, "learning_rate": 4.626942322886489e-05, "loss": 1.3646, "step": 2838 }, { "epoch": 0.07476955491177245, "grad_norm": 2.4245355129241943, "learning_rate": 4.626810639978931e-05, "loss": 1.6984, "step": 2839 }, { "epoch": 0.07479589149328417, "grad_norm": 3.102386474609375, "learning_rate": 4.626678957071372e-05, "loss": 0.578, "step": 2840 }, { "epoch": 0.0748222280747959, "grad_norm": 4.1624555587768555, "learning_rate": 4.626547274163814e-05, "loss": 1.5494, "step": 2841 }, { "epoch": 0.07484856465630761, "grad_norm": 6.126819610595703, "learning_rate": 4.6264155912562554e-05, "loss": 2.2051, 
"step": 2842 }, { "epoch": 0.07487490123781933, "grad_norm": 2.342970848083496, "learning_rate": 4.626283908348696e-05, "loss": 1.99, "step": 2843 }, { "epoch": 0.07490123781933106, "grad_norm": 2.877779006958008, "learning_rate": 4.626152225441138e-05, "loss": 1.5391, "step": 2844 }, { "epoch": 0.07492757440084277, "grad_norm": 3.9619438648223877, "learning_rate": 4.6260205425335794e-05, "loss": 0.6488, "step": 2845 }, { "epoch": 0.07495391098235449, "grad_norm": 2.223205327987671, "learning_rate": 4.625888859626021e-05, "loss": 1.1693, "step": 2846 }, { "epoch": 0.0749802475638662, "grad_norm": 3.2557179927825928, "learning_rate": 4.625757176718462e-05, "loss": 1.8447, "step": 2847 }, { "epoch": 0.07500658414537793, "grad_norm": 3.4557149410247803, "learning_rate": 4.6256254938109034e-05, "loss": 1.9351, "step": 2848 }, { "epoch": 0.07503292072688965, "grad_norm": 2.156149387359619, "learning_rate": 4.625493810903345e-05, "loss": 1.7212, "step": 2849 }, { "epoch": 0.07505925730840136, "grad_norm": 3.2540628910064697, "learning_rate": 4.6253621279957865e-05, "loss": 2.0247, "step": 2850 }, { "epoch": 0.0750855938899131, "grad_norm": 4.749756813049316, "learning_rate": 4.625230445088228e-05, "loss": 1.2521, "step": 2851 }, { "epoch": 0.07511193047142481, "grad_norm": 2.281892776489258, "learning_rate": 4.625098762180669e-05, "loss": 1.8424, "step": 2852 }, { "epoch": 0.07513826705293652, "grad_norm": 4.3686957359313965, "learning_rate": 4.6249670792731105e-05, "loss": 0.5506, "step": 2853 }, { "epoch": 0.07516460363444825, "grad_norm": 2.1627280712127686, "learning_rate": 4.624835396365552e-05, "loss": 2.2248, "step": 2854 }, { "epoch": 0.07519094021595997, "grad_norm": 2.4081296920776367, "learning_rate": 4.6247037134579936e-05, "loss": 1.6437, "step": 2855 }, { "epoch": 0.07521727679747169, "grad_norm": 1.8965318202972412, "learning_rate": 4.6245720305504345e-05, "loss": 1.3987, "step": 2856 }, { "epoch": 0.0752436133789834, "grad_norm": 2.997220754623413, 
"learning_rate": 4.624440347642876e-05, "loss": 2.5019, "step": 2857 }, { "epoch": 0.07526994996049513, "grad_norm": 2.150428533554077, "learning_rate": 4.6243086647353176e-05, "loss": 1.9793, "step": 2858 }, { "epoch": 0.07529628654200685, "grad_norm": 3.462949275970459, "learning_rate": 4.624176981827759e-05, "loss": 2.0437, "step": 2859 }, { "epoch": 0.07532262312351856, "grad_norm": 3.389911651611328, "learning_rate": 4.624045298920201e-05, "loss": 1.4694, "step": 2860 }, { "epoch": 0.07534895970503029, "grad_norm": 2.236562728881836, "learning_rate": 4.6239136160126416e-05, "loss": 2.0452, "step": 2861 }, { "epoch": 0.075375296286542, "grad_norm": 7.113264560699463, "learning_rate": 4.623781933105083e-05, "loss": 2.1405, "step": 2862 }, { "epoch": 0.07540163286805372, "grad_norm": 3.2841694355010986, "learning_rate": 4.623650250197525e-05, "loss": 1.3068, "step": 2863 }, { "epoch": 0.07542796944956545, "grad_norm": 7.079731464385986, "learning_rate": 4.623518567289966e-05, "loss": 1.5962, "step": 2864 }, { "epoch": 0.07545430603107717, "grad_norm": 2.9107322692871094, "learning_rate": 4.623386884382407e-05, "loss": 1.1992, "step": 2865 }, { "epoch": 0.07548064261258888, "grad_norm": 2.7110557556152344, "learning_rate": 4.623255201474849e-05, "loss": 1.9561, "step": 2866 }, { "epoch": 0.07550697919410061, "grad_norm": 2.516211986541748, "learning_rate": 4.62312351856729e-05, "loss": 2.2882, "step": 2867 }, { "epoch": 0.07553331577561233, "grad_norm": 2.0445072650909424, "learning_rate": 4.622991835659732e-05, "loss": 1.7402, "step": 2868 }, { "epoch": 0.07555965235712404, "grad_norm": 5.0942559242248535, "learning_rate": 4.6228601527521734e-05, "loss": 1.6377, "step": 2869 }, { "epoch": 0.07558598893863576, "grad_norm": 4.013828754425049, "learning_rate": 4.622728469844614e-05, "loss": 1.3201, "step": 2870 }, { "epoch": 0.07561232552014749, "grad_norm": 2.3567638397216797, "learning_rate": 4.622596786937056e-05, "loss": 0.6248, "step": 2871 }, { "epoch": 
0.0756386621016592, "grad_norm": 4.042812824249268, "learning_rate": 4.622465104029497e-05, "loss": 1.4718, "step": 2872 }, { "epoch": 0.07566499868317092, "grad_norm": 2.1142780780792236, "learning_rate": 4.622333421121939e-05, "loss": 1.9396, "step": 2873 }, { "epoch": 0.07569133526468265, "grad_norm": 2.020040512084961, "learning_rate": 4.62220173821438e-05, "loss": 1.9907, "step": 2874 }, { "epoch": 0.07571767184619436, "grad_norm": 3.9280433654785156, "learning_rate": 4.6220700553068214e-05, "loss": 0.4253, "step": 2875 }, { "epoch": 0.07574400842770608, "grad_norm": 2.530353307723999, "learning_rate": 4.621938372399262e-05, "loss": 1.4493, "step": 2876 }, { "epoch": 0.07577034500921781, "grad_norm": 3.8539600372314453, "learning_rate": 4.621806689491704e-05, "loss": 1.9191, "step": 2877 }, { "epoch": 0.07579668159072953, "grad_norm": 1.726455807685852, "learning_rate": 4.621675006584146e-05, "loss": 1.4166, "step": 2878 }, { "epoch": 0.07582301817224124, "grad_norm": 1.8120595216751099, "learning_rate": 4.621543323676587e-05, "loss": 0.8819, "step": 2879 }, { "epoch": 0.07584935475375296, "grad_norm": 2.1814632415771484, "learning_rate": 4.6214116407690285e-05, "loss": 0.7063, "step": 2880 }, { "epoch": 0.07587569133526469, "grad_norm": 2.7255566120147705, "learning_rate": 4.6212799578614694e-05, "loss": 1.6888, "step": 2881 }, { "epoch": 0.0759020279167764, "grad_norm": 3.285147190093994, "learning_rate": 4.6211482749539117e-05, "loss": 1.525, "step": 2882 }, { "epoch": 0.07592836449828812, "grad_norm": 2.5586330890655518, "learning_rate": 4.6210165920463525e-05, "loss": 1.696, "step": 2883 }, { "epoch": 0.07595470107979985, "grad_norm": 2.305975914001465, "learning_rate": 4.620884909138794e-05, "loss": 1.3767, "step": 2884 }, { "epoch": 0.07598103766131156, "grad_norm": 7.872295379638672, "learning_rate": 4.620753226231235e-05, "loss": 0.7357, "step": 2885 }, { "epoch": 0.07600737424282328, "grad_norm": 1.8039056062698364, "learning_rate": 
4.6206215433236765e-05, "loss": 1.8057, "step": 2886 }, { "epoch": 0.07603371082433501, "grad_norm": 3.1082780361175537, "learning_rate": 4.620489860416118e-05, "loss": 1.4222, "step": 2887 }, { "epoch": 0.07606004740584672, "grad_norm": 1.9146350622177124, "learning_rate": 4.62035817750856e-05, "loss": 1.9234, "step": 2888 }, { "epoch": 0.07608638398735844, "grad_norm": 2.7180566787719727, "learning_rate": 4.620226494601001e-05, "loss": 1.8174, "step": 2889 }, { "epoch": 0.07611272056887015, "grad_norm": 2.9868123531341553, "learning_rate": 4.620094811693442e-05, "loss": 1.9753, "step": 2890 }, { "epoch": 0.07613905715038188, "grad_norm": 2.3932347297668457, "learning_rate": 4.619963128785884e-05, "loss": 1.0506, "step": 2891 }, { "epoch": 0.0761653937318936, "grad_norm": 2.998116970062256, "learning_rate": 4.619831445878325e-05, "loss": 1.4265, "step": 2892 }, { "epoch": 0.07619173031340531, "grad_norm": 2.2100486755371094, "learning_rate": 4.619699762970767e-05, "loss": 1.5486, "step": 2893 }, { "epoch": 0.07621806689491704, "grad_norm": 2.5608062744140625, "learning_rate": 4.619568080063208e-05, "loss": 1.167, "step": 2894 }, { "epoch": 0.07624440347642876, "grad_norm": 6.639842510223389, "learning_rate": 4.619436397155649e-05, "loss": 1.3116, "step": 2895 }, { "epoch": 0.07627074005794048, "grad_norm": 2.0278637409210205, "learning_rate": 4.619304714248091e-05, "loss": 1.5837, "step": 2896 }, { "epoch": 0.0762970766394522, "grad_norm": 8.712678909301758, "learning_rate": 4.6191730313405323e-05, "loss": 1.5904, "step": 2897 }, { "epoch": 0.07632341322096392, "grad_norm": 3.8720109462738037, "learning_rate": 4.619041348432974e-05, "loss": 1.6175, "step": 2898 }, { "epoch": 0.07634974980247564, "grad_norm": 4.7755327224731445, "learning_rate": 4.618909665525415e-05, "loss": 1.3538, "step": 2899 }, { "epoch": 0.07637608638398735, "grad_norm": 2.2061166763305664, "learning_rate": 4.6187779826178563e-05, "loss": 1.4728, "step": 2900 }, { "epoch": 
0.07640242296549908, "grad_norm": 7.621268272399902, "learning_rate": 4.618646299710298e-05, "loss": 1.6505, "step": 2901 }, { "epoch": 0.0764287595470108, "grad_norm": 3.170424461364746, "learning_rate": 4.6185146168027395e-05, "loss": 0.6962, "step": 2902 }, { "epoch": 0.07645509612852251, "grad_norm": 4.69133996963501, "learning_rate": 4.6183829338951804e-05, "loss": 1.7371, "step": 2903 }, { "epoch": 0.07648143271003424, "grad_norm": 3.802349328994751, "learning_rate": 4.618251250987622e-05, "loss": 0.4832, "step": 2904 }, { "epoch": 0.07650776929154596, "grad_norm": 2.538747787475586, "learning_rate": 4.6181195680800635e-05, "loss": 1.9829, "step": 2905 }, { "epoch": 0.07653410587305767, "grad_norm": 3.03297758102417, "learning_rate": 4.617987885172505e-05, "loss": 1.972, "step": 2906 }, { "epoch": 0.0765604424545694, "grad_norm": 2.6025068759918213, "learning_rate": 4.6178562022649466e-05, "loss": 1.4485, "step": 2907 }, { "epoch": 0.07658677903608112, "grad_norm": 2.474449396133423, "learning_rate": 4.6177245193573875e-05, "loss": 1.5531, "step": 2908 }, { "epoch": 0.07661311561759283, "grad_norm": 2.2864882946014404, "learning_rate": 4.617592836449829e-05, "loss": 2.2275, "step": 2909 }, { "epoch": 0.07663945219910456, "grad_norm": 5.040701866149902, "learning_rate": 4.61746115354227e-05, "loss": 2.3351, "step": 2910 }, { "epoch": 0.07666578878061628, "grad_norm": 2.6368234157562256, "learning_rate": 4.617329470634712e-05, "loss": 1.7002, "step": 2911 }, { "epoch": 0.076692125362128, "grad_norm": 3.585076093673706, "learning_rate": 4.617197787727153e-05, "loss": 1.1735, "step": 2912 }, { "epoch": 0.07671846194363971, "grad_norm": 2.4913384914398193, "learning_rate": 4.6170661048195946e-05, "loss": 1.3096, "step": 2913 }, { "epoch": 0.07674479852515144, "grad_norm": 2.65311336517334, "learning_rate": 4.616934421912036e-05, "loss": 0.9502, "step": 2914 }, { "epoch": 0.07677113510666315, "grad_norm": 2.1741271018981934, "learning_rate": 4.616802739004478e-05, 
"loss": 1.5825, "step": 2915 }, { "epoch": 0.07679747168817487, "grad_norm": 2.6662893295288086, "learning_rate": 4.616671056096919e-05, "loss": 1.4234, "step": 2916 }, { "epoch": 0.0768238082696866, "grad_norm": 2.2269794940948486, "learning_rate": 4.61653937318936e-05, "loss": 2.6377, "step": 2917 }, { "epoch": 0.07685014485119832, "grad_norm": 3.0671260356903076, "learning_rate": 4.616407690281802e-05, "loss": 1.3336, "step": 2918 }, { "epoch": 0.07687648143271003, "grad_norm": 4.627152442932129, "learning_rate": 4.6162760073742426e-05, "loss": 1.2017, "step": 2919 }, { "epoch": 0.07690281801422176, "grad_norm": 2.6742775440216064, "learning_rate": 4.616144324466685e-05, "loss": 1.9193, "step": 2920 }, { "epoch": 0.07692915459573348, "grad_norm": 4.334056377410889, "learning_rate": 4.616012641559126e-05, "loss": 1.7124, "step": 2921 }, { "epoch": 0.07695549117724519, "grad_norm": 3.9531803131103516, "learning_rate": 4.615880958651567e-05, "loss": 1.9557, "step": 2922 }, { "epoch": 0.07698182775875691, "grad_norm": 1.8117527961730957, "learning_rate": 4.615749275744009e-05, "loss": 2.0753, "step": 2923 }, { "epoch": 0.07700816434026864, "grad_norm": 2.708638906478882, "learning_rate": 4.61561759283645e-05, "loss": 2.0953, "step": 2924 }, { "epoch": 0.07703450092178035, "grad_norm": 4.326902866363525, "learning_rate": 4.615485909928892e-05, "loss": 2.238, "step": 2925 }, { "epoch": 0.07706083750329207, "grad_norm": 2.4174532890319824, "learning_rate": 4.615354227021333e-05, "loss": 1.7706, "step": 2926 }, { "epoch": 0.0770871740848038, "grad_norm": 2.5048410892486572, "learning_rate": 4.6152225441137744e-05, "loss": 2.3998, "step": 2927 }, { "epoch": 0.07711351066631551, "grad_norm": 1.947087049484253, "learning_rate": 4.615090861206215e-05, "loss": 2.1922, "step": 2928 }, { "epoch": 0.07713984724782723, "grad_norm": 2.6374828815460205, "learning_rate": 4.6149591782986575e-05, "loss": 1.994, "step": 2929 }, { "epoch": 0.07716618382933896, "grad_norm": 
2.377981424331665, "learning_rate": 4.6148274953910984e-05, "loss": 1.6372, "step": 2930 }, { "epoch": 0.07719252041085067, "grad_norm": 2.6368439197540283, "learning_rate": 4.61469581248354e-05, "loss": 1.0267, "step": 2931 }, { "epoch": 0.07721885699236239, "grad_norm": 2.7755324840545654, "learning_rate": 4.614564129575981e-05, "loss": 2.0596, "step": 2932 }, { "epoch": 0.0772451935738741, "grad_norm": 3.1608474254608154, "learning_rate": 4.6144324466684224e-05, "loss": 1.355, "step": 2933 }, { "epoch": 0.07727153015538583, "grad_norm": 1.9804837703704834, "learning_rate": 4.614300763760864e-05, "loss": 2.3291, "step": 2934 }, { "epoch": 0.07729786673689755, "grad_norm": 1.8469260931015015, "learning_rate": 4.6141690808533055e-05, "loss": 1.9021, "step": 2935 }, { "epoch": 0.07732420331840927, "grad_norm": 2.2760767936706543, "learning_rate": 4.614037397945747e-05, "loss": 2.091, "step": 2936 }, { "epoch": 0.077350539899921, "grad_norm": 2.4643969535827637, "learning_rate": 4.613905715038188e-05, "loss": 1.9725, "step": 2937 }, { "epoch": 0.07737687648143271, "grad_norm": 5.384614944458008, "learning_rate": 4.6137740321306295e-05, "loss": 1.4552, "step": 2938 }, { "epoch": 0.07740321306294443, "grad_norm": 4.860240459442139, "learning_rate": 4.613642349223071e-05, "loss": 0.9666, "step": 2939 }, { "epoch": 0.07742954964445616, "grad_norm": 2.0333573818206787, "learning_rate": 4.6135106663155126e-05, "loss": 1.0944, "step": 2940 }, { "epoch": 0.07745588622596787, "grad_norm": 4.997055530548096, "learning_rate": 4.6133789834079535e-05, "loss": 1.4165, "step": 2941 }, { "epoch": 0.07748222280747959, "grad_norm": 4.055838584899902, "learning_rate": 4.613247300500395e-05, "loss": 1.3761, "step": 2942 }, { "epoch": 0.0775085593889913, "grad_norm": 2.942561149597168, "learning_rate": 4.6131156175928366e-05, "loss": 1.5633, "step": 2943 }, { "epoch": 0.07753489597050303, "grad_norm": 2.882842779159546, "learning_rate": 4.612983934685278e-05, "loss": 1.5864, "step": 2944 
}, { "epoch": 0.07756123255201475, "grad_norm": 2.0620086193084717, "learning_rate": 4.61285225177772e-05, "loss": 1.9958, "step": 2945 }, { "epoch": 0.07758756913352646, "grad_norm": 4.141435146331787, "learning_rate": 4.6127205688701606e-05, "loss": 2.2422, "step": 2946 }, { "epoch": 0.07761390571503819, "grad_norm": 1.8920514583587646, "learning_rate": 4.612588885962602e-05, "loss": 1.7647, "step": 2947 }, { "epoch": 0.07764024229654991, "grad_norm": 1.988913655281067, "learning_rate": 4.612457203055044e-05, "loss": 1.7631, "step": 2948 }, { "epoch": 0.07766657887806162, "grad_norm": 11.161272048950195, "learning_rate": 4.612325520147485e-05, "loss": 1.6981, "step": 2949 }, { "epoch": 0.07769291545957335, "grad_norm": 3.5461437702178955, "learning_rate": 4.612193837239926e-05, "loss": 0.6758, "step": 2950 }, { "epoch": 0.07771925204108507, "grad_norm": 2.4620890617370605, "learning_rate": 4.612062154332368e-05, "loss": 2.3795, "step": 2951 }, { "epoch": 0.07774558862259678, "grad_norm": 2.7031571865081787, "learning_rate": 4.611930471424809e-05, "loss": 2.0075, "step": 2952 }, { "epoch": 0.07777192520410851, "grad_norm": 3.0769670009613037, "learning_rate": 4.611798788517251e-05, "loss": 1.2456, "step": 2953 }, { "epoch": 0.07779826178562023, "grad_norm": 7.008205413818359, "learning_rate": 4.6116671056096924e-05, "loss": 3.1135, "step": 2954 }, { "epoch": 0.07782459836713194, "grad_norm": 3.0412566661834717, "learning_rate": 4.611535422702133e-05, "loss": 1.9332, "step": 2955 }, { "epoch": 0.07785093494864366, "grad_norm": 2.492886781692505, "learning_rate": 4.611403739794575e-05, "loss": 0.3785, "step": 2956 }, { "epoch": 0.07787727153015539, "grad_norm": 2.2769527435302734, "learning_rate": 4.611272056887016e-05, "loss": 1.4016, "step": 2957 }, { "epoch": 0.0779036081116671, "grad_norm": 4.312548637390137, "learning_rate": 4.611140373979458e-05, "loss": 2.0011, "step": 2958 }, { "epoch": 0.07792994469317882, "grad_norm": 1.95980966091156, "learning_rate": 
4.611008691071899e-05, "loss": 2.3541, "step": 2959 }, { "epoch": 0.07795628127469055, "grad_norm": 3.9868710041046143, "learning_rate": 4.6108770081643404e-05, "loss": 1.0886, "step": 2960 }, { "epoch": 0.07798261785620227, "grad_norm": 1.928377628326416, "learning_rate": 4.610745325256782e-05, "loss": 1.8441, "step": 2961 }, { "epoch": 0.07800895443771398, "grad_norm": 2.732083797454834, "learning_rate": 4.6106136423492236e-05, "loss": 1.5392, "step": 2962 }, { "epoch": 0.07803529101922571, "grad_norm": 5.650703430175781, "learning_rate": 4.610481959441665e-05, "loss": 0.7904, "step": 2963 }, { "epoch": 0.07806162760073743, "grad_norm": 1.938529133796692, "learning_rate": 4.610350276534106e-05, "loss": 1.845, "step": 2964 }, { "epoch": 0.07808796418224914, "grad_norm": 6.214179992675781, "learning_rate": 4.6102185936265476e-05, "loss": 1.4523, "step": 2965 }, { "epoch": 0.07811430076376086, "grad_norm": 4.206273078918457, "learning_rate": 4.6100869107189885e-05, "loss": 0.8772, "step": 2966 }, { "epoch": 0.07814063734527259, "grad_norm": 2.002572536468506, "learning_rate": 4.609955227811431e-05, "loss": 2.2036, "step": 2967 }, { "epoch": 0.0781669739267843, "grad_norm": 2.311776638031006, "learning_rate": 4.6098235449038716e-05, "loss": 1.8375, "step": 2968 }, { "epoch": 0.07819331050829602, "grad_norm": 2.337829828262329, "learning_rate": 4.609691861996313e-05, "loss": 1.8979, "step": 2969 }, { "epoch": 0.07821964708980775, "grad_norm": 3.020416736602783, "learning_rate": 4.609560179088755e-05, "loss": 0.9351, "step": 2970 }, { "epoch": 0.07824598367131946, "grad_norm": 5.188978672027588, "learning_rate": 4.6094284961811956e-05, "loss": 1.3461, "step": 2971 }, { "epoch": 0.07827232025283118, "grad_norm": 1.8756476640701294, "learning_rate": 4.609296813273638e-05, "loss": 1.492, "step": 2972 }, { "epoch": 0.07829865683434291, "grad_norm": 2.01534366607666, "learning_rate": 4.609165130366079e-05, "loss": 2.0342, "step": 2973 }, { "epoch": 0.07832499341585462, 
"grad_norm": 2.973799467086792, "learning_rate": 4.60903344745852e-05, "loss": 0.8264, "step": 2974 }, { "epoch": 0.07835132999736634, "grad_norm": 2.7090418338775635, "learning_rate": 4.608901764550961e-05, "loss": 1.9434, "step": 2975 }, { "epoch": 0.07837766657887806, "grad_norm": 1.9312663078308105, "learning_rate": 4.6087700816434034e-05, "loss": 0.5209, "step": 2976 }, { "epoch": 0.07840400316038978, "grad_norm": 2.0510969161987305, "learning_rate": 4.608638398735844e-05, "loss": 2.0458, "step": 2977 }, { "epoch": 0.0784303397419015, "grad_norm": 5.430920124053955, "learning_rate": 4.608506715828286e-05, "loss": 1.4459, "step": 2978 }, { "epoch": 0.07845667632341322, "grad_norm": 7.730001449584961, "learning_rate": 4.608375032920727e-05, "loss": 1.1434, "step": 2979 }, { "epoch": 0.07848301290492495, "grad_norm": 3.235725164413452, "learning_rate": 4.608243350013168e-05, "loss": 1.8174, "step": 2980 }, { "epoch": 0.07850934948643666, "grad_norm": 3.830639362335205, "learning_rate": 4.60811166710561e-05, "loss": 1.165, "step": 2981 }, { "epoch": 0.07853568606794838, "grad_norm": 2.1870667934417725, "learning_rate": 4.6079799841980514e-05, "loss": 1.4659, "step": 2982 }, { "epoch": 0.0785620226494601, "grad_norm": 1.709317922592163, "learning_rate": 4.607848301290493e-05, "loss": 1.7562, "step": 2983 }, { "epoch": 0.07858835923097182, "grad_norm": 2.114741086959839, "learning_rate": 4.607716618382934e-05, "loss": 2.4639, "step": 2984 }, { "epoch": 0.07861469581248354, "grad_norm": 3.909815549850464, "learning_rate": 4.6075849354753754e-05, "loss": 1.3364, "step": 2985 }, { "epoch": 0.07864103239399525, "grad_norm": 2.1716513633728027, "learning_rate": 4.607453252567817e-05, "loss": 2.0291, "step": 2986 }, { "epoch": 0.07866736897550698, "grad_norm": 1.996850609779358, "learning_rate": 4.6073215696602585e-05, "loss": 1.5977, "step": 2987 }, { "epoch": 0.0786937055570187, "grad_norm": 2.8697946071624756, "learning_rate": 4.6071898867526994e-05, "loss": 0.8517, 
"step": 2988 }, { "epoch": 0.07872004213853041, "grad_norm": 2.9075510501861572, "learning_rate": 4.607058203845141e-05, "loss": 1.3262, "step": 2989 }, { "epoch": 0.07874637872004214, "grad_norm": 3.050605535507202, "learning_rate": 4.6069265209375825e-05, "loss": 0.8237, "step": 2990 }, { "epoch": 0.07877271530155386, "grad_norm": 3.7436141967773438, "learning_rate": 4.606794838030024e-05, "loss": 1.6384, "step": 2991 }, { "epoch": 0.07879905188306557, "grad_norm": 4.508068561553955, "learning_rate": 4.6066631551224656e-05, "loss": 1.618, "step": 2992 }, { "epoch": 0.0788253884645773, "grad_norm": 1.782735824584961, "learning_rate": 4.6065314722149065e-05, "loss": 1.6343, "step": 2993 }, { "epoch": 0.07885172504608902, "grad_norm": 1.9519214630126953, "learning_rate": 4.606399789307348e-05, "loss": 1.9377, "step": 2994 }, { "epoch": 0.07887806162760073, "grad_norm": 2.068739414215088, "learning_rate": 4.6062681063997896e-05, "loss": 1.2672, "step": 2995 }, { "epoch": 0.07890439820911246, "grad_norm": 5.3604021072387695, "learning_rate": 4.606136423492231e-05, "loss": 1.3025, "step": 2996 }, { "epoch": 0.07893073479062418, "grad_norm": 1.7631012201309204, "learning_rate": 4.606004740584672e-05, "loss": 1.6901, "step": 2997 }, { "epoch": 0.0789570713721359, "grad_norm": 4.020761966705322, "learning_rate": 4.6058730576771136e-05, "loss": 1.8172, "step": 2998 }, { "epoch": 0.07898340795364761, "grad_norm": 2.0464046001434326, "learning_rate": 4.605741374769555e-05, "loss": 2.3855, "step": 2999 }, { "epoch": 0.07900974453515934, "grad_norm": 2.5864334106445312, "learning_rate": 4.605609691861997e-05, "loss": 1.9189, "step": 3000 }, { "epoch": 0.07903608111667106, "grad_norm": 3.0215535163879395, "learning_rate": 4.605478008954438e-05, "loss": 0.5917, "step": 3001 }, { "epoch": 0.07906241769818277, "grad_norm": 3.3121135234832764, "learning_rate": 4.605346326046879e-05, "loss": 2.4299, "step": 3002 }, { "epoch": 0.0790887542796945, "grad_norm": 2.774287462234497, 
"learning_rate": 4.605214643139321e-05, "loss": 2.07, "step": 3003 }, { "epoch": 0.07911509086120622, "grad_norm": 8.307010650634766, "learning_rate": 4.6050829602317616e-05, "loss": 1.2052, "step": 3004 }, { "epoch": 0.07914142744271793, "grad_norm": 1.8780688047409058, "learning_rate": 4.604951277324204e-05, "loss": 1.8467, "step": 3005 }, { "epoch": 0.07916776402422966, "grad_norm": 3.9588537216186523, "learning_rate": 4.604819594416645e-05, "loss": 1.4457, "step": 3006 }, { "epoch": 0.07919410060574138, "grad_norm": 1.9787161350250244, "learning_rate": 4.604687911509086e-05, "loss": 1.7519, "step": 3007 }, { "epoch": 0.07922043718725309, "grad_norm": 2.7040998935699463, "learning_rate": 4.604556228601528e-05, "loss": 1.5914, "step": 3008 }, { "epoch": 0.07924677376876481, "grad_norm": 2.356250047683716, "learning_rate": 4.6044245456939694e-05, "loss": 1.4581, "step": 3009 }, { "epoch": 0.07927311035027654, "grad_norm": 7.49885892868042, "learning_rate": 4.604292862786411e-05, "loss": 2.0232, "step": 3010 }, { "epoch": 0.07929944693178825, "grad_norm": 3.5787158012390137, "learning_rate": 4.604161179878852e-05, "loss": 1.452, "step": 3011 }, { "epoch": 0.07932578351329997, "grad_norm": 2.3142282962799072, "learning_rate": 4.6040294969712934e-05, "loss": 1.9553, "step": 3012 }, { "epoch": 0.0793521200948117, "grad_norm": 3.6337833404541016, "learning_rate": 4.603897814063734e-05, "loss": 1.4466, "step": 3013 }, { "epoch": 0.07937845667632341, "grad_norm": 3.2178382873535156, "learning_rate": 4.6037661311561765e-05, "loss": 2.0248, "step": 3014 }, { "epoch": 0.07940479325783513, "grad_norm": 2.0973870754241943, "learning_rate": 4.6036344482486174e-05, "loss": 2.4394, "step": 3015 }, { "epoch": 0.07943112983934686, "grad_norm": 3.0410025119781494, "learning_rate": 4.603502765341059e-05, "loss": 0.9493, "step": 3016 }, { "epoch": 0.07945746642085857, "grad_norm": 2.0005555152893066, "learning_rate": 4.6033710824335005e-05, "loss": 1.7615, "step": 3017 }, { "epoch": 
0.07948380300237029, "grad_norm": 2.3067333698272705, "learning_rate": 4.6032393995259414e-05, "loss": 1.8746, "step": 3018 }, { "epoch": 0.079510139583882, "grad_norm": 3.151745557785034, "learning_rate": 4.603107716618384e-05, "loss": 1.1948, "step": 3019 }, { "epoch": 0.07953647616539374, "grad_norm": 2.040066719055176, "learning_rate": 4.6029760337108245e-05, "loss": 1.383, "step": 3020 }, { "epoch": 0.07956281274690545, "grad_norm": 5.638552188873291, "learning_rate": 4.602844350803266e-05, "loss": 1.2643, "step": 3021 }, { "epoch": 0.07958914932841717, "grad_norm": 3.586662769317627, "learning_rate": 4.602712667895707e-05, "loss": 1.6929, "step": 3022 }, { "epoch": 0.0796154859099289, "grad_norm": 2.253549098968506, "learning_rate": 4.6025809849881485e-05, "loss": 1.6356, "step": 3023 }, { "epoch": 0.07964182249144061, "grad_norm": 2.4989919662475586, "learning_rate": 4.60244930208059e-05, "loss": 1.3934, "step": 3024 }, { "epoch": 0.07966815907295233, "grad_norm": 2.0018396377563477, "learning_rate": 4.602317619173032e-05, "loss": 2.1064, "step": 3025 }, { "epoch": 0.07969449565446406, "grad_norm": 3.5105741024017334, "learning_rate": 4.602185936265473e-05, "loss": 0.7057, "step": 3026 }, { "epoch": 0.07972083223597577, "grad_norm": 2.9432575702667236, "learning_rate": 4.602054253357914e-05, "loss": 2.017, "step": 3027 }, { "epoch": 0.07974716881748749, "grad_norm": 2.45501446723938, "learning_rate": 4.6019225704503563e-05, "loss": 1.8069, "step": 3028 }, { "epoch": 0.0797735053989992, "grad_norm": 2.6317880153656006, "learning_rate": 4.601790887542797e-05, "loss": 1.1185, "step": 3029 }, { "epoch": 0.07979984198051093, "grad_norm": 1.8810955286026, "learning_rate": 4.601659204635239e-05, "loss": 0.757, "step": 3030 }, { "epoch": 0.07982617856202265, "grad_norm": 1.8643970489501953, "learning_rate": 4.60152752172768e-05, "loss": 2.212, "step": 3031 }, { "epoch": 0.07985251514353436, "grad_norm": 4.852502822875977, "learning_rate": 4.601395838820121e-05, 
"loss": 1.0029, "step": 3032 }, { "epoch": 0.0798788517250461, "grad_norm": 2.6382932662963867, "learning_rate": 4.601264155912563e-05, "loss": 1.3618, "step": 3033 }, { "epoch": 0.07990518830655781, "grad_norm": 2.009019136428833, "learning_rate": 4.6011324730050044e-05, "loss": 1.7227, "step": 3034 }, { "epoch": 0.07993152488806952, "grad_norm": 2.0312139987945557, "learning_rate": 4.601000790097445e-05, "loss": 1.0722, "step": 3035 }, { "epoch": 0.07995786146958125, "grad_norm": 2.287750720977783, "learning_rate": 4.600869107189887e-05, "loss": 2.1844, "step": 3036 }, { "epoch": 0.07998419805109297, "grad_norm": 2.407707929611206, "learning_rate": 4.6007374242823284e-05, "loss": 2.1177, "step": 3037 }, { "epoch": 0.08001053463260469, "grad_norm": 4.104620933532715, "learning_rate": 4.60060574137477e-05, "loss": 1.6166, "step": 3038 }, { "epoch": 0.08003687121411641, "grad_norm": 1.9677842855453491, "learning_rate": 4.6004740584672115e-05, "loss": 2.0887, "step": 3039 }, { "epoch": 0.08006320779562813, "grad_norm": 3.8040413856506348, "learning_rate": 4.6003423755596524e-05, "loss": 1.3351, "step": 3040 }, { "epoch": 0.08008954437713985, "grad_norm": 3.152820587158203, "learning_rate": 4.600210692652094e-05, "loss": 0.5594, "step": 3041 }, { "epoch": 0.08011588095865156, "grad_norm": 2.030339241027832, "learning_rate": 4.6000790097445355e-05, "loss": 1.8194, "step": 3042 }, { "epoch": 0.08014221754016329, "grad_norm": 2.9567949771881104, "learning_rate": 4.599947326836977e-05, "loss": 2.0513, "step": 3043 }, { "epoch": 0.080168554121675, "grad_norm": 2.779965400695801, "learning_rate": 4.599815643929418e-05, "loss": 1.0721, "step": 3044 }, { "epoch": 0.08019489070318672, "grad_norm": 3.786208391189575, "learning_rate": 4.5996839610218595e-05, "loss": 0.5259, "step": 3045 }, { "epoch": 0.08022122728469845, "grad_norm": 2.9450812339782715, "learning_rate": 4.599552278114301e-05, "loss": 0.7078, "step": 3046 }, { "epoch": 0.08024756386621017, "grad_norm": 
1.8191871643066406, "learning_rate": 4.5994205952067426e-05, "loss": 1.6206, "step": 3047 }, { "epoch": 0.08027390044772188, "grad_norm": 3.709888219833374, "learning_rate": 4.599288912299184e-05, "loss": 1.5232, "step": 3048 }, { "epoch": 0.08030023702923361, "grad_norm": 3.206434726715088, "learning_rate": 4.599157229391625e-05, "loss": 0.6644, "step": 3049 }, { "epoch": 0.08032657361074533, "grad_norm": 8.217453002929688, "learning_rate": 4.5990255464840666e-05, "loss": 2.3626, "step": 3050 }, { "epoch": 0.08035291019225704, "grad_norm": 2.6956565380096436, "learning_rate": 4.5988938635765075e-05, "loss": 2.2998, "step": 3051 }, { "epoch": 0.08037924677376876, "grad_norm": 2.4701461791992188, "learning_rate": 4.59876218066895e-05, "loss": 1.7273, "step": 3052 }, { "epoch": 0.08040558335528049, "grad_norm": 4.820345878601074, "learning_rate": 4.5986304977613906e-05, "loss": 2.4332, "step": 3053 }, { "epoch": 0.0804319199367922, "grad_norm": 3.2415659427642822, "learning_rate": 4.598498814853832e-05, "loss": 1.9357, "step": 3054 }, { "epoch": 0.08045825651830392, "grad_norm": 3.080388307571411, "learning_rate": 4.598367131946274e-05, "loss": 2.0317, "step": 3055 }, { "epoch": 0.08048459309981565, "grad_norm": 2.7453036308288574, "learning_rate": 4.5982354490387146e-05, "loss": 1.9507, "step": 3056 }, { "epoch": 0.08051092968132736, "grad_norm": 1.9261882305145264, "learning_rate": 4.598103766131157e-05, "loss": 1.6568, "step": 3057 }, { "epoch": 0.08053726626283908, "grad_norm": 2.7540392875671387, "learning_rate": 4.597972083223598e-05, "loss": 1.5972, "step": 3058 }, { "epoch": 0.08056360284435081, "grad_norm": 6.094642162322998, "learning_rate": 4.597840400316039e-05, "loss": 1.7056, "step": 3059 }, { "epoch": 0.08058993942586253, "grad_norm": 1.748673915863037, "learning_rate": 4.59770871740848e-05, "loss": 1.7786, "step": 3060 }, { "epoch": 0.08061627600737424, "grad_norm": 2.5155184268951416, "learning_rate": 4.5975770345009224e-05, "loss": 2.1524, "step": 
3061 }, { "epoch": 0.08064261258888596, "grad_norm": 5.381051540374756, "learning_rate": 4.597445351593363e-05, "loss": 1.9852, "step": 3062 }, { "epoch": 0.08066894917039769, "grad_norm": 1.9428106546401978, "learning_rate": 4.597313668685805e-05, "loss": 1.9864, "step": 3063 }, { "epoch": 0.0806952857519094, "grad_norm": 3.8232650756835938, "learning_rate": 4.5971819857782464e-05, "loss": 1.4981, "step": 3064 }, { "epoch": 0.08072162233342112, "grad_norm": 3.798069953918457, "learning_rate": 4.597050302870687e-05, "loss": 1.5659, "step": 3065 }, { "epoch": 0.08074795891493285, "grad_norm": 3.831735372543335, "learning_rate": 4.5969186199631295e-05, "loss": 0.9129, "step": 3066 }, { "epoch": 0.08077429549644456, "grad_norm": 2.117058277130127, "learning_rate": 4.5967869370555704e-05, "loss": 1.9879, "step": 3067 }, { "epoch": 0.08080063207795628, "grad_norm": 2.2150096893310547, "learning_rate": 4.596655254148012e-05, "loss": 2.1295, "step": 3068 }, { "epoch": 0.08082696865946801, "grad_norm": 3.386474132537842, "learning_rate": 4.596523571240453e-05, "loss": 1.8026, "step": 3069 }, { "epoch": 0.08085330524097972, "grad_norm": 3.0243096351623535, "learning_rate": 4.5963918883328944e-05, "loss": 1.653, "step": 3070 }, { "epoch": 0.08087964182249144, "grad_norm": 4.91994571685791, "learning_rate": 4.596260205425336e-05, "loss": 0.8772, "step": 3071 }, { "epoch": 0.08090597840400315, "grad_norm": 7.770267963409424, "learning_rate": 4.5961285225177775e-05, "loss": 1.2967, "step": 3072 }, { "epoch": 0.08093231498551488, "grad_norm": 2.2486369609832764, "learning_rate": 4.595996839610219e-05, "loss": 1.8587, "step": 3073 }, { "epoch": 0.0809586515670266, "grad_norm": 2.6649832725524902, "learning_rate": 4.59586515670266e-05, "loss": 1.575, "step": 3074 }, { "epoch": 0.08098498814853831, "grad_norm": 2.645906686782837, "learning_rate": 4.595733473795102e-05, "loss": 1.1733, "step": 3075 }, { "epoch": 0.08101132473005004, "grad_norm": 5.157594680786133, "learning_rate": 
4.595601790887543e-05, "loss": 0.7833, "step": 3076 }, { "epoch": 0.08103766131156176, "grad_norm": 2.006969690322876, "learning_rate": 4.5954701079799846e-05, "loss": 1.4374, "step": 3077 }, { "epoch": 0.08106399789307348, "grad_norm": 2.1734561920166016, "learning_rate": 4.5953384250724255e-05, "loss": 2.2792, "step": 3078 }, { "epoch": 0.0810903344745852, "grad_norm": 2.1892194747924805, "learning_rate": 4.595206742164867e-05, "loss": 1.7573, "step": 3079 }, { "epoch": 0.08111667105609692, "grad_norm": 3.7811434268951416, "learning_rate": 4.5950750592573086e-05, "loss": 2.2416, "step": 3080 }, { "epoch": 0.08114300763760864, "grad_norm": 4.700692176818848, "learning_rate": 4.59494337634975e-05, "loss": 1.8576, "step": 3081 }, { "epoch": 0.08116934421912035, "grad_norm": 1.868679165840149, "learning_rate": 4.594811693442191e-05, "loss": 0.5832, "step": 3082 }, { "epoch": 0.08119568080063208, "grad_norm": 2.693378210067749, "learning_rate": 4.5946800105346326e-05, "loss": 1.9498, "step": 3083 }, { "epoch": 0.0812220173821438, "grad_norm": 3.871093511581421, "learning_rate": 4.594548327627074e-05, "loss": 1.1099, "step": 3084 }, { "epoch": 0.08124835396365551, "grad_norm": 6.053171634674072, "learning_rate": 4.594416644719516e-05, "loss": 1.9812, "step": 3085 }, { "epoch": 0.08127469054516724, "grad_norm": 2.1427392959594727, "learning_rate": 4.594284961811957e-05, "loss": 1.6059, "step": 3086 }, { "epoch": 0.08130102712667896, "grad_norm": 2.369745969772339, "learning_rate": 4.594153278904398e-05, "loss": 0.469, "step": 3087 }, { "epoch": 0.08132736370819067, "grad_norm": 2.579380512237549, "learning_rate": 4.59402159599684e-05, "loss": 0.7767, "step": 3088 }, { "epoch": 0.0813537002897024, "grad_norm": 2.1181387901306152, "learning_rate": 4.5938899130892807e-05, "loss": 1.9556, "step": 3089 }, { "epoch": 0.08138003687121412, "grad_norm": 9.613819122314453, "learning_rate": 4.593758230181723e-05, "loss": 2.4814, "step": 3090 }, { "epoch": 0.08140637345272583, 
"grad_norm": 2.4750685691833496, "learning_rate": 4.593626547274164e-05, "loss": 2.0728, "step": 3091 }, { "epoch": 0.08143271003423756, "grad_norm": 1.8456908464431763, "learning_rate": 4.593494864366605e-05, "loss": 1.4081, "step": 3092 }, { "epoch": 0.08145904661574928, "grad_norm": 2.7805752754211426, "learning_rate": 4.593363181459047e-05, "loss": 1.8494, "step": 3093 }, { "epoch": 0.081485383197261, "grad_norm": 2.2097983360290527, "learning_rate": 4.5932314985514885e-05, "loss": 1.4948, "step": 3094 }, { "epoch": 0.08151171977877271, "grad_norm": 3.662487745285034, "learning_rate": 4.59309981564393e-05, "loss": 0.5671, "step": 3095 }, { "epoch": 0.08153805636028444, "grad_norm": 2.623788356781006, "learning_rate": 4.592968132736371e-05, "loss": 1.7313, "step": 3096 }, { "epoch": 0.08156439294179615, "grad_norm": 1.7780513763427734, "learning_rate": 4.5928364498288125e-05, "loss": 1.422, "step": 3097 }, { "epoch": 0.08159072952330787, "grad_norm": 2.2921080589294434, "learning_rate": 4.592704766921253e-05, "loss": 2.0359, "step": 3098 }, { "epoch": 0.0816170661048196, "grad_norm": 2.3026371002197266, "learning_rate": 4.5925730840136956e-05, "loss": 1.9653, "step": 3099 }, { "epoch": 0.08164340268633132, "grad_norm": 1.8745219707489014, "learning_rate": 4.5924414011061365e-05, "loss": 1.7081, "step": 3100 }, { "epoch": 0.08166973926784303, "grad_norm": 2.1467669010162354, "learning_rate": 4.592309718198578e-05, "loss": 2.3063, "step": 3101 }, { "epoch": 0.08169607584935476, "grad_norm": 6.352311611175537, "learning_rate": 4.5921780352910196e-05, "loss": 0.8252, "step": 3102 }, { "epoch": 0.08172241243086648, "grad_norm": 2.388911724090576, "learning_rate": 4.5920463523834605e-05, "loss": 1.4659, "step": 3103 }, { "epoch": 0.08174874901237819, "grad_norm": 3.9200141429901123, "learning_rate": 4.591914669475903e-05, "loss": 1.8962, "step": 3104 }, { "epoch": 0.0817750855938899, "grad_norm": 2.021963357925415, "learning_rate": 4.5917829865683436e-05, "loss": 
2.1938, "step": 3105 }, { "epoch": 0.08180142217540164, "grad_norm": 1.8183528184890747, "learning_rate": 4.591651303660785e-05, "loss": 1.7039, "step": 3106 }, { "epoch": 0.08182775875691335, "grad_norm": 3.588721752166748, "learning_rate": 4.591519620753226e-05, "loss": 1.4668, "step": 3107 }, { "epoch": 0.08185409533842507, "grad_norm": 4.492329120635986, "learning_rate": 4.591387937845668e-05, "loss": 1.3517, "step": 3108 }, { "epoch": 0.0818804319199368, "grad_norm": 2.134694814682007, "learning_rate": 4.591256254938109e-05, "loss": 0.7819, "step": 3109 }, { "epoch": 0.08190676850144851, "grad_norm": 2.926333427429199, "learning_rate": 4.591124572030551e-05, "loss": 1.5406, "step": 3110 }, { "epoch": 0.08193310508296023, "grad_norm": 5.121904373168945, "learning_rate": 4.590992889122992e-05, "loss": 0.9217, "step": 3111 }, { "epoch": 0.08195944166447196, "grad_norm": 1.8765417337417603, "learning_rate": 4.590861206215433e-05, "loss": 2.0339, "step": 3112 }, { "epoch": 0.08198577824598367, "grad_norm": 5.389040470123291, "learning_rate": 4.5907295233078754e-05, "loss": 1.0556, "step": 3113 }, { "epoch": 0.08201211482749539, "grad_norm": 2.2882680892944336, "learning_rate": 4.590597840400316e-05, "loss": 1.8318, "step": 3114 }, { "epoch": 0.0820384514090071, "grad_norm": 2.079467296600342, "learning_rate": 4.590466157492758e-05, "loss": 1.736, "step": 3115 }, { "epoch": 0.08206478799051883, "grad_norm": 1.993194818496704, "learning_rate": 4.590334474585199e-05, "loss": 2.0398, "step": 3116 }, { "epoch": 0.08209112457203055, "grad_norm": 4.376698017120361, "learning_rate": 4.59020279167764e-05, "loss": 1.1316, "step": 3117 }, { "epoch": 0.08211746115354226, "grad_norm": 2.508852243423462, "learning_rate": 4.590071108770082e-05, "loss": 2.047, "step": 3118 }, { "epoch": 0.082143797735054, "grad_norm": 7.158236026763916, "learning_rate": 4.5899394258625234e-05, "loss": 0.8504, "step": 3119 }, { "epoch": 0.08217013431656571, "grad_norm": 3.2473576068878174, 
"learning_rate": 4.589807742954965e-05, "loss": 1.3946, "step": 3120 }, { "epoch": 0.08219647089807743, "grad_norm": 5.032447814941406, "learning_rate": 4.589676060047406e-05, "loss": 1.9325, "step": 3121 }, { "epoch": 0.08222280747958916, "grad_norm": 3.326627731323242, "learning_rate": 4.5895443771398474e-05, "loss": 1.6226, "step": 3122 }, { "epoch": 0.08224914406110087, "grad_norm": 3.977592945098877, "learning_rate": 4.589412694232289e-05, "loss": 1.6423, "step": 3123 }, { "epoch": 0.08227548064261259, "grad_norm": 2.88101863861084, "learning_rate": 4.5892810113247305e-05, "loss": 2.1351, "step": 3124 }, { "epoch": 0.0823018172241243, "grad_norm": 1.8409802913665771, "learning_rate": 4.5891493284171714e-05, "loss": 2.2556, "step": 3125 }, { "epoch": 0.08232815380563603, "grad_norm": 2.643402338027954, "learning_rate": 4.589017645509613e-05, "loss": 0.7815, "step": 3126 }, { "epoch": 0.08235449038714775, "grad_norm": 6.87968635559082, "learning_rate": 4.5888859626020545e-05, "loss": 2.7936, "step": 3127 }, { "epoch": 0.08238082696865946, "grad_norm": 5.262762069702148, "learning_rate": 4.588754279694496e-05, "loss": 1.8864, "step": 3128 }, { "epoch": 0.08240716355017119, "grad_norm": 2.5659544467926025, "learning_rate": 4.5886225967869376e-05, "loss": 2.2452, "step": 3129 }, { "epoch": 0.08243350013168291, "grad_norm": 1.4848806858062744, "learning_rate": 4.5884909138793785e-05, "loss": 1.3881, "step": 3130 }, { "epoch": 0.08245983671319462, "grad_norm": 3.090043783187866, "learning_rate": 4.58835923097182e-05, "loss": 0.4795, "step": 3131 }, { "epoch": 0.08248617329470635, "grad_norm": 5.4616241455078125, "learning_rate": 4.5882275480642616e-05, "loss": 1.7237, "step": 3132 }, { "epoch": 0.08251250987621807, "grad_norm": 3.349935293197632, "learning_rate": 4.588095865156703e-05, "loss": 1.4697, "step": 3133 }, { "epoch": 0.08253884645772978, "grad_norm": 3.1029186248779297, "learning_rate": 4.587964182249144e-05, "loss": 1.9954, "step": 3134 }, { "epoch": 
0.08256518303924151, "grad_norm": 1.8943114280700684, "learning_rate": 4.5878324993415856e-05, "loss": 1.7665, "step": 3135 }, { "epoch": 0.08259151962075323, "grad_norm": 2.9870240688323975, "learning_rate": 4.5877008164340265e-05, "loss": 1.3939, "step": 3136 }, { "epoch": 0.08261785620226494, "grad_norm": 2.2673745155334473, "learning_rate": 4.587569133526469e-05, "loss": 0.6984, "step": 3137 }, { "epoch": 0.08264419278377666, "grad_norm": 3.3405747413635254, "learning_rate": 4.5874374506189096e-05, "loss": 1.8298, "step": 3138 }, { "epoch": 0.08267052936528839, "grad_norm": 3.767385482788086, "learning_rate": 4.587305767711351e-05, "loss": 2.4307, "step": 3139 }, { "epoch": 0.0826968659468001, "grad_norm": 1.9182902574539185, "learning_rate": 4.587174084803793e-05, "loss": 2.1343, "step": 3140 }, { "epoch": 0.08272320252831182, "grad_norm": 3.1401264667510986, "learning_rate": 4.587042401896234e-05, "loss": 1.8234, "step": 3141 }, { "epoch": 0.08274953910982355, "grad_norm": 3.5475192070007324, "learning_rate": 4.586910718988676e-05, "loss": 1.2326, "step": 3142 }, { "epoch": 0.08277587569133527, "grad_norm": 2.9847376346588135, "learning_rate": 4.586779036081117e-05, "loss": 1.4619, "step": 3143 }, { "epoch": 0.08280221227284698, "grad_norm": 2.3415334224700928, "learning_rate": 4.586647353173558e-05, "loss": 1.7182, "step": 3144 }, { "epoch": 0.08282854885435871, "grad_norm": 2.0909996032714844, "learning_rate": 4.586515670265999e-05, "loss": 1.9498, "step": 3145 }, { "epoch": 0.08285488543587043, "grad_norm": 2.3579351902008057, "learning_rate": 4.5863839873584414e-05, "loss": 0.7382, "step": 3146 }, { "epoch": 0.08288122201738214, "grad_norm": 4.94607400894165, "learning_rate": 4.586252304450882e-05, "loss": 1.8791, "step": 3147 }, { "epoch": 0.08290755859889386, "grad_norm": 2.106743812561035, "learning_rate": 4.586120621543324e-05, "loss": 1.7887, "step": 3148 }, { "epoch": 0.08293389518040559, "grad_norm": 1.873420000076294, "learning_rate": 
4.5859889386357654e-05, "loss": 1.9993, "step": 3149 }, { "epoch": 0.0829602317619173, "grad_norm": 1.831897497177124, "learning_rate": 4.585857255728206e-05, "loss": 1.8923, "step": 3150 }, { "epoch": 0.08298656834342902, "grad_norm": 3.0289714336395264, "learning_rate": 4.5857255728206485e-05, "loss": 1.9192, "step": 3151 }, { "epoch": 0.08301290492494075, "grad_norm": 2.8784220218658447, "learning_rate": 4.5855938899130894e-05, "loss": 1.5155, "step": 3152 }, { "epoch": 0.08303924150645246, "grad_norm": 6.403001308441162, "learning_rate": 4.585462207005531e-05, "loss": 1.0225, "step": 3153 }, { "epoch": 0.08306557808796418, "grad_norm": 2.122565984725952, "learning_rate": 4.585330524097972e-05, "loss": 1.9004, "step": 3154 }, { "epoch": 0.08309191466947591, "grad_norm": 2.0493125915527344, "learning_rate": 4.5851988411904134e-05, "loss": 2.023, "step": 3155 }, { "epoch": 0.08311825125098762, "grad_norm": 2.776400089263916, "learning_rate": 4.585067158282855e-05, "loss": 2.266, "step": 3156 }, { "epoch": 0.08314458783249934, "grad_norm": 4.151550769805908, "learning_rate": 4.5849354753752966e-05, "loss": 0.8544, "step": 3157 }, { "epoch": 0.08317092441401105, "grad_norm": 1.9945951700210571, "learning_rate": 4.584803792467738e-05, "loss": 1.3352, "step": 3158 }, { "epoch": 0.08319726099552278, "grad_norm": 1.8696441650390625, "learning_rate": 4.584672109560179e-05, "loss": 1.3425, "step": 3159 }, { "epoch": 0.0832235975770345, "grad_norm": 2.921455144882202, "learning_rate": 4.584540426652621e-05, "loss": 1.5419, "step": 3160 }, { "epoch": 0.08324993415854622, "grad_norm": 2.025024652481079, "learning_rate": 4.584408743745062e-05, "loss": 1.6507, "step": 3161 }, { "epoch": 0.08327627074005794, "grad_norm": 2.3824870586395264, "learning_rate": 4.584277060837504e-05, "loss": 1.9859, "step": 3162 }, { "epoch": 0.08330260732156966, "grad_norm": 2.3922998905181885, "learning_rate": 4.5841453779299446e-05, "loss": 2.1497, "step": 3163 }, { "epoch": 0.08332894390308138, 
"grad_norm": 1.7682502269744873, "learning_rate": 4.584013695022386e-05, "loss": 1.5649, "step": 3164 }, { "epoch": 0.0833552804845931, "grad_norm": 3.6769139766693115, "learning_rate": 4.583882012114828e-05, "loss": 1.7707, "step": 3165 }, { "epoch": 0.08338161706610482, "grad_norm": 2.7153096199035645, "learning_rate": 4.583750329207269e-05, "loss": 0.948, "step": 3166 }, { "epoch": 0.08340795364761654, "grad_norm": 2.6838431358337402, "learning_rate": 4.583618646299711e-05, "loss": 1.1769, "step": 3167 }, { "epoch": 0.08343429022912825, "grad_norm": 3.939176559448242, "learning_rate": 4.583486963392152e-05, "loss": 1.149, "step": 3168 }, { "epoch": 0.08346062681063998, "grad_norm": 4.521262168884277, "learning_rate": 4.583355280484593e-05, "loss": 0.6673, "step": 3169 }, { "epoch": 0.0834869633921517, "grad_norm": 3.738725185394287, "learning_rate": 4.583223597577035e-05, "loss": 1.2397, "step": 3170 }, { "epoch": 0.08351329997366341, "grad_norm": 2.1133363246917725, "learning_rate": 4.5830919146694764e-05, "loss": 1.4473, "step": 3171 }, { "epoch": 0.08353963655517514, "grad_norm": 2.7973620891571045, "learning_rate": 4.582960231761917e-05, "loss": 2.2941, "step": 3172 }, { "epoch": 0.08356597313668686, "grad_norm": 3.1256542205810547, "learning_rate": 4.582828548854359e-05, "loss": 1.3881, "step": 3173 }, { "epoch": 0.08359230971819857, "grad_norm": 3.162682294845581, "learning_rate": 4.5826968659468004e-05, "loss": 1.7668, "step": 3174 }, { "epoch": 0.0836186462997103, "grad_norm": 2.5578365325927734, "learning_rate": 4.582565183039242e-05, "loss": 1.2669, "step": 3175 }, { "epoch": 0.08364498288122202, "grad_norm": 2.274585723876953, "learning_rate": 4.5824335001316835e-05, "loss": 1.7379, "step": 3176 }, { "epoch": 0.08367131946273373, "grad_norm": 2.08831787109375, "learning_rate": 4.5823018172241244e-05, "loss": 1.5238, "step": 3177 }, { "epoch": 0.08369765604424546, "grad_norm": 3.0488479137420654, "learning_rate": 4.582170134316566e-05, "loss": 1.7891, 
"step": 3178 }, { "epoch": 0.08372399262575718, "grad_norm": 3.5930607318878174, "learning_rate": 4.5820384514090075e-05, "loss": 1.6324, "step": 3179 }, { "epoch": 0.0837503292072689, "grad_norm": 3.64334774017334, "learning_rate": 4.581906768501449e-05, "loss": 1.7488, "step": 3180 }, { "epoch": 0.08377666578878061, "grad_norm": 3.94474458694458, "learning_rate": 4.58177508559389e-05, "loss": 0.6436, "step": 3181 }, { "epoch": 0.08380300237029234, "grad_norm": 3.3415911197662354, "learning_rate": 4.5816434026863315e-05, "loss": 1.619, "step": 3182 }, { "epoch": 0.08382933895180406, "grad_norm": 3.105062484741211, "learning_rate": 4.5815117197787724e-05, "loss": 1.5413, "step": 3183 }, { "epoch": 0.08385567553331577, "grad_norm": 1.6706628799438477, "learning_rate": 4.5813800368712146e-05, "loss": 1.9191, "step": 3184 }, { "epoch": 0.0838820121148275, "grad_norm": 3.660766363143921, "learning_rate": 4.5812483539636555e-05, "loss": 2.1776, "step": 3185 }, { "epoch": 0.08390834869633922, "grad_norm": 1.9701188802719116, "learning_rate": 4.581116671056097e-05, "loss": 1.7777, "step": 3186 }, { "epoch": 0.08393468527785093, "grad_norm": 2.0551421642303467, "learning_rate": 4.5809849881485386e-05, "loss": 1.9264, "step": 3187 }, { "epoch": 0.08396102185936266, "grad_norm": 1.8702586889266968, "learning_rate": 4.5808533052409795e-05, "loss": 1.8949, "step": 3188 }, { "epoch": 0.08398735844087438, "grad_norm": 2.8003616333007812, "learning_rate": 4.580721622333422e-05, "loss": 2.1091, "step": 3189 }, { "epoch": 0.08401369502238609, "grad_norm": 2.0695996284484863, "learning_rate": 4.5805899394258626e-05, "loss": 1.8386, "step": 3190 }, { "epoch": 0.08404003160389781, "grad_norm": 4.105753421783447, "learning_rate": 4.580458256518304e-05, "loss": 0.9941, "step": 3191 }, { "epoch": 0.08406636818540954, "grad_norm": 5.378888130187988, "learning_rate": 4.580326573610745e-05, "loss": 1.6269, "step": 3192 }, { "epoch": 0.08409270476692125, "grad_norm": 2.4382083415985107, 
"learning_rate": 4.580194890703187e-05, "loss": 2.5767, "step": 3193 }, { "epoch": 0.08411904134843297, "grad_norm": 5.3425116539001465, "learning_rate": 4.580063207795628e-05, "loss": 1.0474, "step": 3194 }, { "epoch": 0.0841453779299447, "grad_norm": 1.9401028156280518, "learning_rate": 4.57993152488807e-05, "loss": 2.0606, "step": 3195 }, { "epoch": 0.08417171451145641, "grad_norm": 3.2426631450653076, "learning_rate": 4.579799841980511e-05, "loss": 2.38, "step": 3196 }, { "epoch": 0.08419805109296813, "grad_norm": 1.6750125885009766, "learning_rate": 4.579668159072952e-05, "loss": 1.5457, "step": 3197 }, { "epoch": 0.08422438767447986, "grad_norm": 1.8950235843658447, "learning_rate": 4.5795364761653944e-05, "loss": 1.8299, "step": 3198 }, { "epoch": 0.08425072425599157, "grad_norm": 2.9908668994903564, "learning_rate": 4.579404793257835e-05, "loss": 1.1757, "step": 3199 }, { "epoch": 0.08427706083750329, "grad_norm": 2.055785655975342, "learning_rate": 4.579273110350277e-05, "loss": 2.0399, "step": 3200 }, { "epoch": 0.084303397419015, "grad_norm": 3.2788565158843994, "learning_rate": 4.579141427442718e-05, "loss": 2.2281, "step": 3201 }, { "epoch": 0.08432973400052673, "grad_norm": 2.5476937294006348, "learning_rate": 4.579009744535159e-05, "loss": 0.9109, "step": 3202 }, { "epoch": 0.08435607058203845, "grad_norm": 3.333198070526123, "learning_rate": 4.578878061627601e-05, "loss": 2.1412, "step": 3203 }, { "epoch": 0.08438240716355017, "grad_norm": 2.6660966873168945, "learning_rate": 4.5787463787200424e-05, "loss": 0.8025, "step": 3204 }, { "epoch": 0.0844087437450619, "grad_norm": 3.7357802391052246, "learning_rate": 4.578614695812484e-05, "loss": 0.9043, "step": 3205 }, { "epoch": 0.08443508032657361, "grad_norm": 2.3206193447113037, "learning_rate": 4.578483012904925e-05, "loss": 2.3118, "step": 3206 }, { "epoch": 0.08446141690808533, "grad_norm": 4.291245937347412, "learning_rate": 4.578351329997367e-05, "loss": 1.599, "step": 3207 }, { "epoch": 
0.08448775348959706, "grad_norm": 2.4323413372039795, "learning_rate": 4.578219647089808e-05, "loss": 2.2438, "step": 3208 }, { "epoch": 0.08451409007110877, "grad_norm": 2.935488700866699, "learning_rate": 4.5780879641822495e-05, "loss": 1.6592, "step": 3209 }, { "epoch": 0.08454042665262049, "grad_norm": 2.0307133197784424, "learning_rate": 4.5779562812746904e-05, "loss": 2.2286, "step": 3210 }, { "epoch": 0.0845667632341322, "grad_norm": 2.061506748199463, "learning_rate": 4.577824598367132e-05, "loss": 1.5909, "step": 3211 }, { "epoch": 0.08459309981564393, "grad_norm": 2.495828151702881, "learning_rate": 4.5776929154595735e-05, "loss": 1.5772, "step": 3212 }, { "epoch": 0.08461943639715565, "grad_norm": 1.9573198556900024, "learning_rate": 4.577561232552015e-05, "loss": 0.6242, "step": 3213 }, { "epoch": 0.08464577297866736, "grad_norm": 1.838287353515625, "learning_rate": 4.5774295496444567e-05, "loss": 1.7578, "step": 3214 }, { "epoch": 0.08467210956017909, "grad_norm": 1.61006760597229, "learning_rate": 4.5772978667368975e-05, "loss": 1.8254, "step": 3215 }, { "epoch": 0.08469844614169081, "grad_norm": 1.8644553422927856, "learning_rate": 4.577166183829339e-05, "loss": 1.7631, "step": 3216 }, { "epoch": 0.08472478272320252, "grad_norm": 2.303036689758301, "learning_rate": 4.5770345009217807e-05, "loss": 2.7301, "step": 3217 }, { "epoch": 0.08475111930471425, "grad_norm": 2.0750932693481445, "learning_rate": 4.576902818014222e-05, "loss": 0.6355, "step": 3218 }, { "epoch": 0.08477745588622597, "grad_norm": 3.8102564811706543, "learning_rate": 4.576771135106663e-05, "loss": 1.4302, "step": 3219 }, { "epoch": 0.08480379246773768, "grad_norm": 2.098309278488159, "learning_rate": 4.5766394521991047e-05, "loss": 1.5723, "step": 3220 }, { "epoch": 0.08483012904924941, "grad_norm": 2.583853244781494, "learning_rate": 4.576507769291546e-05, "loss": 1.5143, "step": 3221 }, { "epoch": 0.08485646563076113, "grad_norm": 2.2531728744506836, "learning_rate": 
4.576376086383988e-05, "loss": 2.1432, "step": 3222 }, { "epoch": 0.08488280221227285, "grad_norm": 2.871525764465332, "learning_rate": 4.576244403476429e-05, "loss": 1.8594, "step": 3223 }, { "epoch": 0.08490913879378456, "grad_norm": 2.0095560550689697, "learning_rate": 4.57611272056887e-05, "loss": 1.5931, "step": 3224 }, { "epoch": 0.08493547537529629, "grad_norm": 3.202383279800415, "learning_rate": 4.575981037661312e-05, "loss": 1.0461, "step": 3225 }, { "epoch": 0.084961811956808, "grad_norm": 2.3500051498413086, "learning_rate": 4.575849354753753e-05, "loss": 1.7214, "step": 3226 }, { "epoch": 0.08498814853831972, "grad_norm": 3.089770555496216, "learning_rate": 4.575717671846195e-05, "loss": 0.8427, "step": 3227 }, { "epoch": 0.08501448511983145, "grad_norm": 2.141021490097046, "learning_rate": 4.575585988938636e-05, "loss": 2.2557, "step": 3228 }, { "epoch": 0.08504082170134317, "grad_norm": 3.637605667114258, "learning_rate": 4.575454306031077e-05, "loss": 1.9752, "step": 3229 }, { "epoch": 0.08506715828285488, "grad_norm": 2.2627432346343994, "learning_rate": 4.575322623123519e-05, "loss": 2.217, "step": 3230 }, { "epoch": 0.08509349486436661, "grad_norm": 3.7502896785736084, "learning_rate": 4.5751909402159605e-05, "loss": 1.3013, "step": 3231 }, { "epoch": 0.08511983144587833, "grad_norm": 4.817307949066162, "learning_rate": 4.575059257308402e-05, "loss": 1.3074, "step": 3232 }, { "epoch": 0.08514616802739004, "grad_norm": 2.2230911254882812, "learning_rate": 4.574927574400843e-05, "loss": 0.486, "step": 3233 }, { "epoch": 0.08517250460890176, "grad_norm": 2.1989874839782715, "learning_rate": 4.5747958914932845e-05, "loss": 1.7278, "step": 3234 }, { "epoch": 0.08519884119041349, "grad_norm": 2.4527993202209473, "learning_rate": 4.5746642085857253e-05, "loss": 2.0587, "step": 3235 }, { "epoch": 0.0852251777719252, "grad_norm": 4.338960647583008, "learning_rate": 4.5745325256781676e-05, "loss": 1.4725, "step": 3236 }, { "epoch": 0.08525151435343692, 
"grad_norm": 2.488229751586914, "learning_rate": 4.5744008427706085e-05, "loss": 1.6375, "step": 3237 }, { "epoch": 0.08527785093494865, "grad_norm": 3.5188822746276855, "learning_rate": 4.57426915986305e-05, "loss": 1.622, "step": 3238 }, { "epoch": 0.08530418751646036, "grad_norm": 2.0137503147125244, "learning_rate": 4.574137476955491e-05, "loss": 1.6166, "step": 3239 }, { "epoch": 0.08533052409797208, "grad_norm": 4.743828296661377, "learning_rate": 4.574005794047933e-05, "loss": 1.155, "step": 3240 }, { "epoch": 0.08535686067948381, "grad_norm": 4.959859371185303, "learning_rate": 4.573874111140374e-05, "loss": 0.4543, "step": 3241 }, { "epoch": 0.08538319726099552, "grad_norm": 2.6431691646575928, "learning_rate": 4.5737424282328156e-05, "loss": 2.4788, "step": 3242 }, { "epoch": 0.08540953384250724, "grad_norm": 2.981017827987671, "learning_rate": 4.573610745325257e-05, "loss": 2.1791, "step": 3243 }, { "epoch": 0.08543587042401896, "grad_norm": 2.1606719493865967, "learning_rate": 4.573479062417698e-05, "loss": 2.3109, "step": 3244 }, { "epoch": 0.08546220700553069, "grad_norm": 2.191272258758545, "learning_rate": 4.57334737951014e-05, "loss": 1.4289, "step": 3245 }, { "epoch": 0.0854885435870424, "grad_norm": 1.7807561159133911, "learning_rate": 4.573215696602581e-05, "loss": 2.6139, "step": 3246 }, { "epoch": 0.08551488016855412, "grad_norm": 2.424823760986328, "learning_rate": 4.573084013695023e-05, "loss": 1.6689, "step": 3247 }, { "epoch": 0.08554121675006585, "grad_norm": 1.7393673658370972, "learning_rate": 4.5729523307874636e-05, "loss": 2.0159, "step": 3248 }, { "epoch": 0.08556755333157756, "grad_norm": 1.9312829971313477, "learning_rate": 4.572820647879905e-05, "loss": 1.7716, "step": 3249 }, { "epoch": 0.08559388991308928, "grad_norm": 2.718719482421875, "learning_rate": 4.572688964972347e-05, "loss": 1.7089, "step": 3250 }, { "epoch": 0.085620226494601, "grad_norm": 2.220393419265747, "learning_rate": 4.572557282064788e-05, "loss": 1.9193, 
"step": 3251 }, { "epoch": 0.08564656307611272, "grad_norm": 2.8280107975006104, "learning_rate": 4.57242559915723e-05, "loss": 1.7072, "step": 3252 }, { "epoch": 0.08567289965762444, "grad_norm": 2.6569924354553223, "learning_rate": 4.572293916249671e-05, "loss": 2.2634, "step": 3253 }, { "epoch": 0.08569923623913615, "grad_norm": 3.409811019897461, "learning_rate": 4.572162233342113e-05, "loss": 1.4841, "step": 3254 }, { "epoch": 0.08572557282064788, "grad_norm": 2.3559229373931885, "learning_rate": 4.572030550434554e-05, "loss": 0.4952, "step": 3255 }, { "epoch": 0.0857519094021596, "grad_norm": 3.11930775642395, "learning_rate": 4.5718988675269954e-05, "loss": 1.9097, "step": 3256 }, { "epoch": 0.08577824598367131, "grad_norm": 3.1896774768829346, "learning_rate": 4.571767184619436e-05, "loss": 1.6084, "step": 3257 }, { "epoch": 0.08580458256518304, "grad_norm": 4.589871406555176, "learning_rate": 4.571635501711878e-05, "loss": 1.4056, "step": 3258 }, { "epoch": 0.08583091914669476, "grad_norm": 4.741261005401611, "learning_rate": 4.5715038188043194e-05, "loss": 0.9644, "step": 3259 }, { "epoch": 0.08585725572820647, "grad_norm": 5.810362815856934, "learning_rate": 4.571372135896761e-05, "loss": 1.5796, "step": 3260 }, { "epoch": 0.0858835923097182, "grad_norm": 1.920261025428772, "learning_rate": 4.5712404529892025e-05, "loss": 2.0457, "step": 3261 }, { "epoch": 0.08590992889122992, "grad_norm": 2.4054150581359863, "learning_rate": 4.5711087700816434e-05, "loss": 2.0352, "step": 3262 }, { "epoch": 0.08593626547274164, "grad_norm": 2.4277303218841553, "learning_rate": 4.570977087174085e-05, "loss": 2.0703, "step": 3263 }, { "epoch": 0.08596260205425336, "grad_norm": 3.1370632648468018, "learning_rate": 4.5708454042665265e-05, "loss": 1.2243, "step": 3264 }, { "epoch": 0.08598893863576508, "grad_norm": 2.9451024532318115, "learning_rate": 4.570713721358968e-05, "loss": 1.9078, "step": 3265 }, { "epoch": 0.0860152752172768, "grad_norm": 2.067363977432251, 
"learning_rate": 4.570582038451409e-05, "loss": 2.0494, "step": 3266 }, { "epoch": 0.08604161179878851, "grad_norm": 3.4219279289245605, "learning_rate": 4.5704503555438505e-05, "loss": 2.0001, "step": 3267 }, { "epoch": 0.08606794838030024, "grad_norm": 4.3392720222473145, "learning_rate": 4.570318672636292e-05, "loss": 2.0908, "step": 3268 }, { "epoch": 0.08609428496181196, "grad_norm": 1.5715715885162354, "learning_rate": 4.5701869897287336e-05, "loss": 1.7674, "step": 3269 }, { "epoch": 0.08612062154332367, "grad_norm": 6.030750751495361, "learning_rate": 4.570055306821175e-05, "loss": 1.3608, "step": 3270 }, { "epoch": 0.0861469581248354, "grad_norm": 1.8376785516738892, "learning_rate": 4.569923623913616e-05, "loss": 2.0341, "step": 3271 }, { "epoch": 0.08617329470634712, "grad_norm": 2.9195828437805176, "learning_rate": 4.5697919410060576e-05, "loss": 1.8199, "step": 3272 }, { "epoch": 0.08619963128785883, "grad_norm": 3.337942600250244, "learning_rate": 4.569660258098499e-05, "loss": 1.2327, "step": 3273 }, { "epoch": 0.08622596786937056, "grad_norm": 2.0117812156677246, "learning_rate": 4.569528575190941e-05, "loss": 1.5599, "step": 3274 }, { "epoch": 0.08625230445088228, "grad_norm": 2.2164361476898193, "learning_rate": 4.5693968922833816e-05, "loss": 1.401, "step": 3275 }, { "epoch": 0.086278641032394, "grad_norm": 2.879408836364746, "learning_rate": 4.569265209375823e-05, "loss": 0.7136, "step": 3276 }, { "epoch": 0.08630497761390571, "grad_norm": 2.3157923221588135, "learning_rate": 4.569133526468265e-05, "loss": 0.9512, "step": 3277 }, { "epoch": 0.08633131419541744, "grad_norm": 2.668488025665283, "learning_rate": 4.569001843560706e-05, "loss": 1.8315, "step": 3278 }, { "epoch": 0.08635765077692915, "grad_norm": 2.078740358352661, "learning_rate": 4.568870160653148e-05, "loss": 1.7008, "step": 3279 }, { "epoch": 0.08638398735844087, "grad_norm": 2.9175126552581787, "learning_rate": 4.568738477745589e-05, "loss": 1.534, "step": 3280 }, { "epoch": 
0.0864103239399526, "grad_norm": 3.69405198097229, "learning_rate": 4.56860679483803e-05, "loss": 1.7667, "step": 3281 }, { "epoch": 0.08643666052146431, "grad_norm": 2.7728805541992188, "learning_rate": 4.568475111930471e-05, "loss": 1.6557, "step": 3282 }, { "epoch": 0.08646299710297603, "grad_norm": 1.509304404258728, "learning_rate": 4.5683434290229134e-05, "loss": 1.6786, "step": 3283 }, { "epoch": 0.08648933368448776, "grad_norm": 5.2924089431762695, "learning_rate": 4.568211746115354e-05, "loss": 2.3038, "step": 3284 }, { "epoch": 0.08651567026599948, "grad_norm": 2.9266762733459473, "learning_rate": 4.568080063207796e-05, "loss": 1.6544, "step": 3285 }, { "epoch": 0.08654200684751119, "grad_norm": 3.1236605644226074, "learning_rate": 4.567948380300237e-05, "loss": 1.7951, "step": 3286 }, { "epoch": 0.0865683434290229, "grad_norm": 2.3273816108703613, "learning_rate": 4.567816697392679e-05, "loss": 2.18, "step": 3287 }, { "epoch": 0.08659468001053464, "grad_norm": 1.8327178955078125, "learning_rate": 4.56768501448512e-05, "loss": 1.8965, "step": 3288 }, { "epoch": 0.08662101659204635, "grad_norm": 2.5005617141723633, "learning_rate": 4.5675533315775614e-05, "loss": 1.9199, "step": 3289 }, { "epoch": 0.08664735317355807, "grad_norm": 4.19956636428833, "learning_rate": 4.567421648670003e-05, "loss": 2.0134, "step": 3290 }, { "epoch": 0.0866736897550698, "grad_norm": 2.499419689178467, "learning_rate": 4.567289965762444e-05, "loss": 1.8685, "step": 3291 }, { "epoch": 0.08670002633658151, "grad_norm": 1.7010895013809204, "learning_rate": 4.567158282854886e-05, "loss": 1.8863, "step": 3292 }, { "epoch": 0.08672636291809323, "grad_norm": 2.4933762550354004, "learning_rate": 4.567026599947327e-05, "loss": 0.4979, "step": 3293 }, { "epoch": 0.08675269949960496, "grad_norm": 1.8119902610778809, "learning_rate": 4.5668949170397686e-05, "loss": 1.7239, "step": 3294 }, { "epoch": 0.08677903608111667, "grad_norm": 3.5810108184814453, "learning_rate": 
4.5667632341322094e-05, "loss": 2.1185, "step": 3295 }, { "epoch": 0.08680537266262839, "grad_norm": 2.684758424758911, "learning_rate": 4.566631551224651e-05, "loss": 1.7439, "step": 3296 }, { "epoch": 0.0868317092441401, "grad_norm": 2.041480779647827, "learning_rate": 4.5664998683170926e-05, "loss": 2.1769, "step": 3297 }, { "epoch": 0.08685804582565183, "grad_norm": 2.4490764141082764, "learning_rate": 4.566368185409534e-05, "loss": 1.5937, "step": 3298 }, { "epoch": 0.08688438240716355, "grad_norm": 2.4468672275543213, "learning_rate": 4.566236502501976e-05, "loss": 1.6006, "step": 3299 }, { "epoch": 0.08691071898867526, "grad_norm": 2.040010929107666, "learning_rate": 4.5661048195944166e-05, "loss": 2.0957, "step": 3300 }, { "epoch": 0.086937055570187, "grad_norm": 2.194331169128418, "learning_rate": 4.565973136686858e-05, "loss": 1.9661, "step": 3301 }, { "epoch": 0.08696339215169871, "grad_norm": 1.8147886991500854, "learning_rate": 4.5658414537793e-05, "loss": 2.434, "step": 3302 }, { "epoch": 0.08698972873321043, "grad_norm": 3.349013090133667, "learning_rate": 4.565709770871741e-05, "loss": 1.229, "step": 3303 }, { "epoch": 0.08701606531472215, "grad_norm": 2.3615927696228027, "learning_rate": 4.565578087964182e-05, "loss": 1.8129, "step": 3304 }, { "epoch": 0.08704240189623387, "grad_norm": 2.8906731605529785, "learning_rate": 4.565446405056624e-05, "loss": 1.3058, "step": 3305 }, { "epoch": 0.08706873847774559, "grad_norm": 4.31870698928833, "learning_rate": 4.565314722149065e-05, "loss": 2.7197, "step": 3306 }, { "epoch": 0.08709507505925732, "grad_norm": 6.206526756286621, "learning_rate": 4.565183039241507e-05, "loss": 0.878, "step": 3307 }, { "epoch": 0.08712141164076903, "grad_norm": 5.137819766998291, "learning_rate": 4.5650513563339484e-05, "loss": 1.5677, "step": 3308 }, { "epoch": 0.08714774822228075, "grad_norm": 2.115330219268799, "learning_rate": 4.564919673426389e-05, "loss": 1.6648, "step": 3309 }, { "epoch": 0.08717408480379246, 
"grad_norm": 4.449472427368164, "learning_rate": 4.564787990518831e-05, "loss": 1.173, "step": 3310 }, { "epoch": 0.08720042138530419, "grad_norm": 3.948798418045044, "learning_rate": 4.5646563076112724e-05, "loss": 1.4963, "step": 3311 }, { "epoch": 0.08722675796681591, "grad_norm": 4.386968612670898, "learning_rate": 4.564524624703714e-05, "loss": 1.9632, "step": 3312 }, { "epoch": 0.08725309454832762, "grad_norm": 2.518096923828125, "learning_rate": 4.564392941796155e-05, "loss": 1.0785, "step": 3313 }, { "epoch": 0.08727943112983935, "grad_norm": 3.4689698219299316, "learning_rate": 4.5642612588885964e-05, "loss": 2.1707, "step": 3314 }, { "epoch": 0.08730576771135107, "grad_norm": 3.268017053604126, "learning_rate": 4.564129575981038e-05, "loss": 1.1932, "step": 3315 }, { "epoch": 0.08733210429286278, "grad_norm": 7.4629807472229, "learning_rate": 4.5639978930734795e-05, "loss": 2.1586, "step": 3316 }, { "epoch": 0.08735844087437451, "grad_norm": 2.441282272338867, "learning_rate": 4.563866210165921e-05, "loss": 0.4565, "step": 3317 }, { "epoch": 0.08738477745588623, "grad_norm": 4.256004810333252, "learning_rate": 4.563734527258362e-05, "loss": 1.4775, "step": 3318 }, { "epoch": 0.08741111403739794, "grad_norm": 1.818750023841858, "learning_rate": 4.5636028443508035e-05, "loss": 1.7899, "step": 3319 }, { "epoch": 0.08743745061890966, "grad_norm": 1.843833565711975, "learning_rate": 4.563471161443245e-05, "loss": 1.5933, "step": 3320 }, { "epoch": 0.08746378720042139, "grad_norm": 2.0348916053771973, "learning_rate": 4.5633394785356866e-05, "loss": 1.5735, "step": 3321 }, { "epoch": 0.0874901237819331, "grad_norm": 2.6325459480285645, "learning_rate": 4.5632077956281275e-05, "loss": 1.827, "step": 3322 }, { "epoch": 0.08751646036344482, "grad_norm": 2.4956932067871094, "learning_rate": 4.563076112720569e-05, "loss": 2.3305, "step": 3323 }, { "epoch": 0.08754279694495655, "grad_norm": 2.065242290496826, "learning_rate": 4.5629444298130106e-05, "loss": 1.725, 
"step": 3324 }, { "epoch": 0.08756913352646827, "grad_norm": 3.2543764114379883, "learning_rate": 4.562812746905452e-05, "loss": 0.6695, "step": 3325 }, { "epoch": 0.08759547010797998, "grad_norm": 2.7452926635742188, "learning_rate": 4.562681063997894e-05, "loss": 1.5987, "step": 3326 }, { "epoch": 0.08762180668949171, "grad_norm": 2.2238802909851074, "learning_rate": 4.5625493810903346e-05, "loss": 2.0862, "step": 3327 }, { "epoch": 0.08764814327100343, "grad_norm": 2.7854669094085693, "learning_rate": 4.562417698182776e-05, "loss": 2.3381, "step": 3328 }, { "epoch": 0.08767447985251514, "grad_norm": 1.9521734714508057, "learning_rate": 4.562286015275217e-05, "loss": 1.4309, "step": 3329 }, { "epoch": 0.08770081643402686, "grad_norm": 3.3219892978668213, "learning_rate": 4.562154332367659e-05, "loss": 2.1148, "step": 3330 }, { "epoch": 0.08772715301553859, "grad_norm": 2.3819589614868164, "learning_rate": 4.5620226494601e-05, "loss": 1.559, "step": 3331 }, { "epoch": 0.0877534895970503, "grad_norm": 3.2879862785339355, "learning_rate": 4.561890966552542e-05, "loss": 1.5046, "step": 3332 }, { "epoch": 0.08777982617856202, "grad_norm": 3.7280631065368652, "learning_rate": 4.561759283644983e-05, "loss": 1.404, "step": 3333 }, { "epoch": 0.08780616276007375, "grad_norm": 1.9950778484344482, "learning_rate": 4.561627600737424e-05, "loss": 0.3617, "step": 3334 }, { "epoch": 0.08783249934158546, "grad_norm": 1.8072651624679565, "learning_rate": 4.5614959178298664e-05, "loss": 1.5139, "step": 3335 }, { "epoch": 0.08785883592309718, "grad_norm": 1.9130542278289795, "learning_rate": 4.561364234922307e-05, "loss": 2.3877, "step": 3336 }, { "epoch": 0.08788517250460891, "grad_norm": 2.688465118408203, "learning_rate": 4.561232552014749e-05, "loss": 1.4498, "step": 3337 }, { "epoch": 0.08791150908612062, "grad_norm": 1.6436034440994263, "learning_rate": 4.56110086910719e-05, "loss": 1.5885, "step": 3338 }, { "epoch": 0.08793784566763234, "grad_norm": 2.0929412841796875, 
"learning_rate": 4.560969186199632e-05, "loss": 1.8179, "step": 3339 }, { "epoch": 0.08796418224914405, "grad_norm": 5.781081199645996, "learning_rate": 4.560837503292073e-05, "loss": 1.0893, "step": 3340 }, { "epoch": 0.08799051883065578, "grad_norm": 2.7853565216064453, "learning_rate": 4.5607058203845144e-05, "loss": 1.7618, "step": 3341 }, { "epoch": 0.0880168554121675, "grad_norm": 1.579538106918335, "learning_rate": 4.560574137476955e-05, "loss": 1.7477, "step": 3342 }, { "epoch": 0.08804319199367922, "grad_norm": 3.7131972312927246, "learning_rate": 4.560442454569397e-05, "loss": 1.6057, "step": 3343 }, { "epoch": 0.08806952857519094, "grad_norm": 3.0380332469940186, "learning_rate": 4.5603107716618384e-05, "loss": 1.5221, "step": 3344 }, { "epoch": 0.08809586515670266, "grad_norm": 1.8556195497512817, "learning_rate": 4.56017908875428e-05, "loss": 1.7945, "step": 3345 }, { "epoch": 0.08812220173821438, "grad_norm": 2.110743522644043, "learning_rate": 4.5600474058467215e-05, "loss": 1.7812, "step": 3346 }, { "epoch": 0.0881485383197261, "grad_norm": 1.9272798299789429, "learning_rate": 4.5599157229391624e-05, "loss": 1.2776, "step": 3347 }, { "epoch": 0.08817487490123782, "grad_norm": 1.881285309791565, "learning_rate": 4.559784040031604e-05, "loss": 2.1988, "step": 3348 }, { "epoch": 0.08820121148274954, "grad_norm": 2.1280012130737305, "learning_rate": 4.5596523571240455e-05, "loss": 1.8454, "step": 3349 }, { "epoch": 0.08822754806426125, "grad_norm": 2.944765329360962, "learning_rate": 4.559520674216487e-05, "loss": 0.4876, "step": 3350 }, { "epoch": 0.08825388464577298, "grad_norm": 2.803616523742676, "learning_rate": 4.559388991308928e-05, "loss": 1.9499, "step": 3351 }, { "epoch": 0.0882802212272847, "grad_norm": 2.4479265213012695, "learning_rate": 4.5592573084013695e-05, "loss": 2.2732, "step": 3352 }, { "epoch": 0.08830655780879641, "grad_norm": 3.8088347911834717, "learning_rate": 4.559125625493811e-05, "loss": 1.345, "step": 3353 }, { "epoch": 
0.08833289439030814, "grad_norm": 3.1336209774017334, "learning_rate": 4.5589939425862527e-05, "loss": 1.9881, "step": 3354 }, { "epoch": 0.08835923097181986, "grad_norm": 3.227994441986084, "learning_rate": 4.558862259678694e-05, "loss": 1.7598, "step": 3355 }, { "epoch": 0.08838556755333157, "grad_norm": 3.023447275161743, "learning_rate": 4.558730576771135e-05, "loss": 1.3893, "step": 3356 }, { "epoch": 0.0884119041348433, "grad_norm": 2.735240936279297, "learning_rate": 4.5585988938635767e-05, "loss": 0.5625, "step": 3357 }, { "epoch": 0.08843824071635502, "grad_norm": 2.5172595977783203, "learning_rate": 4.558467210956018e-05, "loss": 1.4498, "step": 3358 }, { "epoch": 0.08846457729786673, "grad_norm": 1.993535041809082, "learning_rate": 4.55833552804846e-05, "loss": 1.9375, "step": 3359 }, { "epoch": 0.08849091387937846, "grad_norm": 2.281315803527832, "learning_rate": 4.5582038451409007e-05, "loss": 0.3755, "step": 3360 }, { "epoch": 0.08851725046089018, "grad_norm": 3.085216999053955, "learning_rate": 4.558072162233342e-05, "loss": 1.9078, "step": 3361 }, { "epoch": 0.0885435870424019, "grad_norm": 2.7327311038970947, "learning_rate": 4.557940479325784e-05, "loss": 1.5368, "step": 3362 }, { "epoch": 0.08856992362391361, "grad_norm": 2.1845152378082275, "learning_rate": 4.5578087964182253e-05, "loss": 1.6946, "step": 3363 }, { "epoch": 0.08859626020542534, "grad_norm": 2.8556973934173584, "learning_rate": 4.557677113510667e-05, "loss": 2.3051, "step": 3364 }, { "epoch": 0.08862259678693706, "grad_norm": 2.062838554382324, "learning_rate": 4.557545430603108e-05, "loss": 1.8454, "step": 3365 }, { "epoch": 0.08864893336844877, "grad_norm": 2.3930327892303467, "learning_rate": 4.5574137476955493e-05, "loss": 2.152, "step": 3366 }, { "epoch": 0.0886752699499605, "grad_norm": 2.4107680320739746, "learning_rate": 4.55728206478799e-05, "loss": 2.0974, "step": 3367 }, { "epoch": 0.08870160653147222, "grad_norm": 1.9558051824569702, "learning_rate": 
4.5571503818804325e-05, "loss": 1.4819, "step": 3368 }, { "epoch": 0.08872794311298393, "grad_norm": 2.191049575805664, "learning_rate": 4.5570186989728733e-05, "loss": 1.6372, "step": 3369 }, { "epoch": 0.08875427969449566, "grad_norm": 2.010561466217041, "learning_rate": 4.556887016065315e-05, "loss": 1.9864, "step": 3370 }, { "epoch": 0.08878061627600738, "grad_norm": 2.129725217819214, "learning_rate": 4.5567553331577565e-05, "loss": 1.8083, "step": 3371 }, { "epoch": 0.08880695285751909, "grad_norm": 1.9936469793319702, "learning_rate": 4.556623650250198e-05, "loss": 1.8133, "step": 3372 }, { "epoch": 0.08883328943903081, "grad_norm": 3.681715965270996, "learning_rate": 4.5564919673426396e-05, "loss": 1.5447, "step": 3373 }, { "epoch": 0.08885962602054254, "grad_norm": 2.613180637359619, "learning_rate": 4.5563602844350805e-05, "loss": 1.6233, "step": 3374 }, { "epoch": 0.08888596260205425, "grad_norm": 2.479228973388672, "learning_rate": 4.556228601527522e-05, "loss": 1.3793, "step": 3375 }, { "epoch": 0.08891229918356597, "grad_norm": 1.781434416770935, "learning_rate": 4.556096918619963e-05, "loss": 1.7175, "step": 3376 }, { "epoch": 0.0889386357650777, "grad_norm": 2.4801998138427734, "learning_rate": 4.555965235712405e-05, "loss": 1.5128, "step": 3377 }, { "epoch": 0.08896497234658941, "grad_norm": 2.037580728530884, "learning_rate": 4.555833552804846e-05, "loss": 1.0426, "step": 3378 }, { "epoch": 0.08899130892810113, "grad_norm": 1.9650875329971313, "learning_rate": 4.5557018698972876e-05, "loss": 1.9092, "step": 3379 }, { "epoch": 0.08901764550961286, "grad_norm": 2.6709585189819336, "learning_rate": 4.555570186989729e-05, "loss": 2.0747, "step": 3380 }, { "epoch": 0.08904398209112457, "grad_norm": 1.9449385404586792, "learning_rate": 4.55543850408217e-05, "loss": 1.8836, "step": 3381 }, { "epoch": 0.08907031867263629, "grad_norm": 2.2440335750579834, "learning_rate": 4.555306821174612e-05, "loss": 1.2759, "step": 3382 }, { "epoch": 0.089096655254148, 
"grad_norm": 3.023714542388916, "learning_rate": 4.555175138267053e-05, "loss": 0.8821, "step": 3383 }, { "epoch": 0.08912299183565973, "grad_norm": 3.3295540809631348, "learning_rate": 4.555043455359495e-05, "loss": 1.4563, "step": 3384 }, { "epoch": 0.08914932841717145, "grad_norm": 3.4079713821411133, "learning_rate": 4.5549117724519356e-05, "loss": 1.2193, "step": 3385 }, { "epoch": 0.08917566499868317, "grad_norm": 1.8142025470733643, "learning_rate": 4.554780089544378e-05, "loss": 1.2313, "step": 3386 }, { "epoch": 0.0892020015801949, "grad_norm": 3.524857521057129, "learning_rate": 4.554648406636819e-05, "loss": 1.9658, "step": 3387 }, { "epoch": 0.08922833816170661, "grad_norm": 2.367950439453125, "learning_rate": 4.55451672372926e-05, "loss": 1.8446, "step": 3388 }, { "epoch": 0.08925467474321833, "grad_norm": 2.3109920024871826, "learning_rate": 4.554385040821701e-05, "loss": 1.0784, "step": 3389 }, { "epoch": 0.08928101132473006, "grad_norm": 2.0897305011749268, "learning_rate": 4.554253357914143e-05, "loss": 1.6419, "step": 3390 }, { "epoch": 0.08930734790624177, "grad_norm": 1.8517029285430908, "learning_rate": 4.554121675006584e-05, "loss": 2.2631, "step": 3391 }, { "epoch": 0.08933368448775349, "grad_norm": 2.5218851566314697, "learning_rate": 4.553989992099026e-05, "loss": 1.8359, "step": 3392 }, { "epoch": 0.0893600210692652, "grad_norm": 2.8504812717437744, "learning_rate": 4.5538583091914674e-05, "loss": 1.9991, "step": 3393 }, { "epoch": 0.08938635765077693, "grad_norm": 3.4940335750579834, "learning_rate": 4.553726626283908e-05, "loss": 1.4142, "step": 3394 }, { "epoch": 0.08941269423228865, "grad_norm": 3.1050009727478027, "learning_rate": 4.55359494337635e-05, "loss": 0.478, "step": 3395 }, { "epoch": 0.08943903081380036, "grad_norm": 2.702674388885498, "learning_rate": 4.5534632604687914e-05, "loss": 1.977, "step": 3396 }, { "epoch": 0.08946536739531209, "grad_norm": 3.2361903190612793, "learning_rate": 4.553331577561233e-05, "loss": 1.8961, 
"step": 3397 }, { "epoch": 0.08949170397682381, "grad_norm": 3.6690638065338135, "learning_rate": 4.553199894653674e-05, "loss": 2.0012, "step": 3398 }, { "epoch": 0.08951804055833552, "grad_norm": 2.043118476867676, "learning_rate": 4.5530682117461154e-05, "loss": 1.9398, "step": 3399 }, { "epoch": 0.08954437713984725, "grad_norm": 2.2660138607025146, "learning_rate": 4.552936528838557e-05, "loss": 2.2989, "step": 3400 }, { "epoch": 0.08957071372135897, "grad_norm": 2.0987002849578857, "learning_rate": 4.5528048459309985e-05, "loss": 1.5779, "step": 3401 }, { "epoch": 0.08959705030287068, "grad_norm": 2.952312707901001, "learning_rate": 4.55267316302344e-05, "loss": 1.9738, "step": 3402 }, { "epoch": 0.08962338688438241, "grad_norm": 2.3099260330200195, "learning_rate": 4.552541480115881e-05, "loss": 1.7237, "step": 3403 }, { "epoch": 0.08964972346589413, "grad_norm": 5.9042134284973145, "learning_rate": 4.5524097972083225e-05, "loss": 1.4515, "step": 3404 }, { "epoch": 0.08967606004740585, "grad_norm": 1.9012696743011475, "learning_rate": 4.552278114300764e-05, "loss": 1.9434, "step": 3405 }, { "epoch": 0.08970239662891756, "grad_norm": 2.4749765396118164, "learning_rate": 4.5521464313932056e-05, "loss": 1.6463, "step": 3406 }, { "epoch": 0.08972873321042929, "grad_norm": 1.9249846935272217, "learning_rate": 4.5520147484856465e-05, "loss": 2.0145, "step": 3407 }, { "epoch": 0.089755069791941, "grad_norm": 2.121868371963501, "learning_rate": 4.551883065578088e-05, "loss": 0.6021, "step": 3408 }, { "epoch": 0.08978140637345272, "grad_norm": 2.3962197303771973, "learning_rate": 4.5517513826705296e-05, "loss": 2.4078, "step": 3409 }, { "epoch": 0.08980774295496445, "grad_norm": 2.1263630390167236, "learning_rate": 4.551619699762971e-05, "loss": 1.9646, "step": 3410 }, { "epoch": 0.08983407953647617, "grad_norm": 3.167188882827759, "learning_rate": 4.551488016855413e-05, "loss": 1.3661, "step": 3411 }, { "epoch": 0.08986041611798788, "grad_norm": 3.5980520248413086, 
"learning_rate": 4.5513563339478536e-05, "loss": 1.7937, "step": 3412 }, { "epoch": 0.08988675269949961, "grad_norm": 1.9002790451049805, "learning_rate": 4.551224651040295e-05, "loss": 1.268, "step": 3413 }, { "epoch": 0.08991308928101133, "grad_norm": 1.7264384031295776, "learning_rate": 4.551092968132736e-05, "loss": 2.1844, "step": 3414 }, { "epoch": 0.08993942586252304, "grad_norm": 1.6253485679626465, "learning_rate": 4.550961285225178e-05, "loss": 1.6023, "step": 3415 }, { "epoch": 0.08996576244403476, "grad_norm": 2.992448091506958, "learning_rate": 4.550829602317619e-05, "loss": 2.0443, "step": 3416 }, { "epoch": 0.08999209902554649, "grad_norm": 2.267564058303833, "learning_rate": 4.550697919410061e-05, "loss": 0.4088, "step": 3417 }, { "epoch": 0.0900184356070582, "grad_norm": 2.0954949855804443, "learning_rate": 4.550566236502502e-05, "loss": 1.5922, "step": 3418 }, { "epoch": 0.09004477218856992, "grad_norm": 2.935631275177002, "learning_rate": 4.550434553594944e-05, "loss": 2.4555, "step": 3419 }, { "epoch": 0.09007110877008165, "grad_norm": 3.6485068798065186, "learning_rate": 4.5503028706873854e-05, "loss": 1.3428, "step": 3420 }, { "epoch": 0.09009744535159336, "grad_norm": 2.8711252212524414, "learning_rate": 4.550171187779826e-05, "loss": 2.4802, "step": 3421 }, { "epoch": 0.09012378193310508, "grad_norm": 2.427990674972534, "learning_rate": 4.550039504872268e-05, "loss": 1.9352, "step": 3422 }, { "epoch": 0.09015011851461681, "grad_norm": 3.124286413192749, "learning_rate": 4.549907821964709e-05, "loss": 1.6265, "step": 3423 }, { "epoch": 0.09017645509612852, "grad_norm": 3.1010942459106445, "learning_rate": 4.549776139057151e-05, "loss": 2.1104, "step": 3424 }, { "epoch": 0.09020279167764024, "grad_norm": 3.4730076789855957, "learning_rate": 4.549644456149592e-05, "loss": 1.4576, "step": 3425 }, { "epoch": 0.09022912825915196, "grad_norm": 3.8209004402160645, "learning_rate": 4.5495127732420334e-05, "loss": 1.1925, "step": 3426 }, { "epoch": 
0.09025546484066369, "grad_norm": 3.584869384765625, "learning_rate": 4.549381090334475e-05, "loss": 1.4934, "step": 3427 }, { "epoch": 0.0902818014221754, "grad_norm": 1.8450781106948853, "learning_rate": 4.549249407426916e-05, "loss": 1.7063, "step": 3428 }, { "epoch": 0.09030813800368712, "grad_norm": 2.419992685317993, "learning_rate": 4.549117724519358e-05, "loss": 1.5129, "step": 3429 }, { "epoch": 0.09033447458519885, "grad_norm": 2.637809991836548, "learning_rate": 4.548986041611799e-05, "loss": 1.4932, "step": 3430 }, { "epoch": 0.09036081116671056, "grad_norm": 6.464300632476807, "learning_rate": 4.5488543587042406e-05, "loss": 1.943, "step": 3431 }, { "epoch": 0.09038714774822228, "grad_norm": 2.107273578643799, "learning_rate": 4.5487226757966814e-05, "loss": 2.2235, "step": 3432 }, { "epoch": 0.090413484329734, "grad_norm": 2.3448657989501953, "learning_rate": 4.548590992889123e-05, "loss": 1.7162, "step": 3433 }, { "epoch": 0.09043982091124572, "grad_norm": 2.627598762512207, "learning_rate": 4.5484593099815646e-05, "loss": 1.4002, "step": 3434 }, { "epoch": 0.09046615749275744, "grad_norm": 2.0682199001312256, "learning_rate": 4.548327627074006e-05, "loss": 1.5173, "step": 3435 }, { "epoch": 0.09049249407426915, "grad_norm": 3.4184250831604004, "learning_rate": 4.548195944166447e-05, "loss": 1.856, "step": 3436 }, { "epoch": 0.09051883065578088, "grad_norm": 4.626522541046143, "learning_rate": 4.5480642612588886e-05, "loss": 1.2222, "step": 3437 }, { "epoch": 0.0905451672372926, "grad_norm": 2.2340171337127686, "learning_rate": 4.547932578351331e-05, "loss": 2.0774, "step": 3438 }, { "epoch": 0.09057150381880431, "grad_norm": 4.535863399505615, "learning_rate": 4.547800895443772e-05, "loss": 2.6402, "step": 3439 }, { "epoch": 0.09059784040031604, "grad_norm": 1.9701533317565918, "learning_rate": 4.547669212536213e-05, "loss": 2.5163, "step": 3440 }, { "epoch": 0.09062417698182776, "grad_norm": 1.865283727645874, "learning_rate": 
4.547537529628654e-05, "loss": 1.7879, "step": 3441 }, { "epoch": 0.09065051356333947, "grad_norm": 1.8001735210418701, "learning_rate": 4.547405846721096e-05, "loss": 1.6645, "step": 3442 }, { "epoch": 0.0906768501448512, "grad_norm": 2.2745540142059326, "learning_rate": 4.547274163813537e-05, "loss": 1.4718, "step": 3443 }, { "epoch": 0.09070318672636292, "grad_norm": 5.53325891494751, "learning_rate": 4.547142480905979e-05, "loss": 1.6999, "step": 3444 }, { "epoch": 0.09072952330787463, "grad_norm": 2.608701467514038, "learning_rate": 4.54701079799842e-05, "loss": 1.7494, "step": 3445 }, { "epoch": 0.09075585988938636, "grad_norm": 2.4674465656280518, "learning_rate": 4.546879115090861e-05, "loss": 1.474, "step": 3446 }, { "epoch": 0.09078219647089808, "grad_norm": 1.9420452117919922, "learning_rate": 4.546747432183303e-05, "loss": 1.978, "step": 3447 }, { "epoch": 0.0908085330524098, "grad_norm": 3.726248264312744, "learning_rate": 4.5466157492757444e-05, "loss": 0.7192, "step": 3448 }, { "epoch": 0.09083486963392151, "grad_norm": 3.000051259994507, "learning_rate": 4.546484066368186e-05, "loss": 1.6535, "step": 3449 }, { "epoch": 0.09086120621543324, "grad_norm": 2.382986068725586, "learning_rate": 4.546352383460627e-05, "loss": 2.1222, "step": 3450 }, { "epoch": 0.09088754279694496, "grad_norm": 4.2474775314331055, "learning_rate": 4.5462207005530684e-05, "loss": 1.8899, "step": 3451 }, { "epoch": 0.09091387937845667, "grad_norm": 4.1579670906066895, "learning_rate": 4.54608901764551e-05, "loss": 1.727, "step": 3452 }, { "epoch": 0.0909402159599684, "grad_norm": 1.8609178066253662, "learning_rate": 4.5459573347379515e-05, "loss": 1.9584, "step": 3453 }, { "epoch": 0.09096655254148012, "grad_norm": 2.892361879348755, "learning_rate": 4.5458256518303924e-05, "loss": 1.4133, "step": 3454 }, { "epoch": 0.09099288912299183, "grad_norm": 3.1268770694732666, "learning_rate": 4.545693968922834e-05, "loss": 1.2705, "step": 3455 }, { "epoch": 0.09101922570450356, 
"grad_norm": 2.5478532314300537, "learning_rate": 4.5455622860152755e-05, "loss": 2.2414, "step": 3456 }, { "epoch": 0.09104556228601528, "grad_norm": 3.5269534587860107, "learning_rate": 4.545430603107717e-05, "loss": 2.0966, "step": 3457 }, { "epoch": 0.09107189886752699, "grad_norm": 3.479748010635376, "learning_rate": 4.5452989202001586e-05, "loss": 1.0934, "step": 3458 }, { "epoch": 0.09109823544903871, "grad_norm": 2.448650598526001, "learning_rate": 4.5451672372925995e-05, "loss": 1.6114, "step": 3459 }, { "epoch": 0.09112457203055044, "grad_norm": 4.655614852905273, "learning_rate": 4.545035554385041e-05, "loss": 0.7682, "step": 3460 }, { "epoch": 0.09115090861206215, "grad_norm": 3.621953248977661, "learning_rate": 4.544903871477482e-05, "loss": 1.2884, "step": 3461 }, { "epoch": 0.09117724519357387, "grad_norm": 1.8357905149459839, "learning_rate": 4.544772188569924e-05, "loss": 1.9588, "step": 3462 }, { "epoch": 0.0912035817750856, "grad_norm": 2.350175619125366, "learning_rate": 4.544640505662365e-05, "loss": 1.5626, "step": 3463 }, { "epoch": 0.09122991835659731, "grad_norm": 2.3754656314849854, "learning_rate": 4.5445088227548066e-05, "loss": 1.5809, "step": 3464 }, { "epoch": 0.09125625493810903, "grad_norm": 3.776057243347168, "learning_rate": 4.544377139847248e-05, "loss": 1.0369, "step": 3465 }, { "epoch": 0.09128259151962076, "grad_norm": 2.334263324737549, "learning_rate": 4.544245456939689e-05, "loss": 1.731, "step": 3466 }, { "epoch": 0.09130892810113247, "grad_norm": 4.7469305992126465, "learning_rate": 4.544113774032131e-05, "loss": 1.9404, "step": 3467 }, { "epoch": 0.09133526468264419, "grad_norm": 3.5409865379333496, "learning_rate": 4.543982091124572e-05, "loss": 1.7411, "step": 3468 }, { "epoch": 0.0913616012641559, "grad_norm": 1.9411232471466064, "learning_rate": 4.543850408217014e-05, "loss": 1.692, "step": 3469 }, { "epoch": 0.09138793784566764, "grad_norm": 1.9997941255569458, "learning_rate": 4.5437187253094546e-05, "loss": 
1.9027, "step": 3470 }, { "epoch": 0.09141427442717935, "grad_norm": 4.417593479156494, "learning_rate": 4.543587042401897e-05, "loss": 2.0089, "step": 3471 }, { "epoch": 0.09144061100869107, "grad_norm": 1.9682583808898926, "learning_rate": 4.543455359494338e-05, "loss": 1.6445, "step": 3472 }, { "epoch": 0.0914669475902028, "grad_norm": 2.2500627040863037, "learning_rate": 4.543323676586779e-05, "loss": 1.7452, "step": 3473 }, { "epoch": 0.09149328417171451, "grad_norm": 2.26936411857605, "learning_rate": 4.543191993679221e-05, "loss": 2.1707, "step": 3474 }, { "epoch": 0.09151962075322623, "grad_norm": 4.783372402191162, "learning_rate": 4.543060310771662e-05, "loss": 2.3268, "step": 3475 }, { "epoch": 0.09154595733473796, "grad_norm": 2.153541088104248, "learning_rate": 4.542928627864104e-05, "loss": 1.7916, "step": 3476 }, { "epoch": 0.09157229391624967, "grad_norm": 2.3099043369293213, "learning_rate": 4.542796944956545e-05, "loss": 2.0975, "step": 3477 }, { "epoch": 0.09159863049776139, "grad_norm": 3.0742597579956055, "learning_rate": 4.5426652620489864e-05, "loss": 1.5007, "step": 3478 }, { "epoch": 0.0916249670792731, "grad_norm": 4.193145275115967, "learning_rate": 4.542533579141427e-05, "loss": 1.5927, "step": 3479 }, { "epoch": 0.09165130366078483, "grad_norm": 2.1241064071655273, "learning_rate": 4.542401896233869e-05, "loss": 2.051, "step": 3480 }, { "epoch": 0.09167764024229655, "grad_norm": 2.8343396186828613, "learning_rate": 4.5422702133263104e-05, "loss": 2.055, "step": 3481 }, { "epoch": 0.09170397682380826, "grad_norm": 3.0514485836029053, "learning_rate": 4.542138530418752e-05, "loss": 2.1494, "step": 3482 }, { "epoch": 0.09173031340532, "grad_norm": 4.142845630645752, "learning_rate": 4.5420068475111935e-05, "loss": 1.5494, "step": 3483 }, { "epoch": 0.09175664998683171, "grad_norm": 5.5925774574279785, "learning_rate": 4.5418751646036344e-05, "loss": 1.7694, "step": 3484 }, { "epoch": 0.09178298656834342, "grad_norm": 2.1784701347351074, 
"learning_rate": 4.5417434816960767e-05, "loss": 1.7135, "step": 3485 }, { "epoch": 0.09180932314985515, "grad_norm": 4.068819999694824, "learning_rate": 4.5416117987885175e-05, "loss": 1.0339, "step": 3486 }, { "epoch": 0.09183565973136687, "grad_norm": 4.306507110595703, "learning_rate": 4.541480115880959e-05, "loss": 1.8897, "step": 3487 }, { "epoch": 0.09186199631287859, "grad_norm": 2.95487904548645, "learning_rate": 4.5413484329734e-05, "loss": 0.6175, "step": 3488 }, { "epoch": 0.09188833289439031, "grad_norm": 1.7955442667007446, "learning_rate": 4.5412167500658415e-05, "loss": 2.0957, "step": 3489 }, { "epoch": 0.09191466947590203, "grad_norm": 1.7751002311706543, "learning_rate": 4.541085067158283e-05, "loss": 2.3951, "step": 3490 }, { "epoch": 0.09194100605741375, "grad_norm": 3.020322799682617, "learning_rate": 4.5409533842507247e-05, "loss": 1.2985, "step": 3491 }, { "epoch": 0.09196734263892546, "grad_norm": 2.5244171619415283, "learning_rate": 4.5408217013431655e-05, "loss": 0.9907, "step": 3492 }, { "epoch": 0.09199367922043719, "grad_norm": 2.9127509593963623, "learning_rate": 4.540690018435607e-05, "loss": 1.3944, "step": 3493 }, { "epoch": 0.0920200158019489, "grad_norm": 4.527369022369385, "learning_rate": 4.540558335528049e-05, "loss": 1.9289, "step": 3494 }, { "epoch": 0.09204635238346062, "grad_norm": 3.5853872299194336, "learning_rate": 4.54042665262049e-05, "loss": 1.4307, "step": 3495 }, { "epoch": 0.09207268896497235, "grad_norm": 4.448419570922852, "learning_rate": 4.540294969712932e-05, "loss": 1.4803, "step": 3496 }, { "epoch": 0.09209902554648407, "grad_norm": 2.9541196823120117, "learning_rate": 4.540163286805373e-05, "loss": 2.2405, "step": 3497 }, { "epoch": 0.09212536212799578, "grad_norm": 5.0814948081970215, "learning_rate": 4.540031603897814e-05, "loss": 0.8587, "step": 3498 }, { "epoch": 0.09215169870950751, "grad_norm": 2.039577007293701, "learning_rate": 4.539899920990255e-05, "loss": 1.8517, "step": 3499 }, { "epoch": 
0.09217803529101923, "grad_norm": 3.2318668365478516, "learning_rate": 4.5397682380826973e-05, "loss": 1.984, "step": 3500 }, { "epoch": 0.09220437187253094, "grad_norm": 4.1662163734436035, "learning_rate": 4.539636555175138e-05, "loss": 2.0723, "step": 3501 }, { "epoch": 0.09223070845404266, "grad_norm": 3.0352349281311035, "learning_rate": 4.53950487226758e-05, "loss": 1.4149, "step": 3502 }, { "epoch": 0.09225704503555439, "grad_norm": 3.3878934383392334, "learning_rate": 4.5393731893600213e-05, "loss": 1.1382, "step": 3503 }, { "epoch": 0.0922833816170661, "grad_norm": 3.484917402267456, "learning_rate": 4.539241506452463e-05, "loss": 2.4563, "step": 3504 }, { "epoch": 0.09230971819857782, "grad_norm": 2.1364645957946777, "learning_rate": 4.5391098235449045e-05, "loss": 1.8152, "step": 3505 }, { "epoch": 0.09233605478008955, "grad_norm": 3.4091784954071045, "learning_rate": 4.5389781406373453e-05, "loss": 2.3247, "step": 3506 }, { "epoch": 0.09236239136160126, "grad_norm": 2.75504732131958, "learning_rate": 4.538846457729787e-05, "loss": 2.0052, "step": 3507 }, { "epoch": 0.09238872794311298, "grad_norm": 1.5291812419891357, "learning_rate": 4.538714774822228e-05, "loss": 1.6434, "step": 3508 }, { "epoch": 0.09241506452462471, "grad_norm": 2.5041794776916504, "learning_rate": 4.53858309191467e-05, "loss": 1.8082, "step": 3509 }, { "epoch": 0.09244140110613643, "grad_norm": 4.507685661315918, "learning_rate": 4.538451409007111e-05, "loss": 1.6125, "step": 3510 }, { "epoch": 0.09246773768764814, "grad_norm": 4.7342634201049805, "learning_rate": 4.5383197260995525e-05, "loss": 1.9749, "step": 3511 }, { "epoch": 0.09249407426915986, "grad_norm": 2.9032680988311768, "learning_rate": 4.538188043191994e-05, "loss": 2.1286, "step": 3512 }, { "epoch": 0.09252041085067159, "grad_norm": 3.5357062816619873, "learning_rate": 4.538056360284435e-05, "loss": 2.0115, "step": 3513 }, { "epoch": 0.0925467474321833, "grad_norm": 2.528106927871704, "learning_rate": 
4.537924677376877e-05, "loss": 1.2313, "step": 3514 }, { "epoch": 0.09257308401369502, "grad_norm": 1.802107334136963, "learning_rate": 4.537792994469318e-05, "loss": 1.2219, "step": 3515 }, { "epoch": 0.09259942059520675, "grad_norm": 2.346094846725464, "learning_rate": 4.5376613115617596e-05, "loss": 2.0631, "step": 3516 }, { "epoch": 0.09262575717671846, "grad_norm": 3.0261690616607666, "learning_rate": 4.5375296286542005e-05, "loss": 1.4302, "step": 3517 }, { "epoch": 0.09265209375823018, "grad_norm": 2.1139276027679443, "learning_rate": 4.537397945746643e-05, "loss": 1.4349, "step": 3518 }, { "epoch": 0.09267843033974191, "grad_norm": 2.240539312362671, "learning_rate": 4.5372662628390836e-05, "loss": 1.9774, "step": 3519 }, { "epoch": 0.09270476692125362, "grad_norm": 4.797918319702148, "learning_rate": 4.537134579931525e-05, "loss": 0.8176, "step": 3520 }, { "epoch": 0.09273110350276534, "grad_norm": 2.712881326675415, "learning_rate": 4.537002897023967e-05, "loss": 0.6057, "step": 3521 }, { "epoch": 0.09275744008427705, "grad_norm": 2.269455671310425, "learning_rate": 4.5368712141164076e-05, "loss": 2.0535, "step": 3522 }, { "epoch": 0.09278377666578878, "grad_norm": 3.852405548095703, "learning_rate": 4.53673953120885e-05, "loss": 0.8587, "step": 3523 }, { "epoch": 0.0928101132473005, "grad_norm": 3.5605695247650146, "learning_rate": 4.536607848301291e-05, "loss": 1.5908, "step": 3524 }, { "epoch": 0.09283644982881221, "grad_norm": 3.5673930644989014, "learning_rate": 4.536476165393732e-05, "loss": 2.4376, "step": 3525 }, { "epoch": 0.09286278641032394, "grad_norm": 2.3191635608673096, "learning_rate": 4.536344482486173e-05, "loss": 2.012, "step": 3526 }, { "epoch": 0.09288912299183566, "grad_norm": 1.6937735080718994, "learning_rate": 4.536212799578615e-05, "loss": 1.7562, "step": 3527 }, { "epoch": 0.09291545957334738, "grad_norm": 2.5325427055358887, "learning_rate": 4.536081116671056e-05, "loss": 1.7308, "step": 3528 }, { "epoch": 0.0929417961548591, 
"grad_norm": 1.8451688289642334, "learning_rate": 4.535949433763498e-05, "loss": 1.8, "step": 3529 }, { "epoch": 0.09296813273637082, "grad_norm": 2.1341207027435303, "learning_rate": 4.5358177508559394e-05, "loss": 1.6055, "step": 3530 }, { "epoch": 0.09299446931788254, "grad_norm": 3.2626891136169434, "learning_rate": 4.53568606794838e-05, "loss": 1.4218, "step": 3531 }, { "epoch": 0.09302080589939427, "grad_norm": 2.2403175830841064, "learning_rate": 4.5355543850408225e-05, "loss": 0.917, "step": 3532 }, { "epoch": 0.09304714248090598, "grad_norm": 2.618361234664917, "learning_rate": 4.5354227021332634e-05, "loss": 0.5938, "step": 3533 }, { "epoch": 0.0930734790624177, "grad_norm": 2.2554256916046143, "learning_rate": 4.535291019225705e-05, "loss": 1.2907, "step": 3534 }, { "epoch": 0.09309981564392941, "grad_norm": 1.6092514991760254, "learning_rate": 4.535159336318146e-05, "loss": 1.7301, "step": 3535 }, { "epoch": 0.09312615222544114, "grad_norm": 1.8308345079421997, "learning_rate": 4.5350276534105874e-05, "loss": 2.04, "step": 3536 }, { "epoch": 0.09315248880695286, "grad_norm": 1.8022361993789673, "learning_rate": 4.534895970503029e-05, "loss": 2.0434, "step": 3537 }, { "epoch": 0.09317882538846457, "grad_norm": 2.0214109420776367, "learning_rate": 4.5347642875954705e-05, "loss": 1.6638, "step": 3538 }, { "epoch": 0.0932051619699763, "grad_norm": 2.7885022163391113, "learning_rate": 4.5346326046879114e-05, "loss": 2.6573, "step": 3539 }, { "epoch": 0.09323149855148802, "grad_norm": 4.343166828155518, "learning_rate": 4.534500921780353e-05, "loss": 1.0507, "step": 3540 }, { "epoch": 0.09325783513299973, "grad_norm": 1.6867811679840088, "learning_rate": 4.5343692388727945e-05, "loss": 1.7796, "step": 3541 }, { "epoch": 0.09328417171451146, "grad_norm": 2.3982093334198, "learning_rate": 4.534237555965236e-05, "loss": 1.7246, "step": 3542 }, { "epoch": 0.09331050829602318, "grad_norm": 1.8931176662445068, "learning_rate": 4.5341058730576776e-05, "loss": 1.804, 
"step": 3543 }, { "epoch": 0.0933368448775349, "grad_norm": 2.561126947402954, "learning_rate": 4.5339741901501185e-05, "loss": 1.1635, "step": 3544 }, { "epoch": 0.09336318145904661, "grad_norm": 3.875622034072876, "learning_rate": 4.53384250724256e-05, "loss": 0.9318, "step": 3545 }, { "epoch": 0.09338951804055834, "grad_norm": 4.270608901977539, "learning_rate": 4.533710824335001e-05, "loss": 2.0671, "step": 3546 }, { "epoch": 0.09341585462207005, "grad_norm": 2.244062662124634, "learning_rate": 4.533579141427443e-05, "loss": 1.3546, "step": 3547 }, { "epoch": 0.09344219120358177, "grad_norm": 2.1929867267608643, "learning_rate": 4.533447458519884e-05, "loss": 1.8429, "step": 3548 }, { "epoch": 0.0934685277850935, "grad_norm": 2.2113425731658936, "learning_rate": 4.5333157756123256e-05, "loss": 1.9521, "step": 3549 }, { "epoch": 0.09349486436660522, "grad_norm": 1.865720510482788, "learning_rate": 4.533184092704767e-05, "loss": 1.7548, "step": 3550 }, { "epoch": 0.09352120094811693, "grad_norm": 4.912597179412842, "learning_rate": 4.533052409797209e-05, "loss": 1.0805, "step": 3551 }, { "epoch": 0.09354753752962866, "grad_norm": 2.5251331329345703, "learning_rate": 4.53292072688965e-05, "loss": 1.7525, "step": 3552 }, { "epoch": 0.09357387411114038, "grad_norm": 1.4777976274490356, "learning_rate": 4.532789043982091e-05, "loss": 1.5884, "step": 3553 }, { "epoch": 0.09360021069265209, "grad_norm": 3.00777006149292, "learning_rate": 4.532657361074533e-05, "loss": 2.3691, "step": 3554 }, { "epoch": 0.09362654727416381, "grad_norm": 7.6721038818359375, "learning_rate": 4.5325256781669736e-05, "loss": 1.1305, "step": 3555 }, { "epoch": 0.09365288385567554, "grad_norm": 1.8230565786361694, "learning_rate": 4.532393995259416e-05, "loss": 2.1075, "step": 3556 }, { "epoch": 0.09367922043718725, "grad_norm": 1.7814339399337769, "learning_rate": 4.532262312351857e-05, "loss": 2.1857, "step": 3557 }, { "epoch": 0.09370555701869897, "grad_norm": 4.5540289878845215, 
"learning_rate": 4.532130629444298e-05, "loss": 1.3009, "step": 3558 }, { "epoch": 0.0937318936002107, "grad_norm": 1.9780340194702148, "learning_rate": 4.53199894653674e-05, "loss": 1.6564, "step": 3559 }, { "epoch": 0.09375823018172241, "grad_norm": 2.5220141410827637, "learning_rate": 4.531867263629181e-05, "loss": 1.9551, "step": 3560 }, { "epoch": 0.09378456676323413, "grad_norm": 4.293489456176758, "learning_rate": 4.531735580721623e-05, "loss": 2.0275, "step": 3561 }, { "epoch": 0.09381090334474586, "grad_norm": 2.2610857486724854, "learning_rate": 4.531603897814064e-05, "loss": 1.5956, "step": 3562 }, { "epoch": 0.09383723992625757, "grad_norm": 4.735538482666016, "learning_rate": 4.5314722149065054e-05, "loss": 0.8918, "step": 3563 }, { "epoch": 0.09386357650776929, "grad_norm": 3.5435924530029297, "learning_rate": 4.531340531998946e-05, "loss": 1.6614, "step": 3564 }, { "epoch": 0.093889913089281, "grad_norm": 1.715957522392273, "learning_rate": 4.5312088490913886e-05, "loss": 2.4809, "step": 3565 }, { "epoch": 0.09391624967079273, "grad_norm": 4.611833572387695, "learning_rate": 4.5310771661838294e-05, "loss": 1.4671, "step": 3566 }, { "epoch": 0.09394258625230445, "grad_norm": 2.0678489208221436, "learning_rate": 4.530945483276271e-05, "loss": 0.5457, "step": 3567 }, { "epoch": 0.09396892283381617, "grad_norm": 2.0815773010253906, "learning_rate": 4.5308138003687126e-05, "loss": 1.6149, "step": 3568 }, { "epoch": 0.0939952594153279, "grad_norm": 3.303248882293701, "learning_rate": 4.5306821174611534e-05, "loss": 1.2468, "step": 3569 }, { "epoch": 0.09402159599683961, "grad_norm": 2.458906888961792, "learning_rate": 4.530550434553596e-05, "loss": 1.6612, "step": 3570 }, { "epoch": 0.09404793257835133, "grad_norm": 3.835932493209839, "learning_rate": 4.5304187516460366e-05, "loss": 2.5313, "step": 3571 }, { "epoch": 0.09407426915986306, "grad_norm": 1.7421433925628662, "learning_rate": 4.530287068738478e-05, "loss": 1.5607, "step": 3572 }, { "epoch": 
0.09410060574137477, "grad_norm": 5.627799987792969, "learning_rate": 4.530155385830919e-05, "loss": 0.3134, "step": 3573 }, { "epoch": 0.09412694232288649, "grad_norm": 2.220440149307251, "learning_rate": 4.5300237029233606e-05, "loss": 2.4299, "step": 3574 }, { "epoch": 0.09415327890439822, "grad_norm": 2.5359511375427246, "learning_rate": 4.529892020015802e-05, "loss": 2.1116, "step": 3575 }, { "epoch": 0.09417961548590993, "grad_norm": 3.1033644676208496, "learning_rate": 4.529760337108244e-05, "loss": 1.6407, "step": 3576 }, { "epoch": 0.09420595206742165, "grad_norm": 1.7653049230575562, "learning_rate": 4.529628654200685e-05, "loss": 1.629, "step": 3577 }, { "epoch": 0.09423228864893336, "grad_norm": 1.9302960634231567, "learning_rate": 4.529496971293126e-05, "loss": 1.71, "step": 3578 }, { "epoch": 0.09425862523044509, "grad_norm": 2.943619966506958, "learning_rate": 4.529365288385568e-05, "loss": 0.7295, "step": 3579 }, { "epoch": 0.09428496181195681, "grad_norm": 2.1683950424194336, "learning_rate": 4.529233605478009e-05, "loss": 1.6699, "step": 3580 }, { "epoch": 0.09431129839346852, "grad_norm": 2.8118507862091064, "learning_rate": 4.529101922570451e-05, "loss": 1.3244, "step": 3581 }, { "epoch": 0.09433763497498025, "grad_norm": 2.927462100982666, "learning_rate": 4.528970239662892e-05, "loss": 2.366, "step": 3582 }, { "epoch": 0.09436397155649197, "grad_norm": 4.5996599197387695, "learning_rate": 4.528838556755333e-05, "loss": 1.4591, "step": 3583 }, { "epoch": 0.09439030813800368, "grad_norm": 2.266023635864258, "learning_rate": 4.528706873847775e-05, "loss": 2.0292, "step": 3584 }, { "epoch": 0.09441664471951541, "grad_norm": 4.423757076263428, "learning_rate": 4.5285751909402164e-05, "loss": 1.973, "step": 3585 }, { "epoch": 0.09444298130102713, "grad_norm": 6.177351951599121, "learning_rate": 4.528443508032658e-05, "loss": 1.9698, "step": 3586 }, { "epoch": 0.09446931788253884, "grad_norm": 3.440419912338257, "learning_rate": 
4.528311825125099e-05, "loss": 1.8221, "step": 3587 }, { "epoch": 0.09449565446405056, "grad_norm": 2.492399215698242, "learning_rate": 4.5281801422175404e-05, "loss": 2.0154, "step": 3588 }, { "epoch": 0.09452199104556229, "grad_norm": 2.0901925563812256, "learning_rate": 4.528048459309982e-05, "loss": 0.9605, "step": 3589 }, { "epoch": 0.094548327627074, "grad_norm": 1.8072923421859741, "learning_rate": 4.5279167764024235e-05, "loss": 1.7053, "step": 3590 }, { "epoch": 0.09457466420858572, "grad_norm": 2.0344669818878174, "learning_rate": 4.5277850934948644e-05, "loss": 1.8751, "step": 3591 }, { "epoch": 0.09460100079009745, "grad_norm": 2.647554397583008, "learning_rate": 4.527653410587306e-05, "loss": 1.9211, "step": 3592 }, { "epoch": 0.09462733737160917, "grad_norm": 3.688228130340576, "learning_rate": 4.527521727679747e-05, "loss": 2.1693, "step": 3593 }, { "epoch": 0.09465367395312088, "grad_norm": 2.506467580795288, "learning_rate": 4.527390044772189e-05, "loss": 1.9605, "step": 3594 }, { "epoch": 0.09468001053463261, "grad_norm": 4.199458599090576, "learning_rate": 4.52725836186463e-05, "loss": 0.9748, "step": 3595 }, { "epoch": 0.09470634711614433, "grad_norm": 2.637802839279175, "learning_rate": 4.5271266789570715e-05, "loss": 0.9159, "step": 3596 }, { "epoch": 0.09473268369765604, "grad_norm": 1.9187687635421753, "learning_rate": 4.526994996049513e-05, "loss": 1.8459, "step": 3597 }, { "epoch": 0.09475902027916776, "grad_norm": 2.075516700744629, "learning_rate": 4.5268633131419546e-05, "loss": 2.3203, "step": 3598 }, { "epoch": 0.09478535686067949, "grad_norm": 2.219097375869751, "learning_rate": 4.526731630234396e-05, "loss": 2.166, "step": 3599 }, { "epoch": 0.0948116934421912, "grad_norm": 2.6724448204040527, "learning_rate": 4.526599947326837e-05, "loss": 1.9467, "step": 3600 }, { "epoch": 0.09483803002370292, "grad_norm": 1.9259095191955566, "learning_rate": 4.5264682644192786e-05, "loss": 1.9302, "step": 3601 }, { "epoch": 0.09486436660521465, 
"grad_norm": 2.850823163986206, "learning_rate": 4.5263365815117195e-05, "loss": 1.8097, "step": 3602 }, { "epoch": 0.09489070318672636, "grad_norm": 3.107180595397949, "learning_rate": 4.526204898604162e-05, "loss": 1.3213, "step": 3603 }, { "epoch": 0.09491703976823808, "grad_norm": 3.6365256309509277, "learning_rate": 4.5260732156966026e-05, "loss": 1.0158, "step": 3604 }, { "epoch": 0.09494337634974981, "grad_norm": 1.9909608364105225, "learning_rate": 4.525941532789044e-05, "loss": 1.9825, "step": 3605 }, { "epoch": 0.09496971293126152, "grad_norm": 2.534580945968628, "learning_rate": 4.525809849881486e-05, "loss": 2.0879, "step": 3606 }, { "epoch": 0.09499604951277324, "grad_norm": 2.238964796066284, "learning_rate": 4.5256781669739266e-05, "loss": 1.5413, "step": 3607 }, { "epoch": 0.09502238609428496, "grad_norm": 1.7729644775390625, "learning_rate": 4.525546484066369e-05, "loss": 1.5519, "step": 3608 }, { "epoch": 0.09504872267579668, "grad_norm": 2.237271547317505, "learning_rate": 4.52541480115881e-05, "loss": 1.8953, "step": 3609 }, { "epoch": 0.0950750592573084, "grad_norm": 1.7214950323104858, "learning_rate": 4.525283118251251e-05, "loss": 1.4514, "step": 3610 }, { "epoch": 0.09510139583882012, "grad_norm": 2.7791130542755127, "learning_rate": 4.525151435343692e-05, "loss": 1.595, "step": 3611 }, { "epoch": 0.09512773242033185, "grad_norm": 5.118468284606934, "learning_rate": 4.525019752436134e-05, "loss": 1.3965, "step": 3612 }, { "epoch": 0.09515406900184356, "grad_norm": 2.1532459259033203, "learning_rate": 4.524888069528575e-05, "loss": 1.1161, "step": 3613 }, { "epoch": 0.09518040558335528, "grad_norm": 2.744793653488159, "learning_rate": 4.524756386621017e-05, "loss": 0.6077, "step": 3614 }, { "epoch": 0.095206742164867, "grad_norm": 3.585491895675659, "learning_rate": 4.5246247037134584e-05, "loss": 1.3194, "step": 3615 }, { "epoch": 0.09523307874637872, "grad_norm": 1.915829062461853, "learning_rate": 4.524493020805899e-05, "loss": 1.469, 
"step": 3616 }, { "epoch": 0.09525941532789044, "grad_norm": 2.767784595489502, "learning_rate": 4.5243613378983415e-05, "loss": 2.0906, "step": 3617 }, { "epoch": 0.09528575190940215, "grad_norm": 2.7908408641815186, "learning_rate": 4.5242296549907824e-05, "loss": 2.1139, "step": 3618 }, { "epoch": 0.09531208849091388, "grad_norm": 2.8103015422821045, "learning_rate": 4.524097972083224e-05, "loss": 1.83, "step": 3619 }, { "epoch": 0.0953384250724256, "grad_norm": 1.7638869285583496, "learning_rate": 4.523966289175665e-05, "loss": 0.5245, "step": 3620 }, { "epoch": 0.09536476165393731, "grad_norm": 3.2313201427459717, "learning_rate": 4.5238346062681064e-05, "loss": 1.5791, "step": 3621 }, { "epoch": 0.09539109823544904, "grad_norm": 2.9392025470733643, "learning_rate": 4.523702923360548e-05, "loss": 1.0644, "step": 3622 }, { "epoch": 0.09541743481696076, "grad_norm": 2.4455037117004395, "learning_rate": 4.5235712404529895e-05, "loss": 2.24, "step": 3623 }, { "epoch": 0.09544377139847247, "grad_norm": 6.37355899810791, "learning_rate": 4.523439557545431e-05, "loss": 1.3735, "step": 3624 }, { "epoch": 0.0954701079799842, "grad_norm": 6.340606212615967, "learning_rate": 4.523307874637872e-05, "loss": 0.7241, "step": 3625 }, { "epoch": 0.09549644456149592, "grad_norm": 2.154714584350586, "learning_rate": 4.5231761917303135e-05, "loss": 1.112, "step": 3626 }, { "epoch": 0.09552278114300763, "grad_norm": 2.133312940597534, "learning_rate": 4.523044508822755e-05, "loss": 1.2442, "step": 3627 }, { "epoch": 0.09554911772451936, "grad_norm": 3.2518069744110107, "learning_rate": 4.522912825915197e-05, "loss": 2.0202, "step": 3628 }, { "epoch": 0.09557545430603108, "grad_norm": 1.9350520372390747, "learning_rate": 4.5227811430076375e-05, "loss": 1.9149, "step": 3629 }, { "epoch": 0.0956017908875428, "grad_norm": 2.430617332458496, "learning_rate": 4.522649460100079e-05, "loss": 2.3257, "step": 3630 }, { "epoch": 0.09562812746905451, "grad_norm": 2.9433438777923584, 
"learning_rate": 4.522517777192521e-05, "loss": 1.324, "step": 3631 }, { "epoch": 0.09565446405056624, "grad_norm": 8.794106483459473, "learning_rate": 4.522386094284962e-05, "loss": 1.8983, "step": 3632 }, { "epoch": 0.09568080063207796, "grad_norm": 5.53743314743042, "learning_rate": 4.522254411377404e-05, "loss": 1.3177, "step": 3633 }, { "epoch": 0.09570713721358967, "grad_norm": 3.19970703125, "learning_rate": 4.522122728469845e-05, "loss": 1.6594, "step": 3634 }, { "epoch": 0.0957334737951014, "grad_norm": 2.3984477519989014, "learning_rate": 4.521991045562286e-05, "loss": 1.6546, "step": 3635 }, { "epoch": 0.09575981037661312, "grad_norm": 4.994995594024658, "learning_rate": 4.521859362654728e-05, "loss": 2.6488, "step": 3636 }, { "epoch": 0.09578614695812483, "grad_norm": 3.187831163406372, "learning_rate": 4.5217276797471693e-05, "loss": 2.6566, "step": 3637 }, { "epoch": 0.09581248353963656, "grad_norm": 1.8058795928955078, "learning_rate": 4.52159599683961e-05, "loss": 1.6738, "step": 3638 }, { "epoch": 0.09583882012114828, "grad_norm": 2.445007085800171, "learning_rate": 4.521464313932052e-05, "loss": 1.3263, "step": 3639 }, { "epoch": 0.09586515670265999, "grad_norm": 1.6659449338912964, "learning_rate": 4.521332631024493e-05, "loss": 1.6478, "step": 3640 }, { "epoch": 0.09589149328417171, "grad_norm": 2.046175956726074, "learning_rate": 4.521200948116935e-05, "loss": 1.7378, "step": 3641 }, { "epoch": 0.09591782986568344, "grad_norm": 2.381976366043091, "learning_rate": 4.521069265209376e-05, "loss": 1.7872, "step": 3642 }, { "epoch": 0.09594416644719515, "grad_norm": 3.948195457458496, "learning_rate": 4.5209375823018174e-05, "loss": 1.816, "step": 3643 }, { "epoch": 0.09597050302870687, "grad_norm": 2.0074939727783203, "learning_rate": 4.520805899394259e-05, "loss": 2.2602, "step": 3644 }, { "epoch": 0.0959968396102186, "grad_norm": 2.225593328475952, "learning_rate": 4.5206742164867e-05, "loss": 2.3454, "step": 3645 }, { "epoch": 
0.09602317619173031, "grad_norm": 3.638373613357544, "learning_rate": 4.520542533579142e-05, "loss": 0.9837, "step": 3646 }, { "epoch": 0.09604951277324203, "grad_norm": 2.8617241382598877, "learning_rate": 4.520410850671583e-05, "loss": 0.8882, "step": 3647 }, { "epoch": 0.09607584935475376, "grad_norm": 7.497365474700928, "learning_rate": 4.5202791677640245e-05, "loss": 1.0505, "step": 3648 }, { "epoch": 0.09610218593626547, "grad_norm": 2.914956569671631, "learning_rate": 4.5201474848564654e-05, "loss": 1.8209, "step": 3649 }, { "epoch": 0.09612852251777719, "grad_norm": 2.335761308670044, "learning_rate": 4.5200158019489076e-05, "loss": 1.4838, "step": 3650 }, { "epoch": 0.0961548590992889, "grad_norm": 3.184492826461792, "learning_rate": 4.5198841190413485e-05, "loss": 2.1476, "step": 3651 }, { "epoch": 0.09618119568080064, "grad_norm": 3.5369904041290283, "learning_rate": 4.51975243613379e-05, "loss": 1.3361, "step": 3652 }, { "epoch": 0.09620753226231235, "grad_norm": 2.3214669227600098, "learning_rate": 4.5196207532262316e-05, "loss": 2.0435, "step": 3653 }, { "epoch": 0.09623386884382407, "grad_norm": 1.8957983255386353, "learning_rate": 4.5194890703186725e-05, "loss": 1.9756, "step": 3654 }, { "epoch": 0.0962602054253358, "grad_norm": 6.836841106414795, "learning_rate": 4.519357387411115e-05, "loss": 4.0496, "step": 3655 }, { "epoch": 0.09628654200684751, "grad_norm": 8.642605781555176, "learning_rate": 4.5192257045035556e-05, "loss": 1.8218, "step": 3656 }, { "epoch": 0.09631287858835923, "grad_norm": 5.0724334716796875, "learning_rate": 4.519094021595997e-05, "loss": 2.0119, "step": 3657 }, { "epoch": 0.09633921516987096, "grad_norm": 1.6873488426208496, "learning_rate": 4.518962338688438e-05, "loss": 1.9877, "step": 3658 }, { "epoch": 0.09636555175138267, "grad_norm": 2.1610593795776367, "learning_rate": 4.5188306557808796e-05, "loss": 2.7636, "step": 3659 }, { "epoch": 0.09639188833289439, "grad_norm": 3.336202383041382, "learning_rate": 
4.518698972873321e-05, "loss": 1.2608, "step": 3660 }, { "epoch": 0.0964182249144061, "grad_norm": 2.1662914752960205, "learning_rate": 4.518567289965763e-05, "loss": 1.6087, "step": 3661 }, { "epoch": 0.09644456149591783, "grad_norm": 1.8431601524353027, "learning_rate": 4.518435607058204e-05, "loss": 1.5027, "step": 3662 }, { "epoch": 0.09647089807742955, "grad_norm": 2.0472123622894287, "learning_rate": 4.518303924150645e-05, "loss": 2.1419, "step": 3663 }, { "epoch": 0.09649723465894126, "grad_norm": 1.5350587368011475, "learning_rate": 4.5181722412430874e-05, "loss": 1.2688, "step": 3664 }, { "epoch": 0.096523571240453, "grad_norm": 2.0831425189971924, "learning_rate": 4.518040558335528e-05, "loss": 1.7411, "step": 3665 }, { "epoch": 0.09654990782196471, "grad_norm": 2.745490789413452, "learning_rate": 4.51790887542797e-05, "loss": 1.8446, "step": 3666 }, { "epoch": 0.09657624440347642, "grad_norm": 2.4591481685638428, "learning_rate": 4.517777192520411e-05, "loss": 1.3824, "step": 3667 }, { "epoch": 0.09660258098498815, "grad_norm": 4.647483825683594, "learning_rate": 4.517645509612852e-05, "loss": 2.1942, "step": 3668 }, { "epoch": 0.09662891756649987, "grad_norm": 2.449812650680542, "learning_rate": 4.517513826705294e-05, "loss": 1.9773, "step": 3669 }, { "epoch": 0.09665525414801159, "grad_norm": 2.2432150840759277, "learning_rate": 4.5173821437977354e-05, "loss": 1.8984, "step": 3670 }, { "epoch": 0.09668159072952331, "grad_norm": 2.4704086780548096, "learning_rate": 4.517250460890177e-05, "loss": 0.739, "step": 3671 }, { "epoch": 0.09670792731103503, "grad_norm": 3.129194498062134, "learning_rate": 4.517118777982618e-05, "loss": 2.5484, "step": 3672 }, { "epoch": 0.09673426389254675, "grad_norm": 2.480876922607422, "learning_rate": 4.5169870950750594e-05, "loss": 0.6907, "step": 3673 }, { "epoch": 0.09676060047405846, "grad_norm": 1.9893954992294312, "learning_rate": 4.516855412167501e-05, "loss": 2.3519, "step": 3674 }, { "epoch": 0.09678693705557019, 
"grad_norm": 2.693394184112549, "learning_rate": 4.5167237292599425e-05, "loss": 2.2721, "step": 3675 }, { "epoch": 0.0968132736370819, "grad_norm": 1.923799753189087, "learning_rate": 4.5165920463523834e-05, "loss": 1.2271, "step": 3676 }, { "epoch": 0.09683961021859362, "grad_norm": 2.5754921436309814, "learning_rate": 4.516460363444825e-05, "loss": 2.0139, "step": 3677 }, { "epoch": 0.09686594680010535, "grad_norm": 2.0736756324768066, "learning_rate": 4.5163286805372665e-05, "loss": 1.5492, "step": 3678 }, { "epoch": 0.09689228338161707, "grad_norm": 2.9396755695343018, "learning_rate": 4.516196997629708e-05, "loss": 1.3545, "step": 3679 }, { "epoch": 0.09691861996312878, "grad_norm": 2.0697171688079834, "learning_rate": 4.5160653147221496e-05, "loss": 2.2178, "step": 3680 }, { "epoch": 0.09694495654464051, "grad_norm": 2.6935935020446777, "learning_rate": 4.5159336318145905e-05, "loss": 2.1558, "step": 3681 }, { "epoch": 0.09697129312615223, "grad_norm": 2.0553812980651855, "learning_rate": 4.515801948907032e-05, "loss": 1.7593, "step": 3682 }, { "epoch": 0.09699762970766394, "grad_norm": 4.6022419929504395, "learning_rate": 4.5156702659994736e-05, "loss": 1.3408, "step": 3683 }, { "epoch": 0.09702396628917566, "grad_norm": 2.250375747680664, "learning_rate": 4.515538583091915e-05, "loss": 1.8138, "step": 3684 }, { "epoch": 0.09705030287068739, "grad_norm": 3.7652809619903564, "learning_rate": 4.515406900184356e-05, "loss": 2.105, "step": 3685 }, { "epoch": 0.0970766394521991, "grad_norm": 2.4491586685180664, "learning_rate": 4.5152752172767976e-05, "loss": 2.4553, "step": 3686 }, { "epoch": 0.09710297603371082, "grad_norm": 3.584064245223999, "learning_rate": 4.515143534369239e-05, "loss": 0.5796, "step": 3687 }, { "epoch": 0.09712931261522255, "grad_norm": 1.9849361181259155, "learning_rate": 4.515011851461681e-05, "loss": 1.7573, "step": 3688 }, { "epoch": 0.09715564919673426, "grad_norm": 1.837116003036499, "learning_rate": 4.514880168554122e-05, "loss": 
1.8401, "step": 3689 }, { "epoch": 0.09718198577824598, "grad_norm": 3.2957603931427, "learning_rate": 4.514748485646563e-05, "loss": 1.9486, "step": 3690 }, { "epoch": 0.09720832235975771, "grad_norm": 1.8041918277740479, "learning_rate": 4.514616802739005e-05, "loss": 1.36, "step": 3691 }, { "epoch": 0.09723465894126943, "grad_norm": 3.6805500984191895, "learning_rate": 4.5144851198314456e-05, "loss": 1.9729, "step": 3692 }, { "epoch": 0.09726099552278114, "grad_norm": 2.446524143218994, "learning_rate": 4.514353436923888e-05, "loss": 1.4213, "step": 3693 }, { "epoch": 0.09728733210429286, "grad_norm": 2.0969126224517822, "learning_rate": 4.514221754016329e-05, "loss": 1.9055, "step": 3694 }, { "epoch": 0.09731366868580459, "grad_norm": 4.652068614959717, "learning_rate": 4.51409007110877e-05, "loss": 0.7785, "step": 3695 }, { "epoch": 0.0973400052673163, "grad_norm": 2.7421815395355225, "learning_rate": 4.513958388201211e-05, "loss": 1.282, "step": 3696 }, { "epoch": 0.09736634184882802, "grad_norm": 4.3839569091796875, "learning_rate": 4.5138267052936534e-05, "loss": 1.85, "step": 3697 }, { "epoch": 0.09739267843033975, "grad_norm": 2.232586145401001, "learning_rate": 4.513695022386094e-05, "loss": 1.8399, "step": 3698 }, { "epoch": 0.09741901501185146, "grad_norm": 2.443162202835083, "learning_rate": 4.513563339478536e-05, "loss": 2.2778, "step": 3699 }, { "epoch": 0.09744535159336318, "grad_norm": 2.664928674697876, "learning_rate": 4.5134316565709774e-05, "loss": 1.0631, "step": 3700 }, { "epoch": 0.09747168817487491, "grad_norm": 1.9210928678512573, "learning_rate": 4.513299973663418e-05, "loss": 2.0139, "step": 3701 }, { "epoch": 0.09749802475638662, "grad_norm": 2.0483391284942627, "learning_rate": 4.5131682907558606e-05, "loss": 1.6869, "step": 3702 }, { "epoch": 0.09752436133789834, "grad_norm": 2.570066213607788, "learning_rate": 4.5130366078483015e-05, "loss": 1.7603, "step": 3703 }, { "epoch": 0.09755069791941005, "grad_norm": 2.8687045574188232, 
"learning_rate": 4.512904924940743e-05, "loss": 2.3633, "step": 3704 }, { "epoch": 0.09757703450092178, "grad_norm": 2.9738430976867676, "learning_rate": 4.512773242033184e-05, "loss": 1.4073, "step": 3705 }, { "epoch": 0.0976033710824335, "grad_norm": 2.5782880783081055, "learning_rate": 4.5126415591256255e-05, "loss": 1.9642, "step": 3706 }, { "epoch": 0.09762970766394521, "grad_norm": 2.512622117996216, "learning_rate": 4.512509876218067e-05, "loss": 1.6972, "step": 3707 }, { "epoch": 0.09765604424545694, "grad_norm": 4.486298084259033, "learning_rate": 4.5123781933105086e-05, "loss": 1.5843, "step": 3708 }, { "epoch": 0.09768238082696866, "grad_norm": 1.773552417755127, "learning_rate": 4.51224651040295e-05, "loss": 1.689, "step": 3709 }, { "epoch": 0.09770871740848038, "grad_norm": 2.6420490741729736, "learning_rate": 4.512114827495391e-05, "loss": 1.6237, "step": 3710 }, { "epoch": 0.0977350539899921, "grad_norm": 3.664464235305786, "learning_rate": 4.5119831445878326e-05, "loss": 1.0444, "step": 3711 }, { "epoch": 0.09776139057150382, "grad_norm": 1.7949111461639404, "learning_rate": 4.511851461680274e-05, "loss": 1.9621, "step": 3712 }, { "epoch": 0.09778772715301554, "grad_norm": 4.075164794921875, "learning_rate": 4.511719778772716e-05, "loss": 1.4136, "step": 3713 }, { "epoch": 0.09781406373452727, "grad_norm": 2.90339994430542, "learning_rate": 4.5115880958651566e-05, "loss": 2.0128, "step": 3714 }, { "epoch": 0.09784040031603898, "grad_norm": 2.8007373809814453, "learning_rate": 4.511456412957598e-05, "loss": 0.6457, "step": 3715 }, { "epoch": 0.0978667368975507, "grad_norm": 3.6463592052459717, "learning_rate": 4.51132473005004e-05, "loss": 1.9723, "step": 3716 }, { "epoch": 0.09789307347906241, "grad_norm": 2.673933506011963, "learning_rate": 4.511193047142481e-05, "loss": 2.447, "step": 3717 }, { "epoch": 0.09791941006057414, "grad_norm": 1.8435684442520142, "learning_rate": 4.511061364234923e-05, "loss": 2.3292, "step": 3718 }, { "epoch": 
0.09794574664208586, "grad_norm": 2.1149821281433105, "learning_rate": 4.510929681327364e-05, "loss": 1.9996, "step": 3719 }, { "epoch": 0.09797208322359757, "grad_norm": 2.6121387481689453, "learning_rate": 4.510797998419805e-05, "loss": 1.7739, "step": 3720 }, { "epoch": 0.0979984198051093, "grad_norm": 1.902207374572754, "learning_rate": 4.510666315512247e-05, "loss": 1.6011, "step": 3721 }, { "epoch": 0.09802475638662102, "grad_norm": 2.0520217418670654, "learning_rate": 4.5105346326046884e-05, "loss": 2.1635, "step": 3722 }, { "epoch": 0.09805109296813273, "grad_norm": 2.4916579723358154, "learning_rate": 4.510402949697129e-05, "loss": 1.272, "step": 3723 }, { "epoch": 0.09807742954964446, "grad_norm": 2.4104409217834473, "learning_rate": 4.510271266789571e-05, "loss": 1.9387, "step": 3724 }, { "epoch": 0.09810376613115618, "grad_norm": 1.992605447769165, "learning_rate": 4.5101395838820124e-05, "loss": 2.2106, "step": 3725 }, { "epoch": 0.0981301027126679, "grad_norm": 2.632085084915161, "learning_rate": 4.510007900974454e-05, "loss": 1.8529, "step": 3726 }, { "epoch": 0.09815643929417961, "grad_norm": 1.9421182870864868, "learning_rate": 4.5098762180668955e-05, "loss": 2.2319, "step": 3727 }, { "epoch": 0.09818277587569134, "grad_norm": 4.704944610595703, "learning_rate": 4.5097445351593364e-05, "loss": 2.316, "step": 3728 }, { "epoch": 0.09820911245720305, "grad_norm": 2.330127477645874, "learning_rate": 4.509612852251778e-05, "loss": 1.648, "step": 3729 }, { "epoch": 0.09823544903871477, "grad_norm": 3.828066110610962, "learning_rate": 4.5094811693442195e-05, "loss": 0.742, "step": 3730 }, { "epoch": 0.0982617856202265, "grad_norm": 5.853951454162598, "learning_rate": 4.509349486436661e-05, "loss": 1.2046, "step": 3731 }, { "epoch": 0.09828812220173822, "grad_norm": 3.631951093673706, "learning_rate": 4.509217803529102e-05, "loss": 1.0181, "step": 3732 }, { "epoch": 0.09831445878324993, "grad_norm": 1.8185824155807495, "learning_rate": 
4.5090861206215435e-05, "loss": 1.5716, "step": 3733 }, { "epoch": 0.09834079536476166, "grad_norm": 3.048617362976074, "learning_rate": 4.508954437713985e-05, "loss": 1.9356, "step": 3734 }, { "epoch": 0.09836713194627338, "grad_norm": 2.251716375350952, "learning_rate": 4.5088227548064266e-05, "loss": 2.5036, "step": 3735 }, { "epoch": 0.09839346852778509, "grad_norm": 2.165480852127075, "learning_rate": 4.508691071898868e-05, "loss": 1.5794, "step": 3736 }, { "epoch": 0.0984198051092968, "grad_norm": 1.8818281888961792, "learning_rate": 4.508559388991309e-05, "loss": 1.8376, "step": 3737 }, { "epoch": 0.09844614169080854, "grad_norm": 4.45981502532959, "learning_rate": 4.5084277060837506e-05, "loss": 1.5519, "step": 3738 }, { "epoch": 0.09847247827232025, "grad_norm": 2.865576982498169, "learning_rate": 4.5082960231761915e-05, "loss": 0.4992, "step": 3739 }, { "epoch": 0.09849881485383197, "grad_norm": 2.348257541656494, "learning_rate": 4.508164340268634e-05, "loss": 2.2336, "step": 3740 }, { "epoch": 0.0985251514353437, "grad_norm": 2.0799033641815186, "learning_rate": 4.5080326573610746e-05, "loss": 2.0153, "step": 3741 }, { "epoch": 0.09855148801685541, "grad_norm": 3.990330457687378, "learning_rate": 4.507900974453516e-05, "loss": 1.3609, "step": 3742 }, { "epoch": 0.09857782459836713, "grad_norm": 2.3788907527923584, "learning_rate": 4.507769291545957e-05, "loss": 1.8452, "step": 3743 }, { "epoch": 0.09860416117987886, "grad_norm": 2.714266777038574, "learning_rate": 4.5076376086383986e-05, "loss": 2.0757, "step": 3744 }, { "epoch": 0.09863049776139057, "grad_norm": 2.107374906539917, "learning_rate": 4.50750592573084e-05, "loss": 1.8191, "step": 3745 }, { "epoch": 0.09865683434290229, "grad_norm": 4.436236381530762, "learning_rate": 4.507374242823282e-05, "loss": 1.948, "step": 3746 }, { "epoch": 0.098683170924414, "grad_norm": 2.7902300357818604, "learning_rate": 4.507242559915723e-05, "loss": 2.0981, "step": 3747 }, { "epoch": 0.09870950750592573, 
"grad_norm": 1.75342857837677, "learning_rate": 4.507110877008164e-05, "loss": 1.8215, "step": 3748 }, { "epoch": 0.09873584408743745, "grad_norm": 3.086707830429077, "learning_rate": 4.5069791941006064e-05, "loss": 1.8107, "step": 3749 }, { "epoch": 0.09876218066894916, "grad_norm": 2.3465688228607178, "learning_rate": 4.506847511193047e-05, "loss": 1.8832, "step": 3750 }, { "epoch": 0.0987885172504609, "grad_norm": 2.120803117752075, "learning_rate": 4.506715828285489e-05, "loss": 1.3609, "step": 3751 }, { "epoch": 0.09881485383197261, "grad_norm": 4.1749348640441895, "learning_rate": 4.50658414537793e-05, "loss": 1.6664, "step": 3752 }, { "epoch": 0.09884119041348433, "grad_norm": 2.644705295562744, "learning_rate": 4.506452462470371e-05, "loss": 1.3371, "step": 3753 }, { "epoch": 0.09886752699499606, "grad_norm": 2.0972976684570312, "learning_rate": 4.506320779562813e-05, "loss": 1.8616, "step": 3754 }, { "epoch": 0.09889386357650777, "grad_norm": 2.455526113510132, "learning_rate": 4.5061890966552544e-05, "loss": 1.9646, "step": 3755 }, { "epoch": 0.09892020015801949, "grad_norm": 1.8271840810775757, "learning_rate": 4.506057413747696e-05, "loss": 0.5271, "step": 3756 }, { "epoch": 0.09894653673953122, "grad_norm": 2.690463066101074, "learning_rate": 4.505925730840137e-05, "loss": 2.1319, "step": 3757 }, { "epoch": 0.09897287332104293, "grad_norm": 2.247929573059082, "learning_rate": 4.5057940479325784e-05, "loss": 1.5843, "step": 3758 }, { "epoch": 0.09899920990255465, "grad_norm": 1.897557020187378, "learning_rate": 4.50566236502502e-05, "loss": 2.2191, "step": 3759 }, { "epoch": 0.09902554648406636, "grad_norm": 1.8032574653625488, "learning_rate": 4.5055306821174615e-05, "loss": 1.7499, "step": 3760 }, { "epoch": 0.09905188306557809, "grad_norm": 2.043447732925415, "learning_rate": 4.5053989992099024e-05, "loss": 1.8648, "step": 3761 }, { "epoch": 0.09907821964708981, "grad_norm": 2.2304294109344482, "learning_rate": 4.505267316302344e-05, "loss": 0.5406, 
"step": 3762 }, { "epoch": 0.09910455622860152, "grad_norm": 3.9684667587280273, "learning_rate": 4.5051356333947856e-05, "loss": 2.356, "step": 3763 }, { "epoch": 0.09913089281011325, "grad_norm": 2.1880557537078857, "learning_rate": 4.505003950487227e-05, "loss": 1.0215, "step": 3764 }, { "epoch": 0.09915722939162497, "grad_norm": 3.6301941871643066, "learning_rate": 4.504872267579669e-05, "loss": 1.4861, "step": 3765 }, { "epoch": 0.09918356597313668, "grad_norm": 5.493913173675537, "learning_rate": 4.5047405846721096e-05, "loss": 1.5048, "step": 3766 }, { "epoch": 0.09920990255464841, "grad_norm": 1.8711073398590088, "learning_rate": 4.504608901764551e-05, "loss": 1.7595, "step": 3767 }, { "epoch": 0.09923623913616013, "grad_norm": 3.579517126083374, "learning_rate": 4.504477218856993e-05, "loss": 1.4115, "step": 3768 }, { "epoch": 0.09926257571767184, "grad_norm": 2.168915033340454, "learning_rate": 4.504345535949434e-05, "loss": 1.7171, "step": 3769 }, { "epoch": 0.09928891229918356, "grad_norm": 3.1862964630126953, "learning_rate": 4.504213853041875e-05, "loss": 0.8614, "step": 3770 }, { "epoch": 0.09931524888069529, "grad_norm": 3.594576835632324, "learning_rate": 4.504082170134317e-05, "loss": 1.4088, "step": 3771 }, { "epoch": 0.099341585462207, "grad_norm": 3.186267375946045, "learning_rate": 4.503950487226758e-05, "loss": 1.3349, "step": 3772 }, { "epoch": 0.09936792204371872, "grad_norm": 2.0694220066070557, "learning_rate": 4.5038188043192e-05, "loss": 1.8724, "step": 3773 }, { "epoch": 0.09939425862523045, "grad_norm": 2.560370683670044, "learning_rate": 4.5036871214116414e-05, "loss": 2.3651, "step": 3774 }, { "epoch": 0.09942059520674217, "grad_norm": 1.939302682876587, "learning_rate": 4.503555438504082e-05, "loss": 1.5869, "step": 3775 }, { "epoch": 0.09944693178825388, "grad_norm": 2.949557065963745, "learning_rate": 4.503423755596524e-05, "loss": 2.1721, "step": 3776 }, { "epoch": 0.09947326836976561, "grad_norm": 3.7444915771484375, 
"learning_rate": 4.503292072688965e-05, "loss": 2.4473, "step": 3777 }, { "epoch": 0.09949960495127733, "grad_norm": 2.419501543045044, "learning_rate": 4.503160389781407e-05, "loss": 1.7612, "step": 3778 }, { "epoch": 0.09952594153278904, "grad_norm": 4.271428108215332, "learning_rate": 4.503028706873848e-05, "loss": 1.4787, "step": 3779 }, { "epoch": 0.09955227811430076, "grad_norm": 1.6048818826675415, "learning_rate": 4.5028970239662894e-05, "loss": 1.8313, "step": 3780 }, { "epoch": 0.09957861469581249, "grad_norm": 2.2156386375427246, "learning_rate": 4.502765341058731e-05, "loss": 0.429, "step": 3781 }, { "epoch": 0.0996049512773242, "grad_norm": 5.712406635284424, "learning_rate": 4.5026336581511725e-05, "loss": 2.3388, "step": 3782 }, { "epoch": 0.09963128785883592, "grad_norm": 4.480116367340088, "learning_rate": 4.502501975243614e-05, "loss": 1.572, "step": 3783 }, { "epoch": 0.09965762444034765, "grad_norm": 1.6718860864639282, "learning_rate": 4.502370292336055e-05, "loss": 1.8966, "step": 3784 }, { "epoch": 0.09968396102185936, "grad_norm": 3.7211596965789795, "learning_rate": 4.5022386094284965e-05, "loss": 1.313, "step": 3785 }, { "epoch": 0.09971029760337108, "grad_norm": 4.084120750427246, "learning_rate": 4.5021069265209374e-05, "loss": 0.7511, "step": 3786 }, { "epoch": 0.09973663418488281, "grad_norm": 8.82149887084961, "learning_rate": 4.5019752436133796e-05, "loss": 1.9127, "step": 3787 }, { "epoch": 0.09976297076639452, "grad_norm": 3.8063817024230957, "learning_rate": 4.5018435607058205e-05, "loss": 0.9581, "step": 3788 }, { "epoch": 0.09978930734790624, "grad_norm": 2.2536633014678955, "learning_rate": 4.501711877798262e-05, "loss": 1.6813, "step": 3789 }, { "epoch": 0.09981564392941795, "grad_norm": 3.1599223613739014, "learning_rate": 4.5015801948907036e-05, "loss": 0.4418, "step": 3790 }, { "epoch": 0.09984198051092968, "grad_norm": 1.986271858215332, "learning_rate": 4.5014485119831445e-05, "loss": 1.4102, "step": 3791 }, { "epoch": 
0.0998683170924414, "grad_norm": 2.5842885971069336, "learning_rate": 4.501316829075587e-05, "loss": 2.2324, "step": 3792 }, { "epoch": 0.09989465367395312, "grad_norm": 3.0341603755950928, "learning_rate": 4.5011851461680276e-05, "loss": 1.2503, "step": 3793 }, { "epoch": 0.09992099025546484, "grad_norm": 1.8510781526565552, "learning_rate": 4.501053463260469e-05, "loss": 1.4495, "step": 3794 }, { "epoch": 0.09994732683697656, "grad_norm": 2.9165852069854736, "learning_rate": 4.50092178035291e-05, "loss": 2.8417, "step": 3795 }, { "epoch": 0.09997366341848828, "grad_norm": 2.408564567565918, "learning_rate": 4.500790097445352e-05, "loss": 2.2565, "step": 3796 }, { "epoch": 0.1, "grad_norm": 2.188035726547241, "learning_rate": 4.500658414537793e-05, "loss": 1.5012, "step": 3797 }, { "epoch": 0.10002633658151172, "grad_norm": 4.150546550750732, "learning_rate": 4.500526731630235e-05, "loss": 2.3743, "step": 3798 }, { "epoch": 0.10005267316302344, "grad_norm": 3.530560255050659, "learning_rate": 4.5003950487226756e-05, "loss": 1.2454, "step": 3799 }, { "epoch": 0.10007900974453517, "grad_norm": 4.042998313903809, "learning_rate": 4.500263365815117e-05, "loss": 1.4332, "step": 3800 }, { "epoch": 0.10010534632604688, "grad_norm": 2.05966854095459, "learning_rate": 4.500131682907559e-05, "loss": 2.2651, "step": 3801 }, { "epoch": 0.1001316829075586, "grad_norm": 1.9672224521636963, "learning_rate": 4.5e-05, "loss": 1.5866, "step": 3802 }, { "epoch": 0.10015801948907031, "grad_norm": 2.380561351776123, "learning_rate": 4.499868317092442e-05, "loss": 1.4811, "step": 3803 }, { "epoch": 0.10018435607058204, "grad_norm": 2.6420717239379883, "learning_rate": 4.499736634184883e-05, "loss": 1.7474, "step": 3804 }, { "epoch": 0.10021069265209376, "grad_norm": 9.223584175109863, "learning_rate": 4.499604951277324e-05, "loss": 0.8603, "step": 3805 }, { "epoch": 0.10023702923360547, "grad_norm": 2.276423215866089, "learning_rate": 4.499473268369766e-05, "loss": 1.6951, "step": 3806 
}, { "epoch": 0.1002633658151172, "grad_norm": 2.397859811782837, "learning_rate": 4.4993415854622074e-05, "loss": 1.2482, "step": 3807 }, { "epoch": 0.10028970239662892, "grad_norm": 3.1488842964172363, "learning_rate": 4.499209902554648e-05, "loss": 1.3942, "step": 3808 }, { "epoch": 0.10031603897814063, "grad_norm": 2.4266960620880127, "learning_rate": 4.49907821964709e-05, "loss": 2.1123, "step": 3809 }, { "epoch": 0.10034237555965236, "grad_norm": 2.485085964202881, "learning_rate": 4.4989465367395314e-05, "loss": 1.7803, "step": 3810 }, { "epoch": 0.10036871214116408, "grad_norm": 3.4337573051452637, "learning_rate": 4.498814853831973e-05, "loss": 0.5831, "step": 3811 }, { "epoch": 0.1003950487226758, "grad_norm": 3.493136405944824, "learning_rate": 4.4986831709244145e-05, "loss": 1.7557, "step": 3812 }, { "epoch": 0.10042138530418751, "grad_norm": 2.224128007888794, "learning_rate": 4.4985514880168554e-05, "loss": 2.2421, "step": 3813 }, { "epoch": 0.10044772188569924, "grad_norm": 1.8008248805999756, "learning_rate": 4.498419805109297e-05, "loss": 2.1127, "step": 3814 }, { "epoch": 0.10047405846721096, "grad_norm": 2.4031283855438232, "learning_rate": 4.4982881222017385e-05, "loss": 1.8835, "step": 3815 }, { "epoch": 0.10050039504872267, "grad_norm": 2.955813407897949, "learning_rate": 4.49815643929418e-05, "loss": 2.2879, "step": 3816 }, { "epoch": 0.1005267316302344, "grad_norm": 3.2041842937469482, "learning_rate": 4.498024756386621e-05, "loss": 1.4607, "step": 3817 }, { "epoch": 0.10055306821174612, "grad_norm": 3.0943050384521484, "learning_rate": 4.4978930734790625e-05, "loss": 1.6395, "step": 3818 }, { "epoch": 0.10057940479325783, "grad_norm": 2.2135558128356934, "learning_rate": 4.497761390571504e-05, "loss": 1.8168, "step": 3819 }, { "epoch": 0.10060574137476956, "grad_norm": 2.5748634338378906, "learning_rate": 4.4976297076639456e-05, "loss": 1.5128, "step": 3820 }, { "epoch": 0.10063207795628128, "grad_norm": 4.733027935028076, "learning_rate": 
4.497498024756387e-05, "loss": 0.8025, "step": 3821 }, { "epoch": 0.10065841453779299, "grad_norm": 2.3711979389190674, "learning_rate": 4.497366341848828e-05, "loss": 2.2044, "step": 3822 }, { "epoch": 0.10068475111930471, "grad_norm": 2.8120198249816895, "learning_rate": 4.4972346589412697e-05, "loss": 1.6934, "step": 3823 }, { "epoch": 0.10071108770081644, "grad_norm": 2.93912935256958, "learning_rate": 4.4971029760337105e-05, "loss": 1.3493, "step": 3824 }, { "epoch": 0.10073742428232815, "grad_norm": 2.880110502243042, "learning_rate": 4.496971293126153e-05, "loss": 1.2028, "step": 3825 }, { "epoch": 0.10076376086383987, "grad_norm": 2.13470196723938, "learning_rate": 4.4968396102185937e-05, "loss": 1.8668, "step": 3826 }, { "epoch": 0.1007900974453516, "grad_norm": 3.4758222103118896, "learning_rate": 4.496707927311035e-05, "loss": 2.0979, "step": 3827 }, { "epoch": 0.10081643402686331, "grad_norm": 2.111302375793457, "learning_rate": 4.496576244403477e-05, "loss": 1.6521, "step": 3828 }, { "epoch": 0.10084277060837503, "grad_norm": 2.630613327026367, "learning_rate": 4.496444561495918e-05, "loss": 1.3449, "step": 3829 }, { "epoch": 0.10086910718988676, "grad_norm": 5.345747470855713, "learning_rate": 4.49631287858836e-05, "loss": 1.1698, "step": 3830 }, { "epoch": 0.10089544377139847, "grad_norm": 3.3519644737243652, "learning_rate": 4.496181195680801e-05, "loss": 0.7235, "step": 3831 }, { "epoch": 0.10092178035291019, "grad_norm": 1.7442739009857178, "learning_rate": 4.496049512773242e-05, "loss": 1.81, "step": 3832 }, { "epoch": 0.1009481169344219, "grad_norm": 2.1400156021118164, "learning_rate": 4.495917829865683e-05, "loss": 1.7726, "step": 3833 }, { "epoch": 0.10097445351593363, "grad_norm": 2.0222418308258057, "learning_rate": 4.4957861469581255e-05, "loss": 2.4957, "step": 3834 }, { "epoch": 0.10100079009744535, "grad_norm": 2.494149923324585, "learning_rate": 4.495654464050566e-05, "loss": 2.0013, "step": 3835 }, { "epoch": 0.10102712667895707, 
"grad_norm": 4.976408958435059, "learning_rate": 4.495522781143008e-05, "loss": 1.5433, "step": 3836 }, { "epoch": 0.1010534632604688, "grad_norm": 1.8104454278945923, "learning_rate": 4.4953910982354495e-05, "loss": 1.3188, "step": 3837 }, { "epoch": 0.10107979984198051, "grad_norm": 1.821811318397522, "learning_rate": 4.49525941532789e-05, "loss": 1.5422, "step": 3838 }, { "epoch": 0.10110613642349223, "grad_norm": 2.410536289215088, "learning_rate": 4.4951277324203326e-05, "loss": 1.7623, "step": 3839 }, { "epoch": 0.10113247300500396, "grad_norm": 4.866095542907715, "learning_rate": 4.4949960495127735e-05, "loss": 2.2759, "step": 3840 }, { "epoch": 0.10115880958651567, "grad_norm": 2.438474655151367, "learning_rate": 4.494864366605215e-05, "loss": 1.7839, "step": 3841 }, { "epoch": 0.10118514616802739, "grad_norm": 2.5334908962249756, "learning_rate": 4.494732683697656e-05, "loss": 2.0451, "step": 3842 }, { "epoch": 0.10121148274953912, "grad_norm": 1.5801764726638794, "learning_rate": 4.494601000790098e-05, "loss": 1.6087, "step": 3843 }, { "epoch": 0.10123781933105083, "grad_norm": 2.1839675903320312, "learning_rate": 4.494469317882539e-05, "loss": 2.2102, "step": 3844 }, { "epoch": 0.10126415591256255, "grad_norm": 3.1293671131134033, "learning_rate": 4.4943376349749806e-05, "loss": 1.4554, "step": 3845 }, { "epoch": 0.10129049249407426, "grad_norm": 2.179328441619873, "learning_rate": 4.4942059520674215e-05, "loss": 1.833, "step": 3846 }, { "epoch": 0.10131682907558599, "grad_norm": 1.850671410560608, "learning_rate": 4.494074269159863e-05, "loss": 2.6449, "step": 3847 }, { "epoch": 0.10134316565709771, "grad_norm": 4.069072723388672, "learning_rate": 4.4939425862523046e-05, "loss": 1.3082, "step": 3848 }, { "epoch": 0.10136950223860942, "grad_norm": 3.0500688552856445, "learning_rate": 4.493810903344746e-05, "loss": 1.5405, "step": 3849 }, { "epoch": 0.10139583882012115, "grad_norm": 2.249415397644043, "learning_rate": 4.493679220437188e-05, "loss": 
2.0155, "step": 3850 }, { "epoch": 0.10142217540163287, "grad_norm": 2.2017037868499756, "learning_rate": 4.4935475375296286e-05, "loss": 1.9497, "step": 3851 }, { "epoch": 0.10144851198314458, "grad_norm": 3.5100765228271484, "learning_rate": 4.49341585462207e-05, "loss": 1.483, "step": 3852 }, { "epoch": 0.10147484856465631, "grad_norm": 2.7092349529266357, "learning_rate": 4.493284171714512e-05, "loss": 2.5334, "step": 3853 }, { "epoch": 0.10150118514616803, "grad_norm": 2.613349676132202, "learning_rate": 4.493152488806953e-05, "loss": 1.9086, "step": 3854 }, { "epoch": 0.10152752172767975, "grad_norm": 3.615673780441284, "learning_rate": 4.493020805899394e-05, "loss": 1.4175, "step": 3855 }, { "epoch": 0.10155385830919146, "grad_norm": 2.193631887435913, "learning_rate": 4.492889122991836e-05, "loss": 1.7111, "step": 3856 }, { "epoch": 0.10158019489070319, "grad_norm": 1.9345149993896484, "learning_rate": 4.492757440084277e-05, "loss": 0.6001, "step": 3857 }, { "epoch": 0.1016065314722149, "grad_norm": 1.9012631177902222, "learning_rate": 4.492625757176719e-05, "loss": 1.5349, "step": 3858 }, { "epoch": 0.10163286805372662, "grad_norm": 2.148568630218506, "learning_rate": 4.4924940742691604e-05, "loss": 2.1311, "step": 3859 }, { "epoch": 0.10165920463523835, "grad_norm": 3.65657639503479, "learning_rate": 4.492362391361601e-05, "loss": 0.6288, "step": 3860 }, { "epoch": 0.10168554121675007, "grad_norm": 2.1184465885162354, "learning_rate": 4.492230708454043e-05, "loss": 2.0239, "step": 3861 }, { "epoch": 0.10171187779826178, "grad_norm": 2.5524072647094727, "learning_rate": 4.4920990255464844e-05, "loss": 1.528, "step": 3862 }, { "epoch": 0.10173821437977351, "grad_norm": 2.9298508167266846, "learning_rate": 4.491967342638926e-05, "loss": 1.4404, "step": 3863 }, { "epoch": 0.10176455096128523, "grad_norm": 2.2487053871154785, "learning_rate": 4.491835659731367e-05, "loss": 0.1919, "step": 3864 }, { "epoch": 0.10179088754279694, "grad_norm": 5.0985870361328125, 
"learning_rate": 4.4917039768238084e-05, "loss": 0.9194, "step": 3865 }, { "epoch": 0.10181722412430866, "grad_norm": 2.198937177658081, "learning_rate": 4.49157229391625e-05, "loss": 1.7879, "step": 3866 }, { "epoch": 0.10184356070582039, "grad_norm": 2.556926727294922, "learning_rate": 4.4914406110086915e-05, "loss": 1.6204, "step": 3867 }, { "epoch": 0.1018698972873321, "grad_norm": 1.9259483814239502, "learning_rate": 4.491308928101133e-05, "loss": 1.5702, "step": 3868 }, { "epoch": 0.10189623386884382, "grad_norm": 2.51617169380188, "learning_rate": 4.491177245193574e-05, "loss": 1.7788, "step": 3869 }, { "epoch": 0.10192257045035555, "grad_norm": 2.9019148349761963, "learning_rate": 4.4910455622860155e-05, "loss": 1.4209, "step": 3870 }, { "epoch": 0.10194890703186726, "grad_norm": 2.2281227111816406, "learning_rate": 4.4909138793784564e-05, "loss": 2.0085, "step": 3871 }, { "epoch": 0.10197524361337898, "grad_norm": 1.8960057497024536, "learning_rate": 4.4907821964708986e-05, "loss": 1.9725, "step": 3872 }, { "epoch": 0.10200158019489071, "grad_norm": 1.6910125017166138, "learning_rate": 4.4906505135633395e-05, "loss": 2.1222, "step": 3873 }, { "epoch": 0.10202791677640242, "grad_norm": 2.12577223777771, "learning_rate": 4.490518830655781e-05, "loss": 2.1383, "step": 3874 }, { "epoch": 0.10205425335791414, "grad_norm": 2.494915246963501, "learning_rate": 4.4903871477482226e-05, "loss": 1.8938, "step": 3875 }, { "epoch": 0.10208058993942586, "grad_norm": 1.9448983669281006, "learning_rate": 4.490255464840664e-05, "loss": 1.7263, "step": 3876 }, { "epoch": 0.10210692652093759, "grad_norm": 2.9931201934814453, "learning_rate": 4.490123781933106e-05, "loss": 0.954, "step": 3877 }, { "epoch": 0.1021332631024493, "grad_norm": 3.022486925125122, "learning_rate": 4.4899920990255466e-05, "loss": 2.2079, "step": 3878 }, { "epoch": 0.10215959968396102, "grad_norm": 4.349597454071045, "learning_rate": 4.489860416117988e-05, "loss": 1.0891, "step": 3879 }, { "epoch": 
0.10218593626547275, "grad_norm": 2.0917694568634033, "learning_rate": 4.489728733210429e-05, "loss": 1.8808, "step": 3880 }, { "epoch": 0.10221227284698446, "grad_norm": 2.205721855163574, "learning_rate": 4.489597050302871e-05, "loss": 1.5352, "step": 3881 }, { "epoch": 0.10223860942849618, "grad_norm": 3.1652493476867676, "learning_rate": 4.489465367395312e-05, "loss": 1.918, "step": 3882 }, { "epoch": 0.1022649460100079, "grad_norm": 6.0257792472839355, "learning_rate": 4.489333684487754e-05, "loss": 2.119, "step": 3883 }, { "epoch": 0.10229128259151962, "grad_norm": 2.2120919227600098, "learning_rate": 4.489202001580195e-05, "loss": 1.2787, "step": 3884 }, { "epoch": 0.10231761917303134, "grad_norm": 3.2696492671966553, "learning_rate": 4.489070318672636e-05, "loss": 0.8269, "step": 3885 }, { "epoch": 0.10234395575454305, "grad_norm": 1.874639868736267, "learning_rate": 4.4889386357650784e-05, "loss": 1.6064, "step": 3886 }, { "epoch": 0.10237029233605478, "grad_norm": 1.9932085275650024, "learning_rate": 4.488806952857519e-05, "loss": 2.0404, "step": 3887 }, { "epoch": 0.1023966289175665, "grad_norm": 2.500261068344116, "learning_rate": 4.488675269949961e-05, "loss": 1.0359, "step": 3888 }, { "epoch": 0.10242296549907821, "grad_norm": 2.3774125576019287, "learning_rate": 4.488543587042402e-05, "loss": 2.052, "step": 3889 }, { "epoch": 0.10244930208058994, "grad_norm": 2.0700395107269287, "learning_rate": 4.488411904134843e-05, "loss": 1.8927, "step": 3890 }, { "epoch": 0.10247563866210166, "grad_norm": 1.9385424852371216, "learning_rate": 4.488280221227285e-05, "loss": 2.1316, "step": 3891 }, { "epoch": 0.10250197524361337, "grad_norm": 2.2179131507873535, "learning_rate": 4.4881485383197264e-05, "loss": 1.596, "step": 3892 }, { "epoch": 0.1025283118251251, "grad_norm": 1.9597073793411255, "learning_rate": 4.488016855412167e-05, "loss": 2.0666, "step": 3893 }, { "epoch": 0.10255464840663682, "grad_norm": 3.590493679046631, "learning_rate": 
4.487885172504609e-05, "loss": 0.797, "step": 3894 }, { "epoch": 0.10258098498814854, "grad_norm": 3.5431103706359863, "learning_rate": 4.487753489597051e-05, "loss": 0.9543, "step": 3895 }, { "epoch": 0.10260732156966026, "grad_norm": 3.236133098602295, "learning_rate": 4.487621806689492e-05, "loss": 1.0766, "step": 3896 }, { "epoch": 0.10263365815117198, "grad_norm": 5.906928539276123, "learning_rate": 4.4874901237819336e-05, "loss": 1.1328, "step": 3897 }, { "epoch": 0.1026599947326837, "grad_norm": 2.431095600128174, "learning_rate": 4.4873584408743744e-05, "loss": 1.3497, "step": 3898 }, { "epoch": 0.10268633131419541, "grad_norm": 2.8399951457977295, "learning_rate": 4.487226757966816e-05, "loss": 1.8871, "step": 3899 }, { "epoch": 0.10271266789570714, "grad_norm": 2.45115065574646, "learning_rate": 4.4870950750592576e-05, "loss": 1.7018, "step": 3900 }, { "epoch": 0.10273900447721886, "grad_norm": 2.5587565898895264, "learning_rate": 4.486963392151699e-05, "loss": 1.5922, "step": 3901 }, { "epoch": 0.10276534105873057, "grad_norm": 3.362809181213379, "learning_rate": 4.48683170924414e-05, "loss": 1.3428, "step": 3902 }, { "epoch": 0.1027916776402423, "grad_norm": 3.032858371734619, "learning_rate": 4.4867000263365816e-05, "loss": 1.6904, "step": 3903 }, { "epoch": 0.10281801422175402, "grad_norm": 3.6812729835510254, "learning_rate": 4.486568343429023e-05, "loss": 1.4182, "step": 3904 }, { "epoch": 0.10284435080326573, "grad_norm": 2.6831905841827393, "learning_rate": 4.486436660521465e-05, "loss": 1.6019, "step": 3905 }, { "epoch": 0.10287068738477746, "grad_norm": 1.8663004636764526, "learning_rate": 4.486304977613906e-05, "loss": 2.0442, "step": 3906 }, { "epoch": 0.10289702396628918, "grad_norm": 2.102212905883789, "learning_rate": 4.486173294706347e-05, "loss": 1.786, "step": 3907 }, { "epoch": 0.1029233605478009, "grad_norm": 4.404050350189209, "learning_rate": 4.486041611798789e-05, "loss": 1.5842, "step": 3908 }, { "epoch": 0.10294969712931261, 
"grad_norm": 7.1329755783081055, "learning_rate": 4.48590992889123e-05, "loss": 2.2315, "step": 3909 }, { "epoch": 0.10297603371082434, "grad_norm": 2.8774871826171875, "learning_rate": 4.485778245983672e-05, "loss": 1.2062, "step": 3910 }, { "epoch": 0.10300237029233605, "grad_norm": 1.801954984664917, "learning_rate": 4.485646563076113e-05, "loss": 1.6084, "step": 3911 }, { "epoch": 0.10302870687384777, "grad_norm": 2.9546315670013428, "learning_rate": 4.485514880168554e-05, "loss": 0.5987, "step": 3912 }, { "epoch": 0.1030550434553595, "grad_norm": 1.9714163541793823, "learning_rate": 4.485383197260996e-05, "loss": 1.8321, "step": 3913 }, { "epoch": 0.10308138003687121, "grad_norm": 1.6383371353149414, "learning_rate": 4.4852515143534374e-05, "loss": 1.4924, "step": 3914 }, { "epoch": 0.10310771661838293, "grad_norm": 3.2470390796661377, "learning_rate": 4.485119831445879e-05, "loss": 0.6634, "step": 3915 }, { "epoch": 0.10313405319989466, "grad_norm": 2.703115463256836, "learning_rate": 4.48498814853832e-05, "loss": 1.5607, "step": 3916 }, { "epoch": 0.10316038978140638, "grad_norm": 4.0662994384765625, "learning_rate": 4.4848564656307614e-05, "loss": 2.0106, "step": 3917 }, { "epoch": 0.10318672636291809, "grad_norm": 3.7793779373168945, "learning_rate": 4.484724782723202e-05, "loss": 2.5227, "step": 3918 }, { "epoch": 0.1032130629444298, "grad_norm": 2.2200205326080322, "learning_rate": 4.4845930998156445e-05, "loss": 1.8839, "step": 3919 }, { "epoch": 0.10323939952594154, "grad_norm": 1.6686019897460938, "learning_rate": 4.4844614169080854e-05, "loss": 1.5918, "step": 3920 }, { "epoch": 0.10326573610745325, "grad_norm": 2.117866039276123, "learning_rate": 4.484329734000527e-05, "loss": 1.644, "step": 3921 }, { "epoch": 0.10329207268896497, "grad_norm": 2.207977533340454, "learning_rate": 4.4841980510929685e-05, "loss": 1.5119, "step": 3922 }, { "epoch": 0.1033184092704767, "grad_norm": 2.9472129344940186, "learning_rate": 4.4840663681854094e-05, "loss": 
1.8325, "step": 3923 }, { "epoch": 0.10334474585198841, "grad_norm": 2.2142364978790283, "learning_rate": 4.4839346852778516e-05, "loss": 2.0007, "step": 3924 }, { "epoch": 0.10337108243350013, "grad_norm": 1.6783168315887451, "learning_rate": 4.4838030023702925e-05, "loss": 1.7686, "step": 3925 }, { "epoch": 0.10339741901501186, "grad_norm": 1.9474951028823853, "learning_rate": 4.483671319462734e-05, "loss": 1.6394, "step": 3926 }, { "epoch": 0.10342375559652357, "grad_norm": 2.798863172531128, "learning_rate": 4.483539636555175e-05, "loss": 1.4046, "step": 3927 }, { "epoch": 0.10345009217803529, "grad_norm": 4.639814376831055, "learning_rate": 4.483407953647617e-05, "loss": 0.7852, "step": 3928 }, { "epoch": 0.103476428759547, "grad_norm": 1.954859972000122, "learning_rate": 4.483276270740058e-05, "loss": 1.6035, "step": 3929 }, { "epoch": 0.10350276534105873, "grad_norm": 1.8147886991500854, "learning_rate": 4.4831445878324996e-05, "loss": 1.994, "step": 3930 }, { "epoch": 0.10352910192257045, "grad_norm": 3.092109441757202, "learning_rate": 4.483012904924941e-05, "loss": 1.8483, "step": 3931 }, { "epoch": 0.10355543850408216, "grad_norm": 4.166642189025879, "learning_rate": 4.482881222017382e-05, "loss": 1.3048, "step": 3932 }, { "epoch": 0.1035817750855939, "grad_norm": 1.8559051752090454, "learning_rate": 4.482749539109824e-05, "loss": 1.5706, "step": 3933 }, { "epoch": 0.10360811166710561, "grad_norm": 2.076110601425171, "learning_rate": 4.482617856202265e-05, "loss": 1.1648, "step": 3934 }, { "epoch": 0.10363444824861733, "grad_norm": 3.1907925605773926, "learning_rate": 4.482486173294707e-05, "loss": 1.7439, "step": 3935 }, { "epoch": 0.10366078483012905, "grad_norm": 2.254758596420288, "learning_rate": 4.4823544903871476e-05, "loss": 1.3704, "step": 3936 }, { "epoch": 0.10368712141164077, "grad_norm": 2.8785667419433594, "learning_rate": 4.482222807479589e-05, "loss": 1.7097, "step": 3937 }, { "epoch": 0.10371345799315249, "grad_norm": 2.2197372913360596, 
"learning_rate": 4.482091124572031e-05, "loss": 1.4248, "step": 3938 }, { "epoch": 0.10373979457466422, "grad_norm": 2.4855663776397705, "learning_rate": 4.481959441664472e-05, "loss": 1.6056, "step": 3939 }, { "epoch": 0.10376613115617593, "grad_norm": 3.4125561714172363, "learning_rate": 4.481827758756914e-05, "loss": 1.2521, "step": 3940 }, { "epoch": 0.10379246773768765, "grad_norm": 3.5316920280456543, "learning_rate": 4.481696075849355e-05, "loss": 1.2078, "step": 3941 }, { "epoch": 0.10381880431919936, "grad_norm": 1.9573456048965454, "learning_rate": 4.481564392941797e-05, "loss": 2.1511, "step": 3942 }, { "epoch": 0.10384514090071109, "grad_norm": 2.3475427627563477, "learning_rate": 4.481432710034238e-05, "loss": 1.44, "step": 3943 }, { "epoch": 0.10387147748222281, "grad_norm": 1.5809345245361328, "learning_rate": 4.4813010271266794e-05, "loss": 1.5978, "step": 3944 }, { "epoch": 0.10389781406373452, "grad_norm": 3.8339791297912598, "learning_rate": 4.48116934421912e-05, "loss": 2.7539, "step": 3945 }, { "epoch": 0.10392415064524625, "grad_norm": 1.9423012733459473, "learning_rate": 4.481037661311562e-05, "loss": 0.798, "step": 3946 }, { "epoch": 0.10395048722675797, "grad_norm": 4.6782145500183105, "learning_rate": 4.4809059784040034e-05, "loss": 1.3771, "step": 3947 }, { "epoch": 0.10397682380826968, "grad_norm": 2.536159038543701, "learning_rate": 4.480774295496445e-05, "loss": 1.1967, "step": 3948 }, { "epoch": 0.10400316038978141, "grad_norm": 3.16945481300354, "learning_rate": 4.480642612588886e-05, "loss": 2.148, "step": 3949 }, { "epoch": 0.10402949697129313, "grad_norm": 2.1217691898345947, "learning_rate": 4.4805109296813274e-05, "loss": 2.1199, "step": 3950 }, { "epoch": 0.10405583355280484, "grad_norm": 6.577202320098877, "learning_rate": 4.480379246773769e-05, "loss": 1.214, "step": 3951 }, { "epoch": 0.10408217013431656, "grad_norm": 3.3185997009277344, "learning_rate": 4.4802475638662105e-05, "loss": 1.468, "step": 3952 }, { "epoch": 
0.10410850671582829, "grad_norm": 1.7041481733322144, "learning_rate": 4.480115880958652e-05, "loss": 2.129, "step": 3953 }, { "epoch": 0.10413484329734, "grad_norm": 1.6504563093185425, "learning_rate": 4.479984198051093e-05, "loss": 1.8116, "step": 3954 }, { "epoch": 0.10416117987885172, "grad_norm": 1.7092504501342773, "learning_rate": 4.4798525151435345e-05, "loss": 1.6747, "step": 3955 }, { "epoch": 0.10418751646036345, "grad_norm": 4.163072109222412, "learning_rate": 4.4797208322359754e-05, "loss": 1.1396, "step": 3956 }, { "epoch": 0.10421385304187517, "grad_norm": 3.019253730773926, "learning_rate": 4.4795891493284177e-05, "loss": 0.8102, "step": 3957 }, { "epoch": 0.10424018962338688, "grad_norm": 1.8424838781356812, "learning_rate": 4.4794574664208585e-05, "loss": 2.2282, "step": 3958 }, { "epoch": 0.10426652620489861, "grad_norm": 2.632228374481201, "learning_rate": 4.4793257835133e-05, "loss": 1.8617, "step": 3959 }, { "epoch": 0.10429286278641033, "grad_norm": 2.02886700630188, "learning_rate": 4.4791941006057417e-05, "loss": 1.7877, "step": 3960 }, { "epoch": 0.10431919936792204, "grad_norm": 1.9014345407485962, "learning_rate": 4.479062417698183e-05, "loss": 1.3873, "step": 3961 }, { "epoch": 0.10434553594943376, "grad_norm": 2.2207207679748535, "learning_rate": 4.478930734790625e-05, "loss": 1.9763, "step": 3962 }, { "epoch": 0.10437187253094549, "grad_norm": 3.94708514213562, "learning_rate": 4.4787990518830657e-05, "loss": 1.4662, "step": 3963 }, { "epoch": 0.1043982091124572, "grad_norm": 4.338469982147217, "learning_rate": 4.478667368975507e-05, "loss": 1.4012, "step": 3964 }, { "epoch": 0.10442454569396892, "grad_norm": 2.5421175956726074, "learning_rate": 4.478535686067948e-05, "loss": 1.165, "step": 3965 }, { "epoch": 0.10445088227548065, "grad_norm": 5.195154190063477, "learning_rate": 4.47840400316039e-05, "loss": 0.9609, "step": 3966 }, { "epoch": 0.10447721885699236, "grad_norm": 1.7909053564071655, "learning_rate": 4.478272320252831e-05, 
"loss": 2.2669, "step": 3967 }, { "epoch": 0.10450355543850408, "grad_norm": 2.0777971744537354, "learning_rate": 4.478140637345273e-05, "loss": 1.4504, "step": 3968 }, { "epoch": 0.10452989202001581, "grad_norm": 1.5917402505874634, "learning_rate": 4.478008954437714e-05, "loss": 1.138, "step": 3969 }, { "epoch": 0.10455622860152752, "grad_norm": 6.32620906829834, "learning_rate": 4.477877271530155e-05, "loss": 1.4478, "step": 3970 }, { "epoch": 0.10458256518303924, "grad_norm": 2.0029797554016113, "learning_rate": 4.4777455886225975e-05, "loss": 2.1641, "step": 3971 }, { "epoch": 0.10460890176455095, "grad_norm": 2.4755024909973145, "learning_rate": 4.4776139057150383e-05, "loss": 0.8509, "step": 3972 }, { "epoch": 0.10463523834606268, "grad_norm": 2.942523956298828, "learning_rate": 4.47748222280748e-05, "loss": 1.3487, "step": 3973 }, { "epoch": 0.1046615749275744, "grad_norm": 3.0445621013641357, "learning_rate": 4.477350539899921e-05, "loss": 1.497, "step": 3974 }, { "epoch": 0.10468791150908612, "grad_norm": 4.136894226074219, "learning_rate": 4.477218856992363e-05, "loss": 0.9682, "step": 3975 }, { "epoch": 0.10471424809059784, "grad_norm": 1.9515690803527832, "learning_rate": 4.477087174084804e-05, "loss": 1.8275, "step": 3976 }, { "epoch": 0.10474058467210956, "grad_norm": 2.0134923458099365, "learning_rate": 4.4769554911772455e-05, "loss": 1.7219, "step": 3977 }, { "epoch": 0.10476692125362128, "grad_norm": 2.482132911682129, "learning_rate": 4.476823808269687e-05, "loss": 2.0579, "step": 3978 }, { "epoch": 0.104793257835133, "grad_norm": 4.157577037811279, "learning_rate": 4.476692125362128e-05, "loss": 2.6826, "step": 3979 }, { "epoch": 0.10481959441664472, "grad_norm": 1.9987140893936157, "learning_rate": 4.47656044245457e-05, "loss": 2.4325, "step": 3980 }, { "epoch": 0.10484593099815644, "grad_norm": 5.881584167480469, "learning_rate": 4.476428759547011e-05, "loss": 1.5879, "step": 3981 }, { "epoch": 0.10487226757966817, "grad_norm": 
1.9028657674789429, "learning_rate": 4.4762970766394526e-05, "loss": 0.3889, "step": 3982 }, { "epoch": 0.10489860416117988, "grad_norm": 2.433701276779175, "learning_rate": 4.4761653937318935e-05, "loss": 2.031, "step": 3983 }, { "epoch": 0.1049249407426916, "grad_norm": 2.0801424980163574, "learning_rate": 4.476033710824335e-05, "loss": 1.8975, "step": 3984 }, { "epoch": 0.10495127732420331, "grad_norm": 2.2493185997009277, "learning_rate": 4.4759020279167766e-05, "loss": 1.2808, "step": 3985 }, { "epoch": 0.10497761390571504, "grad_norm": 1.7209901809692383, "learning_rate": 4.475770345009218e-05, "loss": 2.4493, "step": 3986 }, { "epoch": 0.10500395048722676, "grad_norm": 4.634757041931152, "learning_rate": 4.47563866210166e-05, "loss": 1.1222, "step": 3987 }, { "epoch": 0.10503028706873847, "grad_norm": 2.6888427734375, "learning_rate": 4.4755069791941006e-05, "loss": 0.4693, "step": 3988 }, { "epoch": 0.1050566236502502, "grad_norm": 3.5422425270080566, "learning_rate": 4.475375296286542e-05, "loss": 1.7103, "step": 3989 }, { "epoch": 0.10508296023176192, "grad_norm": 2.344755172729492, "learning_rate": 4.475243613378984e-05, "loss": 0.7759, "step": 3990 }, { "epoch": 0.10510929681327363, "grad_norm": 2.7901930809020996, "learning_rate": 4.475111930471425e-05, "loss": 2.1823, "step": 3991 }, { "epoch": 0.10513563339478536, "grad_norm": 4.578499794006348, "learning_rate": 4.474980247563866e-05, "loss": 1.5064, "step": 3992 }, { "epoch": 0.10516196997629708, "grad_norm": 2.010589838027954, "learning_rate": 4.474848564656308e-05, "loss": 1.8707, "step": 3993 }, { "epoch": 0.1051883065578088, "grad_norm": 3.7168216705322266, "learning_rate": 4.474716881748749e-05, "loss": 1.6647, "step": 3994 }, { "epoch": 0.10521464313932051, "grad_norm": 2.7797844409942627, "learning_rate": 4.474585198841191e-05, "loss": 2.1472, "step": 3995 }, { "epoch": 0.10524097972083224, "grad_norm": 2.030571222305298, "learning_rate": 4.474453515933632e-05, "loss": 2.2849, "step": 3996 }, 
{ "epoch": 0.10526731630234396, "grad_norm": 1.6757009029388428, "learning_rate": 4.474321833026073e-05, "loss": 1.708, "step": 3997 }, { "epoch": 0.10529365288385567, "grad_norm": 2.0670464038848877, "learning_rate": 4.474190150118515e-05, "loss": 1.8766, "step": 3998 }, { "epoch": 0.1053199894653674, "grad_norm": 3.673553466796875, "learning_rate": 4.4740584672109564e-05, "loss": 1.6483, "step": 3999 }, { "epoch": 0.10534632604687912, "grad_norm": 1.9316781759262085, "learning_rate": 4.473926784303398e-05, "loss": 1.5283, "step": 4000 }, { "epoch": 0.10537266262839083, "grad_norm": 3.0214569568634033, "learning_rate": 4.473795101395839e-05, "loss": 0.876, "step": 4001 }, { "epoch": 0.10539899920990256, "grad_norm": 2.2037527561187744, "learning_rate": 4.4736634184882804e-05, "loss": 1.4227, "step": 4002 }, { "epoch": 0.10542533579141428, "grad_norm": 2.4519338607788086, "learning_rate": 4.473531735580721e-05, "loss": 1.7417, "step": 4003 }, { "epoch": 0.10545167237292599, "grad_norm": 4.407434940338135, "learning_rate": 4.4734000526731635e-05, "loss": 1.3909, "step": 4004 }, { "epoch": 0.10547800895443771, "grad_norm": 2.551903009414673, "learning_rate": 4.4732683697656044e-05, "loss": 1.966, "step": 4005 }, { "epoch": 0.10550434553594944, "grad_norm": 2.2575459480285645, "learning_rate": 4.473136686858046e-05, "loss": 1.7654, "step": 4006 }, { "epoch": 0.10553068211746115, "grad_norm": 2.2231922149658203, "learning_rate": 4.4730050039504875e-05, "loss": 1.8128, "step": 4007 }, { "epoch": 0.10555701869897287, "grad_norm": 2.688328981399536, "learning_rate": 4.472873321042929e-05, "loss": 1.3064, "step": 4008 }, { "epoch": 0.1055833552804846, "grad_norm": 2.364464282989502, "learning_rate": 4.4727416381353706e-05, "loss": 1.5036, "step": 4009 }, { "epoch": 0.10560969186199631, "grad_norm": 3.155242681503296, "learning_rate": 4.4726099552278115e-05, "loss": 1.2735, "step": 4010 }, { "epoch": 0.10563602844350803, "grad_norm": 2.4731411933898926, "learning_rate": 
4.472478272320253e-05, "loss": 2.0021, "step": 4011 }, { "epoch": 0.10566236502501976, "grad_norm": 3.477363348007202, "learning_rate": 4.472346589412694e-05, "loss": 0.7553, "step": 4012 }, { "epoch": 0.10568870160653147, "grad_norm": 2.0620481967926025, "learning_rate": 4.472214906505136e-05, "loss": 1.0688, "step": 4013 }, { "epoch": 0.10571503818804319, "grad_norm": 2.5248160362243652, "learning_rate": 4.472083223597577e-05, "loss": 1.4607, "step": 4014 }, { "epoch": 0.1057413747695549, "grad_norm": 1.8622386455535889, "learning_rate": 4.4719515406900186e-05, "loss": 1.5775, "step": 4015 }, { "epoch": 0.10576771135106663, "grad_norm": 1.6190394163131714, "learning_rate": 4.47181985778246e-05, "loss": 1.2959, "step": 4016 }, { "epoch": 0.10579404793257835, "grad_norm": 4.105989933013916, "learning_rate": 4.471688174874901e-05, "loss": 1.3315, "step": 4017 }, { "epoch": 0.10582038451409007, "grad_norm": 2.533867359161377, "learning_rate": 4.471556491967343e-05, "loss": 1.0959, "step": 4018 }, { "epoch": 0.1058467210956018, "grad_norm": 3.2738797664642334, "learning_rate": 4.471424809059784e-05, "loss": 1.7608, "step": 4019 }, { "epoch": 0.10587305767711351, "grad_norm": 3.3484888076782227, "learning_rate": 4.471293126152226e-05, "loss": 1.952, "step": 4020 }, { "epoch": 0.10589939425862523, "grad_norm": 2.184448480606079, "learning_rate": 4.4711614432446666e-05, "loss": 1.7956, "step": 4021 }, { "epoch": 0.10592573084013696, "grad_norm": 2.1217143535614014, "learning_rate": 4.471029760337108e-05, "loss": 1.5918, "step": 4022 }, { "epoch": 0.10595206742164867, "grad_norm": 1.9207357168197632, "learning_rate": 4.47089807742955e-05, "loss": 1.9232, "step": 4023 }, { "epoch": 0.10597840400316039, "grad_norm": 2.3726885318756104, "learning_rate": 4.470766394521991e-05, "loss": 1.6053, "step": 4024 }, { "epoch": 0.10600474058467212, "grad_norm": 2.4885849952697754, "learning_rate": 4.470634711614433e-05, "loss": 2.5578, "step": 4025 }, { "epoch": 0.10603107716618383, 
"grad_norm": 2.848715305328369, "learning_rate": 4.470503028706874e-05, "loss": 1.7455, "step": 4026 }, { "epoch": 0.10605741374769555, "grad_norm": 3.8315703868865967, "learning_rate": 4.470371345799316e-05, "loss": 1.6488, "step": 4027 }, { "epoch": 0.10608375032920726, "grad_norm": 1.9673184156417847, "learning_rate": 4.470239662891757e-05, "loss": 0.8827, "step": 4028 }, { "epoch": 0.10611008691071899, "grad_norm": 2.9483304023742676, "learning_rate": 4.4701079799841984e-05, "loss": 1.31, "step": 4029 }, { "epoch": 0.10613642349223071, "grad_norm": 1.9678658246994019, "learning_rate": 4.469976297076639e-05, "loss": 0.3727, "step": 4030 }, { "epoch": 0.10616276007374242, "grad_norm": 1.7625017166137695, "learning_rate": 4.469844614169081e-05, "loss": 1.9109, "step": 4031 }, { "epoch": 0.10618909665525415, "grad_norm": 3.294492483139038, "learning_rate": 4.4697129312615224e-05, "loss": 2.2724, "step": 4032 }, { "epoch": 0.10621543323676587, "grad_norm": 1.878441572189331, "learning_rate": 4.469581248353964e-05, "loss": 2.2334, "step": 4033 }, { "epoch": 0.10624176981827758, "grad_norm": 3.995889663696289, "learning_rate": 4.4694495654464056e-05, "loss": 1.3493, "step": 4034 }, { "epoch": 0.10626810639978931, "grad_norm": 3.1018190383911133, "learning_rate": 4.4693178825388464e-05, "loss": 1.5428, "step": 4035 }, { "epoch": 0.10629444298130103, "grad_norm": 3.6806719303131104, "learning_rate": 4.469186199631288e-05, "loss": 1.6416, "step": 4036 }, { "epoch": 0.10632077956281275, "grad_norm": 2.073110580444336, "learning_rate": 4.4690545167237296e-05, "loss": 1.5843, "step": 4037 }, { "epoch": 0.10634711614432446, "grad_norm": 3.1895880699157715, "learning_rate": 4.468922833816171e-05, "loss": 2.2509, "step": 4038 }, { "epoch": 0.10637345272583619, "grad_norm": 1.8896957635879517, "learning_rate": 4.468791150908612e-05, "loss": 0.6443, "step": 4039 }, { "epoch": 0.1063997893073479, "grad_norm": 2.06345534324646, "learning_rate": 4.4686594680010536e-05, "loss": 
1.6476, "step": 4040 }, { "epoch": 0.10642612588885962, "grad_norm": 2.168785333633423, "learning_rate": 4.468527785093495e-05, "loss": 2.1182, "step": 4041 }, { "epoch": 0.10645246247037135, "grad_norm": 3.1613264083862305, "learning_rate": 4.468396102185937e-05, "loss": 1.5156, "step": 4042 }, { "epoch": 0.10647879905188307, "grad_norm": 2.9733388423919678, "learning_rate": 4.468264419278378e-05, "loss": 0.9647, "step": 4043 }, { "epoch": 0.10650513563339478, "grad_norm": 3.550116539001465, "learning_rate": 4.468132736370819e-05, "loss": 1.8665, "step": 4044 }, { "epoch": 0.10653147221490651, "grad_norm": 4.879088878631592, "learning_rate": 4.468001053463261e-05, "loss": 1.4219, "step": 4045 }, { "epoch": 0.10655780879641823, "grad_norm": 2.0437891483306885, "learning_rate": 4.467869370555702e-05, "loss": 1.4834, "step": 4046 }, { "epoch": 0.10658414537792994, "grad_norm": 3.944870948791504, "learning_rate": 4.467737687648144e-05, "loss": 1.9115, "step": 4047 }, { "epoch": 0.10661048195944166, "grad_norm": 1.9715152978897095, "learning_rate": 4.467606004740585e-05, "loss": 1.826, "step": 4048 }, { "epoch": 0.10663681854095339, "grad_norm": 2.3294150829315186, "learning_rate": 4.467474321833026e-05, "loss": 2.1391, "step": 4049 }, { "epoch": 0.1066631551224651, "grad_norm": 2.313127279281616, "learning_rate": 4.467342638925467e-05, "loss": 0.3878, "step": 4050 }, { "epoch": 0.10668949170397682, "grad_norm": 5.281714916229248, "learning_rate": 4.4672109560179094e-05, "loss": 1.6318, "step": 4051 }, { "epoch": 0.10671582828548855, "grad_norm": 4.25741720199585, "learning_rate": 4.46707927311035e-05, "loss": 1.8003, "step": 4052 }, { "epoch": 0.10674216486700026, "grad_norm": 1.9785746335983276, "learning_rate": 4.466947590202792e-05, "loss": 1.6576, "step": 4053 }, { "epoch": 0.10676850144851198, "grad_norm": 3.613093614578247, "learning_rate": 4.4668159072952334e-05, "loss": 1.4468, "step": 4054 }, { "epoch": 0.10679483803002371, "grad_norm": 2.0653152465820312, 
"learning_rate": 4.466684224387674e-05, "loss": 1.9659, "step": 4055 }, { "epoch": 0.10682117461153542, "grad_norm": 1.8395441770553589, "learning_rate": 4.4665525414801165e-05, "loss": 2.1076, "step": 4056 }, { "epoch": 0.10684751119304714, "grad_norm": 1.7062104940414429, "learning_rate": 4.4664208585725574e-05, "loss": 1.736, "step": 4057 }, { "epoch": 0.10687384777455886, "grad_norm": 2.5451226234436035, "learning_rate": 4.466289175664999e-05, "loss": 1.6051, "step": 4058 }, { "epoch": 0.10690018435607059, "grad_norm": 2.8845815658569336, "learning_rate": 4.46615749275744e-05, "loss": 1.7561, "step": 4059 }, { "epoch": 0.1069265209375823, "grad_norm": 4.4622063636779785, "learning_rate": 4.466025809849882e-05, "loss": 1.5482, "step": 4060 }, { "epoch": 0.10695285751909402, "grad_norm": 4.345691680908203, "learning_rate": 4.465894126942323e-05, "loss": 2.1165, "step": 4061 }, { "epoch": 0.10697919410060575, "grad_norm": 3.109825849533081, "learning_rate": 4.4657624440347645e-05, "loss": 1.601, "step": 4062 }, { "epoch": 0.10700553068211746, "grad_norm": 2.058493137359619, "learning_rate": 4.465630761127206e-05, "loss": 2.0234, "step": 4063 }, { "epoch": 0.10703186726362918, "grad_norm": 3.9762372970581055, "learning_rate": 4.465499078219647e-05, "loss": 1.4851, "step": 4064 }, { "epoch": 0.1070582038451409, "grad_norm": 3.4705705642700195, "learning_rate": 4.465367395312089e-05, "loss": 2.2298, "step": 4065 }, { "epoch": 0.10708454042665262, "grad_norm": 2.324753761291504, "learning_rate": 4.46523571240453e-05, "loss": 2.2071, "step": 4066 }, { "epoch": 0.10711087700816434, "grad_norm": 2.5844130516052246, "learning_rate": 4.4651040294969716e-05, "loss": 1.4767, "step": 4067 }, { "epoch": 0.10713721358967607, "grad_norm": 1.953588604927063, "learning_rate": 4.4649723465894125e-05, "loss": 1.6706, "step": 4068 }, { "epoch": 0.10716355017118778, "grad_norm": 2.027574062347412, "learning_rate": 4.464840663681854e-05, "loss": 1.6953, "step": 4069 }, { "epoch": 
0.1071898867526995, "grad_norm": 4.431998252868652, "learning_rate": 4.4647089807742956e-05, "loss": 1.5115, "step": 4070 }, { "epoch": 0.10721622333421121, "grad_norm": 3.2604241371154785, "learning_rate": 4.464577297866737e-05, "loss": 1.9019, "step": 4071 }, { "epoch": 0.10724255991572294, "grad_norm": 2.3794023990631104, "learning_rate": 4.464445614959179e-05, "loss": 1.8177, "step": 4072 }, { "epoch": 0.10726889649723466, "grad_norm": 2.0373446941375732, "learning_rate": 4.4643139320516196e-05, "loss": 1.8285, "step": 4073 }, { "epoch": 0.10729523307874637, "grad_norm": 2.27317476272583, "learning_rate": 4.464182249144062e-05, "loss": 1.8145, "step": 4074 }, { "epoch": 0.1073215696602581, "grad_norm": 2.8986411094665527, "learning_rate": 4.464050566236503e-05, "loss": 2.1543, "step": 4075 }, { "epoch": 0.10734790624176982, "grad_norm": 2.6787140369415283, "learning_rate": 4.463918883328944e-05, "loss": 2.4231, "step": 4076 }, { "epoch": 0.10737424282328153, "grad_norm": 1.8390060663223267, "learning_rate": 4.463787200421385e-05, "loss": 1.5888, "step": 4077 }, { "epoch": 0.10740057940479326, "grad_norm": 1.8164650201797485, "learning_rate": 4.463655517513827e-05, "loss": 2.0236, "step": 4078 }, { "epoch": 0.10742691598630498, "grad_norm": 2.007504940032959, "learning_rate": 4.463523834606268e-05, "loss": 1.6726, "step": 4079 }, { "epoch": 0.1074532525678167, "grad_norm": 1.733331322669983, "learning_rate": 4.46339215169871e-05, "loss": 1.7026, "step": 4080 }, { "epoch": 0.10747958914932841, "grad_norm": 4.508018970489502, "learning_rate": 4.4632604687911514e-05, "loss": 1.7218, "step": 4081 }, { "epoch": 0.10750592573084014, "grad_norm": 5.116292476654053, "learning_rate": 4.463128785883592e-05, "loss": 1.6436, "step": 4082 }, { "epoch": 0.10753226231235186, "grad_norm": 1.760777473449707, "learning_rate": 4.462997102976034e-05, "loss": 1.6767, "step": 4083 }, { "epoch": 0.10755859889386357, "grad_norm": 3.2570009231567383, "learning_rate": 
4.4628654200684754e-05, "loss": 1.9036, "step": 4084 }, { "epoch": 0.1075849354753753, "grad_norm": 1.937599539756775, "learning_rate": 4.462733737160917e-05, "loss": 1.5554, "step": 4085 }, { "epoch": 0.10761127205688702, "grad_norm": 1.9227243661880493, "learning_rate": 4.462602054253358e-05, "loss": 0.6297, "step": 4086 }, { "epoch": 0.10763760863839873, "grad_norm": 2.8662517070770264, "learning_rate": 4.4624703713457994e-05, "loss": 1.8503, "step": 4087 }, { "epoch": 0.10766394521991046, "grad_norm": 1.8032362461090088, "learning_rate": 4.462338688438241e-05, "loss": 1.04, "step": 4088 }, { "epoch": 0.10769028180142218, "grad_norm": 4.272941589355469, "learning_rate": 4.4622070055306825e-05, "loss": 1.5103, "step": 4089 }, { "epoch": 0.10771661838293389, "grad_norm": 2.3737215995788574, "learning_rate": 4.462075322623124e-05, "loss": 0.9787, "step": 4090 }, { "epoch": 0.10774295496444561, "grad_norm": 2.4204230308532715, "learning_rate": 4.461943639715565e-05, "loss": 1.4417, "step": 4091 }, { "epoch": 0.10776929154595734, "grad_norm": 1.9358290433883667, "learning_rate": 4.4618119568080065e-05, "loss": 2.0035, "step": 4092 }, { "epoch": 0.10779562812746905, "grad_norm": 5.602682590484619, "learning_rate": 4.461680273900448e-05, "loss": 2.1958, "step": 4093 }, { "epoch": 0.10782196470898077, "grad_norm": 3.716036558151245, "learning_rate": 4.4615485909928897e-05, "loss": 1.3942, "step": 4094 }, { "epoch": 0.1078483012904925, "grad_norm": 3.2236311435699463, "learning_rate": 4.4614169080853305e-05, "loss": 2.1268, "step": 4095 }, { "epoch": 0.10787463787200421, "grad_norm": 3.3057029247283936, "learning_rate": 4.461285225177772e-05, "loss": 2.2917, "step": 4096 }, { "epoch": 0.10790097445351593, "grad_norm": 3.5505335330963135, "learning_rate": 4.461153542270213e-05, "loss": 0.6697, "step": 4097 }, { "epoch": 0.10792731103502766, "grad_norm": 2.0468151569366455, "learning_rate": 4.461021859362655e-05, "loss": 2.5624, "step": 4098 }, { "epoch": 
0.10795364761653937, "grad_norm": 1.9204453229904175, "learning_rate": 4.460890176455096e-05, "loss": 1.5151, "step": 4099 }, { "epoch": 0.10797998419805109, "grad_norm": 1.622131586074829, "learning_rate": 4.460758493547538e-05, "loss": 1.4166, "step": 4100 }, { "epoch": 0.1080063207795628, "grad_norm": 2.30871319770813, "learning_rate": 4.460626810639979e-05, "loss": 1.3913, "step": 4101 }, { "epoch": 0.10803265736107454, "grad_norm": 3.7625954151153564, "learning_rate": 4.46049512773242e-05, "loss": 1.3604, "step": 4102 }, { "epoch": 0.10805899394258625, "grad_norm": 11.044343948364258, "learning_rate": 4.4603634448248623e-05, "loss": 1.9603, "step": 4103 }, { "epoch": 0.10808533052409797, "grad_norm": 2.669440746307373, "learning_rate": 4.460231761917303e-05, "loss": 2.1086, "step": 4104 }, { "epoch": 0.1081116671056097, "grad_norm": 4.807341575622559, "learning_rate": 4.460100079009745e-05, "loss": 1.349, "step": 4105 }, { "epoch": 0.10813800368712141, "grad_norm": 2.4318532943725586, "learning_rate": 4.459968396102186e-05, "loss": 2.1921, "step": 4106 }, { "epoch": 0.10816434026863313, "grad_norm": 3.815917491912842, "learning_rate": 4.459836713194628e-05, "loss": 0.2555, "step": 4107 }, { "epoch": 0.10819067685014486, "grad_norm": 1.9405546188354492, "learning_rate": 4.459705030287069e-05, "loss": 2.0096, "step": 4108 }, { "epoch": 0.10821701343165657, "grad_norm": 2.8776192665100098, "learning_rate": 4.4595733473795103e-05, "loss": 1.9219, "step": 4109 }, { "epoch": 0.10824335001316829, "grad_norm": 3.1089329719543457, "learning_rate": 4.459441664471952e-05, "loss": 0.4953, "step": 4110 }, { "epoch": 0.10826968659468002, "grad_norm": 2.188464403152466, "learning_rate": 4.459309981564393e-05, "loss": 1.3146, "step": 4111 }, { "epoch": 0.10829602317619173, "grad_norm": 5.935386657714844, "learning_rate": 4.459178298656835e-05, "loss": 1.643, "step": 4112 }, { "epoch": 0.10832235975770345, "grad_norm": 3.1096858978271484, "learning_rate": 
4.459046615749276e-05, "loss": 1.1696, "step": 4113 }, { "epoch": 0.10834869633921516, "grad_norm": 1.9883707761764526, "learning_rate": 4.4589149328417175e-05, "loss": 1.9735, "step": 4114 }, { "epoch": 0.1083750329207269, "grad_norm": 2.773411512374878, "learning_rate": 4.4587832499341583e-05, "loss": 2.0659, "step": 4115 }, { "epoch": 0.10840136950223861, "grad_norm": 2.15742826461792, "learning_rate": 4.4586515670266e-05, "loss": 1.3497, "step": 4116 }, { "epoch": 0.10842770608375032, "grad_norm": 1.8726258277893066, "learning_rate": 4.4585198841190415e-05, "loss": 1.1951, "step": 4117 }, { "epoch": 0.10845404266526205, "grad_norm": 2.60878324508667, "learning_rate": 4.458388201211483e-05, "loss": 0.5625, "step": 4118 }, { "epoch": 0.10848037924677377, "grad_norm": 3.1422879695892334, "learning_rate": 4.4582565183039246e-05, "loss": 1.6653, "step": 4119 }, { "epoch": 0.10850671582828549, "grad_norm": 2.1638922691345215, "learning_rate": 4.4581248353963655e-05, "loss": 1.8287, "step": 4120 }, { "epoch": 0.10853305240979721, "grad_norm": 1.718577265739441, "learning_rate": 4.457993152488807e-05, "loss": 1.8536, "step": 4121 }, { "epoch": 0.10855938899130893, "grad_norm": 1.9097579717636108, "learning_rate": 4.4578614695812486e-05, "loss": 1.5093, "step": 4122 }, { "epoch": 0.10858572557282065, "grad_norm": 2.0030362606048584, "learning_rate": 4.45772978667369e-05, "loss": 2.147, "step": 4123 }, { "epoch": 0.10861206215433236, "grad_norm": 3.1014513969421387, "learning_rate": 4.457598103766131e-05, "loss": 1.6916, "step": 4124 }, { "epoch": 0.10863839873584409, "grad_norm": 2.2505898475646973, "learning_rate": 4.4574664208585726e-05, "loss": 2.3022, "step": 4125 }, { "epoch": 0.1086647353173558, "grad_norm": 1.8901978731155396, "learning_rate": 4.457334737951014e-05, "loss": 1.8867, "step": 4126 }, { "epoch": 0.10869107189886752, "grad_norm": 3.16646409034729, "learning_rate": 4.457203055043456e-05, "loss": 1.7358, "step": 4127 }, { "epoch": 0.10871740848037925, 
"grad_norm": 3.0400588512420654, "learning_rate": 4.457071372135897e-05, "loss": 1.9286, "step": 4128 }, { "epoch": 0.10874374506189097, "grad_norm": 3.8501882553100586, "learning_rate": 4.456939689228338e-05, "loss": 1.295, "step": 4129 }, { "epoch": 0.10877008164340268, "grad_norm": 4.805459022521973, "learning_rate": 4.45680800632078e-05, "loss": 2.2903, "step": 4130 }, { "epoch": 0.10879641822491441, "grad_norm": 2.4454216957092285, "learning_rate": 4.456676323413221e-05, "loss": 2.1617, "step": 4131 }, { "epoch": 0.10882275480642613, "grad_norm": 2.029470920562744, "learning_rate": 4.456544640505663e-05, "loss": 2.1926, "step": 4132 }, { "epoch": 0.10884909138793784, "grad_norm": 2.4619250297546387, "learning_rate": 4.456412957598104e-05, "loss": 2.8683, "step": 4133 }, { "epoch": 0.10887542796944956, "grad_norm": 7.48714542388916, "learning_rate": 4.456281274690545e-05, "loss": 1.8232, "step": 4134 }, { "epoch": 0.10890176455096129, "grad_norm": 1.9956969022750854, "learning_rate": 4.456149591782987e-05, "loss": 1.5241, "step": 4135 }, { "epoch": 0.108928101132473, "grad_norm": 2.5294861793518066, "learning_rate": 4.4560179088754284e-05, "loss": 1.8114, "step": 4136 }, { "epoch": 0.10895443771398472, "grad_norm": 3.4034552574157715, "learning_rate": 4.45588622596787e-05, "loss": 0.2944, "step": 4137 }, { "epoch": 0.10898077429549645, "grad_norm": 2.0800437927246094, "learning_rate": 4.455754543060311e-05, "loss": 1.628, "step": 4138 }, { "epoch": 0.10900711087700816, "grad_norm": 3.241866111755371, "learning_rate": 4.4556228601527524e-05, "loss": 1.865, "step": 4139 }, { "epoch": 0.10903344745851988, "grad_norm": 1.9164265394210815, "learning_rate": 4.455491177245194e-05, "loss": 1.8292, "step": 4140 }, { "epoch": 0.10905978404003161, "grad_norm": 2.2351768016815186, "learning_rate": 4.4553594943376355e-05, "loss": 0.3051, "step": 4141 }, { "epoch": 0.10908612062154333, "grad_norm": 2.382841110229492, "learning_rate": 4.4552278114300764e-05, "loss": 1.2609, 
"step": 4142 }, { "epoch": 0.10911245720305504, "grad_norm": 2.1350529193878174, "learning_rate": 4.455096128522518e-05, "loss": 0.786, "step": 4143 }, { "epoch": 0.10913879378456676, "grad_norm": 2.8577592372894287, "learning_rate": 4.4549644456149595e-05, "loss": 1.8832, "step": 4144 }, { "epoch": 0.10916513036607849, "grad_norm": 4.506251335144043, "learning_rate": 4.454832762707401e-05, "loss": 1.184, "step": 4145 }, { "epoch": 0.1091914669475902, "grad_norm": 4.11175537109375, "learning_rate": 4.4547010797998426e-05, "loss": 0.3751, "step": 4146 }, { "epoch": 0.10921780352910192, "grad_norm": 1.7342851161956787, "learning_rate": 4.4545693968922835e-05, "loss": 0.5253, "step": 4147 }, { "epoch": 0.10924414011061365, "grad_norm": 4.022400379180908, "learning_rate": 4.454437713984725e-05, "loss": 1.8998, "step": 4148 }, { "epoch": 0.10927047669212536, "grad_norm": 2.1567111015319824, "learning_rate": 4.454306031077166e-05, "loss": 1.7549, "step": 4149 }, { "epoch": 0.10929681327363708, "grad_norm": 3.7303826808929443, "learning_rate": 4.454174348169608e-05, "loss": 1.4587, "step": 4150 }, { "epoch": 0.10932314985514881, "grad_norm": 2.7888081073760986, "learning_rate": 4.454042665262049e-05, "loss": 1.4682, "step": 4151 }, { "epoch": 0.10934948643666052, "grad_norm": 2.8986802101135254, "learning_rate": 4.4539109823544906e-05, "loss": 0.6415, "step": 4152 }, { "epoch": 0.10937582301817224, "grad_norm": 2.200427770614624, "learning_rate": 4.4537792994469315e-05, "loss": 1.8819, "step": 4153 }, { "epoch": 0.10940215959968395, "grad_norm": 3.080820322036743, "learning_rate": 4.453647616539374e-05, "loss": 1.7558, "step": 4154 }, { "epoch": 0.10942849618119568, "grad_norm": 2.1669230461120605, "learning_rate": 4.4535159336318146e-05, "loss": 2.0016, "step": 4155 }, { "epoch": 0.1094548327627074, "grad_norm": 2.913347005844116, "learning_rate": 4.453384250724256e-05, "loss": 1.5105, "step": 4156 }, { "epoch": 0.10948116934421911, "grad_norm": 3.9509127140045166, 
"learning_rate": 4.453252567816698e-05, "loss": 2.5394, "step": 4157 }, { "epoch": 0.10950750592573084, "grad_norm": 2.0484113693237305, "learning_rate": 4.4531208849091386e-05, "loss": 1.6148, "step": 4158 }, { "epoch": 0.10953384250724256, "grad_norm": 3.5272421836853027, "learning_rate": 4.452989202001581e-05, "loss": 1.8762, "step": 4159 }, { "epoch": 0.10956017908875428, "grad_norm": 2.2493324279785156, "learning_rate": 4.452857519094022e-05, "loss": 2.4996, "step": 4160 }, { "epoch": 0.109586515670266, "grad_norm": 2.2131247520446777, "learning_rate": 4.452725836186463e-05, "loss": 2.3426, "step": 4161 }, { "epoch": 0.10961285225177772, "grad_norm": 1.9247936010360718, "learning_rate": 4.452594153278904e-05, "loss": 2.1403, "step": 4162 }, { "epoch": 0.10963918883328944, "grad_norm": 2.1571877002716064, "learning_rate": 4.452462470371346e-05, "loss": 1.6365, "step": 4163 }, { "epoch": 0.10966552541480117, "grad_norm": 1.528718113899231, "learning_rate": 4.452330787463787e-05, "loss": 0.2761, "step": 4164 }, { "epoch": 0.10969186199631288, "grad_norm": 1.7336452007293701, "learning_rate": 4.452199104556229e-05, "loss": 2.2681, "step": 4165 }, { "epoch": 0.1097181985778246, "grad_norm": 1.8995816707611084, "learning_rate": 4.4520674216486704e-05, "loss": 1.7106, "step": 4166 }, { "epoch": 0.10974453515933631, "grad_norm": 5.518387794494629, "learning_rate": 4.451935738741111e-05, "loss": 1.471, "step": 4167 }, { "epoch": 0.10977087174084804, "grad_norm": 4.71088981628418, "learning_rate": 4.451804055833553e-05, "loss": 1.0357, "step": 4168 }, { "epoch": 0.10979720832235976, "grad_norm": 2.0973432064056396, "learning_rate": 4.4516723729259944e-05, "loss": 2.1014, "step": 4169 }, { "epoch": 0.10982354490387147, "grad_norm": 2.453827142715454, "learning_rate": 4.451540690018436e-05, "loss": 1.4535, "step": 4170 }, { "epoch": 0.1098498814853832, "grad_norm": 1.9312286376953125, "learning_rate": 4.451409007110877e-05, "loss": 1.7561, "step": 4171 }, { "epoch": 
0.10987621806689492, "grad_norm": 2.0955183506011963, "learning_rate": 4.4512773242033184e-05, "loss": 1.7761, "step": 4172 }, { "epoch": 0.10990255464840663, "grad_norm": 2.345484972000122, "learning_rate": 4.45114564129576e-05, "loss": 0.5964, "step": 4173 }, { "epoch": 0.10992889122991836, "grad_norm": 2.8912203311920166, "learning_rate": 4.4510139583882016e-05, "loss": 1.9131, "step": 4174 }, { "epoch": 0.10995522781143008, "grad_norm": 2.093498945236206, "learning_rate": 4.450882275480643e-05, "loss": 1.7598, "step": 4175 }, { "epoch": 0.1099815643929418, "grad_norm": 2.3428702354431152, "learning_rate": 4.450750592573084e-05, "loss": 1.8295, "step": 4176 }, { "epoch": 0.11000790097445351, "grad_norm": 2.0588157176971436, "learning_rate": 4.4506189096655256e-05, "loss": 1.612, "step": 4177 }, { "epoch": 0.11003423755596524, "grad_norm": 2.188891887664795, "learning_rate": 4.450487226757967e-05, "loss": 2.5297, "step": 4178 }, { "epoch": 0.11006057413747695, "grad_norm": 4.410722255706787, "learning_rate": 4.450355543850409e-05, "loss": 1.0905, "step": 4179 }, { "epoch": 0.11008691071898867, "grad_norm": 5.933962345123291, "learning_rate": 4.4502238609428496e-05, "loss": 1.5406, "step": 4180 }, { "epoch": 0.1101132473005004, "grad_norm": 2.8442652225494385, "learning_rate": 4.450092178035291e-05, "loss": 1.8647, "step": 4181 }, { "epoch": 0.11013958388201212, "grad_norm": 2.6928162574768066, "learning_rate": 4.449960495127733e-05, "loss": 1.7226, "step": 4182 }, { "epoch": 0.11016592046352383, "grad_norm": 2.676680564880371, "learning_rate": 4.449828812220174e-05, "loss": 1.7621, "step": 4183 }, { "epoch": 0.11019225704503556, "grad_norm": 1.8384419679641724, "learning_rate": 4.449697129312616e-05, "loss": 1.8842, "step": 4184 }, { "epoch": 0.11021859362654728, "grad_norm": 2.114710807800293, "learning_rate": 4.449565446405057e-05, "loss": 1.2852, "step": 4185 }, { "epoch": 0.11024493020805899, "grad_norm": 3.0580577850341797, "learning_rate": 
4.449433763497498e-05, "loss": 1.1934, "step": 4186 }, { "epoch": 0.11027126678957071, "grad_norm": 1.8078515529632568, "learning_rate": 4.44930208058994e-05, "loss": 1.5872, "step": 4187 }, { "epoch": 0.11029760337108244, "grad_norm": 2.373305082321167, "learning_rate": 4.4491703976823814e-05, "loss": 1.7719, "step": 4188 }, { "epoch": 0.11032393995259415, "grad_norm": 2.870018482208252, "learning_rate": 4.449038714774822e-05, "loss": 2.0621, "step": 4189 }, { "epoch": 0.11035027653410587, "grad_norm": 2.607174873352051, "learning_rate": 4.448907031867264e-05, "loss": 1.8878, "step": 4190 }, { "epoch": 0.1103766131156176, "grad_norm": 2.303746223449707, "learning_rate": 4.4487753489597054e-05, "loss": 1.9227, "step": 4191 }, { "epoch": 0.11040294969712931, "grad_norm": 2.5911405086517334, "learning_rate": 4.448643666052147e-05, "loss": 1.0763, "step": 4192 }, { "epoch": 0.11042928627864103, "grad_norm": 3.95622181892395, "learning_rate": 4.4485119831445885e-05, "loss": 1.2832, "step": 4193 }, { "epoch": 0.11045562286015276, "grad_norm": 2.520411729812622, "learning_rate": 4.4483803002370294e-05, "loss": 2.4911, "step": 4194 }, { "epoch": 0.11048195944166447, "grad_norm": 3.7951996326446533, "learning_rate": 4.448248617329471e-05, "loss": 1.7614, "step": 4195 }, { "epoch": 0.11050829602317619, "grad_norm": 1.9339969158172607, "learning_rate": 4.448116934421912e-05, "loss": 1.4992, "step": 4196 }, { "epoch": 0.1105346326046879, "grad_norm": 4.729146957397461, "learning_rate": 4.447985251514354e-05, "loss": 1.4408, "step": 4197 }, { "epoch": 0.11056096918619963, "grad_norm": 2.343226671218872, "learning_rate": 4.447853568606795e-05, "loss": 1.6741, "step": 4198 }, { "epoch": 0.11058730576771135, "grad_norm": 1.902189016342163, "learning_rate": 4.4477218856992365e-05, "loss": 2.0228, "step": 4199 }, { "epoch": 0.11061364234922307, "grad_norm": 1.986051321029663, "learning_rate": 4.4475902027916774e-05, "loss": 2.0894, "step": 4200 }, { "epoch": 0.1106399789307348, 
"grad_norm": 2.330847978591919, "learning_rate": 4.447458519884119e-05, "loss": 0.2927, "step": 4201 }, { "epoch": 0.11066631551224651, "grad_norm": 2.673935651779175, "learning_rate": 4.4473268369765605e-05, "loss": 1.1512, "step": 4202 }, { "epoch": 0.11069265209375823, "grad_norm": 3.250035285949707, "learning_rate": 4.447195154069002e-05, "loss": 1.3046, "step": 4203 }, { "epoch": 0.11071898867526996, "grad_norm": 3.924764394760132, "learning_rate": 4.4470634711614436e-05, "loss": 2.0056, "step": 4204 }, { "epoch": 0.11074532525678167, "grad_norm": 5.089044570922852, "learning_rate": 4.4469317882538845e-05, "loss": 1.4291, "step": 4205 }, { "epoch": 0.11077166183829339, "grad_norm": 2.209367275238037, "learning_rate": 4.446800105346327e-05, "loss": 2.1428, "step": 4206 }, { "epoch": 0.11079799841980512, "grad_norm": 3.346619129180908, "learning_rate": 4.4466684224387676e-05, "loss": 0.4643, "step": 4207 }, { "epoch": 0.11082433500131683, "grad_norm": 4.271258354187012, "learning_rate": 4.446536739531209e-05, "loss": 1.0904, "step": 4208 }, { "epoch": 0.11085067158282855, "grad_norm": 3.2781922817230225, "learning_rate": 4.44640505662365e-05, "loss": 2.4328, "step": 4209 }, { "epoch": 0.11087700816434026, "grad_norm": 1.5749311447143555, "learning_rate": 4.4462733737160916e-05, "loss": 1.4835, "step": 4210 }, { "epoch": 0.11090334474585199, "grad_norm": 1.7231374979019165, "learning_rate": 4.446141690808533e-05, "loss": 1.6169, "step": 4211 }, { "epoch": 0.11092968132736371, "grad_norm": 2.5679404735565186, "learning_rate": 4.446010007900975e-05, "loss": 1.6938, "step": 4212 }, { "epoch": 0.11095601790887542, "grad_norm": 3.925067186355591, "learning_rate": 4.445878324993416e-05, "loss": 0.9371, "step": 4213 }, { "epoch": 0.11098235449038715, "grad_norm": 3.1073853969573975, "learning_rate": 4.445746642085857e-05, "loss": 2.0418, "step": 4214 }, { "epoch": 0.11100869107189887, "grad_norm": 2.345743179321289, "learning_rate": 4.445614959178299e-05, "loss": 3.329, 
"step": 4215 }, { "epoch": 0.11103502765341058, "grad_norm": 1.7030315399169922, "learning_rate": 4.44548327627074e-05, "loss": 2.2939, "step": 4216 }, { "epoch": 0.11106136423492231, "grad_norm": 2.1973705291748047, "learning_rate": 4.445351593363182e-05, "loss": 1.5548, "step": 4217 }, { "epoch": 0.11108770081643403, "grad_norm": 2.013047695159912, "learning_rate": 4.445219910455623e-05, "loss": 1.3957, "step": 4218 }, { "epoch": 0.11111403739794574, "grad_norm": 1.4847007989883423, "learning_rate": 4.445088227548064e-05, "loss": 1.7038, "step": 4219 }, { "epoch": 0.11114037397945746, "grad_norm": 1.9858585596084595, "learning_rate": 4.444956544640506e-05, "loss": 0.3813, "step": 4220 }, { "epoch": 0.11116671056096919, "grad_norm": 1.7559846639633179, "learning_rate": 4.4448248617329474e-05, "loss": 1.6474, "step": 4221 }, { "epoch": 0.1111930471424809, "grad_norm": 1.98350989818573, "learning_rate": 4.444693178825389e-05, "loss": 1.4676, "step": 4222 }, { "epoch": 0.11121938372399262, "grad_norm": 1.9280142784118652, "learning_rate": 4.44456149591783e-05, "loss": 2.0786, "step": 4223 }, { "epoch": 0.11124572030550435, "grad_norm": 3.110429525375366, "learning_rate": 4.4444298130102714e-05, "loss": 2.2039, "step": 4224 }, { "epoch": 0.11127205688701607, "grad_norm": 8.633528709411621, "learning_rate": 4.444298130102713e-05, "loss": 1.6533, "step": 4225 }, { "epoch": 0.11129839346852778, "grad_norm": 5.191372871398926, "learning_rate": 4.4441664471951545e-05, "loss": 0.8776, "step": 4226 }, { "epoch": 0.11132473005003951, "grad_norm": 2.7773046493530273, "learning_rate": 4.4440347642875954e-05, "loss": 2.2684, "step": 4227 }, { "epoch": 0.11135106663155123, "grad_norm": 1.843032956123352, "learning_rate": 4.443903081380037e-05, "loss": 1.562, "step": 4228 }, { "epoch": 0.11137740321306294, "grad_norm": 3.5764057636260986, "learning_rate": 4.4437713984724785e-05, "loss": 1.987, "step": 4229 }, { "epoch": 0.11140373979457466, "grad_norm": 2.1517395973205566, 
"learning_rate": 4.44363971556492e-05, "loss": 1.8344, "step": 4230 }, { "epoch": 0.11143007637608639, "grad_norm": 2.2035233974456787, "learning_rate": 4.443508032657362e-05, "loss": 1.7476, "step": 4231 }, { "epoch": 0.1114564129575981, "grad_norm": 1.5685129165649414, "learning_rate": 4.4433763497498025e-05, "loss": 2.0277, "step": 4232 }, { "epoch": 0.11148274953910982, "grad_norm": 3.6586594581604004, "learning_rate": 4.443244666842244e-05, "loss": 2.1176, "step": 4233 }, { "epoch": 0.11150908612062155, "grad_norm": 4.339465618133545, "learning_rate": 4.443112983934685e-05, "loss": 1.5764, "step": 4234 }, { "epoch": 0.11153542270213326, "grad_norm": 5.024068355560303, "learning_rate": 4.442981301027127e-05, "loss": 1.3535, "step": 4235 }, { "epoch": 0.11156175928364498, "grad_norm": 2.071877956390381, "learning_rate": 4.442849618119568e-05, "loss": 2.2792, "step": 4236 }, { "epoch": 0.11158809586515671, "grad_norm": 2.2989859580993652, "learning_rate": 4.44271793521201e-05, "loss": 2.0244, "step": 4237 }, { "epoch": 0.11161443244666842, "grad_norm": 3.2281546592712402, "learning_rate": 4.442586252304451e-05, "loss": 1.5432, "step": 4238 }, { "epoch": 0.11164076902818014, "grad_norm": 2.5454723834991455, "learning_rate": 4.442454569396893e-05, "loss": 1.3705, "step": 4239 }, { "epoch": 0.11166710560969186, "grad_norm": 2.0404036045074463, "learning_rate": 4.4423228864893343e-05, "loss": 2.1095, "step": 4240 }, { "epoch": 0.11169344219120358, "grad_norm": 1.8799490928649902, "learning_rate": 4.442191203581775e-05, "loss": 2.1393, "step": 4241 }, { "epoch": 0.1117197787727153, "grad_norm": 3.2960891723632812, "learning_rate": 4.442059520674217e-05, "loss": 1.0129, "step": 4242 }, { "epoch": 0.11174611535422702, "grad_norm": 2.4646873474121094, "learning_rate": 4.441927837766658e-05, "loss": 1.8543, "step": 4243 }, { "epoch": 0.11177245193573875, "grad_norm": 4.208311557769775, "learning_rate": 4.4417961548591e-05, "loss": 1.468, "step": 4244 }, { "epoch": 
0.11179878851725046, "grad_norm": 2.6525988578796387, "learning_rate": 4.441664471951541e-05, "loss": 1.7819, "step": 4245 }, { "epoch": 0.11182512509876218, "grad_norm": 1.6626691818237305, "learning_rate": 4.4415327890439823e-05, "loss": 1.8913, "step": 4246 }, { "epoch": 0.1118514616802739, "grad_norm": 5.3551344871521, "learning_rate": 4.441401106136424e-05, "loss": 2.6188, "step": 4247 }, { "epoch": 0.11187779826178562, "grad_norm": 1.665309190750122, "learning_rate": 4.441269423228865e-05, "loss": 1.8647, "step": 4248 }, { "epoch": 0.11190413484329734, "grad_norm": 3.119785785675049, "learning_rate": 4.441137740321307e-05, "loss": 2.627, "step": 4249 }, { "epoch": 0.11193047142480907, "grad_norm": 2.7680482864379883, "learning_rate": 4.441006057413748e-05, "loss": 1.6754, "step": 4250 }, { "epoch": 0.11195680800632078, "grad_norm": 2.355776786804199, "learning_rate": 4.4408743745061895e-05, "loss": 1.3855, "step": 4251 }, { "epoch": 0.1119831445878325, "grad_norm": 2.3840339183807373, "learning_rate": 4.4407426915986304e-05, "loss": 1.5975, "step": 4252 }, { "epoch": 0.11200948116934421, "grad_norm": 1.9081884622573853, "learning_rate": 4.4406110086910726e-05, "loss": 2.3249, "step": 4253 }, { "epoch": 0.11203581775085594, "grad_norm": 2.7819390296936035, "learning_rate": 4.4404793257835135e-05, "loss": 0.9198, "step": 4254 }, { "epoch": 0.11206215433236766, "grad_norm": 2.959852695465088, "learning_rate": 4.440347642875955e-05, "loss": 2.663, "step": 4255 }, { "epoch": 0.11208849091387937, "grad_norm": 1.9725509881973267, "learning_rate": 4.440215959968396e-05, "loss": 1.8166, "step": 4256 }, { "epoch": 0.1121148274953911, "grad_norm": 2.1061978340148926, "learning_rate": 4.4400842770608375e-05, "loss": 1.8506, "step": 4257 }, { "epoch": 0.11214116407690282, "grad_norm": 4.271763801574707, "learning_rate": 4.439952594153279e-05, "loss": 2.2017, "step": 4258 }, { "epoch": 0.11216750065841453, "grad_norm": 5.5284647941589355, "learning_rate": 
4.4398209112457206e-05, "loss": 1.2421, "step": 4259 }, { "epoch": 0.11219383723992626, "grad_norm": 6.562661170959473, "learning_rate": 4.439689228338162e-05, "loss": 1.4948, "step": 4260 }, { "epoch": 0.11222017382143798, "grad_norm": 2.111541748046875, "learning_rate": 4.439557545430603e-05, "loss": 2.1255, "step": 4261 }, { "epoch": 0.1122465104029497, "grad_norm": 1.9101742506027222, "learning_rate": 4.4394258625230446e-05, "loss": 2.2274, "step": 4262 }, { "epoch": 0.11227284698446141, "grad_norm": 3.4336953163146973, "learning_rate": 4.439294179615486e-05, "loss": 1.6914, "step": 4263 }, { "epoch": 0.11229918356597314, "grad_norm": 4.317891597747803, "learning_rate": 4.439162496707928e-05, "loss": 2.1988, "step": 4264 }, { "epoch": 0.11232552014748486, "grad_norm": 2.151965618133545, "learning_rate": 4.4390308138003686e-05, "loss": 1.9896, "step": 4265 }, { "epoch": 0.11235185672899657, "grad_norm": 2.7007954120635986, "learning_rate": 4.43889913089281e-05, "loss": 1.6312, "step": 4266 }, { "epoch": 0.1123781933105083, "grad_norm": 1.9390941858291626, "learning_rate": 4.438767447985252e-05, "loss": 2.132, "step": 4267 }, { "epoch": 0.11240452989202002, "grad_norm": 5.374405860900879, "learning_rate": 4.438635765077693e-05, "loss": 2.2693, "step": 4268 }, { "epoch": 0.11243086647353173, "grad_norm": 2.5718936920166016, "learning_rate": 4.438504082170135e-05, "loss": 0.6173, "step": 4269 }, { "epoch": 0.11245720305504346, "grad_norm": 1.7235485315322876, "learning_rate": 4.438372399262576e-05, "loss": 1.767, "step": 4270 }, { "epoch": 0.11248353963655518, "grad_norm": 2.4974799156188965, "learning_rate": 4.438240716355017e-05, "loss": 2.0177, "step": 4271 }, { "epoch": 0.11250987621806689, "grad_norm": 2.061169147491455, "learning_rate": 4.438109033447459e-05, "loss": 0.8659, "step": 4272 }, { "epoch": 0.11253621279957861, "grad_norm": 3.2096126079559326, "learning_rate": 4.4379773505399004e-05, "loss": 2.5423, "step": 4273 }, { "epoch": 0.11256254938109034, 
"grad_norm": 2.260287284851074, "learning_rate": 4.437845667632341e-05, "loss": 1.917, "step": 4274 }, { "epoch": 0.11258888596260205, "grad_norm": 2.7628087997436523, "learning_rate": 4.437713984724783e-05, "loss": 1.6597, "step": 4275 }, { "epoch": 0.11261522254411377, "grad_norm": 1.9369984865188599, "learning_rate": 4.4375823018172244e-05, "loss": 1.8688, "step": 4276 }, { "epoch": 0.1126415591256255, "grad_norm": 2.6044199466705322, "learning_rate": 4.437450618909666e-05, "loss": 1.4238, "step": 4277 }, { "epoch": 0.11266789570713721, "grad_norm": 2.716409206390381, "learning_rate": 4.4373189360021075e-05, "loss": 1.6664, "step": 4278 }, { "epoch": 0.11269423228864893, "grad_norm": 1.6078535318374634, "learning_rate": 4.4371872530945484e-05, "loss": 1.5937, "step": 4279 }, { "epoch": 0.11272056887016066, "grad_norm": 3.0117154121398926, "learning_rate": 4.43705557018699e-05, "loss": 2.0069, "step": 4280 }, { "epoch": 0.11274690545167237, "grad_norm": 5.708094120025635, "learning_rate": 4.436923887279431e-05, "loss": 1.7758, "step": 4281 }, { "epoch": 0.11277324203318409, "grad_norm": 3.4249651432037354, "learning_rate": 4.436792204371873e-05, "loss": 0.6774, "step": 4282 }, { "epoch": 0.1127995786146958, "grad_norm": 3.1791722774505615, "learning_rate": 4.436660521464314e-05, "loss": 1.8117, "step": 4283 }, { "epoch": 0.11282591519620754, "grad_norm": 1.8081732988357544, "learning_rate": 4.4365288385567555e-05, "loss": 2.0789, "step": 4284 }, { "epoch": 0.11285225177771925, "grad_norm": 2.9484305381774902, "learning_rate": 4.436397155649197e-05, "loss": 2.2982, "step": 4285 }, { "epoch": 0.11287858835923097, "grad_norm": 2.696687936782837, "learning_rate": 4.4362654727416386e-05, "loss": 3.3121, "step": 4286 }, { "epoch": 0.1129049249407427, "grad_norm": 2.456861734390259, "learning_rate": 4.43613378983408e-05, "loss": 1.4228, "step": 4287 }, { "epoch": 0.11293126152225441, "grad_norm": 3.1820244789123535, "learning_rate": 4.436002106926521e-05, "loss": 
1.8875, "step": 4288 }, { "epoch": 0.11295759810376613, "grad_norm": 2.078258752822876, "learning_rate": 4.4358704240189626e-05, "loss": 1.7224, "step": 4289 }, { "epoch": 0.11298393468527786, "grad_norm": 5.4816975593566895, "learning_rate": 4.4357387411114035e-05, "loss": 1.4001, "step": 4290 }, { "epoch": 0.11301027126678957, "grad_norm": 3.8903496265411377, "learning_rate": 4.435607058203846e-05, "loss": 1.4585, "step": 4291 }, { "epoch": 0.11303660784830129, "grad_norm": 2.2217657566070557, "learning_rate": 4.4354753752962866e-05, "loss": 1.5178, "step": 4292 }, { "epoch": 0.11306294442981302, "grad_norm": 5.49448823928833, "learning_rate": 4.435343692388728e-05, "loss": 1.6709, "step": 4293 }, { "epoch": 0.11308928101132473, "grad_norm": 2.8648664951324463, "learning_rate": 4.43521200948117e-05, "loss": 1.9657, "step": 4294 }, { "epoch": 0.11311561759283645, "grad_norm": 1.9732083082199097, "learning_rate": 4.4350803265736106e-05, "loss": 1.5418, "step": 4295 }, { "epoch": 0.11314195417434816, "grad_norm": 2.34871244430542, "learning_rate": 4.434948643666053e-05, "loss": 0.8984, "step": 4296 }, { "epoch": 0.1131682907558599, "grad_norm": 3.1706786155700684, "learning_rate": 4.434816960758494e-05, "loss": 1.7576, "step": 4297 }, { "epoch": 0.11319462733737161, "grad_norm": 1.9729595184326172, "learning_rate": 4.434685277850935e-05, "loss": 1.951, "step": 4298 }, { "epoch": 0.11322096391888332, "grad_norm": 1.9731125831604004, "learning_rate": 4.434553594943376e-05, "loss": 1.5688, "step": 4299 }, { "epoch": 0.11324730050039505, "grad_norm": 2.2782936096191406, "learning_rate": 4.434421912035818e-05, "loss": 2.0701, "step": 4300 }, { "epoch": 0.11327363708190677, "grad_norm": 3.6428115367889404, "learning_rate": 4.434290229128259e-05, "loss": 1.0274, "step": 4301 }, { "epoch": 0.11329997366341849, "grad_norm": 3.1329870223999023, "learning_rate": 4.434158546220701e-05, "loss": 2.4332, "step": 4302 }, { "epoch": 0.11332631024493021, "grad_norm": 
1.8293405771255493, "learning_rate": 4.434026863313142e-05, "loss": 1.6498, "step": 4303 }, { "epoch": 0.11335264682644193, "grad_norm": 1.7683696746826172, "learning_rate": 4.433895180405583e-05, "loss": 1.5913, "step": 4304 }, { "epoch": 0.11337898340795365, "grad_norm": 1.813410997390747, "learning_rate": 4.433763497498025e-05, "loss": 1.578, "step": 4305 }, { "epoch": 0.11340531998946536, "grad_norm": 4.987881183624268, "learning_rate": 4.4336318145904664e-05, "loss": 2.0715, "step": 4306 }, { "epoch": 0.11343165657097709, "grad_norm": 3.3109843730926514, "learning_rate": 4.433500131682908e-05, "loss": 2.0603, "step": 4307 }, { "epoch": 0.1134579931524888, "grad_norm": 3.753157138824463, "learning_rate": 4.433368448775349e-05, "loss": 1.6679, "step": 4308 }, { "epoch": 0.11348432973400052, "grad_norm": 3.773768901824951, "learning_rate": 4.4332367658677905e-05, "loss": 1.7714, "step": 4309 }, { "epoch": 0.11351066631551225, "grad_norm": 2.662864923477173, "learning_rate": 4.433105082960232e-05, "loss": 1.7384, "step": 4310 }, { "epoch": 0.11353700289702397, "grad_norm": 2.2981088161468506, "learning_rate": 4.4329734000526736e-05, "loss": 2.5741, "step": 4311 }, { "epoch": 0.11356333947853568, "grad_norm": 12.36007022857666, "learning_rate": 4.4328417171451145e-05, "loss": 1.2394, "step": 4312 }, { "epoch": 0.11358967606004741, "grad_norm": 2.2096548080444336, "learning_rate": 4.432710034237556e-05, "loss": 1.0605, "step": 4313 }, { "epoch": 0.11361601264155913, "grad_norm": 1.8356379270553589, "learning_rate": 4.4325783513299976e-05, "loss": 1.6582, "step": 4314 }, { "epoch": 0.11364234922307084, "grad_norm": 3.6824419498443604, "learning_rate": 4.432446668422439e-05, "loss": 0.8603, "step": 4315 }, { "epoch": 0.11366868580458256, "grad_norm": 2.039644479751587, "learning_rate": 4.432314985514881e-05, "loss": 1.8762, "step": 4316 }, { "epoch": 0.11369502238609429, "grad_norm": 5.607219696044922, "learning_rate": 4.4321833026073216e-05, "loss": 1.3149, "step": 
4317 }, { "epoch": 0.113721358967606, "grad_norm": 2.1787233352661133, "learning_rate": 4.432051619699763e-05, "loss": 2.1058, "step": 4318 }, { "epoch": 0.11374769554911772, "grad_norm": 1.7568917274475098, "learning_rate": 4.431919936792205e-05, "loss": 1.8436, "step": 4319 }, { "epoch": 0.11377403213062945, "grad_norm": 3.1339690685272217, "learning_rate": 4.431788253884646e-05, "loss": 1.667, "step": 4320 }, { "epoch": 0.11380036871214116, "grad_norm": 4.216392993927002, "learning_rate": 4.431656570977087e-05, "loss": 1.0022, "step": 4321 }, { "epoch": 0.11382670529365288, "grad_norm": 3.501495122909546, "learning_rate": 4.431524888069529e-05, "loss": 0.5404, "step": 4322 }, { "epoch": 0.11385304187516461, "grad_norm": 2.12831711769104, "learning_rate": 4.43139320516197e-05, "loss": 1.9329, "step": 4323 }, { "epoch": 0.11387937845667633, "grad_norm": 2.5285794734954834, "learning_rate": 4.431261522254412e-05, "loss": 2.2999, "step": 4324 }, { "epoch": 0.11390571503818804, "grad_norm": 3.4024181365966797, "learning_rate": 4.4311298393468534e-05, "loss": 2.2381, "step": 4325 }, { "epoch": 0.11393205161969976, "grad_norm": 3.5027482509613037, "learning_rate": 4.430998156439294e-05, "loss": 1.9063, "step": 4326 }, { "epoch": 0.11395838820121149, "grad_norm": 3.702543258666992, "learning_rate": 4.430866473531736e-05, "loss": 1.6912, "step": 4327 }, { "epoch": 0.1139847247827232, "grad_norm": 3.80104398727417, "learning_rate": 4.430734790624177e-05, "loss": 1.3014, "step": 4328 }, { "epoch": 0.11401106136423492, "grad_norm": 3.2665998935699463, "learning_rate": 4.430603107716619e-05, "loss": 2.0383, "step": 4329 }, { "epoch": 0.11403739794574665, "grad_norm": 2.95882511138916, "learning_rate": 4.43047142480906e-05, "loss": 1.384, "step": 4330 }, { "epoch": 0.11406373452725836, "grad_norm": 2.2256672382354736, "learning_rate": 4.4303397419015014e-05, "loss": 1.7927, "step": 4331 }, { "epoch": 0.11409007110877008, "grad_norm": 2.3855338096618652, "learning_rate": 
4.430208058993943e-05, "loss": 2.0536, "step": 4332 }, { "epoch": 0.11411640769028181, "grad_norm": 1.888027548789978, "learning_rate": 4.430076376086384e-05, "loss": 1.8424, "step": 4333 }, { "epoch": 0.11414274427179352, "grad_norm": 6.155549049377441, "learning_rate": 4.429944693178826e-05, "loss": 0.8036, "step": 4334 }, { "epoch": 0.11416908085330524, "grad_norm": 2.398144483566284, "learning_rate": 4.429813010271267e-05, "loss": 1.5755, "step": 4335 }, { "epoch": 0.11419541743481697, "grad_norm": 2.5533294677734375, "learning_rate": 4.4296813273637085e-05, "loss": 1.4531, "step": 4336 }, { "epoch": 0.11422175401632868, "grad_norm": 1.786769986152649, "learning_rate": 4.4295496444561494e-05, "loss": 2.0445, "step": 4337 }, { "epoch": 0.1142480905978404, "grad_norm": 1.889117956161499, "learning_rate": 4.4294179615485916e-05, "loss": 1.5585, "step": 4338 }, { "epoch": 0.11427442717935211, "grad_norm": 5.505272388458252, "learning_rate": 4.4292862786410325e-05, "loss": 1.442, "step": 4339 }, { "epoch": 0.11430076376086384, "grad_norm": 2.247007131576538, "learning_rate": 4.429154595733474e-05, "loss": 1.902, "step": 4340 }, { "epoch": 0.11432710034237556, "grad_norm": 2.9492099285125732, "learning_rate": 4.4290229128259156e-05, "loss": 2.0655, "step": 4341 }, { "epoch": 0.11435343692388727, "grad_norm": 4.861425399780273, "learning_rate": 4.4288912299183565e-05, "loss": 1.5592, "step": 4342 }, { "epoch": 0.114379773505399, "grad_norm": 2.548419952392578, "learning_rate": 4.428759547010799e-05, "loss": 1.8656, "step": 4343 }, { "epoch": 0.11440611008691072, "grad_norm": 3.408047914505005, "learning_rate": 4.4286278641032396e-05, "loss": 1.0554, "step": 4344 }, { "epoch": 0.11443244666842244, "grad_norm": 1.7642710208892822, "learning_rate": 4.428496181195681e-05, "loss": 1.8416, "step": 4345 }, { "epoch": 0.11445878324993417, "grad_norm": 3.3986451625823975, "learning_rate": 4.428364498288122e-05, "loss": 2.1011, "step": 4346 }, { "epoch": 0.11448511983144588, 
"grad_norm": 2.040459632873535, "learning_rate": 4.4282328153805636e-05, "loss": 2.2828, "step": 4347 }, { "epoch": 0.1145114564129576, "grad_norm": 3.525449275970459, "learning_rate": 4.428101132473005e-05, "loss": 1.4548, "step": 4348 }, { "epoch": 0.11453779299446931, "grad_norm": 2.458203077316284, "learning_rate": 4.427969449565447e-05, "loss": 2.0462, "step": 4349 }, { "epoch": 0.11456412957598104, "grad_norm": 2.7096199989318848, "learning_rate": 4.427837766657888e-05, "loss": 1.4242, "step": 4350 }, { "epoch": 0.11459046615749276, "grad_norm": 2.2402617931365967, "learning_rate": 4.427706083750329e-05, "loss": 0.9532, "step": 4351 }, { "epoch": 0.11461680273900447, "grad_norm": 2.5060954093933105, "learning_rate": 4.4275744008427714e-05, "loss": 1.7735, "step": 4352 }, { "epoch": 0.1146431393205162, "grad_norm": 3.0581908226013184, "learning_rate": 4.427442717935212e-05, "loss": 1.2129, "step": 4353 }, { "epoch": 0.11466947590202792, "grad_norm": 1.8943681716918945, "learning_rate": 4.427311035027654e-05, "loss": 1.5854, "step": 4354 }, { "epoch": 0.11469581248353963, "grad_norm": 1.9518150091171265, "learning_rate": 4.427179352120095e-05, "loss": 1.3087, "step": 4355 }, { "epoch": 0.11472214906505136, "grad_norm": 5.299782752990723, "learning_rate": 4.427047669212536e-05, "loss": 0.6505, "step": 4356 }, { "epoch": 0.11474848564656308, "grad_norm": 5.175239086151123, "learning_rate": 4.426915986304978e-05, "loss": 2.0356, "step": 4357 }, { "epoch": 0.1147748222280748, "grad_norm": 2.3872861862182617, "learning_rate": 4.4267843033974194e-05, "loss": 1.8172, "step": 4358 }, { "epoch": 0.11480115880958651, "grad_norm": 1.7819041013717651, "learning_rate": 4.42665262048986e-05, "loss": 1.2511, "step": 4359 }, { "epoch": 0.11482749539109824, "grad_norm": 2.0215232372283936, "learning_rate": 4.426520937582302e-05, "loss": 1.1734, "step": 4360 }, { "epoch": 0.11485383197260995, "grad_norm": 2.68278431892395, "learning_rate": 4.4263892546747434e-05, "loss": 1.2245, 
"step": 4361 }, { "epoch": 0.11488016855412167, "grad_norm": 3.159667491912842, "learning_rate": 4.426257571767185e-05, "loss": 1.6832, "step": 4362 }, { "epoch": 0.1149065051356334, "grad_norm": 2.626835584640503, "learning_rate": 4.4261258888596265e-05, "loss": 1.5326, "step": 4363 }, { "epoch": 0.11493284171714511, "grad_norm": 2.1548802852630615, "learning_rate": 4.4259942059520674e-05, "loss": 1.8406, "step": 4364 }, { "epoch": 0.11495917829865683, "grad_norm": 2.756100654602051, "learning_rate": 4.425862523044509e-05, "loss": 1.5675, "step": 4365 }, { "epoch": 0.11498551488016856, "grad_norm": 2.976527214050293, "learning_rate": 4.42573084013695e-05, "loss": 1.6547, "step": 4366 }, { "epoch": 0.11501185146168028, "grad_norm": 2.537472724914551, "learning_rate": 4.425599157229392e-05, "loss": 1.5997, "step": 4367 }, { "epoch": 0.11503818804319199, "grad_norm": 4.277581691741943, "learning_rate": 4.425467474321833e-05, "loss": 2.3393, "step": 4368 }, { "epoch": 0.1150645246247037, "grad_norm": 2.5709848403930664, "learning_rate": 4.4253357914142745e-05, "loss": 2.4128, "step": 4369 }, { "epoch": 0.11509086120621544, "grad_norm": 1.4659626483917236, "learning_rate": 4.425204108506716e-05, "loss": 0.7346, "step": 4370 }, { "epoch": 0.11511719778772715, "grad_norm": 1.9410346746444702, "learning_rate": 4.425072425599158e-05, "loss": 2.3115, "step": 4371 }, { "epoch": 0.11514353436923887, "grad_norm": 3.8114194869995117, "learning_rate": 4.424940742691599e-05, "loss": 1.9819, "step": 4372 }, { "epoch": 0.1151698709507506, "grad_norm": 2.099423885345459, "learning_rate": 4.42480905978404e-05, "loss": 1.8187, "step": 4373 }, { "epoch": 0.11519620753226231, "grad_norm": 5.10236120223999, "learning_rate": 4.424677376876482e-05, "loss": 1.2605, "step": 4374 }, { "epoch": 0.11522254411377403, "grad_norm": 6.026031494140625, "learning_rate": 4.4245456939689226e-05, "loss": 1.1565, "step": 4375 }, { "epoch": 0.11524888069528576, "grad_norm": 3.2395246028900146, 
"learning_rate": 4.424414011061365e-05, "loss": 1.5653, "step": 4376 }, { "epoch": 0.11527521727679747, "grad_norm": 4.563273906707764, "learning_rate": 4.424282328153806e-05, "loss": 2.3523, "step": 4377 }, { "epoch": 0.11530155385830919, "grad_norm": 1.7929317951202393, "learning_rate": 4.424150645246247e-05, "loss": 2.1543, "step": 4378 }, { "epoch": 0.11532789043982092, "grad_norm": 1.9524186849594116, "learning_rate": 4.424018962338689e-05, "loss": 1.9696, "step": 4379 }, { "epoch": 0.11535422702133263, "grad_norm": 2.2234647274017334, "learning_rate": 4.42388727943113e-05, "loss": 2.6179, "step": 4380 }, { "epoch": 0.11538056360284435, "grad_norm": 2.636587619781494, "learning_rate": 4.423755596523572e-05, "loss": 2.3227, "step": 4381 }, { "epoch": 0.11540690018435606, "grad_norm": 3.5364274978637695, "learning_rate": 4.423623913616013e-05, "loss": 1.9982, "step": 4382 }, { "epoch": 0.1154332367658678, "grad_norm": 1.893670678138733, "learning_rate": 4.4234922307084544e-05, "loss": 1.6248, "step": 4383 }, { "epoch": 0.11545957334737951, "grad_norm": 2.328727960586548, "learning_rate": 4.423360547800895e-05, "loss": 1.0495, "step": 4384 }, { "epoch": 0.11548590992889123, "grad_norm": 2.2435641288757324, "learning_rate": 4.4232288648933375e-05, "loss": 0.8707, "step": 4385 }, { "epoch": 0.11551224651040295, "grad_norm": 3.589625358581543, "learning_rate": 4.4230971819857784e-05, "loss": 0.4759, "step": 4386 }, { "epoch": 0.11553858309191467, "grad_norm": 1.8171758651733398, "learning_rate": 4.42296549907822e-05, "loss": 1.389, "step": 4387 }, { "epoch": 0.11556491967342639, "grad_norm": 3.317521333694458, "learning_rate": 4.4228338161706615e-05, "loss": 1.5492, "step": 4388 }, { "epoch": 0.11559125625493812, "grad_norm": 3.8359012603759766, "learning_rate": 4.4227021332631024e-05, "loss": 1.8196, "step": 4389 }, { "epoch": 0.11561759283644983, "grad_norm": 1.9237251281738281, "learning_rate": 4.4225704503555446e-05, "loss": 1.737, "step": 4390 }, { "epoch": 
0.11564392941796155, "grad_norm": 2.886448383331299, "learning_rate": 4.4224387674479855e-05, "loss": 1.9436, "step": 4391 }, { "epoch": 0.11567026599947326, "grad_norm": 4.651662826538086, "learning_rate": 4.422307084540427e-05, "loss": 1.0072, "step": 4392 }, { "epoch": 0.11569660258098499, "grad_norm": 2.4865639209747314, "learning_rate": 4.422175401632868e-05, "loss": 2.3585, "step": 4393 }, { "epoch": 0.11572293916249671, "grad_norm": 1.986808180809021, "learning_rate": 4.4220437187253095e-05, "loss": 1.8663, "step": 4394 }, { "epoch": 0.11574927574400842, "grad_norm": 2.155029058456421, "learning_rate": 4.421912035817751e-05, "loss": 2.2016, "step": 4395 }, { "epoch": 0.11577561232552015, "grad_norm": 2.616759777069092, "learning_rate": 4.4217803529101926e-05, "loss": 2.0208, "step": 4396 }, { "epoch": 0.11580194890703187, "grad_norm": 3.513251543045044, "learning_rate": 4.421648670002634e-05, "loss": 1.2551, "step": 4397 }, { "epoch": 0.11582828548854358, "grad_norm": 1.8409112691879272, "learning_rate": 4.421516987095075e-05, "loss": 1.3185, "step": 4398 }, { "epoch": 0.11585462207005531, "grad_norm": 2.120098352432251, "learning_rate": 4.4213853041875166e-05, "loss": 2.6171, "step": 4399 }, { "epoch": 0.11588095865156703, "grad_norm": 2.0425994396209717, "learning_rate": 4.421253621279958e-05, "loss": 0.8799, "step": 4400 }, { "epoch": 0.11590729523307874, "grad_norm": 3.7647507190704346, "learning_rate": 4.4211219383724e-05, "loss": 1.5859, "step": 4401 }, { "epoch": 0.11593363181459046, "grad_norm": 2.8809452056884766, "learning_rate": 4.4209902554648406e-05, "loss": 2.4356, "step": 4402 }, { "epoch": 0.11595996839610219, "grad_norm": 1.9078770875930786, "learning_rate": 4.420858572557282e-05, "loss": 1.5714, "step": 4403 }, { "epoch": 0.1159863049776139, "grad_norm": 3.08406662940979, "learning_rate": 4.420726889649724e-05, "loss": 2.1748, "step": 4404 }, { "epoch": 0.11601264155912562, "grad_norm": 1.6534942388534546, "learning_rate": 
4.420595206742165e-05, "loss": 1.8346, "step": 4405 }, { "epoch": 0.11603897814063735, "grad_norm": 1.8040796518325806, "learning_rate": 4.420463523834606e-05, "loss": 2.1775, "step": 4406 }, { "epoch": 0.11606531472214907, "grad_norm": 3.455887794494629, "learning_rate": 4.420331840927048e-05, "loss": 0.9073, "step": 4407 }, { "epoch": 0.11609165130366078, "grad_norm": 3.195606231689453, "learning_rate": 4.420200158019489e-05, "loss": 1.8064, "step": 4408 }, { "epoch": 0.11611798788517251, "grad_norm": 2.3142263889312744, "learning_rate": 4.420068475111931e-05, "loss": 1.9647, "step": 4409 }, { "epoch": 0.11614432446668423, "grad_norm": 2.7577309608459473, "learning_rate": 4.4199367922043724e-05, "loss": 2.1959, "step": 4410 }, { "epoch": 0.11617066104819594, "grad_norm": 2.356194257736206, "learning_rate": 4.419805109296813e-05, "loss": 2.4659, "step": 4411 }, { "epoch": 0.11619699762970766, "grad_norm": 2.681997537612915, "learning_rate": 4.419673426389255e-05, "loss": 1.7866, "step": 4412 }, { "epoch": 0.11622333421121939, "grad_norm": 2.571408987045288, "learning_rate": 4.419541743481696e-05, "loss": 1.5353, "step": 4413 }, { "epoch": 0.1162496707927311, "grad_norm": 2.0447728633880615, "learning_rate": 4.419410060574138e-05, "loss": 1.4663, "step": 4414 }, { "epoch": 0.11627600737424282, "grad_norm": 3.892343044281006, "learning_rate": 4.419278377666579e-05, "loss": 1.7325, "step": 4415 }, { "epoch": 0.11630234395575455, "grad_norm": 3.722790241241455, "learning_rate": 4.4191466947590204e-05, "loss": 2.0224, "step": 4416 }, { "epoch": 0.11632868053726626, "grad_norm": 3.7777955532073975, "learning_rate": 4.419015011851462e-05, "loss": 1.4628, "step": 4417 }, { "epoch": 0.11635501711877798, "grad_norm": 1.8749061822891235, "learning_rate": 4.4188833289439035e-05, "loss": 2.4254, "step": 4418 }, { "epoch": 0.11638135370028971, "grad_norm": 4.084940433502197, "learning_rate": 4.418751646036345e-05, "loss": 1.0573, "step": 4419 }, { "epoch": 0.11640769028180142, 
"grad_norm": 1.9295003414154053, "learning_rate": 4.418619963128786e-05, "loss": 0.38, "step": 4420 }, { "epoch": 0.11643402686331314, "grad_norm": 3.350985288619995, "learning_rate": 4.4184882802212275e-05, "loss": 2.4217, "step": 4421 }, { "epoch": 0.11646036344482485, "grad_norm": 3.1467080116271973, "learning_rate": 4.4183565973136684e-05, "loss": 1.3987, "step": 4422 }, { "epoch": 0.11648670002633658, "grad_norm": 2.770132303237915, "learning_rate": 4.4182249144061106e-05, "loss": 1.2506, "step": 4423 }, { "epoch": 0.1165130366078483, "grad_norm": 3.275036334991455, "learning_rate": 4.4180932314985515e-05, "loss": 1.0688, "step": 4424 }, { "epoch": 0.11653937318936002, "grad_norm": 2.3301479816436768, "learning_rate": 4.417961548590993e-05, "loss": 1.8775, "step": 4425 }, { "epoch": 0.11656570977087174, "grad_norm": 1.8609044551849365, "learning_rate": 4.4178298656834346e-05, "loss": 2.3729, "step": 4426 }, { "epoch": 0.11659204635238346, "grad_norm": 2.461369276046753, "learning_rate": 4.4176981827758755e-05, "loss": 1.6169, "step": 4427 }, { "epoch": 0.11661838293389518, "grad_norm": 2.050262212753296, "learning_rate": 4.417566499868318e-05, "loss": 1.8469, "step": 4428 }, { "epoch": 0.1166447195154069, "grad_norm": 3.288084030151367, "learning_rate": 4.4174348169607586e-05, "loss": 1.7402, "step": 4429 }, { "epoch": 0.11667105609691862, "grad_norm": 1.8451225757598877, "learning_rate": 4.4173031340532e-05, "loss": 1.852, "step": 4430 }, { "epoch": 0.11669739267843034, "grad_norm": 2.001999616622925, "learning_rate": 4.417171451145641e-05, "loss": 0.3579, "step": 4431 }, { "epoch": 0.11672372925994207, "grad_norm": 4.541996002197266, "learning_rate": 4.4170397682380827e-05, "loss": 1.3137, "step": 4432 }, { "epoch": 0.11675006584145378, "grad_norm": 1.6650617122650146, "learning_rate": 4.416908085330524e-05, "loss": 1.6788, "step": 4433 }, { "epoch": 0.1167764024229655, "grad_norm": 4.194488525390625, "learning_rate": 4.416776402422966e-05, "loss": 1.2758, 
"step": 4434 }, { "epoch": 0.11680273900447721, "grad_norm": 2.6946659088134766, "learning_rate": 4.416644719515407e-05, "loss": 2.0421, "step": 4435 }, { "epoch": 0.11682907558598894, "grad_norm": 2.4091713428497314, "learning_rate": 4.416513036607848e-05, "loss": 2.1079, "step": 4436 }, { "epoch": 0.11685541216750066, "grad_norm": 3.072972297668457, "learning_rate": 4.4163813537002904e-05, "loss": 1.7528, "step": 4437 }, { "epoch": 0.11688174874901237, "grad_norm": 4.11431884765625, "learning_rate": 4.416249670792731e-05, "loss": 2.0374, "step": 4438 }, { "epoch": 0.1169080853305241, "grad_norm": 2.783322334289551, "learning_rate": 4.416117987885173e-05, "loss": 1.8257, "step": 4439 }, { "epoch": 0.11693442191203582, "grad_norm": 1.4758005142211914, "learning_rate": 4.415986304977614e-05, "loss": 1.4166, "step": 4440 }, { "epoch": 0.11696075849354753, "grad_norm": 2.110168933868408, "learning_rate": 4.415854622070055e-05, "loss": 1.9646, "step": 4441 }, { "epoch": 0.11698709507505926, "grad_norm": 2.3040671348571777, "learning_rate": 4.415722939162497e-05, "loss": 2.2283, "step": 4442 }, { "epoch": 0.11701343165657098, "grad_norm": 4.929816722869873, "learning_rate": 4.4155912562549385e-05, "loss": 0.9143, "step": 4443 }, { "epoch": 0.1170397682380827, "grad_norm": 1.7468531131744385, "learning_rate": 4.41545957334738e-05, "loss": 1.6556, "step": 4444 }, { "epoch": 0.11706610481959441, "grad_norm": 5.195878028869629, "learning_rate": 4.415327890439821e-05, "loss": 2.1819, "step": 4445 }, { "epoch": 0.11709244140110614, "grad_norm": 2.4151008129119873, "learning_rate": 4.4151962075322625e-05, "loss": 2.0081, "step": 4446 }, { "epoch": 0.11711877798261786, "grad_norm": 2.482985496520996, "learning_rate": 4.415064524624704e-05, "loss": 1.6568, "step": 4447 }, { "epoch": 0.11714511456412957, "grad_norm": 2.796194314956665, "learning_rate": 4.4149328417171456e-05, "loss": 1.6216, "step": 4448 }, { "epoch": 0.1171714511456413, "grad_norm": 2.431485414505005, 
"learning_rate": 4.4148011588095865e-05, "loss": 1.9602, "step": 4449 }, { "epoch": 0.11719778772715302, "grad_norm": 2.156215190887451, "learning_rate": 4.414669475902028e-05, "loss": 1.5413, "step": 4450 }, { "epoch": 0.11722412430866473, "grad_norm": 5.469089031219482, "learning_rate": 4.4145377929944696e-05, "loss": 0.6391, "step": 4451 }, { "epoch": 0.11725046089017646, "grad_norm": 2.213456392288208, "learning_rate": 4.414406110086911e-05, "loss": 1.3894, "step": 4452 }, { "epoch": 0.11727679747168818, "grad_norm": 3.5617663860321045, "learning_rate": 4.414274427179352e-05, "loss": 1.3308, "step": 4453 }, { "epoch": 0.11730313405319989, "grad_norm": 2.372602701187134, "learning_rate": 4.4141427442717936e-05, "loss": 1.6788, "step": 4454 }, { "epoch": 0.11732947063471161, "grad_norm": 2.067337989807129, "learning_rate": 4.414011061364235e-05, "loss": 1.0302, "step": 4455 }, { "epoch": 0.11735580721622334, "grad_norm": 2.4312102794647217, "learning_rate": 4.413879378456677e-05, "loss": 1.8871, "step": 4456 }, { "epoch": 0.11738214379773505, "grad_norm": 3.186422348022461, "learning_rate": 4.413747695549118e-05, "loss": 1.5799, "step": 4457 }, { "epoch": 0.11740848037924677, "grad_norm": 2.366577625274658, "learning_rate": 4.413616012641559e-05, "loss": 3.096, "step": 4458 }, { "epoch": 0.1174348169607585, "grad_norm": 2.444317102432251, "learning_rate": 4.413484329734001e-05, "loss": 1.997, "step": 4459 }, { "epoch": 0.11746115354227021, "grad_norm": 2.177645206451416, "learning_rate": 4.4133526468264416e-05, "loss": 1.3416, "step": 4460 }, { "epoch": 0.11748749012378193, "grad_norm": 3.835712194442749, "learning_rate": 4.413220963918884e-05, "loss": 1.3609, "step": 4461 }, { "epoch": 0.11751382670529366, "grad_norm": 1.9772382974624634, "learning_rate": 4.413089281011325e-05, "loss": 2.2611, "step": 4462 }, { "epoch": 0.11754016328680537, "grad_norm": 2.3028218746185303, "learning_rate": 4.412957598103766e-05, "loss": 0.8586, "step": 4463 }, { "epoch": 
0.11756649986831709, "grad_norm": 2.191162586212158, "learning_rate": 4.412825915196208e-05, "loss": 2.1453, "step": 4464 }, { "epoch": 0.1175928364498288, "grad_norm": 3.3839304447174072, "learning_rate": 4.4126942322886494e-05, "loss": 1.7196, "step": 4465 }, { "epoch": 0.11761917303134053, "grad_norm": 2.2345473766326904, "learning_rate": 4.412562549381091e-05, "loss": 1.9077, "step": 4466 }, { "epoch": 0.11764550961285225, "grad_norm": 5.274810314178467, "learning_rate": 4.412430866473532e-05, "loss": 2.0036, "step": 4467 }, { "epoch": 0.11767184619436397, "grad_norm": 3.1181788444519043, "learning_rate": 4.4122991835659734e-05, "loss": 1.215, "step": 4468 }, { "epoch": 0.1176981827758757, "grad_norm": 1.5179028511047363, "learning_rate": 4.412167500658414e-05, "loss": 0.4065, "step": 4469 }, { "epoch": 0.11772451935738741, "grad_norm": 3.7254276275634766, "learning_rate": 4.4120358177508565e-05, "loss": 2.488, "step": 4470 }, { "epoch": 0.11775085593889913, "grad_norm": 2.8294014930725098, "learning_rate": 4.4119041348432974e-05, "loss": 0.8961, "step": 4471 }, { "epoch": 0.11777719252041086, "grad_norm": 3.7836203575134277, "learning_rate": 4.411772451935739e-05, "loss": 0.4027, "step": 4472 }, { "epoch": 0.11780352910192257, "grad_norm": 3.876450777053833, "learning_rate": 4.4116407690281805e-05, "loss": 0.9419, "step": 4473 }, { "epoch": 0.11782986568343429, "grad_norm": 1.7913403511047363, "learning_rate": 4.4115090861206214e-05, "loss": 1.9551, "step": 4474 }, { "epoch": 0.11785620226494602, "grad_norm": 1.810213565826416, "learning_rate": 4.4113774032130636e-05, "loss": 2.0296, "step": 4475 }, { "epoch": 0.11788253884645773, "grad_norm": 1.8780161142349243, "learning_rate": 4.4112457203055045e-05, "loss": 1.8583, "step": 4476 }, { "epoch": 0.11790887542796945, "grad_norm": 1.73005211353302, "learning_rate": 4.411114037397946e-05, "loss": 1.7836, "step": 4477 }, { "epoch": 0.11793521200948116, "grad_norm": 2.005183458328247, "learning_rate": 
4.410982354490387e-05, "loss": 1.4563, "step": 4478 }, { "epoch": 0.11796154859099289, "grad_norm": 2.8055245876312256, "learning_rate": 4.4108506715828285e-05, "loss": 1.2794, "step": 4479 }, { "epoch": 0.11798788517250461, "grad_norm": 3.500542163848877, "learning_rate": 4.41071898867527e-05, "loss": 1.1338, "step": 4480 }, { "epoch": 0.11801422175401632, "grad_norm": 3.309410333633423, "learning_rate": 4.4105873057677116e-05, "loss": 1.0386, "step": 4481 }, { "epoch": 0.11804055833552805, "grad_norm": 1.834859848022461, "learning_rate": 4.410455622860153e-05, "loss": 2.4931, "step": 4482 }, { "epoch": 0.11806689491703977, "grad_norm": 3.1914002895355225, "learning_rate": 4.410323939952594e-05, "loss": 1.3354, "step": 4483 }, { "epoch": 0.11809323149855148, "grad_norm": 3.621843099594116, "learning_rate": 4.410192257045036e-05, "loss": 1.3452, "step": 4484 }, { "epoch": 0.11811956808006321, "grad_norm": 2.77040696144104, "learning_rate": 4.410060574137477e-05, "loss": 0.4439, "step": 4485 }, { "epoch": 0.11814590466157493, "grad_norm": 1.976322054862976, "learning_rate": 4.409928891229919e-05, "loss": 2.6194, "step": 4486 }, { "epoch": 0.11817224124308665, "grad_norm": 1.6605380773544312, "learning_rate": 4.4097972083223596e-05, "loss": 1.6788, "step": 4487 }, { "epoch": 0.11819857782459836, "grad_norm": 1.6394164562225342, "learning_rate": 4.409665525414801e-05, "loss": 2.0224, "step": 4488 }, { "epoch": 0.11822491440611009, "grad_norm": 4.603525161743164, "learning_rate": 4.409533842507243e-05, "loss": 0.6049, "step": 4489 }, { "epoch": 0.1182512509876218, "grad_norm": 2.52878999710083, "learning_rate": 4.409402159599684e-05, "loss": 1.1091, "step": 4490 }, { "epoch": 0.11827758756913352, "grad_norm": 2.8281538486480713, "learning_rate": 4.409270476692126e-05, "loss": 1.707, "step": 4491 }, { "epoch": 0.11830392415064525, "grad_norm": 2.3989648818969727, "learning_rate": 4.409138793784567e-05, "loss": 1.2226, "step": 4492 }, { "epoch": 0.11833026073215697, 
"grad_norm": 2.66068172454834, "learning_rate": 4.409007110877008e-05, "loss": 1.646, "step": 4493 }, { "epoch": 0.11835659731366868, "grad_norm": 6.114838600158691, "learning_rate": 4.40887542796945e-05, "loss": 1.6339, "step": 4494 }, { "epoch": 0.11838293389518041, "grad_norm": 2.7871336936950684, "learning_rate": 4.4087437450618914e-05, "loss": 1.2906, "step": 4495 }, { "epoch": 0.11840927047669213, "grad_norm": 2.0278446674346924, "learning_rate": 4.408612062154332e-05, "loss": 1.9115, "step": 4496 }, { "epoch": 0.11843560705820384, "grad_norm": 3.571059226989746, "learning_rate": 4.408480379246774e-05, "loss": 0.5902, "step": 4497 }, { "epoch": 0.11846194363971556, "grad_norm": 3.208005666732788, "learning_rate": 4.4083486963392154e-05, "loss": 1.744, "step": 4498 }, { "epoch": 0.11848828022122729, "grad_norm": 3.4264402389526367, "learning_rate": 4.408217013431657e-05, "loss": 1.8997, "step": 4499 }, { "epoch": 0.118514616802739, "grad_norm": 2.0087668895721436, "learning_rate": 4.4080853305240986e-05, "loss": 0.9058, "step": 4500 }, { "epoch": 0.11854095338425072, "grad_norm": 2.763491630554199, "learning_rate": 4.4079536476165394e-05, "loss": 0.5899, "step": 4501 }, { "epoch": 0.11856728996576245, "grad_norm": 1.5228254795074463, "learning_rate": 4.407821964708981e-05, "loss": 1.2187, "step": 4502 }, { "epoch": 0.11859362654727416, "grad_norm": 4.362586975097656, "learning_rate": 4.4076902818014226e-05, "loss": 0.6471, "step": 4503 }, { "epoch": 0.11861996312878588, "grad_norm": 3.0736706256866455, "learning_rate": 4.407558598893864e-05, "loss": 1.9065, "step": 4504 }, { "epoch": 0.11864629971029761, "grad_norm": 2.788661479949951, "learning_rate": 4.407426915986305e-05, "loss": 1.7559, "step": 4505 }, { "epoch": 0.11867263629180932, "grad_norm": 2.1233596801757812, "learning_rate": 4.4072952330787466e-05, "loss": 1.9864, "step": 4506 }, { "epoch": 0.11869897287332104, "grad_norm": 2.975221872329712, "learning_rate": 4.4071635501711874e-05, "loss": 1.6282, 
"step": 4507 }, { "epoch": 0.11872530945483276, "grad_norm": 1.9074469804763794, "learning_rate": 4.40703186726363e-05, "loss": 1.7752, "step": 4508 }, { "epoch": 0.11875164603634449, "grad_norm": 2.320554733276367, "learning_rate": 4.4069001843560706e-05, "loss": 1.2108, "step": 4509 }, { "epoch": 0.1187779826178562, "grad_norm": 2.093651533126831, "learning_rate": 4.406768501448512e-05, "loss": 1.5476, "step": 4510 }, { "epoch": 0.11880431919936792, "grad_norm": 1.9673291444778442, "learning_rate": 4.406636818540954e-05, "loss": 1.1547, "step": 4511 }, { "epoch": 0.11883065578087965, "grad_norm": 2.612429141998291, "learning_rate": 4.4065051356333946e-05, "loss": 1.7214, "step": 4512 }, { "epoch": 0.11885699236239136, "grad_norm": 2.106727361679077, "learning_rate": 4.406373452725837e-05, "loss": 1.6774, "step": 4513 }, { "epoch": 0.11888332894390308, "grad_norm": 1.991256594657898, "learning_rate": 4.406241769818278e-05, "loss": 2.1493, "step": 4514 }, { "epoch": 0.1189096655254148, "grad_norm": 2.3556244373321533, "learning_rate": 4.406110086910719e-05, "loss": 2.2043, "step": 4515 }, { "epoch": 0.11893600210692652, "grad_norm": 7.154943943023682, "learning_rate": 4.40597840400316e-05, "loss": 2.2684, "step": 4516 }, { "epoch": 0.11896233868843824, "grad_norm": 4.374931335449219, "learning_rate": 4.4058467210956024e-05, "loss": 0.9902, "step": 4517 }, { "epoch": 0.11898867526994997, "grad_norm": 2.3428380489349365, "learning_rate": 4.405715038188043e-05, "loss": 1.4315, "step": 4518 }, { "epoch": 0.11901501185146168, "grad_norm": 2.3284173011779785, "learning_rate": 4.405583355280485e-05, "loss": 1.6703, "step": 4519 }, { "epoch": 0.1190413484329734, "grad_norm": 1.9064738750457764, "learning_rate": 4.4054516723729264e-05, "loss": 1.3962, "step": 4520 }, { "epoch": 0.11906768501448511, "grad_norm": 6.036314010620117, "learning_rate": 4.405319989465367e-05, "loss": 1.7169, "step": 4521 }, { "epoch": 0.11909402159599684, "grad_norm": 2.0272128582000732, 
"learning_rate": 4.4051883065578095e-05, "loss": 1.6366, "step": 4522 }, { "epoch": 0.11912035817750856, "grad_norm": 2.339869976043701, "learning_rate": 4.4050566236502504e-05, "loss": 1.4492, "step": 4523 }, { "epoch": 0.11914669475902027, "grad_norm": 5.801568508148193, "learning_rate": 4.404924940742692e-05, "loss": 1.7531, "step": 4524 }, { "epoch": 0.119173031340532, "grad_norm": 2.9039807319641113, "learning_rate": 4.404793257835133e-05, "loss": 1.3748, "step": 4525 }, { "epoch": 0.11919936792204372, "grad_norm": 2.5488572120666504, "learning_rate": 4.4046615749275744e-05, "loss": 1.5856, "step": 4526 }, { "epoch": 0.11922570450355544, "grad_norm": 2.6418426036834717, "learning_rate": 4.404529892020016e-05, "loss": 0.3907, "step": 4527 }, { "epoch": 0.11925204108506716, "grad_norm": 3.457848072052002, "learning_rate": 4.4043982091124575e-05, "loss": 1.5812, "step": 4528 }, { "epoch": 0.11927837766657888, "grad_norm": 2.1689720153808594, "learning_rate": 4.404266526204899e-05, "loss": 2.193, "step": 4529 }, { "epoch": 0.1193047142480906, "grad_norm": 1.7394002676010132, "learning_rate": 4.40413484329734e-05, "loss": 1.5981, "step": 4530 }, { "epoch": 0.11933105082960231, "grad_norm": 2.2339563369750977, "learning_rate": 4.404003160389782e-05, "loss": 2.0593, "step": 4531 }, { "epoch": 0.11935738741111404, "grad_norm": 2.624907970428467, "learning_rate": 4.403871477482223e-05, "loss": 1.7177, "step": 4532 }, { "epoch": 0.11938372399262576, "grad_norm": 3.305166482925415, "learning_rate": 4.4037397945746646e-05, "loss": 1.1525, "step": 4533 }, { "epoch": 0.11941006057413747, "grad_norm": 2.7178993225097656, "learning_rate": 4.4036081116671055e-05, "loss": 2.3004, "step": 4534 }, { "epoch": 0.1194363971556492, "grad_norm": 2.7217884063720703, "learning_rate": 4.403476428759547e-05, "loss": 1.9671, "step": 4535 }, { "epoch": 0.11946273373716092, "grad_norm": 5.2226362228393555, "learning_rate": 4.4033447458519886e-05, "loss": 1.4713, "step": 4536 }, { "epoch": 
0.11948907031867263, "grad_norm": 2.570767641067505, "learning_rate": 4.40321306294443e-05, "loss": 1.4283, "step": 4537 }, { "epoch": 0.11951540690018436, "grad_norm": 3.9101603031158447, "learning_rate": 4.403081380036872e-05, "loss": 0.804, "step": 4538 }, { "epoch": 0.11954174348169608, "grad_norm": 2.461411237716675, "learning_rate": 4.4029496971293126e-05, "loss": 0.8779, "step": 4539 }, { "epoch": 0.1195680800632078, "grad_norm": 2.4096240997314453, "learning_rate": 4.402818014221754e-05, "loss": 2.0811, "step": 4540 }, { "epoch": 0.11959441664471951, "grad_norm": 5.311163425445557, "learning_rate": 4.402686331314196e-05, "loss": 2.0921, "step": 4541 }, { "epoch": 0.11962075322623124, "grad_norm": 2.9684927463531494, "learning_rate": 4.402554648406637e-05, "loss": 1.413, "step": 4542 }, { "epoch": 0.11964708980774295, "grad_norm": 4.3449225425720215, "learning_rate": 4.402422965499078e-05, "loss": 1.9694, "step": 4543 }, { "epoch": 0.11967342638925467, "grad_norm": 2.3601181507110596, "learning_rate": 4.40229128259152e-05, "loss": 2.0336, "step": 4544 }, { "epoch": 0.1196997629707664, "grad_norm": 2.681947708129883, "learning_rate": 4.402159599683961e-05, "loss": 1.2407, "step": 4545 }, { "epoch": 0.11972609955227811, "grad_norm": 3.324223279953003, "learning_rate": 4.402027916776403e-05, "loss": 1.5159, "step": 4546 }, { "epoch": 0.11975243613378983, "grad_norm": 1.7762402296066284, "learning_rate": 4.4018962338688444e-05, "loss": 2.0747, "step": 4547 }, { "epoch": 0.11977877271530156, "grad_norm": 2.1038661003112793, "learning_rate": 4.401764550961285e-05, "loss": 1.3491, "step": 4548 }, { "epoch": 0.11980510929681328, "grad_norm": 2.5586345195770264, "learning_rate": 4.401632868053727e-05, "loss": 1.7166, "step": 4549 }, { "epoch": 0.11983144587832499, "grad_norm": 2.3258981704711914, "learning_rate": 4.4015011851461684e-05, "loss": 1.2625, "step": 4550 }, { "epoch": 0.1198577824598367, "grad_norm": 2.0945794582366943, "learning_rate": 
4.40136950223861e-05, "loss": 1.7931, "step": 4551 }, { "epoch": 0.11988411904134844, "grad_norm": 4.948436260223389, "learning_rate": 4.401237819331051e-05, "loss": 1.2743, "step": 4552 }, { "epoch": 0.11991045562286015, "grad_norm": 2.5348498821258545, "learning_rate": 4.4011061364234924e-05, "loss": 2.0265, "step": 4553 }, { "epoch": 0.11993679220437187, "grad_norm": 2.0693464279174805, "learning_rate": 4.400974453515933e-05, "loss": 1.8904, "step": 4554 }, { "epoch": 0.1199631287858836, "grad_norm": 3.35764217376709, "learning_rate": 4.4008427706083755e-05, "loss": 1.615, "step": 4555 }, { "epoch": 0.11998946536739531, "grad_norm": 1.5819950103759766, "learning_rate": 4.4007110877008164e-05, "loss": 2.043, "step": 4556 }, { "epoch": 0.12001580194890703, "grad_norm": 2.327550172805786, "learning_rate": 4.400579404793258e-05, "loss": 1.8321, "step": 4557 }, { "epoch": 0.12004213853041876, "grad_norm": 1.9448949098587036, "learning_rate": 4.4004477218856995e-05, "loss": 2.3237, "step": 4558 }, { "epoch": 0.12006847511193047, "grad_norm": 1.9863765239715576, "learning_rate": 4.4003160389781404e-05, "loss": 0.6008, "step": 4559 }, { "epoch": 0.12009481169344219, "grad_norm": 2.417666435241699, "learning_rate": 4.4001843560705827e-05, "loss": 0.5446, "step": 4560 }, { "epoch": 0.12012114827495392, "grad_norm": 2.5216031074523926, "learning_rate": 4.4000526731630235e-05, "loss": 2.6096, "step": 4561 }, { "epoch": 0.12014748485646563, "grad_norm": 4.250596523284912, "learning_rate": 4.399920990255465e-05, "loss": 1.689, "step": 4562 }, { "epoch": 0.12017382143797735, "grad_norm": 2.095987319946289, "learning_rate": 4.399789307347906e-05, "loss": 1.9332, "step": 4563 }, { "epoch": 0.12020015801948906, "grad_norm": 3.8956432342529297, "learning_rate": 4.399657624440348e-05, "loss": 2.0838, "step": 4564 }, { "epoch": 0.1202264946010008, "grad_norm": 1.9411897659301758, "learning_rate": 4.399525941532789e-05, "loss": 2.0952, "step": 4565 }, { "epoch": 0.12025283118251251, 
"grad_norm": 2.8632423877716064, "learning_rate": 4.3993942586252307e-05, "loss": 2.3953, "step": 4566 }, { "epoch": 0.12027916776402423, "grad_norm": 1.9260960817337036, "learning_rate": 4.399262575717672e-05, "loss": 1.8821, "step": 4567 }, { "epoch": 0.12030550434553595, "grad_norm": 2.5925159454345703, "learning_rate": 4.399130892810113e-05, "loss": 0.6486, "step": 4568 }, { "epoch": 0.12033184092704767, "grad_norm": 2.303072452545166, "learning_rate": 4.398999209902555e-05, "loss": 1.766, "step": 4569 }, { "epoch": 0.12035817750855939, "grad_norm": 1.9586025476455688, "learning_rate": 4.398867526994996e-05, "loss": 1.392, "step": 4570 }, { "epoch": 0.12038451409007112, "grad_norm": 4.656185626983643, "learning_rate": 4.398735844087438e-05, "loss": 1.5239, "step": 4571 }, { "epoch": 0.12041085067158283, "grad_norm": 4.9770097732543945, "learning_rate": 4.3986041611798787e-05, "loss": 1.2803, "step": 4572 }, { "epoch": 0.12043718725309455, "grad_norm": 2.0840556621551514, "learning_rate": 4.39847247827232e-05, "loss": 2.349, "step": 4573 }, { "epoch": 0.12046352383460626, "grad_norm": 2.684706211090088, "learning_rate": 4.398340795364762e-05, "loss": 1.922, "step": 4574 }, { "epoch": 0.12048986041611799, "grad_norm": 1.8182473182678223, "learning_rate": 4.398209112457203e-05, "loss": 2.0707, "step": 4575 }, { "epoch": 0.12051619699762971, "grad_norm": 1.966376781463623, "learning_rate": 4.398077429549645e-05, "loss": 1.8312, "step": 4576 }, { "epoch": 0.12054253357914142, "grad_norm": 3.737230062484741, "learning_rate": 4.397945746642086e-05, "loss": 1.2757, "step": 4577 }, { "epoch": 0.12056887016065315, "grad_norm": 2.854975700378418, "learning_rate": 4.397814063734527e-05, "loss": 1.5723, "step": 4578 }, { "epoch": 0.12059520674216487, "grad_norm": 2.072096824645996, "learning_rate": 4.397682380826969e-05, "loss": 1.4735, "step": 4579 }, { "epoch": 0.12062154332367658, "grad_norm": 3.2030415534973145, "learning_rate": 4.3975506979194105e-05, "loss": 1.8364, 
"step": 4580 }, { "epoch": 0.12064787990518831, "grad_norm": 3.832263946533203, "learning_rate": 4.3974190150118513e-05, "loss": 0.9231, "step": 4581 }, { "epoch": 0.12067421648670003, "grad_norm": 2.062819242477417, "learning_rate": 4.397287332104293e-05, "loss": 1.1318, "step": 4582 }, { "epoch": 0.12070055306821174, "grad_norm": 1.9557278156280518, "learning_rate": 4.3971556491967345e-05, "loss": 1.9448, "step": 4583 }, { "epoch": 0.12072688964972346, "grad_norm": 2.545456886291504, "learning_rate": 4.397023966289176e-05, "loss": 2.3229, "step": 4584 }, { "epoch": 0.12075322623123519, "grad_norm": 2.5751824378967285, "learning_rate": 4.3968922833816176e-05, "loss": 1.7099, "step": 4585 }, { "epoch": 0.1207795628127469, "grad_norm": 2.2981085777282715, "learning_rate": 4.3967606004740585e-05, "loss": 1.779, "step": 4586 }, { "epoch": 0.12080589939425862, "grad_norm": 2.2591090202331543, "learning_rate": 4.3966289175665e-05, "loss": 1.4552, "step": 4587 }, { "epoch": 0.12083223597577035, "grad_norm": 3.642763137817383, "learning_rate": 4.3964972346589416e-05, "loss": 1.5614, "step": 4588 }, { "epoch": 0.12085857255728207, "grad_norm": 2.989940881729126, "learning_rate": 4.396365551751383e-05, "loss": 1.5025, "step": 4589 }, { "epoch": 0.12088490913879378, "grad_norm": 3.334362745285034, "learning_rate": 4.396233868843824e-05, "loss": 1.6429, "step": 4590 }, { "epoch": 0.12091124572030551, "grad_norm": 2.275623083114624, "learning_rate": 4.3961021859362656e-05, "loss": 1.6776, "step": 4591 }, { "epoch": 0.12093758230181723, "grad_norm": 3.960989475250244, "learning_rate": 4.395970503028707e-05, "loss": 1.333, "step": 4592 }, { "epoch": 0.12096391888332894, "grad_norm": 2.0593600273132324, "learning_rate": 4.395838820121149e-05, "loss": 2.0727, "step": 4593 }, { "epoch": 0.12099025546484066, "grad_norm": 2.603482484817505, "learning_rate": 4.39570713721359e-05, "loss": 1.8435, "step": 4594 }, { "epoch": 0.12101659204635239, "grad_norm": 2.5595359802246094, 
"learning_rate": 4.395575454306031e-05, "loss": 2.0078, "step": 4595 }, { "epoch": 0.1210429286278641, "grad_norm": 2.7202510833740234, "learning_rate": 4.395443771398473e-05, "loss": 1.2769, "step": 4596 }, { "epoch": 0.12106926520937582, "grad_norm": 2.8965611457824707, "learning_rate": 4.395312088490914e-05, "loss": 0.5525, "step": 4597 }, { "epoch": 0.12109560179088755, "grad_norm": 3.5813839435577393, "learning_rate": 4.395180405583356e-05, "loss": 1.3512, "step": 4598 }, { "epoch": 0.12112193837239926, "grad_norm": 2.405693769454956, "learning_rate": 4.395048722675797e-05, "loss": 1.937, "step": 4599 }, { "epoch": 0.12114827495391098, "grad_norm": 2.0294175148010254, "learning_rate": 4.394917039768238e-05, "loss": 1.4466, "step": 4600 }, { "epoch": 0.12117461153542271, "grad_norm": 2.122802972793579, "learning_rate": 4.39478535686068e-05, "loss": 2.1325, "step": 4601 }, { "epoch": 0.12120094811693442, "grad_norm": 3.272130012512207, "learning_rate": 4.3946536739531214e-05, "loss": 2.0251, "step": 4602 }, { "epoch": 0.12122728469844614, "grad_norm": 1.7862468957901, "learning_rate": 4.394521991045563e-05, "loss": 2.0858, "step": 4603 }, { "epoch": 0.12125362127995787, "grad_norm": 1.9192084074020386, "learning_rate": 4.394390308138004e-05, "loss": 1.0839, "step": 4604 }, { "epoch": 0.12127995786146958, "grad_norm": 3.5498342514038086, "learning_rate": 4.3942586252304454e-05, "loss": 2.2891, "step": 4605 }, { "epoch": 0.1213062944429813, "grad_norm": 1.8106166124343872, "learning_rate": 4.394126942322886e-05, "loss": 2.3355, "step": 4606 }, { "epoch": 0.12133263102449302, "grad_norm": 3.650259256362915, "learning_rate": 4.3939952594153285e-05, "loss": 1.3046, "step": 4607 }, { "epoch": 0.12135896760600474, "grad_norm": 2.2454311847686768, "learning_rate": 4.3938635765077694e-05, "loss": 1.5541, "step": 4608 }, { "epoch": 0.12138530418751646, "grad_norm": 1.8960703611373901, "learning_rate": 4.393731893600211e-05, "loss": 0.7282, "step": 4609 }, { "epoch": 
0.12141164076902818, "grad_norm": 5.468796730041504, "learning_rate": 4.393600210692652e-05, "loss": 1.2263, "step": 4610 }, { "epoch": 0.1214379773505399, "grad_norm": 2.372835636138916, "learning_rate": 4.3934685277850934e-05, "loss": 1.4008, "step": 4611 }, { "epoch": 0.12146431393205162, "grad_norm": 1.8523073196411133, "learning_rate": 4.393336844877535e-05, "loss": 0.6759, "step": 4612 }, { "epoch": 0.12149065051356334, "grad_norm": 3.4270529747009277, "learning_rate": 4.3932051619699765e-05, "loss": 2.3706, "step": 4613 }, { "epoch": 0.12151698709507507, "grad_norm": 3.632455348968506, "learning_rate": 4.393073479062418e-05, "loss": 1.1427, "step": 4614 }, { "epoch": 0.12154332367658678, "grad_norm": 6.611170768737793, "learning_rate": 4.392941796154859e-05, "loss": 1.3079, "step": 4615 }, { "epoch": 0.1215696602580985, "grad_norm": 3.0112364292144775, "learning_rate": 4.392810113247301e-05, "loss": 2.2547, "step": 4616 }, { "epoch": 0.12159599683961021, "grad_norm": 2.2450125217437744, "learning_rate": 4.392678430339742e-05, "loss": 1.7717, "step": 4617 }, { "epoch": 0.12162233342112194, "grad_norm": 3.6134462356567383, "learning_rate": 4.3925467474321836e-05, "loss": 2.7018, "step": 4618 }, { "epoch": 0.12164867000263366, "grad_norm": 2.246868848800659, "learning_rate": 4.3924150645246245e-05, "loss": 2.1674, "step": 4619 }, { "epoch": 0.12167500658414537, "grad_norm": 1.966476559638977, "learning_rate": 4.392283381617066e-05, "loss": 1.8422, "step": 4620 }, { "epoch": 0.1217013431656571, "grad_norm": 3.2045938968658447, "learning_rate": 4.3921516987095076e-05, "loss": 1.2722, "step": 4621 }, { "epoch": 0.12172767974716882, "grad_norm": 6.169275283813477, "learning_rate": 4.392020015801949e-05, "loss": 1.1417, "step": 4622 }, { "epoch": 0.12175401632868053, "grad_norm": 3.556260347366333, "learning_rate": 4.391888332894391e-05, "loss": 1.1964, "step": 4623 }, { "epoch": 0.12178035291019226, "grad_norm": 2.3873257637023926, "learning_rate": 
4.3917566499868316e-05, "loss": 1.9118, "step": 4624 }, { "epoch": 0.12180668949170398, "grad_norm": 2.158400774002075, "learning_rate": 4.391624967079273e-05, "loss": 1.7923, "step": 4625 }, { "epoch": 0.1218330260732157, "grad_norm": 2.245790719985962, "learning_rate": 4.391493284171715e-05, "loss": 1.4026, "step": 4626 }, { "epoch": 0.12185936265472741, "grad_norm": 1.8083654642105103, "learning_rate": 4.391361601264156e-05, "loss": 1.8232, "step": 4627 }, { "epoch": 0.12188569923623914, "grad_norm": 4.552290439605713, "learning_rate": 4.391229918356597e-05, "loss": 1.6143, "step": 4628 }, { "epoch": 0.12191203581775086, "grad_norm": 2.729383945465088, "learning_rate": 4.391098235449039e-05, "loss": 1.6407, "step": 4629 }, { "epoch": 0.12193837239926257, "grad_norm": 2.899068593978882, "learning_rate": 4.39096655254148e-05, "loss": 2.163, "step": 4630 }, { "epoch": 0.1219647089807743, "grad_norm": 1.8001106977462769, "learning_rate": 4.390834869633922e-05, "loss": 1.5818, "step": 4631 }, { "epoch": 0.12199104556228602, "grad_norm": 5.082118988037109, "learning_rate": 4.3907031867263634e-05, "loss": 1.2649, "step": 4632 }, { "epoch": 0.12201738214379773, "grad_norm": 1.9306411743164062, "learning_rate": 4.390571503818804e-05, "loss": 1.8876, "step": 4633 }, { "epoch": 0.12204371872530946, "grad_norm": 4.088321685791016, "learning_rate": 4.390439820911246e-05, "loss": 0.6823, "step": 4634 }, { "epoch": 0.12207005530682118, "grad_norm": 1.8974683284759521, "learning_rate": 4.3903081380036874e-05, "loss": 0.7718, "step": 4635 }, { "epoch": 0.12209639188833289, "grad_norm": 1.7598472833633423, "learning_rate": 4.390176455096129e-05, "loss": 1.6616, "step": 4636 }, { "epoch": 0.12212272846984461, "grad_norm": 2.717180013656616, "learning_rate": 4.39004477218857e-05, "loss": 2.1474, "step": 4637 }, { "epoch": 0.12214906505135634, "grad_norm": 1.8452847003936768, "learning_rate": 4.3899130892810114e-05, "loss": 0.1924, "step": 4638 }, { "epoch": 0.12217540163286805, 
"grad_norm": 2.279766798019409, "learning_rate": 4.389781406373453e-05, "loss": 1.6768, "step": 4639 }, { "epoch": 0.12220173821437977, "grad_norm": 2.480419874191284, "learning_rate": 4.3896497234658946e-05, "loss": 1.458, "step": 4640 }, { "epoch": 0.1222280747958915, "grad_norm": 1.9092397689819336, "learning_rate": 4.389518040558336e-05, "loss": 1.0179, "step": 4641 }, { "epoch": 0.12225441137740321, "grad_norm": 3.4327282905578613, "learning_rate": 4.389386357650777e-05, "loss": 1.774, "step": 4642 }, { "epoch": 0.12228074795891493, "grad_norm": 3.2832391262054443, "learning_rate": 4.3892546747432186e-05, "loss": 1.3595, "step": 4643 }, { "epoch": 0.12230708454042666, "grad_norm": 2.413118362426758, "learning_rate": 4.3891229918356594e-05, "loss": 1.3041, "step": 4644 }, { "epoch": 0.12233342112193837, "grad_norm": 2.2710001468658447, "learning_rate": 4.388991308928102e-05, "loss": 1.5093, "step": 4645 }, { "epoch": 0.12235975770345009, "grad_norm": 3.111807107925415, "learning_rate": 4.3888596260205426e-05, "loss": 1.9066, "step": 4646 }, { "epoch": 0.12238609428496182, "grad_norm": 1.690782070159912, "learning_rate": 4.388727943112984e-05, "loss": 1.5699, "step": 4647 }, { "epoch": 0.12241243086647353, "grad_norm": 2.160367012023926, "learning_rate": 4.388596260205426e-05, "loss": 1.564, "step": 4648 }, { "epoch": 0.12243876744798525, "grad_norm": 2.432783842086792, "learning_rate": 4.388464577297867e-05, "loss": 2.3584, "step": 4649 }, { "epoch": 0.12246510402949697, "grad_norm": 3.1996405124664307, "learning_rate": 4.388332894390309e-05, "loss": 1.7356, "step": 4650 }, { "epoch": 0.1224914406110087, "grad_norm": 1.723605751991272, "learning_rate": 4.38820121148275e-05, "loss": 1.0746, "step": 4651 }, { "epoch": 0.12251777719252041, "grad_norm": 2.0372090339660645, "learning_rate": 4.388069528575191e-05, "loss": 1.2094, "step": 4652 }, { "epoch": 0.12254411377403213, "grad_norm": 1.6394349336624146, "learning_rate": 4.387937845667632e-05, "loss": 2.4443, 
"step": 4653 }, { "epoch": 0.12257045035554386, "grad_norm": 4.406085968017578, "learning_rate": 4.3878061627600744e-05, "loss": 1.4122, "step": 4654 }, { "epoch": 0.12259678693705557, "grad_norm": 3.4900963306427, "learning_rate": 4.387674479852515e-05, "loss": 2.1812, "step": 4655 }, { "epoch": 0.12262312351856729, "grad_norm": 2.078904867172241, "learning_rate": 4.387542796944957e-05, "loss": 1.9236, "step": 4656 }, { "epoch": 0.12264946010007902, "grad_norm": 2.413954734802246, "learning_rate": 4.387411114037398e-05, "loss": 2.0406, "step": 4657 }, { "epoch": 0.12267579668159073, "grad_norm": 1.8602428436279297, "learning_rate": 4.387279431129839e-05, "loss": 1.5564, "step": 4658 }, { "epoch": 0.12270213326310245, "grad_norm": 2.4664242267608643, "learning_rate": 4.387147748222281e-05, "loss": 1.7709, "step": 4659 }, { "epoch": 0.12272846984461416, "grad_norm": 2.4134647846221924, "learning_rate": 4.3870160653147224e-05, "loss": 2.1152, "step": 4660 }, { "epoch": 0.12275480642612589, "grad_norm": 2.130995750427246, "learning_rate": 4.386884382407164e-05, "loss": 1.8566, "step": 4661 }, { "epoch": 0.12278114300763761, "grad_norm": 1.8650908470153809, "learning_rate": 4.386752699499605e-05, "loss": 2.2976, "step": 4662 }, { "epoch": 0.12280747958914932, "grad_norm": 2.470064640045166, "learning_rate": 4.386621016592047e-05, "loss": 0.8065, "step": 4663 }, { "epoch": 0.12283381617066105, "grad_norm": 3.8376693725585938, "learning_rate": 4.386489333684488e-05, "loss": 0.8042, "step": 4664 }, { "epoch": 0.12286015275217277, "grad_norm": 1.9035619497299194, "learning_rate": 4.3863576507769295e-05, "loss": 2.5298, "step": 4665 }, { "epoch": 0.12288648933368448, "grad_norm": 2.975315809249878, "learning_rate": 4.3862259678693704e-05, "loss": 1.054, "step": 4666 }, { "epoch": 0.12291282591519621, "grad_norm": 3.355088233947754, "learning_rate": 4.386094284961812e-05, "loss": 1.1019, "step": 4667 }, { "epoch": 0.12293916249670793, "grad_norm": 3.4372174739837646, 
"learning_rate": 4.3859626020542535e-05, "loss": 1.185, "step": 4668 }, { "epoch": 0.12296549907821964, "grad_norm": 1.6783677339553833, "learning_rate": 4.385830919146695e-05, "loss": 1.6823, "step": 4669 }, { "epoch": 0.12299183565973136, "grad_norm": 1.8151222467422485, "learning_rate": 4.3856992362391366e-05, "loss": 0.9305, "step": 4670 }, { "epoch": 0.12301817224124309, "grad_norm": 2.9164371490478516, "learning_rate": 4.3855675533315775e-05, "loss": 1.2978, "step": 4671 }, { "epoch": 0.1230445088227548, "grad_norm": 1.9816832542419434, "learning_rate": 4.385435870424019e-05, "loss": 2.2482, "step": 4672 }, { "epoch": 0.12307084540426652, "grad_norm": 2.254826307296753, "learning_rate": 4.3853041875164606e-05, "loss": 1.6485, "step": 4673 }, { "epoch": 0.12309718198577825, "grad_norm": 2.6538503170013428, "learning_rate": 4.385172504608902e-05, "loss": 1.1908, "step": 4674 }, { "epoch": 0.12312351856728997, "grad_norm": 2.3163418769836426, "learning_rate": 4.385040821701343e-05, "loss": 1.822, "step": 4675 }, { "epoch": 0.12314985514880168, "grad_norm": 3.323758363723755, "learning_rate": 4.3849091387937846e-05, "loss": 0.9073, "step": 4676 }, { "epoch": 0.12317619173031341, "grad_norm": 2.26656174659729, "learning_rate": 4.384777455886226e-05, "loss": 1.5782, "step": 4677 }, { "epoch": 0.12320252831182513, "grad_norm": 2.4748458862304688, "learning_rate": 4.384645772978668e-05, "loss": 1.8417, "step": 4678 }, { "epoch": 0.12322886489333684, "grad_norm": 6.037837982177734, "learning_rate": 4.384514090071109e-05, "loss": 1.5944, "step": 4679 }, { "epoch": 0.12325520147484856, "grad_norm": 2.3028833866119385, "learning_rate": 4.38438240716355e-05, "loss": 1.4918, "step": 4680 }, { "epoch": 0.12328153805636029, "grad_norm": 2.6609320640563965, "learning_rate": 4.384250724255992e-05, "loss": 1.5208, "step": 4681 }, { "epoch": 0.123307874637872, "grad_norm": 1.6242191791534424, "learning_rate": 4.384119041348433e-05, "loss": 1.9448, "step": 4682 }, { "epoch": 
0.12333421121938372, "grad_norm": 1.7447011470794678, "learning_rate": 4.383987358440875e-05, "loss": 1.5254, "step": 4683 }, { "epoch": 0.12336054780089545, "grad_norm": 2.2075204849243164, "learning_rate": 4.383855675533316e-05, "loss": 2.1902, "step": 4684 }, { "epoch": 0.12338688438240716, "grad_norm": 1.8057124614715576, "learning_rate": 4.383723992625757e-05, "loss": 2.1283, "step": 4685 }, { "epoch": 0.12341322096391888, "grad_norm": 4.820918083190918, "learning_rate": 4.383592309718199e-05, "loss": 1.2956, "step": 4686 }, { "epoch": 0.12343955754543061, "grad_norm": 1.7067325115203857, "learning_rate": 4.3834606268106404e-05, "loss": 1.4117, "step": 4687 }, { "epoch": 0.12346589412694232, "grad_norm": 2.3989779949188232, "learning_rate": 4.383328943903082e-05, "loss": 2.0893, "step": 4688 }, { "epoch": 0.12349223070845404, "grad_norm": 2.5785791873931885, "learning_rate": 4.383197260995523e-05, "loss": 1.694, "step": 4689 }, { "epoch": 0.12351856728996576, "grad_norm": 2.5003888607025146, "learning_rate": 4.3830655780879644e-05, "loss": 1.7038, "step": 4690 }, { "epoch": 0.12354490387147748, "grad_norm": 2.5338175296783447, "learning_rate": 4.382933895180405e-05, "loss": 1.5793, "step": 4691 }, { "epoch": 0.1235712404529892, "grad_norm": 3.209324598312378, "learning_rate": 4.3828022122728475e-05, "loss": 0.8265, "step": 4692 }, { "epoch": 0.12359757703450092, "grad_norm": 2.568403959274292, "learning_rate": 4.3826705293652884e-05, "loss": 1.5413, "step": 4693 }, { "epoch": 0.12362391361601265, "grad_norm": 1.9092358350753784, "learning_rate": 4.38253884645773e-05, "loss": 1.9075, "step": 4694 }, { "epoch": 0.12365025019752436, "grad_norm": 1.8922903537750244, "learning_rate": 4.3824071635501715e-05, "loss": 1.7268, "step": 4695 }, { "epoch": 0.12367658677903608, "grad_norm": 1.9539427757263184, "learning_rate": 4.382275480642613e-05, "loss": 2.9814, "step": 4696 }, { "epoch": 0.1237029233605478, "grad_norm": 1.4077868461608887, "learning_rate": 
4.3821437977350547e-05, "loss": 1.6842, "step": 4697 }, { "epoch": 0.12372925994205952, "grad_norm": 3.8124887943267822, "learning_rate": 4.3820121148274955e-05, "loss": 1.8621, "step": 4698 }, { "epoch": 0.12375559652357124, "grad_norm": 2.13594913482666, "learning_rate": 4.381880431919937e-05, "loss": 0.4539, "step": 4699 }, { "epoch": 0.12378193310508297, "grad_norm": 2.145198345184326, "learning_rate": 4.381748749012378e-05, "loss": 1.0364, "step": 4700 }, { "epoch": 0.12380826968659468, "grad_norm": 1.9599852561950684, "learning_rate": 4.38161706610482e-05, "loss": 1.3169, "step": 4701 }, { "epoch": 0.1238346062681064, "grad_norm": 3.1412858963012695, "learning_rate": 4.381485383197261e-05, "loss": 1.6835, "step": 4702 }, { "epoch": 0.12386094284961811, "grad_norm": 3.127105712890625, "learning_rate": 4.3813537002897027e-05, "loss": 1.308, "step": 4703 }, { "epoch": 0.12388727943112984, "grad_norm": 3.4913148880004883, "learning_rate": 4.381222017382144e-05, "loss": 1.629, "step": 4704 }, { "epoch": 0.12391361601264156, "grad_norm": 3.8447394371032715, "learning_rate": 4.381090334474585e-05, "loss": 2.0455, "step": 4705 }, { "epoch": 0.12393995259415327, "grad_norm": 1.6441948413848877, "learning_rate": 4.380958651567027e-05, "loss": 0.2486, "step": 4706 }, { "epoch": 0.123966289175665, "grad_norm": 2.0133681297302246, "learning_rate": 4.380826968659468e-05, "loss": 1.5702, "step": 4707 }, { "epoch": 0.12399262575717672, "grad_norm": 2.8952345848083496, "learning_rate": 4.38069528575191e-05, "loss": 1.6503, "step": 4708 }, { "epoch": 0.12401896233868843, "grad_norm": 3.1991989612579346, "learning_rate": 4.380563602844351e-05, "loss": 1.5906, "step": 4709 }, { "epoch": 0.12404529892020016, "grad_norm": 1.8866389989852905, "learning_rate": 4.380431919936792e-05, "loss": 1.7536, "step": 4710 }, { "epoch": 0.12407163550171188, "grad_norm": 2.81256365776062, "learning_rate": 4.380300237029234e-05, "loss": 1.305, "step": 4711 }, { "epoch": 0.1240979720832236, 
"grad_norm": 4.803744316101074, "learning_rate": 4.3801685541216753e-05, "loss": 1.7908, "step": 4712 }, { "epoch": 0.12412430866473531, "grad_norm": 2.1657397747039795, "learning_rate": 4.380036871214116e-05, "loss": 1.4083, "step": 4713 }, { "epoch": 0.12415064524624704, "grad_norm": 2.398895025253296, "learning_rate": 4.379905188306558e-05, "loss": 1.4321, "step": 4714 }, { "epoch": 0.12417698182775876, "grad_norm": 2.256537914276123, "learning_rate": 4.3797735053989993e-05, "loss": 1.5301, "step": 4715 }, { "epoch": 0.12420331840927047, "grad_norm": 2.276139736175537, "learning_rate": 4.379641822491441e-05, "loss": 2.0023, "step": 4716 }, { "epoch": 0.1242296549907822, "grad_norm": 4.326174259185791, "learning_rate": 4.3795101395838825e-05, "loss": 1.5146, "step": 4717 }, { "epoch": 0.12425599157229392, "grad_norm": 4.015639781951904, "learning_rate": 4.3793784566763233e-05, "loss": 0.9385, "step": 4718 }, { "epoch": 0.12428232815380563, "grad_norm": 2.3562822341918945, "learning_rate": 4.379246773768765e-05, "loss": 1.251, "step": 4719 }, { "epoch": 0.12430866473531736, "grad_norm": 1.6875540018081665, "learning_rate": 4.3791150908612065e-05, "loss": 2.0256, "step": 4720 }, { "epoch": 0.12433500131682908, "grad_norm": 3.528935432434082, "learning_rate": 4.378983407953648e-05, "loss": 0.6218, "step": 4721 }, { "epoch": 0.12436133789834079, "grad_norm": 2.5248193740844727, "learning_rate": 4.378851725046089e-05, "loss": 1.5558, "step": 4722 }, { "epoch": 0.12438767447985251, "grad_norm": 2.8863699436187744, "learning_rate": 4.3787200421385305e-05, "loss": 2.2871, "step": 4723 }, { "epoch": 0.12441401106136424, "grad_norm": 2.0777347087860107, "learning_rate": 4.378588359230972e-05, "loss": 1.4763, "step": 4724 }, { "epoch": 0.12444034764287595, "grad_norm": 4.451682090759277, "learning_rate": 4.3784566763234136e-05, "loss": 1.5911, "step": 4725 }, { "epoch": 0.12446668422438767, "grad_norm": 1.9852718114852905, "learning_rate": 4.378324993415855e-05, "loss": 
2.0354, "step": 4726 }, { "epoch": 0.1244930208058994, "grad_norm": 4.1549530029296875, "learning_rate": 4.378193310508296e-05, "loss": 1.0154, "step": 4727 }, { "epoch": 0.12451935738741111, "grad_norm": 2.3205487728118896, "learning_rate": 4.3780616276007376e-05, "loss": 1.8725, "step": 4728 }, { "epoch": 0.12454569396892283, "grad_norm": 1.9256564378738403, "learning_rate": 4.377929944693179e-05, "loss": 1.7368, "step": 4729 }, { "epoch": 0.12457203055043456, "grad_norm": 2.4510157108306885, "learning_rate": 4.377798261785621e-05, "loss": 1.9596, "step": 4730 }, { "epoch": 0.12459836713194627, "grad_norm": 2.789548635482788, "learning_rate": 4.3776665788780616e-05, "loss": 2.0809, "step": 4731 }, { "epoch": 0.12462470371345799, "grad_norm": 2.673419713973999, "learning_rate": 4.377534895970503e-05, "loss": 3.0775, "step": 4732 }, { "epoch": 0.1246510402949697, "grad_norm": 2.358151435852051, "learning_rate": 4.377403213062945e-05, "loss": 1.8011, "step": 4733 }, { "epoch": 0.12467737687648144, "grad_norm": 1.626806378364563, "learning_rate": 4.377271530155386e-05, "loss": 1.8686, "step": 4734 }, { "epoch": 0.12470371345799315, "grad_norm": 3.8930649757385254, "learning_rate": 4.377139847247828e-05, "loss": 1.2501, "step": 4735 }, { "epoch": 0.12473005003950487, "grad_norm": 6.212684154510498, "learning_rate": 4.377008164340269e-05, "loss": 2.0236, "step": 4736 }, { "epoch": 0.1247563866210166, "grad_norm": 3.866426944732666, "learning_rate": 4.37687648143271e-05, "loss": 1.3742, "step": 4737 }, { "epoch": 0.12478272320252831, "grad_norm": 3.705449104309082, "learning_rate": 4.376744798525151e-05, "loss": 1.393, "step": 4738 }, { "epoch": 0.12480905978404003, "grad_norm": 1.7463220357894897, "learning_rate": 4.3766131156175934e-05, "loss": 1.9235, "step": 4739 }, { "epoch": 0.12483539636555176, "grad_norm": 2.7118091583251953, "learning_rate": 4.376481432710034e-05, "loss": 2.5418, "step": 4740 }, { "epoch": 0.12486173294706347, "grad_norm": 3.8343799114227295, 
"learning_rate": 4.376349749802476e-05, "loss": 1.697, "step": 4741 }, { "epoch": 0.12488806952857519, "grad_norm": 1.4628170728683472, "learning_rate": 4.3762180668949174e-05, "loss": 2.3846, "step": 4742 }, { "epoch": 0.12491440611008692, "grad_norm": 1.8428781032562256, "learning_rate": 4.376086383987358e-05, "loss": 1.7052, "step": 4743 }, { "epoch": 0.12494074269159863, "grad_norm": 3.012197256088257, "learning_rate": 4.3759547010798005e-05, "loss": 1.3005, "step": 4744 }, { "epoch": 0.12496707927311035, "grad_norm": 2.2392208576202393, "learning_rate": 4.3758230181722414e-05, "loss": 1.1542, "step": 4745 }, { "epoch": 0.12499341585462206, "grad_norm": 3.821258306503296, "learning_rate": 4.375691335264683e-05, "loss": 0.7939, "step": 4746 }, { "epoch": 0.12501975243613378, "grad_norm": 1.9974212646484375, "learning_rate": 4.375559652357124e-05, "loss": 1.8858, "step": 4747 }, { "epoch": 0.12504608901764552, "grad_norm": 2.2754218578338623, "learning_rate": 4.375427969449566e-05, "loss": 1.9591, "step": 4748 }, { "epoch": 0.12507242559915724, "grad_norm": 1.5792393684387207, "learning_rate": 4.375296286542007e-05, "loss": 0.8318, "step": 4749 }, { "epoch": 0.12509876218066895, "grad_norm": 2.1220436096191406, "learning_rate": 4.3751646036344485e-05, "loss": 1.5546, "step": 4750 }, { "epoch": 0.12512509876218067, "grad_norm": 3.427337408065796, "learning_rate": 4.37503292072689e-05, "loss": 1.6584, "step": 4751 }, { "epoch": 0.12515143534369239, "grad_norm": 3.002098321914673, "learning_rate": 4.374901237819331e-05, "loss": 2.036, "step": 4752 }, { "epoch": 0.1251777719252041, "grad_norm": 2.1741037368774414, "learning_rate": 4.374769554911773e-05, "loss": 1.4488, "step": 4753 }, { "epoch": 0.12520410850671582, "grad_norm": 2.436464309692383, "learning_rate": 4.374637872004214e-05, "loss": 1.5231, "step": 4754 }, { "epoch": 0.12523044508822756, "grad_norm": 2.3738396167755127, "learning_rate": 4.3745061890966556e-05, "loss": 1.8681, "step": 4755 }, { "epoch": 
0.12525678166973928, "grad_norm": 4.001001834869385, "learning_rate": 4.3743745061890965e-05, "loss": 1.1224, "step": 4756 }, { "epoch": 0.125283118251251, "grad_norm": 3.5733444690704346, "learning_rate": 4.374242823281538e-05, "loss": 1.6748, "step": 4757 }, { "epoch": 0.1253094548327627, "grad_norm": 8.686369895935059, "learning_rate": 4.3741111403739796e-05, "loss": 2.6691, "step": 4758 }, { "epoch": 0.12533579141427442, "grad_norm": 2.162923574447632, "learning_rate": 4.373979457466421e-05, "loss": 1.5718, "step": 4759 }, { "epoch": 0.12536212799578614, "grad_norm": 3.8729188442230225, "learning_rate": 4.373847774558862e-05, "loss": 1.7658, "step": 4760 }, { "epoch": 0.12538846457729785, "grad_norm": 2.4855551719665527, "learning_rate": 4.3737160916513036e-05, "loss": 2.3868, "step": 4761 }, { "epoch": 0.1254148011588096, "grad_norm": 1.9619601964950562, "learning_rate": 4.373584408743745e-05, "loss": 1.2659, "step": 4762 }, { "epoch": 0.1254411377403213, "grad_norm": 2.4714925289154053, "learning_rate": 4.373452725836187e-05, "loss": 1.7941, "step": 4763 }, { "epoch": 0.12546747432183303, "grad_norm": 2.151272773742676, "learning_rate": 4.373321042928628e-05, "loss": 1.8296, "step": 4764 }, { "epoch": 0.12549381090334474, "grad_norm": 2.1843862533569336, "learning_rate": 4.373189360021069e-05, "loss": 1.5575, "step": 4765 }, { "epoch": 0.12552014748485646, "grad_norm": 2.729646682739258, "learning_rate": 4.373057677113511e-05, "loss": 0.8134, "step": 4766 }, { "epoch": 0.12554648406636817, "grad_norm": 2.3161377906799316, "learning_rate": 4.372925994205952e-05, "loss": 1.6068, "step": 4767 }, { "epoch": 0.12557282064787992, "grad_norm": 1.9280983209609985, "learning_rate": 4.372794311298394e-05, "loss": 1.9839, "step": 4768 }, { "epoch": 0.12559915722939163, "grad_norm": 1.9818122386932373, "learning_rate": 4.372662628390835e-05, "loss": 1.8064, "step": 4769 }, { "epoch": 0.12562549381090335, "grad_norm": 1.721292495727539, "learning_rate": 
4.372530945483276e-05, "loss": 0.533, "step": 4770 }, { "epoch": 0.12565183039241506, "grad_norm": 4.04070520401001, "learning_rate": 4.372399262575718e-05, "loss": 1.8365, "step": 4771 }, { "epoch": 0.12567816697392678, "grad_norm": 4.287169456481934, "learning_rate": 4.3722675796681594e-05, "loss": 1.5298, "step": 4772 }, { "epoch": 0.1257045035554385, "grad_norm": 2.601172685623169, "learning_rate": 4.372135896760601e-05, "loss": 1.0548, "step": 4773 }, { "epoch": 0.1257308401369502, "grad_norm": 2.757880210876465, "learning_rate": 4.372004213853042e-05, "loss": 1.7561, "step": 4774 }, { "epoch": 0.12575717671846195, "grad_norm": 2.030111789703369, "learning_rate": 4.3718725309454834e-05, "loss": 1.4399, "step": 4775 }, { "epoch": 0.12578351329997367, "grad_norm": 2.478980302810669, "learning_rate": 4.371740848037925e-05, "loss": 2.1174, "step": 4776 }, { "epoch": 0.12580984988148539, "grad_norm": 3.160599708557129, "learning_rate": 4.3716091651303666e-05, "loss": 1.8178, "step": 4777 }, { "epoch": 0.1258361864629971, "grad_norm": 1.8637760877609253, "learning_rate": 4.3714774822228074e-05, "loss": 1.6946, "step": 4778 }, { "epoch": 0.12586252304450882, "grad_norm": 2.038792371749878, "learning_rate": 4.371345799315249e-05, "loss": 1.9369, "step": 4779 }, { "epoch": 0.12588885962602053, "grad_norm": 2.1864607334136963, "learning_rate": 4.3712141164076906e-05, "loss": 1.6541, "step": 4780 }, { "epoch": 0.12591519620753228, "grad_norm": 3.3782198429107666, "learning_rate": 4.371082433500132e-05, "loss": 0.4839, "step": 4781 }, { "epoch": 0.125941532789044, "grad_norm": 2.9364256858825684, "learning_rate": 4.370950750592574e-05, "loss": 2.1456, "step": 4782 }, { "epoch": 0.1259678693705557, "grad_norm": 4.76910924911499, "learning_rate": 4.3708190676850146e-05, "loss": 1.4221, "step": 4783 }, { "epoch": 0.12599420595206742, "grad_norm": 1.7972780466079712, "learning_rate": 4.370687384777456e-05, "loss": 1.9187, "step": 4784 }, { "epoch": 0.12602054253357914, 
"grad_norm": 2.302013397216797, "learning_rate": 4.370555701869897e-05, "loss": 1.759, "step": 4785 }, { "epoch": 0.12604687911509085, "grad_norm": 2.296588659286499, "learning_rate": 4.370424018962339e-05, "loss": 1.8282, "step": 4786 }, { "epoch": 0.12607321569660257, "grad_norm": 2.1458377838134766, "learning_rate": 4.37029233605478e-05, "loss": 1.2792, "step": 4787 }, { "epoch": 0.1260995522781143, "grad_norm": 1.976080060005188, "learning_rate": 4.370160653147222e-05, "loss": 0.6266, "step": 4788 }, { "epoch": 0.12612588885962603, "grad_norm": 1.8807599544525146, "learning_rate": 4.370028970239663e-05, "loss": 1.45, "step": 4789 }, { "epoch": 0.12615222544113774, "grad_norm": 1.6742404699325562, "learning_rate": 4.369897287332104e-05, "loss": 1.5706, "step": 4790 }, { "epoch": 0.12617856202264946, "grad_norm": 2.7705278396606445, "learning_rate": 4.3697656044245464e-05, "loss": 1.9122, "step": 4791 }, { "epoch": 0.12620489860416118, "grad_norm": 2.441680669784546, "learning_rate": 4.369633921516987e-05, "loss": 1.7503, "step": 4792 }, { "epoch": 0.1262312351856729, "grad_norm": 2.1823110580444336, "learning_rate": 4.369502238609429e-05, "loss": 1.564, "step": 4793 }, { "epoch": 0.1262575717671846, "grad_norm": 1.9613033533096313, "learning_rate": 4.36937055570187e-05, "loss": 1.7863, "step": 4794 }, { "epoch": 0.12628390834869635, "grad_norm": 1.6110904216766357, "learning_rate": 4.369238872794312e-05, "loss": 2.3363, "step": 4795 }, { "epoch": 0.12631024493020807, "grad_norm": 2.40889573097229, "learning_rate": 4.369107189886753e-05, "loss": 1.9754, "step": 4796 }, { "epoch": 0.12633658151171978, "grad_norm": 3.6205410957336426, "learning_rate": 4.3689755069791944e-05, "loss": 1.1272, "step": 4797 }, { "epoch": 0.1263629180932315, "grad_norm": 4.322226047515869, "learning_rate": 4.368843824071636e-05, "loss": 1.5361, "step": 4798 }, { "epoch": 0.1263892546747432, "grad_norm": 2.373908758163452, "learning_rate": 4.368712141164077e-05, "loss": 0.8833, "step": 
4799 }, { "epoch": 0.12641559125625493, "grad_norm": 3.53767466545105, "learning_rate": 4.368580458256519e-05, "loss": 1.4994, "step": 4800 }, { "epoch": 0.12644192783776667, "grad_norm": 2.071329116821289, "learning_rate": 4.36844877534896e-05, "loss": 1.8627, "step": 4801 }, { "epoch": 0.1264682644192784, "grad_norm": 1.9687137603759766, "learning_rate": 4.3683170924414015e-05, "loss": 2.0375, "step": 4802 }, { "epoch": 0.1264946010007901, "grad_norm": 1.76338529586792, "learning_rate": 4.3681854095338424e-05, "loss": 1.3896, "step": 4803 }, { "epoch": 0.12652093758230182, "grad_norm": 3.2583870887756348, "learning_rate": 4.368053726626284e-05, "loss": 1.7348, "step": 4804 }, { "epoch": 0.12654727416381353, "grad_norm": 2.3921329975128174, "learning_rate": 4.3679220437187255e-05, "loss": 2.3131, "step": 4805 }, { "epoch": 0.12657361074532525, "grad_norm": 3.8275365829467773, "learning_rate": 4.367790360811167e-05, "loss": 1.2922, "step": 4806 }, { "epoch": 0.12659994732683696, "grad_norm": 3.6275076866149902, "learning_rate": 4.3676586779036086e-05, "loss": 1.9291, "step": 4807 }, { "epoch": 0.1266262839083487, "grad_norm": 1.7525030374526978, "learning_rate": 4.3675269949960495e-05, "loss": 2.0664, "step": 4808 }, { "epoch": 0.12665262048986042, "grad_norm": 3.133984088897705, "learning_rate": 4.367395312088492e-05, "loss": 2.1156, "step": 4809 }, { "epoch": 0.12667895707137214, "grad_norm": 2.2137491703033447, "learning_rate": 4.3672636291809326e-05, "loss": 1.9825, "step": 4810 }, { "epoch": 0.12670529365288385, "grad_norm": 4.29949951171875, "learning_rate": 4.367131946273374e-05, "loss": 1.4226, "step": 4811 }, { "epoch": 0.12673163023439557, "grad_norm": 3.194653034210205, "learning_rate": 4.367000263365815e-05, "loss": 2.1576, "step": 4812 }, { "epoch": 0.12675796681590729, "grad_norm": 8.790407180786133, "learning_rate": 4.3668685804582566e-05, "loss": 0.582, "step": 4813 }, { "epoch": 0.126784303397419, "grad_norm": 1.9416428804397583, "learning_rate": 
4.366736897550698e-05, "loss": 1.3912, "step": 4814 }, { "epoch": 0.12681063997893074, "grad_norm": 2.547950029373169, "learning_rate": 4.36660521464314e-05, "loss": 2.3496, "step": 4815 }, { "epoch": 0.12683697656044246, "grad_norm": 2.2785916328430176, "learning_rate": 4.3664735317355806e-05, "loss": 1.5071, "step": 4816 }, { "epoch": 0.12686331314195418, "grad_norm": 3.295245409011841, "learning_rate": 4.366341848828022e-05, "loss": 0.7992, "step": 4817 }, { "epoch": 0.1268896497234659, "grad_norm": 3.8526058197021484, "learning_rate": 4.366210165920464e-05, "loss": 1.9285, "step": 4818 }, { "epoch": 0.1269159863049776, "grad_norm": 2.907660484313965, "learning_rate": 4.366078483012905e-05, "loss": 1.9041, "step": 4819 }, { "epoch": 0.12694232288648932, "grad_norm": 3.1727676391601562, "learning_rate": 4.365946800105347e-05, "loss": 1.0849, "step": 4820 }, { "epoch": 0.12696865946800107, "grad_norm": 2.2417054176330566, "learning_rate": 4.365815117197788e-05, "loss": 1.3252, "step": 4821 }, { "epoch": 0.12699499604951278, "grad_norm": 2.5461864471435547, "learning_rate": 4.365683434290229e-05, "loss": 1.3434, "step": 4822 }, { "epoch": 0.1270213326310245, "grad_norm": 3.2222328186035156, "learning_rate": 4.36555175138267e-05, "loss": 2.0012, "step": 4823 }, { "epoch": 0.1270476692125362, "grad_norm": 4.481986999511719, "learning_rate": 4.3654200684751124e-05, "loss": 1.6714, "step": 4824 }, { "epoch": 0.12707400579404793, "grad_norm": 1.9870394468307495, "learning_rate": 4.365288385567553e-05, "loss": 1.7413, "step": 4825 }, { "epoch": 0.12710034237555964, "grad_norm": 2.8645412921905518, "learning_rate": 4.365156702659995e-05, "loss": 0.7193, "step": 4826 }, { "epoch": 0.12712667895707136, "grad_norm": 4.1999406814575195, "learning_rate": 4.3650250197524364e-05, "loss": 1.0632, "step": 4827 }, { "epoch": 0.1271530155385831, "grad_norm": 2.2947449684143066, "learning_rate": 4.364893336844878e-05, "loss": 2.4117, "step": 4828 }, { "epoch": 0.12717935212009482, 
"grad_norm": 1.658942699432373, "learning_rate": 4.3647616539373195e-05, "loss": 1.7368, "step": 4829 }, { "epoch": 0.12720568870160653, "grad_norm": 3.9009971618652344, "learning_rate": 4.3646299710297604e-05, "loss": 2.0825, "step": 4830 }, { "epoch": 0.12723202528311825, "grad_norm": 1.627124309539795, "learning_rate": 4.364498288122202e-05, "loss": 1.1915, "step": 4831 }, { "epoch": 0.12725836186462997, "grad_norm": 1.7527695894241333, "learning_rate": 4.364366605214643e-05, "loss": 1.9, "step": 4832 }, { "epoch": 0.12728469844614168, "grad_norm": 2.3488717079162598, "learning_rate": 4.364234922307085e-05, "loss": 1.7931, "step": 4833 }, { "epoch": 0.12731103502765342, "grad_norm": 2.6414217948913574, "learning_rate": 4.364103239399526e-05, "loss": 1.2725, "step": 4834 }, { "epoch": 0.12733737160916514, "grad_norm": 2.2473621368408203, "learning_rate": 4.3639715564919675e-05, "loss": 1.6283, "step": 4835 }, { "epoch": 0.12736370819067686, "grad_norm": 5.0474348068237305, "learning_rate": 4.363839873584409e-05, "loss": 1.8299, "step": 4836 }, { "epoch": 0.12739004477218857, "grad_norm": 2.4065027236938477, "learning_rate": 4.36370819067685e-05, "loss": 1.1844, "step": 4837 }, { "epoch": 0.1274163813537003, "grad_norm": 2.514936923980713, "learning_rate": 4.363576507769292e-05, "loss": 1.6467, "step": 4838 }, { "epoch": 0.127442717935212, "grad_norm": 3.0012967586517334, "learning_rate": 4.363444824861733e-05, "loss": 2.094, "step": 4839 }, { "epoch": 0.12746905451672372, "grad_norm": 2.124875783920288, "learning_rate": 4.363313141954175e-05, "loss": 2.0942, "step": 4840 }, { "epoch": 0.12749539109823546, "grad_norm": 2.279796600341797, "learning_rate": 4.3631814590466155e-05, "loss": 1.5752, "step": 4841 }, { "epoch": 0.12752172767974718, "grad_norm": 2.1477503776550293, "learning_rate": 4.363049776139058e-05, "loss": 2.0256, "step": 4842 }, { "epoch": 0.1275480642612589, "grad_norm": 2.6695644855499268, "learning_rate": 4.362918093231499e-05, "loss": 1.5857, 
"step": 4843 }, { "epoch": 0.1275744008427706, "grad_norm": 2.1043524742126465, "learning_rate": 4.36278641032394e-05, "loss": 1.5499, "step": 4844 }, { "epoch": 0.12760073742428232, "grad_norm": 3.444185972213745, "learning_rate": 4.362654727416382e-05, "loss": 1.6529, "step": 4845 }, { "epoch": 0.12762707400579404, "grad_norm": 3.8504743576049805, "learning_rate": 4.362523044508823e-05, "loss": 1.5419, "step": 4846 }, { "epoch": 0.12765341058730575, "grad_norm": 1.9481192827224731, "learning_rate": 4.362391361601265e-05, "loss": 1.9174, "step": 4847 }, { "epoch": 0.1276797471688175, "grad_norm": 1.85298490524292, "learning_rate": 4.362259678693706e-05, "loss": 1.0213, "step": 4848 }, { "epoch": 0.1277060837503292, "grad_norm": 1.7944222688674927, "learning_rate": 4.3621279957861473e-05, "loss": 2.5456, "step": 4849 }, { "epoch": 0.12773242033184093, "grad_norm": 2.6556293964385986, "learning_rate": 4.361996312878588e-05, "loss": 2.1569, "step": 4850 }, { "epoch": 0.12775875691335264, "grad_norm": 1.8357347249984741, "learning_rate": 4.36186462997103e-05, "loss": 1.1135, "step": 4851 }, { "epoch": 0.12778509349486436, "grad_norm": 2.0435867309570312, "learning_rate": 4.3617329470634713e-05, "loss": 2.1799, "step": 4852 }, { "epoch": 0.12781143007637608, "grad_norm": 2.5294694900512695, "learning_rate": 4.361601264155913e-05, "loss": 0.7898, "step": 4853 }, { "epoch": 0.12783776665788782, "grad_norm": 2.220301389694214, "learning_rate": 4.3614695812483545e-05, "loss": 1.2596, "step": 4854 }, { "epoch": 0.12786410323939953, "grad_norm": 2.7293739318847656, "learning_rate": 4.3613378983407953e-05, "loss": 1.9187, "step": 4855 }, { "epoch": 0.12789043982091125, "grad_norm": 1.6175456047058105, "learning_rate": 4.361206215433237e-05, "loss": 1.8108, "step": 4856 }, { "epoch": 0.12791677640242297, "grad_norm": 2.1558797359466553, "learning_rate": 4.3610745325256785e-05, "loss": 1.8806, "step": 4857 }, { "epoch": 0.12794311298393468, "grad_norm": 2.410466432571411, 
"learning_rate": 4.36094284961812e-05, "loss": 2.3897, "step": 4858 }, { "epoch": 0.1279694495654464, "grad_norm": 1.6283433437347412, "learning_rate": 4.360811166710561e-05, "loss": 1.8579, "step": 4859 }, { "epoch": 0.1279957861469581, "grad_norm": 1.9884577989578247, "learning_rate": 4.3606794838030025e-05, "loss": 1.6454, "step": 4860 }, { "epoch": 0.12802212272846986, "grad_norm": 2.200735092163086, "learning_rate": 4.360547800895444e-05, "loss": 1.9783, "step": 4861 }, { "epoch": 0.12804845930998157, "grad_norm": 2.014044761657715, "learning_rate": 4.3604161179878856e-05, "loss": 2.2169, "step": 4862 }, { "epoch": 0.1280747958914933, "grad_norm": 1.9855282306671143, "learning_rate": 4.3602844350803265e-05, "loss": 1.971, "step": 4863 }, { "epoch": 0.128101132473005, "grad_norm": 2.1201257705688477, "learning_rate": 4.360152752172768e-05, "loss": 1.6651, "step": 4864 }, { "epoch": 0.12812746905451672, "grad_norm": 3.384716033935547, "learning_rate": 4.3600210692652096e-05, "loss": 1.2641, "step": 4865 }, { "epoch": 0.12815380563602843, "grad_norm": 2.70874285697937, "learning_rate": 4.359889386357651e-05, "loss": 1.9227, "step": 4866 }, { "epoch": 0.12818014221754015, "grad_norm": 2.4259488582611084, "learning_rate": 4.359757703450093e-05, "loss": 1.4218, "step": 4867 }, { "epoch": 0.1282064787990519, "grad_norm": 1.9935314655303955, "learning_rate": 4.3596260205425336e-05, "loss": 1.9993, "step": 4868 }, { "epoch": 0.1282328153805636, "grad_norm": 3.310474395751953, "learning_rate": 4.359494337634975e-05, "loss": 2.1742, "step": 4869 }, { "epoch": 0.12825915196207532, "grad_norm": 4.381847381591797, "learning_rate": 4.359362654727416e-05, "loss": 1.1536, "step": 4870 }, { "epoch": 0.12828548854358704, "grad_norm": 2.7546303272247314, "learning_rate": 4.359230971819858e-05, "loss": 1.9668, "step": 4871 }, { "epoch": 0.12831182512509876, "grad_norm": 4.072642803192139, "learning_rate": 4.359099288912299e-05, "loss": 1.9433, "step": 4872 }, { "epoch": 
0.12833816170661047, "grad_norm": 2.637031316757202, "learning_rate": 4.358967606004741e-05, "loss": 0.8301, "step": 4873 }, { "epoch": 0.12836449828812221, "grad_norm": 1.7782573699951172, "learning_rate": 4.358835923097182e-05, "loss": 2.1999, "step": 4874 }, { "epoch": 0.12839083486963393, "grad_norm": 2.4592955112457275, "learning_rate": 4.358704240189624e-05, "loss": 1.8854, "step": 4875 }, { "epoch": 0.12841717145114565, "grad_norm": 2.6669809818267822, "learning_rate": 4.3585725572820654e-05, "loss": 1.4189, "step": 4876 }, { "epoch": 0.12844350803265736, "grad_norm": 2.2287843227386475, "learning_rate": 4.358440874374506e-05, "loss": 1.8585, "step": 4877 }, { "epoch": 0.12846984461416908, "grad_norm": 1.9186570644378662, "learning_rate": 4.358309191466948e-05, "loss": 1.1951, "step": 4878 }, { "epoch": 0.1284961811956808, "grad_norm": 1.6467444896697998, "learning_rate": 4.358177508559389e-05, "loss": 1.4028, "step": 4879 }, { "epoch": 0.1285225177771925, "grad_norm": 2.876457452774048, "learning_rate": 4.358045825651831e-05, "loss": 2.62, "step": 4880 }, { "epoch": 0.12854885435870425, "grad_norm": 1.8239116668701172, "learning_rate": 4.357914142744272e-05, "loss": 1.6768, "step": 4881 }, { "epoch": 0.12857519094021597, "grad_norm": 1.834315538406372, "learning_rate": 4.3577824598367134e-05, "loss": 1.9142, "step": 4882 }, { "epoch": 0.12860152752172768, "grad_norm": 1.9506486654281616, "learning_rate": 4.357650776929155e-05, "loss": 1.9521, "step": 4883 }, { "epoch": 0.1286278641032394, "grad_norm": 2.1320502758026123, "learning_rate": 4.357519094021596e-05, "loss": 1.4065, "step": 4884 }, { "epoch": 0.1286542006847511, "grad_norm": 3.5862576961517334, "learning_rate": 4.357387411114038e-05, "loss": 1.5959, "step": 4885 }, { "epoch": 0.12868053726626283, "grad_norm": 3.104295492172241, "learning_rate": 4.357255728206479e-05, "loss": 2.0909, "step": 4886 }, { "epoch": 0.12870687384777457, "grad_norm": 2.3152544498443604, "learning_rate": 
4.3571240452989205e-05, "loss": 2.2944, "step": 4887 }, { "epoch": 0.1287332104292863, "grad_norm": 2.4190573692321777, "learning_rate": 4.3569923623913614e-05, "loss": 2.707, "step": 4888 }, { "epoch": 0.128759547010798, "grad_norm": 2.317089557647705, "learning_rate": 4.356860679483803e-05, "loss": 1.7596, "step": 4889 }, { "epoch": 0.12878588359230972, "grad_norm": 2.1676392555236816, "learning_rate": 4.3567289965762445e-05, "loss": 1.9278, "step": 4890 }, { "epoch": 0.12881222017382143, "grad_norm": 2.047677755355835, "learning_rate": 4.356597313668686e-05, "loss": 2.1574, "step": 4891 }, { "epoch": 0.12883855675533315, "grad_norm": 2.923931360244751, "learning_rate": 4.3564656307611276e-05, "loss": 1.8902, "step": 4892 }, { "epoch": 0.12886489333684487, "grad_norm": 2.223792791366577, "learning_rate": 4.3563339478535685e-05, "loss": 2.0463, "step": 4893 }, { "epoch": 0.1288912299183566, "grad_norm": 3.2405905723571777, "learning_rate": 4.356202264946011e-05, "loss": 0.458, "step": 4894 }, { "epoch": 0.12891756649986832, "grad_norm": 1.754435658454895, "learning_rate": 4.3560705820384516e-05, "loss": 1.1948, "step": 4895 }, { "epoch": 0.12894390308138004, "grad_norm": 1.9974554777145386, "learning_rate": 4.355938899130893e-05, "loss": 2.2276, "step": 4896 }, { "epoch": 0.12897023966289176, "grad_norm": 2.052645206451416, "learning_rate": 4.355807216223334e-05, "loss": 2.0828, "step": 4897 }, { "epoch": 0.12899657624440347, "grad_norm": 2.0300865173339844, "learning_rate": 4.3556755333157756e-05, "loss": 2.0949, "step": 4898 }, { "epoch": 0.1290229128259152, "grad_norm": 5.598296642303467, "learning_rate": 4.355543850408217e-05, "loss": 1.4868, "step": 4899 }, { "epoch": 0.1290492494074269, "grad_norm": 4.022143840789795, "learning_rate": 4.355412167500659e-05, "loss": 0.7322, "step": 4900 }, { "epoch": 0.12907558598893865, "grad_norm": 2.7101197242736816, "learning_rate": 4.3552804845931e-05, "loss": 1.8003, "step": 4901 }, { "epoch": 0.12910192257045036, 
"grad_norm": 3.0394439697265625, "learning_rate": 4.355148801685541e-05, "loss": 1.2617, "step": 4902 }, { "epoch": 0.12912825915196208, "grad_norm": 3.5920867919921875, "learning_rate": 4.355017118777983e-05, "loss": 1.0817, "step": 4903 }, { "epoch": 0.1291545957334738, "grad_norm": 2.459078311920166, "learning_rate": 4.354885435870424e-05, "loss": 1.3892, "step": 4904 }, { "epoch": 0.1291809323149855, "grad_norm": 3.5525012016296387, "learning_rate": 4.354753752962866e-05, "loss": 1.0874, "step": 4905 }, { "epoch": 0.12920726889649722, "grad_norm": 3.375631093978882, "learning_rate": 4.354622070055307e-05, "loss": 1.8962, "step": 4906 }, { "epoch": 0.12923360547800897, "grad_norm": 2.7625391483306885, "learning_rate": 4.354490387147748e-05, "loss": 2.3779, "step": 4907 }, { "epoch": 0.12925994205952068, "grad_norm": 2.535620927810669, "learning_rate": 4.35435870424019e-05, "loss": 1.8906, "step": 4908 }, { "epoch": 0.1292862786410324, "grad_norm": 3.0496389865875244, "learning_rate": 4.3542270213326314e-05, "loss": 2.121, "step": 4909 }, { "epoch": 0.12931261522254411, "grad_norm": 2.272063970565796, "learning_rate": 4.354095338425073e-05, "loss": 2.343, "step": 4910 }, { "epoch": 0.12933895180405583, "grad_norm": 3.0031349658966064, "learning_rate": 4.353963655517514e-05, "loss": 1.7176, "step": 4911 }, { "epoch": 0.12936528838556755, "grad_norm": 2.950576066970825, "learning_rate": 4.3538319726099554e-05, "loss": 1.4557, "step": 4912 }, { "epoch": 0.12939162496707926, "grad_norm": 2.217730760574341, "learning_rate": 4.353700289702397e-05, "loss": 1.4137, "step": 4913 }, { "epoch": 0.129417961548591, "grad_norm": 1.7896230220794678, "learning_rate": 4.3535686067948386e-05, "loss": 2.3877, "step": 4914 }, { "epoch": 0.12944429813010272, "grad_norm": 2.033858060836792, "learning_rate": 4.3534369238872794e-05, "loss": 1.6677, "step": 4915 }, { "epoch": 0.12947063471161444, "grad_norm": 5.609281063079834, "learning_rate": 4.353305240979721e-05, "loss": 2.0149, 
"step": 4916 }, { "epoch": 0.12949697129312615, "grad_norm": 1.972954511642456, "learning_rate": 4.353173558072162e-05, "loss": 0.5295, "step": 4917 }, { "epoch": 0.12952330787463787, "grad_norm": 4.894515037536621, "learning_rate": 4.353041875164604e-05, "loss": 1.9628, "step": 4918 }, { "epoch": 0.12954964445614958, "grad_norm": 2.383387804031372, "learning_rate": 4.352910192257045e-05, "loss": 2.2276, "step": 4919 }, { "epoch": 0.12957598103766133, "grad_norm": 4.4952521324157715, "learning_rate": 4.3527785093494866e-05, "loss": 1.7002, "step": 4920 }, { "epoch": 0.12960231761917304, "grad_norm": 4.960112571716309, "learning_rate": 4.352646826441928e-05, "loss": 1.9125, "step": 4921 }, { "epoch": 0.12962865420068476, "grad_norm": 1.7794078588485718, "learning_rate": 4.352515143534369e-05, "loss": 1.572, "step": 4922 }, { "epoch": 0.12965499078219647, "grad_norm": 2.117893695831299, "learning_rate": 4.352383460626811e-05, "loss": 1.9986, "step": 4923 }, { "epoch": 0.1296813273637082, "grad_norm": 1.951404094696045, "learning_rate": 4.352251777719252e-05, "loss": 1.8263, "step": 4924 }, { "epoch": 0.1297076639452199, "grad_norm": 1.9802696704864502, "learning_rate": 4.352120094811694e-05, "loss": 1.6162, "step": 4925 }, { "epoch": 0.12973400052673162, "grad_norm": 1.7484253644943237, "learning_rate": 4.3519884119041346e-05, "loss": 1.4627, "step": 4926 }, { "epoch": 0.12976033710824336, "grad_norm": 2.6771886348724365, "learning_rate": 4.351856728996577e-05, "loss": 1.9035, "step": 4927 }, { "epoch": 0.12978667368975508, "grad_norm": 3.7285354137420654, "learning_rate": 4.351725046089018e-05, "loss": 1.6032, "step": 4928 }, { "epoch": 0.1298130102712668, "grad_norm": 3.075129270553589, "learning_rate": 4.351593363181459e-05, "loss": 1.7752, "step": 4929 }, { "epoch": 0.1298393468527785, "grad_norm": 2.682663679122925, "learning_rate": 4.351461680273901e-05, "loss": 1.6784, "step": 4930 }, { "epoch": 0.12986568343429022, "grad_norm": 3.128638744354248, 
"learning_rate": 4.351329997366342e-05, "loss": 1.5732, "step": 4931 }, { "epoch": 0.12989202001580194, "grad_norm": 2.231123924255371, "learning_rate": 4.351198314458784e-05, "loss": 2.0879, "step": 4932 }, { "epoch": 0.12991835659731366, "grad_norm": 2.612884283065796, "learning_rate": 4.351066631551225e-05, "loss": 1.0566, "step": 4933 }, { "epoch": 0.1299446931788254, "grad_norm": 3.413738250732422, "learning_rate": 4.3509349486436664e-05, "loss": 1.9005, "step": 4934 }, { "epoch": 0.12997102976033711, "grad_norm": 3.060476779937744, "learning_rate": 4.350803265736107e-05, "loss": 1.1479, "step": 4935 }, { "epoch": 0.12999736634184883, "grad_norm": 1.8835155963897705, "learning_rate": 4.350671582828549e-05, "loss": 1.5931, "step": 4936 }, { "epoch": 0.13002370292336055, "grad_norm": 1.9978944063186646, "learning_rate": 4.3505398999209904e-05, "loss": 1.6942, "step": 4937 }, { "epoch": 0.13005003950487226, "grad_norm": 1.673864483833313, "learning_rate": 4.350408217013432e-05, "loss": 2.6585, "step": 4938 }, { "epoch": 0.13007637608638398, "grad_norm": 2.6167819499969482, "learning_rate": 4.3502765341058735e-05, "loss": 1.9704, "step": 4939 }, { "epoch": 0.13010271266789572, "grad_norm": 1.9489105939865112, "learning_rate": 4.3501448511983144e-05, "loss": 1.9714, "step": 4940 }, { "epoch": 0.13012904924940744, "grad_norm": 2.7575132846832275, "learning_rate": 4.3500131682907566e-05, "loss": 1.6787, "step": 4941 }, { "epoch": 0.13015538583091915, "grad_norm": 2.8675830364227295, "learning_rate": 4.3498814853831975e-05, "loss": 1.7496, "step": 4942 }, { "epoch": 0.13018172241243087, "grad_norm": 3.113664388656616, "learning_rate": 4.349749802475639e-05, "loss": 1.8952, "step": 4943 }, { "epoch": 0.13020805899394258, "grad_norm": 2.416672706604004, "learning_rate": 4.34961811956808e-05, "loss": 2.2312, "step": 4944 }, { "epoch": 0.1302343955754543, "grad_norm": 2.531726121902466, "learning_rate": 4.3494864366605215e-05, "loss": 1.9506, "step": 4945 }, { "epoch": 
0.130260732156966, "grad_norm": 2.0439252853393555, "learning_rate": 4.349354753752963e-05, "loss": 2.4105, "step": 4946 }, { "epoch": 0.13028706873847776, "grad_norm": 2.0388312339782715, "learning_rate": 4.3492230708454046e-05, "loss": 1.8746, "step": 4947 }, { "epoch": 0.13031340531998947, "grad_norm": 3.1810717582702637, "learning_rate": 4.349091387937846e-05, "loss": 0.7253, "step": 4948 }, { "epoch": 0.1303397419015012, "grad_norm": 6.146149635314941, "learning_rate": 4.348959705030287e-05, "loss": 1.1874, "step": 4949 }, { "epoch": 0.1303660784830129, "grad_norm": 13.208438873291016, "learning_rate": 4.3488280221227286e-05, "loss": 2.7628, "step": 4950 }, { "epoch": 0.13039241506452462, "grad_norm": 5.336616039276123, "learning_rate": 4.34869633921517e-05, "loss": 0.8695, "step": 4951 }, { "epoch": 0.13041875164603633, "grad_norm": 4.059808731079102, "learning_rate": 4.348564656307612e-05, "loss": 1.4424, "step": 4952 }, { "epoch": 0.13044508822754805, "grad_norm": 3.747255563735962, "learning_rate": 4.3484329734000526e-05, "loss": 1.6961, "step": 4953 }, { "epoch": 0.1304714248090598, "grad_norm": 1.8338106870651245, "learning_rate": 4.348301290492494e-05, "loss": 1.643, "step": 4954 }, { "epoch": 0.1304977613905715, "grad_norm": 3.2388992309570312, "learning_rate": 4.348169607584936e-05, "loss": 0.992, "step": 4955 }, { "epoch": 0.13052409797208323, "grad_norm": 2.0383005142211914, "learning_rate": 4.348037924677377e-05, "loss": 2.2097, "step": 4956 }, { "epoch": 0.13055043455359494, "grad_norm": 1.8579005002975464, "learning_rate": 4.347906241769819e-05, "loss": 1.7505, "step": 4957 }, { "epoch": 0.13057677113510666, "grad_norm": 2.4393959045410156, "learning_rate": 4.34777455886226e-05, "loss": 1.5814, "step": 4958 }, { "epoch": 0.13060310771661837, "grad_norm": 2.926823854446411, "learning_rate": 4.347642875954701e-05, "loss": 1.4456, "step": 4959 }, { "epoch": 0.13062944429813012, "grad_norm": 2.2165017127990723, "learning_rate": 4.347511193047143e-05, 
"loss": 1.7174, "step": 4960 }, { "epoch": 0.13065578087964183, "grad_norm": 1.8240114450454712, "learning_rate": 4.3473795101395844e-05, "loss": 1.8518, "step": 4961 }, { "epoch": 0.13068211746115355, "grad_norm": 2.0179178714752197, "learning_rate": 4.347247827232025e-05, "loss": 1.9723, "step": 4962 }, { "epoch": 0.13070845404266526, "grad_norm": 3.113919258117676, "learning_rate": 4.347116144324467e-05, "loss": 0.5512, "step": 4963 }, { "epoch": 0.13073479062417698, "grad_norm": 1.8319178819656372, "learning_rate": 4.346984461416908e-05, "loss": 1.9527, "step": 4964 }, { "epoch": 0.1307611272056887, "grad_norm": 2.051953077316284, "learning_rate": 4.34685277850935e-05, "loss": 1.2838, "step": 4965 }, { "epoch": 0.1307874637872004, "grad_norm": 1.9818252325057983, "learning_rate": 4.346721095601791e-05, "loss": 1.659, "step": 4966 }, { "epoch": 0.13081380036871215, "grad_norm": 2.53497576713562, "learning_rate": 4.3465894126942324e-05, "loss": 1.5803, "step": 4967 }, { "epoch": 0.13084013695022387, "grad_norm": 1.9523745775222778, "learning_rate": 4.346457729786674e-05, "loss": 1.9161, "step": 4968 }, { "epoch": 0.13086647353173558, "grad_norm": 3.2774455547332764, "learning_rate": 4.346326046879115e-05, "loss": 1.5952, "step": 4969 }, { "epoch": 0.1308928101132473, "grad_norm": 2.6098408699035645, "learning_rate": 4.346194363971557e-05, "loss": 1.4197, "step": 4970 }, { "epoch": 0.13091914669475901, "grad_norm": 3.690000534057617, "learning_rate": 4.346062681063998e-05, "loss": 0.8491, "step": 4971 }, { "epoch": 0.13094548327627073, "grad_norm": 2.4877395629882812, "learning_rate": 4.3459309981564395e-05, "loss": 2.0821, "step": 4972 }, { "epoch": 0.13097181985778247, "grad_norm": 3.571730613708496, "learning_rate": 4.3457993152488804e-05, "loss": 2.1172, "step": 4973 }, { "epoch": 0.1309981564392942, "grad_norm": 2.088020086288452, "learning_rate": 4.345667632341323e-05, "loss": 1.5379, "step": 4974 }, { "epoch": 0.1310244930208059, "grad_norm": 
2.455195903778076, "learning_rate": 4.3455359494337635e-05, "loss": 2.1159, "step": 4975 }, { "epoch": 0.13105082960231762, "grad_norm": 2.3960843086242676, "learning_rate": 4.345404266526205e-05, "loss": 1.7115, "step": 4976 }, { "epoch": 0.13107716618382934, "grad_norm": 1.7375211715698242, "learning_rate": 4.345272583618647e-05, "loss": 2.0856, "step": 4977 }, { "epoch": 0.13110350276534105, "grad_norm": 4.1101226806640625, "learning_rate": 4.3451409007110876e-05, "loss": 0.9096, "step": 4978 }, { "epoch": 0.13112983934685277, "grad_norm": 2.3209168910980225, "learning_rate": 4.34500921780353e-05, "loss": 2.0009, "step": 4979 }, { "epoch": 0.1311561759283645, "grad_norm": 2.01666522026062, "learning_rate": 4.344877534895971e-05, "loss": 0.5954, "step": 4980 }, { "epoch": 0.13118251250987623, "grad_norm": 1.6926815509796143, "learning_rate": 4.344745851988412e-05, "loss": 1.9913, "step": 4981 }, { "epoch": 0.13120884909138794, "grad_norm": 2.902639389038086, "learning_rate": 4.344614169080853e-05, "loss": 0.8996, "step": 4982 }, { "epoch": 0.13123518567289966, "grad_norm": 2.468534231185913, "learning_rate": 4.344482486173295e-05, "loss": 2.0002, "step": 4983 }, { "epoch": 0.13126152225441137, "grad_norm": 1.7324596643447876, "learning_rate": 4.344350803265736e-05, "loss": 1.7669, "step": 4984 }, { "epoch": 0.1312878588359231, "grad_norm": 2.492088794708252, "learning_rate": 4.344219120358178e-05, "loss": 2.3001, "step": 4985 }, { "epoch": 0.1313141954174348, "grad_norm": 2.460096597671509, "learning_rate": 4.3440874374506194e-05, "loss": 1.9039, "step": 4986 }, { "epoch": 0.13134053199894655, "grad_norm": 1.9136123657226562, "learning_rate": 4.34395575454306e-05, "loss": 1.9006, "step": 4987 }, { "epoch": 0.13136686858045826, "grad_norm": 2.5846214294433594, "learning_rate": 4.343824071635502e-05, "loss": 1.6338, "step": 4988 }, { "epoch": 0.13139320516196998, "grad_norm": 2.966732978820801, "learning_rate": 4.3436923887279434e-05, "loss": 1.6819, "step": 4989 
}, { "epoch": 0.1314195417434817, "grad_norm": 1.8261289596557617, "learning_rate": 4.343560705820385e-05, "loss": 1.071, "step": 4990 }, { "epoch": 0.1314458783249934, "grad_norm": 3.538123369216919, "learning_rate": 4.343429022912826e-05, "loss": 1.2945, "step": 4991 }, { "epoch": 0.13147221490650512, "grad_norm": 3.3220880031585693, "learning_rate": 4.3432973400052674e-05, "loss": 1.6983, "step": 4992 }, { "epoch": 0.13149855148801687, "grad_norm": 1.8842847347259521, "learning_rate": 4.343165657097709e-05, "loss": 1.873, "step": 4993 }, { "epoch": 0.13152488806952858, "grad_norm": 4.425382614135742, "learning_rate": 4.3430339741901505e-05, "loss": 1.9297, "step": 4994 }, { "epoch": 0.1315512246510403, "grad_norm": 2.080655336380005, "learning_rate": 4.342902291282592e-05, "loss": 1.8793, "step": 4995 }, { "epoch": 0.13157756123255201, "grad_norm": 1.711382508277893, "learning_rate": 4.342770608375033e-05, "loss": 0.6949, "step": 4996 }, { "epoch": 0.13160389781406373, "grad_norm": 2.0423460006713867, "learning_rate": 4.3426389254674745e-05, "loss": 2.1115, "step": 4997 }, { "epoch": 0.13163023439557545, "grad_norm": 6.049909591674805, "learning_rate": 4.342507242559916e-05, "loss": 1.6203, "step": 4998 }, { "epoch": 0.13165657097708716, "grad_norm": 2.2805228233337402, "learning_rate": 4.3423755596523576e-05, "loss": 1.6269, "step": 4999 }, { "epoch": 0.1316829075585989, "grad_norm": 2.6756784915924072, "learning_rate": 4.3422438767447985e-05, "loss": 2.2693, "step": 5000 }, { "epoch": 0.13170924414011062, "grad_norm": 2.5561397075653076, "learning_rate": 4.34211219383724e-05, "loss": 0.6804, "step": 5001 }, { "epoch": 0.13173558072162234, "grad_norm": 3.445418119430542, "learning_rate": 4.3419805109296816e-05, "loss": 1.6649, "step": 5002 }, { "epoch": 0.13176191730313405, "grad_norm": 2.469170093536377, "learning_rate": 4.341848828022123e-05, "loss": 2.1, "step": 5003 }, { "epoch": 0.13178825388464577, "grad_norm": 2.5011355876922607, "learning_rate": 
4.341717145114565e-05, "loss": 1.6431, "step": 5004 }, { "epoch": 0.13181459046615748, "grad_norm": 2.8519020080566406, "learning_rate": 4.3415854622070056e-05, "loss": 1.4282, "step": 5005 }, { "epoch": 0.13184092704766923, "grad_norm": 2.6416091918945312, "learning_rate": 4.341453779299447e-05, "loss": 1.4372, "step": 5006 }, { "epoch": 0.13186726362918094, "grad_norm": 2.075183629989624, "learning_rate": 4.341322096391889e-05, "loss": 1.756, "step": 5007 }, { "epoch": 0.13189360021069266, "grad_norm": 3.2140908241271973, "learning_rate": 4.34119041348433e-05, "loss": 1.4966, "step": 5008 }, { "epoch": 0.13191993679220437, "grad_norm": 1.8258230686187744, "learning_rate": 4.341058730576771e-05, "loss": 1.8709, "step": 5009 }, { "epoch": 0.1319462733737161, "grad_norm": 3.9869112968444824, "learning_rate": 4.340927047669213e-05, "loss": 1.6423, "step": 5010 }, { "epoch": 0.1319726099552278, "grad_norm": 3.122354745864868, "learning_rate": 4.3407953647616536e-05, "loss": 1.8475, "step": 5011 }, { "epoch": 0.13199894653673952, "grad_norm": 2.240496873855591, "learning_rate": 4.340663681854096e-05, "loss": 1.635, "step": 5012 }, { "epoch": 0.13202528311825126, "grad_norm": 1.9871795177459717, "learning_rate": 4.340531998946537e-05, "loss": 1.4934, "step": 5013 }, { "epoch": 0.13205161969976298, "grad_norm": 1.8549997806549072, "learning_rate": 4.340400316038978e-05, "loss": 1.5648, "step": 5014 }, { "epoch": 0.1320779562812747, "grad_norm": 2.6075167655944824, "learning_rate": 4.34026863313142e-05, "loss": 1.5973, "step": 5015 }, { "epoch": 0.1321042928627864, "grad_norm": 4.122131824493408, "learning_rate": 4.340136950223861e-05, "loss": 0.6709, "step": 5016 }, { "epoch": 0.13213062944429813, "grad_norm": 2.711064577102661, "learning_rate": 4.340005267316303e-05, "loss": 1.0608, "step": 5017 }, { "epoch": 0.13215696602580984, "grad_norm": 3.5394294261932373, "learning_rate": 4.339873584408744e-05, "loss": 1.4017, "step": 5018 }, { "epoch": 0.13218330260732156, 
"grad_norm": 2.4709997177124023, "learning_rate": 4.3397419015011854e-05, "loss": 2.4186, "step": 5019 }, { "epoch": 0.1322096391888333, "grad_norm": 2.5101897716522217, "learning_rate": 4.339610218593626e-05, "loss": 1.653, "step": 5020 }, { "epoch": 0.13223597577034502, "grad_norm": 1.7431272268295288, "learning_rate": 4.339478535686068e-05, "loss": 1.737, "step": 5021 }, { "epoch": 0.13226231235185673, "grad_norm": 2.694981575012207, "learning_rate": 4.3393468527785094e-05, "loss": 1.3452, "step": 5022 }, { "epoch": 0.13228864893336845, "grad_norm": 4.8767476081848145, "learning_rate": 4.339215169870951e-05, "loss": 1.9107, "step": 5023 }, { "epoch": 0.13231498551488016, "grad_norm": 1.9706566333770752, "learning_rate": 4.3390834869633925e-05, "loss": 2.4199, "step": 5024 }, { "epoch": 0.13234132209639188, "grad_norm": 3.8935935497283936, "learning_rate": 4.3389518040558334e-05, "loss": 2.6432, "step": 5025 }, { "epoch": 0.13236765867790362, "grad_norm": 2.5822763442993164, "learning_rate": 4.3388201211482756e-05, "loss": 0.5459, "step": 5026 }, { "epoch": 0.13239399525941534, "grad_norm": 1.74319326877594, "learning_rate": 4.3386884382407165e-05, "loss": 1.7364, "step": 5027 }, { "epoch": 0.13242033184092705, "grad_norm": 1.8894635438919067, "learning_rate": 4.338556755333158e-05, "loss": 0.7984, "step": 5028 }, { "epoch": 0.13244666842243877, "grad_norm": 2.569887161254883, "learning_rate": 4.338425072425599e-05, "loss": 1.5337, "step": 5029 }, { "epoch": 0.13247300500395048, "grad_norm": 2.3233461380004883, "learning_rate": 4.3382933895180405e-05, "loss": 1.4143, "step": 5030 }, { "epoch": 0.1324993415854622, "grad_norm": 2.9843146800994873, "learning_rate": 4.338161706610482e-05, "loss": 1.4322, "step": 5031 }, { "epoch": 0.13252567816697391, "grad_norm": 2.2174909114837646, "learning_rate": 4.3380300237029236e-05, "loss": 1.4158, "step": 5032 }, { "epoch": 0.13255201474848566, "grad_norm": 2.8234145641326904, "learning_rate": 4.337898340795365e-05, "loss": 
2.3157, "step": 5033 }, { "epoch": 0.13257835132999737, "grad_norm": 3.4443726539611816, "learning_rate": 4.337766657887806e-05, "loss": 1.3387, "step": 5034 }, { "epoch": 0.1326046879115091, "grad_norm": 4.984408855438232, "learning_rate": 4.3376349749802476e-05, "loss": 0.4735, "step": 5035 }, { "epoch": 0.1326310244930208, "grad_norm": 3.2236123085021973, "learning_rate": 4.337503292072689e-05, "loss": 1.5339, "step": 5036 }, { "epoch": 0.13265736107453252, "grad_norm": 2.643332004547119, "learning_rate": 4.337371609165131e-05, "loss": 0.853, "step": 5037 }, { "epoch": 0.13268369765604424, "grad_norm": 4.635191440582275, "learning_rate": 4.3372399262575716e-05, "loss": 1.551, "step": 5038 }, { "epoch": 0.13271003423755595, "grad_norm": 2.006472110748291, "learning_rate": 4.337108243350013e-05, "loss": 1.6254, "step": 5039 }, { "epoch": 0.1327363708190677, "grad_norm": 2.1525752544403076, "learning_rate": 4.336976560442455e-05, "loss": 2.2399, "step": 5040 }, { "epoch": 0.1327627074005794, "grad_norm": 3.3003857135772705, "learning_rate": 4.336844877534896e-05, "loss": 1.8106, "step": 5041 }, { "epoch": 0.13278904398209113, "grad_norm": 5.573011875152588, "learning_rate": 4.336713194627338e-05, "loss": 1.3819, "step": 5042 }, { "epoch": 0.13281538056360284, "grad_norm": 5.78550910949707, "learning_rate": 4.336581511719779e-05, "loss": 1.3246, "step": 5043 }, { "epoch": 0.13284171714511456, "grad_norm": 3.1437580585479736, "learning_rate": 4.33644982881222e-05, "loss": 1.7696, "step": 5044 }, { "epoch": 0.13286805372662627, "grad_norm": 2.3968985080718994, "learning_rate": 4.336318145904662e-05, "loss": 1.5599, "step": 5045 }, { "epoch": 0.13289439030813802, "grad_norm": 2.5823965072631836, "learning_rate": 4.3361864629971035e-05, "loss": 2.0172, "step": 5046 }, { "epoch": 0.13292072688964973, "grad_norm": 1.743168592453003, "learning_rate": 4.336054780089544e-05, "loss": 2.0607, "step": 5047 }, { "epoch": 0.13294706347116145, "grad_norm": 2.6268415451049805, 
"learning_rate": 4.335923097181986e-05, "loss": 2.1507, "step": 5048 }, { "epoch": 0.13297340005267316, "grad_norm": 2.297821044921875, "learning_rate": 4.3357914142744275e-05, "loss": 2.2302, "step": 5049 }, { "epoch": 0.13299973663418488, "grad_norm": 1.988111972808838, "learning_rate": 4.335659731366869e-05, "loss": 1.6013, "step": 5050 }, { "epoch": 0.1330260732156966, "grad_norm": 4.546943187713623, "learning_rate": 4.3355280484593106e-05, "loss": 1.2326, "step": 5051 }, { "epoch": 0.1330524097972083, "grad_norm": 3.1077394485473633, "learning_rate": 4.3353963655517515e-05, "loss": 1.5866, "step": 5052 }, { "epoch": 0.13307874637872005, "grad_norm": 13.691081047058105, "learning_rate": 4.335264682644193e-05, "loss": 2.7875, "step": 5053 }, { "epoch": 0.13310508296023177, "grad_norm": 2.2888708114624023, "learning_rate": 4.335132999736634e-05, "loss": 1.2606, "step": 5054 }, { "epoch": 0.13313141954174348, "grad_norm": 2.7057507038116455, "learning_rate": 4.335001316829076e-05, "loss": 1.8512, "step": 5055 }, { "epoch": 0.1331577561232552, "grad_norm": 2.7668721675872803, "learning_rate": 4.334869633921517e-05, "loss": 1.9467, "step": 5056 }, { "epoch": 0.13318409270476692, "grad_norm": 2.0387139320373535, "learning_rate": 4.3347379510139586e-05, "loss": 1.7528, "step": 5057 }, { "epoch": 0.13321042928627863, "grad_norm": 2.803766965866089, "learning_rate": 4.3346062681064e-05, "loss": 2.3653, "step": 5058 }, { "epoch": 0.13323676586779037, "grad_norm": 4.716665267944336, "learning_rate": 4.334474585198842e-05, "loss": 1.2364, "step": 5059 }, { "epoch": 0.1332631024493021, "grad_norm": 1.9011110067367554, "learning_rate": 4.334342902291283e-05, "loss": 1.4576, "step": 5060 }, { "epoch": 0.1332894390308138, "grad_norm": 3.019423484802246, "learning_rate": 4.334211219383724e-05, "loss": 0.7776, "step": 5061 }, { "epoch": 0.13331577561232552, "grad_norm": 2.002833127975464, "learning_rate": 4.334079536476166e-05, "loss": 2.0146, "step": 5062 }, { "epoch": 
0.13334211219383724, "grad_norm": 5.950443744659424, "learning_rate": 4.3339478535686066e-05, "loss": 1.0814, "step": 5063 }, { "epoch": 0.13336844877534895, "grad_norm": 2.671046733856201, "learning_rate": 4.333816170661049e-05, "loss": 1.1213, "step": 5064 }, { "epoch": 0.13339478535686067, "grad_norm": 2.2727088928222656, "learning_rate": 4.33368448775349e-05, "loss": 1.684, "step": 5065 }, { "epoch": 0.1334211219383724, "grad_norm": 1.7243009805679321, "learning_rate": 4.333552804845931e-05, "loss": 2.0375, "step": 5066 }, { "epoch": 0.13344745851988413, "grad_norm": 4.161171913146973, "learning_rate": 4.333421121938372e-05, "loss": 1.1078, "step": 5067 }, { "epoch": 0.13347379510139584, "grad_norm": 6.115097522735596, "learning_rate": 4.333289439030814e-05, "loss": 1.3167, "step": 5068 }, { "epoch": 0.13350013168290756, "grad_norm": 1.7010360956192017, "learning_rate": 4.333157756123255e-05, "loss": 1.5568, "step": 5069 }, { "epoch": 0.13352646826441927, "grad_norm": 5.253629684448242, "learning_rate": 4.333026073215697e-05, "loss": 1.6685, "step": 5070 }, { "epoch": 0.133552804845931, "grad_norm": 6.189787864685059, "learning_rate": 4.3328943903081384e-05, "loss": 1.5829, "step": 5071 }, { "epoch": 0.1335791414274427, "grad_norm": 3.063844919204712, "learning_rate": 4.332762707400579e-05, "loss": 2.8246, "step": 5072 }, { "epoch": 0.13360547800895445, "grad_norm": 3.2947885990142822, "learning_rate": 4.3326310244930215e-05, "loss": 0.5063, "step": 5073 }, { "epoch": 0.13363181459046616, "grad_norm": 2.2428524494171143, "learning_rate": 4.3324993415854624e-05, "loss": 1.5747, "step": 5074 }, { "epoch": 0.13365815117197788, "grad_norm": 2.1810524463653564, "learning_rate": 4.332367658677904e-05, "loss": 1.1996, "step": 5075 }, { "epoch": 0.1336844877534896, "grad_norm": 3.762620449066162, "learning_rate": 4.332235975770345e-05, "loss": 1.7491, "step": 5076 }, { "epoch": 0.1337108243350013, "grad_norm": 2.241297960281372, "learning_rate": 4.3321042928627864e-05, 
"loss": 1.9091, "step": 5077 }, { "epoch": 0.13373716091651303, "grad_norm": 2.325817823410034, "learning_rate": 4.331972609955228e-05, "loss": 2.1807, "step": 5078 }, { "epoch": 0.13376349749802477, "grad_norm": 2.515507459640503, "learning_rate": 4.3318409270476695e-05, "loss": 1.7233, "step": 5079 }, { "epoch": 0.13378983407953648, "grad_norm": 2.0451629161834717, "learning_rate": 4.331709244140111e-05, "loss": 1.8693, "step": 5080 }, { "epoch": 0.1338161706610482, "grad_norm": 2.8333163261413574, "learning_rate": 4.331577561232552e-05, "loss": 0.6471, "step": 5081 }, { "epoch": 0.13384250724255992, "grad_norm": 2.7954535484313965, "learning_rate": 4.3314458783249935e-05, "loss": 0.9213, "step": 5082 }, { "epoch": 0.13386884382407163, "grad_norm": 2.4277091026306152, "learning_rate": 4.331314195417435e-05, "loss": 1.2899, "step": 5083 }, { "epoch": 0.13389518040558335, "grad_norm": 2.7773337364196777, "learning_rate": 4.3311825125098766e-05, "loss": 1.3181, "step": 5084 }, { "epoch": 0.13392151698709506, "grad_norm": 2.029878854751587, "learning_rate": 4.3310508296023175e-05, "loss": 1.6522, "step": 5085 }, { "epoch": 0.1339478535686068, "grad_norm": 4.992863178253174, "learning_rate": 4.330919146694759e-05, "loss": 1.5919, "step": 5086 }, { "epoch": 0.13397419015011852, "grad_norm": 3.617291212081909, "learning_rate": 4.3307874637872006e-05, "loss": 2.0032, "step": 5087 }, { "epoch": 0.13400052673163024, "grad_norm": 3.9616904258728027, "learning_rate": 4.330655780879642e-05, "loss": 2.205, "step": 5088 }, { "epoch": 0.13402686331314195, "grad_norm": 2.2840869426727295, "learning_rate": 4.330524097972084e-05, "loss": 1.9219, "step": 5089 }, { "epoch": 0.13405319989465367, "grad_norm": 3.934386730194092, "learning_rate": 4.3303924150645246e-05, "loss": 1.0264, "step": 5090 }, { "epoch": 0.13407953647616538, "grad_norm": 2.615556001663208, "learning_rate": 4.330260732156966e-05, "loss": 2.6591, "step": 5091 }, { "epoch": 0.1341058730576771, "grad_norm": 
1.6333255767822266, "learning_rate": 4.330129049249408e-05, "loss": 1.4891, "step": 5092 }, { "epoch": 0.13413220963918884, "grad_norm": 1.5788342952728271, "learning_rate": 4.329997366341849e-05, "loss": 1.739, "step": 5093 }, { "epoch": 0.13415854622070056, "grad_norm": 3.678856611251831, "learning_rate": 4.32986568343429e-05, "loss": 1.141, "step": 5094 }, { "epoch": 0.13418488280221227, "grad_norm": 2.5560591220855713, "learning_rate": 4.329734000526732e-05, "loss": 2.0155, "step": 5095 }, { "epoch": 0.134211219383724, "grad_norm": 2.2019999027252197, "learning_rate": 4.329602317619173e-05, "loss": 1.6245, "step": 5096 }, { "epoch": 0.1342375559652357, "grad_norm": 3.82480525970459, "learning_rate": 4.329470634711615e-05, "loss": 1.9263, "step": 5097 }, { "epoch": 0.13426389254674742, "grad_norm": 2.56531023979187, "learning_rate": 4.3293389518040564e-05, "loss": 2.2144, "step": 5098 }, { "epoch": 0.13429022912825916, "grad_norm": 1.8532615900039673, "learning_rate": 4.329207268896497e-05, "loss": 1.8402, "step": 5099 }, { "epoch": 0.13431656570977088, "grad_norm": 3.629487991333008, "learning_rate": 4.329075585988939e-05, "loss": 2.5898, "step": 5100 }, { "epoch": 0.1343429022912826, "grad_norm": 2.0321056842803955, "learning_rate": 4.32894390308138e-05, "loss": 1.8867, "step": 5101 }, { "epoch": 0.1343692388727943, "grad_norm": 2.923245906829834, "learning_rate": 4.328812220173822e-05, "loss": 1.4847, "step": 5102 }, { "epoch": 0.13439557545430603, "grad_norm": 2.0400636196136475, "learning_rate": 4.328680537266263e-05, "loss": 2.1398, "step": 5103 }, { "epoch": 0.13442191203581774, "grad_norm": 3.8229000568389893, "learning_rate": 4.3285488543587044e-05, "loss": 0.9901, "step": 5104 }, { "epoch": 0.13444824861732946, "grad_norm": 2.7378499507904053, "learning_rate": 4.328417171451146e-05, "loss": 1.7758, "step": 5105 }, { "epoch": 0.1344745851988412, "grad_norm": 1.6413730382919312, "learning_rate": 4.3282854885435875e-05, "loss": 1.6947, "step": 5106 }, { 
"epoch": 0.13450092178035292, "grad_norm": 2.3563058376312256, "learning_rate": 4.328153805636029e-05, "loss": 2.247, "step": 5107 }, { "epoch": 0.13452725836186463, "grad_norm": 3.0296270847320557, "learning_rate": 4.32802212272847e-05, "loss": 1.2415, "step": 5108 }, { "epoch": 0.13455359494337635, "grad_norm": 1.9270447492599487, "learning_rate": 4.3278904398209116e-05, "loss": 2.082, "step": 5109 }, { "epoch": 0.13457993152488806, "grad_norm": 2.082814931869507, "learning_rate": 4.3277587569133524e-05, "loss": 0.4957, "step": 5110 }, { "epoch": 0.13460626810639978, "grad_norm": 2.044718027114868, "learning_rate": 4.327627074005795e-05, "loss": 2.3478, "step": 5111 }, { "epoch": 0.13463260468791152, "grad_norm": 4.23112678527832, "learning_rate": 4.3274953910982356e-05, "loss": 1.8147, "step": 5112 }, { "epoch": 0.13465894126942324, "grad_norm": 1.7309212684631348, "learning_rate": 4.327363708190677e-05, "loss": 1.9121, "step": 5113 }, { "epoch": 0.13468527785093495, "grad_norm": 1.6054695844650269, "learning_rate": 4.327232025283118e-05, "loss": 1.9575, "step": 5114 }, { "epoch": 0.13471161443244667, "grad_norm": 1.9052006006240845, "learning_rate": 4.3271003423755596e-05, "loss": 2.5209, "step": 5115 }, { "epoch": 0.13473795101395838, "grad_norm": 2.179224967956543, "learning_rate": 4.326968659468001e-05, "loss": 1.9486, "step": 5116 }, { "epoch": 0.1347642875954701, "grad_norm": 2.9472708702087402, "learning_rate": 4.326836976560443e-05, "loss": 1.3445, "step": 5117 }, { "epoch": 0.13479062417698182, "grad_norm": 4.949427127838135, "learning_rate": 4.326705293652884e-05, "loss": 1.6875, "step": 5118 }, { "epoch": 0.13481696075849356, "grad_norm": NaN, "learning_rate": 4.326705293652884e-05, "loss": 1.1288, "step": 5119 }, { "epoch": 0.13484329734000527, "grad_norm": 1.7440154552459717, "learning_rate": 4.326573610745325e-05, "loss": 1.9201, "step": 5120 }, { "epoch": 0.134869633921517, "grad_norm": 2.23038387298584, "learning_rate": 4.3264419278377674e-05, 
"loss": 1.0692, "step": 5121 }, { "epoch": 0.1348959705030287, "grad_norm": 1.9543406963348389, "learning_rate": 4.326310244930208e-05, "loss": 1.9731, "step": 5122 }, { "epoch": 0.13492230708454042, "grad_norm": 2.2074053287506104, "learning_rate": 4.32617856202265e-05, "loss": 2.8003, "step": 5123 }, { "epoch": 0.13494864366605214, "grad_norm": 1.8646095991134644, "learning_rate": 4.326046879115091e-05, "loss": 1.4831, "step": 5124 }, { "epoch": 0.13497498024756385, "grad_norm": 1.9908664226531982, "learning_rate": 4.325915196207532e-05, "loss": 1.6991, "step": 5125 }, { "epoch": 0.1350013168290756, "grad_norm": 4.11297607421875, "learning_rate": 4.325783513299974e-05, "loss": 0.7609, "step": 5126 }, { "epoch": 0.1350276534105873, "grad_norm": 5.092033386230469, "learning_rate": 4.3256518303924154e-05, "loss": 1.6576, "step": 5127 }, { "epoch": 0.13505398999209903, "grad_norm": 2.7819604873657227, "learning_rate": 4.325520147484857e-05, "loss": 1.633, "step": 5128 }, { "epoch": 0.13508032657361074, "grad_norm": 2.531390905380249, "learning_rate": 4.325388464577298e-05, "loss": 2.1205, "step": 5129 }, { "epoch": 0.13510666315512246, "grad_norm": 2.7674753665924072, "learning_rate": 4.3252567816697394e-05, "loss": 2.1588, "step": 5130 }, { "epoch": 0.13513299973663417, "grad_norm": 4.091637134552002, "learning_rate": 4.325125098762181e-05, "loss": 2.1967, "step": 5131 }, { "epoch": 0.13515933631814592, "grad_norm": 2.6674866676330566, "learning_rate": 4.3249934158546225e-05, "loss": 2.101, "step": 5132 }, { "epoch": 0.13518567289965763, "grad_norm": 2.726515769958496, "learning_rate": 4.3248617329470634e-05, "loss": 0.5051, "step": 5133 }, { "epoch": 0.13521200948116935, "grad_norm": 2.311643362045288, "learning_rate": 4.324730050039505e-05, "loss": 1.9457, "step": 5134 }, { "epoch": 0.13523834606268106, "grad_norm": 3.0122528076171875, "learning_rate": 4.3245983671319465e-05, "loss": 0.5019, "step": 5135 }, { "epoch": 0.13526468264419278, "grad_norm": 
2.6952977180480957, "learning_rate": 4.324466684224388e-05, "loss": 2.2212, "step": 5136 }, { "epoch": 0.1352910192257045, "grad_norm": 4.774688243865967, "learning_rate": 4.3243350013168296e-05, "loss": 1.8913, "step": 5137 }, { "epoch": 0.1353173558072162, "grad_norm": 1.9841638803482056, "learning_rate": 4.3242033184092705e-05, "loss": 2.1792, "step": 5138 }, { "epoch": 0.13534369238872795, "grad_norm": 1.6687781810760498, "learning_rate": 4.324071635501712e-05, "loss": 1.8076, "step": 5139 }, { "epoch": 0.13537002897023967, "grad_norm": 3.0525543689727783, "learning_rate": 4.3239399525941536e-05, "loss": 2.4661, "step": 5140 }, { "epoch": 0.13539636555175139, "grad_norm": 1.989721417427063, "learning_rate": 4.323808269686595e-05, "loss": 1.7802, "step": 5141 }, { "epoch": 0.1354227021332631, "grad_norm": 1.4657495021820068, "learning_rate": 4.323676586779036e-05, "loss": 1.4623, "step": 5142 }, { "epoch": 0.13544903871477482, "grad_norm": 2.152785062789917, "learning_rate": 4.3235449038714776e-05, "loss": 1.9602, "step": 5143 }, { "epoch": 0.13547537529628653, "grad_norm": 4.588759422302246, "learning_rate": 4.323413220963919e-05, "loss": 0.5224, "step": 5144 }, { "epoch": 0.13550171187779828, "grad_norm": 2.155844211578369, "learning_rate": 4.323281538056361e-05, "loss": 2.0237, "step": 5145 }, { "epoch": 0.13552804845931, "grad_norm": 2.232266426086426, "learning_rate": 4.323149855148802e-05, "loss": 1.8489, "step": 5146 }, { "epoch": 0.1355543850408217, "grad_norm": 3.1980278491973877, "learning_rate": 4.323018172241243e-05, "loss": 1.0601, "step": 5147 }, { "epoch": 0.13558072162233342, "grad_norm": 2.158320426940918, "learning_rate": 4.322886489333685e-05, "loss": 1.9878, "step": 5148 }, { "epoch": 0.13560705820384514, "grad_norm": 2.39532208442688, "learning_rate": 4.3227548064261256e-05, "loss": 1.9243, "step": 5149 }, { "epoch": 0.13563339478535685, "grad_norm": 5.758508205413818, "learning_rate": 4.322623123518568e-05, "loss": 1.8366, "step": 5150 }, { 
"epoch": 0.13565973136686857, "grad_norm": 2.9325613975524902, "learning_rate": 4.322491440611009e-05, "loss": 0.7884, "step": 5151 }, { "epoch": 0.1356860679483803, "grad_norm": 2.801708459854126, "learning_rate": 4.32235975770345e-05, "loss": 0.9392, "step": 5152 }, { "epoch": 0.13571240452989203, "grad_norm": 3.685453414916992, "learning_rate": 4.322228074795892e-05, "loss": 2.3278, "step": 5153 }, { "epoch": 0.13573874111140374, "grad_norm": 2.65653920173645, "learning_rate": 4.3220963918883334e-05, "loss": 2.1537, "step": 5154 }, { "epoch": 0.13576507769291546, "grad_norm": 3.002680778503418, "learning_rate": 4.321964708980775e-05, "loss": 2.2661, "step": 5155 }, { "epoch": 0.13579141427442717, "grad_norm": 1.8394556045532227, "learning_rate": 4.321833026073216e-05, "loss": 2.2173, "step": 5156 }, { "epoch": 0.1358177508559389, "grad_norm": 2.311682939529419, "learning_rate": 4.3217013431656574e-05, "loss": 1.85, "step": 5157 }, { "epoch": 0.1358440874374506, "grad_norm": 2.7533247470855713, "learning_rate": 4.321569660258098e-05, "loss": 0.9464, "step": 5158 }, { "epoch": 0.13587042401896235, "grad_norm": 3.7418079376220703, "learning_rate": 4.3214379773505405e-05, "loss": 1.4527, "step": 5159 }, { "epoch": 0.13589676060047406, "grad_norm": 4.358343601226807, "learning_rate": 4.3213062944429814e-05, "loss": 1.3739, "step": 5160 }, { "epoch": 0.13592309718198578, "grad_norm": 4.877420902252197, "learning_rate": 4.321174611535423e-05, "loss": 2.4356, "step": 5161 }, { "epoch": 0.1359494337634975, "grad_norm": 4.410695552825928, "learning_rate": 4.3210429286278645e-05, "loss": 1.6822, "step": 5162 }, { "epoch": 0.1359757703450092, "grad_norm": 8.638838768005371, "learning_rate": 4.3209112457203054e-05, "loss": 2.0576, "step": 5163 }, { "epoch": 0.13600210692652093, "grad_norm": 2.762232780456543, "learning_rate": 4.3207795628127476e-05, "loss": 1.8403, "step": 5164 }, { "epoch": 0.13602844350803267, "grad_norm": 3.232184886932373, "learning_rate": 
4.3206478799051885e-05, "loss": 1.6123, "step": 5165 }, { "epoch": 0.13605478008954439, "grad_norm": 3.2808170318603516, "learning_rate": 4.32051619699763e-05, "loss": 1.0584, "step": 5166 }, { "epoch": 0.1360811166710561, "grad_norm": 2.531832456588745, "learning_rate": 4.320384514090071e-05, "loss": 1.7505, "step": 5167 }, { "epoch": 0.13610745325256782, "grad_norm": 3.6374802589416504, "learning_rate": 4.3202528311825125e-05, "loss": 1.2648, "step": 5168 }, { "epoch": 0.13613378983407953, "grad_norm": 3.9847042560577393, "learning_rate": 4.320121148274954e-05, "loss": 2.2801, "step": 5169 }, { "epoch": 0.13616012641559125, "grad_norm": 1.6192246675491333, "learning_rate": 4.3199894653673957e-05, "loss": 2.1486, "step": 5170 }, { "epoch": 0.13618646299710296, "grad_norm": 1.9894036054611206, "learning_rate": 4.3198577824598365e-05, "loss": 1.7824, "step": 5171 }, { "epoch": 0.1362127995786147, "grad_norm": 2.728970766067505, "learning_rate": 4.319726099552278e-05, "loss": 1.5232, "step": 5172 }, { "epoch": 0.13623913616012642, "grad_norm": 1.9817637205123901, "learning_rate": 4.3195944166447197e-05, "loss": 2.2884, "step": 5173 }, { "epoch": 0.13626547274163814, "grad_norm": 1.8956005573272705, "learning_rate": 4.319462733737161e-05, "loss": 1.9107, "step": 5174 }, { "epoch": 0.13629180932314985, "grad_norm": 2.97184419631958, "learning_rate": 4.319331050829603e-05, "loss": 1.9211, "step": 5175 }, { "epoch": 0.13631814590466157, "grad_norm": 2.0654196739196777, "learning_rate": 4.3191993679220437e-05, "loss": 3.451, "step": 5176 }, { "epoch": 0.13634448248617329, "grad_norm": 2.0442097187042236, "learning_rate": 4.319067685014485e-05, "loss": 2.138, "step": 5177 }, { "epoch": 0.136370819067685, "grad_norm": 1.800565481185913, "learning_rate": 4.318936002106927e-05, "loss": 1.5426, "step": 5178 }, { "epoch": 0.13639715564919674, "grad_norm": 1.8814598321914673, "learning_rate": 4.318804319199368e-05, "loss": 1.6583, "step": 5179 }, { "epoch": 0.13642349223070846, 
"grad_norm": 2.1714470386505127, "learning_rate": 4.318672636291809e-05, "loss": 2.308, "step": 5180 }, { "epoch": 0.13644982881222018, "grad_norm": 2.5671043395996094, "learning_rate": 4.318540953384251e-05, "loss": 1.5029, "step": 5181 }, { "epoch": 0.1364761653937319, "grad_norm": 1.9010472297668457, "learning_rate": 4.318409270476692e-05, "loss": 1.458, "step": 5182 }, { "epoch": 0.1365025019752436, "grad_norm": 2.0927422046661377, "learning_rate": 4.318277587569134e-05, "loss": 2.095, "step": 5183 }, { "epoch": 0.13652883855675532, "grad_norm": 2.9164364337921143, "learning_rate": 4.3181459046615755e-05, "loss": 1.2047, "step": 5184 }, { "epoch": 0.13655517513826707, "grad_norm": 1.4114418029785156, "learning_rate": 4.318014221754016e-05, "loss": 1.9129, "step": 5185 }, { "epoch": 0.13658151171977878, "grad_norm": 2.114582061767578, "learning_rate": 4.317882538846458e-05, "loss": 1.8152, "step": 5186 }, { "epoch": 0.1366078483012905, "grad_norm": 2.897184133529663, "learning_rate": 4.3177508559388995e-05, "loss": 1.8791, "step": 5187 }, { "epoch": 0.1366341848828022, "grad_norm": 3.748249053955078, "learning_rate": 4.317619173031341e-05, "loss": 1.0289, "step": 5188 }, { "epoch": 0.13666052146431393, "grad_norm": 2.620304584503174, "learning_rate": 4.317487490123782e-05, "loss": 1.1702, "step": 5189 }, { "epoch": 0.13668685804582564, "grad_norm": 3.8103697299957275, "learning_rate": 4.3173558072162235e-05, "loss": 1.7618, "step": 5190 }, { "epoch": 0.13671319462733736, "grad_norm": 4.340106964111328, "learning_rate": 4.317224124308665e-05, "loss": 1.4328, "step": 5191 }, { "epoch": 0.1367395312088491, "grad_norm": 2.6316447257995605, "learning_rate": 4.3170924414011066e-05, "loss": 0.9378, "step": 5192 }, { "epoch": 0.13676586779036082, "grad_norm": 2.8905019760131836, "learning_rate": 4.316960758493548e-05, "loss": 1.5811, "step": 5193 }, { "epoch": 0.13679220437187253, "grad_norm": 3.1205954551696777, "learning_rate": 4.316829075585989e-05, "loss": 2.4785, 
"step": 5194 }, { "epoch": 0.13681854095338425, "grad_norm": 4.276710510253906, "learning_rate": 4.3166973926784306e-05, "loss": 1.0512, "step": 5195 }, { "epoch": 0.13684487753489596, "grad_norm": 1.6796799898147583, "learning_rate": 4.3165657097708715e-05, "loss": 1.81, "step": 5196 }, { "epoch": 0.13687121411640768, "grad_norm": 2.3783931732177734, "learning_rate": 4.316434026863314e-05, "loss": 1.8936, "step": 5197 }, { "epoch": 0.13689755069791942, "grad_norm": 1.7245712280273438, "learning_rate": 4.3163023439557546e-05, "loss": 1.8404, "step": 5198 }, { "epoch": 0.13692388727943114, "grad_norm": 2.432018518447876, "learning_rate": 4.316170661048196e-05, "loss": 1.7911, "step": 5199 }, { "epoch": 0.13695022386094285, "grad_norm": 3.3141655921936035, "learning_rate": 4.316038978140638e-05, "loss": 0.9153, "step": 5200 }, { "epoch": 0.13697656044245457, "grad_norm": 3.472564697265625, "learning_rate": 4.3159072952330786e-05, "loss": 1.5215, "step": 5201 }, { "epoch": 0.13700289702396629, "grad_norm": 2.152303695678711, "learning_rate": 4.315775612325521e-05, "loss": 1.695, "step": 5202 }, { "epoch": 0.137029233605478, "grad_norm": 4.102975845336914, "learning_rate": 4.315643929417962e-05, "loss": 2.8923, "step": 5203 }, { "epoch": 0.13705557018698972, "grad_norm": 1.769879937171936, "learning_rate": 4.315512246510403e-05, "loss": 2.0335, "step": 5204 }, { "epoch": 0.13708190676850146, "grad_norm": 2.633875608444214, "learning_rate": 4.315380563602844e-05, "loss": 2.0759, "step": 5205 }, { "epoch": 0.13710824335001318, "grad_norm": 1.999447226524353, "learning_rate": 4.3152488806952864e-05, "loss": 1.6448, "step": 5206 }, { "epoch": 0.1371345799315249, "grad_norm": 3.656423568725586, "learning_rate": 4.315117197787727e-05, "loss": 1.7968, "step": 5207 }, { "epoch": 0.1371609165130366, "grad_norm": 1.698837161064148, "learning_rate": 4.314985514880169e-05, "loss": 1.7172, "step": 5208 }, { "epoch": 0.13718725309454832, "grad_norm": 2.1964449882507324, 
"learning_rate": 4.3148538319726104e-05, "loss": 1.7485, "step": 5209 }, { "epoch": 0.13721358967606004, "grad_norm": 6.2827839851379395, "learning_rate": 4.314722149065051e-05, "loss": 1.9537, "step": 5210 }, { "epoch": 0.13723992625757175, "grad_norm": 1.9901573657989502, "learning_rate": 4.3145904661574935e-05, "loss": 1.5434, "step": 5211 }, { "epoch": 0.1372662628390835, "grad_norm": 2.386258125305176, "learning_rate": 4.3144587832499344e-05, "loss": 1.7594, "step": 5212 }, { "epoch": 0.1372925994205952, "grad_norm": 1.9436391592025757, "learning_rate": 4.314327100342376e-05, "loss": 1.5699, "step": 5213 }, { "epoch": 0.13731893600210693, "grad_norm": 1.8265070915222168, "learning_rate": 4.314195417434817e-05, "loss": 1.7264, "step": 5214 }, { "epoch": 0.13734527258361864, "grad_norm": 4.723089218139648, "learning_rate": 4.3140637345272584e-05, "loss": 2.0678, "step": 5215 }, { "epoch": 0.13737160916513036, "grad_norm": 2.4445502758026123, "learning_rate": 4.3139320516197e-05, "loss": 2.1188, "step": 5216 }, { "epoch": 0.13739794574664208, "grad_norm": 4.167450904846191, "learning_rate": 4.3138003687121415e-05, "loss": 1.5661, "step": 5217 }, { "epoch": 0.13742428232815382, "grad_norm": 3.330721616744995, "learning_rate": 4.3136686858045824e-05, "loss": 1.8171, "step": 5218 }, { "epoch": 0.13745061890966553, "grad_norm": 2.3990249633789062, "learning_rate": 4.313537002897024e-05, "loss": 1.7148, "step": 5219 }, { "epoch": 0.13747695549117725, "grad_norm": 2.406870126724243, "learning_rate": 4.3134053199894655e-05, "loss": 1.3883, "step": 5220 }, { "epoch": 0.13750329207268897, "grad_norm": 3.1100730895996094, "learning_rate": 4.313273637081907e-05, "loss": 1.6218, "step": 5221 }, { "epoch": 0.13752962865420068, "grad_norm": 2.6337523460388184, "learning_rate": 4.3131419541743486e-05, "loss": 2.2444, "step": 5222 }, { "epoch": 0.1375559652357124, "grad_norm": 1.8023779392242432, "learning_rate": 4.3130102712667895e-05, "loss": 1.7551, "step": 5223 }, { "epoch": 
0.1375823018172241, "grad_norm": 2.707688570022583, "learning_rate": 4.312878588359231e-05, "loss": 1.931, "step": 5224 }, { "epoch": 0.13760863839873586, "grad_norm": 1.6479952335357666, "learning_rate": 4.3127469054516726e-05, "loss": 1.5038, "step": 5225 }, { "epoch": 0.13763497498024757, "grad_norm": 2.9928996562957764, "learning_rate": 4.312615222544114e-05, "loss": 1.4031, "step": 5226 }, { "epoch": 0.1376613115617593, "grad_norm": 2.356449842453003, "learning_rate": 4.312483539636555e-05, "loss": 1.7151, "step": 5227 }, { "epoch": 0.137687648143271, "grad_norm": 5.041073799133301, "learning_rate": 4.3123518567289966e-05, "loss": 0.9312, "step": 5228 }, { "epoch": 0.13771398472478272, "grad_norm": 3.6362922191619873, "learning_rate": 4.312220173821438e-05, "loss": 0.6874, "step": 5229 }, { "epoch": 0.13774032130629443, "grad_norm": 2.334411144256592, "learning_rate": 4.31208849091388e-05, "loss": 1.7671, "step": 5230 }, { "epoch": 0.13776665788780618, "grad_norm": 1.6797144412994385, "learning_rate": 4.311956808006321e-05, "loss": 1.4012, "step": 5231 }, { "epoch": 0.1377929944693179, "grad_norm": 4.525765895843506, "learning_rate": 4.311825125098762e-05, "loss": 0.6192, "step": 5232 }, { "epoch": 0.1378193310508296, "grad_norm": 3.5723278522491455, "learning_rate": 4.311693442191204e-05, "loss": 1.1591, "step": 5233 }, { "epoch": 0.13784566763234132, "grad_norm": 1.8095427751541138, "learning_rate": 4.3115617592836446e-05, "loss": 2.1363, "step": 5234 }, { "epoch": 0.13787200421385304, "grad_norm": 2.4696295261383057, "learning_rate": 4.311430076376087e-05, "loss": 1.5249, "step": 5235 }, { "epoch": 0.13789834079536475, "grad_norm": 1.9217630624771118, "learning_rate": 4.311298393468528e-05, "loss": 0.9953, "step": 5236 }, { "epoch": 0.13792467737687647, "grad_norm": 7.475039482116699, "learning_rate": 4.311166710560969e-05, "loss": 2.0335, "step": 5237 }, { "epoch": 0.1379510139583882, "grad_norm": 2.167692184448242, "learning_rate": 4.311035027653411e-05, 
"loss": 1.4621, "step": 5238 }, { "epoch": 0.13797735053989993, "grad_norm": 1.872039794921875, "learning_rate": 4.3109033447458524e-05, "loss": 1.7059, "step": 5239 }, { "epoch": 0.13800368712141164, "grad_norm": 1.7853387594223022, "learning_rate": 4.310771661838294e-05, "loss": 2.2233, "step": 5240 }, { "epoch": 0.13803002370292336, "grad_norm": 2.5420360565185547, "learning_rate": 4.310639978930735e-05, "loss": 1.4443, "step": 5241 }, { "epoch": 0.13805636028443508, "grad_norm": 2.502525806427002, "learning_rate": 4.3105082960231764e-05, "loss": 0.9432, "step": 5242 }, { "epoch": 0.1380826968659468, "grad_norm": 2.235863447189331, "learning_rate": 4.310376613115617e-05, "loss": 1.5561, "step": 5243 }, { "epoch": 0.1381090334474585, "grad_norm": 1.5832233428955078, "learning_rate": 4.3102449302080596e-05, "loss": 1.6918, "step": 5244 }, { "epoch": 0.13813537002897025, "grad_norm": 3.3127658367156982, "learning_rate": 4.3101132473005004e-05, "loss": 2.0622, "step": 5245 }, { "epoch": 0.13816170661048197, "grad_norm": 2.0305511951446533, "learning_rate": 4.309981564392942e-05, "loss": 2.0898, "step": 5246 }, { "epoch": 0.13818804319199368, "grad_norm": 1.9707815647125244, "learning_rate": 4.3098498814853836e-05, "loss": 2.4752, "step": 5247 }, { "epoch": 0.1382143797735054, "grad_norm": 2.595893144607544, "learning_rate": 4.3097181985778244e-05, "loss": 2.2197, "step": 5248 }, { "epoch": 0.1382407163550171, "grad_norm": 3.9535906314849854, "learning_rate": 4.309586515670267e-05, "loss": 0.9025, "step": 5249 }, { "epoch": 0.13826705293652883, "grad_norm": 8.726988792419434, "learning_rate": 4.3094548327627076e-05, "loss": 1.3627, "step": 5250 }, { "epoch": 0.13829338951804057, "grad_norm": 2.1553943157196045, "learning_rate": 4.309323149855149e-05, "loss": 1.641, "step": 5251 }, { "epoch": 0.1383197260995523, "grad_norm": 3.6073081493377686, "learning_rate": 4.30919146694759e-05, "loss": 1.3084, "step": 5252 }, { "epoch": 0.138346062681064, "grad_norm": 
1.9121544361114502, "learning_rate": 4.309059784040032e-05, "loss": 1.9856, "step": 5253 }, { "epoch": 0.13837239926257572, "grad_norm": 2.1146183013916016, "learning_rate": 4.308928101132473e-05, "loss": 1.2974, "step": 5254 }, { "epoch": 0.13839873584408743, "grad_norm": 2.2476680278778076, "learning_rate": 4.308796418224915e-05, "loss": 1.7303, "step": 5255 }, { "epoch": 0.13842507242559915, "grad_norm": 1.6491483449935913, "learning_rate": 4.308664735317356e-05, "loss": 1.6044, "step": 5256 }, { "epoch": 0.13845140900711086, "grad_norm": 1.7803096771240234, "learning_rate": 4.308533052409797e-05, "loss": 1.233, "step": 5257 }, { "epoch": 0.1384777455886226, "grad_norm": 2.172182321548462, "learning_rate": 4.3084013695022394e-05, "loss": 2.0498, "step": 5258 }, { "epoch": 0.13850408217013432, "grad_norm": 2.2713263034820557, "learning_rate": 4.30826968659468e-05, "loss": 1.7635, "step": 5259 }, { "epoch": 0.13853041875164604, "grad_norm": 1.8058420419692993, "learning_rate": 4.308138003687122e-05, "loss": 0.3705, "step": 5260 }, { "epoch": 0.13855675533315776, "grad_norm": 2.0371696949005127, "learning_rate": 4.308006320779563e-05, "loss": 1.9953, "step": 5261 }, { "epoch": 0.13858309191466947, "grad_norm": 3.878199338912964, "learning_rate": 4.307874637872004e-05, "loss": 2.118, "step": 5262 }, { "epoch": 0.1386094284961812, "grad_norm": 1.545540452003479, "learning_rate": 4.307742954964446e-05, "loss": 1.3873, "step": 5263 }, { "epoch": 0.1386357650776929, "grad_norm": 2.0873782634735107, "learning_rate": 4.3076112720568874e-05, "loss": 1.6184, "step": 5264 }, { "epoch": 0.13866210165920465, "grad_norm": 2.4241604804992676, "learning_rate": 4.307479589149329e-05, "loss": 2.1249, "step": 5265 }, { "epoch": 0.13868843824071636, "grad_norm": 2.53702974319458, "learning_rate": 4.30734790624177e-05, "loss": 1.8551, "step": 5266 }, { "epoch": 0.13871477482222808, "grad_norm": 3.770756721496582, "learning_rate": 4.3072162233342114e-05, "loss": 1.0779, "step": 5267 }, 
{ "epoch": 0.1387411114037398, "grad_norm": 2.1507341861724854, "learning_rate": 4.307084540426653e-05, "loss": 1.3775, "step": 5268 }, { "epoch": 0.1387674479852515, "grad_norm": 1.8591163158416748, "learning_rate": 4.3069528575190945e-05, "loss": 1.2577, "step": 5269 }, { "epoch": 0.13879378456676322, "grad_norm": 1.8454272747039795, "learning_rate": 4.3068211746115354e-05, "loss": 2.4208, "step": 5270 }, { "epoch": 0.13882012114827497, "grad_norm": 3.1865649223327637, "learning_rate": 4.306689491703977e-05, "loss": 1.126, "step": 5271 }, { "epoch": 0.13884645772978668, "grad_norm": 2.2785346508026123, "learning_rate": 4.3065578087964185e-05, "loss": 1.7934, "step": 5272 }, { "epoch": 0.1388727943112984, "grad_norm": 1.8782856464385986, "learning_rate": 4.30642612588886e-05, "loss": 1.7596, "step": 5273 }, { "epoch": 0.1388991308928101, "grad_norm": 2.8826820850372314, "learning_rate": 4.306294442981301e-05, "loss": 1.2076, "step": 5274 }, { "epoch": 0.13892546747432183, "grad_norm": 1.967508316040039, "learning_rate": 4.3061627600737425e-05, "loss": 1.2397, "step": 5275 }, { "epoch": 0.13895180405583354, "grad_norm": 2.5633678436279297, "learning_rate": 4.306031077166184e-05, "loss": 1.5113, "step": 5276 }, { "epoch": 0.13897814063734526, "grad_norm": 2.3238189220428467, "learning_rate": 4.3058993942586256e-05, "loss": 1.415, "step": 5277 }, { "epoch": 0.139004477218857, "grad_norm": 2.784306287765503, "learning_rate": 4.305767711351067e-05, "loss": 0.7378, "step": 5278 }, { "epoch": 0.13903081380036872, "grad_norm": 1.964415192604065, "learning_rate": 4.305636028443508e-05, "loss": 2.1115, "step": 5279 }, { "epoch": 0.13905715038188043, "grad_norm": 2.1276159286499023, "learning_rate": 4.3055043455359496e-05, "loss": 0.8108, "step": 5280 }, { "epoch": 0.13908348696339215, "grad_norm": 2.8581173419952393, "learning_rate": 4.3053726626283905e-05, "loss": 2.1912, "step": 5281 }, { "epoch": 0.13910982354490387, "grad_norm": 2.2042317390441895, "learning_rate": 
4.305240979720833e-05, "loss": 2.4503, "step": 5282 }, { "epoch": 0.13913616012641558, "grad_norm": 1.9640851020812988, "learning_rate": 4.3051092968132736e-05, "loss": 1.9167, "step": 5283 }, { "epoch": 0.13916249670792732, "grad_norm": 1.6979402303695679, "learning_rate": 4.304977613905715e-05, "loss": 1.6881, "step": 5284 }, { "epoch": 0.13918883328943904, "grad_norm": 3.2987239360809326, "learning_rate": 4.304845930998157e-05, "loss": 1.9442, "step": 5285 }, { "epoch": 0.13921516987095076, "grad_norm": 2.034170150756836, "learning_rate": 4.304714248090598e-05, "loss": 1.5473, "step": 5286 }, { "epoch": 0.13924150645246247, "grad_norm": 1.9552794694900513, "learning_rate": 4.30458256518304e-05, "loss": 1.7752, "step": 5287 }, { "epoch": 0.1392678430339742, "grad_norm": 3.249704599380493, "learning_rate": 4.304450882275481e-05, "loss": 1.486, "step": 5288 }, { "epoch": 0.1392941796154859, "grad_norm": 2.3470027446746826, "learning_rate": 4.304319199367922e-05, "loss": 2.0688, "step": 5289 }, { "epoch": 0.13932051619699762, "grad_norm": 3.5709376335144043, "learning_rate": 4.304187516460363e-05, "loss": 2.2988, "step": 5290 }, { "epoch": 0.13934685277850936, "grad_norm": 1.8416324853897095, "learning_rate": 4.3040558335528054e-05, "loss": 1.2856, "step": 5291 }, { "epoch": 0.13937318936002108, "grad_norm": 1.8821313381195068, "learning_rate": 4.303924150645246e-05, "loss": 1.5477, "step": 5292 }, { "epoch": 0.1393995259415328, "grad_norm": 2.9171810150146484, "learning_rate": 4.303792467737688e-05, "loss": 1.3891, "step": 5293 }, { "epoch": 0.1394258625230445, "grad_norm": 3.3034346103668213, "learning_rate": 4.3036607848301294e-05, "loss": 0.7083, "step": 5294 }, { "epoch": 0.13945219910455622, "grad_norm": 1.651649832725525, "learning_rate": 4.30352910192257e-05, "loss": 1.6374, "step": 5295 }, { "epoch": 0.13947853568606794, "grad_norm": 2.5994820594787598, "learning_rate": 4.3033974190150125e-05, "loss": 1.5246, "step": 5296 }, { "epoch": 0.13950487226757965, 
"grad_norm": 2.1049609184265137, "learning_rate": 4.3032657361074534e-05, "loss": 2.0054, "step": 5297 }, { "epoch": 0.1395312088490914, "grad_norm": 1.8851418495178223, "learning_rate": 4.303134053199895e-05, "loss": 1.5025, "step": 5298 }, { "epoch": 0.1395575454306031, "grad_norm": 3.3094029426574707, "learning_rate": 4.303002370292336e-05, "loss": 1.6932, "step": 5299 }, { "epoch": 0.13958388201211483, "grad_norm": 2.852307081222534, "learning_rate": 4.3028706873847774e-05, "loss": 1.4242, "step": 5300 }, { "epoch": 0.13961021859362654, "grad_norm": 1.9864132404327393, "learning_rate": 4.302739004477219e-05, "loss": 1.6872, "step": 5301 }, { "epoch": 0.13963655517513826, "grad_norm": 1.6435332298278809, "learning_rate": 4.3026073215696605e-05, "loss": 1.5577, "step": 5302 }, { "epoch": 0.13966289175664998, "grad_norm": 4.0532026290893555, "learning_rate": 4.302475638662102e-05, "loss": 0.7639, "step": 5303 }, { "epoch": 0.13968922833816172, "grad_norm": 2.8549790382385254, "learning_rate": 4.302343955754543e-05, "loss": 2.1158, "step": 5304 }, { "epoch": 0.13971556491967344, "grad_norm": 2.293412208557129, "learning_rate": 4.302212272846985e-05, "loss": 1.6977, "step": 5305 }, { "epoch": 0.13974190150118515, "grad_norm": 1.766440510749817, "learning_rate": 4.302080589939426e-05, "loss": 1.617, "step": 5306 }, { "epoch": 0.13976823808269687, "grad_norm": 3.213672399520874, "learning_rate": 4.3019489070318677e-05, "loss": 1.5126, "step": 5307 }, { "epoch": 0.13979457466420858, "grad_norm": 3.596798896789551, "learning_rate": 4.3018172241243085e-05, "loss": 0.8919, "step": 5308 }, { "epoch": 0.1398209112457203, "grad_norm": 2.3942911624908447, "learning_rate": 4.30168554121675e-05, "loss": 0.3629, "step": 5309 }, { "epoch": 0.139847247827232, "grad_norm": 3.009469985961914, "learning_rate": 4.3015538583091917e-05, "loss": 1.8567, "step": 5310 }, { "epoch": 0.13987358440874376, "grad_norm": 1.6067448854446411, "learning_rate": 4.301422175401633e-05, "loss": 1.6066, 
"step": 5311 }, { "epoch": 0.13989992099025547, "grad_norm": 1.8433202505111694, "learning_rate": 4.301290492494075e-05, "loss": 1.7712, "step": 5312 }, { "epoch": 0.1399262575717672, "grad_norm": 2.316406011581421, "learning_rate": 4.3011588095865157e-05, "loss": 1.5872, "step": 5313 }, { "epoch": 0.1399525941532789, "grad_norm": 1.78685462474823, "learning_rate": 4.301027126678957e-05, "loss": 1.6376, "step": 5314 }, { "epoch": 0.13997893073479062, "grad_norm": 3.2384531497955322, "learning_rate": 4.300895443771399e-05, "loss": 1.6906, "step": 5315 }, { "epoch": 0.14000526731630233, "grad_norm": 2.1208434104919434, "learning_rate": 4.30076376086384e-05, "loss": 1.7709, "step": 5316 }, { "epoch": 0.14003160389781408, "grad_norm": 2.973423480987549, "learning_rate": 4.300632077956281e-05, "loss": 1.4483, "step": 5317 }, { "epoch": 0.1400579404793258, "grad_norm": 5.9371137619018555, "learning_rate": 4.300500395048723e-05, "loss": 1.6957, "step": 5318 }, { "epoch": 0.1400842770608375, "grad_norm": 2.1459813117980957, "learning_rate": 4.3003687121411643e-05, "loss": 2.0978, "step": 5319 }, { "epoch": 0.14011061364234922, "grad_norm": 2.6176798343658447, "learning_rate": 4.300237029233606e-05, "loss": 1.3801, "step": 5320 }, { "epoch": 0.14013695022386094, "grad_norm": 2.8769965171813965, "learning_rate": 4.300105346326047e-05, "loss": 1.9502, "step": 5321 }, { "epoch": 0.14016328680537266, "grad_norm": 2.8037261962890625, "learning_rate": 4.2999736634184883e-05, "loss": 2.1592, "step": 5322 }, { "epoch": 0.14018962338688437, "grad_norm": 3.456369161605835, "learning_rate": 4.29984198051093e-05, "loss": 1.4191, "step": 5323 }, { "epoch": 0.14021595996839611, "grad_norm": 1.7822257280349731, "learning_rate": 4.2997102976033715e-05, "loss": 2.4011, "step": 5324 }, { "epoch": 0.14024229654990783, "grad_norm": 2.4780359268188477, "learning_rate": 4.299578614695813e-05, "loss": 2.3557, "step": 5325 }, { "epoch": 0.14026863313141955, "grad_norm": 2.358567714691162, 
"learning_rate": 4.299446931788254e-05, "loss": 2.1387, "step": 5326 }, { "epoch": 0.14029496971293126, "grad_norm": 2.3652851581573486, "learning_rate": 4.2993152488806955e-05, "loss": 1.8933, "step": 5327 }, { "epoch": 0.14032130629444298, "grad_norm": 1.890342354774475, "learning_rate": 4.2991835659731363e-05, "loss": 1.6166, "step": 5328 }, { "epoch": 0.1403476428759547, "grad_norm": 1.642198920249939, "learning_rate": 4.2990518830655786e-05, "loss": 1.934, "step": 5329 }, { "epoch": 0.1403739794574664, "grad_norm": 2.969290018081665, "learning_rate": 4.2989202001580195e-05, "loss": 1.0556, "step": 5330 }, { "epoch": 0.14040031603897815, "grad_norm": 3.4994609355926514, "learning_rate": 4.298788517250461e-05, "loss": 1.791, "step": 5331 }, { "epoch": 0.14042665262048987, "grad_norm": 2.1280102729797363, "learning_rate": 4.2986568343429026e-05, "loss": 1.4283, "step": 5332 }, { "epoch": 0.14045298920200158, "grad_norm": 2.64162540435791, "learning_rate": 4.2985251514353435e-05, "loss": 1.8572, "step": 5333 }, { "epoch": 0.1404793257835133, "grad_norm": 2.2723042964935303, "learning_rate": 4.298393468527786e-05, "loss": 1.9276, "step": 5334 }, { "epoch": 0.140505662365025, "grad_norm": 2.2977218627929688, "learning_rate": 4.2982617856202266e-05, "loss": 0.5672, "step": 5335 }, { "epoch": 0.14053199894653673, "grad_norm": 2.1390562057495117, "learning_rate": 4.298130102712668e-05, "loss": 1.6938, "step": 5336 }, { "epoch": 0.14055833552804847, "grad_norm": 5.347156524658203, "learning_rate": 4.297998419805109e-05, "loss": 0.322, "step": 5337 }, { "epoch": 0.1405846721095602, "grad_norm": 4.87186861038208, "learning_rate": 4.297866736897551e-05, "loss": 2.3787, "step": 5338 }, { "epoch": 0.1406110086910719, "grad_norm": 1.8002957105636597, "learning_rate": 4.297735053989992e-05, "loss": 1.7799, "step": 5339 }, { "epoch": 0.14063734527258362, "grad_norm": 2.777805805206299, "learning_rate": 4.297603371082434e-05, "loss": 0.7216, "step": 5340 }, { "epoch": 
0.14066368185409533, "grad_norm": 4.567877769470215, "learning_rate": 4.297471688174875e-05, "loss": 1.4741, "step": 5341 }, { "epoch": 0.14069001843560705, "grad_norm": 4.0375800132751465, "learning_rate": 4.297340005267316e-05, "loss": 2.5037, "step": 5342 }, { "epoch": 0.14071635501711877, "grad_norm": 3.46451735496521, "learning_rate": 4.2972083223597584e-05, "loss": 1.5835, "step": 5343 }, { "epoch": 0.1407426915986305, "grad_norm": 5.962404727935791, "learning_rate": 4.297076639452199e-05, "loss": 0.8314, "step": 5344 }, { "epoch": 0.14076902818014222, "grad_norm": 1.6963344812393188, "learning_rate": 4.296944956544641e-05, "loss": 2.2048, "step": 5345 }, { "epoch": 0.14079536476165394, "grad_norm": 2.7813546657562256, "learning_rate": 4.296813273637082e-05, "loss": 1.7117, "step": 5346 }, { "epoch": 0.14082170134316566, "grad_norm": 1.8465445041656494, "learning_rate": 4.296681590729523e-05, "loss": 2.1499, "step": 5347 }, { "epoch": 0.14084803792467737, "grad_norm": 1.759569764137268, "learning_rate": 4.296549907821965e-05, "loss": 2.4608, "step": 5348 }, { "epoch": 0.1408743745061891, "grad_norm": 3.9533181190490723, "learning_rate": 4.2964182249144064e-05, "loss": 0.9339, "step": 5349 }, { "epoch": 0.1409007110877008, "grad_norm": 2.5577051639556885, "learning_rate": 4.296286542006848e-05, "loss": 1.0319, "step": 5350 }, { "epoch": 0.14092704766921255, "grad_norm": 1.5836293697357178, "learning_rate": 4.296154859099289e-05, "loss": 1.7003, "step": 5351 }, { "epoch": 0.14095338425072426, "grad_norm": 3.7681639194488525, "learning_rate": 4.296023176191731e-05, "loss": 1.4407, "step": 5352 }, { "epoch": 0.14097972083223598, "grad_norm": 4.495939254760742, "learning_rate": 4.295891493284172e-05, "loss": 1.2109, "step": 5353 }, { "epoch": 0.1410060574137477, "grad_norm": 1.6568726301193237, "learning_rate": 4.2957598103766135e-05, "loss": 1.8267, "step": 5354 }, { "epoch": 0.1410323939952594, "grad_norm": 2.114642858505249, "learning_rate": 
4.2956281274690544e-05, "loss": 2.169, "step": 5355 }, { "epoch": 0.14105873057677112, "grad_norm": 3.1813671588897705, "learning_rate": 4.295496444561496e-05, "loss": 2.3141, "step": 5356 }, { "epoch": 0.14108506715828287, "grad_norm": 2.931057929992676, "learning_rate": 4.2953647616539375e-05, "loss": 1.9705, "step": 5357 }, { "epoch": 0.14111140373979458, "grad_norm": 3.206650495529175, "learning_rate": 4.295233078746379e-05, "loss": 1.6772, "step": 5358 }, { "epoch": 0.1411377403213063, "grad_norm": 2.918741464614868, "learning_rate": 4.2951013958388206e-05, "loss": 1.8946, "step": 5359 }, { "epoch": 0.14116407690281801, "grad_norm": 1.9289441108703613, "learning_rate": 4.2949697129312615e-05, "loss": 2.2393, "step": 5360 }, { "epoch": 0.14119041348432973, "grad_norm": 3.684124708175659, "learning_rate": 4.294838030023703e-05, "loss": 1.6195, "step": 5361 }, { "epoch": 0.14121675006584145, "grad_norm": 2.0609798431396484, "learning_rate": 4.2947063471161446e-05, "loss": 2.0449, "step": 5362 }, { "epoch": 0.14124308664735316, "grad_norm": 1.9729681015014648, "learning_rate": 4.294574664208586e-05, "loss": 1.8299, "step": 5363 }, { "epoch": 0.1412694232288649, "grad_norm": 2.7491116523742676, "learning_rate": 4.294442981301027e-05, "loss": 0.3671, "step": 5364 }, { "epoch": 0.14129575981037662, "grad_norm": 3.1971828937530518, "learning_rate": 4.2943112983934686e-05, "loss": 2.3418, "step": 5365 }, { "epoch": 0.14132209639188834, "grad_norm": 2.176037311553955, "learning_rate": 4.29417961548591e-05, "loss": 1.4641, "step": 5366 }, { "epoch": 0.14134843297340005, "grad_norm": 2.338270425796509, "learning_rate": 4.294047932578352e-05, "loss": 2.6389, "step": 5367 }, { "epoch": 0.14137476955491177, "grad_norm": 1.7673707008361816, "learning_rate": 4.293916249670793e-05, "loss": 1.8622, "step": 5368 }, { "epoch": 0.14140110613642348, "grad_norm": 1.4852733612060547, "learning_rate": 4.293784566763234e-05, "loss": 1.9444, "step": 5369 }, { "epoch": 
0.14142744271793523, "grad_norm": 6.920309066772461, "learning_rate": 4.293652883855676e-05, "loss": 1.3008, "step": 5370 }, { "epoch": 0.14145377929944694, "grad_norm": 2.7387514114379883, "learning_rate": 4.293521200948117e-05, "loss": 1.1729, "step": 5371 }, { "epoch": 0.14148011588095866, "grad_norm": 3.4094245433807373, "learning_rate": 4.293389518040559e-05, "loss": 1.4583, "step": 5372 }, { "epoch": 0.14150645246247037, "grad_norm": 1.66677725315094, "learning_rate": 4.293257835133e-05, "loss": 1.1794, "step": 5373 }, { "epoch": 0.1415327890439821, "grad_norm": 3.2916173934936523, "learning_rate": 4.293126152225441e-05, "loss": 1.5495, "step": 5374 }, { "epoch": 0.1415591256254938, "grad_norm": 3.422957181930542, "learning_rate": 4.292994469317882e-05, "loss": 0.886, "step": 5375 }, { "epoch": 0.14158546220700552, "grad_norm": 4.083506107330322, "learning_rate": 4.2928627864103244e-05, "loss": 1.4356, "step": 5376 }, { "epoch": 0.14161179878851726, "grad_norm": 3.9313323497772217, "learning_rate": 4.292731103502765e-05, "loss": 2.0779, "step": 5377 }, { "epoch": 0.14163813537002898, "grad_norm": 1.9234659671783447, "learning_rate": 4.292599420595207e-05, "loss": 1.7068, "step": 5378 }, { "epoch": 0.1416644719515407, "grad_norm": 2.0492684841156006, "learning_rate": 4.2924677376876484e-05, "loss": 1.626, "step": 5379 }, { "epoch": 0.1416908085330524, "grad_norm": 1.9004372358322144, "learning_rate": 4.292336054780089e-05, "loss": 1.7198, "step": 5380 }, { "epoch": 0.14171714511456412, "grad_norm": 3.774111032485962, "learning_rate": 4.2922043718725316e-05, "loss": 1.1441, "step": 5381 }, { "epoch": 0.14174348169607584, "grad_norm": 2.471282482147217, "learning_rate": 4.2920726889649724e-05, "loss": 1.725, "step": 5382 }, { "epoch": 0.14176981827758756, "grad_norm": 3.1594924926757812, "learning_rate": 4.291941006057414e-05, "loss": 0.8389, "step": 5383 }, { "epoch": 0.1417961548590993, "grad_norm": 2.274911403656006, "learning_rate": 4.291809323149855e-05, 
"loss": 2.0022, "step": 5384 }, { "epoch": 0.14182249144061101, "grad_norm": 5.938522815704346, "learning_rate": 4.291677640242297e-05, "loss": 2.0639, "step": 5385 }, { "epoch": 0.14184882802212273, "grad_norm": 6.627666473388672, "learning_rate": 4.291545957334738e-05, "loss": 1.1636, "step": 5386 }, { "epoch": 0.14187516460363445, "grad_norm": 2.3253440856933594, "learning_rate": 4.2914142744271796e-05, "loss": 2.3221, "step": 5387 }, { "epoch": 0.14190150118514616, "grad_norm": 2.9312222003936768, "learning_rate": 4.291282591519621e-05, "loss": 1.3796, "step": 5388 }, { "epoch": 0.14192783776665788, "grad_norm": 1.3603320121765137, "learning_rate": 4.291150908612062e-05, "loss": 1.706, "step": 5389 }, { "epoch": 0.14195417434816962, "grad_norm": 1.9491292238235474, "learning_rate": 4.291019225704504e-05, "loss": 1.3716, "step": 5390 }, { "epoch": 0.14198051092968134, "grad_norm": 1.877495288848877, "learning_rate": 4.290887542796945e-05, "loss": 2.1832, "step": 5391 }, { "epoch": 0.14200684751119305, "grad_norm": 1.968705415725708, "learning_rate": 4.290755859889387e-05, "loss": 1.839, "step": 5392 }, { "epoch": 0.14203318409270477, "grad_norm": 2.91729736328125, "learning_rate": 4.2906241769818276e-05, "loss": 1.793, "step": 5393 }, { "epoch": 0.14205952067421648, "grad_norm": 2.32688570022583, "learning_rate": 4.290492494074269e-05, "loss": 1.7722, "step": 5394 }, { "epoch": 0.1420858572557282, "grad_norm": 3.8894801139831543, "learning_rate": 4.290360811166711e-05, "loss": 1.7161, "step": 5395 }, { "epoch": 0.14211219383723991, "grad_norm": 3.7999372482299805, "learning_rate": 4.290229128259152e-05, "loss": 2.0999, "step": 5396 }, { "epoch": 0.14213853041875166, "grad_norm": 3.7862586975097656, "learning_rate": 4.290097445351594e-05, "loss": 0.7159, "step": 5397 }, { "epoch": 0.14216486700026337, "grad_norm": 1.9424610137939453, "learning_rate": 4.289965762444035e-05, "loss": 1.9509, "step": 5398 }, { "epoch": 0.1421912035817751, "grad_norm": 
2.3114311695098877, "learning_rate": 4.289834079536477e-05, "loss": 1.5204, "step": 5399 }, { "epoch": 0.1422175401632868, "grad_norm": 3.0062406063079834, "learning_rate": 4.289702396628918e-05, "loss": 0.6657, "step": 5400 }, { "epoch": 0.14224387674479852, "grad_norm": 2.8383936882019043, "learning_rate": 4.2895707137213594e-05, "loss": 1.4026, "step": 5401 }, { "epoch": 0.14227021332631024, "grad_norm": 1.8685115575790405, "learning_rate": 4.2894390308138e-05, "loss": 1.484, "step": 5402 }, { "epoch": 0.14229654990782195, "grad_norm": 6.029784679412842, "learning_rate": 4.289307347906242e-05, "loss": 2.1724, "step": 5403 }, { "epoch": 0.1423228864893337, "grad_norm": 2.173064947128296, "learning_rate": 4.2891756649986834e-05, "loss": 1.3124, "step": 5404 }, { "epoch": 0.1423492230708454, "grad_norm": 1.7509270906448364, "learning_rate": 4.289043982091125e-05, "loss": 1.7561, "step": 5405 }, { "epoch": 0.14237555965235713, "grad_norm": 5.834602355957031, "learning_rate": 4.2889122991835665e-05, "loss": 1.899, "step": 5406 }, { "epoch": 0.14240189623386884, "grad_norm": 2.7962148189544678, "learning_rate": 4.2887806162760074e-05, "loss": 1.3563, "step": 5407 }, { "epoch": 0.14242823281538056, "grad_norm": 4.1680498123168945, "learning_rate": 4.288648933368449e-05, "loss": 1.2368, "step": 5408 }, { "epoch": 0.14245456939689227, "grad_norm": 1.9682034254074097, "learning_rate": 4.2885172504608905e-05, "loss": 2.9022, "step": 5409 }, { "epoch": 0.14248090597840402, "grad_norm": 1.7193231582641602, "learning_rate": 4.288385567553332e-05, "loss": 2.0055, "step": 5410 }, { "epoch": 0.14250724255991573, "grad_norm": 1.616080641746521, "learning_rate": 4.288253884645773e-05, "loss": 1.0752, "step": 5411 }, { "epoch": 0.14253357914142745, "grad_norm": 3.5982000827789307, "learning_rate": 4.2881222017382145e-05, "loss": 1.8522, "step": 5412 }, { "epoch": 0.14255991572293916, "grad_norm": 1.935608983039856, "learning_rate": 4.287990518830656e-05, "loss": 2.0533, "step": 
5413 }, { "epoch": 0.14258625230445088, "grad_norm": 2.089115619659424, "learning_rate": 4.2878588359230976e-05, "loss": 1.4031, "step": 5414 }, { "epoch": 0.1426125888859626, "grad_norm": 1.8780349493026733, "learning_rate": 4.287727153015539e-05, "loss": 2.3138, "step": 5415 }, { "epoch": 0.1426389254674743, "grad_norm": 2.8866679668426514, "learning_rate": 4.28759547010798e-05, "loss": 2.4278, "step": 5416 }, { "epoch": 0.14266526204898605, "grad_norm": 2.3272926807403564, "learning_rate": 4.2874637872004216e-05, "loss": 0.3782, "step": 5417 }, { "epoch": 0.14269159863049777, "grad_norm": 1.9832936525344849, "learning_rate": 4.287332104292863e-05, "loss": 2.0203, "step": 5418 }, { "epoch": 0.14271793521200948, "grad_norm": 8.99078369140625, "learning_rate": 4.287200421385305e-05, "loss": 1.1857, "step": 5419 }, { "epoch": 0.1427442717935212, "grad_norm": 2.8711023330688477, "learning_rate": 4.2870687384777456e-05, "loss": 1.5664, "step": 5420 }, { "epoch": 0.14277060837503291, "grad_norm": 2.628261089324951, "learning_rate": 4.286937055570187e-05, "loss": 1.4518, "step": 5421 }, { "epoch": 0.14279694495654463, "grad_norm": 1.844959020614624, "learning_rate": 4.286805372662628e-05, "loss": 1.6122, "step": 5422 }, { "epoch": 0.14282328153805637, "grad_norm": 2.2086498737335205, "learning_rate": 4.28667368975507e-05, "loss": 0.5435, "step": 5423 }, { "epoch": 0.1428496181195681, "grad_norm": 2.238442897796631, "learning_rate": 4.286542006847511e-05, "loss": 1.8622, "step": 5424 }, { "epoch": 0.1428759547010798, "grad_norm": 2.8718161582946777, "learning_rate": 4.286410323939953e-05, "loss": 0.86, "step": 5425 }, { "epoch": 0.14290229128259152, "grad_norm": 2.1810548305511475, "learning_rate": 4.286278641032394e-05, "loss": 1.9892, "step": 5426 }, { "epoch": 0.14292862786410324, "grad_norm": 2.9680850505828857, "learning_rate": 4.286146958124835e-05, "loss": 1.5726, "step": 5427 }, { "epoch": 0.14295496444561495, "grad_norm": 2.748436450958252, "learning_rate": 
4.2860152752172774e-05, "loss": 0.9523, "step": 5428 }, { "epoch": 0.14298130102712667, "grad_norm": 2.198032855987549, "learning_rate": 4.285883592309718e-05, "loss": 1.8121, "step": 5429 }, { "epoch": 0.1430076376086384, "grad_norm": 1.7660022974014282, "learning_rate": 4.28575190940216e-05, "loss": 1.6698, "step": 5430 }, { "epoch": 0.14303397419015013, "grad_norm": 1.730087399482727, "learning_rate": 4.285620226494601e-05, "loss": 1.7881, "step": 5431 }, { "epoch": 0.14306031077166184, "grad_norm": 2.480919599533081, "learning_rate": 4.285488543587043e-05, "loss": 1.8092, "step": 5432 }, { "epoch": 0.14308664735317356, "grad_norm": 1.8338545560836792, "learning_rate": 4.285356860679484e-05, "loss": 1.4528, "step": 5433 }, { "epoch": 0.14311298393468527, "grad_norm": 1.7347301244735718, "learning_rate": 4.2852251777719254e-05, "loss": 1.6911, "step": 5434 }, { "epoch": 0.143139320516197, "grad_norm": 4.22085428237915, "learning_rate": 4.285093494864367e-05, "loss": 1.1755, "step": 5435 }, { "epoch": 0.1431656570977087, "grad_norm": 2.8604445457458496, "learning_rate": 4.284961811956808e-05, "loss": 1.2737, "step": 5436 }, { "epoch": 0.14319199367922045, "grad_norm": 1.9961583614349365, "learning_rate": 4.28483012904925e-05, "loss": 1.8545, "step": 5437 }, { "epoch": 0.14321833026073216, "grad_norm": 4.361468315124512, "learning_rate": 4.284698446141691e-05, "loss": 2.0559, "step": 5438 }, { "epoch": 0.14324466684224388, "grad_norm": 3.103485584259033, "learning_rate": 4.2845667632341325e-05, "loss": 1.7961, "step": 5439 }, { "epoch": 0.1432710034237556, "grad_norm": 5.533717632293701, "learning_rate": 4.2844350803265734e-05, "loss": 1.6107, "step": 5440 }, { "epoch": 0.1432973400052673, "grad_norm": 2.6506903171539307, "learning_rate": 4.284303397419015e-05, "loss": 1.771, "step": 5441 }, { "epoch": 0.14332367658677903, "grad_norm": 2.8953254222869873, "learning_rate": 4.2841717145114565e-05, "loss": 1.1778, "step": 5442 }, { "epoch": 0.14335001316829077, 
"grad_norm": 1.9882150888442993, "learning_rate": 4.284040031603898e-05, "loss": 0.4579, "step": 5443 }, { "epoch": 0.14337634974980248, "grad_norm": 1.5002952814102173, "learning_rate": 4.2839083486963397e-05, "loss": 1.9492, "step": 5444 }, { "epoch": 0.1434026863313142, "grad_norm": 3.0559091567993164, "learning_rate": 4.2837766657887805e-05, "loss": 1.5274, "step": 5445 }, { "epoch": 0.14342902291282592, "grad_norm": 2.09110689163208, "learning_rate": 4.283644982881222e-05, "loss": 1.7752, "step": 5446 }, { "epoch": 0.14345535949433763, "grad_norm": 3.924508571624756, "learning_rate": 4.283513299973664e-05, "loss": 1.5881, "step": 5447 }, { "epoch": 0.14348169607584935, "grad_norm": 1.88231360912323, "learning_rate": 4.283381617066105e-05, "loss": 2.0659, "step": 5448 }, { "epoch": 0.14350803265736106, "grad_norm": 4.032523155212402, "learning_rate": 4.283249934158546e-05, "loss": 1.7669, "step": 5449 }, { "epoch": 0.1435343692388728, "grad_norm": 2.589891195297241, "learning_rate": 4.283118251250988e-05, "loss": 1.5581, "step": 5450 }, { "epoch": 0.14356070582038452, "grad_norm": 2.022139310836792, "learning_rate": 4.282986568343429e-05, "loss": 1.7378, "step": 5451 }, { "epoch": 0.14358704240189624, "grad_norm": 1.6292555332183838, "learning_rate": 4.282854885435871e-05, "loss": 2.0916, "step": 5452 }, { "epoch": 0.14361337898340795, "grad_norm": 2.1246378421783447, "learning_rate": 4.2827232025283123e-05, "loss": 1.8504, "step": 5453 }, { "epoch": 0.14363971556491967, "grad_norm": 5.700298309326172, "learning_rate": 4.282591519620753e-05, "loss": 1.1815, "step": 5454 }, { "epoch": 0.14366605214643138, "grad_norm": 2.0003139972686768, "learning_rate": 4.282459836713195e-05, "loss": 1.9312, "step": 5455 }, { "epoch": 0.14369238872794313, "grad_norm": 2.5048341751098633, "learning_rate": 4.2823281538056363e-05, "loss": 1.8645, "step": 5456 }, { "epoch": 0.14371872530945484, "grad_norm": 1.7863818407058716, "learning_rate": 4.282196470898078e-05, "loss": 1.9543, 
"step": 5457 }, { "epoch": 0.14374506189096656, "grad_norm": 3.901688814163208, "learning_rate": 4.282064787990519e-05, "loss": 1.1884, "step": 5458 }, { "epoch": 0.14377139847247827, "grad_norm": 1.991378664970398, "learning_rate": 4.2819331050829603e-05, "loss": 2.0037, "step": 5459 }, { "epoch": 0.14379773505399, "grad_norm": 3.321007013320923, "learning_rate": 4.281801422175402e-05, "loss": 1.3125, "step": 5460 }, { "epoch": 0.1438240716355017, "grad_norm": 2.6633682250976562, "learning_rate": 4.2816697392678435e-05, "loss": 1.9761, "step": 5461 }, { "epoch": 0.14385040821701342, "grad_norm": 1.621220350265503, "learning_rate": 4.281538056360285e-05, "loss": 0.544, "step": 5462 }, { "epoch": 0.14387674479852516, "grad_norm": 3.61556077003479, "learning_rate": 4.281406373452726e-05, "loss": 2.1541, "step": 5463 }, { "epoch": 0.14390308138003688, "grad_norm": 1.7097405195236206, "learning_rate": 4.2812746905451675e-05, "loss": 1.8777, "step": 5464 }, { "epoch": 0.1439294179615486, "grad_norm": 2.161938428878784, "learning_rate": 4.281143007637609e-05, "loss": 1.9369, "step": 5465 }, { "epoch": 0.1439557545430603, "grad_norm": 2.1637978553771973, "learning_rate": 4.2810113247300506e-05, "loss": 2.1154, "step": 5466 }, { "epoch": 0.14398209112457203, "grad_norm": 2.689159393310547, "learning_rate": 4.2808796418224915e-05, "loss": 1.481, "step": 5467 }, { "epoch": 0.14400842770608374, "grad_norm": 2.8505122661590576, "learning_rate": 4.280747958914933e-05, "loss": 1.4586, "step": 5468 }, { "epoch": 0.14403476428759546, "grad_norm": 4.715952396392822, "learning_rate": 4.280616276007374e-05, "loss": 2.2118, "step": 5469 }, { "epoch": 0.1440611008691072, "grad_norm": 5.930228233337402, "learning_rate": 4.280484593099816e-05, "loss": 1.4922, "step": 5470 }, { "epoch": 0.14408743745061892, "grad_norm": 1.986402153968811, "learning_rate": 4.280352910192258e-05, "loss": 1.3131, "step": 5471 }, { "epoch": 0.14411377403213063, "grad_norm": 1.7451683282852173, 
"learning_rate": 4.2802212272846986e-05, "loss": 2.1062, "step": 5472 }, { "epoch": 0.14414011061364235, "grad_norm": 2.088634729385376, "learning_rate": 4.28008954437714e-05, "loss": 1.433, "step": 5473 }, { "epoch": 0.14416644719515406, "grad_norm": 2.082184076309204, "learning_rate": 4.279957861469581e-05, "loss": 1.7962, "step": 5474 }, { "epoch": 0.14419278377666578, "grad_norm": 1.7730066776275635, "learning_rate": 4.279826178562023e-05, "loss": 2.0036, "step": 5475 }, { "epoch": 0.14421912035817752, "grad_norm": 1.8608057498931885, "learning_rate": 4.279694495654464e-05, "loss": 2.4359, "step": 5476 }, { "epoch": 0.14424545693968924, "grad_norm": 2.328056812286377, "learning_rate": 4.279562812746906e-05, "loss": 1.7591, "step": 5477 }, { "epoch": 0.14427179352120095, "grad_norm": 1.8919919729232788, "learning_rate": 4.2794311298393466e-05, "loss": 1.5027, "step": 5478 }, { "epoch": 0.14429813010271267, "grad_norm": 1.827974796295166, "learning_rate": 4.279299446931788e-05, "loss": 1.661, "step": 5479 }, { "epoch": 0.14432446668422438, "grad_norm": 2.0404622554779053, "learning_rate": 4.27916776402423e-05, "loss": 0.7236, "step": 5480 }, { "epoch": 0.1443508032657361, "grad_norm": 1.8391458988189697, "learning_rate": 4.279036081116671e-05, "loss": 1.7807, "step": 5481 }, { "epoch": 0.14437713984724782, "grad_norm": 2.4015066623687744, "learning_rate": 4.278904398209113e-05, "loss": 1.6419, "step": 5482 }, { "epoch": 0.14440347642875956, "grad_norm": 1.997928261756897, "learning_rate": 4.278772715301554e-05, "loss": 1.6739, "step": 5483 }, { "epoch": 0.14442981301027127, "grad_norm": 2.2104389667510986, "learning_rate": 4.278641032393996e-05, "loss": 1.9055, "step": 5484 }, { "epoch": 0.144456149591783, "grad_norm": 2.20745849609375, "learning_rate": 4.278509349486437e-05, "loss": 1.9459, "step": 5485 }, { "epoch": 0.1444824861732947, "grad_norm": 2.5828356742858887, "learning_rate": 4.2783776665788784e-05, "loss": 2.2961, "step": 5486 }, { "epoch": 
0.14450882275480642, "grad_norm": 5.043327808380127, "learning_rate": 4.278245983671319e-05, "loss": 0.9504, "step": 5487 }, { "epoch": 0.14453515933631814, "grad_norm": 2.0591795444488525, "learning_rate": 4.278114300763761e-05, "loss": 1.4051, "step": 5488 }, { "epoch": 0.14456149591782985, "grad_norm": 1.72860586643219, "learning_rate": 4.2779826178562024e-05, "loss": 1.822, "step": 5489 }, { "epoch": 0.1445878324993416, "grad_norm": 2.47076153755188, "learning_rate": 4.277850934948644e-05, "loss": 1.6271, "step": 5490 }, { "epoch": 0.1446141690808533, "grad_norm": 3.8603389263153076, "learning_rate": 4.2777192520410855e-05, "loss": 2.1521, "step": 5491 }, { "epoch": 0.14464050566236503, "grad_norm": 4.7592267990112305, "learning_rate": 4.2775875691335264e-05, "loss": 1.4681, "step": 5492 }, { "epoch": 0.14466684224387674, "grad_norm": 1.618865966796875, "learning_rate": 4.277455886225968e-05, "loss": 1.3689, "step": 5493 }, { "epoch": 0.14469317882538846, "grad_norm": 1.8949971199035645, "learning_rate": 4.2773242033184095e-05, "loss": 1.7505, "step": 5494 }, { "epoch": 0.14471951540690017, "grad_norm": 1.722700834274292, "learning_rate": 4.277192520410851e-05, "loss": 1.4854, "step": 5495 }, { "epoch": 0.14474585198841192, "grad_norm": 2.876819610595703, "learning_rate": 4.277060837503292e-05, "loss": 2.2436, "step": 5496 }, { "epoch": 0.14477218856992363, "grad_norm": 2.3207411766052246, "learning_rate": 4.2769291545957335e-05, "loss": 1.6843, "step": 5497 }, { "epoch": 0.14479852515143535, "grad_norm": 4.414300918579102, "learning_rate": 4.276797471688175e-05, "loss": 1.5186, "step": 5498 }, { "epoch": 0.14482486173294706, "grad_norm": 1.7444515228271484, "learning_rate": 4.2766657887806166e-05, "loss": 1.8863, "step": 5499 }, { "epoch": 0.14485119831445878, "grad_norm": 1.6739468574523926, "learning_rate": 4.276534105873058e-05, "loss": 1.5695, "step": 5500 }, { "epoch": 0.1448775348959705, "grad_norm": 2.244536876678467, "learning_rate": 
4.276402422965499e-05, "loss": 1.7561, "step": 5501 }, { "epoch": 0.1449038714774822, "grad_norm": 4.810878753662109, "learning_rate": 4.2762707400579406e-05, "loss": 1.4633, "step": 5502 }, { "epoch": 0.14493020805899395, "grad_norm": 3.660832643508911, "learning_rate": 4.276139057150382e-05, "loss": 0.5941, "step": 5503 }, { "epoch": 0.14495654464050567, "grad_norm": 2.0839686393737793, "learning_rate": 4.276007374242824e-05, "loss": 1.9839, "step": 5504 }, { "epoch": 0.14498288122201738, "grad_norm": 1.932313323020935, "learning_rate": 4.2758756913352646e-05, "loss": 1.5547, "step": 5505 }, { "epoch": 0.1450092178035291, "grad_norm": 3.07942795753479, "learning_rate": 4.275744008427706e-05, "loss": 1.8634, "step": 5506 }, { "epoch": 0.14503555438504082, "grad_norm": 2.2211496829986572, "learning_rate": 4.275612325520148e-05, "loss": 1.9627, "step": 5507 }, { "epoch": 0.14506189096655253, "grad_norm": 1.9119904041290283, "learning_rate": 4.275480642612589e-05, "loss": 0.308, "step": 5508 }, { "epoch": 0.14508822754806427, "grad_norm": 3.349985122680664, "learning_rate": 4.275348959705031e-05, "loss": 1.6698, "step": 5509 }, { "epoch": 0.145114564129576, "grad_norm": 1.9620261192321777, "learning_rate": 4.275217276797472e-05, "loss": 1.6017, "step": 5510 }, { "epoch": 0.1451409007110877, "grad_norm": 1.760911464691162, "learning_rate": 4.275085593889913e-05, "loss": 2.1849, "step": 5511 }, { "epoch": 0.14516723729259942, "grad_norm": 3.4776453971862793, "learning_rate": 4.274953910982354e-05, "loss": 0.8801, "step": 5512 }, { "epoch": 0.14519357387411114, "grad_norm": 2.4966847896575928, "learning_rate": 4.2748222280747964e-05, "loss": 1.7783, "step": 5513 }, { "epoch": 0.14521991045562285, "grad_norm": 1.7693718671798706, "learning_rate": 4.274690545167237e-05, "loss": 1.3738, "step": 5514 }, { "epoch": 0.14524624703713457, "grad_norm": 1.6176201105117798, "learning_rate": 4.274558862259679e-05, "loss": 1.4028, "step": 5515 }, { "epoch": 0.1452725836186463, 
"grad_norm": 3.2332048416137695, "learning_rate": 4.2744271793521204e-05, "loss": 1.9422, "step": 5516 }, { "epoch": 0.14529892020015803, "grad_norm": 2.100635051727295, "learning_rate": 4.274295496444562e-05, "loss": 1.7591, "step": 5517 }, { "epoch": 0.14532525678166974, "grad_norm": 1.867606520652771, "learning_rate": 4.2741638135370036e-05, "loss": 1.8669, "step": 5518 }, { "epoch": 0.14535159336318146, "grad_norm": 2.3108527660369873, "learning_rate": 4.2740321306294444e-05, "loss": 1.6326, "step": 5519 }, { "epoch": 0.14537792994469317, "grad_norm": 2.2782132625579834, "learning_rate": 4.273900447721886e-05, "loss": 2.0259, "step": 5520 }, { "epoch": 0.1454042665262049, "grad_norm": 2.0683164596557617, "learning_rate": 4.273768764814327e-05, "loss": 1.8712, "step": 5521 }, { "epoch": 0.1454306031077166, "grad_norm": 2.270616292953491, "learning_rate": 4.273637081906769e-05, "loss": 1.8014, "step": 5522 }, { "epoch": 0.14545693968922835, "grad_norm": 1.6381725072860718, "learning_rate": 4.27350539899921e-05, "loss": 1.6721, "step": 5523 }, { "epoch": 0.14548327627074006, "grad_norm": 2.6140260696411133, "learning_rate": 4.2733737160916516e-05, "loss": 1.869, "step": 5524 }, { "epoch": 0.14550961285225178, "grad_norm": 1.9675624370574951, "learning_rate": 4.2732420331840924e-05, "loss": 1.6683, "step": 5525 }, { "epoch": 0.1455359494337635, "grad_norm": 1.9817615747451782, "learning_rate": 4.273110350276534e-05, "loss": 1.7122, "step": 5526 }, { "epoch": 0.1455622860152752, "grad_norm": 2.7572903633117676, "learning_rate": 4.2729786673689756e-05, "loss": 1.972, "step": 5527 }, { "epoch": 0.14558862259678693, "grad_norm": 2.3718225955963135, "learning_rate": 4.272846984461417e-05, "loss": 1.8248, "step": 5528 }, { "epoch": 0.14561495917829867, "grad_norm": 3.4232852458953857, "learning_rate": 4.272715301553859e-05, "loss": 0.6497, "step": 5529 }, { "epoch": 0.14564129575981039, "grad_norm": 3.068422555923462, "learning_rate": 4.2725836186462996e-05, "loss": 
1.3399, "step": 5530 }, { "epoch": 0.1456676323413221, "grad_norm": 2.2739012241363525, "learning_rate": 4.272451935738742e-05, "loss": 2.1571, "step": 5531 }, { "epoch": 0.14569396892283382, "grad_norm": 3.3147454261779785, "learning_rate": 4.272320252831183e-05, "loss": 1.0281, "step": 5532 }, { "epoch": 0.14572030550434553, "grad_norm": 2.3716354370117188, "learning_rate": 4.272188569923624e-05, "loss": 0.434, "step": 5533 }, { "epoch": 0.14574664208585725, "grad_norm": 2.1226859092712402, "learning_rate": 4.272056887016065e-05, "loss": 2.041, "step": 5534 }, { "epoch": 0.14577297866736896, "grad_norm": 2.0871102809906006, "learning_rate": 4.271925204108507e-05, "loss": 1.865, "step": 5535 }, { "epoch": 0.1457993152488807, "grad_norm": 1.696796178817749, "learning_rate": 4.271793521200948e-05, "loss": 1.5224, "step": 5536 }, { "epoch": 0.14582565183039242, "grad_norm": 1.9547193050384521, "learning_rate": 4.27166183829339e-05, "loss": 2.4481, "step": 5537 }, { "epoch": 0.14585198841190414, "grad_norm": 2.4103574752807617, "learning_rate": 4.2715301553858314e-05, "loss": 1.4425, "step": 5538 }, { "epoch": 0.14587832499341585, "grad_norm": 3.5615122318267822, "learning_rate": 4.271398472478272e-05, "loss": 1.5543, "step": 5539 }, { "epoch": 0.14590466157492757, "grad_norm": 1.5633794069290161, "learning_rate": 4.271266789570714e-05, "loss": 1.7641, "step": 5540 }, { "epoch": 0.14593099815643928, "grad_norm": 2.4548375606536865, "learning_rate": 4.2711351066631554e-05, "loss": 1.694, "step": 5541 }, { "epoch": 0.14595733473795103, "grad_norm": 2.1368513107299805, "learning_rate": 4.271003423755597e-05, "loss": 1.979, "step": 5542 }, { "epoch": 0.14598367131946274, "grad_norm": 1.925728678703308, "learning_rate": 4.270871740848038e-05, "loss": 1.9446, "step": 5543 }, { "epoch": 0.14601000790097446, "grad_norm": 1.9377928972244263, "learning_rate": 4.2707400579404794e-05, "loss": 1.7018, "step": 5544 }, { "epoch": 0.14603634448248617, "grad_norm": 3.370821475982666, 
"learning_rate": 4.270608375032921e-05, "loss": 1.4789, "step": 5545 }, { "epoch": 0.1460626810639979, "grad_norm": 4.439446449279785, "learning_rate": 4.2704766921253625e-05, "loss": 1.4931, "step": 5546 }, { "epoch": 0.1460890176455096, "grad_norm": 2.0447187423706055, "learning_rate": 4.270345009217804e-05, "loss": 2.1148, "step": 5547 }, { "epoch": 0.14611535422702132, "grad_norm": 1.9269649982452393, "learning_rate": 4.270213326310245e-05, "loss": 1.8261, "step": 5548 }, { "epoch": 0.14614169080853306, "grad_norm": 3.57525897026062, "learning_rate": 4.2700816434026865e-05, "loss": 1.8854, "step": 5549 }, { "epoch": 0.14616802739004478, "grad_norm": 1.9000951051712036, "learning_rate": 4.269949960495128e-05, "loss": 1.9626, "step": 5550 }, { "epoch": 0.1461943639715565, "grad_norm": 2.2235636711120605, "learning_rate": 4.2698182775875696e-05, "loss": 1.3551, "step": 5551 }, { "epoch": 0.1462207005530682, "grad_norm": 1.8338112831115723, "learning_rate": 4.2696865946800105e-05, "loss": 1.3472, "step": 5552 }, { "epoch": 0.14624703713457993, "grad_norm": 4.085555553436279, "learning_rate": 4.269554911772452e-05, "loss": 0.946, "step": 5553 }, { "epoch": 0.14627337371609164, "grad_norm": 2.6587250232696533, "learning_rate": 4.2694232288648936e-05, "loss": 1.5805, "step": 5554 }, { "epoch": 0.14629971029760336, "grad_norm": 2.6181790828704834, "learning_rate": 4.269291545957335e-05, "loss": 2.4685, "step": 5555 }, { "epoch": 0.1463260468791151, "grad_norm": 3.2557456493377686, "learning_rate": 4.269159863049777e-05, "loss": 0.7642, "step": 5556 }, { "epoch": 0.14635238346062682, "grad_norm": 2.2250311374664307, "learning_rate": 4.2690281801422176e-05, "loss": 1.9393, "step": 5557 }, { "epoch": 0.14637872004213853, "grad_norm": 2.177250623703003, "learning_rate": 4.268896497234659e-05, "loss": 1.6364, "step": 5558 }, { "epoch": 0.14640505662365025, "grad_norm": 2.7469115257263184, "learning_rate": 4.2687648143271e-05, "loss": 1.5623, "step": 5559 }, { "epoch": 
0.14643139320516196, "grad_norm": 2.6492576599121094, "learning_rate": 4.268633131419542e-05, "loss": 1.3835, "step": 5560 }, { "epoch": 0.14645772978667368, "grad_norm": 1.836521863937378, "learning_rate": 4.268501448511983e-05, "loss": 2.1792, "step": 5561 }, { "epoch": 0.14648406636818542, "grad_norm": 4.0711750984191895, "learning_rate": 4.268369765604425e-05, "loss": 2.0186, "step": 5562 }, { "epoch": 0.14651040294969714, "grad_norm": 2.392573356628418, "learning_rate": 4.268238082696866e-05, "loss": 1.6863, "step": 5563 }, { "epoch": 0.14653673953120885, "grad_norm": 1.8146653175354004, "learning_rate": 4.268106399789308e-05, "loss": 1.0686, "step": 5564 }, { "epoch": 0.14656307611272057, "grad_norm": 2.204972267150879, "learning_rate": 4.2679747168817494e-05, "loss": 1.8641, "step": 5565 }, { "epoch": 0.14658941269423229, "grad_norm": 1.7449761629104614, "learning_rate": 4.26784303397419e-05, "loss": 0.4732, "step": 5566 }, { "epoch": 0.146615749275744, "grad_norm": 2.22798228263855, "learning_rate": 4.267711351066632e-05, "loss": 1.9302, "step": 5567 }, { "epoch": 0.14664208585725572, "grad_norm": 2.165184259414673, "learning_rate": 4.267579668159073e-05, "loss": 1.7797, "step": 5568 }, { "epoch": 0.14666842243876746, "grad_norm": 1.9731875658035278, "learning_rate": 4.267447985251515e-05, "loss": 1.5318, "step": 5569 }, { "epoch": 0.14669475902027918, "grad_norm": 1.9895118474960327, "learning_rate": 4.267316302343956e-05, "loss": 0.4572, "step": 5570 }, { "epoch": 0.1467210956017909, "grad_norm": 2.1495420932769775, "learning_rate": 4.2671846194363974e-05, "loss": 1.476, "step": 5571 }, { "epoch": 0.1467474321833026, "grad_norm": 2.0674545764923096, "learning_rate": 4.267052936528838e-05, "loss": 1.4671, "step": 5572 }, { "epoch": 0.14677376876481432, "grad_norm": 3.436493158340454, "learning_rate": 4.26692125362128e-05, "loss": 0.996, "step": 5573 }, { "epoch": 0.14680010534632604, "grad_norm": 2.3036608695983887, "learning_rate": 4.2667895707137214e-05, 
"loss": 2.6386, "step": 5574 }, { "epoch": 0.14682644192783775, "grad_norm": 2.138401508331299, "learning_rate": 4.266657887806163e-05, "loss": 1.8376, "step": 5575 }, { "epoch": 0.1468527785093495, "grad_norm": 5.032169818878174, "learning_rate": 4.2665262048986045e-05, "loss": 0.9696, "step": 5576 }, { "epoch": 0.1468791150908612, "grad_norm": 3.0435707569122314, "learning_rate": 4.2663945219910454e-05, "loss": 1.9079, "step": 5577 }, { "epoch": 0.14690545167237293, "grad_norm": 1.8727219104766846, "learning_rate": 4.266262839083487e-05, "loss": 2.1838, "step": 5578 }, { "epoch": 0.14693178825388464, "grad_norm": 3.9931259155273438, "learning_rate": 4.2661311561759285e-05, "loss": 1.3218, "step": 5579 }, { "epoch": 0.14695812483539636, "grad_norm": 2.8434035778045654, "learning_rate": 4.26599947326837e-05, "loss": 1.4108, "step": 5580 }, { "epoch": 0.14698446141690807, "grad_norm": 2.304365634918213, "learning_rate": 4.265867790360811e-05, "loss": 2.3515, "step": 5581 }, { "epoch": 0.14701079799841982, "grad_norm": 2.368215560913086, "learning_rate": 4.2657361074532525e-05, "loss": 1.785, "step": 5582 }, { "epoch": 0.14703713457993153, "grad_norm": 2.0873093605041504, "learning_rate": 4.265604424545694e-05, "loss": 1.5827, "step": 5583 }, { "epoch": 0.14706347116144325, "grad_norm": 5.458306312561035, "learning_rate": 4.265472741638136e-05, "loss": 1.1177, "step": 5584 }, { "epoch": 0.14708980774295496, "grad_norm": 1.6125884056091309, "learning_rate": 4.265341058730577e-05, "loss": 1.7194, "step": 5585 }, { "epoch": 0.14711614432446668, "grad_norm": 2.083245277404785, "learning_rate": 4.265209375823018e-05, "loss": 1.6941, "step": 5586 }, { "epoch": 0.1471424809059784, "grad_norm": 2.5999622344970703, "learning_rate": 4.26507769291546e-05, "loss": 1.2682, "step": 5587 }, { "epoch": 0.1471688174874901, "grad_norm": 2.612147808074951, "learning_rate": 4.264946010007901e-05, "loss": 0.5691, "step": 5588 }, { "epoch": 0.14719515406900185, "grad_norm": 
1.8802284002304077, "learning_rate": 4.264814327100343e-05, "loss": 0.5906, "step": 5589 }, { "epoch": 0.14722149065051357, "grad_norm": 3.8250322341918945, "learning_rate": 4.264682644192784e-05, "loss": 0.6344, "step": 5590 }, { "epoch": 0.14724782723202529, "grad_norm": 2.7785391807556152, "learning_rate": 4.264550961285225e-05, "loss": 2.0483, "step": 5591 }, { "epoch": 0.147274163813537, "grad_norm": 1.826798915863037, "learning_rate": 4.264419278377667e-05, "loss": 1.8139, "step": 5592 }, { "epoch": 0.14730050039504872, "grad_norm": 3.454930543899536, "learning_rate": 4.2642875954701083e-05, "loss": 0.6545, "step": 5593 }, { "epoch": 0.14732683697656043, "grad_norm": 1.6765172481536865, "learning_rate": 4.26415591256255e-05, "loss": 2.0717, "step": 5594 }, { "epoch": 0.14735317355807218, "grad_norm": 2.713090419769287, "learning_rate": 4.264024229654991e-05, "loss": 2.0588, "step": 5595 }, { "epoch": 0.1473795101395839, "grad_norm": 2.3142645359039307, "learning_rate": 4.2638925467474324e-05, "loss": 1.4029, "step": 5596 }, { "epoch": 0.1474058467210956, "grad_norm": 2.2539315223693848, "learning_rate": 4.263760863839874e-05, "loss": 1.5575, "step": 5597 }, { "epoch": 0.14743218330260732, "grad_norm": 1.4994795322418213, "learning_rate": 4.2636291809323155e-05, "loss": 1.7899, "step": 5598 }, { "epoch": 0.14745851988411904, "grad_norm": 2.115285634994507, "learning_rate": 4.2634974980247564e-05, "loss": 1.8093, "step": 5599 }, { "epoch": 0.14748485646563075, "grad_norm": 1.8860194683074951, "learning_rate": 4.263365815117198e-05, "loss": 1.688, "step": 5600 }, { "epoch": 0.14751119304714247, "grad_norm": 3.8553032875061035, "learning_rate": 4.2632341322096395e-05, "loss": 1.9707, "step": 5601 }, { "epoch": 0.1475375296286542, "grad_norm": 2.2329020500183105, "learning_rate": 4.263102449302081e-05, "loss": 1.5304, "step": 5602 }, { "epoch": 0.14756386621016593, "grad_norm": 2.9448134899139404, "learning_rate": 4.2629707663945226e-05, "loss": 2.0549, "step": 
5603 }, { "epoch": 0.14759020279167764, "grad_norm": 3.7261550426483154, "learning_rate": 4.2628390834869635e-05, "loss": 1.5402, "step": 5604 }, { "epoch": 0.14761653937318936, "grad_norm": 3.063939332962036, "learning_rate": 4.262707400579405e-05, "loss": 0.8647, "step": 5605 }, { "epoch": 0.14764287595470107, "grad_norm": 1.8527138233184814, "learning_rate": 4.262575717671846e-05, "loss": 1.7482, "step": 5606 }, { "epoch": 0.1476692125362128, "grad_norm": 2.4305081367492676, "learning_rate": 4.262444034764288e-05, "loss": 1.7115, "step": 5607 }, { "epoch": 0.1476955491177245, "grad_norm": 2.2378013134002686, "learning_rate": 4.262312351856729e-05, "loss": 1.7482, "step": 5608 }, { "epoch": 0.14772188569923625, "grad_norm": 2.779639959335327, "learning_rate": 4.2621806689491706e-05, "loss": 1.1633, "step": 5609 }, { "epoch": 0.14774822228074797, "grad_norm": 1.7249113321304321, "learning_rate": 4.262048986041612e-05, "loss": 1.781, "step": 5610 }, { "epoch": 0.14777455886225968, "grad_norm": 2.5951218605041504, "learning_rate": 4.261917303134053e-05, "loss": 1.9406, "step": 5611 }, { "epoch": 0.1478008954437714, "grad_norm": 1.778315782546997, "learning_rate": 4.261785620226495e-05, "loss": 1.6417, "step": 5612 }, { "epoch": 0.1478272320252831, "grad_norm": 4.979842185974121, "learning_rate": 4.261653937318936e-05, "loss": 1.2926, "step": 5613 }, { "epoch": 0.14785356860679483, "grad_norm": 2.3503050804138184, "learning_rate": 4.261522254411378e-05, "loss": 1.7667, "step": 5614 }, { "epoch": 0.14787990518830657, "grad_norm": 1.7864164113998413, "learning_rate": 4.2613905715038186e-05, "loss": 1.6159, "step": 5615 }, { "epoch": 0.1479062417698183, "grad_norm": 1.709181547164917, "learning_rate": 4.261258888596261e-05, "loss": 1.8267, "step": 5616 }, { "epoch": 0.14793257835133, "grad_norm": 3.542180299758911, "learning_rate": 4.261127205688702e-05, "loss": 0.6127, "step": 5617 }, { "epoch": 0.14795891493284172, "grad_norm": 3.4691050052642822, "learning_rate": 
4.260995522781143e-05, "loss": 0.847, "step": 5618 }, { "epoch": 0.14798525151435343, "grad_norm": 1.9438574314117432, "learning_rate": 4.260863839873585e-05, "loss": 1.8687, "step": 5619 }, { "epoch": 0.14801158809586515, "grad_norm": 2.5891008377075195, "learning_rate": 4.260732156966026e-05, "loss": 0.6519, "step": 5620 }, { "epoch": 0.14803792467737686, "grad_norm": 4.490084171295166, "learning_rate": 4.260600474058468e-05, "loss": 0.9922, "step": 5621 }, { "epoch": 0.1480642612588886, "grad_norm": 2.307469129562378, "learning_rate": 4.260468791150909e-05, "loss": 1.4318, "step": 5622 }, { "epoch": 0.14809059784040032, "grad_norm": 1.982391595840454, "learning_rate": 4.2603371082433504e-05, "loss": 2.3119, "step": 5623 }, { "epoch": 0.14811693442191204, "grad_norm": 2.176826238632202, "learning_rate": 4.260205425335791e-05, "loss": 1.662, "step": 5624 }, { "epoch": 0.14814327100342375, "grad_norm": 1.7105432748794556, "learning_rate": 4.260073742428233e-05, "loss": 2.2901, "step": 5625 }, { "epoch": 0.14816960758493547, "grad_norm": 2.202418804168701, "learning_rate": 4.2599420595206744e-05, "loss": 1.379, "step": 5626 }, { "epoch": 0.14819594416644719, "grad_norm": 1.9317352771759033, "learning_rate": 4.259810376613116e-05, "loss": 1.8742, "step": 5627 }, { "epoch": 0.1482222807479589, "grad_norm": 3.7710983753204346, "learning_rate": 4.259678693705557e-05, "loss": 1.067, "step": 5628 }, { "epoch": 0.14824861732947064, "grad_norm": 1.6439965963363647, "learning_rate": 4.2595470107979984e-05, "loss": 1.6551, "step": 5629 }, { "epoch": 0.14827495391098236, "grad_norm": 1.9266676902770996, "learning_rate": 4.25941532789044e-05, "loss": 1.6155, "step": 5630 }, { "epoch": 0.14830129049249408, "grad_norm": 3.8426513671875, "learning_rate": 4.2592836449828815e-05, "loss": 1.3305, "step": 5631 }, { "epoch": 0.1483276270740058, "grad_norm": 3.590303421020508, "learning_rate": 4.259151962075323e-05, "loss": 1.5859, "step": 5632 }, { "epoch": 0.1483539636555175, 
"grad_norm": 1.9086166620254517, "learning_rate": 4.259020279167764e-05, "loss": 2.2934, "step": 5633 }, { "epoch": 0.14838030023702922, "grad_norm": 4.422410011291504, "learning_rate": 4.2588885962602055e-05, "loss": 0.9766, "step": 5634 }, { "epoch": 0.14840663681854097, "grad_norm": 2.8753767013549805, "learning_rate": 4.258756913352647e-05, "loss": 2.1532, "step": 5635 }, { "epoch": 0.14843297340005268, "grad_norm": 3.993929386138916, "learning_rate": 4.2586252304450886e-05, "loss": 1.3343, "step": 5636 }, { "epoch": 0.1484593099815644, "grad_norm": 3.0543391704559326, "learning_rate": 4.2584935475375295e-05, "loss": 1.6136, "step": 5637 }, { "epoch": 0.1484856465630761, "grad_norm": 3.9620182514190674, "learning_rate": 4.258361864629971e-05, "loss": 2.2358, "step": 5638 }, { "epoch": 0.14851198314458783, "grad_norm": 3.4023518562316895, "learning_rate": 4.2582301817224126e-05, "loss": 2.3857, "step": 5639 }, { "epoch": 0.14853831972609954, "grad_norm": 1.8487714529037476, "learning_rate": 4.258098498814854e-05, "loss": 1.9589, "step": 5640 }, { "epoch": 0.14856465630761126, "grad_norm": 2.166978120803833, "learning_rate": 4.257966815907296e-05, "loss": 1.8964, "step": 5641 }, { "epoch": 0.148590992889123, "grad_norm": 1.955157995223999, "learning_rate": 4.2578351329997366e-05, "loss": 2.2347, "step": 5642 }, { "epoch": 0.14861732947063472, "grad_norm": 2.189171314239502, "learning_rate": 4.257703450092178e-05, "loss": 1.1701, "step": 5643 }, { "epoch": 0.14864366605214643, "grad_norm": 3.317697048187256, "learning_rate": 4.257571767184619e-05, "loss": 1.9047, "step": 5644 }, { "epoch": 0.14867000263365815, "grad_norm": 5.230594158172607, "learning_rate": 4.257440084277061e-05, "loss": 1.1145, "step": 5645 }, { "epoch": 0.14869633921516986, "grad_norm": 2.1864380836486816, "learning_rate": 4.257308401369502e-05, "loss": 1.6484, "step": 5646 }, { "epoch": 0.14872267579668158, "grad_norm": 2.72421932220459, "learning_rate": 4.257176718461944e-05, "loss": 2.4499, 
"step": 5647 }, { "epoch": 0.14874901237819332, "grad_norm": 1.924412727355957, "learning_rate": 4.257045035554385e-05, "loss": 1.6235, "step": 5648 }, { "epoch": 0.14877534895970504, "grad_norm": 2.7526772022247314, "learning_rate": 4.256913352646827e-05, "loss": 0.8698, "step": 5649 }, { "epoch": 0.14880168554121675, "grad_norm": 2.680344581604004, "learning_rate": 4.2567816697392684e-05, "loss": 1.6177, "step": 5650 }, { "epoch": 0.14882802212272847, "grad_norm": 2.1998343467712402, "learning_rate": 4.256649986831709e-05, "loss": 1.1037, "step": 5651 }, { "epoch": 0.14885435870424019, "grad_norm": 4.720434188842773, "learning_rate": 4.256518303924151e-05, "loss": 1.4442, "step": 5652 }, { "epoch": 0.1488806952857519, "grad_norm": 2.881120204925537, "learning_rate": 4.256386621016592e-05, "loss": 1.6693, "step": 5653 }, { "epoch": 0.14890703186726362, "grad_norm": 2.2526192665100098, "learning_rate": 4.256254938109034e-05, "loss": 0.6066, "step": 5654 }, { "epoch": 0.14893336844877536, "grad_norm": 2.795752763748169, "learning_rate": 4.256123255201475e-05, "loss": 1.5708, "step": 5655 }, { "epoch": 0.14895970503028708, "grad_norm": 2.176715135574341, "learning_rate": 4.2559915722939165e-05, "loss": 1.5439, "step": 5656 }, { "epoch": 0.1489860416117988, "grad_norm": 3.603760004043579, "learning_rate": 4.255859889386358e-05, "loss": 1.6982, "step": 5657 }, { "epoch": 0.1490123781933105, "grad_norm": 3.7984838485717773, "learning_rate": 4.255728206478799e-05, "loss": 0.2778, "step": 5658 }, { "epoch": 0.14903871477482222, "grad_norm": 1.7825828790664673, "learning_rate": 4.255596523571241e-05, "loss": 1.7973, "step": 5659 }, { "epoch": 0.14906505135633394, "grad_norm": 2.003811836242676, "learning_rate": 4.255464840663682e-05, "loss": 1.6944, "step": 5660 }, { "epoch": 0.14909138793784565, "grad_norm": 2.4772143363952637, "learning_rate": 4.2553331577561236e-05, "loss": 1.3405, "step": 5661 }, { "epoch": 0.1491177245193574, "grad_norm": 2.669477939605713, 
"learning_rate": 4.2552014748485645e-05, "loss": 2.1803, "step": 5662 }, { "epoch": 0.1491440611008691, "grad_norm": 2.170348644256592, "learning_rate": 4.255069791941007e-05, "loss": 1.5437, "step": 5663 }, { "epoch": 0.14917039768238083, "grad_norm": 2.1788415908813477, "learning_rate": 4.2549381090334476e-05, "loss": 1.6009, "step": 5664 }, { "epoch": 0.14919673426389254, "grad_norm": 2.217836618423462, "learning_rate": 4.254806426125889e-05, "loss": 1.526, "step": 5665 }, { "epoch": 0.14922307084540426, "grad_norm": 4.839338302612305, "learning_rate": 4.254674743218331e-05, "loss": 1.0264, "step": 5666 }, { "epoch": 0.14924940742691598, "grad_norm": 3.5093142986297607, "learning_rate": 4.2545430603107716e-05, "loss": 2.0052, "step": 5667 }, { "epoch": 0.14927574400842772, "grad_norm": 2.4167253971099854, "learning_rate": 4.254411377403214e-05, "loss": 1.2477, "step": 5668 }, { "epoch": 0.14930208058993943, "grad_norm": 2.1677913665771484, "learning_rate": 4.254279694495655e-05, "loss": 1.8527, "step": 5669 }, { "epoch": 0.14932841717145115, "grad_norm": 4.827413558959961, "learning_rate": 4.254148011588096e-05, "loss": 0.8257, "step": 5670 }, { "epoch": 0.14935475375296287, "grad_norm": 2.6457746028900146, "learning_rate": 4.254016328680537e-05, "loss": 1.809, "step": 5671 }, { "epoch": 0.14938109033447458, "grad_norm": 1.7769595384597778, "learning_rate": 4.253884645772979e-05, "loss": 2.2455, "step": 5672 }, { "epoch": 0.1494074269159863, "grad_norm": 3.718106746673584, "learning_rate": 4.25375296286542e-05, "loss": 1.3036, "step": 5673 }, { "epoch": 0.149433763497498, "grad_norm": 2.424023151397705, "learning_rate": 4.253621279957862e-05, "loss": 1.1773, "step": 5674 }, { "epoch": 0.14946010007900976, "grad_norm": 2.3052265644073486, "learning_rate": 4.253489597050303e-05, "loss": 1.8745, "step": 5675 }, { "epoch": 0.14948643666052147, "grad_norm": 2.0570812225341797, "learning_rate": 4.253357914142744e-05, "loss": 1.6851, "step": 5676 }, { "epoch": 
0.1495127732420332, "grad_norm": 4.507837295532227, "learning_rate": 4.253226231235186e-05, "loss": 1.3516, "step": 5677 }, { "epoch": 0.1495391098235449, "grad_norm": 2.8513917922973633, "learning_rate": 4.2530945483276274e-05, "loss": 1.4759, "step": 5678 }, { "epoch": 0.14956544640505662, "grad_norm": 2.387956380844116, "learning_rate": 4.252962865420069e-05, "loss": 1.612, "step": 5679 }, { "epoch": 0.14959178298656833, "grad_norm": 2.281764030456543, "learning_rate": 4.25283118251251e-05, "loss": 2.5507, "step": 5680 }, { "epoch": 0.14961811956808008, "grad_norm": 1.7326940298080444, "learning_rate": 4.2526994996049514e-05, "loss": 1.345, "step": 5681 }, { "epoch": 0.1496444561495918, "grad_norm": 2.286283016204834, "learning_rate": 4.252567816697393e-05, "loss": 2.2979, "step": 5682 }, { "epoch": 0.1496707927311035, "grad_norm": 1.9607174396514893, "learning_rate": 4.2524361337898345e-05, "loss": 0.7526, "step": 5683 }, { "epoch": 0.14969712931261522, "grad_norm": 2.015150785446167, "learning_rate": 4.2523044508822754e-05, "loss": 1.5515, "step": 5684 }, { "epoch": 0.14972346589412694, "grad_norm": 4.143985271453857, "learning_rate": 4.252172767974717e-05, "loss": 1.5143, "step": 5685 }, { "epoch": 0.14974980247563865, "grad_norm": 1.7674788236618042, "learning_rate": 4.2520410850671585e-05, "loss": 1.7692, "step": 5686 }, { "epoch": 0.14977613905715037, "grad_norm": 2.926802158355713, "learning_rate": 4.2519094021596e-05, "loss": 2.151, "step": 5687 }, { "epoch": 0.1498024756386621, "grad_norm": 1.8197352886199951, "learning_rate": 4.2517777192520416e-05, "loss": 1.7826, "step": 5688 }, { "epoch": 0.14982881222017383, "grad_norm": 2.624211072921753, "learning_rate": 4.2516460363444825e-05, "loss": 1.301, "step": 5689 }, { "epoch": 0.14985514880168554, "grad_norm": 1.795958399772644, "learning_rate": 4.251514353436924e-05, "loss": 2.1483, "step": 5690 }, { "epoch": 0.14988148538319726, "grad_norm": 2.796091079711914, "learning_rate": 4.251382670529365e-05, 
"loss": 2.1952, "step": 5691 }, { "epoch": 0.14990782196470898, "grad_norm": 1.5790303945541382, "learning_rate": 4.251250987621807e-05, "loss": 1.8803, "step": 5692 }, { "epoch": 0.1499341585462207, "grad_norm": 3.784172534942627, "learning_rate": 4.251119304714248e-05, "loss": 1.1298, "step": 5693 }, { "epoch": 0.1499604951277324, "grad_norm": 1.9059382677078247, "learning_rate": 4.2509876218066896e-05, "loss": 2.4398, "step": 5694 }, { "epoch": 0.14998683170924415, "grad_norm": 1.3986990451812744, "learning_rate": 4.250855938899131e-05, "loss": 0.3382, "step": 5695 }, { "epoch": 0.15001316829075587, "grad_norm": 3.4716920852661133, "learning_rate": 4.250724255991573e-05, "loss": 2.3116, "step": 5696 }, { "epoch": 0.15003950487226758, "grad_norm": 2.536043643951416, "learning_rate": 4.250592573084014e-05, "loss": 1.5416, "step": 5697 }, { "epoch": 0.1500658414537793, "grad_norm": 2.361250162124634, "learning_rate": 4.250460890176455e-05, "loss": 1.4363, "step": 5698 }, { "epoch": 0.150092178035291, "grad_norm": 2.0393974781036377, "learning_rate": 4.250329207268897e-05, "loss": 1.24, "step": 5699 }, { "epoch": 0.15011851461680273, "grad_norm": 1.827251672744751, "learning_rate": 4.2501975243613376e-05, "loss": 1.4273, "step": 5700 }, { "epoch": 0.15014485119831447, "grad_norm": 3.612252950668335, "learning_rate": 4.25006584145378e-05, "loss": 0.5793, "step": 5701 }, { "epoch": 0.1501711877798262, "grad_norm": 4.103225231170654, "learning_rate": 4.249934158546221e-05, "loss": 1.0051, "step": 5702 }, { "epoch": 0.1501975243613379, "grad_norm": 3.4101593494415283, "learning_rate": 4.249802475638662e-05, "loss": 1.4616, "step": 5703 }, { "epoch": 0.15022386094284962, "grad_norm": 2.2099180221557617, "learning_rate": 4.249670792731104e-05, "loss": 0.7844, "step": 5704 }, { "epoch": 0.15025019752436133, "grad_norm": 2.0153872966766357, "learning_rate": 4.249539109823545e-05, "loss": 1.3207, "step": 5705 }, { "epoch": 0.15027653410587305, "grad_norm": 2.031503677368164, 
"learning_rate": 4.249407426915987e-05, "loss": 1.8856, "step": 5706 }, { "epoch": 0.15030287068738477, "grad_norm": 3.351882219314575, "learning_rate": 4.249275744008428e-05, "loss": 2.2434, "step": 5707 }, { "epoch": 0.1503292072688965, "grad_norm": 5.673913955688477, "learning_rate": 4.2491440611008694e-05, "loss": 1.0375, "step": 5708 }, { "epoch": 0.15035554385040822, "grad_norm": 4.268165588378906, "learning_rate": 4.24901237819331e-05, "loss": 2.4776, "step": 5709 }, { "epoch": 0.15038188043191994, "grad_norm": 2.6325466632843018, "learning_rate": 4.2488806952857525e-05, "loss": 1.8972, "step": 5710 }, { "epoch": 0.15040821701343166, "grad_norm": 1.6593612432479858, "learning_rate": 4.2487490123781934e-05, "loss": 1.4751, "step": 5711 }, { "epoch": 0.15043455359494337, "grad_norm": 2.330878973007202, "learning_rate": 4.248617329470635e-05, "loss": 2.0391, "step": 5712 }, { "epoch": 0.1504608901764551, "grad_norm": 1.9831138849258423, "learning_rate": 4.2484856465630765e-05, "loss": 2.4672, "step": 5713 }, { "epoch": 0.1504872267579668, "grad_norm": 3.397312641143799, "learning_rate": 4.2483539636555174e-05, "loss": 0.7698, "step": 5714 }, { "epoch": 0.15051356333947855, "grad_norm": 1.577579379081726, "learning_rate": 4.24822228074796e-05, "loss": 1.7782, "step": 5715 }, { "epoch": 0.15053989992099026, "grad_norm": 1.7842483520507812, "learning_rate": 4.2480905978404006e-05, "loss": 1.9677, "step": 5716 }, { "epoch": 0.15056623650250198, "grad_norm": 3.7436392307281494, "learning_rate": 4.247958914932842e-05, "loss": 2.0013, "step": 5717 }, { "epoch": 0.1505925730840137, "grad_norm": 2.3510167598724365, "learning_rate": 4.247827232025283e-05, "loss": 1.6836, "step": 5718 }, { "epoch": 0.1506189096655254, "grad_norm": 3.6194794178009033, "learning_rate": 4.2476955491177246e-05, "loss": 0.7397, "step": 5719 }, { "epoch": 0.15064524624703712, "grad_norm": 1.8981231451034546, "learning_rate": 4.247563866210166e-05, "loss": 1.7762, "step": 5720 }, { "epoch": 
0.15067158282854887, "grad_norm": 2.065647840499878, "learning_rate": 4.247432183302608e-05, "loss": 1.665, "step": 5721 }, { "epoch": 0.15069791941006058, "grad_norm": 2.4365437030792236, "learning_rate": 4.247300500395049e-05, "loss": 2.4467, "step": 5722 }, { "epoch": 0.1507242559915723, "grad_norm": 2.105393409729004, "learning_rate": 4.24716881748749e-05, "loss": 2.0182, "step": 5723 }, { "epoch": 0.150750592573084, "grad_norm": 2.433941125869751, "learning_rate": 4.247037134579932e-05, "loss": 1.8597, "step": 5724 }, { "epoch": 0.15077692915459573, "grad_norm": 2.0535686016082764, "learning_rate": 4.246905451672373e-05, "loss": 2.237, "step": 5725 }, { "epoch": 0.15080326573610744, "grad_norm": 3.9928605556488037, "learning_rate": 4.246773768764815e-05, "loss": 1.4777, "step": 5726 }, { "epoch": 0.15082960231761916, "grad_norm": 3.1019158363342285, "learning_rate": 4.246642085857256e-05, "loss": 1.9152, "step": 5727 }, { "epoch": 0.1508559388991309, "grad_norm": 1.9945987462997437, "learning_rate": 4.246510402949697e-05, "loss": 2.0096, "step": 5728 }, { "epoch": 0.15088227548064262, "grad_norm": 2.5533556938171387, "learning_rate": 4.246378720042139e-05, "loss": 2.1651, "step": 5729 }, { "epoch": 0.15090861206215433, "grad_norm": 5.138617515563965, "learning_rate": 4.2462470371345804e-05, "loss": 1.3718, "step": 5730 }, { "epoch": 0.15093494864366605, "grad_norm": 4.645598888397217, "learning_rate": 4.246115354227021e-05, "loss": 1.9161, "step": 5731 }, { "epoch": 0.15096128522517777, "grad_norm": 1.916338562965393, "learning_rate": 4.245983671319463e-05, "loss": 1.6664, "step": 5732 }, { "epoch": 0.15098762180668948, "grad_norm": 3.737555742263794, "learning_rate": 4.2458519884119044e-05, "loss": 0.9509, "step": 5733 }, { "epoch": 0.15101395838820122, "grad_norm": 3.6648004055023193, "learning_rate": 4.245720305504346e-05, "loss": 1.4428, "step": 5734 }, { "epoch": 0.15104029496971294, "grad_norm": 2.296168327331543, "learning_rate": 4.2455886225967875e-05, 
"loss": 1.603, "step": 5735 }, { "epoch": 0.15106663155122466, "grad_norm": 2.2506659030914307, "learning_rate": 4.2454569396892284e-05, "loss": 1.9465, "step": 5736 }, { "epoch": 0.15109296813273637, "grad_norm": 2.5359582901000977, "learning_rate": 4.24532525678167e-05, "loss": 2.2259, "step": 5737 }, { "epoch": 0.1511193047142481, "grad_norm": 3.7990384101867676, "learning_rate": 4.245193573874111e-05, "loss": 1.1931, "step": 5738 }, { "epoch": 0.1511456412957598, "grad_norm": 2.727841854095459, "learning_rate": 4.245061890966553e-05, "loss": 1.8222, "step": 5739 }, { "epoch": 0.15117197787727152, "grad_norm": 1.7299180030822754, "learning_rate": 4.244930208058994e-05, "loss": 1.5055, "step": 5740 }, { "epoch": 0.15119831445878326, "grad_norm": 5.285443305969238, "learning_rate": 4.2447985251514355e-05, "loss": 1.5848, "step": 5741 }, { "epoch": 0.15122465104029498, "grad_norm": 1.8614643812179565, "learning_rate": 4.244666842243877e-05, "loss": 1.8033, "step": 5742 }, { "epoch": 0.1512509876218067, "grad_norm": 2.1428117752075195, "learning_rate": 4.2445351593363186e-05, "loss": 2.1194, "step": 5743 }, { "epoch": 0.1512773242033184, "grad_norm": 5.770636081695557, "learning_rate": 4.24440347642876e-05, "loss": 1.7573, "step": 5744 }, { "epoch": 0.15130366078483012, "grad_norm": 3.3526344299316406, "learning_rate": 4.244271793521201e-05, "loss": 1.0669, "step": 5745 }, { "epoch": 0.15132999736634184, "grad_norm": 1.7971385717391968, "learning_rate": 4.2441401106136426e-05, "loss": 2.1406, "step": 5746 }, { "epoch": 0.15135633394785356, "grad_norm": 1.5810035467147827, "learning_rate": 4.2440084277060835e-05, "loss": 1.6191, "step": 5747 }, { "epoch": 0.1513826705293653, "grad_norm": 1.8192449808120728, "learning_rate": 4.243876744798526e-05, "loss": 1.4455, "step": 5748 }, { "epoch": 0.15140900711087701, "grad_norm": 2.4061219692230225, "learning_rate": 4.2437450618909666e-05, "loss": 2.3767, "step": 5749 }, { "epoch": 0.15143534369238873, "grad_norm": 
2.6958067417144775, "learning_rate": 4.243613378983408e-05, "loss": 1.9432, "step": 5750 }, { "epoch": 0.15146168027390045, "grad_norm": 2.1837010383605957, "learning_rate": 4.24348169607585e-05, "loss": 0.4489, "step": 5751 }, { "epoch": 0.15148801685541216, "grad_norm": 1.946349859237671, "learning_rate": 4.2433500131682906e-05, "loss": 2.2124, "step": 5752 }, { "epoch": 0.15151435343692388, "grad_norm": 2.974581003189087, "learning_rate": 4.243218330260733e-05, "loss": 1.3092, "step": 5753 }, { "epoch": 0.15154069001843562, "grad_norm": 3.3074655532836914, "learning_rate": 4.243086647353174e-05, "loss": 1.5098, "step": 5754 }, { "epoch": 0.15156702659994734, "grad_norm": 2.089951276779175, "learning_rate": 4.242954964445615e-05, "loss": 1.58, "step": 5755 }, { "epoch": 0.15159336318145905, "grad_norm": 2.130981683731079, "learning_rate": 4.242823281538056e-05, "loss": 2.154, "step": 5756 }, { "epoch": 0.15161969976297077, "grad_norm": 2.550985097885132, "learning_rate": 4.242691598630498e-05, "loss": 1.6586, "step": 5757 }, { "epoch": 0.15164603634448248, "grad_norm": 2.7756130695343018, "learning_rate": 4.242559915722939e-05, "loss": 1.251, "step": 5758 }, { "epoch": 0.1516723729259942, "grad_norm": 3.435023784637451, "learning_rate": 4.242428232815381e-05, "loss": 1.1601, "step": 5759 }, { "epoch": 0.1516987095075059, "grad_norm": 2.8068125247955322, "learning_rate": 4.2422965499078224e-05, "loss": 1.3543, "step": 5760 }, { "epoch": 0.15172504608901766, "grad_norm": 3.4712586402893066, "learning_rate": 4.242164867000263e-05, "loss": 1.4659, "step": 5761 }, { "epoch": 0.15175138267052937, "grad_norm": 1.9302070140838623, "learning_rate": 4.2420331840927055e-05, "loss": 2.3592, "step": 5762 }, { "epoch": 0.1517777192520411, "grad_norm": 3.338070869445801, "learning_rate": 4.2419015011851464e-05, "loss": 1.0453, "step": 5763 }, { "epoch": 0.1518040558335528, "grad_norm": 4.471688270568848, "learning_rate": 4.241769818277588e-05, "loss": 1.6777, "step": 5764 }, { 
"epoch": 0.15183039241506452, "grad_norm": 3.090397357940674, "learning_rate": 4.241638135370029e-05, "loss": 0.4641, "step": 5765 }, { "epoch": 0.15185672899657623, "grad_norm": 2.181962013244629, "learning_rate": 4.2415064524624704e-05, "loss": 1.0882, "step": 5766 }, { "epoch": 0.15188306557808798, "grad_norm": 2.0378878116607666, "learning_rate": 4.241374769554912e-05, "loss": 0.949, "step": 5767 }, { "epoch": 0.1519094021595997, "grad_norm": 3.855985641479492, "learning_rate": 4.2412430866473535e-05, "loss": 1.6958, "step": 5768 }, { "epoch": 0.1519357387411114, "grad_norm": 2.005575180053711, "learning_rate": 4.241111403739795e-05, "loss": 1.6069, "step": 5769 }, { "epoch": 0.15196207532262312, "grad_norm": 1.6121034622192383, "learning_rate": 4.240979720832236e-05, "loss": 1.6984, "step": 5770 }, { "epoch": 0.15198841190413484, "grad_norm": 1.7583940029144287, "learning_rate": 4.2408480379246775e-05, "loss": 1.4442, "step": 5771 }, { "epoch": 0.15201474848564656, "grad_norm": 1.902397632598877, "learning_rate": 4.240716355017119e-05, "loss": 1.9341, "step": 5772 }, { "epoch": 0.15204108506715827, "grad_norm": 2.898103713989258, "learning_rate": 4.2405846721095606e-05, "loss": 1.8768, "step": 5773 }, { "epoch": 0.15206742164867001, "grad_norm": 1.9971503019332886, "learning_rate": 4.2404529892020015e-05, "loss": 1.7612, "step": 5774 }, { "epoch": 0.15209375823018173, "grad_norm": 4.194947242736816, "learning_rate": 4.240321306294443e-05, "loss": 1.7472, "step": 5775 }, { "epoch": 0.15212009481169345, "grad_norm": 3.3143115043640137, "learning_rate": 4.2401896233868846e-05, "loss": 0.9654, "step": 5776 }, { "epoch": 0.15214643139320516, "grad_norm": 1.7036153078079224, "learning_rate": 4.240057940479326e-05, "loss": 2.0887, "step": 5777 }, { "epoch": 0.15217276797471688, "grad_norm": 2.3918776512145996, "learning_rate": 4.239926257571767e-05, "loss": 1.7401, "step": 5778 }, { "epoch": 0.1521991045562286, "grad_norm": 2.058460235595703, "learning_rate": 
4.2397945746642087e-05, "loss": 1.7126, "step": 5779 }, { "epoch": 0.1522254411377403, "grad_norm": 2.071458578109741, "learning_rate": 4.23966289175665e-05, "loss": 1.5062, "step": 5780 }, { "epoch": 0.15225177771925205, "grad_norm": 2.674434185028076, "learning_rate": 4.239531208849092e-05, "loss": 1.7909, "step": 5781 }, { "epoch": 0.15227811430076377, "grad_norm": 5.508162975311279, "learning_rate": 4.239399525941533e-05, "loss": 1.5076, "step": 5782 }, { "epoch": 0.15230445088227548, "grad_norm": 2.6319167613983154, "learning_rate": 4.239267843033974e-05, "loss": 2.1987, "step": 5783 }, { "epoch": 0.1523307874637872, "grad_norm": 1.5744264125823975, "learning_rate": 4.239136160126416e-05, "loss": 1.3576, "step": 5784 }, { "epoch": 0.15235712404529891, "grad_norm": 5.3464508056640625, "learning_rate": 4.2390044772188567e-05, "loss": 1.8021, "step": 5785 }, { "epoch": 0.15238346062681063, "grad_norm": 2.5581536293029785, "learning_rate": 4.238872794311299e-05, "loss": 2.1716, "step": 5786 }, { "epoch": 0.15240979720832237, "grad_norm": 6.438775539398193, "learning_rate": 4.23874111140374e-05, "loss": 1.9232, "step": 5787 }, { "epoch": 0.1524361337898341, "grad_norm": 3.27165150642395, "learning_rate": 4.238609428496181e-05, "loss": 1.9477, "step": 5788 }, { "epoch": 0.1524624703713458, "grad_norm": 2.4896976947784424, "learning_rate": 4.238477745588623e-05, "loss": 0.6221, "step": 5789 }, { "epoch": 0.15248880695285752, "grad_norm": 4.103713512420654, "learning_rate": 4.238346062681064e-05, "loss": 0.4915, "step": 5790 }, { "epoch": 0.15251514353436924, "grad_norm": 2.127819538116455, "learning_rate": 4.238214379773506e-05, "loss": 1.8545, "step": 5791 }, { "epoch": 0.15254148011588095, "grad_norm": 2.1432206630706787, "learning_rate": 4.238082696865947e-05, "loss": 0.5304, "step": 5792 }, { "epoch": 0.15256781669739267, "grad_norm": 2.7395143508911133, "learning_rate": 4.2379510139583885e-05, "loss": 0.702, "step": 5793 }, { "epoch": 0.1525941532789044, 
"grad_norm": 1.7573314905166626, "learning_rate": 4.237819331050829e-05, "loss": 1.4624, "step": 5794 }, { "epoch": 0.15262048986041613, "grad_norm": 1.7015575170516968, "learning_rate": 4.2376876481432716e-05, "loss": 1.2979, "step": 5795 }, { "epoch": 0.15264682644192784, "grad_norm": 4.185626983642578, "learning_rate": 4.2375559652357125e-05, "loss": 2.7003, "step": 5796 }, { "epoch": 0.15267316302343956, "grad_norm": 1.7594037055969238, "learning_rate": 4.237424282328154e-05, "loss": 1.8664, "step": 5797 }, { "epoch": 0.15269949960495127, "grad_norm": 2.6182191371917725, "learning_rate": 4.2372925994205956e-05, "loss": 1.7097, "step": 5798 }, { "epoch": 0.152725836186463, "grad_norm": 3.1217453479766846, "learning_rate": 4.2371609165130365e-05, "loss": 0.5211, "step": 5799 }, { "epoch": 0.1527521727679747, "grad_norm": 1.9852346181869507, "learning_rate": 4.237029233605479e-05, "loss": 1.6807, "step": 5800 }, { "epoch": 0.15277850934948645, "grad_norm": 1.7992138862609863, "learning_rate": 4.2368975506979196e-05, "loss": 1.5188, "step": 5801 }, { "epoch": 0.15280484593099816, "grad_norm": 3.129620313644409, "learning_rate": 4.236765867790361e-05, "loss": 1.5125, "step": 5802 }, { "epoch": 0.15283118251250988, "grad_norm": 2.4605212211608887, "learning_rate": 4.236634184882802e-05, "loss": 1.9487, "step": 5803 }, { "epoch": 0.1528575190940216, "grad_norm": 2.1186375617980957, "learning_rate": 4.2365025019752436e-05, "loss": 1.4671, "step": 5804 }, { "epoch": 0.1528838556755333, "grad_norm": 2.487518072128296, "learning_rate": 4.236370819067685e-05, "loss": 1.4194, "step": 5805 }, { "epoch": 0.15291019225704502, "grad_norm": 2.1010618209838867, "learning_rate": 4.236239136160127e-05, "loss": 2.1205, "step": 5806 }, { "epoch": 0.15293652883855677, "grad_norm": 3.364722728729248, "learning_rate": 4.236107453252568e-05, "loss": 1.8298, "step": 5807 }, { "epoch": 0.15296286542006848, "grad_norm": 1.6985399723052979, "learning_rate": 4.235975770345009e-05, "loss": 
1.9882, "step": 5808 }, { "epoch": 0.1529892020015802, "grad_norm": 5.559665203094482, "learning_rate": 4.2358440874374514e-05, "loss": 0.9976, "step": 5809 }, { "epoch": 0.15301553858309191, "grad_norm": 3.8553013801574707, "learning_rate": 4.235712404529892e-05, "loss": 0.6162, "step": 5810 }, { "epoch": 0.15304187516460363, "grad_norm": 2.2669570446014404, "learning_rate": 4.235580721622334e-05, "loss": 1.9062, "step": 5811 }, { "epoch": 0.15306821174611535, "grad_norm": 3.381948232650757, "learning_rate": 4.235449038714775e-05, "loss": 2.244, "step": 5812 }, { "epoch": 0.15309454832762706, "grad_norm": 5.034691333770752, "learning_rate": 4.235317355807216e-05, "loss": 1.4453, "step": 5813 }, { "epoch": 0.1531208849091388, "grad_norm": 2.2089407444000244, "learning_rate": 4.235185672899658e-05, "loss": 1.9648, "step": 5814 }, { "epoch": 0.15314722149065052, "grad_norm": 1.9475656747817993, "learning_rate": 4.2350539899920994e-05, "loss": 1.8764, "step": 5815 }, { "epoch": 0.15317355807216224, "grad_norm": 1.8945266008377075, "learning_rate": 4.234922307084541e-05, "loss": 2.6819, "step": 5816 }, { "epoch": 0.15319989465367395, "grad_norm": 1.7970775365829468, "learning_rate": 4.234790624176982e-05, "loss": 0.5971, "step": 5817 }, { "epoch": 0.15322623123518567, "grad_norm": 1.876449704170227, "learning_rate": 4.2346589412694234e-05, "loss": 2.2377, "step": 5818 }, { "epoch": 0.15325256781669738, "grad_norm": 3.9584176540374756, "learning_rate": 4.234527258361865e-05, "loss": 2.1886, "step": 5819 }, { "epoch": 0.15327890439820913, "grad_norm": 2.0388834476470947, "learning_rate": 4.2343955754543065e-05, "loss": 1.7564, "step": 5820 }, { "epoch": 0.15330524097972084, "grad_norm": 1.9596577882766724, "learning_rate": 4.2342638925467474e-05, "loss": 2.2364, "step": 5821 }, { "epoch": 0.15333157756123256, "grad_norm": 5.078189373016357, "learning_rate": 4.234132209639189e-05, "loss": 1.8395, "step": 5822 }, { "epoch": 0.15335791414274427, "grad_norm": 
2.767908811569214, "learning_rate": 4.2340005267316305e-05, "loss": 1.4972, "step": 5823 }, { "epoch": 0.153384250724256, "grad_norm": 3.0601449012756348, "learning_rate": 4.233868843824072e-05, "loss": 1.2673, "step": 5824 }, { "epoch": 0.1534105873057677, "grad_norm": 1.8390586376190186, "learning_rate": 4.2337371609165136e-05, "loss": 1.7768, "step": 5825 }, { "epoch": 0.15343692388727942, "grad_norm": 1.7578758001327515, "learning_rate": 4.2336054780089545e-05, "loss": 1.5339, "step": 5826 }, { "epoch": 0.15346326046879116, "grad_norm": 2.2950122356414795, "learning_rate": 4.233473795101396e-05, "loss": 2.3913, "step": 5827 }, { "epoch": 0.15348959705030288, "grad_norm": 2.3848166465759277, "learning_rate": 4.2333421121938376e-05, "loss": 2.1667, "step": 5828 }, { "epoch": 0.1535159336318146, "grad_norm": 2.779529333114624, "learning_rate": 4.233210429286279e-05, "loss": 1.5195, "step": 5829 }, { "epoch": 0.1535422702133263, "grad_norm": 2.484830379486084, "learning_rate": 4.23307874637872e-05, "loss": 1.6426, "step": 5830 }, { "epoch": 0.15356860679483803, "grad_norm": 4.905272483825684, "learning_rate": 4.2329470634711616e-05, "loss": 1.1128, "step": 5831 }, { "epoch": 0.15359494337634974, "grad_norm": 1.969044804573059, "learning_rate": 4.2328153805636025e-05, "loss": 1.4686, "step": 5832 }, { "epoch": 0.15362127995786146, "grad_norm": 2.164372205734253, "learning_rate": 4.232683697656045e-05, "loss": 1.7307, "step": 5833 }, { "epoch": 0.1536476165393732, "grad_norm": 2.4948770999908447, "learning_rate": 4.2325520147484856e-05, "loss": 0.5157, "step": 5834 }, { "epoch": 0.15367395312088492, "grad_norm": 4.046682834625244, "learning_rate": 4.232420331840927e-05, "loss": 1.154, "step": 5835 }, { "epoch": 0.15370028970239663, "grad_norm": 3.777179718017578, "learning_rate": 4.232288648933369e-05, "loss": 1.7118, "step": 5836 }, { "epoch": 0.15372662628390835, "grad_norm": 3.2744557857513428, "learning_rate": 4.2321569660258096e-05, "loss": 1.9194, "step": 5837 
}, { "epoch": 0.15375296286542006, "grad_norm": 1.9500840902328491, "learning_rate": 4.232025283118252e-05, "loss": 2.2378, "step": 5838 }, { "epoch": 0.15377929944693178, "grad_norm": 2.1920278072357178, "learning_rate": 4.231893600210693e-05, "loss": 2.0352, "step": 5839 }, { "epoch": 0.15380563602844352, "grad_norm": 5.23524808883667, "learning_rate": 4.231761917303134e-05, "loss": 1.5955, "step": 5840 }, { "epoch": 0.15383197260995524, "grad_norm": 6.422600269317627, "learning_rate": 4.231630234395575e-05, "loss": 1.1283, "step": 5841 }, { "epoch": 0.15385830919146695, "grad_norm": 2.6680405139923096, "learning_rate": 4.2314985514880174e-05, "loss": 1.639, "step": 5842 }, { "epoch": 0.15388464577297867, "grad_norm": 1.9572499990463257, "learning_rate": 4.231366868580458e-05, "loss": 2.2494, "step": 5843 }, { "epoch": 0.15391098235449038, "grad_norm": 3.598973512649536, "learning_rate": 4.2312351856729e-05, "loss": 0.8718, "step": 5844 }, { "epoch": 0.1539373189360021, "grad_norm": 4.288572788238525, "learning_rate": 4.2311035027653414e-05, "loss": 1.7746, "step": 5845 }, { "epoch": 0.15396365551751381, "grad_norm": 2.566802501678467, "learning_rate": 4.230971819857782e-05, "loss": 1.4888, "step": 5846 }, { "epoch": 0.15398999209902556, "grad_norm": 2.754087209701538, "learning_rate": 4.2308401369502246e-05, "loss": 1.6778, "step": 5847 }, { "epoch": 0.15401632868053727, "grad_norm": 2.596709728240967, "learning_rate": 4.2307084540426654e-05, "loss": 1.6571, "step": 5848 }, { "epoch": 0.154042665262049, "grad_norm": 1.8524587154388428, "learning_rate": 4.230576771135107e-05, "loss": 1.7538, "step": 5849 }, { "epoch": 0.1540690018435607, "grad_norm": 2.5000038146972656, "learning_rate": 4.230445088227548e-05, "loss": 2.0407, "step": 5850 }, { "epoch": 0.15409533842507242, "grad_norm": 3.3258731365203857, "learning_rate": 4.2303134053199894e-05, "loss": 1.3404, "step": 5851 }, { "epoch": 0.15412167500658414, "grad_norm": 2.608553886413574, "learning_rate": 
4.230181722412431e-05, "loss": 1.535, "step": 5852 }, { "epoch": 0.15414801158809588, "grad_norm": 2.5919270515441895, "learning_rate": 4.2300500395048726e-05, "loss": 0.9003, "step": 5853 }, { "epoch": 0.1541743481696076, "grad_norm": 2.164841413497925, "learning_rate": 4.229918356597314e-05, "loss": 1.7515, "step": 5854 }, { "epoch": 0.1542006847511193, "grad_norm": 8.342192649841309, "learning_rate": 4.229786673689755e-05, "loss": 1.4762, "step": 5855 }, { "epoch": 0.15422702133263103, "grad_norm": 2.4404876232147217, "learning_rate": 4.2296549907821966e-05, "loss": 1.7751, "step": 5856 }, { "epoch": 0.15425335791414274, "grad_norm": 1.9116249084472656, "learning_rate": 4.229523307874638e-05, "loss": 2.2532, "step": 5857 }, { "epoch": 0.15427969449565446, "grad_norm": 1.9052069187164307, "learning_rate": 4.22939162496708e-05, "loss": 1.7236, "step": 5858 }, { "epoch": 0.15430603107716617, "grad_norm": 2.3022842407226562, "learning_rate": 4.2292599420595206e-05, "loss": 2.0284, "step": 5859 }, { "epoch": 0.15433236765867792, "grad_norm": 2.4093234539031982, "learning_rate": 4.229128259151962e-05, "loss": 2.0325, "step": 5860 }, { "epoch": 0.15435870424018963, "grad_norm": 4.183977127075195, "learning_rate": 4.228996576244404e-05, "loss": 1.6332, "step": 5861 }, { "epoch": 0.15438504082170135, "grad_norm": 2.769796371459961, "learning_rate": 4.228864893336845e-05, "loss": 0.5197, "step": 5862 }, { "epoch": 0.15441137740321306, "grad_norm": 3.7631874084472656, "learning_rate": 4.228733210429287e-05, "loss": 1.8485, "step": 5863 }, { "epoch": 0.15443771398472478, "grad_norm": 2.9936611652374268, "learning_rate": 4.228601527521728e-05, "loss": 1.0529, "step": 5864 }, { "epoch": 0.1544640505662365, "grad_norm": 1.7187716960906982, "learning_rate": 4.228469844614169e-05, "loss": 2.0852, "step": 5865 }, { "epoch": 0.1544903871477482, "grad_norm": 2.605057716369629, "learning_rate": 4.228338161706611e-05, "loss": 1.7557, "step": 5866 }, { "epoch": 0.15451672372925995, 
"grad_norm": 2.624366521835327, "learning_rate": 4.2282064787990524e-05, "loss": 1.6579, "step": 5867 }, { "epoch": 0.15454306031077167, "grad_norm": 2.2019760608673096, "learning_rate": 4.228074795891493e-05, "loss": 1.8905, "step": 5868 }, { "epoch": 0.15456939689228338, "grad_norm": 3.4228620529174805, "learning_rate": 4.227943112983935e-05, "loss": 0.8433, "step": 5869 }, { "epoch": 0.1545957334737951, "grad_norm": 2.1239914894104004, "learning_rate": 4.2278114300763764e-05, "loss": 1.8914, "step": 5870 }, { "epoch": 0.15462207005530682, "grad_norm": 2.1944406032562256, "learning_rate": 4.227679747168818e-05, "loss": 2.1417, "step": 5871 }, { "epoch": 0.15464840663681853, "grad_norm": 1.8497647047042847, "learning_rate": 4.2275480642612595e-05, "loss": 1.9463, "step": 5872 }, { "epoch": 0.15467474321833027, "grad_norm": 2.543242931365967, "learning_rate": 4.2274163813537004e-05, "loss": 2.1293, "step": 5873 }, { "epoch": 0.154701079799842, "grad_norm": 1.7186356782913208, "learning_rate": 4.227284698446142e-05, "loss": 1.4707, "step": 5874 }, { "epoch": 0.1547274163813537, "grad_norm": 2.7276201248168945, "learning_rate": 4.2271530155385835e-05, "loss": 1.3838, "step": 5875 }, { "epoch": 0.15475375296286542, "grad_norm": 1.9710484743118286, "learning_rate": 4.227021332631025e-05, "loss": 1.7793, "step": 5876 }, { "epoch": 0.15478008954437714, "grad_norm": 2.6219520568847656, "learning_rate": 4.226889649723466e-05, "loss": 1.9442, "step": 5877 }, { "epoch": 0.15480642612588885, "grad_norm": 2.085421562194824, "learning_rate": 4.2267579668159075e-05, "loss": 1.7946, "step": 5878 }, { "epoch": 0.15483276270740057, "grad_norm": 6.2612738609313965, "learning_rate": 4.2266262839083484e-05, "loss": 2.296, "step": 5879 }, { "epoch": 0.1548590992889123, "grad_norm": 3.3673183917999268, "learning_rate": 4.2264946010007906e-05, "loss": 1.9816, "step": 5880 }, { "epoch": 0.15488543587042403, "grad_norm": 2.5657103061676025, "learning_rate": 4.2263629180932315e-05, "loss": 
1.9609, "step": 5881 }, { "epoch": 0.15491177245193574, "grad_norm": 1.882688283920288, "learning_rate": 4.226231235185673e-05, "loss": 1.8117, "step": 5882 }, { "epoch": 0.15493810903344746, "grad_norm": 1.9992015361785889, "learning_rate": 4.2260995522781146e-05, "loss": 0.8731, "step": 5883 }, { "epoch": 0.15496444561495917, "grad_norm": 3.999999523162842, "learning_rate": 4.2259678693705555e-05, "loss": 1.2519, "step": 5884 }, { "epoch": 0.1549907821964709, "grad_norm": 2.3278517723083496, "learning_rate": 4.225836186462998e-05, "loss": 1.2304, "step": 5885 }, { "epoch": 0.1550171187779826, "grad_norm": 3.1681950092315674, "learning_rate": 4.2257045035554386e-05, "loss": 2.021, "step": 5886 }, { "epoch": 0.15504345535949435, "grad_norm": 2.4006595611572266, "learning_rate": 4.22557282064788e-05, "loss": 1.8774, "step": 5887 }, { "epoch": 0.15506979194100606, "grad_norm": 4.515176773071289, "learning_rate": 4.225441137740321e-05, "loss": 1.3982, "step": 5888 }, { "epoch": 0.15509612852251778, "grad_norm": 2.631305456161499, "learning_rate": 4.2253094548327626e-05, "loss": 1.4711, "step": 5889 }, { "epoch": 0.1551224651040295, "grad_norm": 1.4728326797485352, "learning_rate": 4.225177771925204e-05, "loss": 1.8721, "step": 5890 }, { "epoch": 0.1551488016855412, "grad_norm": 1.9591310024261475, "learning_rate": 4.225046089017646e-05, "loss": 1.8931, "step": 5891 }, { "epoch": 0.15517513826705293, "grad_norm": 2.040419816970825, "learning_rate": 4.224914406110087e-05, "loss": 1.7898, "step": 5892 }, { "epoch": 0.15520147484856467, "grad_norm": 1.7353781461715698, "learning_rate": 4.224782723202528e-05, "loss": 2.1022, "step": 5893 }, { "epoch": 0.15522781143007638, "grad_norm": 1.6009823083877563, "learning_rate": 4.2246510402949704e-05, "loss": 1.6196, "step": 5894 }, { "epoch": 0.1552541480115881, "grad_norm": 4.091841220855713, "learning_rate": 4.224519357387411e-05, "loss": 1.0809, "step": 5895 }, { "epoch": 0.15528048459309982, "grad_norm": 1.8655463457107544, 
"learning_rate": 4.224387674479853e-05, "loss": 2.0149, "step": 5896 }, { "epoch": 0.15530682117461153, "grad_norm": 2.3955373764038086, "learning_rate": 4.224255991572294e-05, "loss": 2.4355, "step": 5897 }, { "epoch": 0.15533315775612325, "grad_norm": 4.653458595275879, "learning_rate": 4.224124308664735e-05, "loss": 1.4602, "step": 5898 }, { "epoch": 0.15535949433763496, "grad_norm": 3.4966752529144287, "learning_rate": 4.223992625757177e-05, "loss": 1.204, "step": 5899 }, { "epoch": 0.1553858309191467, "grad_norm": 4.484646320343018, "learning_rate": 4.2238609428496184e-05, "loss": 1.6046, "step": 5900 }, { "epoch": 0.15541216750065842, "grad_norm": 2.649949550628662, "learning_rate": 4.22372925994206e-05, "loss": 1.954, "step": 5901 }, { "epoch": 0.15543850408217014, "grad_norm": 1.8868272304534912, "learning_rate": 4.223597577034501e-05, "loss": 1.4536, "step": 5902 }, { "epoch": 0.15546484066368185, "grad_norm": 2.0686264038085938, "learning_rate": 4.2234658941269424e-05, "loss": 1.6357, "step": 5903 }, { "epoch": 0.15549117724519357, "grad_norm": 1.8744795322418213, "learning_rate": 4.223334211219384e-05, "loss": 1.544, "step": 5904 }, { "epoch": 0.15551751382670528, "grad_norm": 2.567208766937256, "learning_rate": 4.2232025283118255e-05, "loss": 1.4843, "step": 5905 }, { "epoch": 0.15554385040821703, "grad_norm": 3.946901559829712, "learning_rate": 4.2230708454042664e-05, "loss": 1.6515, "step": 5906 }, { "epoch": 0.15557018698972874, "grad_norm": 1.8149755001068115, "learning_rate": 4.222939162496708e-05, "loss": 2.1445, "step": 5907 }, { "epoch": 0.15559652357124046, "grad_norm": 2.1219542026519775, "learning_rate": 4.2228074795891495e-05, "loss": 2.3332, "step": 5908 }, { "epoch": 0.15562286015275217, "grad_norm": 1.72514808177948, "learning_rate": 4.222675796681591e-05, "loss": 1.3152, "step": 5909 }, { "epoch": 0.1556491967342639, "grad_norm": 1.9629729986190796, "learning_rate": 4.2225441137740327e-05, "loss": 1.8084, "step": 5910 }, { "epoch": 
0.1556755333157756, "grad_norm": 2.301128387451172, "learning_rate": 4.2224124308664735e-05, "loss": 1.7519, "step": 5911 }, { "epoch": 0.15570186989728732, "grad_norm": 2.1626784801483154, "learning_rate": 4.222280747958915e-05, "loss": 2.0657, "step": 5912 }, { "epoch": 0.15572820647879906, "grad_norm": 2.2094016075134277, "learning_rate": 4.2221490650513567e-05, "loss": 1.8229, "step": 5913 }, { "epoch": 0.15575454306031078, "grad_norm": 2.475167751312256, "learning_rate": 4.222017382143798e-05, "loss": 1.6667, "step": 5914 }, { "epoch": 0.1557808796418225, "grad_norm": 3.6198947429656982, "learning_rate": 4.221885699236239e-05, "loss": 1.192, "step": 5915 }, { "epoch": 0.1558072162233342, "grad_norm": 1.987687349319458, "learning_rate": 4.2217540163286807e-05, "loss": 2.4872, "step": 5916 }, { "epoch": 0.15583355280484593, "grad_norm": 2.642976760864258, "learning_rate": 4.221622333421122e-05, "loss": 1.0824, "step": 5917 }, { "epoch": 0.15585988938635764, "grad_norm": 1.8391703367233276, "learning_rate": 4.221490650513564e-05, "loss": 0.5032, "step": 5918 }, { "epoch": 0.15588622596786936, "grad_norm": 2.433856725692749, "learning_rate": 4.221358967606005e-05, "loss": 2.2421, "step": 5919 }, { "epoch": 0.1559125625493811, "grad_norm": 2.7764840126037598, "learning_rate": 4.221227284698446e-05, "loss": 1.4468, "step": 5920 }, { "epoch": 0.15593889913089282, "grad_norm": 1.781196117401123, "learning_rate": 4.221095601790888e-05, "loss": 1.862, "step": 5921 }, { "epoch": 0.15596523571240453, "grad_norm": 3.049417734146118, "learning_rate": 4.2209639188833287e-05, "loss": 1.8256, "step": 5922 }, { "epoch": 0.15599157229391625, "grad_norm": 4.471404075622559, "learning_rate": 4.220832235975771e-05, "loss": 2.0908, "step": 5923 }, { "epoch": 0.15601790887542796, "grad_norm": 2.175081491470337, "learning_rate": 4.220700553068212e-05, "loss": 1.7903, "step": 5924 }, { "epoch": 0.15604424545693968, "grad_norm": 1.913242220878601, "learning_rate": 
4.2205688701606533e-05, "loss": 1.7829, "step": 5925 }, { "epoch": 0.15607058203845142, "grad_norm": 3.270598888397217, "learning_rate": 4.220437187253095e-05, "loss": 1.3993, "step": 5926 }, { "epoch": 0.15609691861996314, "grad_norm": 3.6847281455993652, "learning_rate": 4.2203055043455365e-05, "loss": 1.1214, "step": 5927 }, { "epoch": 0.15612325520147485, "grad_norm": 5.7359442710876465, "learning_rate": 4.220173821437978e-05, "loss": 1.7749, "step": 5928 }, { "epoch": 0.15614959178298657, "grad_norm": 2.10076642036438, "learning_rate": 4.220042138530419e-05, "loss": 2.0794, "step": 5929 }, { "epoch": 0.15617592836449828, "grad_norm": 2.211111545562744, "learning_rate": 4.2199104556228605e-05, "loss": 1.8982, "step": 5930 }, { "epoch": 0.15620226494601, "grad_norm": 2.931358814239502, "learning_rate": 4.2197787727153013e-05, "loss": 1.2573, "step": 5931 }, { "epoch": 0.15622860152752172, "grad_norm": 2.6709563732147217, "learning_rate": 4.2196470898077436e-05, "loss": 1.8708, "step": 5932 }, { "epoch": 0.15625493810903346, "grad_norm": 3.6660375595092773, "learning_rate": 4.2195154069001845e-05, "loss": 0.463, "step": 5933 }, { "epoch": 0.15628127469054517, "grad_norm": 2.1038217544555664, "learning_rate": 4.219383723992626e-05, "loss": 2.1543, "step": 5934 }, { "epoch": 0.1563076112720569, "grad_norm": 1.6519107818603516, "learning_rate": 4.219252041085067e-05, "loss": 2.2121, "step": 5935 }, { "epoch": 0.1563339478535686, "grad_norm": 2.7920644283294678, "learning_rate": 4.2191203581775085e-05, "loss": 0.483, "step": 5936 }, { "epoch": 0.15636028443508032, "grad_norm": 2.220864772796631, "learning_rate": 4.21898867526995e-05, "loss": 1.8855, "step": 5937 }, { "epoch": 0.15638662101659204, "grad_norm": 2.7000720500946045, "learning_rate": 4.2188569923623916e-05, "loss": 0.5763, "step": 5938 }, { "epoch": 0.15641295759810375, "grad_norm": 1.735314130783081, "learning_rate": 4.218725309454833e-05, "loss": 1.363, "step": 5939 }, { "epoch": 0.1564392941796155, 
"grad_norm": 3.029871940612793, "learning_rate": 4.218593626547274e-05, "loss": 1.6139, "step": 5940 }, { "epoch": 0.1564656307611272, "grad_norm": 1.7269423007965088, "learning_rate": 4.218461943639716e-05, "loss": 1.9701, "step": 5941 }, { "epoch": 0.15649196734263893, "grad_norm": 2.508410692214966, "learning_rate": 4.218330260732157e-05, "loss": 1.7556, "step": 5942 }, { "epoch": 0.15651830392415064, "grad_norm": 6.6043314933776855, "learning_rate": 4.218198577824599e-05, "loss": 2.4914, "step": 5943 }, { "epoch": 0.15654464050566236, "grad_norm": 2.9760141372680664, "learning_rate": 4.2180668949170396e-05, "loss": 2.1465, "step": 5944 }, { "epoch": 0.15657097708717407, "grad_norm": 1.6758267879486084, "learning_rate": 4.217935212009481e-05, "loss": 1.1808, "step": 5945 }, { "epoch": 0.15659731366868582, "grad_norm": 2.32725191116333, "learning_rate": 4.217803529101923e-05, "loss": 2.24, "step": 5946 }, { "epoch": 0.15662365025019753, "grad_norm": 2.053637981414795, "learning_rate": 4.217671846194364e-05, "loss": 1.7966, "step": 5947 }, { "epoch": 0.15664998683170925, "grad_norm": 1.6508933305740356, "learning_rate": 4.217540163286806e-05, "loss": 2.4334, "step": 5948 }, { "epoch": 0.15667632341322096, "grad_norm": 2.3459978103637695, "learning_rate": 4.217408480379247e-05, "loss": 2.1246, "step": 5949 }, { "epoch": 0.15670265999473268, "grad_norm": 2.6824984550476074, "learning_rate": 4.217276797471688e-05, "loss": 1.7024, "step": 5950 }, { "epoch": 0.1567289965762444, "grad_norm": 2.0082266330718994, "learning_rate": 4.21714511456413e-05, "loss": 1.6473, "step": 5951 }, { "epoch": 0.1567553331577561, "grad_norm": 2.1118314266204834, "learning_rate": 4.2170134316565714e-05, "loss": 1.8222, "step": 5952 }, { "epoch": 0.15678166973926785, "grad_norm": 2.196316719055176, "learning_rate": 4.216881748749012e-05, "loss": 1.1248, "step": 5953 }, { "epoch": 0.15680800632077957, "grad_norm": 3.4021639823913574, "learning_rate": 4.216750065841454e-05, "loss": 1.7389, 
"step": 5954 }, { "epoch": 0.15683434290229128, "grad_norm": 1.839551329612732, "learning_rate": 4.2166183829338954e-05, "loss": 1.8474, "step": 5955 }, { "epoch": 0.156860679483803, "grad_norm": 1.7327178716659546, "learning_rate": 4.216486700026337e-05, "loss": 1.9408, "step": 5956 }, { "epoch": 0.15688701606531472, "grad_norm": 3.7114171981811523, "learning_rate": 4.2163550171187785e-05, "loss": 2.2037, "step": 5957 }, { "epoch": 0.15691335264682643, "grad_norm": 2.627505302429199, "learning_rate": 4.2162233342112194e-05, "loss": 1.0022, "step": 5958 }, { "epoch": 0.15693968922833818, "grad_norm": 2.1350152492523193, "learning_rate": 4.216091651303661e-05, "loss": 0.3743, "step": 5959 }, { "epoch": 0.1569660258098499, "grad_norm": 3.300657033920288, "learning_rate": 4.2159599683961025e-05, "loss": 1.3048, "step": 5960 }, { "epoch": 0.1569923623913616, "grad_norm": 2.4122374057769775, "learning_rate": 4.215828285488544e-05, "loss": 2.2151, "step": 5961 }, { "epoch": 0.15701869897287332, "grad_norm": 3.083698272705078, "learning_rate": 4.215696602580985e-05, "loss": 1.3165, "step": 5962 }, { "epoch": 0.15704503555438504, "grad_norm": 2.317965269088745, "learning_rate": 4.2155649196734265e-05, "loss": 0.5157, "step": 5963 }, { "epoch": 0.15707137213589675, "grad_norm": 2.3910367488861084, "learning_rate": 4.215433236765868e-05, "loss": 2.3669, "step": 5964 }, { "epoch": 0.15709770871740847, "grad_norm": 1.941091537475586, "learning_rate": 4.2153015538583096e-05, "loss": 2.5886, "step": 5965 }, { "epoch": 0.1571240452989202, "grad_norm": 2.7749321460723877, "learning_rate": 4.215169870950751e-05, "loss": 1.7127, "step": 5966 }, { "epoch": 0.15715038188043193, "grad_norm": 5.011218547821045, "learning_rate": 4.215038188043192e-05, "loss": 1.9305, "step": 5967 }, { "epoch": 0.15717671846194364, "grad_norm": 2.4140255451202393, "learning_rate": 4.2149065051356336e-05, "loss": 2.2752, "step": 5968 }, { "epoch": 0.15720305504345536, "grad_norm": 3.319852113723755, 
"learning_rate": 4.2147748222280745e-05, "loss": 1.2568, "step": 5969 }, { "epoch": 0.15722939162496707, "grad_norm": 3.5438194274902344, "learning_rate": 4.214643139320517e-05, "loss": 1.2369, "step": 5970 }, { "epoch": 0.1572557282064788, "grad_norm": 6.879681587219238, "learning_rate": 4.2145114564129576e-05, "loss": 2.2988, "step": 5971 }, { "epoch": 0.1572820647879905, "grad_norm": 2.6652133464813232, "learning_rate": 4.214379773505399e-05, "loss": 1.8378, "step": 5972 }, { "epoch": 0.15730840136950225, "grad_norm": 2.0418589115142822, "learning_rate": 4.214248090597841e-05, "loss": 2.3504, "step": 5973 }, { "epoch": 0.15733473795101396, "grad_norm": 2.0920841693878174, "learning_rate": 4.214116407690282e-05, "loss": 2.1954, "step": 5974 }, { "epoch": 0.15736107453252568, "grad_norm": 6.576579570770264, "learning_rate": 4.213984724782724e-05, "loss": 2.1881, "step": 5975 }, { "epoch": 0.1573874111140374, "grad_norm": 2.9959959983825684, "learning_rate": 4.213853041875165e-05, "loss": 1.5807, "step": 5976 }, { "epoch": 0.1574137476955491, "grad_norm": 1.959325909614563, "learning_rate": 4.213721358967606e-05, "loss": 1.3493, "step": 5977 }, { "epoch": 0.15744008427706083, "grad_norm": 9.182555198669434, "learning_rate": 4.213589676060047e-05, "loss": 2.2217, "step": 5978 }, { "epoch": 0.15746642085857257, "grad_norm": 2.128164529800415, "learning_rate": 4.2134579931524894e-05, "loss": 1.9559, "step": 5979 }, { "epoch": 0.15749275744008429, "grad_norm": 1.984562635421753, "learning_rate": 4.21332631024493e-05, "loss": 2.0752, "step": 5980 }, { "epoch": 0.157519094021596, "grad_norm": 2.0138955116271973, "learning_rate": 4.213194627337372e-05, "loss": 1.9993, "step": 5981 }, { "epoch": 0.15754543060310772, "grad_norm": 2.8540875911712646, "learning_rate": 4.213062944429813e-05, "loss": 2.5846, "step": 5982 }, { "epoch": 0.15757176718461943, "grad_norm": 4.09743595123291, "learning_rate": 4.212931261522254e-05, "loss": 1.5782, "step": 5983 }, { "epoch": 
0.15759810376613115, "grad_norm": 2.2861154079437256, "learning_rate": 4.212799578614696e-05, "loss": 1.4951, "step": 5984 }, { "epoch": 0.15762444034764286, "grad_norm": 3.8065881729125977, "learning_rate": 4.2126678957071374e-05, "loss": 1.2832, "step": 5985 }, { "epoch": 0.1576507769291546, "grad_norm": 2.0236055850982666, "learning_rate": 4.212536212799579e-05, "loss": 1.5231, "step": 5986 }, { "epoch": 0.15767711351066632, "grad_norm": 2.220747947692871, "learning_rate": 4.21240452989202e-05, "loss": 0.7072, "step": 5987 }, { "epoch": 0.15770345009217804, "grad_norm": 3.2718474864959717, "learning_rate": 4.212272846984462e-05, "loss": 1.3849, "step": 5988 }, { "epoch": 0.15772978667368975, "grad_norm": 1.6934703588485718, "learning_rate": 4.212141164076903e-05, "loss": 1.7882, "step": 5989 }, { "epoch": 0.15775612325520147, "grad_norm": 1.4751981496810913, "learning_rate": 4.2120094811693446e-05, "loss": 0.4046, "step": 5990 }, { "epoch": 0.15778245983671318, "grad_norm": 1.5996410846710205, "learning_rate": 4.2118777982617854e-05, "loss": 2.0365, "step": 5991 }, { "epoch": 0.15780879641822493, "grad_norm": 2.111327648162842, "learning_rate": 4.211746115354227e-05, "loss": 0.646, "step": 5992 }, { "epoch": 0.15783513299973664, "grad_norm": 2.0970349311828613, "learning_rate": 4.2116144324466686e-05, "loss": 1.7066, "step": 5993 }, { "epoch": 0.15786146958124836, "grad_norm": 3.1678991317749023, "learning_rate": 4.21148274953911e-05, "loss": 1.5934, "step": 5994 }, { "epoch": 0.15788780616276007, "grad_norm": 2.683861255645752, "learning_rate": 4.211351066631552e-05, "loss": 1.8981, "step": 5995 }, { "epoch": 0.1579141427442718, "grad_norm": 3.6035783290863037, "learning_rate": 4.2112193837239926e-05, "loss": 2.1261, "step": 5996 }, { "epoch": 0.1579404793257835, "grad_norm": 2.172053813934326, "learning_rate": 4.211087700816434e-05, "loss": 0.4462, "step": 5997 }, { "epoch": 0.15796681590729522, "grad_norm": 1.8016635179519653, "learning_rate": 
4.210956017908876e-05, "loss": 1.7273, "step": 5998 }, { "epoch": 0.15799315248880696, "grad_norm": 5.404572486877441, "learning_rate": 4.210824335001317e-05, "loss": 2.403, "step": 5999 }, { "epoch": 0.15801948907031868, "grad_norm": 4.310553550720215, "learning_rate": 4.210692652093758e-05, "loss": 1.1839, "step": 6000 }, { "epoch": 0.1580458256518304, "grad_norm": 1.9727861881256104, "learning_rate": 4.2105609691862e-05, "loss": 1.3579, "step": 6001 }, { "epoch": 0.1580721622333421, "grad_norm": 2.3161890506744385, "learning_rate": 4.210429286278641e-05, "loss": 1.7293, "step": 6002 }, { "epoch": 0.15809849881485383, "grad_norm": 3.0780434608459473, "learning_rate": 4.210297603371083e-05, "loss": 1.6469, "step": 6003 }, { "epoch": 0.15812483539636554, "grad_norm": 2.0891878604888916, "learning_rate": 4.2101659204635244e-05, "loss": 1.943, "step": 6004 }, { "epoch": 0.15815117197787726, "grad_norm": 3.240722894668579, "learning_rate": 4.210034237555965e-05, "loss": 1.1894, "step": 6005 }, { "epoch": 0.158177508559389, "grad_norm": 2.5613348484039307, "learning_rate": 4.209902554648407e-05, "loss": 1.5481, "step": 6006 }, { "epoch": 0.15820384514090072, "grad_norm": 1.823609709739685, "learning_rate": 4.2097708717408484e-05, "loss": 1.7977, "step": 6007 }, { "epoch": 0.15823018172241243, "grad_norm": 2.269915819168091, "learning_rate": 4.20963918883329e-05, "loss": 2.1684, "step": 6008 }, { "epoch": 0.15825651830392415, "grad_norm": 2.3334107398986816, "learning_rate": 4.209507505925731e-05, "loss": 3.0635, "step": 6009 }, { "epoch": 0.15828285488543586, "grad_norm": 2.455613136291504, "learning_rate": 4.2093758230181724e-05, "loss": 0.5861, "step": 6010 }, { "epoch": 0.15830919146694758, "grad_norm": 2.6880509853363037, "learning_rate": 4.209244140110614e-05, "loss": 1.3298, "step": 6011 }, { "epoch": 0.15833552804845932, "grad_norm": 2.403714179992676, "learning_rate": 4.2091124572030555e-05, "loss": 1.9459, "step": 6012 }, { "epoch": 0.15836186462997104, 
"grad_norm": 1.4829332828521729, "learning_rate": 4.208980774295497e-05, "loss": 0.4841, "step": 6013 }, { "epoch": 0.15838820121148275, "grad_norm": 1.6144860982894897, "learning_rate": 4.208849091387938e-05, "loss": 1.8183, "step": 6014 }, { "epoch": 0.15841453779299447, "grad_norm": 2.2351977825164795, "learning_rate": 4.2087174084803795e-05, "loss": 2.0403, "step": 6015 }, { "epoch": 0.15844087437450619, "grad_norm": 2.477430820465088, "learning_rate": 4.2085857255728204e-05, "loss": 0.9024, "step": 6016 }, { "epoch": 0.1584672109560179, "grad_norm": 2.137519359588623, "learning_rate": 4.2084540426652626e-05, "loss": 1.5758, "step": 6017 }, { "epoch": 0.15849354753752962, "grad_norm": 4.021152496337891, "learning_rate": 4.2083223597577035e-05, "loss": 2.4805, "step": 6018 }, { "epoch": 0.15851988411904136, "grad_norm": 2.429715871810913, "learning_rate": 4.208190676850145e-05, "loss": 0.5486, "step": 6019 }, { "epoch": 0.15854622070055308, "grad_norm": 1.847489356994629, "learning_rate": 4.2080589939425866e-05, "loss": 2.4164, "step": 6020 }, { "epoch": 0.1585725572820648, "grad_norm": 2.0357372760772705, "learning_rate": 4.207927311035028e-05, "loss": 1.968, "step": 6021 }, { "epoch": 0.1585988938635765, "grad_norm": 2.8058173656463623, "learning_rate": 4.20779562812747e-05, "loss": 1.7579, "step": 6022 }, { "epoch": 0.15862523044508822, "grad_norm": 1.9289313554763794, "learning_rate": 4.2076639452199106e-05, "loss": 1.7415, "step": 6023 }, { "epoch": 0.15865156702659994, "grad_norm": 1.9205622673034668, "learning_rate": 4.207532262312352e-05, "loss": 1.9149, "step": 6024 }, { "epoch": 0.15867790360811165, "grad_norm": 5.0248637199401855, "learning_rate": 4.207400579404793e-05, "loss": 1.9054, "step": 6025 }, { "epoch": 0.1587042401896234, "grad_norm": 3.2937707901000977, "learning_rate": 4.207268896497235e-05, "loss": 1.3608, "step": 6026 }, { "epoch": 0.1587305767711351, "grad_norm": 2.269594192504883, "learning_rate": 4.207137213589676e-05, "loss": 0.7694, 
"step": 6027 }, { "epoch": 0.15875691335264683, "grad_norm": 3.008793592453003, "learning_rate": 4.207005530682118e-05, "loss": 1.2093, "step": 6028 }, { "epoch": 0.15878324993415854, "grad_norm": 2.551845073699951, "learning_rate": 4.2068738477745586e-05, "loss": 1.7906, "step": 6029 }, { "epoch": 0.15880958651567026, "grad_norm": 2.498796224594116, "learning_rate": 4.206742164867e-05, "loss": 1.7394, "step": 6030 }, { "epoch": 0.15883592309718197, "grad_norm": 2.4335715770721436, "learning_rate": 4.2066104819594424e-05, "loss": 1.7267, "step": 6031 }, { "epoch": 0.15886225967869372, "grad_norm": 1.9502872228622437, "learning_rate": 4.206478799051883e-05, "loss": 2.2899, "step": 6032 }, { "epoch": 0.15888859626020543, "grad_norm": 2.2342894077301025, "learning_rate": 4.206347116144325e-05, "loss": 1.6091, "step": 6033 }, { "epoch": 0.15891493284171715, "grad_norm": 1.7040371894836426, "learning_rate": 4.206215433236766e-05, "loss": 1.3489, "step": 6034 }, { "epoch": 0.15894126942322886, "grad_norm": 2.3223648071289062, "learning_rate": 4.206083750329207e-05, "loss": 1.6438, "step": 6035 }, { "epoch": 0.15896760600474058, "grad_norm": 1.715370535850525, "learning_rate": 4.205952067421649e-05, "loss": 1.6805, "step": 6036 }, { "epoch": 0.1589939425862523, "grad_norm": 3.320110559463501, "learning_rate": 4.2058203845140904e-05, "loss": 1.7764, "step": 6037 }, { "epoch": 0.159020279167764, "grad_norm": 3.9556894302368164, "learning_rate": 4.205688701606531e-05, "loss": 1.7075, "step": 6038 }, { "epoch": 0.15904661574927575, "grad_norm": 1.596814751625061, "learning_rate": 4.205557018698973e-05, "loss": 1.8104, "step": 6039 }, { "epoch": 0.15907295233078747, "grad_norm": 2.765883684158325, "learning_rate": 4.2054253357914144e-05, "loss": 1.5613, "step": 6040 }, { "epoch": 0.15909928891229919, "grad_norm": 1.7439045906066895, "learning_rate": 4.205293652883856e-05, "loss": 2.2978, "step": 6041 }, { "epoch": 0.1591256254938109, "grad_norm": 2.8818860054016113, 
"learning_rate": 4.2051619699762975e-05, "loss": 2.4009, "step": 6042 }, { "epoch": 0.15915196207532262, "grad_norm": 2.9042367935180664, "learning_rate": 4.2050302870687384e-05, "loss": 1.0585, "step": 6043 }, { "epoch": 0.15917829865683433, "grad_norm": 1.871683120727539, "learning_rate": 4.20489860416118e-05, "loss": 1.6521, "step": 6044 }, { "epoch": 0.15920463523834608, "grad_norm": 4.8750433921813965, "learning_rate": 4.2047669212536215e-05, "loss": 0.8817, "step": 6045 }, { "epoch": 0.1592309718198578, "grad_norm": 1.8055793046951294, "learning_rate": 4.204635238346063e-05, "loss": 1.9334, "step": 6046 }, { "epoch": 0.1592573084013695, "grad_norm": 2.7020907402038574, "learning_rate": 4.204503555438504e-05, "loss": 0.5869, "step": 6047 }, { "epoch": 0.15928364498288122, "grad_norm": 1.832258939743042, "learning_rate": 4.2043718725309455e-05, "loss": 1.9233, "step": 6048 }, { "epoch": 0.15930998156439294, "grad_norm": 2.2810006141662598, "learning_rate": 4.204240189623387e-05, "loss": 1.8354, "step": 6049 }, { "epoch": 0.15933631814590465, "grad_norm": 1.8633068799972534, "learning_rate": 4.2041085067158287e-05, "loss": 1.589, "step": 6050 }, { "epoch": 0.15936265472741637, "grad_norm": 2.8353729248046875, "learning_rate": 4.20397682380827e-05, "loss": 1.6054, "step": 6051 }, { "epoch": 0.1593889913089281, "grad_norm": 1.9458913803100586, "learning_rate": 4.203845140900711e-05, "loss": 1.167, "step": 6052 }, { "epoch": 0.15941532789043983, "grad_norm": 1.5614548921585083, "learning_rate": 4.2037134579931527e-05, "loss": 1.0224, "step": 6053 }, { "epoch": 0.15944166447195154, "grad_norm": 2.363529682159424, "learning_rate": 4.203581775085594e-05, "loss": 2.1562, "step": 6054 }, { "epoch": 0.15946800105346326, "grad_norm": 1.907061219215393, "learning_rate": 4.203450092178036e-05, "loss": 2.2413, "step": 6055 }, { "epoch": 0.15949433763497498, "grad_norm": 5.0372819900512695, "learning_rate": 4.203318409270477e-05, "loss": 0.5852, "step": 6056 }, { "epoch": 
0.1595206742164867, "grad_norm": 1.8625068664550781, "learning_rate": 4.203186726362918e-05, "loss": 1.1972, "step": 6057 }, { "epoch": 0.1595470107979984, "grad_norm": 2.006800413131714, "learning_rate": 4.20305504345536e-05, "loss": 1.2923, "step": 6058 }, { "epoch": 0.15957334737951015, "grad_norm": 1.7793184518814087, "learning_rate": 4.2029233605478013e-05, "loss": 1.1352, "step": 6059 }, { "epoch": 0.15959968396102187, "grad_norm": 3.3056442737579346, "learning_rate": 4.202791677640243e-05, "loss": 1.5788, "step": 6060 }, { "epoch": 0.15962602054253358, "grad_norm": 1.7238984107971191, "learning_rate": 4.202659994732684e-05, "loss": 2.3818, "step": 6061 }, { "epoch": 0.1596523571240453, "grad_norm": 3.204756498336792, "learning_rate": 4.2025283118251253e-05, "loss": 2.0978, "step": 6062 }, { "epoch": 0.159678693705557, "grad_norm": 2.7013306617736816, "learning_rate": 4.202396628917566e-05, "loss": 1.4563, "step": 6063 }, { "epoch": 0.15970503028706873, "grad_norm": 1.7570176124572754, "learning_rate": 4.2022649460100085e-05, "loss": 1.7673, "step": 6064 }, { "epoch": 0.15973136686858047, "grad_norm": 1.803415060043335, "learning_rate": 4.2021332631024493e-05, "loss": 1.7306, "step": 6065 }, { "epoch": 0.1597577034500922, "grad_norm": 2.3842899799346924, "learning_rate": 4.202001580194891e-05, "loss": 2.4828, "step": 6066 }, { "epoch": 0.1597840400316039, "grad_norm": 3.046964645385742, "learning_rate": 4.2018698972873325e-05, "loss": 1.9093, "step": 6067 }, { "epoch": 0.15981037661311562, "grad_norm": 2.353818416595459, "learning_rate": 4.2017382143797733e-05, "loss": 1.5821, "step": 6068 }, { "epoch": 0.15983671319462733, "grad_norm": 2.210801839828491, "learning_rate": 4.2016065314722156e-05, "loss": 0.9711, "step": 6069 }, { "epoch": 0.15986304977613905, "grad_norm": 3.385026454925537, "learning_rate": 4.2014748485646565e-05, "loss": 1.688, "step": 6070 }, { "epoch": 0.15988938635765076, "grad_norm": 3.45007586479187, "learning_rate": 
4.201343165657098e-05, "loss": 2.0668, "step": 6071 }, { "epoch": 0.1599157229391625, "grad_norm": 3.061713933944702, "learning_rate": 4.201211482749539e-05, "loss": 1.9086, "step": 6072 }, { "epoch": 0.15994205952067422, "grad_norm": 2.809657096862793, "learning_rate": 4.201079799841981e-05, "loss": 1.4217, "step": 6073 }, { "epoch": 0.15996839610218594, "grad_norm": 2.6209864616394043, "learning_rate": 4.200948116934422e-05, "loss": 1.914, "step": 6074 }, { "epoch": 0.15999473268369765, "grad_norm": 3.004488468170166, "learning_rate": 4.2008164340268636e-05, "loss": 2.0692, "step": 6075 }, { "epoch": 0.16002106926520937, "grad_norm": 3.5518791675567627, "learning_rate": 4.200684751119305e-05, "loss": 1.2734, "step": 6076 }, { "epoch": 0.16004740584672109, "grad_norm": 2.149366855621338, "learning_rate": 4.200553068211746e-05, "loss": 1.0786, "step": 6077 }, { "epoch": 0.16007374242823283, "grad_norm": 2.207019090652466, "learning_rate": 4.200421385304188e-05, "loss": 1.9736, "step": 6078 }, { "epoch": 0.16010007900974454, "grad_norm": 3.112743377685547, "learning_rate": 4.200289702396629e-05, "loss": 2.1226, "step": 6079 }, { "epoch": 0.16012641559125626, "grad_norm": 2.382636785507202, "learning_rate": 4.200158019489071e-05, "loss": 1.677, "step": 6080 }, { "epoch": 0.16015275217276798, "grad_norm": 2.8617513179779053, "learning_rate": 4.2000263365815116e-05, "loss": 1.5793, "step": 6081 }, { "epoch": 0.1601790887542797, "grad_norm": 2.2968790531158447, "learning_rate": 4.199894653673953e-05, "loss": 2.1024, "step": 6082 }, { "epoch": 0.1602054253357914, "grad_norm": 2.184506416320801, "learning_rate": 4.199762970766395e-05, "loss": 1.9684, "step": 6083 }, { "epoch": 0.16023176191730312, "grad_norm": 4.663570404052734, "learning_rate": 4.199631287858836e-05, "loss": 1.7537, "step": 6084 }, { "epoch": 0.16025809849881487, "grad_norm": 2.0986099243164062, "learning_rate": 4.199499604951277e-05, "loss": 1.8167, "step": 6085 }, { "epoch": 0.16028443508032658, 
"grad_norm": 3.0223231315612793, "learning_rate": 4.199367922043719e-05, "loss": 1.5036, "step": 6086 }, { "epoch": 0.1603107716618383, "grad_norm": 5.098803520202637, "learning_rate": 4.19923623913616e-05, "loss": 1.5529, "step": 6087 }, { "epoch": 0.16033710824335, "grad_norm": 3.4167065620422363, "learning_rate": 4.199104556228602e-05, "loss": 1.4869, "step": 6088 }, { "epoch": 0.16036344482486173, "grad_norm": 1.9871636629104614, "learning_rate": 4.1989728733210434e-05, "loss": 1.6781, "step": 6089 }, { "epoch": 0.16038978140637344, "grad_norm": 1.926769495010376, "learning_rate": 4.198841190413484e-05, "loss": 1.6861, "step": 6090 }, { "epoch": 0.16041611798788516, "grad_norm": 3.153432607650757, "learning_rate": 4.198709507505926e-05, "loss": 1.4504, "step": 6091 }, { "epoch": 0.1604424545693969, "grad_norm": 4.245177268981934, "learning_rate": 4.1985778245983674e-05, "loss": 1.6688, "step": 6092 }, { "epoch": 0.16046879115090862, "grad_norm": 6.108002185821533, "learning_rate": 4.198446141690809e-05, "loss": 2.126, "step": 6093 }, { "epoch": 0.16049512773242033, "grad_norm": 2.3100204467773438, "learning_rate": 4.19831445878325e-05, "loss": 1.6948, "step": 6094 }, { "epoch": 0.16052146431393205, "grad_norm": 2.5932717323303223, "learning_rate": 4.1981827758756914e-05, "loss": 1.6972, "step": 6095 }, { "epoch": 0.16054780089544377, "grad_norm": 2.8772497177124023, "learning_rate": 4.198051092968133e-05, "loss": 0.9477, "step": 6096 }, { "epoch": 0.16057413747695548, "grad_norm": 1.6577174663543701, "learning_rate": 4.1979194100605745e-05, "loss": 1.4815, "step": 6097 }, { "epoch": 0.16060047405846722, "grad_norm": 1.90794038772583, "learning_rate": 4.197787727153016e-05, "loss": 1.7716, "step": 6098 }, { "epoch": 0.16062681063997894, "grad_norm": 3.2309482097625732, "learning_rate": 4.197656044245457e-05, "loss": 0.4605, "step": 6099 }, { "epoch": 0.16065314722149066, "grad_norm": 2.1344552040100098, "learning_rate": 4.1975243613378985e-05, "loss": 1.7736, 
"step": 6100 }, { "epoch": 0.16067948380300237, "grad_norm": 2.5901072025299072, "learning_rate": 4.1973926784303394e-05, "loss": 1.629, "step": 6101 }, { "epoch": 0.1607058203845141, "grad_norm": 1.4747397899627686, "learning_rate": 4.1972609955227816e-05, "loss": 1.7209, "step": 6102 }, { "epoch": 0.1607321569660258, "grad_norm": 3.776444435119629, "learning_rate": 4.1971293126152225e-05, "loss": 1.5893, "step": 6103 }, { "epoch": 0.16075849354753752, "grad_norm": 2.1120872497558594, "learning_rate": 4.196997629707664e-05, "loss": 1.2256, "step": 6104 }, { "epoch": 0.16078483012904926, "grad_norm": 1.907181739807129, "learning_rate": 4.1968659468001056e-05, "loss": 1.0122, "step": 6105 }, { "epoch": 0.16081116671056098, "grad_norm": 2.6752939224243164, "learning_rate": 4.196734263892547e-05, "loss": 1.5295, "step": 6106 }, { "epoch": 0.1608375032920727, "grad_norm": 2.014364004135132, "learning_rate": 4.196602580984989e-05, "loss": 2.0183, "step": 6107 }, { "epoch": 0.1608638398735844, "grad_norm": 3.470395565032959, "learning_rate": 4.1964708980774296e-05, "loss": 1.6397, "step": 6108 }, { "epoch": 0.16089017645509612, "grad_norm": 1.9015464782714844, "learning_rate": 4.196339215169871e-05, "loss": 1.7849, "step": 6109 }, { "epoch": 0.16091651303660784, "grad_norm": 2.7722132205963135, "learning_rate": 4.196207532262312e-05, "loss": 2.2164, "step": 6110 }, { "epoch": 0.16094284961811955, "grad_norm": 3.117102861404419, "learning_rate": 4.196075849354754e-05, "loss": 0.886, "step": 6111 }, { "epoch": 0.1609691861996313, "grad_norm": 1.9750547409057617, "learning_rate": 4.195944166447195e-05, "loss": 2.0688, "step": 6112 }, { "epoch": 0.160995522781143, "grad_norm": 2.2968084812164307, "learning_rate": 4.195812483539637e-05, "loss": 1.6109, "step": 6113 }, { "epoch": 0.16102185936265473, "grad_norm": 3.515033483505249, "learning_rate": 4.195680800632078e-05, "loss": 1.7049, "step": 6114 }, { "epoch": 0.16104819594416644, "grad_norm": 2.277261972427368, 
"learning_rate": 4.195549117724519e-05, "loss": 2.1311, "step": 6115 }, { "epoch": 0.16107453252567816, "grad_norm": 1.7919727563858032, "learning_rate": 4.1954174348169614e-05, "loss": 1.7363, "step": 6116 }, { "epoch": 0.16110086910718988, "grad_norm": 3.9228217601776123, "learning_rate": 4.195285751909402e-05, "loss": 1.7039, "step": 6117 }, { "epoch": 0.16112720568870162, "grad_norm": 3.793929100036621, "learning_rate": 4.195154069001844e-05, "loss": 1.812, "step": 6118 }, { "epoch": 0.16115354227021333, "grad_norm": 3.7437984943389893, "learning_rate": 4.195022386094285e-05, "loss": 1.0529, "step": 6119 }, { "epoch": 0.16117987885172505, "grad_norm": 3.8403408527374268, "learning_rate": 4.194890703186727e-05, "loss": 1.6427, "step": 6120 }, { "epoch": 0.16120621543323677, "grad_norm": 3.307833194732666, "learning_rate": 4.194759020279168e-05, "loss": 1.6098, "step": 6121 }, { "epoch": 0.16123255201474848, "grad_norm": 2.962394952774048, "learning_rate": 4.1946273373716094e-05, "loss": 2.1496, "step": 6122 }, { "epoch": 0.1612588885962602, "grad_norm": 1.7113622426986694, "learning_rate": 4.194495654464051e-05, "loss": 1.3856, "step": 6123 }, { "epoch": 0.1612852251777719, "grad_norm": 5.497935771942139, "learning_rate": 4.194363971556492e-05, "loss": 1.1016, "step": 6124 }, { "epoch": 0.16131156175928366, "grad_norm": 5.836162567138672, "learning_rate": 4.194232288648934e-05, "loss": 1.5436, "step": 6125 }, { "epoch": 0.16133789834079537, "grad_norm": 2.716874122619629, "learning_rate": 4.194100605741375e-05, "loss": 2.0618, "step": 6126 }, { "epoch": 0.1613642349223071, "grad_norm": 3.397531270980835, "learning_rate": 4.1939689228338166e-05, "loss": 1.6688, "step": 6127 }, { "epoch": 0.1613905715038188, "grad_norm": 1.9136908054351807, "learning_rate": 4.1938372399262574e-05, "loss": 1.5963, "step": 6128 }, { "epoch": 0.16141690808533052, "grad_norm": 1.7691987752914429, "learning_rate": 4.193705557018699e-05, "loss": 0.4547, "step": 6129 }, { "epoch": 
0.16144324466684223, "grad_norm": 1.9893584251403809, "learning_rate": 4.1935738741111406e-05, "loss": 1.6099, "step": 6130 }, { "epoch": 0.16146958124835398, "grad_norm": 1.7568279504776, "learning_rate": 4.193442191203582e-05, "loss": 2.1532, "step": 6131 }, { "epoch": 0.1614959178298657, "grad_norm": 2.622365713119507, "learning_rate": 4.193310508296023e-05, "loss": 2.0063, "step": 6132 }, { "epoch": 0.1615222544113774, "grad_norm": 2.176316738128662, "learning_rate": 4.1931788253884646e-05, "loss": 1.6139, "step": 6133 }, { "epoch": 0.16154859099288912, "grad_norm": 1.9413889646530151, "learning_rate": 4.193047142480906e-05, "loss": 2.0126, "step": 6134 }, { "epoch": 0.16157492757440084, "grad_norm": 3.032728672027588, "learning_rate": 4.192915459573348e-05, "loss": 2.1629, "step": 6135 }, { "epoch": 0.16160126415591256, "grad_norm": 4.204956531524658, "learning_rate": 4.192783776665789e-05, "loss": 1.4779, "step": 6136 }, { "epoch": 0.16162760073742427, "grad_norm": 2.328699827194214, "learning_rate": 4.19265209375823e-05, "loss": 1.7515, "step": 6137 }, { "epoch": 0.16165393731893601, "grad_norm": 2.612292528152466, "learning_rate": 4.192520410850672e-05, "loss": 1.3, "step": 6138 }, { "epoch": 0.16168027390044773, "grad_norm": 2.522700309753418, "learning_rate": 4.192388727943113e-05, "loss": 2.0042, "step": 6139 }, { "epoch": 0.16170661048195945, "grad_norm": 2.2790493965148926, "learning_rate": 4.192257045035555e-05, "loss": 2.5038, "step": 6140 }, { "epoch": 0.16173294706347116, "grad_norm": 2.1907012462615967, "learning_rate": 4.192125362127996e-05, "loss": 1.3398, "step": 6141 }, { "epoch": 0.16175928364498288, "grad_norm": 1.9467926025390625, "learning_rate": 4.191993679220437e-05, "loss": 1.9665, "step": 6142 }, { "epoch": 0.1617856202264946, "grad_norm": 2.0438179969787598, "learning_rate": 4.191861996312879e-05, "loss": 1.4199, "step": 6143 }, { "epoch": 0.1618119568080063, "grad_norm": 1.715132474899292, "learning_rate": 4.1917303134053204e-05, 
"loss": 1.4549, "step": 6144 }, { "epoch": 0.16183829338951805, "grad_norm": 1.9151490926742554, "learning_rate": 4.191598630497762e-05, "loss": 2.2842, "step": 6145 }, { "epoch": 0.16186462997102977, "grad_norm": 1.6929700374603271, "learning_rate": 4.191466947590203e-05, "loss": 1.7138, "step": 6146 }, { "epoch": 0.16189096655254148, "grad_norm": 1.975162148475647, "learning_rate": 4.1913352646826444e-05, "loss": 1.6597, "step": 6147 }, { "epoch": 0.1619173031340532, "grad_norm": 4.588531017303467, "learning_rate": 4.191203581775085e-05, "loss": 1.9448, "step": 6148 }, { "epoch": 0.1619436397155649, "grad_norm": 2.539626121520996, "learning_rate": 4.1910718988675275e-05, "loss": 2.2849, "step": 6149 }, { "epoch": 0.16196997629707663, "grad_norm": 1.9681073427200317, "learning_rate": 4.1909402159599684e-05, "loss": 1.4366, "step": 6150 }, { "epoch": 0.16199631287858837, "grad_norm": 4.842199802398682, "learning_rate": 4.19080853305241e-05, "loss": 1.5735, "step": 6151 }, { "epoch": 0.1620226494601001, "grad_norm": 1.9533764123916626, "learning_rate": 4.1906768501448515e-05, "loss": 1.6544, "step": 6152 }, { "epoch": 0.1620489860416118, "grad_norm": 2.1794426441192627, "learning_rate": 4.190545167237293e-05, "loss": 0.6113, "step": 6153 }, { "epoch": 0.16207532262312352, "grad_norm": 3.911369562149048, "learning_rate": 4.1904134843297346e-05, "loss": 1.2914, "step": 6154 }, { "epoch": 0.16210165920463523, "grad_norm": 2.940295696258545, "learning_rate": 4.1902818014221755e-05, "loss": 1.7576, "step": 6155 }, { "epoch": 0.16212799578614695, "grad_norm": 1.6716759204864502, "learning_rate": 4.190150118514617e-05, "loss": 1.9524, "step": 6156 }, { "epoch": 0.16215433236765867, "grad_norm": 1.8164016008377075, "learning_rate": 4.190018435607058e-05, "loss": 1.902, "step": 6157 }, { "epoch": 0.1621806689491704, "grad_norm": 3.107964038848877, "learning_rate": 4.1898867526995e-05, "loss": 2.0422, "step": 6158 }, { "epoch": 0.16220700553068212, "grad_norm": 
2.4870853424072266, "learning_rate": 4.189755069791941e-05, "loss": 0.3071, "step": 6159 }, { "epoch": 0.16223334211219384, "grad_norm": 4.143788814544678, "learning_rate": 4.1896233868843826e-05, "loss": 0.534, "step": 6160 }, { "epoch": 0.16225967869370556, "grad_norm": 2.9861960411071777, "learning_rate": 4.189491703976824e-05, "loss": 1.9037, "step": 6161 }, { "epoch": 0.16228601527521727, "grad_norm": 4.6565327644348145, "learning_rate": 4.189360021069265e-05, "loss": 0.4978, "step": 6162 }, { "epoch": 0.162312351856729, "grad_norm": 2.0895419120788574, "learning_rate": 4.189228338161707e-05, "loss": 1.9403, "step": 6163 }, { "epoch": 0.1623386884382407, "grad_norm": 4.719089984893799, "learning_rate": 4.189096655254148e-05, "loss": 1.3816, "step": 6164 }, { "epoch": 0.16236502501975245, "grad_norm": 2.3824193477630615, "learning_rate": 4.18896497234659e-05, "loss": 0.949, "step": 6165 }, { "epoch": 0.16239136160126416, "grad_norm": 1.7241528034210205, "learning_rate": 4.1888332894390306e-05, "loss": 2.2248, "step": 6166 }, { "epoch": 0.16241769818277588, "grad_norm": 4.268033027648926, "learning_rate": 4.188701606531472e-05, "loss": 1.3197, "step": 6167 }, { "epoch": 0.1624440347642876, "grad_norm": 2.030693292617798, "learning_rate": 4.188569923623914e-05, "loss": 1.9805, "step": 6168 }, { "epoch": 0.1624703713457993, "grad_norm": 2.5701515674591064, "learning_rate": 4.188438240716355e-05, "loss": 1.8422, "step": 6169 }, { "epoch": 0.16249670792731102, "grad_norm": 2.068448543548584, "learning_rate": 4.188306557808797e-05, "loss": 2.0175, "step": 6170 }, { "epoch": 0.16252304450882277, "grad_norm": 3.963059663772583, "learning_rate": 4.188174874901238e-05, "loss": 1.6335, "step": 6171 }, { "epoch": 0.16254938109033448, "grad_norm": 1.912456750869751, "learning_rate": 4.18804319199368e-05, "loss": 1.8062, "step": 6172 }, { "epoch": 0.1625757176718462, "grad_norm": 2.8689017295837402, "learning_rate": 4.187911509086121e-05, "loss": 1.5806, "step": 6173 }, { 
"epoch": 0.1626020542533579, "grad_norm": 3.1292359828948975, "learning_rate": 4.1877798261785624e-05, "loss": 1.7629, "step": 6174 }, { "epoch": 0.16262839083486963, "grad_norm": 2.039140462875366, "learning_rate": 4.187648143271003e-05, "loss": 1.7487, "step": 6175 }, { "epoch": 0.16265472741638135, "grad_norm": 3.5004191398620605, "learning_rate": 4.187516460363445e-05, "loss": 1.0709, "step": 6176 }, { "epoch": 0.16268106399789306, "grad_norm": 2.3860397338867188, "learning_rate": 4.1873847774558864e-05, "loss": 1.8962, "step": 6177 }, { "epoch": 0.1627074005794048, "grad_norm": 2.4505045413970947, "learning_rate": 4.187253094548328e-05, "loss": 1.6988, "step": 6178 }, { "epoch": 0.16273373716091652, "grad_norm": 2.4118874073028564, "learning_rate": 4.1871214116407695e-05, "loss": 2.0337, "step": 6179 }, { "epoch": 0.16276007374242824, "grad_norm": 2.1553492546081543, "learning_rate": 4.1869897287332104e-05, "loss": 1.7035, "step": 6180 }, { "epoch": 0.16278641032393995, "grad_norm": 6.59297513961792, "learning_rate": 4.186858045825652e-05, "loss": 1.7664, "step": 6181 }, { "epoch": 0.16281274690545167, "grad_norm": 3.220435619354248, "learning_rate": 4.1867263629180935e-05, "loss": 1.2137, "step": 6182 }, { "epoch": 0.16283908348696338, "grad_norm": 3.285412073135376, "learning_rate": 4.186594680010535e-05, "loss": 2.2588, "step": 6183 }, { "epoch": 0.16286542006847513, "grad_norm": 2.169034481048584, "learning_rate": 4.186462997102976e-05, "loss": 1.836, "step": 6184 }, { "epoch": 0.16289175664998684, "grad_norm": 12.569430351257324, "learning_rate": 4.1863313141954175e-05, "loss": 1.3789, "step": 6185 }, { "epoch": 0.16291809323149856, "grad_norm": 4.0849080085754395, "learning_rate": 4.186199631287859e-05, "loss": 1.4266, "step": 6186 }, { "epoch": 0.16294442981301027, "grad_norm": 1.7760883569717407, "learning_rate": 4.186067948380301e-05, "loss": 1.8275, "step": 6187 }, { "epoch": 0.162970766394522, "grad_norm": 3.1768648624420166, "learning_rate": 
4.1859362654727415e-05, "loss": 1.6196, "step": 6188 }, { "epoch": 0.1629971029760337, "grad_norm": 1.7993084192276, "learning_rate": 4.185804582565183e-05, "loss": 1.5371, "step": 6189 }, { "epoch": 0.16302343955754542, "grad_norm": 2.5547139644622803, "learning_rate": 4.185672899657625e-05, "loss": 2.1122, "step": 6190 }, { "epoch": 0.16304977613905716, "grad_norm": 2.568890333175659, "learning_rate": 4.185541216750066e-05, "loss": 1.3982, "step": 6191 }, { "epoch": 0.16307611272056888, "grad_norm": 1.9501646757125854, "learning_rate": 4.185409533842508e-05, "loss": 2.7147, "step": 6192 }, { "epoch": 0.1631024493020806, "grad_norm": 2.309138536453247, "learning_rate": 4.185277850934949e-05, "loss": 1.1929, "step": 6193 }, { "epoch": 0.1631287858835923, "grad_norm": 3.3782432079315186, "learning_rate": 4.18514616802739e-05, "loss": 1.6183, "step": 6194 }, { "epoch": 0.16315512246510402, "grad_norm": 1.8713680505752563, "learning_rate": 4.185014485119831e-05, "loss": 1.2001, "step": 6195 }, { "epoch": 0.16318145904661574, "grad_norm": 9.779690742492676, "learning_rate": 4.1848828022122733e-05, "loss": 1.5312, "step": 6196 }, { "epoch": 0.16320779562812746, "grad_norm": 2.7976479530334473, "learning_rate": 4.184751119304714e-05, "loss": 1.6511, "step": 6197 }, { "epoch": 0.1632341322096392, "grad_norm": 2.537200927734375, "learning_rate": 4.184619436397156e-05, "loss": 1.8137, "step": 6198 }, { "epoch": 0.16326046879115091, "grad_norm": 1.6719151735305786, "learning_rate": 4.1844877534895973e-05, "loss": 1.2775, "step": 6199 }, { "epoch": 0.16328680537266263, "grad_norm": 1.9692857265472412, "learning_rate": 4.184356070582038e-05, "loss": 2.1277, "step": 6200 }, { "epoch": 0.16331314195417435, "grad_norm": 1.7134268283843994, "learning_rate": 4.1842243876744805e-05, "loss": 2.4531, "step": 6201 }, { "epoch": 0.16333947853568606, "grad_norm": 1.583983063697815, "learning_rate": 4.1840927047669214e-05, "loss": 1.805, "step": 6202 }, { "epoch": 0.16336581511719778, 
"grad_norm": 3.674165725708008, "learning_rate": 4.183961021859363e-05, "loss": 0.8692, "step": 6203 }, { "epoch": 0.16339215169870952, "grad_norm": 5.6180620193481445, "learning_rate": 4.183829338951804e-05, "loss": 2.1638, "step": 6204 }, { "epoch": 0.16341848828022124, "grad_norm": 2.1328909397125244, "learning_rate": 4.183697656044246e-05, "loss": 1.9604, "step": 6205 }, { "epoch": 0.16344482486173295, "grad_norm": 2.927257537841797, "learning_rate": 4.183565973136687e-05, "loss": 1.3435, "step": 6206 }, { "epoch": 0.16347116144324467, "grad_norm": 1.9658448696136475, "learning_rate": 4.1834342902291285e-05, "loss": 1.912, "step": 6207 }, { "epoch": 0.16349749802475638, "grad_norm": 2.342245101928711, "learning_rate": 4.18330260732157e-05, "loss": 1.5374, "step": 6208 }, { "epoch": 0.1635238346062681, "grad_norm": 7.295834541320801, "learning_rate": 4.183170924414011e-05, "loss": 1.7454, "step": 6209 }, { "epoch": 0.1635501711877798, "grad_norm": 3.025780200958252, "learning_rate": 4.183039241506453e-05, "loss": 0.7368, "step": 6210 }, { "epoch": 0.16357650776929156, "grad_norm": 1.6089125871658325, "learning_rate": 4.182907558598894e-05, "loss": 1.0085, "step": 6211 }, { "epoch": 0.16360284435080327, "grad_norm": 1.572649359703064, "learning_rate": 4.1827758756913356e-05, "loss": 1.7478, "step": 6212 }, { "epoch": 0.163629180932315, "grad_norm": 2.767467737197876, "learning_rate": 4.1826441927837765e-05, "loss": 1.7427, "step": 6213 }, { "epoch": 0.1636555175138267, "grad_norm": 1.5712791681289673, "learning_rate": 4.182512509876218e-05, "loss": 0.3449, "step": 6214 }, { "epoch": 0.16368185409533842, "grad_norm": 2.7520124912261963, "learning_rate": 4.1823808269686596e-05, "loss": 0.5863, "step": 6215 }, { "epoch": 0.16370819067685013, "grad_norm": 2.7478747367858887, "learning_rate": 4.182249144061101e-05, "loss": 1.4459, "step": 6216 }, { "epoch": 0.16373452725836188, "grad_norm": 4.27979850769043, "learning_rate": 4.182117461153543e-05, "loss": 3.5372, 
"step": 6217 }, { "epoch": 0.1637608638398736, "grad_norm": 1.6363543272018433, "learning_rate": 4.1819857782459836e-05, "loss": 2.1614, "step": 6218 }, { "epoch": 0.1637872004213853, "grad_norm": 2.104304552078247, "learning_rate": 4.181854095338426e-05, "loss": 1.7553, "step": 6219 }, { "epoch": 0.16381353700289703, "grad_norm": 1.8212826251983643, "learning_rate": 4.181722412430867e-05, "loss": 1.9183, "step": 6220 }, { "epoch": 0.16383987358440874, "grad_norm": 4.419536113739014, "learning_rate": 4.181590729523308e-05, "loss": 1.2457, "step": 6221 }, { "epoch": 0.16386621016592046, "grad_norm": 3.3643383979797363, "learning_rate": 4.181459046615749e-05, "loss": 2.2115, "step": 6222 }, { "epoch": 0.16389254674743217, "grad_norm": 3.484835624694824, "learning_rate": 4.181327363708191e-05, "loss": 2.1387, "step": 6223 }, { "epoch": 0.16391888332894392, "grad_norm": 1.7184138298034668, "learning_rate": 4.181195680800632e-05, "loss": 1.5123, "step": 6224 }, { "epoch": 0.16394521991045563, "grad_norm": 1.5833100080490112, "learning_rate": 4.181063997893074e-05, "loss": 1.7886, "step": 6225 }, { "epoch": 0.16397155649196735, "grad_norm": 2.703047752380371, "learning_rate": 4.1809323149855154e-05, "loss": 1.4228, "step": 6226 }, { "epoch": 0.16399789307347906, "grad_norm": 2.10036563873291, "learning_rate": 4.180800632077956e-05, "loss": 2.0066, "step": 6227 }, { "epoch": 0.16402422965499078, "grad_norm": 2.0519776344299316, "learning_rate": 4.180668949170398e-05, "loss": 2.0121, "step": 6228 }, { "epoch": 0.1640505662365025, "grad_norm": 2.7119178771972656, "learning_rate": 4.1805372662628394e-05, "loss": 1.6072, "step": 6229 }, { "epoch": 0.1640769028180142, "grad_norm": 2.0890233516693115, "learning_rate": 4.180405583355281e-05, "loss": 1.1275, "step": 6230 }, { "epoch": 0.16410323939952595, "grad_norm": 3.6675961017608643, "learning_rate": 4.180273900447722e-05, "loss": 1.4267, "step": 6231 }, { "epoch": 0.16412957598103767, "grad_norm": 1.7845052480697632, 
"learning_rate": 4.1801422175401634e-05, "loss": 2.0269, "step": 6232 }, { "epoch": 0.16415591256254938, "grad_norm": 3.4579153060913086, "learning_rate": 4.180010534632604e-05, "loss": 2.2869, "step": 6233 }, { "epoch": 0.1641822491440611, "grad_norm": 2.1843247413635254, "learning_rate": 4.1798788517250465e-05, "loss": 2.0065, "step": 6234 }, { "epoch": 0.16420858572557281, "grad_norm": 1.7261897325515747, "learning_rate": 4.1797471688174874e-05, "loss": 0.9492, "step": 6235 }, { "epoch": 0.16423492230708453, "grad_norm": 5.073116302490234, "learning_rate": 4.179615485909929e-05, "loss": 0.5242, "step": 6236 }, { "epoch": 0.16426125888859627, "grad_norm": 4.026960849761963, "learning_rate": 4.1794838030023705e-05, "loss": 0.598, "step": 6237 }, { "epoch": 0.164287595470108, "grad_norm": 2.2366933822631836, "learning_rate": 4.179352120094812e-05, "loss": 1.9456, "step": 6238 }, { "epoch": 0.1643139320516197, "grad_norm": 1.8947323560714722, "learning_rate": 4.1792204371872536e-05, "loss": 1.6567, "step": 6239 }, { "epoch": 0.16434026863313142, "grad_norm": 1.7033538818359375, "learning_rate": 4.1790887542796945e-05, "loss": 1.9291, "step": 6240 }, { "epoch": 0.16436660521464314, "grad_norm": 1.8342688083648682, "learning_rate": 4.178957071372136e-05, "loss": 1.8968, "step": 6241 }, { "epoch": 0.16439294179615485, "grad_norm": 2.2728381156921387, "learning_rate": 4.178825388464577e-05, "loss": 1.5177, "step": 6242 }, { "epoch": 0.16441927837766657, "grad_norm": 2.142536163330078, "learning_rate": 4.178693705557019e-05, "loss": 1.4954, "step": 6243 }, { "epoch": 0.1644456149591783, "grad_norm": 1.87212073802948, "learning_rate": 4.17856202264946e-05, "loss": 0.2432, "step": 6244 }, { "epoch": 0.16447195154069003, "grad_norm": 1.8364509344100952, "learning_rate": 4.1784303397419016e-05, "loss": 1.779, "step": 6245 }, { "epoch": 0.16449828812220174, "grad_norm": 2.9865927696228027, "learning_rate": 4.178298656834343e-05, "loss": 1.2728, "step": 6246 }, { "epoch": 
0.16452462470371346, "grad_norm": 1.7972159385681152, "learning_rate": 4.178166973926784e-05, "loss": 2.1082, "step": 6247 }, { "epoch": 0.16455096128522517, "grad_norm": 3.559039831161499, "learning_rate": 4.178035291019226e-05, "loss": 1.7195, "step": 6248 }, { "epoch": 0.1645772978667369, "grad_norm": 1.6659404039382935, "learning_rate": 4.177903608111667e-05, "loss": 1.3815, "step": 6249 }, { "epoch": 0.1646036344482486, "grad_norm": 2.0792975425720215, "learning_rate": 4.177771925204109e-05, "loss": 1.7125, "step": 6250 }, { "epoch": 0.16462997102976035, "grad_norm": 2.2669241428375244, "learning_rate": 4.1776402422965496e-05, "loss": 1.4363, "step": 6251 }, { "epoch": 0.16465630761127206, "grad_norm": 2.20127534866333, "learning_rate": 4.177508559388992e-05, "loss": 1.7931, "step": 6252 }, { "epoch": 0.16468264419278378, "grad_norm": 2.8577163219451904, "learning_rate": 4.177376876481433e-05, "loss": 1.613, "step": 6253 }, { "epoch": 0.1647089807742955, "grad_norm": 2.5715982913970947, "learning_rate": 4.177245193573874e-05, "loss": 1.5152, "step": 6254 }, { "epoch": 0.1647353173558072, "grad_norm": 3.654378890991211, "learning_rate": 4.177113510666316e-05, "loss": 1.9085, "step": 6255 }, { "epoch": 0.16476165393731892, "grad_norm": 1.9232184886932373, "learning_rate": 4.176981827758757e-05, "loss": 1.8253, "step": 6256 }, { "epoch": 0.16478799051883067, "grad_norm": 1.6215038299560547, "learning_rate": 4.176850144851199e-05, "loss": 2.5152, "step": 6257 }, { "epoch": 0.16481432710034238, "grad_norm": 2.3463377952575684, "learning_rate": 4.17671846194364e-05, "loss": 0.9331, "step": 6258 }, { "epoch": 0.1648406636818541, "grad_norm": 1.8195308446884155, "learning_rate": 4.1765867790360814e-05, "loss": 2.0912, "step": 6259 }, { "epoch": 0.16486700026336581, "grad_norm": 2.8469274044036865, "learning_rate": 4.176455096128522e-05, "loss": 1.3566, "step": 6260 }, { "epoch": 0.16489333684487753, "grad_norm": 2.154100179672241, "learning_rate": 
4.176323413220964e-05, "loss": 2.2292, "step": 6261 }, { "epoch": 0.16491967342638925, "grad_norm": 1.9302618503570557, "learning_rate": 4.1761917303134054e-05, "loss": 1.8194, "step": 6262 }, { "epoch": 0.16494601000790096, "grad_norm": 2.492159843444824, "learning_rate": 4.176060047405847e-05, "loss": 1.3806, "step": 6263 }, { "epoch": 0.1649723465894127, "grad_norm": 4.924745559692383, "learning_rate": 4.1759283644982886e-05, "loss": 1.7607, "step": 6264 }, { "epoch": 0.16499868317092442, "grad_norm": 1.7426139116287231, "learning_rate": 4.1757966815907295e-05, "loss": 1.577, "step": 6265 }, { "epoch": 0.16502501975243614, "grad_norm": 2.427046775817871, "learning_rate": 4.175664998683171e-05, "loss": 2.5492, "step": 6266 }, { "epoch": 0.16505135633394785, "grad_norm": 1.8286429643630981, "learning_rate": 4.1755333157756126e-05, "loss": 2.2028, "step": 6267 }, { "epoch": 0.16507769291545957, "grad_norm": 2.536653757095337, "learning_rate": 4.175401632868054e-05, "loss": 0.9838, "step": 6268 }, { "epoch": 0.16510402949697128, "grad_norm": 2.0288827419281006, "learning_rate": 4.175269949960495e-05, "loss": 2.4019, "step": 6269 }, { "epoch": 0.16513036607848303, "grad_norm": 1.8791232109069824, "learning_rate": 4.1751382670529366e-05, "loss": 2.0111, "step": 6270 }, { "epoch": 0.16515670265999474, "grad_norm": 2.0149269104003906, "learning_rate": 4.175006584145378e-05, "loss": 1.8392, "step": 6271 }, { "epoch": 0.16518303924150646, "grad_norm": 1.7191165685653687, "learning_rate": 4.17487490123782e-05, "loss": 2.2062, "step": 6272 }, { "epoch": 0.16520937582301817, "grad_norm": 1.7985368967056274, "learning_rate": 4.174743218330261e-05, "loss": 1.9149, "step": 6273 }, { "epoch": 0.1652357124045299, "grad_norm": 2.5112264156341553, "learning_rate": 4.174611535422702e-05, "loss": 1.9655, "step": 6274 }, { "epoch": 0.1652620489860416, "grad_norm": 5.3558878898620605, "learning_rate": 4.174479852515144e-05, "loss": 1.9623, "step": 6275 }, { "epoch": 
0.16528838556755332, "grad_norm": 3.8078792095184326, "learning_rate": 4.174348169607585e-05, "loss": 0.6285, "step": 6276 }, { "epoch": 0.16531472214906506, "grad_norm": 4.321228504180908, "learning_rate": 4.174216486700027e-05, "loss": 0.8884, "step": 6277 }, { "epoch": 0.16534105873057678, "grad_norm": 2.3482091426849365, "learning_rate": 4.174084803792468e-05, "loss": 2.2635, "step": 6278 }, { "epoch": 0.1653673953120885, "grad_norm": 2.1774532794952393, "learning_rate": 4.173953120884909e-05, "loss": 2.1163, "step": 6279 }, { "epoch": 0.1653937318936002, "grad_norm": 2.4843313694000244, "learning_rate": 4.173821437977351e-05, "loss": 1.7013, "step": 6280 }, { "epoch": 0.16542006847511193, "grad_norm": 1.821319341659546, "learning_rate": 4.1736897550697924e-05, "loss": 2.3493, "step": 6281 }, { "epoch": 0.16544640505662364, "grad_norm": 2.395052194595337, "learning_rate": 4.173558072162234e-05, "loss": 1.9074, "step": 6282 }, { "epoch": 0.16547274163813536, "grad_norm": 1.82941472530365, "learning_rate": 4.173426389254675e-05, "loss": 2.0776, "step": 6283 }, { "epoch": 0.1654990782196471, "grad_norm": 2.0111989974975586, "learning_rate": 4.1732947063471164e-05, "loss": 1.2991, "step": 6284 }, { "epoch": 0.16552541480115882, "grad_norm": 2.0000054836273193, "learning_rate": 4.173163023439558e-05, "loss": 1.8515, "step": 6285 }, { "epoch": 0.16555175138267053, "grad_norm": 2.083451986312866, "learning_rate": 4.1730313405319995e-05, "loss": 1.7277, "step": 6286 }, { "epoch": 0.16557808796418225, "grad_norm": 1.7810637950897217, "learning_rate": 4.1728996576244404e-05, "loss": 1.7744, "step": 6287 }, { "epoch": 0.16560442454569396, "grad_norm": 2.8143627643585205, "learning_rate": 4.172767974716882e-05, "loss": 0.6735, "step": 6288 }, { "epoch": 0.16563076112720568, "grad_norm": 3.857470750808716, "learning_rate": 4.172636291809323e-05, "loss": 1.1304, "step": 6289 }, { "epoch": 0.16565709770871742, "grad_norm": 2.2950751781463623, "learning_rate": 
4.172504608901765e-05, "loss": 0.4887, "step": 6290 }, { "epoch": 0.16568343429022914, "grad_norm": 6.222325325012207, "learning_rate": 4.172372925994206e-05, "loss": 1.5784, "step": 6291 }, { "epoch": 0.16570977087174085, "grad_norm": 2.973029375076294, "learning_rate": 4.1722412430866475e-05, "loss": 1.1543, "step": 6292 }, { "epoch": 0.16573610745325257, "grad_norm": 2.169588565826416, "learning_rate": 4.172109560179089e-05, "loss": 2.4109, "step": 6293 }, { "epoch": 0.16576244403476428, "grad_norm": 1.5478515625, "learning_rate": 4.17197787727153e-05, "loss": 1.7889, "step": 6294 }, { "epoch": 0.165788780616276, "grad_norm": 3.2532997131347656, "learning_rate": 4.171846194363972e-05, "loss": 0.5807, "step": 6295 }, { "epoch": 0.16581511719778771, "grad_norm": 4.012513637542725, "learning_rate": 4.171714511456413e-05, "loss": 0.7988, "step": 6296 }, { "epoch": 0.16584145377929946, "grad_norm": 2.912229537963867, "learning_rate": 4.1715828285488546e-05, "loss": 0.8654, "step": 6297 }, { "epoch": 0.16586779036081117, "grad_norm": 2.3448755741119385, "learning_rate": 4.1714511456412955e-05, "loss": 1.5029, "step": 6298 }, { "epoch": 0.1658941269423229, "grad_norm": 2.821657180786133, "learning_rate": 4.171319462733738e-05, "loss": 1.6197, "step": 6299 }, { "epoch": 0.1659204635238346, "grad_norm": 3.6999006271362305, "learning_rate": 4.1711877798261786e-05, "loss": 1.5829, "step": 6300 }, { "epoch": 0.16594680010534632, "grad_norm": 1.7603660821914673, "learning_rate": 4.17105609691862e-05, "loss": 1.8102, "step": 6301 }, { "epoch": 0.16597313668685804, "grad_norm": 1.6552979946136475, "learning_rate": 4.170924414011062e-05, "loss": 2.0413, "step": 6302 }, { "epoch": 0.16599947326836978, "grad_norm": 4.111746311187744, "learning_rate": 4.1707927311035026e-05, "loss": 1.0443, "step": 6303 }, { "epoch": 0.1660258098498815, "grad_norm": 6.605031967163086, "learning_rate": 4.170661048195945e-05, "loss": 0.6922, "step": 6304 }, { "epoch": 0.1660521464313932, 
"grad_norm": 2.572258472442627, "learning_rate": 4.170529365288386e-05, "loss": 1.4043, "step": 6305 }, { "epoch": 0.16607848301290493, "grad_norm": 2.5222651958465576, "learning_rate": 4.170397682380827e-05, "loss": 1.6599, "step": 6306 }, { "epoch": 0.16610481959441664, "grad_norm": 2.052239418029785, "learning_rate": 4.170265999473268e-05, "loss": 1.5124, "step": 6307 }, { "epoch": 0.16613115617592836, "grad_norm": 2.027329444885254, "learning_rate": 4.17013431656571e-05, "loss": 1.193, "step": 6308 }, { "epoch": 0.16615749275744007, "grad_norm": 3.2304461002349854, "learning_rate": 4.170002633658151e-05, "loss": 1.5395, "step": 6309 }, { "epoch": 0.16618382933895182, "grad_norm": 3.3135313987731934, "learning_rate": 4.169870950750593e-05, "loss": 1.7107, "step": 6310 }, { "epoch": 0.16621016592046353, "grad_norm": 1.9948111772537231, "learning_rate": 4.1697392678430344e-05, "loss": 1.3134, "step": 6311 }, { "epoch": 0.16623650250197525, "grad_norm": 2.704598903656006, "learning_rate": 4.169607584935475e-05, "loss": 0.681, "step": 6312 }, { "epoch": 0.16626283908348696, "grad_norm": 2.8368823528289795, "learning_rate": 4.169475902027917e-05, "loss": 1.1584, "step": 6313 }, { "epoch": 0.16628917566499868, "grad_norm": 2.1248085498809814, "learning_rate": 4.1693442191203584e-05, "loss": 1.5046, "step": 6314 }, { "epoch": 0.1663155122465104, "grad_norm": 3.1605188846588135, "learning_rate": 4.1692125362128e-05, "loss": 0.5833, "step": 6315 }, { "epoch": 0.1663418488280221, "grad_norm": 2.2313647270202637, "learning_rate": 4.169080853305241e-05, "loss": 1.135, "step": 6316 }, { "epoch": 0.16636818540953385, "grad_norm": 2.5924830436706543, "learning_rate": 4.1689491703976824e-05, "loss": 1.5192, "step": 6317 }, { "epoch": 0.16639452199104557, "grad_norm": 4.161013126373291, "learning_rate": 4.168817487490124e-05, "loss": 2.529, "step": 6318 }, { "epoch": 0.16642085857255728, "grad_norm": 2.3811326026916504, "learning_rate": 4.1686858045825655e-05, "loss": 2.3211, 
"step": 6319 }, { "epoch": 0.166447195154069, "grad_norm": 3.1531453132629395, "learning_rate": 4.168554121675007e-05, "loss": 1.3661, "step": 6320 }, { "epoch": 0.16647353173558072, "grad_norm": 2.6829094886779785, "learning_rate": 4.168422438767448e-05, "loss": 1.6922, "step": 6321 }, { "epoch": 0.16649986831709243, "grad_norm": 2.0825557708740234, "learning_rate": 4.1682907558598895e-05, "loss": 1.755, "step": 6322 }, { "epoch": 0.16652620489860417, "grad_norm": 1.9286105632781982, "learning_rate": 4.168159072952331e-05, "loss": 1.583, "step": 6323 }, { "epoch": 0.1665525414801159, "grad_norm": 3.6238157749176025, "learning_rate": 4.168027390044773e-05, "loss": 1.9119, "step": 6324 }, { "epoch": 0.1665788780616276, "grad_norm": 1.9939560890197754, "learning_rate": 4.1678957071372136e-05, "loss": 0.7373, "step": 6325 }, { "epoch": 0.16660521464313932, "grad_norm": 2.046525716781616, "learning_rate": 4.167764024229655e-05, "loss": 1.7242, "step": 6326 }, { "epoch": 0.16663155122465104, "grad_norm": 3.0568177700042725, "learning_rate": 4.167632341322097e-05, "loss": 2.2206, "step": 6327 }, { "epoch": 0.16665788780616275, "grad_norm": 3.57637095451355, "learning_rate": 4.167500658414538e-05, "loss": 1.6992, "step": 6328 }, { "epoch": 0.16668422438767447, "grad_norm": 1.8986932039260864, "learning_rate": 4.16736897550698e-05, "loss": 1.8675, "step": 6329 }, { "epoch": 0.1667105609691862, "grad_norm": 1.8276662826538086, "learning_rate": 4.167237292599421e-05, "loss": 1.5032, "step": 6330 }, { "epoch": 0.16673689755069793, "grad_norm": 2.2076597213745117, "learning_rate": 4.167105609691862e-05, "loss": 1.5994, "step": 6331 }, { "epoch": 0.16676323413220964, "grad_norm": 2.1224892139434814, "learning_rate": 4.166973926784304e-05, "loss": 2.0706, "step": 6332 }, { "epoch": 0.16678957071372136, "grad_norm": 3.413240671157837, "learning_rate": 4.1668422438767454e-05, "loss": 0.8709, "step": 6333 }, { "epoch": 0.16681590729523307, "grad_norm": 1.8400017023086548, 
"learning_rate": 4.166710560969186e-05, "loss": 1.4148, "step": 6334 }, { "epoch": 0.1668422438767448, "grad_norm": 1.775442361831665, "learning_rate": 4.166578878061628e-05, "loss": 1.7614, "step": 6335 }, { "epoch": 0.1668685804582565, "grad_norm": 1.7555932998657227, "learning_rate": 4.166447195154069e-05, "loss": 1.9382, "step": 6336 }, { "epoch": 0.16689491703976825, "grad_norm": 3.6088674068450928, "learning_rate": 4.166315512246511e-05, "loss": 1.4609, "step": 6337 }, { "epoch": 0.16692125362127996, "grad_norm": 2.4535605907440186, "learning_rate": 4.166183829338952e-05, "loss": 1.9765, "step": 6338 }, { "epoch": 0.16694759020279168, "grad_norm": 2.234781503677368, "learning_rate": 4.1660521464313934e-05, "loss": 2.0701, "step": 6339 }, { "epoch": 0.1669739267843034, "grad_norm": 2.3924405574798584, "learning_rate": 4.165920463523835e-05, "loss": 1.7917, "step": 6340 }, { "epoch": 0.1670002633658151, "grad_norm": 2.171189785003662, "learning_rate": 4.165788780616276e-05, "loss": 1.9543, "step": 6341 }, { "epoch": 0.16702659994732683, "grad_norm": 2.223950147628784, "learning_rate": 4.165657097708718e-05, "loss": 1.3898, "step": 6342 }, { "epoch": 0.16705293652883857, "grad_norm": 1.7114344835281372, "learning_rate": 4.165525414801159e-05, "loss": 1.0963, "step": 6343 }, { "epoch": 0.16707927311035028, "grad_norm": 2.054962635040283, "learning_rate": 4.1653937318936005e-05, "loss": 1.5417, "step": 6344 }, { "epoch": 0.167105609691862, "grad_norm": 1.5714259147644043, "learning_rate": 4.1652620489860414e-05, "loss": 1.8038, "step": 6345 }, { "epoch": 0.16713194627337372, "grad_norm": 3.3276352882385254, "learning_rate": 4.165130366078483e-05, "loss": 1.6062, "step": 6346 }, { "epoch": 0.16715828285488543, "grad_norm": 2.267390727996826, "learning_rate": 4.1649986831709245e-05, "loss": 1.6488, "step": 6347 }, { "epoch": 0.16718461943639715, "grad_norm": 3.921799898147583, "learning_rate": 4.164867000263366e-05, "loss": 1.8439, "step": 6348 }, { "epoch": 
0.16721095601790886, "grad_norm": 1.8757134675979614, "learning_rate": 4.1647353173558076e-05, "loss": 1.5935, "step": 6349 }, { "epoch": 0.1672372925994206, "grad_norm": 2.898759365081787, "learning_rate": 4.1646036344482485e-05, "loss": 1.9947, "step": 6350 }, { "epoch": 0.16726362918093232, "grad_norm": 1.6849979162216187, "learning_rate": 4.164471951540691e-05, "loss": 1.5262, "step": 6351 }, { "epoch": 0.16728996576244404, "grad_norm": 2.1206817626953125, "learning_rate": 4.1643402686331316e-05, "loss": 1.5047, "step": 6352 }, { "epoch": 0.16731630234395575, "grad_norm": 2.3731672763824463, "learning_rate": 4.164208585725573e-05, "loss": 0.4956, "step": 6353 }, { "epoch": 0.16734263892546747, "grad_norm": 2.3113131523132324, "learning_rate": 4.164076902818014e-05, "loss": 1.8525, "step": 6354 }, { "epoch": 0.16736897550697918, "grad_norm": 1.9033572673797607, "learning_rate": 4.1639452199104556e-05, "loss": 2.2655, "step": 6355 }, { "epoch": 0.16739531208849093, "grad_norm": 9.518451690673828, "learning_rate": 4.163813537002897e-05, "loss": 2.1358, "step": 6356 }, { "epoch": 0.16742164867000264, "grad_norm": 2.6317458152770996, "learning_rate": 4.163681854095339e-05, "loss": 1.5336, "step": 6357 }, { "epoch": 0.16744798525151436, "grad_norm": 2.8073017597198486, "learning_rate": 4.16355017118778e-05, "loss": 2.2666, "step": 6358 }, { "epoch": 0.16747432183302607, "grad_norm": 2.4643218517303467, "learning_rate": 4.163418488280221e-05, "loss": 1.7047, "step": 6359 }, { "epoch": 0.1675006584145378, "grad_norm": 4.039646625518799, "learning_rate": 4.163286805372663e-05, "loss": 0.8799, "step": 6360 }, { "epoch": 0.1675269949960495, "grad_norm": 5.677095413208008, "learning_rate": 4.163155122465104e-05, "loss": 2.553, "step": 6361 }, { "epoch": 0.16755333157756122, "grad_norm": 1.8579610586166382, "learning_rate": 4.163023439557546e-05, "loss": 1.3293, "step": 6362 }, { "epoch": 0.16757966815907296, "grad_norm": 3.0890026092529297, "learning_rate": 
4.162891756649987e-05, "loss": 1.5316, "step": 6363 }, { "epoch": 0.16760600474058468, "grad_norm": 2.2251076698303223, "learning_rate": 4.162760073742428e-05, "loss": 1.4928, "step": 6364 }, { "epoch": 0.1676323413220964, "grad_norm": 1.745855450630188, "learning_rate": 4.16262839083487e-05, "loss": 1.8408, "step": 6365 }, { "epoch": 0.1676586779036081, "grad_norm": 2.259474277496338, "learning_rate": 4.1624967079273114e-05, "loss": 1.9137, "step": 6366 }, { "epoch": 0.16768501448511983, "grad_norm": 2.1383891105651855, "learning_rate": 4.162365025019753e-05, "loss": 2.0164, "step": 6367 }, { "epoch": 0.16771135106663154, "grad_norm": 3.731020927429199, "learning_rate": 4.162233342112194e-05, "loss": 0.6502, "step": 6368 }, { "epoch": 0.16773768764814326, "grad_norm": 6.15726900100708, "learning_rate": 4.1621016592046354e-05, "loss": 1.616, "step": 6369 }, { "epoch": 0.167764024229655, "grad_norm": 1.8864214420318604, "learning_rate": 4.161969976297077e-05, "loss": 1.562, "step": 6370 }, { "epoch": 0.16779036081116672, "grad_norm": 1.9158926010131836, "learning_rate": 4.1618382933895185e-05, "loss": 1.7563, "step": 6371 }, { "epoch": 0.16781669739267843, "grad_norm": 2.8187687397003174, "learning_rate": 4.1617066104819594e-05, "loss": 1.344, "step": 6372 }, { "epoch": 0.16784303397419015, "grad_norm": 1.8098351955413818, "learning_rate": 4.161574927574401e-05, "loss": 1.4523, "step": 6373 }, { "epoch": 0.16786937055570186, "grad_norm": 2.0661325454711914, "learning_rate": 4.1614432446668425e-05, "loss": 2.0314, "step": 6374 }, { "epoch": 0.16789570713721358, "grad_norm": 4.071370601654053, "learning_rate": 4.161311561759284e-05, "loss": 1.4825, "step": 6375 }, { "epoch": 0.16792204371872532, "grad_norm": 4.503345012664795, "learning_rate": 4.1611798788517256e-05, "loss": 0.865, "step": 6376 }, { "epoch": 0.16794838030023704, "grad_norm": 2.2279651165008545, "learning_rate": 4.1610481959441665e-05, "loss": 1.8657, "step": 6377 }, { "epoch": 0.16797471688174875, 
"grad_norm": 5.581786632537842, "learning_rate": 4.160916513036608e-05, "loss": 1.9477, "step": 6378 }, { "epoch": 0.16800105346326047, "grad_norm": 2.2516820430755615, "learning_rate": 4.160784830129049e-05, "loss": 1.1118, "step": 6379 }, { "epoch": 0.16802739004477218, "grad_norm": 2.087665319442749, "learning_rate": 4.160653147221491e-05, "loss": 1.9211, "step": 6380 }, { "epoch": 0.1680537266262839, "grad_norm": 3.2751336097717285, "learning_rate": 4.160521464313932e-05, "loss": 2.434, "step": 6381 }, { "epoch": 0.16808006320779562, "grad_norm": 2.2820301055908203, "learning_rate": 4.1603897814063736e-05, "loss": 1.8353, "step": 6382 }, { "epoch": 0.16810639978930736, "grad_norm": 1.8434832096099854, "learning_rate": 4.160258098498815e-05, "loss": 2.0345, "step": 6383 }, { "epoch": 0.16813273637081907, "grad_norm": 1.7959675788879395, "learning_rate": 4.160126415591257e-05, "loss": 1.8559, "step": 6384 }, { "epoch": 0.1681590729523308, "grad_norm": 3.790647506713867, "learning_rate": 4.159994732683698e-05, "loss": 0.675, "step": 6385 }, { "epoch": 0.1681854095338425, "grad_norm": 3.7308340072631836, "learning_rate": 4.159863049776139e-05, "loss": 1.0194, "step": 6386 }, { "epoch": 0.16821174611535422, "grad_norm": 3.0776009559631348, "learning_rate": 4.159731366868581e-05, "loss": 1.2393, "step": 6387 }, { "epoch": 0.16823808269686594, "grad_norm": 2.9640960693359375, "learning_rate": 4.1595996839610217e-05, "loss": 1.4725, "step": 6388 }, { "epoch": 0.16826441927837768, "grad_norm": 3.3266842365264893, "learning_rate": 4.159468001053464e-05, "loss": 1.0055, "step": 6389 }, { "epoch": 0.1682907558598894, "grad_norm": 3.820871353149414, "learning_rate": 4.159336318145905e-05, "loss": 0.6642, "step": 6390 }, { "epoch": 0.1683170924414011, "grad_norm": 2.974294900894165, "learning_rate": 4.159204635238346e-05, "loss": 1.9128, "step": 6391 }, { "epoch": 0.16834342902291283, "grad_norm": 2.456472635269165, "learning_rate": 4.159072952330787e-05, "loss": 1.8701, 
"step": 6392 }, { "epoch": 0.16836976560442454, "grad_norm": 3.471158027648926, "learning_rate": 4.158941269423229e-05, "loss": 1.1492, "step": 6393 }, { "epoch": 0.16839610218593626, "grad_norm": 3.248304605484009, "learning_rate": 4.15880958651567e-05, "loss": 1.9342, "step": 6394 }, { "epoch": 0.16842243876744797, "grad_norm": 5.455419540405273, "learning_rate": 4.158677903608112e-05, "loss": 1.2831, "step": 6395 }, { "epoch": 0.16844877534895972, "grad_norm": 3.4281656742095947, "learning_rate": 4.1585462207005535e-05, "loss": 1.116, "step": 6396 }, { "epoch": 0.16847511193047143, "grad_norm": 2.4252750873565674, "learning_rate": 4.158414537792994e-05, "loss": 1.4518, "step": 6397 }, { "epoch": 0.16850144851198315, "grad_norm": 1.7467241287231445, "learning_rate": 4.1582828548854366e-05, "loss": 1.6271, "step": 6398 }, { "epoch": 0.16852778509349486, "grad_norm": 5.699279308319092, "learning_rate": 4.1581511719778775e-05, "loss": 0.9435, "step": 6399 }, { "epoch": 0.16855412167500658, "grad_norm": 3.9772253036499023, "learning_rate": 4.158019489070319e-05, "loss": 0.774, "step": 6400 }, { "epoch": 0.1685804582565183, "grad_norm": 2.4051127433776855, "learning_rate": 4.15788780616276e-05, "loss": 1.7705, "step": 6401 }, { "epoch": 0.16860679483803, "grad_norm": 1.9040956497192383, "learning_rate": 4.1577561232552015e-05, "loss": 1.6628, "step": 6402 }, { "epoch": 0.16863313141954175, "grad_norm": 1.9643168449401855, "learning_rate": 4.157624440347643e-05, "loss": 1.1949, "step": 6403 }, { "epoch": 0.16865946800105347, "grad_norm": 3.734225273132324, "learning_rate": 4.1574927574400846e-05, "loss": 1.8115, "step": 6404 }, { "epoch": 0.16868580458256519, "grad_norm": 1.7562239170074463, "learning_rate": 4.157361074532526e-05, "loss": 1.5727, "step": 6405 }, { "epoch": 0.1687121411640769, "grad_norm": 3.9833977222442627, "learning_rate": 4.157229391624967e-05, "loss": 2.0763, "step": 6406 }, { "epoch": 0.16873847774558862, "grad_norm": 1.9149258136749268, 
"learning_rate": 4.1570977087174086e-05, "loss": 2.1516, "step": 6407 }, { "epoch": 0.16876481432710033, "grad_norm": 4.471598148345947, "learning_rate": 4.15696602580985e-05, "loss": 1.8673, "step": 6408 }, { "epoch": 0.16879115090861208, "grad_norm": 4.501389980316162, "learning_rate": 4.156834342902292e-05, "loss": 1.4924, "step": 6409 }, { "epoch": 0.1688174874901238, "grad_norm": 2.703948736190796, "learning_rate": 4.1567026599947326e-05, "loss": 1.4937, "step": 6410 }, { "epoch": 0.1688438240716355, "grad_norm": 3.3694334030151367, "learning_rate": 4.156570977087174e-05, "loss": 0.8934, "step": 6411 }, { "epoch": 0.16887016065314722, "grad_norm": 2.8854258060455322, "learning_rate": 4.156439294179616e-05, "loss": 0.887, "step": 6412 }, { "epoch": 0.16889649723465894, "grad_norm": 2.223555326461792, "learning_rate": 4.156307611272057e-05, "loss": 2.0356, "step": 6413 }, { "epoch": 0.16892283381617065, "grad_norm": 2.5965170860290527, "learning_rate": 4.156175928364499e-05, "loss": 0.6074, "step": 6414 }, { "epoch": 0.16894917039768237, "grad_norm": 2.111039876937866, "learning_rate": 4.15604424545694e-05, "loss": 1.847, "step": 6415 }, { "epoch": 0.1689755069791941, "grad_norm": 10.21255874633789, "learning_rate": 4.155912562549381e-05, "loss": 1.2639, "step": 6416 }, { "epoch": 0.16900184356070583, "grad_norm": 2.9755373001098633, "learning_rate": 4.155780879641823e-05, "loss": 0.5679, "step": 6417 }, { "epoch": 0.16902818014221754, "grad_norm": 1.7586227655410767, "learning_rate": 4.1556491967342644e-05, "loss": 1.7276, "step": 6418 }, { "epoch": 0.16905451672372926, "grad_norm": 3.6247305870056152, "learning_rate": 4.155517513826705e-05, "loss": 1.3592, "step": 6419 }, { "epoch": 0.16908085330524097, "grad_norm": 2.6267502307891846, "learning_rate": 4.155385830919147e-05, "loss": 1.3719, "step": 6420 }, { "epoch": 0.1691071898867527, "grad_norm": 2.071226119995117, "learning_rate": 4.1552541480115884e-05, "loss": 1.5613, "step": 6421 }, { "epoch": 
0.1691335264682644, "grad_norm": 8.02920150756836, "learning_rate": 4.15512246510403e-05, "loss": 1.6523, "step": 6422 }, { "epoch": 0.16915986304977615, "grad_norm": 1.599113941192627, "learning_rate": 4.1549907821964715e-05, "loss": 2.5116, "step": 6423 }, { "epoch": 0.16918619963128786, "grad_norm": 1.7589728832244873, "learning_rate": 4.1548590992889124e-05, "loss": 0.9217, "step": 6424 }, { "epoch": 0.16921253621279958, "grad_norm": 2.3213589191436768, "learning_rate": 4.154727416381354e-05, "loss": 1.948, "step": 6425 }, { "epoch": 0.1692388727943113, "grad_norm": 1.698765754699707, "learning_rate": 4.154595733473795e-05, "loss": 1.4775, "step": 6426 }, { "epoch": 0.169265209375823, "grad_norm": 3.4839954376220703, "learning_rate": 4.154464050566237e-05, "loss": 1.6487, "step": 6427 }, { "epoch": 0.16929154595733473, "grad_norm": 3.0693438053131104, "learning_rate": 4.154332367658678e-05, "loss": 2.2047, "step": 6428 }, { "epoch": 0.16931788253884647, "grad_norm": 2.398834705352783, "learning_rate": 4.1542006847511195e-05, "loss": 2.2971, "step": 6429 }, { "epoch": 0.16934421912035819, "grad_norm": 3.881516456604004, "learning_rate": 4.154069001843561e-05, "loss": 1.8524, "step": 6430 }, { "epoch": 0.1693705557018699, "grad_norm": 2.030421257019043, "learning_rate": 4.1539373189360026e-05, "loss": 1.8384, "step": 6431 }, { "epoch": 0.16939689228338162, "grad_norm": 2.8169984817504883, "learning_rate": 4.153805636028444e-05, "loss": 1.3217, "step": 6432 }, { "epoch": 0.16942322886489333, "grad_norm": 3.0963947772979736, "learning_rate": 4.153673953120885e-05, "loss": 1.5044, "step": 6433 }, { "epoch": 0.16944956544640505, "grad_norm": 2.2183096408843994, "learning_rate": 4.1535422702133266e-05, "loss": 2.0749, "step": 6434 }, { "epoch": 0.16947590202791676, "grad_norm": 2.6742281913757324, "learning_rate": 4.1534105873057675e-05, "loss": 2.1226, "step": 6435 }, { "epoch": 0.1695022386094285, "grad_norm": 2.041149139404297, "learning_rate": 
4.15327890439821e-05, "loss": 1.8089, "step": 6436 }, { "epoch": 0.16952857519094022, "grad_norm": 1.8499716520309448, "learning_rate": 4.1531472214906506e-05, "loss": 1.1947, "step": 6437 }, { "epoch": 0.16955491177245194, "grad_norm": 2.663862466812134, "learning_rate": 4.153015538583092e-05, "loss": 2.1878, "step": 6438 }, { "epoch": 0.16958124835396365, "grad_norm": 2.132267475128174, "learning_rate": 4.152883855675533e-05, "loss": 2.2935, "step": 6439 }, { "epoch": 0.16960758493547537, "grad_norm": 1.706586241722107, "learning_rate": 4.1527521727679746e-05, "loss": 0.457, "step": 6440 }, { "epoch": 0.16963392151698709, "grad_norm": 2.2798023223876953, "learning_rate": 4.152620489860416e-05, "loss": 1.5746, "step": 6441 }, { "epoch": 0.16966025809849883, "grad_norm": 2.7831649780273438, "learning_rate": 4.152488806952858e-05, "loss": 2.3509, "step": 6442 }, { "epoch": 0.16968659468001054, "grad_norm": 3.353847026824951, "learning_rate": 4.152357124045299e-05, "loss": 1.2742, "step": 6443 }, { "epoch": 0.16971293126152226, "grad_norm": 2.7306783199310303, "learning_rate": 4.15222544113774e-05, "loss": 2.2132, "step": 6444 }, { "epoch": 0.16973926784303398, "grad_norm": 3.453615427017212, "learning_rate": 4.152093758230182e-05, "loss": 2.1731, "step": 6445 }, { "epoch": 0.1697656044245457, "grad_norm": 2.8907649517059326, "learning_rate": 4.151962075322623e-05, "loss": 1.985, "step": 6446 }, { "epoch": 0.1697919410060574, "grad_norm": 3.216059923171997, "learning_rate": 4.151830392415065e-05, "loss": 1.7245, "step": 6447 }, { "epoch": 0.16981827758756912, "grad_norm": 2.4432458877563477, "learning_rate": 4.151698709507506e-05, "loss": 2.4146, "step": 6448 }, { "epoch": 0.16984461416908087, "grad_norm": 2.241258382797241, "learning_rate": 4.151567026599947e-05, "loss": 1.1708, "step": 6449 }, { "epoch": 0.16987095075059258, "grad_norm": 3.880747079849243, "learning_rate": 4.151435343692389e-05, "loss": 2.7322, "step": 6450 }, { "epoch": 0.1698972873321043, 
"grad_norm": 5.091732025146484, "learning_rate": 4.1513036607848304e-05, "loss": 1.1156, "step": 6451 }, { "epoch": 0.169923623913616, "grad_norm": 2.030904769897461, "learning_rate": 4.151171977877272e-05, "loss": 2.0242, "step": 6452 }, { "epoch": 0.16994996049512773, "grad_norm": 3.2502996921539307, "learning_rate": 4.151040294969713e-05, "loss": 1.492, "step": 6453 }, { "epoch": 0.16997629707663944, "grad_norm": 1.8628132343292236, "learning_rate": 4.1509086120621544e-05, "loss": 1.8729, "step": 6454 }, { "epoch": 0.17000263365815116, "grad_norm": 5.081038475036621, "learning_rate": 4.150776929154596e-05, "loss": 1.171, "step": 6455 }, { "epoch": 0.1700289702396629, "grad_norm": 2.8787131309509277, "learning_rate": 4.1506452462470376e-05, "loss": 1.8752, "step": 6456 }, { "epoch": 0.17005530682117462, "grad_norm": 4.341348171234131, "learning_rate": 4.1505135633394784e-05, "loss": 0.863, "step": 6457 }, { "epoch": 0.17008164340268633, "grad_norm": 2.048917293548584, "learning_rate": 4.15038188043192e-05, "loss": 1.969, "step": 6458 }, { "epoch": 0.17010797998419805, "grad_norm": 5.052336692810059, "learning_rate": 4.1502501975243616e-05, "loss": 0.8067, "step": 6459 }, { "epoch": 0.17013431656570976, "grad_norm": 1.709985375404358, "learning_rate": 4.150118514616803e-05, "loss": 1.4318, "step": 6460 }, { "epoch": 0.17016065314722148, "grad_norm": 3.442023992538452, "learning_rate": 4.149986831709245e-05, "loss": 1.7595, "step": 6461 }, { "epoch": 0.17018698972873322, "grad_norm": 2.279386520385742, "learning_rate": 4.1498551488016856e-05, "loss": 1.5026, "step": 6462 }, { "epoch": 0.17021332631024494, "grad_norm": 3.368828058242798, "learning_rate": 4.149723465894127e-05, "loss": 1.5382, "step": 6463 }, { "epoch": 0.17023966289175665, "grad_norm": 2.1874027252197266, "learning_rate": 4.149591782986569e-05, "loss": 1.3267, "step": 6464 }, { "epoch": 0.17026599947326837, "grad_norm": 2.9538815021514893, "learning_rate": 4.14946010007901e-05, "loss": 1.3876, 
"step": 6465 }, { "epoch": 0.17029233605478009, "grad_norm": 2.819532632827759, "learning_rate": 4.149328417171451e-05, "loss": 1.0644, "step": 6466 }, { "epoch": 0.1703186726362918, "grad_norm": 1.8892769813537598, "learning_rate": 4.149196734263893e-05, "loss": 1.9414, "step": 6467 }, { "epoch": 0.17034500921780352, "grad_norm": 2.458207607269287, "learning_rate": 4.149065051356334e-05, "loss": 1.2594, "step": 6468 }, { "epoch": 0.17037134579931526, "grad_norm": 1.8219809532165527, "learning_rate": 4.148933368448776e-05, "loss": 1.4863, "step": 6469 }, { "epoch": 0.17039768238082698, "grad_norm": 2.603959083557129, "learning_rate": 4.1488016855412174e-05, "loss": 2.0385, "step": 6470 }, { "epoch": 0.1704240189623387, "grad_norm": 4.030280590057373, "learning_rate": 4.148670002633658e-05, "loss": 0.72, "step": 6471 }, { "epoch": 0.1704503555438504, "grad_norm": 4.136963844299316, "learning_rate": 4.1485383197261e-05, "loss": 1.5288, "step": 6472 }, { "epoch": 0.17047669212536212, "grad_norm": 4.837243556976318, "learning_rate": 4.148406636818541e-05, "loss": 1.5353, "step": 6473 }, { "epoch": 0.17050302870687384, "grad_norm": 2.52107310295105, "learning_rate": 4.148274953910983e-05, "loss": 0.4875, "step": 6474 }, { "epoch": 0.17052936528838555, "grad_norm": 1.8930702209472656, "learning_rate": 4.148143271003424e-05, "loss": 1.8783, "step": 6475 }, { "epoch": 0.1705557018698973, "grad_norm": 1.9251892566680908, "learning_rate": 4.1480115880958654e-05, "loss": 2.0516, "step": 6476 }, { "epoch": 0.170582038451409, "grad_norm": 3.5314857959747314, "learning_rate": 4.147879905188307e-05, "loss": 1.2225, "step": 6477 }, { "epoch": 0.17060837503292073, "grad_norm": 2.236572027206421, "learning_rate": 4.147748222280748e-05, "loss": 2.0008, "step": 6478 }, { "epoch": 0.17063471161443244, "grad_norm": 2.191998243331909, "learning_rate": 4.14761653937319e-05, "loss": 1.4144, "step": 6479 }, { "epoch": 0.17066104819594416, "grad_norm": 2.1222879886627197, "learning_rate": 
4.147484856465631e-05, "loss": 1.988, "step": 6480 }, { "epoch": 0.17068738477745588, "grad_norm": 2.2267048358917236, "learning_rate": 4.1473531735580725e-05, "loss": 1.4148, "step": 6481 }, { "epoch": 0.17071372135896762, "grad_norm": 1.9699732065200806, "learning_rate": 4.1472214906505134e-05, "loss": 1.7027, "step": 6482 }, { "epoch": 0.17074005794047933, "grad_norm": 2.0228424072265625, "learning_rate": 4.1470898077429556e-05, "loss": 2.1025, "step": 6483 }, { "epoch": 0.17076639452199105, "grad_norm": 2.1892802715301514, "learning_rate": 4.1469581248353965e-05, "loss": 1.9244, "step": 6484 }, { "epoch": 0.17079273110350277, "grad_norm": 2.0006086826324463, "learning_rate": 4.146826441927838e-05, "loss": 1.5034, "step": 6485 }, { "epoch": 0.17081906768501448, "grad_norm": 1.6685675382614136, "learning_rate": 4.146694759020279e-05, "loss": 2.0658, "step": 6486 }, { "epoch": 0.1708454042665262, "grad_norm": 1.7031437158584595, "learning_rate": 4.1465630761127205e-05, "loss": 1.7038, "step": 6487 }, { "epoch": 0.1708717408480379, "grad_norm": 2.0737340450286865, "learning_rate": 4.146431393205163e-05, "loss": 0.7668, "step": 6488 }, { "epoch": 0.17089807742954966, "grad_norm": 3.201235294342041, "learning_rate": 4.1462997102976036e-05, "loss": 0.6086, "step": 6489 }, { "epoch": 0.17092441401106137, "grad_norm": 2.792245864868164, "learning_rate": 4.146168027390045e-05, "loss": 1.6417, "step": 6490 }, { "epoch": 0.1709507505925731, "grad_norm": 1.8106646537780762, "learning_rate": 4.146036344482486e-05, "loss": 1.9466, "step": 6491 }, { "epoch": 0.1709770871740848, "grad_norm": 2.1337807178497314, "learning_rate": 4.1459046615749276e-05, "loss": 1.3608, "step": 6492 }, { "epoch": 0.17100342375559652, "grad_norm": 1.7259540557861328, "learning_rate": 4.145772978667369e-05, "loss": 1.649, "step": 6493 }, { "epoch": 0.17102976033710823, "grad_norm": 2.1733360290527344, "learning_rate": 4.145641295759811e-05, "loss": 1.8353, "step": 6494 }, { "epoch": 
0.17105609691861998, "grad_norm": 1.8094542026519775, "learning_rate": 4.1455096128522516e-05, "loss": 1.7019, "step": 6495 }, { "epoch": 0.1710824335001317, "grad_norm": 2.2861244678497314, "learning_rate": 4.145377929944693e-05, "loss": 2.0309, "step": 6496 }, { "epoch": 0.1711087700816434, "grad_norm": 4.071788787841797, "learning_rate": 4.145246247037135e-05, "loss": 1.4797, "step": 6497 }, { "epoch": 0.17113510666315512, "grad_norm": 3.26546311378479, "learning_rate": 4.145114564129576e-05, "loss": 1.1261, "step": 6498 }, { "epoch": 0.17116144324466684, "grad_norm": 8.301631927490234, "learning_rate": 4.144982881222018e-05, "loss": 1.6048, "step": 6499 }, { "epoch": 0.17118777982617855, "grad_norm": 1.9324514865875244, "learning_rate": 4.144851198314459e-05, "loss": 0.3971, "step": 6500 }, { "epoch": 0.17121411640769027, "grad_norm": 3.5214147567749023, "learning_rate": 4.1447195154069e-05, "loss": 1.6008, "step": 6501 }, { "epoch": 0.171240452989202, "grad_norm": 2.0774364471435547, "learning_rate": 4.144587832499342e-05, "loss": 1.9798, "step": 6502 }, { "epoch": 0.17126678957071373, "grad_norm": 1.9394217729568481, "learning_rate": 4.1444561495917834e-05, "loss": 1.6655, "step": 6503 }, { "epoch": 0.17129312615222544, "grad_norm": 3.2418298721313477, "learning_rate": 4.144324466684224e-05, "loss": 1.675, "step": 6504 }, { "epoch": 0.17131946273373716, "grad_norm": 1.9306080341339111, "learning_rate": 4.144192783776666e-05, "loss": 1.8287, "step": 6505 }, { "epoch": 0.17134579931524888, "grad_norm": 1.3904544115066528, "learning_rate": 4.1440611008691074e-05, "loss": 2.1646, "step": 6506 }, { "epoch": 0.1713721358967606, "grad_norm": 1.7397165298461914, "learning_rate": 4.143929417961549e-05, "loss": 1.3778, "step": 6507 }, { "epoch": 0.1713984724782723, "grad_norm": 1.8746224641799927, "learning_rate": 4.1437977350539905e-05, "loss": 0.8404, "step": 6508 }, { "epoch": 0.17142480905978405, "grad_norm": 6.493264675140381, "learning_rate": 
4.1436660521464314e-05, "loss": 1.4011, "step": 6509 }, { "epoch": 0.17145114564129577, "grad_norm": 2.357227087020874, "learning_rate": 4.143534369238873e-05, "loss": 1.1308, "step": 6510 }, { "epoch": 0.17147748222280748, "grad_norm": 1.5707148313522339, "learning_rate": 4.143402686331314e-05, "loss": 1.1768, "step": 6511 }, { "epoch": 0.1715038188043192, "grad_norm": 6.075654029846191, "learning_rate": 4.143271003423756e-05, "loss": 1.5775, "step": 6512 }, { "epoch": 0.1715301553858309, "grad_norm": 3.0365631580352783, "learning_rate": 4.143139320516197e-05, "loss": 1.7591, "step": 6513 }, { "epoch": 0.17155649196734263, "grad_norm": 2.6698756217956543, "learning_rate": 4.1430076376086385e-05, "loss": 1.5061, "step": 6514 }, { "epoch": 0.17158282854885437, "grad_norm": 2.3414206504821777, "learning_rate": 4.14287595470108e-05, "loss": 1.892, "step": 6515 }, { "epoch": 0.1716091651303661, "grad_norm": 3.2102322578430176, "learning_rate": 4.1427442717935217e-05, "loss": 0.8145, "step": 6516 }, { "epoch": 0.1716355017118778, "grad_norm": 4.5180864334106445, "learning_rate": 4.142612588885963e-05, "loss": 1.5787, "step": 6517 }, { "epoch": 0.17166183829338952, "grad_norm": 2.3173553943634033, "learning_rate": 4.142480905978404e-05, "loss": 1.7247, "step": 6518 }, { "epoch": 0.17168817487490123, "grad_norm": 5.040765285491943, "learning_rate": 4.1423492230708457e-05, "loss": 0.9599, "step": 6519 }, { "epoch": 0.17171451145641295, "grad_norm": 2.1651523113250732, "learning_rate": 4.1422175401632865e-05, "loss": 2.3724, "step": 6520 }, { "epoch": 0.17174084803792466, "grad_norm": 2.7365846633911133, "learning_rate": 4.142085857255729e-05, "loss": 1.509, "step": 6521 }, { "epoch": 0.1717671846194364, "grad_norm": 1.9443204402923584, "learning_rate": 4.1419541743481697e-05, "loss": 2.3737, "step": 6522 }, { "epoch": 0.17179352120094812, "grad_norm": 1.792307734489441, "learning_rate": 4.141822491440611e-05, "loss": 1.9221, "step": 6523 }, { "epoch": 0.17181985778245984, 
"grad_norm": 3.0441348552703857, "learning_rate": 4.141690808533053e-05, "loss": 1.3485, "step": 6524 }, { "epoch": 0.17184619436397156, "grad_norm": 1.9249099493026733, "learning_rate": 4.1415591256254937e-05, "loss": 0.8577, "step": 6525 }, { "epoch": 0.17187253094548327, "grad_norm": 2.668057680130005, "learning_rate": 4.141427442717936e-05, "loss": 0.6554, "step": 6526 }, { "epoch": 0.171898867526995, "grad_norm": 1.8003125190734863, "learning_rate": 4.141295759810377e-05, "loss": 1.5838, "step": 6527 }, { "epoch": 0.17192520410850673, "grad_norm": 2.3780336380004883, "learning_rate": 4.141164076902818e-05, "loss": 1.9685, "step": 6528 }, { "epoch": 0.17195154069001845, "grad_norm": 2.542705774307251, "learning_rate": 4.141032393995259e-05, "loss": 2.4901, "step": 6529 }, { "epoch": 0.17197787727153016, "grad_norm": 2.2208974361419678, "learning_rate": 4.1409007110877015e-05, "loss": 2.031, "step": 6530 }, { "epoch": 0.17200421385304188, "grad_norm": 3.307168483734131, "learning_rate": 4.140769028180142e-05, "loss": 1.452, "step": 6531 }, { "epoch": 0.1720305504345536, "grad_norm": 5.970710277557373, "learning_rate": 4.140637345272584e-05, "loss": 1.1322, "step": 6532 }, { "epoch": 0.1720568870160653, "grad_norm": 1.7820929288864136, "learning_rate": 4.1405056623650255e-05, "loss": 2.0271, "step": 6533 }, { "epoch": 0.17208322359757702, "grad_norm": 2.1190316677093506, "learning_rate": 4.1403739794574663e-05, "loss": 2.0157, "step": 6534 }, { "epoch": 0.17210956017908877, "grad_norm": 2.2790586948394775, "learning_rate": 4.1402422965499086e-05, "loss": 1.4108, "step": 6535 }, { "epoch": 0.17213589676060048, "grad_norm": 2.4451963901519775, "learning_rate": 4.1401106136423495e-05, "loss": 1.9138, "step": 6536 }, { "epoch": 0.1721622333421122, "grad_norm": 2.708577871322632, "learning_rate": 4.139978930734791e-05, "loss": 0.4258, "step": 6537 }, { "epoch": 0.1721885699236239, "grad_norm": 3.1696577072143555, "learning_rate": 4.139847247827232e-05, "loss": 1.4663, 
"step": 6538 }, { "epoch": 0.17221490650513563, "grad_norm": 1.8144752979278564, "learning_rate": 4.1397155649196735e-05, "loss": 1.7598, "step": 6539 }, { "epoch": 0.17224124308664734, "grad_norm": 2.4526026248931885, "learning_rate": 4.139583882012115e-05, "loss": 1.702, "step": 6540 }, { "epoch": 0.17226757966815906, "grad_norm": 1.9719762802124023, "learning_rate": 4.1394521991045566e-05, "loss": 1.6701, "step": 6541 }, { "epoch": 0.1722939162496708, "grad_norm": 3.4558773040771484, "learning_rate": 4.1393205161969975e-05, "loss": 1.3219, "step": 6542 }, { "epoch": 0.17232025283118252, "grad_norm": 3.586822032928467, "learning_rate": 4.139188833289439e-05, "loss": 1.8346, "step": 6543 }, { "epoch": 0.17234658941269423, "grad_norm": 2.3900208473205566, "learning_rate": 4.1390571503818806e-05, "loss": 1.7368, "step": 6544 }, { "epoch": 0.17237292599420595, "grad_norm": 3.041321277618408, "learning_rate": 4.138925467474322e-05, "loss": 1.2313, "step": 6545 }, { "epoch": 0.17239926257571767, "grad_norm": 1.6797538995742798, "learning_rate": 4.138793784566764e-05, "loss": 1.6437, "step": 6546 }, { "epoch": 0.17242559915722938, "grad_norm": 1.9204726219177246, "learning_rate": 4.1386621016592046e-05, "loss": 1.6408, "step": 6547 }, { "epoch": 0.17245193573874112, "grad_norm": 4.22277307510376, "learning_rate": 4.138530418751646e-05, "loss": 1.3494, "step": 6548 }, { "epoch": 0.17247827232025284, "grad_norm": 2.0165631771087646, "learning_rate": 4.138398735844088e-05, "loss": 1.788, "step": 6549 }, { "epoch": 0.17250460890176456, "grad_norm": 2.410510778427124, "learning_rate": 4.138267052936529e-05, "loss": 2.1595, "step": 6550 }, { "epoch": 0.17253094548327627, "grad_norm": 4.09640645980835, "learning_rate": 4.13813537002897e-05, "loss": 1.2022, "step": 6551 }, { "epoch": 0.172557282064788, "grad_norm": 1.8408640623092651, "learning_rate": 4.138003687121412e-05, "loss": 2.0705, "step": 6552 }, { "epoch": 0.1725836186462997, "grad_norm": 2.3870368003845215, 
"learning_rate": 4.137872004213853e-05, "loss": 1.8245, "step": 6553 }, { "epoch": 0.17260995522781142, "grad_norm": 2.1747100353240967, "learning_rate": 4.137740321306295e-05, "loss": 2.0954, "step": 6554 }, { "epoch": 0.17263629180932316, "grad_norm": 1.8999983072280884, "learning_rate": 4.1376086383987364e-05, "loss": 2.2851, "step": 6555 }, { "epoch": 0.17266262839083488, "grad_norm": 2.8208136558532715, "learning_rate": 4.137476955491177e-05, "loss": 1.5715, "step": 6556 }, { "epoch": 0.1726889649723466, "grad_norm": 3.2907090187072754, "learning_rate": 4.137345272583619e-05, "loss": 1.801, "step": 6557 }, { "epoch": 0.1727153015538583, "grad_norm": 2.177830696105957, "learning_rate": 4.13721358967606e-05, "loss": 2.0442, "step": 6558 }, { "epoch": 0.17274163813537002, "grad_norm": 4.38843297958374, "learning_rate": 4.137081906768502e-05, "loss": 2.3097, "step": 6559 }, { "epoch": 0.17276797471688174, "grad_norm": 3.4381134510040283, "learning_rate": 4.136950223860943e-05, "loss": 2.0296, "step": 6560 }, { "epoch": 0.17279431129839345, "grad_norm": 2.064281940460205, "learning_rate": 4.1368185409533844e-05, "loss": 1.7947, "step": 6561 }, { "epoch": 0.1728206478799052, "grad_norm": 1.9783134460449219, "learning_rate": 4.136686858045826e-05, "loss": 1.8043, "step": 6562 }, { "epoch": 0.1728469844614169, "grad_norm": 2.5466768741607666, "learning_rate": 4.1365551751382675e-05, "loss": 1.9159, "step": 6563 }, { "epoch": 0.17287332104292863, "grad_norm": 2.6055939197540283, "learning_rate": 4.136423492230709e-05, "loss": 2.2595, "step": 6564 }, { "epoch": 0.17289965762444034, "grad_norm": 1.7846686840057373, "learning_rate": 4.13629180932315e-05, "loss": 1.3956, "step": 6565 }, { "epoch": 0.17292599420595206, "grad_norm": 2.5492637157440186, "learning_rate": 4.1361601264155915e-05, "loss": 1.8095, "step": 6566 }, { "epoch": 0.17295233078746378, "grad_norm": 2.6915223598480225, "learning_rate": 4.1360284435080324e-05, "loss": 1.8663, "step": 6567 }, { "epoch": 
0.17297866736897552, "grad_norm": 5.118632793426514, "learning_rate": 4.1358967606004746e-05, "loss": 1.8523, "step": 6568 }, { "epoch": 0.17300500395048724, "grad_norm": 1.6934292316436768, "learning_rate": 4.1357650776929155e-05, "loss": 1.4678, "step": 6569 }, { "epoch": 0.17303134053199895, "grad_norm": 4.224479675292969, "learning_rate": 4.135633394785357e-05, "loss": 2.2489, "step": 6570 }, { "epoch": 0.17305767711351067, "grad_norm": 3.1312708854675293, "learning_rate": 4.1355017118777986e-05, "loss": 1.3973, "step": 6571 }, { "epoch": 0.17308401369502238, "grad_norm": 2.5284790992736816, "learning_rate": 4.1353700289702395e-05, "loss": 1.6139, "step": 6572 }, { "epoch": 0.1731103502765341, "grad_norm": 2.208031177520752, "learning_rate": 4.135238346062682e-05, "loss": 1.7548, "step": 6573 }, { "epoch": 0.1731366868580458, "grad_norm": 1.7932910919189453, "learning_rate": 4.1351066631551226e-05, "loss": 1.7549, "step": 6574 }, { "epoch": 0.17316302343955756, "grad_norm": 4.175819396972656, "learning_rate": 4.134974980247564e-05, "loss": 1.5917, "step": 6575 }, { "epoch": 0.17318936002106927, "grad_norm": 2.4755589962005615, "learning_rate": 4.134843297340005e-05, "loss": 1.7424, "step": 6576 }, { "epoch": 0.173215696602581, "grad_norm": 1.7478114366531372, "learning_rate": 4.1347116144324466e-05, "loss": 1.5782, "step": 6577 }, { "epoch": 0.1732420331840927, "grad_norm": 3.353172540664673, "learning_rate": 4.134579931524888e-05, "loss": 0.3634, "step": 6578 }, { "epoch": 0.17326836976560442, "grad_norm": 3.7557127475738525, "learning_rate": 4.13444824861733e-05, "loss": 1.5429, "step": 6579 }, { "epoch": 0.17329470634711613, "grad_norm": 1.7842997312545776, "learning_rate": 4.134316565709771e-05, "loss": 1.5903, "step": 6580 }, { "epoch": 0.17332104292862788, "grad_norm": 4.350416660308838, "learning_rate": 4.134184882802212e-05, "loss": 1.0064, "step": 6581 }, { "epoch": 0.1733473795101396, "grad_norm": 3.2437937259674072, "learning_rate": 
4.1340531998946544e-05, "loss": 0.9464, "step": 6582 }, { "epoch": 0.1733737160916513, "grad_norm": 2.259084701538086, "learning_rate": 4.133921516987095e-05, "loss": 2.1163, "step": 6583 }, { "epoch": 0.17340005267316302, "grad_norm": 3.8716816902160645, "learning_rate": 4.133789834079537e-05, "loss": 2.0451, "step": 6584 }, { "epoch": 0.17342638925467474, "grad_norm": 2.8280696868896484, "learning_rate": 4.133658151171978e-05, "loss": 1.5448, "step": 6585 }, { "epoch": 0.17345272583618646, "grad_norm": 2.231928825378418, "learning_rate": 4.133526468264419e-05, "loss": 2.0026, "step": 6586 }, { "epoch": 0.17347906241769817, "grad_norm": 2.909186840057373, "learning_rate": 4.133394785356861e-05, "loss": 0.635, "step": 6587 }, { "epoch": 0.17350539899920991, "grad_norm": 1.8089038133621216, "learning_rate": 4.1332631024493024e-05, "loss": 1.9228, "step": 6588 }, { "epoch": 0.17353173558072163, "grad_norm": 2.0774645805358887, "learning_rate": 4.133131419541743e-05, "loss": 2.1094, "step": 6589 }, { "epoch": 0.17355807216223335, "grad_norm": 1.901111364364624, "learning_rate": 4.132999736634185e-05, "loss": 1.8667, "step": 6590 }, { "epoch": 0.17358440874374506, "grad_norm": 3.020867347717285, "learning_rate": 4.1328680537266264e-05, "loss": 1.7473, "step": 6591 }, { "epoch": 0.17361074532525678, "grad_norm": 2.1292335987091064, "learning_rate": 4.132736370819068e-05, "loss": 1.9234, "step": 6592 }, { "epoch": 0.1736370819067685, "grad_norm": 2.397891044616699, "learning_rate": 4.1326046879115096e-05, "loss": 1.343, "step": 6593 }, { "epoch": 0.1736634184882802, "grad_norm": 2.3489630222320557, "learning_rate": 4.1324730050039504e-05, "loss": 1.9326, "step": 6594 }, { "epoch": 0.17368975506979195, "grad_norm": 2.04750394821167, "learning_rate": 4.132341322096392e-05, "loss": 2.1108, "step": 6595 }, { "epoch": 0.17371609165130367, "grad_norm": 1.9201856851577759, "learning_rate": 4.1322096391888336e-05, "loss": 1.6795, "step": 6596 }, { "epoch": 0.17374242823281538, 
"grad_norm": 2.228086471557617, "learning_rate": 4.132077956281275e-05, "loss": 2.0303, "step": 6597 }, { "epoch": 0.1737687648143271, "grad_norm": 1.8679587841033936, "learning_rate": 4.131946273373716e-05, "loss": 1.8791, "step": 6598 }, { "epoch": 0.1737951013958388, "grad_norm": 2.128915548324585, "learning_rate": 4.1318145904661576e-05, "loss": 1.9337, "step": 6599 }, { "epoch": 0.17382143797735053, "grad_norm": 1.869482398033142, "learning_rate": 4.131682907558599e-05, "loss": 1.7211, "step": 6600 }, { "epoch": 0.17384777455886227, "grad_norm": 1.9034199714660645, "learning_rate": 4.131551224651041e-05, "loss": 1.9604, "step": 6601 }, { "epoch": 0.173874111140374, "grad_norm": 1.6794320344924927, "learning_rate": 4.131419541743482e-05, "loss": 1.3221, "step": 6602 }, { "epoch": 0.1739004477218857, "grad_norm": 2.7474470138549805, "learning_rate": 4.131287858835923e-05, "loss": 0.6641, "step": 6603 }, { "epoch": 0.17392678430339742, "grad_norm": 2.337796688079834, "learning_rate": 4.131156175928365e-05, "loss": 2.3751, "step": 6604 }, { "epoch": 0.17395312088490913, "grad_norm": 1.8859007358551025, "learning_rate": 4.1310244930208056e-05, "loss": 1.7392, "step": 6605 }, { "epoch": 0.17397945746642085, "grad_norm": 3.564038038253784, "learning_rate": 4.130892810113248e-05, "loss": 1.3112, "step": 6606 }, { "epoch": 0.17400579404793257, "grad_norm": 2.746978521347046, "learning_rate": 4.130761127205689e-05, "loss": 2.0967, "step": 6607 }, { "epoch": 0.1740321306294443, "grad_norm": 4.363900661468506, "learning_rate": 4.13062944429813e-05, "loss": 1.4503, "step": 6608 }, { "epoch": 0.17405846721095602, "grad_norm": 1.7683770656585693, "learning_rate": 4.130497761390572e-05, "loss": 1.378, "step": 6609 }, { "epoch": 0.17408480379246774, "grad_norm": 2.616419792175293, "learning_rate": 4.1303660784830134e-05, "loss": 1.0869, "step": 6610 }, { "epoch": 0.17411114037397946, "grad_norm": 5.405492305755615, "learning_rate": 4.130234395575455e-05, "loss": 1.441, "step": 
6611 }, { "epoch": 0.17413747695549117, "grad_norm": 2.149705171585083, "learning_rate": 4.130102712667896e-05, "loss": 2.3707, "step": 6612 }, { "epoch": 0.1741638135370029, "grad_norm": 1.776121973991394, "learning_rate": 4.1299710297603374e-05, "loss": 1.8169, "step": 6613 }, { "epoch": 0.17419015011851463, "grad_norm": 1.921848177909851, "learning_rate": 4.129839346852778e-05, "loss": 1.9691, "step": 6614 }, { "epoch": 0.17421648670002635, "grad_norm": 2.028383255004883, "learning_rate": 4.1297076639452205e-05, "loss": 1.8353, "step": 6615 }, { "epoch": 0.17424282328153806, "grad_norm": 1.543782114982605, "learning_rate": 4.1295759810376614e-05, "loss": 1.6592, "step": 6616 }, { "epoch": 0.17426915986304978, "grad_norm": 1.9178073406219482, "learning_rate": 4.129444298130103e-05, "loss": 1.6874, "step": 6617 }, { "epoch": 0.1742954964445615, "grad_norm": 1.7270618677139282, "learning_rate": 4.1293126152225445e-05, "loss": 1.9808, "step": 6618 }, { "epoch": 0.1743218330260732, "grad_norm": 3.44000244140625, "learning_rate": 4.1291809323149854e-05, "loss": 2.0983, "step": 6619 }, { "epoch": 0.17434816960758492, "grad_norm": 1.9246654510498047, "learning_rate": 4.1290492494074276e-05, "loss": 2.5746, "step": 6620 }, { "epoch": 0.17437450618909667, "grad_norm": 2.8694772720336914, "learning_rate": 4.1289175664998685e-05, "loss": 1.7488, "step": 6621 }, { "epoch": 0.17440084277060838, "grad_norm": 2.084517478942871, "learning_rate": 4.12878588359231e-05, "loss": 1.0779, "step": 6622 }, { "epoch": 0.1744271793521201, "grad_norm": 2.8672194480895996, "learning_rate": 4.128654200684751e-05, "loss": 1.8264, "step": 6623 }, { "epoch": 0.17445351593363181, "grad_norm": 1.9042001962661743, "learning_rate": 4.1285225177771925e-05, "loss": 1.7287, "step": 6624 }, { "epoch": 0.17447985251514353, "grad_norm": 2.259018659591675, "learning_rate": 4.128390834869634e-05, "loss": 2.4543, "step": 6625 }, { "epoch": 0.17450618909665525, "grad_norm": 2.1348445415496826, 
"learning_rate": 4.1282591519620756e-05, "loss": 1.7519, "step": 6626 }, { "epoch": 0.17453252567816696, "grad_norm": 2.405287027359009, "learning_rate": 4.128127469054517e-05, "loss": 1.6668, "step": 6627 }, { "epoch": 0.1745588622596787, "grad_norm": 4.773782730102539, "learning_rate": 4.127995786146958e-05, "loss": 0.9413, "step": 6628 }, { "epoch": 0.17458519884119042, "grad_norm": 1.9451075792312622, "learning_rate": 4.1278641032394e-05, "loss": 2.0911, "step": 6629 }, { "epoch": 0.17461153542270214, "grad_norm": 1.5977376699447632, "learning_rate": 4.127732420331841e-05, "loss": 1.5029, "step": 6630 }, { "epoch": 0.17463787200421385, "grad_norm": 3.224968433380127, "learning_rate": 4.127600737424283e-05, "loss": 1.4132, "step": 6631 }, { "epoch": 0.17466420858572557, "grad_norm": 3.925855875015259, "learning_rate": 4.1274690545167236e-05, "loss": 2.0179, "step": 6632 }, { "epoch": 0.17469054516723728, "grad_norm": 2.111485719680786, "learning_rate": 4.127337371609165e-05, "loss": 1.4731, "step": 6633 }, { "epoch": 0.17471688174874903, "grad_norm": 3.393571615219116, "learning_rate": 4.127205688701607e-05, "loss": 2.2736, "step": 6634 }, { "epoch": 0.17474321833026074, "grad_norm": 2.0212717056274414, "learning_rate": 4.127074005794048e-05, "loss": 2.0275, "step": 6635 }, { "epoch": 0.17476955491177246, "grad_norm": 1.6354917287826538, "learning_rate": 4.12694232288649e-05, "loss": 1.5985, "step": 6636 }, { "epoch": 0.17479589149328417, "grad_norm": 1.7444981336593628, "learning_rate": 4.126810639978931e-05, "loss": 1.7306, "step": 6637 }, { "epoch": 0.1748222280747959, "grad_norm": 2.5249311923980713, "learning_rate": 4.126678957071372e-05, "loss": 0.8634, "step": 6638 }, { "epoch": 0.1748485646563076, "grad_norm": 2.2393085956573486, "learning_rate": 4.126547274163814e-05, "loss": 1.5959, "step": 6639 }, { "epoch": 0.17487490123781932, "grad_norm": 2.3244261741638184, "learning_rate": 4.1264155912562554e-05, "loss": 2.3055, "step": 6640 }, { "epoch": 
0.17490123781933106, "grad_norm": 2.5192677974700928, "learning_rate": 4.126283908348696e-05, "loss": 1.9003, "step": 6641 }, { "epoch": 0.17492757440084278, "grad_norm": 2.2412896156311035, "learning_rate": 4.126152225441138e-05, "loss": 1.8259, "step": 6642 }, { "epoch": 0.1749539109823545, "grad_norm": 2.9820196628570557, "learning_rate": 4.1260205425335794e-05, "loss": 0.8889, "step": 6643 }, { "epoch": 0.1749802475638662, "grad_norm": 5.168096542358398, "learning_rate": 4.125888859626021e-05, "loss": 2.1388, "step": 6644 }, { "epoch": 0.17500658414537792, "grad_norm": 1.8245837688446045, "learning_rate": 4.125757176718462e-05, "loss": 1.6325, "step": 6645 }, { "epoch": 0.17503292072688964, "grad_norm": 2.578749418258667, "learning_rate": 4.1256254938109034e-05, "loss": 1.847, "step": 6646 }, { "epoch": 0.17505925730840136, "grad_norm": 3.981011390686035, "learning_rate": 4.125493810903345e-05, "loss": 1.277, "step": 6647 }, { "epoch": 0.1750855938899131, "grad_norm": 2.3530519008636475, "learning_rate": 4.1253621279957865e-05, "loss": 1.7096, "step": 6648 }, { "epoch": 0.17511193047142481, "grad_norm": 1.8894442319869995, "learning_rate": 4.125230445088228e-05, "loss": 1.5303, "step": 6649 }, { "epoch": 0.17513826705293653, "grad_norm": 3.984992742538452, "learning_rate": 4.125098762180669e-05, "loss": 0.7781, "step": 6650 }, { "epoch": 0.17516460363444825, "grad_norm": 1.9461286067962646, "learning_rate": 4.1249670792731105e-05, "loss": 1.6609, "step": 6651 }, { "epoch": 0.17519094021595996, "grad_norm": 1.8759702444076538, "learning_rate": 4.1248353963655514e-05, "loss": 1.8718, "step": 6652 }, { "epoch": 0.17521727679747168, "grad_norm": 2.6989259719848633, "learning_rate": 4.1247037134579937e-05, "loss": 0.3704, "step": 6653 }, { "epoch": 0.17524361337898342, "grad_norm": 2.7729644775390625, "learning_rate": 4.1245720305504345e-05, "loss": 1.6543, "step": 6654 }, { "epoch": 0.17526994996049514, "grad_norm": 2.0761306285858154, "learning_rate": 
4.124440347642876e-05, "loss": 1.8262, "step": 6655 }, { "epoch": 0.17529628654200685, "grad_norm": 1.3968690633773804, "learning_rate": 4.1243086647353177e-05, "loss": 1.8761, "step": 6656 }, { "epoch": 0.17532262312351857, "grad_norm": 3.094503164291382, "learning_rate": 4.1241769818277585e-05, "loss": 2.2206, "step": 6657 }, { "epoch": 0.17534895970503028, "grad_norm": 1.6809885501861572, "learning_rate": 4.124045298920201e-05, "loss": 1.7296, "step": 6658 }, { "epoch": 0.175375296286542, "grad_norm": 2.3383312225341797, "learning_rate": 4.1239136160126417e-05, "loss": 1.004, "step": 6659 }, { "epoch": 0.17540163286805371, "grad_norm": 1.7880600690841675, "learning_rate": 4.123781933105083e-05, "loss": 2.1348, "step": 6660 }, { "epoch": 0.17542796944956546, "grad_norm": 2.8334717750549316, "learning_rate": 4.123650250197524e-05, "loss": 1.7766, "step": 6661 }, { "epoch": 0.17545430603107717, "grad_norm": 2.4014248847961426, "learning_rate": 4.1235185672899663e-05, "loss": 1.8362, "step": 6662 }, { "epoch": 0.1754806426125889, "grad_norm": 1.9157991409301758, "learning_rate": 4.123386884382407e-05, "loss": 1.6226, "step": 6663 }, { "epoch": 0.1755069791941006, "grad_norm": 2.2434890270233154, "learning_rate": 4.123255201474849e-05, "loss": 2.0024, "step": 6664 }, { "epoch": 0.17553331577561232, "grad_norm": 2.862133264541626, "learning_rate": 4.1231235185672903e-05, "loss": 1.9178, "step": 6665 }, { "epoch": 0.17555965235712404, "grad_norm": 1.7017308473587036, "learning_rate": 4.122991835659731e-05, "loss": 1.5304, "step": 6666 }, { "epoch": 0.17558598893863578, "grad_norm": 3.603447437286377, "learning_rate": 4.1228601527521735e-05, "loss": 1.0318, "step": 6667 }, { "epoch": 0.1756123255201475, "grad_norm": 2.1857197284698486, "learning_rate": 4.1227284698446143e-05, "loss": 0.9257, "step": 6668 }, { "epoch": 0.1756386621016592, "grad_norm": 3.1623499393463135, "learning_rate": 4.122596786937056e-05, "loss": 1.9929, "step": 6669 }, { "epoch": 
0.17566499868317093, "grad_norm": 1.7891215085983276, "learning_rate": 4.122465104029497e-05, "loss": 1.5044, "step": 6670 }, { "epoch": 0.17569133526468264, "grad_norm": 3.255054473876953, "learning_rate": 4.1223334211219383e-05, "loss": 1.9984, "step": 6671 }, { "epoch": 0.17571767184619436, "grad_norm": 1.884595513343811, "learning_rate": 4.12220173821438e-05, "loss": 1.8561, "step": 6672 }, { "epoch": 0.17574400842770607, "grad_norm": 2.8022518157958984, "learning_rate": 4.1220700553068215e-05, "loss": 1.6945, "step": 6673 }, { "epoch": 0.17577034500921782, "grad_norm": 2.192901611328125, "learning_rate": 4.121938372399263e-05, "loss": 2.0344, "step": 6674 }, { "epoch": 0.17579668159072953, "grad_norm": 2.163236379623413, "learning_rate": 4.121806689491704e-05, "loss": 1.7897, "step": 6675 }, { "epoch": 0.17582301817224125, "grad_norm": 1.5392736196517944, "learning_rate": 4.121675006584146e-05, "loss": 1.4971, "step": 6676 }, { "epoch": 0.17584935475375296, "grad_norm": 2.7338321208953857, "learning_rate": 4.121543323676587e-05, "loss": 1.2812, "step": 6677 }, { "epoch": 0.17587569133526468, "grad_norm": 2.262636184692383, "learning_rate": 4.1214116407690286e-05, "loss": 1.882, "step": 6678 }, { "epoch": 0.1759020279167764, "grad_norm": 4.032125473022461, "learning_rate": 4.1212799578614695e-05, "loss": 1.7786, "step": 6679 }, { "epoch": 0.1759283644982881, "grad_norm": 1.9584550857543945, "learning_rate": 4.121148274953911e-05, "loss": 1.105, "step": 6680 }, { "epoch": 0.17595470107979985, "grad_norm": 1.9815263748168945, "learning_rate": 4.1210165920463526e-05, "loss": 1.1418, "step": 6681 }, { "epoch": 0.17598103766131157, "grad_norm": 2.129667282104492, "learning_rate": 4.120884909138794e-05, "loss": 2.0092, "step": 6682 }, { "epoch": 0.17600737424282328, "grad_norm": 2.683131694793701, "learning_rate": 4.120753226231236e-05, "loss": 1.5317, "step": 6683 }, { "epoch": 0.176033710824335, "grad_norm": 2.292882204055786, "learning_rate": 
4.1206215433236766e-05, "loss": 1.6714, "step": 6684 }, { "epoch": 0.17606004740584671, "grad_norm": 3.0176849365234375, "learning_rate": 4.120489860416118e-05, "loss": 1.8929, "step": 6685 }, { "epoch": 0.17608638398735843, "grad_norm": 2.804069757461548, "learning_rate": 4.12035817750856e-05, "loss": 1.4643, "step": 6686 }, { "epoch": 0.17611272056887017, "grad_norm": 1.796481728553772, "learning_rate": 4.120226494601001e-05, "loss": 1.7622, "step": 6687 }, { "epoch": 0.1761390571503819, "grad_norm": 1.8195126056671143, "learning_rate": 4.120094811693442e-05, "loss": 1.483, "step": 6688 }, { "epoch": 0.1761653937318936, "grad_norm": 3.9995951652526855, "learning_rate": 4.119963128785884e-05, "loss": 1.4732, "step": 6689 }, { "epoch": 0.17619173031340532, "grad_norm": 1.863304615020752, "learning_rate": 4.1198314458783246e-05, "loss": 1.7272, "step": 6690 }, { "epoch": 0.17621806689491704, "grad_norm": 2.2495763301849365, "learning_rate": 4.119699762970767e-05, "loss": 2.0331, "step": 6691 }, { "epoch": 0.17624440347642875, "grad_norm": 2.5434064865112305, "learning_rate": 4.119568080063208e-05, "loss": 1.7875, "step": 6692 }, { "epoch": 0.17627074005794047, "grad_norm": 2.723493814468384, "learning_rate": 4.119436397155649e-05, "loss": 1.4843, "step": 6693 }, { "epoch": 0.1762970766394522, "grad_norm": 1.5455756187438965, "learning_rate": 4.119304714248091e-05, "loss": 1.9669, "step": 6694 }, { "epoch": 0.17632341322096393, "grad_norm": 1.621565818786621, "learning_rate": 4.1191730313405324e-05, "loss": 1.7554, "step": 6695 }, { "epoch": 0.17634974980247564, "grad_norm": 2.664480209350586, "learning_rate": 4.119041348432974e-05, "loss": 1.3594, "step": 6696 }, { "epoch": 0.17637608638398736, "grad_norm": 3.1807620525360107, "learning_rate": 4.118909665525415e-05, "loss": 1.749, "step": 6697 }, { "epoch": 0.17640242296549907, "grad_norm": 1.6837384700775146, "learning_rate": 4.1187779826178564e-05, "loss": 2.1722, "step": 6698 }, { "epoch": 0.1764287595470108, 
"grad_norm": 2.1074440479278564, "learning_rate": 4.118646299710297e-05, "loss": 1.2512, "step": 6699 }, { "epoch": 0.1764550961285225, "grad_norm": 3.323525905609131, "learning_rate": 4.1185146168027395e-05, "loss": 1.4003, "step": 6700 }, { "epoch": 0.17648143271003425, "grad_norm": 2.9400622844696045, "learning_rate": 4.1183829338951804e-05, "loss": 1.7325, "step": 6701 }, { "epoch": 0.17650776929154596, "grad_norm": 3.638178586959839, "learning_rate": 4.118251250987622e-05, "loss": 1.1509, "step": 6702 }, { "epoch": 0.17653410587305768, "grad_norm": 1.7637969255447388, "learning_rate": 4.1181195680800635e-05, "loss": 2.327, "step": 6703 }, { "epoch": 0.1765604424545694, "grad_norm": 3.6068859100341797, "learning_rate": 4.1179878851725044e-05, "loss": 0.7452, "step": 6704 }, { "epoch": 0.1765867790360811, "grad_norm": 3.0534024238586426, "learning_rate": 4.1178562022649466e-05, "loss": 2.3666, "step": 6705 }, { "epoch": 0.17661311561759283, "grad_norm": 1.7166447639465332, "learning_rate": 4.1177245193573875e-05, "loss": 1.686, "step": 6706 }, { "epoch": 0.17663945219910457, "grad_norm": 2.3784408569335938, "learning_rate": 4.117592836449829e-05, "loss": 1.6467, "step": 6707 }, { "epoch": 0.17666578878061628, "grad_norm": 4.3146820068359375, "learning_rate": 4.11746115354227e-05, "loss": 1.5173, "step": 6708 }, { "epoch": 0.176692125362128, "grad_norm": 1.9582109451293945, "learning_rate": 4.117329470634712e-05, "loss": 1.898, "step": 6709 }, { "epoch": 0.17671846194363972, "grad_norm": 1.8350038528442383, "learning_rate": 4.117197787727153e-05, "loss": 1.7136, "step": 6710 }, { "epoch": 0.17674479852515143, "grad_norm": 1.874682903289795, "learning_rate": 4.1170661048195946e-05, "loss": 1.9957, "step": 6711 }, { "epoch": 0.17677113510666315, "grad_norm": 3.145806074142456, "learning_rate": 4.116934421912036e-05, "loss": 0.7783, "step": 6712 }, { "epoch": 0.17679747168817486, "grad_norm": 3.466364622116089, "learning_rate": 4.116802739004477e-05, "loss": 1.9053, 
"step": 6713 }, { "epoch": 0.1768238082696866, "grad_norm": 2.290138006210327, "learning_rate": 4.116671056096919e-05, "loss": 0.4749, "step": 6714 }, { "epoch": 0.17685014485119832, "grad_norm": 4.1517767906188965, "learning_rate": 4.11653937318936e-05, "loss": 2.096, "step": 6715 }, { "epoch": 0.17687648143271004, "grad_norm": 1.7356162071228027, "learning_rate": 4.116407690281802e-05, "loss": 2.6511, "step": 6716 }, { "epoch": 0.17690281801422175, "grad_norm": 3.1292381286621094, "learning_rate": 4.1162760073742426e-05, "loss": 1.3981, "step": 6717 }, { "epoch": 0.17692915459573347, "grad_norm": 3.1809940338134766, "learning_rate": 4.116144324466684e-05, "loss": 1.2252, "step": 6718 }, { "epoch": 0.17695549117724518, "grad_norm": 2.8673181533813477, "learning_rate": 4.116012641559126e-05, "loss": 1.6321, "step": 6719 }, { "epoch": 0.17698182775875693, "grad_norm": 4.243430137634277, "learning_rate": 4.115880958651567e-05, "loss": 1.9734, "step": 6720 }, { "epoch": 0.17700816434026864, "grad_norm": 3.3914794921875, "learning_rate": 4.115749275744009e-05, "loss": 2.1519, "step": 6721 }, { "epoch": 0.17703450092178036, "grad_norm": 4.168887138366699, "learning_rate": 4.11561759283645e-05, "loss": 0.6254, "step": 6722 }, { "epoch": 0.17706083750329207, "grad_norm": 1.6878275871276855, "learning_rate": 4.115485909928891e-05, "loss": 1.9985, "step": 6723 }, { "epoch": 0.1770871740848038, "grad_norm": 2.0527000427246094, "learning_rate": 4.115354227021333e-05, "loss": 1.8863, "step": 6724 }, { "epoch": 0.1771135106663155, "grad_norm": 1.7715539932250977, "learning_rate": 4.1152225441137744e-05, "loss": 2.1905, "step": 6725 }, { "epoch": 0.17713984724782722, "grad_norm": 2.5337605476379395, "learning_rate": 4.115090861206215e-05, "loss": 2.1008, "step": 6726 }, { "epoch": 0.17716618382933896, "grad_norm": 2.2231197357177734, "learning_rate": 4.114959178298657e-05, "loss": 1.4889, "step": 6727 }, { "epoch": 0.17719252041085068, "grad_norm": 2.2031188011169434, 
"learning_rate": 4.1148274953910984e-05, "loss": 2.1719, "step": 6728 }, { "epoch": 0.1772188569923624, "grad_norm": 2.0154097080230713, "learning_rate": 4.11469581248354e-05, "loss": 1.4547, "step": 6729 }, { "epoch": 0.1772451935738741, "grad_norm": 1.9133888483047485, "learning_rate": 4.1145641295759816e-05, "loss": 1.7201, "step": 6730 }, { "epoch": 0.17727153015538583, "grad_norm": 2.965532064437866, "learning_rate": 4.1144324466684224e-05, "loss": 1.8718, "step": 6731 }, { "epoch": 0.17729786673689754, "grad_norm": 5.10041618347168, "learning_rate": 4.114300763760864e-05, "loss": 1.0623, "step": 6732 }, { "epoch": 0.17732420331840926, "grad_norm": 2.9344160556793213, "learning_rate": 4.1141690808533056e-05, "loss": 2.0449, "step": 6733 }, { "epoch": 0.177350539899921, "grad_norm": 2.2921764850616455, "learning_rate": 4.114037397945747e-05, "loss": 1.422, "step": 6734 }, { "epoch": 0.17737687648143272, "grad_norm": 4.644622325897217, "learning_rate": 4.113905715038188e-05, "loss": 2.4622, "step": 6735 }, { "epoch": 0.17740321306294443, "grad_norm": 1.6904956102371216, "learning_rate": 4.1137740321306296e-05, "loss": 1.9558, "step": 6736 }, { "epoch": 0.17742954964445615, "grad_norm": 2.1949121952056885, "learning_rate": 4.113642349223071e-05, "loss": 1.4928, "step": 6737 }, { "epoch": 0.17745588622596786, "grad_norm": 2.7083094120025635, "learning_rate": 4.113510666315513e-05, "loss": 1.7994, "step": 6738 }, { "epoch": 0.17748222280747958, "grad_norm": 2.8347067832946777, "learning_rate": 4.113378983407954e-05, "loss": 1.7722, "step": 6739 }, { "epoch": 0.17750855938899132, "grad_norm": 1.8640238046646118, "learning_rate": 4.113247300500395e-05, "loss": 1.6394, "step": 6740 }, { "epoch": 0.17753489597050304, "grad_norm": 4.407964706420898, "learning_rate": 4.113115617592837e-05, "loss": 1.5077, "step": 6741 }, { "epoch": 0.17756123255201475, "grad_norm": 2.251040458679199, "learning_rate": 4.112983934685278e-05, "loss": 2.3118, "step": 6742 }, { "epoch": 
0.17758756913352647, "grad_norm": 3.1646571159362793, "learning_rate": 4.11285225177772e-05, "loss": 1.354, "step": 6743 }, { "epoch": 0.17761390571503818, "grad_norm": 2.2493324279785156, "learning_rate": 4.112720568870161e-05, "loss": 2.8635, "step": 6744 }, { "epoch": 0.1776402422965499, "grad_norm": 2.4689788818359375, "learning_rate": 4.112588885962602e-05, "loss": 0.6435, "step": 6745 }, { "epoch": 0.17766657887806162, "grad_norm": 1.9772800207138062, "learning_rate": 4.112457203055043e-05, "loss": 1.9938, "step": 6746 }, { "epoch": 0.17769291545957336, "grad_norm": 2.447547674179077, "learning_rate": 4.1123255201474854e-05, "loss": 1.2604, "step": 6747 }, { "epoch": 0.17771925204108507, "grad_norm": 2.0492193698883057, "learning_rate": 4.112193837239926e-05, "loss": 2.1748, "step": 6748 }, { "epoch": 0.1777455886225968, "grad_norm": 2.1823806762695312, "learning_rate": 4.112062154332368e-05, "loss": 1.7454, "step": 6749 }, { "epoch": 0.1777719252041085, "grad_norm": 2.7477235794067383, "learning_rate": 4.1119304714248094e-05, "loss": 1.4454, "step": 6750 }, { "epoch": 0.17779826178562022, "grad_norm": 3.9620254039764404, "learning_rate": 4.11179878851725e-05, "loss": 1.5756, "step": 6751 }, { "epoch": 0.17782459836713194, "grad_norm": 1.6999497413635254, "learning_rate": 4.1116671056096925e-05, "loss": 1.5814, "step": 6752 }, { "epoch": 0.17785093494864368, "grad_norm": 2.3639976978302, "learning_rate": 4.1115354227021334e-05, "loss": 1.5001, "step": 6753 }, { "epoch": 0.1778772715301554, "grad_norm": 1.8791390657424927, "learning_rate": 4.111403739794575e-05, "loss": 1.9452, "step": 6754 }, { "epoch": 0.1779036081116671, "grad_norm": 4.2245073318481445, "learning_rate": 4.111272056887016e-05, "loss": 1.6914, "step": 6755 }, { "epoch": 0.17792994469317883, "grad_norm": 2.0726189613342285, "learning_rate": 4.1111403739794574e-05, "loss": 1.47, "step": 6756 }, { "epoch": 0.17795628127469054, "grad_norm": 3.5650253295898438, "learning_rate": 
4.111008691071899e-05, "loss": 1.9222, "step": 6757 }, { "epoch": 0.17798261785620226, "grad_norm": 2.593458890914917, "learning_rate": 4.1108770081643405e-05, "loss": 1.6603, "step": 6758 }, { "epoch": 0.17800895443771397, "grad_norm": 2.0861594676971436, "learning_rate": 4.110745325256782e-05, "loss": 1.9041, "step": 6759 }, { "epoch": 0.17803529101922572, "grad_norm": 3.4253945350646973, "learning_rate": 4.110613642349223e-05, "loss": 1.5267, "step": 6760 }, { "epoch": 0.17806162760073743, "grad_norm": 2.0704896450042725, "learning_rate": 4.110481959441665e-05, "loss": 1.9616, "step": 6761 }, { "epoch": 0.17808796418224915, "grad_norm": 2.439908504486084, "learning_rate": 4.110350276534106e-05, "loss": 0.9929, "step": 6762 }, { "epoch": 0.17811430076376086, "grad_norm": 3.2816014289855957, "learning_rate": 4.1102185936265476e-05, "loss": 1.6153, "step": 6763 }, { "epoch": 0.17814063734527258, "grad_norm": 2.4091029167175293, "learning_rate": 4.1100869107189885e-05, "loss": 1.6636, "step": 6764 }, { "epoch": 0.1781669739267843, "grad_norm": 1.7577128410339355, "learning_rate": 4.10995522781143e-05, "loss": 1.8762, "step": 6765 }, { "epoch": 0.178193310508296, "grad_norm": 1.443149209022522, "learning_rate": 4.1098235449038716e-05, "loss": 1.746, "step": 6766 }, { "epoch": 0.17821964708980775, "grad_norm": 4.384613990783691, "learning_rate": 4.109691861996313e-05, "loss": 1.8637, "step": 6767 }, { "epoch": 0.17824598367131947, "grad_norm": 3.624572277069092, "learning_rate": 4.109560179088755e-05, "loss": 0.8891, "step": 6768 }, { "epoch": 0.17827232025283118, "grad_norm": 1.9998635053634644, "learning_rate": 4.1094284961811956e-05, "loss": 2.0141, "step": 6769 }, { "epoch": 0.1782986568343429, "grad_norm": 2.157447099685669, "learning_rate": 4.109296813273637e-05, "loss": 1.6568, "step": 6770 }, { "epoch": 0.17832499341585462, "grad_norm": 2.3195464611053467, "learning_rate": 4.109165130366079e-05, "loss": 1.3678, "step": 6771 }, { "epoch": 0.17835132999736633, 
"grad_norm": 4.908475399017334, "learning_rate": 4.10903344745852e-05, "loss": 1.6389, "step": 6772 }, { "epoch": 0.17837766657887807, "grad_norm": 2.4665000438690186, "learning_rate": 4.108901764550961e-05, "loss": 1.854, "step": 6773 }, { "epoch": 0.1784040031603898, "grad_norm": 4.117772102355957, "learning_rate": 4.108770081643403e-05, "loss": 1.8473, "step": 6774 }, { "epoch": 0.1784303397419015, "grad_norm": 2.614429235458374, "learning_rate": 4.108638398735844e-05, "loss": 1.6584, "step": 6775 }, { "epoch": 0.17845667632341322, "grad_norm": 4.220699787139893, "learning_rate": 4.108506715828286e-05, "loss": 2.3101, "step": 6776 }, { "epoch": 0.17848301290492494, "grad_norm": 5.078653812408447, "learning_rate": 4.1083750329207274e-05, "loss": 1.9481, "step": 6777 }, { "epoch": 0.17850934948643665, "grad_norm": 2.256284713745117, "learning_rate": 4.108243350013168e-05, "loss": 1.8684, "step": 6778 }, { "epoch": 0.17853568606794837, "grad_norm": 1.7752957344055176, "learning_rate": 4.10811166710561e-05, "loss": 1.3696, "step": 6779 }, { "epoch": 0.1785620226494601, "grad_norm": 1.5159751176834106, "learning_rate": 4.1079799841980514e-05, "loss": 1.4898, "step": 6780 }, { "epoch": 0.17858835923097183, "grad_norm": 2.9525413513183594, "learning_rate": 4.107848301290493e-05, "loss": 1.5798, "step": 6781 }, { "epoch": 0.17861469581248354, "grad_norm": 3.0003139972686768, "learning_rate": 4.107716618382934e-05, "loss": 1.2852, "step": 6782 }, { "epoch": 0.17864103239399526, "grad_norm": 2.001039505004883, "learning_rate": 4.1075849354753754e-05, "loss": 0.5788, "step": 6783 }, { "epoch": 0.17866736897550697, "grad_norm": 2.3685619831085205, "learning_rate": 4.107453252567817e-05, "loss": 1.7855, "step": 6784 }, { "epoch": 0.1786937055570187, "grad_norm": 1.8843557834625244, "learning_rate": 4.1073215696602585e-05, "loss": 1.5882, "step": 6785 }, { "epoch": 0.1787200421385304, "grad_norm": 1.9563547372817993, "learning_rate": 4.1071898867527e-05, "loss": 2.5971, 
"step": 6786 }, { "epoch": 0.17874637872004215, "grad_norm": 2.013260841369629, "learning_rate": 4.107058203845141e-05, "loss": 1.8646, "step": 6787 }, { "epoch": 0.17877271530155386, "grad_norm": 3.0720396041870117, "learning_rate": 4.1069265209375825e-05, "loss": 1.9464, "step": 6788 }, { "epoch": 0.17879905188306558, "grad_norm": 2.703814744949341, "learning_rate": 4.1067948380300234e-05, "loss": 1.3062, "step": 6789 }, { "epoch": 0.1788253884645773, "grad_norm": 1.8566234111785889, "learning_rate": 4.1066631551224657e-05, "loss": 1.7328, "step": 6790 }, { "epoch": 0.178851725046089, "grad_norm": 1.9944411516189575, "learning_rate": 4.1065314722149065e-05, "loss": 1.4583, "step": 6791 }, { "epoch": 0.17887806162760073, "grad_norm": 2.6147239208221436, "learning_rate": 4.106399789307348e-05, "loss": 1.3663, "step": 6792 }, { "epoch": 0.17890439820911247, "grad_norm": 2.9063587188720703, "learning_rate": 4.106268106399789e-05, "loss": 1.6115, "step": 6793 }, { "epoch": 0.17893073479062419, "grad_norm": 4.7841339111328125, "learning_rate": 4.106136423492231e-05, "loss": 2.4867, "step": 6794 }, { "epoch": 0.1789570713721359, "grad_norm": 3.1169638633728027, "learning_rate": 4.106004740584672e-05, "loss": 1.2408, "step": 6795 }, { "epoch": 0.17898340795364762, "grad_norm": 2.0345919132232666, "learning_rate": 4.105873057677114e-05, "loss": 0.9196, "step": 6796 }, { "epoch": 0.17900974453515933, "grad_norm": 2.243893623352051, "learning_rate": 4.105741374769555e-05, "loss": 1.7777, "step": 6797 }, { "epoch": 0.17903608111667105, "grad_norm": 1.65420663356781, "learning_rate": 4.105609691861996e-05, "loss": 2.1784, "step": 6798 }, { "epoch": 0.17906241769818276, "grad_norm": 3.017810344696045, "learning_rate": 4.1054780089544383e-05, "loss": 1.3427, "step": 6799 }, { "epoch": 0.1790887542796945, "grad_norm": 2.7063002586364746, "learning_rate": 4.105346326046879e-05, "loss": 1.8331, "step": 6800 }, { "epoch": 0.17911509086120622, "grad_norm": 2.237499952316284, 
"learning_rate": 4.105214643139321e-05, "loss": 1.9497, "step": 6801 }, { "epoch": 0.17914142744271794, "grad_norm": 2.082740545272827, "learning_rate": 4.105082960231762e-05, "loss": 1.4827, "step": 6802 }, { "epoch": 0.17916776402422965, "grad_norm": 2.144864559173584, "learning_rate": 4.104951277324203e-05, "loss": 1.8197, "step": 6803 }, { "epoch": 0.17919410060574137, "grad_norm": 2.2020671367645264, "learning_rate": 4.104819594416645e-05, "loss": 2.1981, "step": 6804 }, { "epoch": 0.17922043718725308, "grad_norm": 2.2199628353118896, "learning_rate": 4.1046879115090863e-05, "loss": 1.7829, "step": 6805 }, { "epoch": 0.17924677376876483, "grad_norm": 2.0566678047180176, "learning_rate": 4.104556228601528e-05, "loss": 1.9086, "step": 6806 }, { "epoch": 0.17927311035027654, "grad_norm": 1.9605798721313477, "learning_rate": 4.104424545693969e-05, "loss": 2.248, "step": 6807 }, { "epoch": 0.17929944693178826, "grad_norm": 1.7758398056030273, "learning_rate": 4.104292862786411e-05, "loss": 2.2688, "step": 6808 }, { "epoch": 0.17932578351329997, "grad_norm": 3.7575817108154297, "learning_rate": 4.104161179878852e-05, "loss": 2.3091, "step": 6809 }, { "epoch": 0.1793521200948117, "grad_norm": 4.644641399383545, "learning_rate": 4.1040294969712935e-05, "loss": 1.3487, "step": 6810 }, { "epoch": 0.1793784566763234, "grad_norm": 1.894126057624817, "learning_rate": 4.1038978140637344e-05, "loss": 1.6621, "step": 6811 }, { "epoch": 0.17940479325783512, "grad_norm": 2.8985049724578857, "learning_rate": 4.103766131156176e-05, "loss": 1.1934, "step": 6812 }, { "epoch": 0.17943112983934686, "grad_norm": 1.7191641330718994, "learning_rate": 4.1036344482486175e-05, "loss": 1.6747, "step": 6813 }, { "epoch": 0.17945746642085858, "grad_norm": 1.8535321950912476, "learning_rate": 4.103502765341059e-05, "loss": 2.0091, "step": 6814 }, { "epoch": 0.1794838030023703, "grad_norm": 2.8188588619232178, "learning_rate": 4.1033710824335006e-05, "loss": 1.3508, "step": 6815 }, { "epoch": 
0.179510139583882, "grad_norm": 2.948801040649414, "learning_rate": 4.1032393995259415e-05, "loss": 0.8262, "step": 6816 }, { "epoch": 0.17953647616539373, "grad_norm": 3.9432289600372314, "learning_rate": 4.103107716618383e-05, "loss": 1.2866, "step": 6817 }, { "epoch": 0.17956281274690544, "grad_norm": 2.3603694438934326, "learning_rate": 4.1029760337108246e-05, "loss": 2.5574, "step": 6818 }, { "epoch": 0.17958914932841716, "grad_norm": 3.932704448699951, "learning_rate": 4.102844350803266e-05, "loss": 1.8841, "step": 6819 }, { "epoch": 0.1796154859099289, "grad_norm": 1.7816461324691772, "learning_rate": 4.102712667895707e-05, "loss": 2.3823, "step": 6820 }, { "epoch": 0.17964182249144062, "grad_norm": 1.6622763872146606, "learning_rate": 4.1025809849881486e-05, "loss": 1.8831, "step": 6821 }, { "epoch": 0.17966815907295233, "grad_norm": 2.4232850074768066, "learning_rate": 4.10244930208059e-05, "loss": 1.3076, "step": 6822 }, { "epoch": 0.17969449565446405, "grad_norm": 3.414458990097046, "learning_rate": 4.102317619173032e-05, "loss": 1.8218, "step": 6823 }, { "epoch": 0.17972083223597576, "grad_norm": 1.437122106552124, "learning_rate": 4.102185936265473e-05, "loss": 1.7803, "step": 6824 }, { "epoch": 0.17974716881748748, "grad_norm": 1.7008097171783447, "learning_rate": 4.102054253357914e-05, "loss": 1.7539, "step": 6825 }, { "epoch": 0.17977350539899922, "grad_norm": 6.7796950340271, "learning_rate": 4.101922570450356e-05, "loss": 1.5306, "step": 6826 }, { "epoch": 0.17979984198051094, "grad_norm": 2.5783803462982178, "learning_rate": 4.101790887542797e-05, "loss": 1.862, "step": 6827 }, { "epoch": 0.17982617856202265, "grad_norm": 1.9919512271881104, "learning_rate": 4.101659204635239e-05, "loss": 1.4089, "step": 6828 }, { "epoch": 0.17985251514353437, "grad_norm": 1.8648899793624878, "learning_rate": 4.10152752172768e-05, "loss": 0.4449, "step": 6829 }, { "epoch": 0.17987885172504608, "grad_norm": 3.0019474029541016, "learning_rate": 
4.101395838820121e-05, "loss": 1.8668, "step": 6830 }, { "epoch": 0.1799051883065578, "grad_norm": 2.0431602001190186, "learning_rate": 4.101264155912563e-05, "loss": 0.9655, "step": 6831 }, { "epoch": 0.17993152488806952, "grad_norm": 1.9050644636154175, "learning_rate": 4.1011324730050044e-05, "loss": 2.3416, "step": 6832 }, { "epoch": 0.17995786146958126, "grad_norm": 1.5316270589828491, "learning_rate": 4.101000790097446e-05, "loss": 2.0118, "step": 6833 }, { "epoch": 0.17998419805109298, "grad_norm": 2.86852765083313, "learning_rate": 4.100869107189887e-05, "loss": 2.6354, "step": 6834 }, { "epoch": 0.1800105346326047, "grad_norm": 2.293701171875, "learning_rate": 4.1007374242823284e-05, "loss": 1.7266, "step": 6835 }, { "epoch": 0.1800368712141164, "grad_norm": 1.7296762466430664, "learning_rate": 4.100605741374769e-05, "loss": 1.4435, "step": 6836 }, { "epoch": 0.18006320779562812, "grad_norm": 1.9229036569595337, "learning_rate": 4.1004740584672115e-05, "loss": 2.377, "step": 6837 }, { "epoch": 0.18008954437713984, "grad_norm": 2.495021104812622, "learning_rate": 4.1003423755596524e-05, "loss": 2.3578, "step": 6838 }, { "epoch": 0.18011588095865158, "grad_norm": 2.3712573051452637, "learning_rate": 4.100210692652094e-05, "loss": 1.1256, "step": 6839 }, { "epoch": 0.1801422175401633, "grad_norm": 1.759839415550232, "learning_rate": 4.1000790097445355e-05, "loss": 2.0299, "step": 6840 }, { "epoch": 0.180168554121675, "grad_norm": 5.818070411682129, "learning_rate": 4.099947326836977e-05, "loss": 2.027, "step": 6841 }, { "epoch": 0.18019489070318673, "grad_norm": 1.5625042915344238, "learning_rate": 4.0998156439294186e-05, "loss": 1.9021, "step": 6842 }, { "epoch": 0.18022122728469844, "grad_norm": 1.7875632047653198, "learning_rate": 4.0996839610218595e-05, "loss": 1.9985, "step": 6843 }, { "epoch": 0.18024756386621016, "grad_norm": 6.783160209655762, "learning_rate": 4.099552278114301e-05, "loss": 0.8677, "step": 6844 }, { "epoch": 0.18027390044772187, 
"grad_norm": 2.866274356842041, "learning_rate": 4.099420595206742e-05, "loss": 2.0127, "step": 6845 }, { "epoch": 0.18030023702923362, "grad_norm": 2.130305767059326, "learning_rate": 4.099288912299184e-05, "loss": 2.098, "step": 6846 }, { "epoch": 0.18032657361074533, "grad_norm": 1.7019449472427368, "learning_rate": 4.099157229391625e-05, "loss": 1.7701, "step": 6847 }, { "epoch": 0.18035291019225705, "grad_norm": 3.656829595565796, "learning_rate": 4.0990255464840666e-05, "loss": 1.7441, "step": 6848 }, { "epoch": 0.18037924677376876, "grad_norm": 3.3117520809173584, "learning_rate": 4.0988938635765075e-05, "loss": 0.6602, "step": 6849 }, { "epoch": 0.18040558335528048, "grad_norm": 3.709299325942993, "learning_rate": 4.098762180668949e-05, "loss": 0.6624, "step": 6850 }, { "epoch": 0.1804319199367922, "grad_norm": 1.9445096254348755, "learning_rate": 4.0986304977613906e-05, "loss": 1.696, "step": 6851 }, { "epoch": 0.1804582565183039, "grad_norm": 1.9931341409683228, "learning_rate": 4.098498814853832e-05, "loss": 1.5443, "step": 6852 }, { "epoch": 0.18048459309981565, "grad_norm": 3.580620765686035, "learning_rate": 4.098367131946274e-05, "loss": 1.9921, "step": 6853 }, { "epoch": 0.18051092968132737, "grad_norm": 2.829792022705078, "learning_rate": 4.0982354490387146e-05, "loss": 2.0868, "step": 6854 }, { "epoch": 0.18053726626283909, "grad_norm": 1.9883331060409546, "learning_rate": 4.098103766131156e-05, "loss": 1.8577, "step": 6855 }, { "epoch": 0.1805636028443508, "grad_norm": 2.3210837841033936, "learning_rate": 4.097972083223598e-05, "loss": 1.7687, "step": 6856 }, { "epoch": 0.18058993942586252, "grad_norm": 1.977243185043335, "learning_rate": 4.097840400316039e-05, "loss": 1.9435, "step": 6857 }, { "epoch": 0.18061627600737423, "grad_norm": 2.4407622814178467, "learning_rate": 4.09770871740848e-05, "loss": 2.161, "step": 6858 }, { "epoch": 0.18064261258888598, "grad_norm": 1.699407696723938, "learning_rate": 4.097577034500922e-05, "loss": 1.2125, 
"step": 6859 }, { "epoch": 0.1806689491703977, "grad_norm": 1.610184669494629, "learning_rate": 4.097445351593363e-05, "loss": 1.8904, "step": 6860 }, { "epoch": 0.1806952857519094, "grad_norm": 1.7996221780776978, "learning_rate": 4.097313668685805e-05, "loss": 1.5021, "step": 6861 }, { "epoch": 0.18072162233342112, "grad_norm": 2.915076732635498, "learning_rate": 4.0971819857782464e-05, "loss": 2.0225, "step": 6862 }, { "epoch": 0.18074795891493284, "grad_norm": 1.4015133380889893, "learning_rate": 4.097050302870687e-05, "loss": 1.8607, "step": 6863 }, { "epoch": 0.18077429549644455, "grad_norm": 2.6054396629333496, "learning_rate": 4.096918619963129e-05, "loss": 0.9116, "step": 6864 }, { "epoch": 0.18080063207795627, "grad_norm": 1.9150731563568115, "learning_rate": 4.0967869370555704e-05, "loss": 1.885, "step": 6865 }, { "epoch": 0.180826968659468, "grad_norm": 2.529771327972412, "learning_rate": 4.096655254148012e-05, "loss": 1.9681, "step": 6866 }, { "epoch": 0.18085330524097973, "grad_norm": 2.0468051433563232, "learning_rate": 4.096523571240453e-05, "loss": 1.6722, "step": 6867 }, { "epoch": 0.18087964182249144, "grad_norm": 1.804702639579773, "learning_rate": 4.0963918883328944e-05, "loss": 1.9368, "step": 6868 }, { "epoch": 0.18090597840400316, "grad_norm": 1.5351810455322266, "learning_rate": 4.096260205425336e-05, "loss": 2.055, "step": 6869 }, { "epoch": 0.18093231498551487, "grad_norm": 1.6544311046600342, "learning_rate": 4.0961285225177776e-05, "loss": 1.7408, "step": 6870 }, { "epoch": 0.1809586515670266, "grad_norm": 2.6199283599853516, "learning_rate": 4.095996839610219e-05, "loss": 1.8655, "step": 6871 }, { "epoch": 0.1809849881485383, "grad_norm": 1.7403453588485718, "learning_rate": 4.09586515670266e-05, "loss": 2.0365, "step": 6872 }, { "epoch": 0.18101132473005005, "grad_norm": 2.176819086074829, "learning_rate": 4.0957334737951016e-05, "loss": 0.8515, "step": 6873 }, { "epoch": 0.18103766131156176, "grad_norm": 1.74526846408844, 
"learning_rate": 4.095601790887543e-05, "loss": 1.6314, "step": 6874 }, { "epoch": 0.18106399789307348, "grad_norm": 2.622490406036377, "learning_rate": 4.095470107979985e-05, "loss": 2.0877, "step": 6875 }, { "epoch": 0.1810903344745852, "grad_norm": 2.2714836597442627, "learning_rate": 4.0953384250724256e-05, "loss": 2.3627, "step": 6876 }, { "epoch": 0.1811166710560969, "grad_norm": 2.257474422454834, "learning_rate": 4.095206742164867e-05, "loss": 2.0761, "step": 6877 }, { "epoch": 0.18114300763760863, "grad_norm": 1.8922864198684692, "learning_rate": 4.095075059257309e-05, "loss": 2.2427, "step": 6878 }, { "epoch": 0.18116934421912037, "grad_norm": 3.556945323944092, "learning_rate": 4.09494337634975e-05, "loss": 1.3572, "step": 6879 }, { "epoch": 0.1811956808006321, "grad_norm": 3.8870959281921387, "learning_rate": 4.094811693442192e-05, "loss": 0.7372, "step": 6880 }, { "epoch": 0.1812220173821438, "grad_norm": 2.645507335662842, "learning_rate": 4.094680010534633e-05, "loss": 1.5219, "step": 6881 }, { "epoch": 0.18124835396365552, "grad_norm": 2.594209909439087, "learning_rate": 4.094548327627074e-05, "loss": 1.1077, "step": 6882 }, { "epoch": 0.18127469054516723, "grad_norm": 2.21266508102417, "learning_rate": 4.094416644719515e-05, "loss": 1.7842, "step": 6883 }, { "epoch": 0.18130102712667895, "grad_norm": 3.9246649742126465, "learning_rate": 4.0942849618119574e-05, "loss": 0.9665, "step": 6884 }, { "epoch": 0.18132736370819066, "grad_norm": 2.080536365509033, "learning_rate": 4.094153278904398e-05, "loss": 2.0872, "step": 6885 }, { "epoch": 0.1813537002897024, "grad_norm": 2.383018970489502, "learning_rate": 4.09402159599684e-05, "loss": 1.2582, "step": 6886 }, { "epoch": 0.18138003687121412, "grad_norm": 2.284088373184204, "learning_rate": 4.0938899130892814e-05, "loss": 2.1995, "step": 6887 }, { "epoch": 0.18140637345272584, "grad_norm": 2.257924795150757, "learning_rate": 4.093758230181722e-05, "loss": 1.7252, "step": 6888 }, { "epoch": 
0.18143271003423755, "grad_norm": 2.044666290283203, "learning_rate": 4.0936265472741645e-05, "loss": 1.9257, "step": 6889 }, { "epoch": 0.18145904661574927, "grad_norm": 3.2120275497436523, "learning_rate": 4.0934948643666054e-05, "loss": 1.5806, "step": 6890 }, { "epoch": 0.18148538319726099, "grad_norm": 2.44638729095459, "learning_rate": 4.093363181459047e-05, "loss": 2.1068, "step": 6891 }, { "epoch": 0.18151171977877273, "grad_norm": 2.268062114715576, "learning_rate": 4.093231498551488e-05, "loss": 1.4517, "step": 6892 }, { "epoch": 0.18153805636028444, "grad_norm": 1.687217354774475, "learning_rate": 4.09309981564393e-05, "loss": 1.7188, "step": 6893 }, { "epoch": 0.18156439294179616, "grad_norm": 2.019071340560913, "learning_rate": 4.092968132736371e-05, "loss": 2.3121, "step": 6894 }, { "epoch": 0.18159072952330788, "grad_norm": 3.3610973358154297, "learning_rate": 4.0928364498288125e-05, "loss": 1.707, "step": 6895 }, { "epoch": 0.1816170661048196, "grad_norm": 2.2466182708740234, "learning_rate": 4.0927047669212534e-05, "loss": 1.1798, "step": 6896 }, { "epoch": 0.1816434026863313, "grad_norm": 1.7998205423355103, "learning_rate": 4.092573084013695e-05, "loss": 1.4341, "step": 6897 }, { "epoch": 0.18166973926784302, "grad_norm": 3.8328516483306885, "learning_rate": 4.0924414011061365e-05, "loss": 1.0946, "step": 6898 }, { "epoch": 0.18169607584935477, "grad_norm": 3.4311389923095703, "learning_rate": 4.092309718198578e-05, "loss": 0.8161, "step": 6899 }, { "epoch": 0.18172241243086648, "grad_norm": 5.416226387023926, "learning_rate": 4.0921780352910196e-05, "loss": 2.0961, "step": 6900 }, { "epoch": 0.1817487490123782, "grad_norm": 3.2724146842956543, "learning_rate": 4.0920463523834605e-05, "loss": 0.7855, "step": 6901 }, { "epoch": 0.1817750855938899, "grad_norm": 2.7790586948394775, "learning_rate": 4.091914669475902e-05, "loss": 0.9176, "step": 6902 }, { "epoch": 0.18180142217540163, "grad_norm": 1.815251111984253, "learning_rate": 
4.0917829865683436e-05, "loss": 1.9013, "step": 6903 }, { "epoch": 0.18182775875691334, "grad_norm": 2.084998369216919, "learning_rate": 4.091651303660785e-05, "loss": 1.6303, "step": 6904 }, { "epoch": 0.18185409533842506, "grad_norm": 2.2930824756622314, "learning_rate": 4.091519620753226e-05, "loss": 1.759, "step": 6905 }, { "epoch": 0.1818804319199368, "grad_norm": 3.24057936668396, "learning_rate": 4.0913879378456676e-05, "loss": 1.1276, "step": 6906 }, { "epoch": 0.18190676850144852, "grad_norm": 2.000854969024658, "learning_rate": 4.091256254938109e-05, "loss": 1.8307, "step": 6907 }, { "epoch": 0.18193310508296023, "grad_norm": 2.2538535594940186, "learning_rate": 4.091124572030551e-05, "loss": 1.882, "step": 6908 }, { "epoch": 0.18195944166447195, "grad_norm": 2.1586172580718994, "learning_rate": 4.090992889122992e-05, "loss": 1.7585, "step": 6909 }, { "epoch": 0.18198577824598366, "grad_norm": 6.436657905578613, "learning_rate": 4.090861206215433e-05, "loss": 0.8941, "step": 6910 }, { "epoch": 0.18201211482749538, "grad_norm": 2.8224778175354004, "learning_rate": 4.090729523307875e-05, "loss": 1.6596, "step": 6911 }, { "epoch": 0.18203845140900712, "grad_norm": 1.6180421113967896, "learning_rate": 4.090597840400316e-05, "loss": 1.8923, "step": 6912 }, { "epoch": 0.18206478799051884, "grad_norm": 4.265376567840576, "learning_rate": 4.090466157492758e-05, "loss": 1.3665, "step": 6913 }, { "epoch": 0.18209112457203055, "grad_norm": 3.642742872238159, "learning_rate": 4.090334474585199e-05, "loss": 1.4445, "step": 6914 }, { "epoch": 0.18211746115354227, "grad_norm": 1.9837453365325928, "learning_rate": 4.09020279167764e-05, "loss": 1.9368, "step": 6915 }, { "epoch": 0.18214379773505399, "grad_norm": 1.541934609413147, "learning_rate": 4.090071108770082e-05, "loss": 1.5826, "step": 6916 }, { "epoch": 0.1821701343165657, "grad_norm": 1.923579216003418, "learning_rate": 4.0899394258625234e-05, "loss": 0.3904, "step": 6917 }, { "epoch": 0.18219647089807742, 
"grad_norm": 8.596114158630371, "learning_rate": 4.089807742954965e-05, "loss": 1.4123, "step": 6918 }, { "epoch": 0.18222280747958916, "grad_norm": 2.7523770332336426, "learning_rate": 4.089676060047406e-05, "loss": 0.5204, "step": 6919 }, { "epoch": 0.18224914406110088, "grad_norm": 4.1075758934021, "learning_rate": 4.0895443771398474e-05, "loss": 1.5215, "step": 6920 }, { "epoch": 0.1822754806426126, "grad_norm": 2.7514095306396484, "learning_rate": 4.089412694232289e-05, "loss": 0.6175, "step": 6921 }, { "epoch": 0.1823018172241243, "grad_norm": 1.769418478012085, "learning_rate": 4.0892810113247305e-05, "loss": 1.6328, "step": 6922 }, { "epoch": 0.18232815380563602, "grad_norm": 1.9333324432373047, "learning_rate": 4.0891493284171714e-05, "loss": 1.6509, "step": 6923 }, { "epoch": 0.18235449038714774, "grad_norm": 2.719660520553589, "learning_rate": 4.089017645509613e-05, "loss": 1.228, "step": 6924 }, { "epoch": 0.18238082696865948, "grad_norm": 2.9050986766815186, "learning_rate": 4.0888859626020545e-05, "loss": 1.1917, "step": 6925 }, { "epoch": 0.1824071635501712, "grad_norm": 3.8585193157196045, "learning_rate": 4.088754279694496e-05, "loss": 1.911, "step": 6926 }, { "epoch": 0.1824335001316829, "grad_norm": 2.263455390930176, "learning_rate": 4.088622596786938e-05, "loss": 0.5623, "step": 6927 }, { "epoch": 0.18245983671319463, "grad_norm": 1.7645083665847778, "learning_rate": 4.0884909138793785e-05, "loss": 2.1578, "step": 6928 }, { "epoch": 0.18248617329470634, "grad_norm": 1.7438981533050537, "learning_rate": 4.08835923097182e-05, "loss": 1.2436, "step": 6929 }, { "epoch": 0.18251250987621806, "grad_norm": 2.469877243041992, "learning_rate": 4.088227548064261e-05, "loss": 0.9466, "step": 6930 }, { "epoch": 0.18253884645772978, "grad_norm": 4.381165504455566, "learning_rate": 4.088095865156703e-05, "loss": 0.8198, "step": 6931 }, { "epoch": 0.18256518303924152, "grad_norm": 2.3040764331817627, "learning_rate": 4.087964182249144e-05, "loss": 1.5509, 
"step": 6932 }, { "epoch": 0.18259151962075323, "grad_norm": 2.239445447921753, "learning_rate": 4.087832499341586e-05, "loss": 2.2845, "step": 6933 }, { "epoch": 0.18261785620226495, "grad_norm": 1.7413301467895508, "learning_rate": 4.087700816434027e-05, "loss": 1.9569, "step": 6934 }, { "epoch": 0.18264419278377667, "grad_norm": 3.0984885692596436, "learning_rate": 4.087569133526468e-05, "loss": 1.7785, "step": 6935 }, { "epoch": 0.18267052936528838, "grad_norm": 2.176159381866455, "learning_rate": 4.0874374506189103e-05, "loss": 1.3328, "step": 6936 }, { "epoch": 0.1826968659468001, "grad_norm": 2.961353063583374, "learning_rate": 4.087305767711351e-05, "loss": 1.5821, "step": 6937 }, { "epoch": 0.1827232025283118, "grad_norm": 2.226867914199829, "learning_rate": 4.087174084803793e-05, "loss": 1.7769, "step": 6938 }, { "epoch": 0.18274953910982356, "grad_norm": 1.9836151599884033, "learning_rate": 4.087042401896234e-05, "loss": 1.9426, "step": 6939 }, { "epoch": 0.18277587569133527, "grad_norm": 2.933189630508423, "learning_rate": 4.086910718988676e-05, "loss": 1.5772, "step": 6940 }, { "epoch": 0.182802212272847, "grad_norm": 3.351119041442871, "learning_rate": 4.086779036081117e-05, "loss": 1.6856, "step": 6941 }, { "epoch": 0.1828285488543587, "grad_norm": 1.6739122867584229, "learning_rate": 4.0866473531735584e-05, "loss": 1.589, "step": 6942 }, { "epoch": 0.18285488543587042, "grad_norm": 2.4155874252319336, "learning_rate": 4.086515670266e-05, "loss": 1.7412, "step": 6943 }, { "epoch": 0.18288122201738213, "grad_norm": 4.7902398109436035, "learning_rate": 4.086383987358441e-05, "loss": 1.905, "step": 6944 }, { "epoch": 0.18290755859889388, "grad_norm": 3.0446271896362305, "learning_rate": 4.086252304450883e-05, "loss": 1.8619, "step": 6945 }, { "epoch": 0.1829338951804056, "grad_norm": 3.9771223068237305, "learning_rate": 4.086120621543324e-05, "loss": 1.6439, "step": 6946 }, { "epoch": 0.1829602317619173, "grad_norm": 2.1452763080596924, "learning_rate": 
4.0859889386357655e-05, "loss": 1.3718, "step": 6947 }, { "epoch": 0.18298656834342902, "grad_norm": 2.008652448654175, "learning_rate": 4.0858572557282064e-05, "loss": 1.7454, "step": 6948 }, { "epoch": 0.18301290492494074, "grad_norm": 1.8076740503311157, "learning_rate": 4.085725572820648e-05, "loss": 2.0562, "step": 6949 }, { "epoch": 0.18303924150645245, "grad_norm": 1.9337666034698486, "learning_rate": 4.0855938899130895e-05, "loss": 1.8595, "step": 6950 }, { "epoch": 0.18306557808796417, "grad_norm": 1.9141703844070435, "learning_rate": 4.085462207005531e-05, "loss": 1.742, "step": 6951 }, { "epoch": 0.1830919146694759, "grad_norm": 2.366124153137207, "learning_rate": 4.085330524097972e-05, "loss": 1.3431, "step": 6952 }, { "epoch": 0.18311825125098763, "grad_norm": 1.792555332183838, "learning_rate": 4.0851988411904135e-05, "loss": 1.6651, "step": 6953 }, { "epoch": 0.18314458783249934, "grad_norm": 2.1594085693359375, "learning_rate": 4.085067158282855e-05, "loss": 2.1063, "step": 6954 }, { "epoch": 0.18317092441401106, "grad_norm": 3.452294111251831, "learning_rate": 4.0849354753752966e-05, "loss": 2.1025, "step": 6955 }, { "epoch": 0.18319726099552278, "grad_norm": 2.360137701034546, "learning_rate": 4.084803792467738e-05, "loss": 2.1452, "step": 6956 }, { "epoch": 0.1832235975770345, "grad_norm": 1.6228358745574951, "learning_rate": 4.084672109560179e-05, "loss": 2.2041, "step": 6957 }, { "epoch": 0.1832499341585462, "grad_norm": 2.022235155105591, "learning_rate": 4.0845404266526206e-05, "loss": 1.5295, "step": 6958 }, { "epoch": 0.18327627074005795, "grad_norm": 2.2516980171203613, "learning_rate": 4.084408743745062e-05, "loss": 1.9317, "step": 6959 }, { "epoch": 0.18330260732156967, "grad_norm": 1.9469623565673828, "learning_rate": 4.084277060837504e-05, "loss": 1.7795, "step": 6960 }, { "epoch": 0.18332894390308138, "grad_norm": 2.279630422592163, "learning_rate": 4.0841453779299446e-05, "loss": 1.7714, "step": 6961 }, { "epoch": 0.1833552804845931, 
"grad_norm": 2.4443295001983643, "learning_rate": 4.084013695022386e-05, "loss": 1.9419, "step": 6962 }, { "epoch": 0.1833816170661048, "grad_norm": 1.7123942375183105, "learning_rate": 4.083882012114828e-05, "loss": 1.8043, "step": 6963 }, { "epoch": 0.18340795364761653, "grad_norm": 4.375567436218262, "learning_rate": 4.083750329207269e-05, "loss": 0.9541, "step": 6964 }, { "epoch": 0.18343429022912827, "grad_norm": 4.862067699432373, "learning_rate": 4.083618646299711e-05, "loss": 1.048, "step": 6965 }, { "epoch": 0.18346062681064, "grad_norm": 2.5221095085144043, "learning_rate": 4.083486963392152e-05, "loss": 1.0404, "step": 6966 }, { "epoch": 0.1834869633921517, "grad_norm": 5.673100471496582, "learning_rate": 4.083355280484593e-05, "loss": 1.7307, "step": 6967 }, { "epoch": 0.18351329997366342, "grad_norm": 3.7432143688201904, "learning_rate": 4.083223597577034e-05, "loss": 1.4658, "step": 6968 }, { "epoch": 0.18353963655517513, "grad_norm": 1.9975262880325317, "learning_rate": 4.0830919146694764e-05, "loss": 1.8376, "step": 6969 }, { "epoch": 0.18356597313668685, "grad_norm": 3.3046092987060547, "learning_rate": 4.082960231761917e-05, "loss": 0.9348, "step": 6970 }, { "epoch": 0.18359230971819857, "grad_norm": 1.9514634609222412, "learning_rate": 4.082828548854359e-05, "loss": 1.5875, "step": 6971 }, { "epoch": 0.1836186462997103, "grad_norm": 3.79278826713562, "learning_rate": 4.0826968659468004e-05, "loss": 1.9848, "step": 6972 }, { "epoch": 0.18364498288122202, "grad_norm": 3.3770925998687744, "learning_rate": 4.082565183039242e-05, "loss": 1.7688, "step": 6973 }, { "epoch": 0.18367131946273374, "grad_norm": 3.727830171585083, "learning_rate": 4.0824335001316835e-05, "loss": 1.7535, "step": 6974 }, { "epoch": 0.18369765604424546, "grad_norm": 2.100135326385498, "learning_rate": 4.0823018172241244e-05, "loss": 2.1544, "step": 6975 }, { "epoch": 0.18372399262575717, "grad_norm": 2.4139041900634766, "learning_rate": 4.082170134316566e-05, "loss": 1.4866, 
"step": 6976 }, { "epoch": 0.1837503292072689, "grad_norm": 1.6062692403793335, "learning_rate": 4.082038451409007e-05, "loss": 1.7361, "step": 6977 }, { "epoch": 0.18377666578878063, "grad_norm": 2.123335361480713, "learning_rate": 4.081906768501449e-05, "loss": 1.6579, "step": 6978 }, { "epoch": 0.18380300237029235, "grad_norm": 2.155022382736206, "learning_rate": 4.08177508559389e-05, "loss": 2.8441, "step": 6979 }, { "epoch": 0.18382933895180406, "grad_norm": 4.40614652633667, "learning_rate": 4.0816434026863315e-05, "loss": 1.5018, "step": 6980 }, { "epoch": 0.18385567553331578, "grad_norm": 2.3432910442352295, "learning_rate": 4.081511719778773e-05, "loss": 1.9595, "step": 6981 }, { "epoch": 0.1838820121148275, "grad_norm": 2.8105359077453613, "learning_rate": 4.081380036871214e-05, "loss": 1.8624, "step": 6982 }, { "epoch": 0.1839083486963392, "grad_norm": 8.302210807800293, "learning_rate": 4.081248353963656e-05, "loss": 1.3675, "step": 6983 }, { "epoch": 0.18393468527785092, "grad_norm": 2.0491950511932373, "learning_rate": 4.081116671056097e-05, "loss": 2.1136, "step": 6984 }, { "epoch": 0.18396102185936267, "grad_norm": 4.874288558959961, "learning_rate": 4.0809849881485386e-05, "loss": 1.4397, "step": 6985 }, { "epoch": 0.18398735844087438, "grad_norm": 2.5536129474639893, "learning_rate": 4.0808533052409795e-05, "loss": 1.7797, "step": 6986 }, { "epoch": 0.1840136950223861, "grad_norm": 1.8840323686599731, "learning_rate": 4.080721622333422e-05, "loss": 2.4845, "step": 6987 }, { "epoch": 0.1840400316038978, "grad_norm": 2.0048911571502686, "learning_rate": 4.0805899394258626e-05, "loss": 1.5175, "step": 6988 }, { "epoch": 0.18406636818540953, "grad_norm": 2.539696455001831, "learning_rate": 4.080458256518304e-05, "loss": 1.9548, "step": 6989 }, { "epoch": 0.18409270476692124, "grad_norm": 1.951242446899414, "learning_rate": 4.080326573610746e-05, "loss": 1.3304, "step": 6990 }, { "epoch": 0.18411904134843296, "grad_norm": 4.876642227172852, 
"learning_rate": 4.0801948907031866e-05, "loss": 1.3783, "step": 6991 }, { "epoch": 0.1841453779299447, "grad_norm": 2.077089548110962, "learning_rate": 4.080063207795629e-05, "loss": 1.0857, "step": 6992 }, { "epoch": 0.18417171451145642, "grad_norm": 1.7162292003631592, "learning_rate": 4.07993152488807e-05, "loss": 1.4171, "step": 6993 }, { "epoch": 0.18419805109296813, "grad_norm": 2.0253751277923584, "learning_rate": 4.079799841980511e-05, "loss": 1.9105, "step": 6994 }, { "epoch": 0.18422438767447985, "grad_norm": 3.2778470516204834, "learning_rate": 4.079668159072952e-05, "loss": 1.6272, "step": 6995 }, { "epoch": 0.18425072425599157, "grad_norm": 2.9602034091949463, "learning_rate": 4.079536476165394e-05, "loss": 1.7009, "step": 6996 }, { "epoch": 0.18427706083750328, "grad_norm": 3.074713706970215, "learning_rate": 4.079404793257835e-05, "loss": 2.0371, "step": 6997 }, { "epoch": 0.18430339741901502, "grad_norm": 2.6605823040008545, "learning_rate": 4.079273110350277e-05, "loss": 2.234, "step": 6998 }, { "epoch": 0.18432973400052674, "grad_norm": 2.15189790725708, "learning_rate": 4.079141427442718e-05, "loss": 2.2861, "step": 6999 }, { "epoch": 0.18435607058203846, "grad_norm": 2.8215811252593994, "learning_rate": 4.079009744535159e-05, "loss": 2.0102, "step": 7000 }, { "epoch": 0.18438240716355017, "grad_norm": 1.8273508548736572, "learning_rate": 4.078878061627601e-05, "loss": 1.8234, "step": 7001 }, { "epoch": 0.1844087437450619, "grad_norm": 1.7942824363708496, "learning_rate": 4.0787463787200425e-05, "loss": 1.76, "step": 7002 }, { "epoch": 0.1844350803265736, "grad_norm": 2.399251937866211, "learning_rate": 4.078614695812484e-05, "loss": 2.5975, "step": 7003 }, { "epoch": 0.18446141690808532, "grad_norm": 2.5136725902557373, "learning_rate": 4.078483012904925e-05, "loss": 2.2768, "step": 7004 }, { "epoch": 0.18448775348959706, "grad_norm": 1.8775962591171265, "learning_rate": 4.0783513299973665e-05, "loss": 1.6603, "step": 7005 }, { "epoch": 
0.18451409007110878, "grad_norm": 2.997371196746826, "learning_rate": 4.078219647089808e-05, "loss": 1.7046, "step": 7006 }, { "epoch": 0.1845404266526205, "grad_norm": 2.1706979274749756, "learning_rate": 4.0780879641822496e-05, "loss": 1.9254, "step": 7007 }, { "epoch": 0.1845667632341322, "grad_norm": 2.203439712524414, "learning_rate": 4.0779562812746905e-05, "loss": 2.3046, "step": 7008 }, { "epoch": 0.18459309981564392, "grad_norm": 2.1381664276123047, "learning_rate": 4.077824598367132e-05, "loss": 1.7123, "step": 7009 }, { "epoch": 0.18461943639715564, "grad_norm": 1.6301268339157104, "learning_rate": 4.0776929154595736e-05, "loss": 2.0264, "step": 7010 }, { "epoch": 0.18464577297866736, "grad_norm": 1.6790343523025513, "learning_rate": 4.077561232552015e-05, "loss": 1.3366, "step": 7011 }, { "epoch": 0.1846721095601791, "grad_norm": 1.6542704105377197, "learning_rate": 4.077429549644457e-05, "loss": 1.8495, "step": 7012 }, { "epoch": 0.18469844614169081, "grad_norm": 2.584717273712158, "learning_rate": 4.0772978667368976e-05, "loss": 1.6227, "step": 7013 }, { "epoch": 0.18472478272320253, "grad_norm": 2.4365644454956055, "learning_rate": 4.077166183829339e-05, "loss": 1.7179, "step": 7014 }, { "epoch": 0.18475111930471425, "grad_norm": 1.7049157619476318, "learning_rate": 4.07703450092178e-05, "loss": 1.6792, "step": 7015 }, { "epoch": 0.18477745588622596, "grad_norm": 2.7974448204040527, "learning_rate": 4.076902818014222e-05, "loss": 1.6315, "step": 7016 }, { "epoch": 0.18480379246773768, "grad_norm": 4.127683162689209, "learning_rate": 4.076771135106663e-05, "loss": 1.3024, "step": 7017 }, { "epoch": 0.18483012904924942, "grad_norm": 2.4587247371673584, "learning_rate": 4.076639452199105e-05, "loss": 1.4815, "step": 7018 }, { "epoch": 0.18485646563076114, "grad_norm": 1.9659017324447632, "learning_rate": 4.076507769291546e-05, "loss": 1.781, "step": 7019 }, { "epoch": 0.18488280221227285, "grad_norm": 1.780299425125122, "learning_rate": 
4.076376086383988e-05, "loss": 1.739, "step": 7020 }, { "epoch": 0.18490913879378457, "grad_norm": 4.755344867706299, "learning_rate": 4.0762444034764294e-05, "loss": 0.8956, "step": 7021 }, { "epoch": 0.18493547537529628, "grad_norm": 1.8263736963272095, "learning_rate": 4.07611272056887e-05, "loss": 1.7942, "step": 7022 }, { "epoch": 0.184961811956808, "grad_norm": 2.6218724250793457, "learning_rate": 4.075981037661312e-05, "loss": 1.5062, "step": 7023 }, { "epoch": 0.1849881485383197, "grad_norm": 1.7783833742141724, "learning_rate": 4.075849354753753e-05, "loss": 1.6431, "step": 7024 }, { "epoch": 0.18501448511983146, "grad_norm": 2.4616410732269287, "learning_rate": 4.075717671846195e-05, "loss": 1.6206, "step": 7025 }, { "epoch": 0.18504082170134317, "grad_norm": 2.2635910511016846, "learning_rate": 4.075585988938636e-05, "loss": 1.5831, "step": 7026 }, { "epoch": 0.1850671582828549, "grad_norm": 2.142585277557373, "learning_rate": 4.0754543060310774e-05, "loss": 2.6223, "step": 7027 }, { "epoch": 0.1850934948643666, "grad_norm": 2.5691113471984863, "learning_rate": 4.075322623123519e-05, "loss": 2.0103, "step": 7028 }, { "epoch": 0.18511983144587832, "grad_norm": 1.8876320123672485, "learning_rate": 4.07519094021596e-05, "loss": 2.0573, "step": 7029 }, { "epoch": 0.18514616802739003, "grad_norm": 3.4261367321014404, "learning_rate": 4.075059257308402e-05, "loss": 1.104, "step": 7030 }, { "epoch": 0.18517250460890178, "grad_norm": 2.8919003009796143, "learning_rate": 4.074927574400843e-05, "loss": 2.3249, "step": 7031 }, { "epoch": 0.1851988411904135, "grad_norm": 2.020681619644165, "learning_rate": 4.0747958914932845e-05, "loss": 1.8441, "step": 7032 }, { "epoch": 0.1852251777719252, "grad_norm": 3.334242343902588, "learning_rate": 4.0746642085857254e-05, "loss": 1.6714, "step": 7033 }, { "epoch": 0.18525151435343692, "grad_norm": 2.288557529449463, "learning_rate": 4.074532525678167e-05, "loss": 1.7518, "step": 7034 }, { "epoch": 0.18527785093494864, 
"grad_norm": 2.5938003063201904, "learning_rate": 4.0744008427706085e-05, "loss": 0.2539, "step": 7035 }, { "epoch": 0.18530418751646036, "grad_norm": 2.171165704727173, "learning_rate": 4.07426915986305e-05, "loss": 1.9121, "step": 7036 }, { "epoch": 0.18533052409797207, "grad_norm": 2.739981174468994, "learning_rate": 4.0741374769554916e-05, "loss": 0.791, "step": 7037 }, { "epoch": 0.18535686067948381, "grad_norm": 2.657623291015625, "learning_rate": 4.0740057940479325e-05, "loss": 2.4257, "step": 7038 }, { "epoch": 0.18538319726099553, "grad_norm": 1.6616113185882568, "learning_rate": 4.073874111140375e-05, "loss": 1.9823, "step": 7039 }, { "epoch": 0.18540953384250725, "grad_norm": 4.093944072723389, "learning_rate": 4.0737424282328156e-05, "loss": 2.1575, "step": 7040 }, { "epoch": 0.18543587042401896, "grad_norm": 4.127021789550781, "learning_rate": 4.073610745325257e-05, "loss": 1.1244, "step": 7041 }, { "epoch": 0.18546220700553068, "grad_norm": 1.7074508666992188, "learning_rate": 4.073479062417698e-05, "loss": 0.444, "step": 7042 }, { "epoch": 0.1854885435870424, "grad_norm": 1.7890299558639526, "learning_rate": 4.0733473795101396e-05, "loss": 2.3078, "step": 7043 }, { "epoch": 0.1855148801685541, "grad_norm": 1.825681447982788, "learning_rate": 4.073215696602581e-05, "loss": 2.0246, "step": 7044 }, { "epoch": 0.18554121675006585, "grad_norm": 2.5056962966918945, "learning_rate": 4.073084013695023e-05, "loss": 0.8313, "step": 7045 }, { "epoch": 0.18556755333157757, "grad_norm": 1.8695944547653198, "learning_rate": 4.0729523307874636e-05, "loss": 1.9256, "step": 7046 }, { "epoch": 0.18559388991308928, "grad_norm": 2.4649107456207275, "learning_rate": 4.072820647879905e-05, "loss": 1.9475, "step": 7047 }, { "epoch": 0.185620226494601, "grad_norm": 6.007030487060547, "learning_rate": 4.072688964972347e-05, "loss": 2.2265, "step": 7048 }, { "epoch": 0.18564656307611271, "grad_norm": 2.180332899093628, "learning_rate": 4.072557282064788e-05, "loss": 2.2295, 
"step": 7049 }, { "epoch": 0.18567289965762443, "grad_norm": 2.95991849899292, "learning_rate": 4.07242559915723e-05, "loss": 1.4116, "step": 7050 }, { "epoch": 0.18569923623913617, "grad_norm": 1.8640722036361694, "learning_rate": 4.072293916249671e-05, "loss": 1.7615, "step": 7051 }, { "epoch": 0.1857255728206479, "grad_norm": 1.9431788921356201, "learning_rate": 4.072162233342112e-05, "loss": 2.1496, "step": 7052 }, { "epoch": 0.1857519094021596, "grad_norm": 3.0725719928741455, "learning_rate": 4.072030550434554e-05, "loss": 2.0362, "step": 7053 }, { "epoch": 0.18577824598367132, "grad_norm": 1.9439278841018677, "learning_rate": 4.0718988675269954e-05, "loss": 2.0799, "step": 7054 }, { "epoch": 0.18580458256518304, "grad_norm": 4.480274200439453, "learning_rate": 4.071767184619436e-05, "loss": 1.5139, "step": 7055 }, { "epoch": 0.18583091914669475, "grad_norm": 1.6987828016281128, "learning_rate": 4.071635501711878e-05, "loss": 1.7111, "step": 7056 }, { "epoch": 0.18585725572820647, "grad_norm": 2.4827072620391846, "learning_rate": 4.0715038188043194e-05, "loss": 1.8266, "step": 7057 }, { "epoch": 0.1858835923097182, "grad_norm": 2.5901682376861572, "learning_rate": 4.071372135896761e-05, "loss": 2.0084, "step": 7058 }, { "epoch": 0.18590992889122993, "grad_norm": 2.8255746364593506, "learning_rate": 4.0712404529892025e-05, "loss": 1.9873, "step": 7059 }, { "epoch": 0.18593626547274164, "grad_norm": 1.98200261592865, "learning_rate": 4.0711087700816434e-05, "loss": 2.0188, "step": 7060 }, { "epoch": 0.18596260205425336, "grad_norm": 2.1183969974517822, "learning_rate": 4.070977087174085e-05, "loss": 1.9547, "step": 7061 }, { "epoch": 0.18598893863576507, "grad_norm": 2.6942882537841797, "learning_rate": 4.070845404266526e-05, "loss": 2.0844, "step": 7062 }, { "epoch": 0.1860152752172768, "grad_norm": 2.1109118461608887, "learning_rate": 4.070713721358968e-05, "loss": 1.7476, "step": 7063 }, { "epoch": 0.18604161179878853, "grad_norm": 1.9203718900680542, 
"learning_rate": 4.070582038451409e-05, "loss": 1.9132, "step": 7064 }, { "epoch": 0.18606794838030025, "grad_norm": 2.0590131282806396, "learning_rate": 4.0704503555438506e-05, "loss": 1.8812, "step": 7065 }, { "epoch": 0.18609428496181196, "grad_norm": 2.406639575958252, "learning_rate": 4.070318672636292e-05, "loss": 1.9138, "step": 7066 }, { "epoch": 0.18612062154332368, "grad_norm": 1.6308205127716064, "learning_rate": 4.070186989728733e-05, "loss": 1.5473, "step": 7067 }, { "epoch": 0.1861469581248354, "grad_norm": 2.4697418212890625, "learning_rate": 4.070055306821175e-05, "loss": 1.7348, "step": 7068 }, { "epoch": 0.1861732947063471, "grad_norm": 1.9978829622268677, "learning_rate": 4.069923623913616e-05, "loss": 0.1428, "step": 7069 }, { "epoch": 0.18619963128785882, "grad_norm": 2.2762584686279297, "learning_rate": 4.069791941006058e-05, "loss": 1.7284, "step": 7070 }, { "epoch": 0.18622596786937057, "grad_norm": 1.59394109249115, "learning_rate": 4.0696602580984986e-05, "loss": 1.7665, "step": 7071 }, { "epoch": 0.18625230445088228, "grad_norm": 3.349914789199829, "learning_rate": 4.069528575190941e-05, "loss": 1.4147, "step": 7072 }, { "epoch": 0.186278641032394, "grad_norm": 2.417703151702881, "learning_rate": 4.069396892283382e-05, "loss": 1.507, "step": 7073 }, { "epoch": 0.18630497761390571, "grad_norm": 1.7976462841033936, "learning_rate": 4.069265209375823e-05, "loss": 1.2664, "step": 7074 }, { "epoch": 0.18633131419541743, "grad_norm": 2.6340603828430176, "learning_rate": 4.069133526468265e-05, "loss": 2.2531, "step": 7075 }, { "epoch": 0.18635765077692915, "grad_norm": 1.8017858266830444, "learning_rate": 4.069001843560706e-05, "loss": 1.568, "step": 7076 }, { "epoch": 0.18638398735844086, "grad_norm": 1.8842220306396484, "learning_rate": 4.068870160653148e-05, "loss": 1.9075, "step": 7077 }, { "epoch": 0.1864103239399526, "grad_norm": 1.8736804723739624, "learning_rate": 4.068738477745589e-05, "loss": 2.0031, "step": 7078 }, { "epoch": 
0.18643666052146432, "grad_norm": 2.1913700103759766, "learning_rate": 4.0686067948380304e-05, "loss": 2.2514, "step": 7079 }, { "epoch": 0.18646299710297604, "grad_norm": 2.041095018386841, "learning_rate": 4.068475111930471e-05, "loss": 1.2421, "step": 7080 }, { "epoch": 0.18648933368448775, "grad_norm": 5.4691948890686035, "learning_rate": 4.068343429022913e-05, "loss": 0.9759, "step": 7081 }, { "epoch": 0.18651567026599947, "grad_norm": 2.5302422046661377, "learning_rate": 4.0682117461153544e-05, "loss": 1.8126, "step": 7082 }, { "epoch": 0.18654200684751118, "grad_norm": 2.04630446434021, "learning_rate": 4.068080063207796e-05, "loss": 1.8427, "step": 7083 }, { "epoch": 0.18656834342902293, "grad_norm": 2.717851161956787, "learning_rate": 4.0679483803002375e-05, "loss": 1.6675, "step": 7084 }, { "epoch": 0.18659468001053464, "grad_norm": 1.954132318496704, "learning_rate": 4.0678166973926784e-05, "loss": 1.7673, "step": 7085 }, { "epoch": 0.18662101659204636, "grad_norm": 2.1075642108917236, "learning_rate": 4.0676850144851206e-05, "loss": 2.1142, "step": 7086 }, { "epoch": 0.18664735317355807, "grad_norm": 5.034909725189209, "learning_rate": 4.0675533315775615e-05, "loss": 1.1115, "step": 7087 }, { "epoch": 0.1866736897550698, "grad_norm": 2.039327383041382, "learning_rate": 4.067421648670003e-05, "loss": 0.7899, "step": 7088 }, { "epoch": 0.1867000263365815, "grad_norm": 2.3611059188842773, "learning_rate": 4.067289965762444e-05, "loss": 1.8259, "step": 7089 }, { "epoch": 0.18672636291809322, "grad_norm": 1.709442138671875, "learning_rate": 4.0671582828548855e-05, "loss": 1.6856, "step": 7090 }, { "epoch": 0.18675269949960496, "grad_norm": 3.9820659160614014, "learning_rate": 4.067026599947327e-05, "loss": 1.5713, "step": 7091 }, { "epoch": 0.18677903608111668, "grad_norm": 2.329197406768799, "learning_rate": 4.0668949170397686e-05, "loss": 1.7154, "step": 7092 }, { "epoch": 0.1868053726626284, "grad_norm": 1.8741422891616821, "learning_rate": 
4.06676323413221e-05, "loss": 1.8546, "step": 7093 }, { "epoch": 0.1868317092441401, "grad_norm": 2.687278985977173, "learning_rate": 4.066631551224651e-05, "loss": 1.5314, "step": 7094 }, { "epoch": 0.18685804582565183, "grad_norm": 4.32405424118042, "learning_rate": 4.0664998683170926e-05, "loss": 1.6117, "step": 7095 }, { "epoch": 0.18688438240716354, "grad_norm": 2.0460257530212402, "learning_rate": 4.066368185409534e-05, "loss": 2.1934, "step": 7096 }, { "epoch": 0.18691071898867526, "grad_norm": 1.694000482559204, "learning_rate": 4.066236502501976e-05, "loss": 1.7865, "step": 7097 }, { "epoch": 0.186937055570187, "grad_norm": 5.954615116119385, "learning_rate": 4.0661048195944166e-05, "loss": 2.3845, "step": 7098 }, { "epoch": 0.18696339215169872, "grad_norm": 1.9868425130844116, "learning_rate": 4.065973136686858e-05, "loss": 2.08, "step": 7099 }, { "epoch": 0.18698972873321043, "grad_norm": 2.7583935260772705, "learning_rate": 4.065841453779299e-05, "loss": 2.2414, "step": 7100 }, { "epoch": 0.18701606531472215, "grad_norm": 2.2052414417266846, "learning_rate": 4.065709770871741e-05, "loss": 1.7992, "step": 7101 }, { "epoch": 0.18704240189623386, "grad_norm": 3.4391391277313232, "learning_rate": 4.065578087964182e-05, "loss": 1.6053, "step": 7102 }, { "epoch": 0.18706873847774558, "grad_norm": 3.3688368797302246, "learning_rate": 4.065446405056624e-05, "loss": 1.7838, "step": 7103 }, { "epoch": 0.18709507505925732, "grad_norm": 5.613596439361572, "learning_rate": 4.065314722149065e-05, "loss": 1.586, "step": 7104 }, { "epoch": 0.18712141164076904, "grad_norm": 4.961678504943848, "learning_rate": 4.065183039241507e-05, "loss": 1.9074, "step": 7105 }, { "epoch": 0.18714774822228075, "grad_norm": 2.3860888481140137, "learning_rate": 4.0650513563339484e-05, "loss": 1.4551, "step": 7106 }, { "epoch": 0.18717408480379247, "grad_norm": 2.942716360092163, "learning_rate": 4.064919673426389e-05, "loss": 1.299, "step": 7107 }, { "epoch": 0.18720042138530418, 
"grad_norm": 1.6908869743347168, "learning_rate": 4.064787990518831e-05, "loss": 2.1328, "step": 7108 }, { "epoch": 0.1872267579668159, "grad_norm": 2.9016764163970947, "learning_rate": 4.064656307611272e-05, "loss": 0.5772, "step": 7109 }, { "epoch": 0.18725309454832761, "grad_norm": 2.7314696311950684, "learning_rate": 4.064524624703714e-05, "loss": 1.3018, "step": 7110 }, { "epoch": 0.18727943112983936, "grad_norm": 2.7531847953796387, "learning_rate": 4.064392941796155e-05, "loss": 2.3253, "step": 7111 }, { "epoch": 0.18730576771135107, "grad_norm": 1.993249535560608, "learning_rate": 4.0642612588885964e-05, "loss": 1.3962, "step": 7112 }, { "epoch": 0.1873321042928628, "grad_norm": 2.0955049991607666, "learning_rate": 4.064129575981038e-05, "loss": 1.7677, "step": 7113 }, { "epoch": 0.1873584408743745, "grad_norm": 2.7482011318206787, "learning_rate": 4.063997893073479e-05, "loss": 0.7222, "step": 7114 }, { "epoch": 0.18738477745588622, "grad_norm": 1.6753559112548828, "learning_rate": 4.063866210165921e-05, "loss": 1.9886, "step": 7115 }, { "epoch": 0.18741111403739794, "grad_norm": 1.7713512182235718, "learning_rate": 4.063734527258362e-05, "loss": 1.494, "step": 7116 }, { "epoch": 0.18743745061890968, "grad_norm": 2.061821460723877, "learning_rate": 4.0636028443508035e-05, "loss": 2.0898, "step": 7117 }, { "epoch": 0.1874637872004214, "grad_norm": 1.9059206247329712, "learning_rate": 4.0634711614432444e-05, "loss": 1.7705, "step": 7118 }, { "epoch": 0.1874901237819331, "grad_norm": 2.3998019695281982, "learning_rate": 4.0633394785356866e-05, "loss": 1.7339, "step": 7119 }, { "epoch": 0.18751646036344483, "grad_norm": 1.636765480041504, "learning_rate": 4.0632077956281275e-05, "loss": 2.3229, "step": 7120 }, { "epoch": 0.18754279694495654, "grad_norm": 1.693140983581543, "learning_rate": 4.063076112720569e-05, "loss": 1.5128, "step": 7121 }, { "epoch": 0.18756913352646826, "grad_norm": 3.616184949874878, "learning_rate": 4.0629444298130107e-05, "loss": 
1.3153, "step": 7122 }, { "epoch": 0.18759547010797997, "grad_norm": 1.8281915187835693, "learning_rate": 4.0628127469054515e-05, "loss": 1.9864, "step": 7123 }, { "epoch": 0.18762180668949172, "grad_norm": 2.0416295528411865, "learning_rate": 4.062681063997894e-05, "loss": 2.1649, "step": 7124 }, { "epoch": 0.18764814327100343, "grad_norm": 3.8911514282226562, "learning_rate": 4.0625493810903347e-05, "loss": 1.2026, "step": 7125 }, { "epoch": 0.18767447985251515, "grad_norm": 1.6992837190628052, "learning_rate": 4.062417698182776e-05, "loss": 1.51, "step": 7126 }, { "epoch": 0.18770081643402686, "grad_norm": 2.820889711380005, "learning_rate": 4.062286015275217e-05, "loss": 1.893, "step": 7127 }, { "epoch": 0.18772715301553858, "grad_norm": 2.89487361907959, "learning_rate": 4.0621543323676587e-05, "loss": 0.7218, "step": 7128 }, { "epoch": 0.1877534895970503, "grad_norm": 1.7519015073776245, "learning_rate": 4.0620226494601e-05, "loss": 2.0041, "step": 7129 }, { "epoch": 0.187779826178562, "grad_norm": 6.424541473388672, "learning_rate": 4.061890966552542e-05, "loss": 1.0124, "step": 7130 }, { "epoch": 0.18780616276007375, "grad_norm": 2.0475144386291504, "learning_rate": 4.061759283644983e-05, "loss": 1.2348, "step": 7131 }, { "epoch": 0.18783249934158547, "grad_norm": 2.3664028644561768, "learning_rate": 4.061627600737424e-05, "loss": 1.5428, "step": 7132 }, { "epoch": 0.18785883592309718, "grad_norm": 2.0390541553497314, "learning_rate": 4.061495917829866e-05, "loss": 2.107, "step": 7133 }, { "epoch": 0.1878851725046089, "grad_norm": 5.205618381500244, "learning_rate": 4.061364234922307e-05, "loss": 0.8842, "step": 7134 }, { "epoch": 0.18791150908612061, "grad_norm": 4.260695457458496, "learning_rate": 4.061232552014749e-05, "loss": 1.6472, "step": 7135 }, { "epoch": 0.18793784566763233, "grad_norm": 3.7273504734039307, "learning_rate": 4.06110086910719e-05, "loss": 0.8915, "step": 7136 }, { "epoch": 0.18796418224914407, "grad_norm": 4.018039226531982, 
"learning_rate": 4.060969186199631e-05, "loss": 1.9645, "step": 7137 }, { "epoch": 0.1879905188306558, "grad_norm": 2.592669725418091, "learning_rate": 4.060837503292073e-05, "loss": 2.8759, "step": 7138 }, { "epoch": 0.1880168554121675, "grad_norm": 1.8754229545593262, "learning_rate": 4.0607058203845145e-05, "loss": 1.6851, "step": 7139 }, { "epoch": 0.18804319199367922, "grad_norm": 2.291823148727417, "learning_rate": 4.060574137476956e-05, "loss": 2.2798, "step": 7140 }, { "epoch": 0.18806952857519094, "grad_norm": 2.3167099952697754, "learning_rate": 4.060442454569397e-05, "loss": 1.614, "step": 7141 }, { "epoch": 0.18809586515670265, "grad_norm": 2.762929677963257, "learning_rate": 4.0603107716618385e-05, "loss": 1.2255, "step": 7142 }, { "epoch": 0.18812220173821437, "grad_norm": 1.8997184038162231, "learning_rate": 4.06017908875428e-05, "loss": 1.819, "step": 7143 }, { "epoch": 0.1881485383197261, "grad_norm": 2.7060487270355225, "learning_rate": 4.0600474058467216e-05, "loss": 1.522, "step": 7144 }, { "epoch": 0.18817487490123783, "grad_norm": 1.8368016481399536, "learning_rate": 4.0599157229391625e-05, "loss": 1.687, "step": 7145 }, { "epoch": 0.18820121148274954, "grad_norm": 2.0138535499572754, "learning_rate": 4.059784040031604e-05, "loss": 1.899, "step": 7146 }, { "epoch": 0.18822754806426126, "grad_norm": 2.190800189971924, "learning_rate": 4.059652357124045e-05, "loss": 1.8069, "step": 7147 }, { "epoch": 0.18825388464577297, "grad_norm": 3.160052537918091, "learning_rate": 4.059520674216487e-05, "loss": 1.5584, "step": 7148 }, { "epoch": 0.1882802212272847, "grad_norm": 2.6142807006835938, "learning_rate": 4.059388991308928e-05, "loss": 1.8108, "step": 7149 }, { "epoch": 0.18830655780879643, "grad_norm": 2.1586642265319824, "learning_rate": 4.0592573084013696e-05, "loss": 1.8957, "step": 7150 }, { "epoch": 0.18833289439030815, "grad_norm": 3.6322038173675537, "learning_rate": 4.059125625493811e-05, "loss": 1.4507, "step": 7151 }, { "epoch": 
0.18835923097181986, "grad_norm": 2.8608357906341553, "learning_rate": 4.058993942586253e-05, "loss": 0.673, "step": 7152 }, { "epoch": 0.18838556755333158, "grad_norm": 2.3458855152130127, "learning_rate": 4.058862259678694e-05, "loss": 2.0301, "step": 7153 }, { "epoch": 0.1884119041348433, "grad_norm": 3.176971912384033, "learning_rate": 4.058730576771135e-05, "loss": 2.2794, "step": 7154 }, { "epoch": 0.188438240716355, "grad_norm": 4.816822052001953, "learning_rate": 4.058598893863577e-05, "loss": 0.9821, "step": 7155 }, { "epoch": 0.18846457729786673, "grad_norm": 2.430255651473999, "learning_rate": 4.0584672109560176e-05, "loss": 0.7312, "step": 7156 }, { "epoch": 0.18849091387937847, "grad_norm": 1.8843653202056885, "learning_rate": 4.05833552804846e-05, "loss": 1.3415, "step": 7157 }, { "epoch": 0.18851725046089018, "grad_norm": 3.2687995433807373, "learning_rate": 4.058203845140901e-05, "loss": 2.3528, "step": 7158 }, { "epoch": 0.1885435870424019, "grad_norm": 2.6169800758361816, "learning_rate": 4.058072162233342e-05, "loss": 1.8997, "step": 7159 }, { "epoch": 0.18856992362391362, "grad_norm": 2.1954383850097656, "learning_rate": 4.057940479325784e-05, "loss": 1.7589, "step": 7160 }, { "epoch": 0.18859626020542533, "grad_norm": 2.1346802711486816, "learning_rate": 4.057808796418225e-05, "loss": 1.878, "step": 7161 }, { "epoch": 0.18862259678693705, "grad_norm": 1.7980924844741821, "learning_rate": 4.057677113510667e-05, "loss": 1.7207, "step": 7162 }, { "epoch": 0.18864893336844876, "grad_norm": 2.881147861480713, "learning_rate": 4.057545430603108e-05, "loss": 0.8091, "step": 7163 }, { "epoch": 0.1886752699499605, "grad_norm": 5.948313236236572, "learning_rate": 4.0574137476955494e-05, "loss": 2.1255, "step": 7164 }, { "epoch": 0.18870160653147222, "grad_norm": 2.595654010772705, "learning_rate": 4.05728206478799e-05, "loss": 0.9837, "step": 7165 }, { "epoch": 0.18872794311298394, "grad_norm": 6.243841648101807, "learning_rate": 4.057150381880432e-05, 
"loss": 1.5669, "step": 7166 }, { "epoch": 0.18875427969449565, "grad_norm": 3.450916290283203, "learning_rate": 4.0570186989728734e-05, "loss": 2.7154, "step": 7167 }, { "epoch": 0.18878061627600737, "grad_norm": 3.38445782661438, "learning_rate": 4.056887016065315e-05, "loss": 1.58, "step": 7168 }, { "epoch": 0.18880695285751908, "grad_norm": 1.9969775676727295, "learning_rate": 4.0567553331577565e-05, "loss": 1.4763, "step": 7169 }, { "epoch": 0.18883328943903083, "grad_norm": 1.6472141742706299, "learning_rate": 4.0566236502501974e-05, "loss": 1.631, "step": 7170 }, { "epoch": 0.18885962602054254, "grad_norm": 2.0674567222595215, "learning_rate": 4.0564919673426396e-05, "loss": 1.8451, "step": 7171 }, { "epoch": 0.18888596260205426, "grad_norm": 2.0916426181793213, "learning_rate": 4.0563602844350805e-05, "loss": 1.3798, "step": 7172 }, { "epoch": 0.18891229918356597, "grad_norm": 3.7419512271881104, "learning_rate": 4.056228601527522e-05, "loss": 2.24, "step": 7173 }, { "epoch": 0.1889386357650777, "grad_norm": 4.419593334197998, "learning_rate": 4.056096918619963e-05, "loss": 0.9466, "step": 7174 }, { "epoch": 0.1889649723465894, "grad_norm": 1.7070976495742798, "learning_rate": 4.0559652357124045e-05, "loss": 1.5835, "step": 7175 }, { "epoch": 0.18899130892810112, "grad_norm": 1.7554688453674316, "learning_rate": 4.055833552804846e-05, "loss": 2.2235, "step": 7176 }, { "epoch": 0.18901764550961286, "grad_norm": 1.7871851921081543, "learning_rate": 4.0557018698972876e-05, "loss": 1.5603, "step": 7177 }, { "epoch": 0.18904398209112458, "grad_norm": 2.1068508625030518, "learning_rate": 4.055570186989729e-05, "loss": 1.6217, "step": 7178 }, { "epoch": 0.1890703186726363, "grad_norm": 5.742961883544922, "learning_rate": 4.05543850408217e-05, "loss": 1.5055, "step": 7179 }, { "epoch": 0.189096655254148, "grad_norm": 1.645997405052185, "learning_rate": 4.0553068211746116e-05, "loss": 1.6799, "step": 7180 }, { "epoch": 0.18912299183565973, "grad_norm": 
2.6304750442504883, "learning_rate": 4.055175138267053e-05, "loss": 1.5918, "step": 7181 }, { "epoch": 0.18914932841717144, "grad_norm": 1.7568247318267822, "learning_rate": 4.055043455359495e-05, "loss": 0.6936, "step": 7182 }, { "epoch": 0.18917566499868316, "grad_norm": 1.9679300785064697, "learning_rate": 4.0549117724519356e-05, "loss": 1.8451, "step": 7183 }, { "epoch": 0.1892020015801949, "grad_norm": 1.9350433349609375, "learning_rate": 4.054780089544377e-05, "loss": 1.8593, "step": 7184 }, { "epoch": 0.18922833816170662, "grad_norm": 1.7362843751907349, "learning_rate": 4.054648406636819e-05, "loss": 1.858, "step": 7185 }, { "epoch": 0.18925467474321833, "grad_norm": 1.7821425199508667, "learning_rate": 4.05451672372926e-05, "loss": 0.4698, "step": 7186 }, { "epoch": 0.18928101132473005, "grad_norm": 2.8015902042388916, "learning_rate": 4.054385040821702e-05, "loss": 1.702, "step": 7187 }, { "epoch": 0.18930734790624176, "grad_norm": 1.5574791431427002, "learning_rate": 4.054253357914143e-05, "loss": 2.206, "step": 7188 }, { "epoch": 0.18933368448775348, "grad_norm": 1.8306288719177246, "learning_rate": 4.054121675006584e-05, "loss": 1.5303, "step": 7189 }, { "epoch": 0.18936002106926522, "grad_norm": 2.4754111766815186, "learning_rate": 4.053989992099026e-05, "loss": 2.0363, "step": 7190 }, { "epoch": 0.18938635765077694, "grad_norm": 1.332369327545166, "learning_rate": 4.0538583091914674e-05, "loss": 0.481, "step": 7191 }, { "epoch": 0.18941269423228865, "grad_norm": 1.8940297365188599, "learning_rate": 4.053726626283908e-05, "loss": 2.0337, "step": 7192 }, { "epoch": 0.18943903081380037, "grad_norm": 4.1673583984375, "learning_rate": 4.05359494337635e-05, "loss": 2.2207, "step": 7193 }, { "epoch": 0.18946536739531208, "grad_norm": 2.8621959686279297, "learning_rate": 4.0534632604687914e-05, "loss": 1.5712, "step": 7194 }, { "epoch": 0.1894917039768238, "grad_norm": 2.3763270378112793, "learning_rate": 4.053331577561233e-05, "loss": 1.1065, "step": 7195 
}, { "epoch": 0.18951804055833552, "grad_norm": 2.1661956310272217, "learning_rate": 4.0531998946536746e-05, "loss": 2.0365, "step": 7196 }, { "epoch": 0.18954437713984726, "grad_norm": 2.1663999557495117, "learning_rate": 4.0530682117461154e-05, "loss": 0.4715, "step": 7197 }, { "epoch": 0.18957071372135897, "grad_norm": 2.5032851696014404, "learning_rate": 4.052936528838557e-05, "loss": 1.6401, "step": 7198 }, { "epoch": 0.1895970503028707, "grad_norm": 5.326064109802246, "learning_rate": 4.052804845930998e-05, "loss": 2.4489, "step": 7199 }, { "epoch": 0.1896233868843824, "grad_norm": 1.6689244508743286, "learning_rate": 4.05267316302344e-05, "loss": 2.0198, "step": 7200 }, { "epoch": 0.18964972346589412, "grad_norm": 4.380797386169434, "learning_rate": 4.052541480115881e-05, "loss": 2.0752, "step": 7201 }, { "epoch": 0.18967606004740584, "grad_norm": 2.0880212783813477, "learning_rate": 4.0524097972083226e-05, "loss": 1.7595, "step": 7202 }, { "epoch": 0.18970239662891758, "grad_norm": 2.943312883377075, "learning_rate": 4.0522781143007634e-05, "loss": 1.2728, "step": 7203 }, { "epoch": 0.1897287332104293, "grad_norm": 2.3224871158599854, "learning_rate": 4.052146431393206e-05, "loss": 2.5447, "step": 7204 }, { "epoch": 0.189755069791941, "grad_norm": 5.911948204040527, "learning_rate": 4.0520147484856466e-05, "loss": 1.5255, "step": 7205 }, { "epoch": 0.18978140637345273, "grad_norm": 1.636635661125183, "learning_rate": 4.051883065578088e-05, "loss": 1.6019, "step": 7206 }, { "epoch": 0.18980774295496444, "grad_norm": 4.187703609466553, "learning_rate": 4.05175138267053e-05, "loss": 2.4874, "step": 7207 }, { "epoch": 0.18983407953647616, "grad_norm": 1.7526487112045288, "learning_rate": 4.0516196997629706e-05, "loss": 1.5253, "step": 7208 }, { "epoch": 0.18986041611798787, "grad_norm": 2.7439773082733154, "learning_rate": 4.051488016855413e-05, "loss": 0.5517, "step": 7209 }, { "epoch": 0.18988675269949962, "grad_norm": 1.7597113847732544, "learning_rate": 
4.051356333947854e-05, "loss": 1.7332, "step": 7210 }, { "epoch": 0.18991308928101133, "grad_norm": 6.962954521179199, "learning_rate": 4.051224651040295e-05, "loss": 1.8243, "step": 7211 }, { "epoch": 0.18993942586252305, "grad_norm": 2.2786426544189453, "learning_rate": 4.051092968132736e-05, "loss": 2.069, "step": 7212 }, { "epoch": 0.18996576244403476, "grad_norm": 3.446714162826538, "learning_rate": 4.050961285225178e-05, "loss": 0.7096, "step": 7213 }, { "epoch": 0.18999209902554648, "grad_norm": 1.9481279850006104, "learning_rate": 4.050829602317619e-05, "loss": 1.5411, "step": 7214 }, { "epoch": 0.1900184356070582, "grad_norm": 3.3399834632873535, "learning_rate": 4.050697919410061e-05, "loss": 0.9776, "step": 7215 }, { "epoch": 0.1900447721885699, "grad_norm": 1.6336928606033325, "learning_rate": 4.0505662365025024e-05, "loss": 0.7945, "step": 7216 }, { "epoch": 0.19007110877008165, "grad_norm": 2.0583527088165283, "learning_rate": 4.050434553594943e-05, "loss": 1.8646, "step": 7217 }, { "epoch": 0.19009744535159337, "grad_norm": 4.298874855041504, "learning_rate": 4.0503028706873855e-05, "loss": 1.1679, "step": 7218 }, { "epoch": 0.19012378193310508, "grad_norm": 1.989362120628357, "learning_rate": 4.0501711877798264e-05, "loss": 1.9351, "step": 7219 }, { "epoch": 0.1901501185146168, "grad_norm": 1.8836759328842163, "learning_rate": 4.050039504872268e-05, "loss": 2.3136, "step": 7220 }, { "epoch": 0.19017645509612852, "grad_norm": 2.2447330951690674, "learning_rate": 4.049907821964709e-05, "loss": 0.2346, "step": 7221 }, { "epoch": 0.19020279167764023, "grad_norm": 1.6300921440124512, "learning_rate": 4.0497761390571504e-05, "loss": 2.6292, "step": 7222 }, { "epoch": 0.19022912825915197, "grad_norm": 1.7325414419174194, "learning_rate": 4.049644456149592e-05, "loss": 1.7103, "step": 7223 }, { "epoch": 0.1902554648406637, "grad_norm": 2.182236909866333, "learning_rate": 4.0495127732420335e-05, "loss": 1.5909, "step": 7224 }, { "epoch": 0.1902818014221754, 
"grad_norm": 2.5211610794067383, "learning_rate": 4.049381090334475e-05, "loss": 1.4177, "step": 7225 }, { "epoch": 0.19030813800368712, "grad_norm": 1.613010287284851, "learning_rate": 4.049249407426916e-05, "loss": 1.4293, "step": 7226 }, { "epoch": 0.19033447458519884, "grad_norm": 1.730679988861084, "learning_rate": 4.0491177245193575e-05, "loss": 1.6866, "step": 7227 }, { "epoch": 0.19036081116671055, "grad_norm": 1.7852935791015625, "learning_rate": 4.048986041611799e-05, "loss": 2.0999, "step": 7228 }, { "epoch": 0.19038714774822227, "grad_norm": 2.4702770709991455, "learning_rate": 4.0488543587042406e-05, "loss": 1.6319, "step": 7229 }, { "epoch": 0.190413484329734, "grad_norm": 1.6914503574371338, "learning_rate": 4.0487226757966815e-05, "loss": 1.3973, "step": 7230 }, { "epoch": 0.19043982091124573, "grad_norm": 4.83398962020874, "learning_rate": 4.048590992889123e-05, "loss": 1.7376, "step": 7231 }, { "epoch": 0.19046615749275744, "grad_norm": 4.740577220916748, "learning_rate": 4.0484593099815646e-05, "loss": 1.3972, "step": 7232 }, { "epoch": 0.19049249407426916, "grad_norm": 2.465036153793335, "learning_rate": 4.048327627074006e-05, "loss": 1.5403, "step": 7233 }, { "epoch": 0.19051883065578087, "grad_norm": 1.5254331827163696, "learning_rate": 4.048195944166448e-05, "loss": 1.8449, "step": 7234 }, { "epoch": 0.1905451672372926, "grad_norm": 2.33536434173584, "learning_rate": 4.0480642612588886e-05, "loss": 1.7078, "step": 7235 }, { "epoch": 0.1905715038188043, "grad_norm": 2.0740175247192383, "learning_rate": 4.04793257835133e-05, "loss": 2.3895, "step": 7236 }, { "epoch": 0.19059784040031605, "grad_norm": 1.8939977884292603, "learning_rate": 4.047800895443772e-05, "loss": 1.2493, "step": 7237 }, { "epoch": 0.19062417698182776, "grad_norm": 3.422680616378784, "learning_rate": 4.047669212536213e-05, "loss": 1.5391, "step": 7238 }, { "epoch": 0.19065051356333948, "grad_norm": 3.1775338649749756, "learning_rate": 4.047537529628654e-05, "loss": 1.6707, 
"step": 7239 }, { "epoch": 0.1906768501448512, "grad_norm": 3.0954630374908447, "learning_rate": 4.047405846721096e-05, "loss": 1.8821, "step": 7240 }, { "epoch": 0.1907031867263629, "grad_norm": 2.1109166145324707, "learning_rate": 4.047274163813537e-05, "loss": 2.3079, "step": 7241 }, { "epoch": 0.19072952330787463, "grad_norm": 3.317548990249634, "learning_rate": 4.047142480905979e-05, "loss": 2.4559, "step": 7242 }, { "epoch": 0.19075585988938637, "grad_norm": 2.191545248031616, "learning_rate": 4.0470107979984204e-05, "loss": 1.9968, "step": 7243 }, { "epoch": 0.19078219647089809, "grad_norm": 4.669031620025635, "learning_rate": 4.046879115090861e-05, "loss": 0.7549, "step": 7244 }, { "epoch": 0.1908085330524098, "grad_norm": 1.9186196327209473, "learning_rate": 4.046747432183303e-05, "loss": 1.6467, "step": 7245 }, { "epoch": 0.19083486963392152, "grad_norm": 4.040922164916992, "learning_rate": 4.046615749275744e-05, "loss": 0.5172, "step": 7246 }, { "epoch": 0.19086120621543323, "grad_norm": 2.6314992904663086, "learning_rate": 4.046484066368186e-05, "loss": 1.5515, "step": 7247 }, { "epoch": 0.19088754279694495, "grad_norm": 1.7427303791046143, "learning_rate": 4.046352383460627e-05, "loss": 0.3602, "step": 7248 }, { "epoch": 0.19091387937845666, "grad_norm": 3.3643410205841064, "learning_rate": 4.0462207005530684e-05, "loss": 1.5196, "step": 7249 }, { "epoch": 0.1909402159599684, "grad_norm": 2.2444002628326416, "learning_rate": 4.046089017645509e-05, "loss": 1.8196, "step": 7250 }, { "epoch": 0.19096655254148012, "grad_norm": 3.979214668273926, "learning_rate": 4.0459573347379515e-05, "loss": 2.3359, "step": 7251 }, { "epoch": 0.19099288912299184, "grad_norm": 3.34232497215271, "learning_rate": 4.0458256518303924e-05, "loss": 1.5687, "step": 7252 }, { "epoch": 0.19101922570450355, "grad_norm": 1.6711429357528687, "learning_rate": 4.045693968922834e-05, "loss": 1.9792, "step": 7253 }, { "epoch": 0.19104556228601527, "grad_norm": 4.695145606994629, 
"learning_rate": 4.0455622860152755e-05, "loss": 1.0408, "step": 7254 }, { "epoch": 0.19107189886752698, "grad_norm": 3.7791454792022705, "learning_rate": 4.0454306031077164e-05, "loss": 0.7948, "step": 7255 }, { "epoch": 0.19109823544903873, "grad_norm": 2.1113526821136475, "learning_rate": 4.0452989202001587e-05, "loss": 1.7528, "step": 7256 }, { "epoch": 0.19112457203055044, "grad_norm": 3.21728253364563, "learning_rate": 4.0451672372925995e-05, "loss": 1.6714, "step": 7257 }, { "epoch": 0.19115090861206216, "grad_norm": 1.7481330633163452, "learning_rate": 4.045035554385041e-05, "loss": 1.9234, "step": 7258 }, { "epoch": 0.19117724519357387, "grad_norm": 1.6544904708862305, "learning_rate": 4.044903871477482e-05, "loss": 1.7649, "step": 7259 }, { "epoch": 0.1912035817750856, "grad_norm": 1.567003846168518, "learning_rate": 4.0447721885699235e-05, "loss": 1.6215, "step": 7260 }, { "epoch": 0.1912299183565973, "grad_norm": 1.9183847904205322, "learning_rate": 4.044640505662365e-05, "loss": 1.7934, "step": 7261 }, { "epoch": 0.19125625493810902, "grad_norm": 2.2221667766571045, "learning_rate": 4.0445088227548067e-05, "loss": 1.9812, "step": 7262 }, { "epoch": 0.19128259151962076, "grad_norm": 4.729203701019287, "learning_rate": 4.044377139847248e-05, "loss": 0.9816, "step": 7263 }, { "epoch": 0.19130892810113248, "grad_norm": 1.99595046043396, "learning_rate": 4.044245456939689e-05, "loss": 2.0178, "step": 7264 }, { "epoch": 0.1913352646826442, "grad_norm": 2.347024440765381, "learning_rate": 4.044113774032131e-05, "loss": 1.3836, "step": 7265 }, { "epoch": 0.1913616012641559, "grad_norm": 1.84151029586792, "learning_rate": 4.043982091124572e-05, "loss": 2.1222, "step": 7266 }, { "epoch": 0.19138793784566763, "grad_norm": 1.9768502712249756, "learning_rate": 4.043850408217014e-05, "loss": 2.3555, "step": 7267 }, { "epoch": 0.19141427442717934, "grad_norm": 1.647308349609375, "learning_rate": 4.0437187253094547e-05, "loss": 1.7452, "step": 7268 }, { "epoch": 
0.19144061100869106, "grad_norm": 1.953292965888977, "learning_rate": 4.043587042401896e-05, "loss": 2.0305, "step": 7269 }, { "epoch": 0.1914669475902028, "grad_norm": 2.776155471801758, "learning_rate": 4.043455359494338e-05, "loss": 1.7882, "step": 7270 }, { "epoch": 0.19149328417171452, "grad_norm": 5.300607681274414, "learning_rate": 4.0433236765867793e-05, "loss": 1.7522, "step": 7271 }, { "epoch": 0.19151962075322623, "grad_norm": 1.906984567642212, "learning_rate": 4.043191993679221e-05, "loss": 1.6374, "step": 7272 }, { "epoch": 0.19154595733473795, "grad_norm": 1.7142235040664673, "learning_rate": 4.043060310771662e-05, "loss": 1.8931, "step": 7273 }, { "epoch": 0.19157229391624966, "grad_norm": 2.2480897903442383, "learning_rate": 4.0429286278641033e-05, "loss": 1.9641, "step": 7274 }, { "epoch": 0.19159863049776138, "grad_norm": 1.8112406730651855, "learning_rate": 4.042796944956545e-05, "loss": 1.6118, "step": 7275 }, { "epoch": 0.19162496707927312, "grad_norm": 1.7845630645751953, "learning_rate": 4.0426652620489865e-05, "loss": 1.8619, "step": 7276 }, { "epoch": 0.19165130366078484, "grad_norm": 2.2230772972106934, "learning_rate": 4.0425335791414273e-05, "loss": 2.0625, "step": 7277 }, { "epoch": 0.19167764024229655, "grad_norm": 2.6940886974334717, "learning_rate": 4.042401896233869e-05, "loss": 1.5254, "step": 7278 }, { "epoch": 0.19170397682380827, "grad_norm": 1.3871856927871704, "learning_rate": 4.0422702133263105e-05, "loss": 1.9714, "step": 7279 }, { "epoch": 0.19173031340531999, "grad_norm": 3.610910177230835, "learning_rate": 4.042138530418752e-05, "loss": 2.1208, "step": 7280 }, { "epoch": 0.1917566499868317, "grad_norm": 1.9261505603790283, "learning_rate": 4.0420068475111936e-05, "loss": 1.9128, "step": 7281 }, { "epoch": 0.19178298656834342, "grad_norm": 1.932108998298645, "learning_rate": 4.0418751646036345e-05, "loss": 1.8876, "step": 7282 }, { "epoch": 0.19180932314985516, "grad_norm": 2.5684380531311035, "learning_rate": 
4.041743481696076e-05, "loss": 1.8817, "step": 7283 }, { "epoch": 0.19183565973136688, "grad_norm": 1.7948907613754272, "learning_rate": 4.0416117987885176e-05, "loss": 2.3632, "step": 7284 }, { "epoch": 0.1918619963128786, "grad_norm": 1.8713473081588745, "learning_rate": 4.041480115880959e-05, "loss": 1.821, "step": 7285 }, { "epoch": 0.1918883328943903, "grad_norm": 2.3834478855133057, "learning_rate": 4.0413484329734e-05, "loss": 2.336, "step": 7286 }, { "epoch": 0.19191466947590202, "grad_norm": 2.312260389328003, "learning_rate": 4.0412167500658416e-05, "loss": 1.5203, "step": 7287 }, { "epoch": 0.19194100605741374, "grad_norm": 2.2332422733306885, "learning_rate": 4.041085067158283e-05, "loss": 2.0257, "step": 7288 }, { "epoch": 0.19196734263892548, "grad_norm": 3.375622272491455, "learning_rate": 4.040953384250725e-05, "loss": 1.1201, "step": 7289 }, { "epoch": 0.1919936792204372, "grad_norm": 2.6326589584350586, "learning_rate": 4.040821701343166e-05, "loss": 1.9488, "step": 7290 }, { "epoch": 0.1920200158019489, "grad_norm": 3.281137228012085, "learning_rate": 4.040690018435607e-05, "loss": 2.6588, "step": 7291 }, { "epoch": 0.19204635238346063, "grad_norm": 3.1981658935546875, "learning_rate": 4.040558335528049e-05, "loss": 1.3096, "step": 7292 }, { "epoch": 0.19207268896497234, "grad_norm": 2.4739367961883545, "learning_rate": 4.0404266526204896e-05, "loss": 1.3927, "step": 7293 }, { "epoch": 0.19209902554648406, "grad_norm": 4.966895580291748, "learning_rate": 4.040294969712932e-05, "loss": 1.7816, "step": 7294 }, { "epoch": 0.19212536212799577, "grad_norm": 3.845078229904175, "learning_rate": 4.040163286805373e-05, "loss": 1.0441, "step": 7295 }, { "epoch": 0.19215169870950752, "grad_norm": 2.0101237297058105, "learning_rate": 4.040031603897814e-05, "loss": 1.9505, "step": 7296 }, { "epoch": 0.19217803529101923, "grad_norm": 1.7244296073913574, "learning_rate": 4.039899920990256e-05, "loss": 1.1956, "step": 7297 }, { "epoch": 0.19220437187253095, 
"grad_norm": 1.756265640258789, "learning_rate": 4.0397682380826974e-05, "loss": 2.237, "step": 7298 }, { "epoch": 0.19223070845404266, "grad_norm": 2.3653173446655273, "learning_rate": 4.039636555175139e-05, "loss": 1.6345, "step": 7299 }, { "epoch": 0.19225704503555438, "grad_norm": 6.648596286773682, "learning_rate": 4.03950487226758e-05, "loss": 2.5191, "step": 7300 }, { "epoch": 0.1922833816170661, "grad_norm": 2.2856199741363525, "learning_rate": 4.0393731893600214e-05, "loss": 2.0633, "step": 7301 }, { "epoch": 0.1923097181985778, "grad_norm": 3.177938938140869, "learning_rate": 4.039241506452462e-05, "loss": 1.6583, "step": 7302 }, { "epoch": 0.19233605478008955, "grad_norm": 1.6299958229064941, "learning_rate": 4.0391098235449045e-05, "loss": 1.2568, "step": 7303 }, { "epoch": 0.19236239136160127, "grad_norm": 1.894034504890442, "learning_rate": 4.0389781406373454e-05, "loss": 2.1901, "step": 7304 }, { "epoch": 0.19238872794311299, "grad_norm": 2.6950366497039795, "learning_rate": 4.038846457729787e-05, "loss": 0.8633, "step": 7305 }, { "epoch": 0.1924150645246247, "grad_norm": 3.5823185443878174, "learning_rate": 4.038714774822228e-05, "loss": 1.4209, "step": 7306 }, { "epoch": 0.19244140110613642, "grad_norm": 1.8233287334442139, "learning_rate": 4.0385830919146694e-05, "loss": 1.8241, "step": 7307 }, { "epoch": 0.19246773768764813, "grad_norm": 6.177656173706055, "learning_rate": 4.038451409007111e-05, "loss": 1.9694, "step": 7308 }, { "epoch": 0.19249407426915988, "grad_norm": 1.7469977140426636, "learning_rate": 4.0383197260995525e-05, "loss": 2.02, "step": 7309 }, { "epoch": 0.1925204108506716, "grad_norm": 1.8026920557022095, "learning_rate": 4.038188043191994e-05, "loss": 2.1229, "step": 7310 }, { "epoch": 0.1925467474321833, "grad_norm": 2.9835667610168457, "learning_rate": 4.038056360284435e-05, "loss": 1.9499, "step": 7311 }, { "epoch": 0.19257308401369502, "grad_norm": 2.2449615001678467, "learning_rate": 4.0379246773768765e-05, "loss": 1.8997, 
"step": 7312 }, { "epoch": 0.19259942059520674, "grad_norm": 1.4395685195922852, "learning_rate": 4.037792994469318e-05, "loss": 1.742, "step": 7313 }, { "epoch": 0.19262575717671845, "grad_norm": 1.9676905870437622, "learning_rate": 4.0376613115617596e-05, "loss": 2.0727, "step": 7314 }, { "epoch": 0.19265209375823017, "grad_norm": 1.7489992380142212, "learning_rate": 4.0375296286542005e-05, "loss": 1.4162, "step": 7315 }, { "epoch": 0.1926784303397419, "grad_norm": 2.629103660583496, "learning_rate": 4.037397945746642e-05, "loss": 2.1102, "step": 7316 }, { "epoch": 0.19270476692125363, "grad_norm": 2.397937059402466, "learning_rate": 4.0372662628390836e-05, "loss": 1.4276, "step": 7317 }, { "epoch": 0.19273110350276534, "grad_norm": 1.9332218170166016, "learning_rate": 4.037134579931525e-05, "loss": 2.0168, "step": 7318 }, { "epoch": 0.19275744008427706, "grad_norm": 2.219738483428955, "learning_rate": 4.037002897023967e-05, "loss": 1.9417, "step": 7319 }, { "epoch": 0.19278377666578878, "grad_norm": 2.0492563247680664, "learning_rate": 4.0368712141164076e-05, "loss": 1.4014, "step": 7320 }, { "epoch": 0.1928101132473005, "grad_norm": 3.0343124866485596, "learning_rate": 4.036739531208849e-05, "loss": 1.321, "step": 7321 }, { "epoch": 0.1928364498288122, "grad_norm": 3.1353862285614014, "learning_rate": 4.036607848301291e-05, "loss": 0.5889, "step": 7322 }, { "epoch": 0.19286278641032395, "grad_norm": 2.5549745559692383, "learning_rate": 4.036476165393732e-05, "loss": 1.016, "step": 7323 }, { "epoch": 0.19288912299183567, "grad_norm": 3.035165786743164, "learning_rate": 4.036344482486173e-05, "loss": 1.497, "step": 7324 }, { "epoch": 0.19291545957334738, "grad_norm": 2.4611964225769043, "learning_rate": 4.036212799578615e-05, "loss": 1.3419, "step": 7325 }, { "epoch": 0.1929417961548591, "grad_norm": 2.7616629600524902, "learning_rate": 4.036081116671056e-05, "loss": 1.9925, "step": 7326 }, { "epoch": 0.1929681327363708, "grad_norm": 4.698431491851807, 
"learning_rate": 4.035949433763498e-05, "loss": 0.7497, "step": 7327 }, { "epoch": 0.19299446931788253, "grad_norm": 1.5474358797073364, "learning_rate": 4.0358177508559394e-05, "loss": 1.6545, "step": 7328 }, { "epoch": 0.19302080589939427, "grad_norm": 4.562552452087402, "learning_rate": 4.03568606794838e-05, "loss": 1.0649, "step": 7329 }, { "epoch": 0.193047142480906, "grad_norm": 3.056469202041626, "learning_rate": 4.035554385040822e-05, "loss": 1.6468, "step": 7330 }, { "epoch": 0.1930734790624177, "grad_norm": 1.775338888168335, "learning_rate": 4.0354227021332634e-05, "loss": 2.0318, "step": 7331 }, { "epoch": 0.19309981564392942, "grad_norm": 3.844957113265991, "learning_rate": 4.035291019225705e-05, "loss": 1.6792, "step": 7332 }, { "epoch": 0.19312615222544113, "grad_norm": 2.6300363540649414, "learning_rate": 4.035159336318146e-05, "loss": 1.4449, "step": 7333 }, { "epoch": 0.19315248880695285, "grad_norm": 2.6600770950317383, "learning_rate": 4.0350276534105874e-05, "loss": 1.8157, "step": 7334 }, { "epoch": 0.19317882538846456, "grad_norm": 1.9569789171218872, "learning_rate": 4.034895970503029e-05, "loss": 1.7563, "step": 7335 }, { "epoch": 0.1932051619699763, "grad_norm": 2.6759989261627197, "learning_rate": 4.0347642875954706e-05, "loss": 1.6504, "step": 7336 }, { "epoch": 0.19323149855148802, "grad_norm": 2.748774766921997, "learning_rate": 4.034632604687912e-05, "loss": 1.4448, "step": 7337 }, { "epoch": 0.19325783513299974, "grad_norm": 3.0889992713928223, "learning_rate": 4.034500921780353e-05, "loss": 0.995, "step": 7338 }, { "epoch": 0.19328417171451145, "grad_norm": 2.6113176345825195, "learning_rate": 4.0343692388727946e-05, "loss": 1.7881, "step": 7339 }, { "epoch": 0.19331050829602317, "grad_norm": 3.723393440246582, "learning_rate": 4.0342375559652354e-05, "loss": 0.6865, "step": 7340 }, { "epoch": 0.19333684487753489, "grad_norm": 1.691326379776001, "learning_rate": 4.034105873057678e-05, "loss": 1.8833, "step": 7341 }, { "epoch": 
0.19336318145904663, "grad_norm": 1.5007450580596924, "learning_rate": 4.0339741901501186e-05, "loss": 0.3572, "step": 7342 }, { "epoch": 0.19338951804055834, "grad_norm": 5.650027275085449, "learning_rate": 4.03384250724256e-05, "loss": 0.983, "step": 7343 }, { "epoch": 0.19341585462207006, "grad_norm": 2.8245296478271484, "learning_rate": 4.033710824335002e-05, "loss": 1.5689, "step": 7344 }, { "epoch": 0.19344219120358178, "grad_norm": 2.153578042984009, "learning_rate": 4.0335791414274426e-05, "loss": 1.3835, "step": 7345 }, { "epoch": 0.1934685277850935, "grad_norm": 3.891183853149414, "learning_rate": 4.033447458519885e-05, "loss": 1.5072, "step": 7346 }, { "epoch": 0.1934948643666052, "grad_norm": 2.4269511699676514, "learning_rate": 4.033315775612326e-05, "loss": 1.9788, "step": 7347 }, { "epoch": 0.19352120094811692, "grad_norm": 2.1383001804351807, "learning_rate": 4.033184092704767e-05, "loss": 1.8104, "step": 7348 }, { "epoch": 0.19354753752962867, "grad_norm": 1.9490697383880615, "learning_rate": 4.033052409797208e-05, "loss": 1.573, "step": 7349 }, { "epoch": 0.19357387411114038, "grad_norm": 2.5149269104003906, "learning_rate": 4.0329207268896504e-05, "loss": 2.5045, "step": 7350 }, { "epoch": 0.1936002106926521, "grad_norm": 1.8030511140823364, "learning_rate": 4.032789043982091e-05, "loss": 1.9566, "step": 7351 }, { "epoch": 0.1936265472741638, "grad_norm": 2.608902931213379, "learning_rate": 4.032657361074533e-05, "loss": 0.9981, "step": 7352 }, { "epoch": 0.19365288385567553, "grad_norm": 1.9074355363845825, "learning_rate": 4.032525678166974e-05, "loss": 1.7424, "step": 7353 }, { "epoch": 0.19367922043718724, "grad_norm": 2.6567094326019287, "learning_rate": 4.032393995259415e-05, "loss": 2.2168, "step": 7354 }, { "epoch": 0.19370555701869896, "grad_norm": 1.9052550792694092, "learning_rate": 4.032262312351857e-05, "loss": 1.5677, "step": 7355 }, { "epoch": 0.1937318936002107, "grad_norm": 1.7533397674560547, "learning_rate": 
4.0321306294442984e-05, "loss": 1.8813, "step": 7356 }, { "epoch": 0.19375823018172242, "grad_norm": 1.938494324684143, "learning_rate": 4.03199894653674e-05, "loss": 2.1152, "step": 7357 }, { "epoch": 0.19378456676323413, "grad_norm": 2.4668755531311035, "learning_rate": 4.031867263629181e-05, "loss": 1.6126, "step": 7358 }, { "epoch": 0.19381090334474585, "grad_norm": 2.6709344387054443, "learning_rate": 4.0317355807216224e-05, "loss": 2.4992, "step": 7359 }, { "epoch": 0.19383723992625757, "grad_norm": 3.1036911010742188, "learning_rate": 4.031603897814064e-05, "loss": 2.1746, "step": 7360 }, { "epoch": 0.19386357650776928, "grad_norm": 7.545497417449951, "learning_rate": 4.0314722149065055e-05, "loss": 1.3802, "step": 7361 }, { "epoch": 0.19388991308928102, "grad_norm": 2.495110511779785, "learning_rate": 4.0313405319989464e-05, "loss": 1.4182, "step": 7362 }, { "epoch": 0.19391624967079274, "grad_norm": 2.202874183654785, "learning_rate": 4.031208849091388e-05, "loss": 1.9828, "step": 7363 }, { "epoch": 0.19394258625230446, "grad_norm": 1.6390376091003418, "learning_rate": 4.0310771661838295e-05, "loss": 2.7577, "step": 7364 }, { "epoch": 0.19396892283381617, "grad_norm": 2.361163377761841, "learning_rate": 4.030945483276271e-05, "loss": 2.1492, "step": 7365 }, { "epoch": 0.1939952594153279, "grad_norm": 3.4130513668060303, "learning_rate": 4.0308138003687126e-05, "loss": 0.7838, "step": 7366 }, { "epoch": 0.1940215959968396, "grad_norm": 2.5923759937286377, "learning_rate": 4.0306821174611535e-05, "loss": 2.2555, "step": 7367 }, { "epoch": 0.19404793257835132, "grad_norm": 2.3452064990997314, "learning_rate": 4.030550434553595e-05, "loss": 1.9365, "step": 7368 }, { "epoch": 0.19407426915986306, "grad_norm": 2.118030548095703, "learning_rate": 4.0304187516460366e-05, "loss": 1.3735, "step": 7369 }, { "epoch": 0.19410060574137478, "grad_norm": 1.9401692152023315, "learning_rate": 4.030287068738478e-05, "loss": 1.5864, "step": 7370 }, { "epoch": 
0.1941269423228865, "grad_norm": 2.1829240322113037, "learning_rate": 4.030155385830919e-05, "loss": 1.7901, "step": 7371 }, { "epoch": 0.1941532789043982, "grad_norm": 1.7431983947753906, "learning_rate": 4.0300237029233606e-05, "loss": 1.653, "step": 7372 }, { "epoch": 0.19417961548590992, "grad_norm": 2.2366416454315186, "learning_rate": 4.029892020015802e-05, "loss": 1.4286, "step": 7373 }, { "epoch": 0.19420595206742164, "grad_norm": 4.20750617980957, "learning_rate": 4.029760337108244e-05, "loss": 1.6322, "step": 7374 }, { "epoch": 0.19423228864893338, "grad_norm": 1.7967501878738403, "learning_rate": 4.029628654200685e-05, "loss": 1.71, "step": 7375 }, { "epoch": 0.1942586252304451, "grad_norm": 1.6806516647338867, "learning_rate": 4.029496971293126e-05, "loss": 1.9395, "step": 7376 }, { "epoch": 0.1942849618119568, "grad_norm": 1.7641276121139526, "learning_rate": 4.029365288385568e-05, "loss": 1.8902, "step": 7377 }, { "epoch": 0.19431129839346853, "grad_norm": 4.335816860198975, "learning_rate": 4.0292336054780086e-05, "loss": 1.4072, "step": 7378 }, { "epoch": 0.19433763497498024, "grad_norm": 2.0037953853607178, "learning_rate": 4.029101922570451e-05, "loss": 1.7427, "step": 7379 }, { "epoch": 0.19436397155649196, "grad_norm": 4.062325954437256, "learning_rate": 4.028970239662892e-05, "loss": 1.2865, "step": 7380 }, { "epoch": 0.19439030813800368, "grad_norm": 4.657859802246094, "learning_rate": 4.028838556755333e-05, "loss": 1.484, "step": 7381 }, { "epoch": 0.19441664471951542, "grad_norm": 2.2243223190307617, "learning_rate": 4.028706873847775e-05, "loss": 1.552, "step": 7382 }, { "epoch": 0.19444298130102713, "grad_norm": 2.2360358238220215, "learning_rate": 4.0285751909402164e-05, "loss": 1.6119, "step": 7383 }, { "epoch": 0.19446931788253885, "grad_norm": 4.571104526519775, "learning_rate": 4.028443508032658e-05, "loss": 1.6022, "step": 7384 }, { "epoch": 0.19449565446405057, "grad_norm": 2.8746490478515625, "learning_rate": 4.028311825125099e-05, 
"loss": 1.3261, "step": 7385 }, { "epoch": 0.19452199104556228, "grad_norm": 2.1421756744384766, "learning_rate": 4.0281801422175404e-05, "loss": 1.5621, "step": 7386 }, { "epoch": 0.194548327627074, "grad_norm": 1.843734622001648, "learning_rate": 4.028048459309981e-05, "loss": 1.5633, "step": 7387 }, { "epoch": 0.1945746642085857, "grad_norm": 4.400885581970215, "learning_rate": 4.0279167764024235e-05, "loss": 1.0452, "step": 7388 }, { "epoch": 0.19460100079009746, "grad_norm": 2.5814390182495117, "learning_rate": 4.0277850934948644e-05, "loss": 1.5494, "step": 7389 }, { "epoch": 0.19462733737160917, "grad_norm": 2.7782809734344482, "learning_rate": 4.027653410587306e-05, "loss": 2.3513, "step": 7390 }, { "epoch": 0.1946536739531209, "grad_norm": 2.638397216796875, "learning_rate": 4.0275217276797475e-05, "loss": 0.6205, "step": 7391 }, { "epoch": 0.1946800105346326, "grad_norm": 1.8589435815811157, "learning_rate": 4.0273900447721884e-05, "loss": 1.2768, "step": 7392 }, { "epoch": 0.19470634711614432, "grad_norm": 1.9502047300338745, "learning_rate": 4.0272583618646307e-05, "loss": 2.1278, "step": 7393 }, { "epoch": 0.19473268369765603, "grad_norm": 1.6248949766159058, "learning_rate": 4.0271266789570715e-05, "loss": 1.6731, "step": 7394 }, { "epoch": 0.19475902027916778, "grad_norm": 1.7083396911621094, "learning_rate": 4.026994996049513e-05, "loss": 1.6324, "step": 7395 }, { "epoch": 0.1947853568606795, "grad_norm": 2.254757881164551, "learning_rate": 4.026863313141954e-05, "loss": 2.2385, "step": 7396 }, { "epoch": 0.1948116934421912, "grad_norm": 2.203688144683838, "learning_rate": 4.026731630234396e-05, "loss": 1.8163, "step": 7397 }, { "epoch": 0.19483803002370292, "grad_norm": 1.5879720449447632, "learning_rate": 4.026599947326837e-05, "loss": 1.5672, "step": 7398 }, { "epoch": 0.19486436660521464, "grad_norm": 4.591485023498535, "learning_rate": 4.026468264419279e-05, "loss": 2.2126, "step": 7399 }, { "epoch": 0.19489070318672636, "grad_norm": 
3.664841890335083, "learning_rate": 4.02633658151172e-05, "loss": 0.6928, "step": 7400 }, { "epoch": 0.19491703976823807, "grad_norm": 3.8438456058502197, "learning_rate": 4.026204898604161e-05, "loss": 1.3409, "step": 7401 }, { "epoch": 0.19494337634974981, "grad_norm": 1.617113709449768, "learning_rate": 4.0260732156966033e-05, "loss": 1.7714, "step": 7402 }, { "epoch": 0.19496971293126153, "grad_norm": 4.050449848175049, "learning_rate": 4.025941532789044e-05, "loss": 1.5317, "step": 7403 }, { "epoch": 0.19499604951277325, "grad_norm": 4.882868766784668, "learning_rate": 4.025809849881486e-05, "loss": 1.1551, "step": 7404 }, { "epoch": 0.19502238609428496, "grad_norm": 2.0997579097747803, "learning_rate": 4.025678166973927e-05, "loss": 0.6815, "step": 7405 }, { "epoch": 0.19504872267579668, "grad_norm": 2.240664482116699, "learning_rate": 4.025546484066368e-05, "loss": 2.0105, "step": 7406 }, { "epoch": 0.1950750592573084, "grad_norm": 2.197084426879883, "learning_rate": 4.02541480115881e-05, "loss": 1.0667, "step": 7407 }, { "epoch": 0.1951013958388201, "grad_norm": 4.773092746734619, "learning_rate": 4.0252831182512513e-05, "loss": 2.2518, "step": 7408 }, { "epoch": 0.19512773242033185, "grad_norm": 4.768918991088867, "learning_rate": 4.025151435343692e-05, "loss": 1.3712, "step": 7409 }, { "epoch": 0.19515406900184357, "grad_norm": 2.93772029876709, "learning_rate": 4.025019752436134e-05, "loss": 2.4711, "step": 7410 }, { "epoch": 0.19518040558335528, "grad_norm": 1.7906513214111328, "learning_rate": 4.0248880695285753e-05, "loss": 1.4001, "step": 7411 }, { "epoch": 0.195206742164867, "grad_norm": 3.617037296295166, "learning_rate": 4.024756386621017e-05, "loss": 1.9739, "step": 7412 }, { "epoch": 0.1952330787463787, "grad_norm": 1.9617981910705566, "learning_rate": 4.0246247037134585e-05, "loss": 0.5342, "step": 7413 }, { "epoch": 0.19525941532789043, "grad_norm": 3.30856990814209, "learning_rate": 4.0244930208058993e-05, "loss": 1.9601, "step": 7414 }, { 
"epoch": 0.19528575190940217, "grad_norm": 4.260293960571289, "learning_rate": 4.024361337898341e-05, "loss": 1.744, "step": 7415 }, { "epoch": 0.1953120884909139, "grad_norm": 1.68104088306427, "learning_rate": 4.0242296549907825e-05, "loss": 1.4665, "step": 7416 }, { "epoch": 0.1953384250724256, "grad_norm": 2.505425214767456, "learning_rate": 4.024097972083224e-05, "loss": 1.1959, "step": 7417 }, { "epoch": 0.19536476165393732, "grad_norm": 7.117185115814209, "learning_rate": 4.023966289175665e-05, "loss": 1.3316, "step": 7418 }, { "epoch": 0.19539109823544903, "grad_norm": 1.8270654678344727, "learning_rate": 4.0238346062681065e-05, "loss": 1.7066, "step": 7419 }, { "epoch": 0.19541743481696075, "grad_norm": 1.9719904661178589, "learning_rate": 4.023702923360548e-05, "loss": 2.1703, "step": 7420 }, { "epoch": 0.19544377139847247, "grad_norm": 3.1282644271850586, "learning_rate": 4.0235712404529896e-05, "loss": 1.3494, "step": 7421 }, { "epoch": 0.1954701079799842, "grad_norm": 4.2231340408325195, "learning_rate": 4.023439557545431e-05, "loss": 1.9168, "step": 7422 }, { "epoch": 0.19549644456149592, "grad_norm": 3.291358709335327, "learning_rate": 4.023307874637872e-05, "loss": 1.4818, "step": 7423 }, { "epoch": 0.19552278114300764, "grad_norm": 3.567249298095703, "learning_rate": 4.0231761917303136e-05, "loss": 1.9085, "step": 7424 }, { "epoch": 0.19554911772451936, "grad_norm": 3.560485601425171, "learning_rate": 4.0230445088227545e-05, "loss": 1.199, "step": 7425 }, { "epoch": 0.19557545430603107, "grad_norm": 2.3450729846954346, "learning_rate": 4.022912825915197e-05, "loss": 1.7006, "step": 7426 }, { "epoch": 0.1956017908875428, "grad_norm": 3.816150665283203, "learning_rate": 4.0227811430076376e-05, "loss": 1.1221, "step": 7427 }, { "epoch": 0.19562812746905453, "grad_norm": 2.1040356159210205, "learning_rate": 4.022649460100079e-05, "loss": 1.8347, "step": 7428 }, { "epoch": 0.19565446405056625, "grad_norm": 1.7644938230514526, "learning_rate": 
4.022517777192521e-05, "loss": 2.7302, "step": 7429 }, { "epoch": 0.19568080063207796, "grad_norm": 2.0935416221618652, "learning_rate": 4.022386094284962e-05, "loss": 2.4944, "step": 7430 }, { "epoch": 0.19570713721358968, "grad_norm": 6.663039684295654, "learning_rate": 4.022254411377404e-05, "loss": 2.3264, "step": 7431 }, { "epoch": 0.1957334737951014, "grad_norm": 4.065506935119629, "learning_rate": 4.022122728469845e-05, "loss": 1.9679, "step": 7432 }, { "epoch": 0.1957598103766131, "grad_norm": 1.5186145305633545, "learning_rate": 4.021991045562286e-05, "loss": 1.8782, "step": 7433 }, { "epoch": 0.19578614695812482, "grad_norm": 2.1323468685150146, "learning_rate": 4.021859362654727e-05, "loss": 1.5221, "step": 7434 }, { "epoch": 0.19581248353963657, "grad_norm": 2.432612419128418, "learning_rate": 4.0217276797471694e-05, "loss": 1.9125, "step": 7435 }, { "epoch": 0.19583882012114828, "grad_norm": 5.779027938842773, "learning_rate": 4.02159599683961e-05, "loss": 1.1144, "step": 7436 }, { "epoch": 0.19586515670266, "grad_norm": 1.8980718851089478, "learning_rate": 4.021464313932052e-05, "loss": 1.9799, "step": 7437 }, { "epoch": 0.1958914932841717, "grad_norm": 4.200015068054199, "learning_rate": 4.0213326310244934e-05, "loss": 1.3378, "step": 7438 }, { "epoch": 0.19591782986568343, "grad_norm": 8.51160717010498, "learning_rate": 4.021200948116934e-05, "loss": 1.6298, "step": 7439 }, { "epoch": 0.19594416644719514, "grad_norm": 1.7395036220550537, "learning_rate": 4.0210692652093765e-05, "loss": 2.0194, "step": 7440 }, { "epoch": 0.19597050302870686, "grad_norm": 1.6162654161453247, "learning_rate": 4.0209375823018174e-05, "loss": 1.6401, "step": 7441 }, { "epoch": 0.1959968396102186, "grad_norm": 1.832571029663086, "learning_rate": 4.020805899394259e-05, "loss": 2.0474, "step": 7442 }, { "epoch": 0.19602317619173032, "grad_norm": 2.5724527835845947, "learning_rate": 4.0206742164867e-05, "loss": 1.201, "step": 7443 }, { "epoch": 0.19604951277324204, 
"grad_norm": 2.5892958641052246, "learning_rate": 4.0205425335791414e-05, "loss": 1.9161, "step": 7444 }, { "epoch": 0.19607584935475375, "grad_norm": 2.293693780899048, "learning_rate": 4.020410850671583e-05, "loss": 0.3135, "step": 7445 }, { "epoch": 0.19610218593626547, "grad_norm": 1.8297778367996216, "learning_rate": 4.0202791677640245e-05, "loss": 1.5035, "step": 7446 }, { "epoch": 0.19612852251777718, "grad_norm": 2.4194862842559814, "learning_rate": 4.020147484856466e-05, "loss": 0.5079, "step": 7447 }, { "epoch": 0.19615485909928893, "grad_norm": 1.869683027267456, "learning_rate": 4.020015801948907e-05, "loss": 1.7675, "step": 7448 }, { "epoch": 0.19618119568080064, "grad_norm": 2.2527916431427, "learning_rate": 4.019884119041349e-05, "loss": 1.8835, "step": 7449 }, { "epoch": 0.19620753226231236, "grad_norm": 2.494058132171631, "learning_rate": 4.01975243613379e-05, "loss": 1.8796, "step": 7450 }, { "epoch": 0.19623386884382407, "grad_norm": 2.0544040203094482, "learning_rate": 4.0196207532262316e-05, "loss": 2.1759, "step": 7451 }, { "epoch": 0.1962602054253358, "grad_norm": 3.9496214389801025, "learning_rate": 4.0194890703186725e-05, "loss": 1.4214, "step": 7452 }, { "epoch": 0.1962865420068475, "grad_norm": 1.7092251777648926, "learning_rate": 4.019357387411114e-05, "loss": 0.5373, "step": 7453 }, { "epoch": 0.19631287858835922, "grad_norm": 2.4389095306396484, "learning_rate": 4.0192257045035556e-05, "loss": 1.4949, "step": 7454 }, { "epoch": 0.19633921516987096, "grad_norm": 1.9976189136505127, "learning_rate": 4.019094021595997e-05, "loss": 1.69, "step": 7455 }, { "epoch": 0.19636555175138268, "grad_norm": 1.8615301847457886, "learning_rate": 4.018962338688438e-05, "loss": 1.7446, "step": 7456 }, { "epoch": 0.1963918883328944, "grad_norm": 2.5110418796539307, "learning_rate": 4.0188306557808796e-05, "loss": 2.0329, "step": 7457 }, { "epoch": 0.1964182249144061, "grad_norm": 2.4457669258117676, "learning_rate": 4.018698972873321e-05, "loss": 1.9296, 
"step": 7458 }, { "epoch": 0.19644456149591782, "grad_norm": 2.5045831203460693, "learning_rate": 4.018567289965763e-05, "loss": 1.1637, "step": 7459 }, { "epoch": 0.19647089807742954, "grad_norm": 1.568500280380249, "learning_rate": 4.018435607058204e-05, "loss": 2.1627, "step": 7460 }, { "epoch": 0.19649723465894128, "grad_norm": 1.9266376495361328, "learning_rate": 4.018303924150645e-05, "loss": 2.2608, "step": 7461 }, { "epoch": 0.196523571240453, "grad_norm": 1.9290324449539185, "learning_rate": 4.018172241243087e-05, "loss": 1.8411, "step": 7462 }, { "epoch": 0.19654990782196471, "grad_norm": 4.751797676086426, "learning_rate": 4.018040558335528e-05, "loss": 2.4956, "step": 7463 }, { "epoch": 0.19657624440347643, "grad_norm": 2.8945770263671875, "learning_rate": 4.01790887542797e-05, "loss": 1.6411, "step": 7464 }, { "epoch": 0.19660258098498815, "grad_norm": 2.223255157470703, "learning_rate": 4.017777192520411e-05, "loss": 2.4477, "step": 7465 }, { "epoch": 0.19662891756649986, "grad_norm": 2.0027096271514893, "learning_rate": 4.017645509612852e-05, "loss": 1.3819, "step": 7466 }, { "epoch": 0.19665525414801158, "grad_norm": 2.9613234996795654, "learning_rate": 4.017513826705294e-05, "loss": 0.9618, "step": 7467 }, { "epoch": 0.19668159072952332, "grad_norm": 2.3709774017333984, "learning_rate": 4.0173821437977354e-05, "loss": 0.9803, "step": 7468 }, { "epoch": 0.19670792731103504, "grad_norm": 4.378414154052734, "learning_rate": 4.017250460890177e-05, "loss": 1.1307, "step": 7469 }, { "epoch": 0.19673426389254675, "grad_norm": 1.6801087856292725, "learning_rate": 4.017118777982618e-05, "loss": 1.7372, "step": 7470 }, { "epoch": 0.19676060047405847, "grad_norm": 5.7648539543151855, "learning_rate": 4.0169870950750594e-05, "loss": 1.9151, "step": 7471 }, { "epoch": 0.19678693705557018, "grad_norm": 4.914088249206543, "learning_rate": 4.0168554121675e-05, "loss": 1.2514, "step": 7472 }, { "epoch": 0.1968132736370819, "grad_norm": 2.4787395000457764, 
"learning_rate": 4.0167237292599426e-05, "loss": 1.3829, "step": 7473 }, { "epoch": 0.1968396102185936, "grad_norm": 3.0546462535858154, "learning_rate": 4.0165920463523834e-05, "loss": 2.0835, "step": 7474 }, { "epoch": 0.19686594680010536, "grad_norm": 1.8343952894210815, "learning_rate": 4.016460363444825e-05, "loss": 2.1522, "step": 7475 }, { "epoch": 0.19689228338161707, "grad_norm": 1.6831419467926025, "learning_rate": 4.0163286805372666e-05, "loss": 1.9958, "step": 7476 }, { "epoch": 0.1969186199631288, "grad_norm": 3.2217142581939697, "learning_rate": 4.0161969976297074e-05, "loss": 1.5006, "step": 7477 }, { "epoch": 0.1969449565446405, "grad_norm": 3.7038283348083496, "learning_rate": 4.01606531472215e-05, "loss": 0.662, "step": 7478 }, { "epoch": 0.19697129312615222, "grad_norm": 1.6220823526382446, "learning_rate": 4.0159336318145906e-05, "loss": 2.3064, "step": 7479 }, { "epoch": 0.19699762970766393, "grad_norm": 3.2559425830841064, "learning_rate": 4.015801948907032e-05, "loss": 1.7083, "step": 7480 }, { "epoch": 0.19702396628917568, "grad_norm": 3.1635656356811523, "learning_rate": 4.015670265999473e-05, "loss": 1.559, "step": 7481 }, { "epoch": 0.1970503028706874, "grad_norm": 1.976241111755371, "learning_rate": 4.015538583091915e-05, "loss": 1.5995, "step": 7482 }, { "epoch": 0.1970766394521991, "grad_norm": NaN, "learning_rate": 4.015538583091915e-05, "loss": 2.5233, "step": 7483 }, { "epoch": 0.19710297603371082, "grad_norm": 3.5263054370880127, "learning_rate": 4.015406900184356e-05, "loss": 1.7435, "step": 7484 }, { "epoch": 0.19712931261522254, "grad_norm": 5.218058109283447, "learning_rate": 4.015275217276798e-05, "loss": 0.985, "step": 7485 }, { "epoch": 0.19715564919673426, "grad_norm": 4.067135810852051, "learning_rate": 4.015143534369239e-05, "loss": 1.7492, "step": 7486 }, { "epoch": 0.19718198577824597, "grad_norm": 2.8353230953216553, "learning_rate": 4.01501185146168e-05, "loss": 0.9261, "step": 7487 }, { "epoch": 0.19720832235975772, 
"grad_norm": 2.1453819274902344, "learning_rate": 4.0148801685541224e-05, "loss": 1.7639, "step": 7488 }, { "epoch": 0.19723465894126943, "grad_norm": 2.8541440963745117, "learning_rate": 4.014748485646563e-05, "loss": 1.1455, "step": 7489 }, { "epoch": 0.19726099552278115, "grad_norm": 2.0490052700042725, "learning_rate": 4.014616802739005e-05, "loss": 2.527, "step": 7490 }, { "epoch": 0.19728733210429286, "grad_norm": 1.9368599653244019, "learning_rate": 4.014485119831446e-05, "loss": 2.3578, "step": 7491 }, { "epoch": 0.19731366868580458, "grad_norm": 2.8101859092712402, "learning_rate": 4.014353436923887e-05, "loss": 1.3551, "step": 7492 }, { "epoch": 0.1973400052673163, "grad_norm": 3.3241987228393555, "learning_rate": 4.014221754016329e-05, "loss": 0.9608, "step": 7493 }, { "epoch": 0.197366341848828, "grad_norm": 1.8203599452972412, "learning_rate": 4.0140900711087704e-05, "loss": 2.0779, "step": 7494 }, { "epoch": 0.19739267843033975, "grad_norm": 2.1838648319244385, "learning_rate": 4.013958388201212e-05, "loss": 1.6494, "step": 7495 }, { "epoch": 0.19741901501185147, "grad_norm": 2.1721134185791016, "learning_rate": 4.013826705293653e-05, "loss": 1.6743, "step": 7496 }, { "epoch": 0.19744535159336318, "grad_norm": 1.8995773792266846, "learning_rate": 4.013695022386095e-05, "loss": 1.5751, "step": 7497 }, { "epoch": 0.1974716881748749, "grad_norm": 2.529229164123535, "learning_rate": 4.013563339478536e-05, "loss": 2.0341, "step": 7498 }, { "epoch": 0.19749802475638661, "grad_norm": 7.55031681060791, "learning_rate": 4.0134316565709775e-05, "loss": 1.9435, "step": 7499 }, { "epoch": 0.19752436133789833, "grad_norm": 7.909158229827881, "learning_rate": 4.0132999736634184e-05, "loss": 1.9976, "step": 7500 }, { "epoch": 0.19755069791941007, "grad_norm": 2.531377077102661, "learning_rate": 4.01316829075586e-05, "loss": 1.0616, "step": 7501 }, { "epoch": 0.1975770345009218, "grad_norm": 3.3430471420288086, "learning_rate": 4.0130366078483015e-05, "loss": 1.5957, 
"step": 7502 }, { "epoch": 0.1976033710824335, "grad_norm": 2.2487452030181885, "learning_rate": 4.012904924940743e-05, "loss": 1.7767, "step": 7503 }, { "epoch": 0.19762970766394522, "grad_norm": 1.7168707847595215, "learning_rate": 4.0127732420331846e-05, "loss": 1.7068, "step": 7504 }, { "epoch": 0.19765604424545694, "grad_norm": 2.2454960346221924, "learning_rate": 4.0126415591256255e-05, "loss": 1.6414, "step": 7505 }, { "epoch": 0.19768238082696865, "grad_norm": 3.3308098316192627, "learning_rate": 4.012509876218067e-05, "loss": 1.9613, "step": 7506 }, { "epoch": 0.19770871740848037, "grad_norm": 1.918514370918274, "learning_rate": 4.0123781933105086e-05, "loss": 2.301, "step": 7507 }, { "epoch": 0.1977350539899921, "grad_norm": 1.9155902862548828, "learning_rate": 4.01224651040295e-05, "loss": 1.7896, "step": 7508 }, { "epoch": 0.19776139057150383, "grad_norm": 1.9534783363342285, "learning_rate": 4.012114827495391e-05, "loss": 1.7568, "step": 7509 }, { "epoch": 0.19778772715301554, "grad_norm": 1.777553915977478, "learning_rate": 4.0119831445878326e-05, "loss": 0.297, "step": 7510 }, { "epoch": 0.19781406373452726, "grad_norm": 2.4240622520446777, "learning_rate": 4.011851461680274e-05, "loss": 1.4927, "step": 7511 }, { "epoch": 0.19784040031603897, "grad_norm": 9.307279586791992, "learning_rate": 4.011719778772716e-05, "loss": 1.2356, "step": 7512 }, { "epoch": 0.1978667368975507, "grad_norm": 2.1926536560058594, "learning_rate": 4.0115880958651566e-05, "loss": 1.9209, "step": 7513 }, { "epoch": 0.19789307347906243, "grad_norm": 4.81790018081665, "learning_rate": 4.011456412957598e-05, "loss": 1.8867, "step": 7514 }, { "epoch": 0.19791941006057415, "grad_norm": 1.858687162399292, "learning_rate": 4.01132473005004e-05, "loss": 2.5035, "step": 7515 }, { "epoch": 0.19794574664208586, "grad_norm": 1.717914342880249, "learning_rate": 4.011193047142481e-05, "loss": 1.411, "step": 7516 }, { "epoch": 0.19797208322359758, "grad_norm": 1.7054836750030518, 
"learning_rate": 4.011061364234923e-05, "loss": 1.8476, "step": 7517 }, { "epoch": 0.1979984198051093, "grad_norm": 3.8859050273895264, "learning_rate": 4.010929681327364e-05, "loss": 0.9733, "step": 7518 }, { "epoch": 0.198024756386621, "grad_norm": 2.1310760974884033, "learning_rate": 4.010797998419805e-05, "loss": 1.5537, "step": 7519 }, { "epoch": 0.19805109296813272, "grad_norm": 4.281438827514648, "learning_rate": 4.010666315512246e-05, "loss": 1.502, "step": 7520 }, { "epoch": 0.19807742954964447, "grad_norm": 2.4652278423309326, "learning_rate": 4.0105346326046884e-05, "loss": 1.9364, "step": 7521 }, { "epoch": 0.19810376613115618, "grad_norm": 1.8965740203857422, "learning_rate": 4.010402949697129e-05, "loss": 1.3641, "step": 7522 }, { "epoch": 0.1981301027126679, "grad_norm": 4.128808498382568, "learning_rate": 4.010271266789571e-05, "loss": 1.1411, "step": 7523 }, { "epoch": 0.19815643929417961, "grad_norm": 1.7077562808990479, "learning_rate": 4.0101395838820124e-05, "loss": 1.8364, "step": 7524 }, { "epoch": 0.19818277587569133, "grad_norm": 4.297139644622803, "learning_rate": 4.010007900974453e-05, "loss": 1.7296, "step": 7525 }, { "epoch": 0.19820911245720305, "grad_norm": 2.6248910427093506, "learning_rate": 4.0098762180668955e-05, "loss": 1.7391, "step": 7526 }, { "epoch": 0.19823544903871476, "grad_norm": 1.6063523292541504, "learning_rate": 4.0097445351593364e-05, "loss": 0.7346, "step": 7527 }, { "epoch": 0.1982617856202265, "grad_norm": 2.1467947959899902, "learning_rate": 4.009612852251778e-05, "loss": 1.827, "step": 7528 }, { "epoch": 0.19828812220173822, "grad_norm": 1.945920467376709, "learning_rate": 4.009481169344219e-05, "loss": 1.9262, "step": 7529 }, { "epoch": 0.19831445878324994, "grad_norm": 1.5966516733169556, "learning_rate": 4.009349486436661e-05, "loss": 1.6245, "step": 7530 }, { "epoch": 0.19834079536476165, "grad_norm": 3.683802604675293, "learning_rate": 4.009217803529102e-05, "loss": 1.8138, "step": 7531 }, { "epoch": 
0.19836713194627337, "grad_norm": 4.670269012451172, "learning_rate": 4.0090861206215435e-05, "loss": 1.1864, "step": 7532 }, { "epoch": 0.19839346852778508, "grad_norm": 2.1269919872283936, "learning_rate": 4.008954437713985e-05, "loss": 1.9501, "step": 7533 }, { "epoch": 0.19841980510929683, "grad_norm": 2.673978328704834, "learning_rate": 4.008822754806426e-05, "loss": 1.3698, "step": 7534 }, { "epoch": 0.19844614169080854, "grad_norm": 3.6344099044799805, "learning_rate": 4.008691071898868e-05, "loss": 0.2323, "step": 7535 }, { "epoch": 0.19847247827232026, "grad_norm": 3.196615219116211, "learning_rate": 4.008559388991309e-05, "loss": 1.5375, "step": 7536 }, { "epoch": 0.19849881485383197, "grad_norm": 1.5741772651672363, "learning_rate": 4.008427706083751e-05, "loss": 1.9346, "step": 7537 }, { "epoch": 0.1985251514353437, "grad_norm": 1.7590272426605225, "learning_rate": 4.0082960231761915e-05, "loss": 1.8614, "step": 7538 }, { "epoch": 0.1985514880168554, "grad_norm": 1.7377195358276367, "learning_rate": 4.008164340268633e-05, "loss": 0.7918, "step": 7539 }, { "epoch": 0.19857782459836712, "grad_norm": 3.493013858795166, "learning_rate": 4.008032657361075e-05, "loss": 1.7633, "step": 7540 }, { "epoch": 0.19860416117987886, "grad_norm": 2.4719555377960205, "learning_rate": 4.007900974453516e-05, "loss": 0.4619, "step": 7541 }, { "epoch": 0.19863049776139058, "grad_norm": 1.7563979625701904, "learning_rate": 4.007769291545958e-05, "loss": 1.9729, "step": 7542 }, { "epoch": 0.1986568343429023, "grad_norm": 5.096905708312988, "learning_rate": 4.007637608638399e-05, "loss": 0.8012, "step": 7543 }, { "epoch": 0.198683170924414, "grad_norm": 2.1098737716674805, "learning_rate": 4.007505925730841e-05, "loss": 0.9153, "step": 7544 }, { "epoch": 0.19870950750592573, "grad_norm": 1.6241297721862793, "learning_rate": 4.007374242823282e-05, "loss": 1.1565, "step": 7545 }, { "epoch": 0.19873584408743744, "grad_norm": 2.1424388885498047, "learning_rate": 
4.0072425599157233e-05, "loss": 0.8252, "step": 7546 }, { "epoch": 0.19876218066894916, "grad_norm": 2.1336257457733154, "learning_rate": 4.007110877008164e-05, "loss": 1.5883, "step": 7547 }, { "epoch": 0.1987885172504609, "grad_norm": 1.9175602197647095, "learning_rate": 4.006979194100606e-05, "loss": 1.7617, "step": 7548 }, { "epoch": 0.19881485383197262, "grad_norm": 2.0442330837249756, "learning_rate": 4.0068475111930474e-05, "loss": 2.5846, "step": 7549 }, { "epoch": 0.19884119041348433, "grad_norm": 5.907866954803467, "learning_rate": 4.006715828285489e-05, "loss": 0.3052, "step": 7550 }, { "epoch": 0.19886752699499605, "grad_norm": 2.0236098766326904, "learning_rate": 4.0065841453779305e-05, "loss": 1.7809, "step": 7551 }, { "epoch": 0.19889386357650776, "grad_norm": 1.7944226264953613, "learning_rate": 4.0064524624703714e-05, "loss": 1.6861, "step": 7552 }, { "epoch": 0.19892020015801948, "grad_norm": 2.323636054992676, "learning_rate": 4.006320779562813e-05, "loss": 1.1197, "step": 7553 }, { "epoch": 0.19894653673953122, "grad_norm": 3.2054295539855957, "learning_rate": 4.0061890966552545e-05, "loss": 1.1433, "step": 7554 }, { "epoch": 0.19897287332104294, "grad_norm": 2.9711010456085205, "learning_rate": 4.006057413747696e-05, "loss": 0.8065, "step": 7555 }, { "epoch": 0.19899920990255465, "grad_norm": 1.9916502237319946, "learning_rate": 4.005925730840137e-05, "loss": 1.8929, "step": 7556 }, { "epoch": 0.19902554648406637, "grad_norm": 3.20845365524292, "learning_rate": 4.0057940479325785e-05, "loss": 0.5068, "step": 7557 }, { "epoch": 0.19905188306557808, "grad_norm": 2.848965644836426, "learning_rate": 4.0056623650250194e-05, "loss": 2.1222, "step": 7558 }, { "epoch": 0.1990782196470898, "grad_norm": 2.531967878341675, "learning_rate": 4.0055306821174616e-05, "loss": 1.8606, "step": 7559 }, { "epoch": 0.19910455622860151, "grad_norm": 2.262629270553589, "learning_rate": 4.0053989992099025e-05, "loss": 1.6576, "step": 7560 }, { "epoch": 
0.19913089281011326, "grad_norm": 3.4171698093414307, "learning_rate": 4.005267316302344e-05, "loss": 1.3773, "step": 7561 }, { "epoch": 0.19915722939162497, "grad_norm": 7.297581195831299, "learning_rate": 4.0051356333947856e-05, "loss": 1.4364, "step": 7562 }, { "epoch": 0.1991835659731367, "grad_norm": 2.33355975151062, "learning_rate": 4.005003950487227e-05, "loss": 1.3524, "step": 7563 }, { "epoch": 0.1992099025546484, "grad_norm": 3.3187313079833984, "learning_rate": 4.004872267579669e-05, "loss": 1.369, "step": 7564 }, { "epoch": 0.19923623913616012, "grad_norm": 3.691459894180298, "learning_rate": 4.0047405846721096e-05, "loss": 1.2118, "step": 7565 }, { "epoch": 0.19926257571767184, "grad_norm": 1.7060962915420532, "learning_rate": 4.004608901764551e-05, "loss": 1.9935, "step": 7566 }, { "epoch": 0.19928891229918358, "grad_norm": 6.282314300537109, "learning_rate": 4.004477218856992e-05, "loss": 1.4531, "step": 7567 }, { "epoch": 0.1993152488806953, "grad_norm": 2.377469778060913, "learning_rate": 4.004345535949434e-05, "loss": 2.0445, "step": 7568 }, { "epoch": 0.199341585462207, "grad_norm": 2.4734559059143066, "learning_rate": 4.004213853041875e-05, "loss": 1.8534, "step": 7569 }, { "epoch": 0.19936792204371873, "grad_norm": 2.94069504737854, "learning_rate": 4.004082170134317e-05, "loss": 2.2641, "step": 7570 }, { "epoch": 0.19939425862523044, "grad_norm": 2.2741403579711914, "learning_rate": 4.003950487226758e-05, "loss": 2.3334, "step": 7571 }, { "epoch": 0.19942059520674216, "grad_norm": 2.0813348293304443, "learning_rate": 4.003818804319199e-05, "loss": 1.7502, "step": 7572 }, { "epoch": 0.19944693178825387, "grad_norm": 1.5913808345794678, "learning_rate": 4.0036871214116414e-05, "loss": 1.7071, "step": 7573 }, { "epoch": 0.19947326836976562, "grad_norm": 2.7190074920654297, "learning_rate": 4.003555438504082e-05, "loss": 2.4468, "step": 7574 }, { "epoch": 0.19949960495127733, "grad_norm": 1.7631863355636597, "learning_rate": 
4.003423755596524e-05, "loss": 1.2718, "step": 7575 }, { "epoch": 0.19952594153278905, "grad_norm": 1.6568305492401123, "learning_rate": 4.003292072688965e-05, "loss": 1.812, "step": 7576 }, { "epoch": 0.19955227811430076, "grad_norm": 2.389723777770996, "learning_rate": 4.003160389781407e-05, "loss": 1.7941, "step": 7577 }, { "epoch": 0.19957861469581248, "grad_norm": 3.206573963165283, "learning_rate": 4.003028706873848e-05, "loss": 1.3594, "step": 7578 }, { "epoch": 0.1996049512773242, "grad_norm": 2.3536691665649414, "learning_rate": 4.0028970239662894e-05, "loss": 1.8424, "step": 7579 }, { "epoch": 0.1996312878588359, "grad_norm": 2.3670692443847656, "learning_rate": 4.002765341058731e-05, "loss": 1.9065, "step": 7580 }, { "epoch": 0.19965762444034765, "grad_norm": 3.001194953918457, "learning_rate": 4.002633658151172e-05, "loss": 1.7191, "step": 7581 }, { "epoch": 0.19968396102185937, "grad_norm": 2.1872239112854004, "learning_rate": 4.002501975243614e-05, "loss": 2.6511, "step": 7582 }, { "epoch": 0.19971029760337108, "grad_norm": 1.8038225173950195, "learning_rate": 4.002370292336055e-05, "loss": 2.0818, "step": 7583 }, { "epoch": 0.1997366341848828, "grad_norm": 2.3791491985321045, "learning_rate": 4.0022386094284965e-05, "loss": 1.5658, "step": 7584 }, { "epoch": 0.19976297076639452, "grad_norm": 2.512356996536255, "learning_rate": 4.0021069265209374e-05, "loss": 0.212, "step": 7585 }, { "epoch": 0.19978930734790623, "grad_norm": 2.272510051727295, "learning_rate": 4.001975243613379e-05, "loss": 1.697, "step": 7586 }, { "epoch": 0.19981564392941797, "grad_norm": 1.9434016942977905, "learning_rate": 4.0018435607058205e-05, "loss": 2.2373, "step": 7587 }, { "epoch": 0.1998419805109297, "grad_norm": 1.732528567314148, "learning_rate": 4.001711877798262e-05, "loss": 1.8638, "step": 7588 }, { "epoch": 0.1998683170924414, "grad_norm": 3.1505141258239746, "learning_rate": 4.0015801948907036e-05, "loss": 2.2199, "step": 7589 }, { "epoch": 0.19989465367395312, 
"grad_norm": 1.6942317485809326, "learning_rate": 4.0014485119831445e-05, "loss": 1.828, "step": 7590 }, { "epoch": 0.19992099025546484, "grad_norm": 1.985947847366333, "learning_rate": 4.001316829075586e-05, "loss": 1.9926, "step": 7591 }, { "epoch": 0.19994732683697655, "grad_norm": 2.9298174381256104, "learning_rate": 4.0011851461680276e-05, "loss": 1.153, "step": 7592 }, { "epoch": 0.19997366341848827, "grad_norm": 1.9928040504455566, "learning_rate": 4.001053463260469e-05, "loss": 1.9263, "step": 7593 }, { "epoch": 0.2, "grad_norm": 1.906722068786621, "learning_rate": 4.00092178035291e-05, "loss": 2.216, "step": 7594 }, { "epoch": 0.20002633658151173, "grad_norm": 2.406498432159424, "learning_rate": 4.0007900974453516e-05, "loss": 1.336, "step": 7595 }, { "epoch": 0.20005267316302344, "grad_norm": 4.151043891906738, "learning_rate": 4.000658414537793e-05, "loss": 0.9129, "step": 7596 }, { "epoch": 0.20007900974453516, "grad_norm": 2.00247859954834, "learning_rate": 4.000526731630235e-05, "loss": 1.4707, "step": 7597 }, { "epoch": 0.20010534632604687, "grad_norm": 3.530992031097412, "learning_rate": 4.000395048722676e-05, "loss": 0.7884, "step": 7598 }, { "epoch": 0.2001316829075586, "grad_norm": 1.859633445739746, "learning_rate": 4.000263365815117e-05, "loss": 1.3441, "step": 7599 }, { "epoch": 0.20015801948907033, "grad_norm": 2.1075634956359863, "learning_rate": 4.000131682907559e-05, "loss": 1.915, "step": 7600 }, { "epoch": 0.20018435607058205, "grad_norm": 1.7875258922576904, "learning_rate": 4e-05, "loss": 2.1025, "step": 7601 }, { "epoch": 0.20021069265209376, "grad_norm": 2.694178581237793, "learning_rate": 3.999868317092442e-05, "loss": 2.1537, "step": 7602 }, { "epoch": 0.20023702923360548, "grad_norm": 1.946129322052002, "learning_rate": 3.999736634184883e-05, "loss": 1.8532, "step": 7603 }, { "epoch": 0.2002633658151172, "grad_norm": 2.714689254760742, "learning_rate": 3.999604951277324e-05, "loss": 1.7262, "step": 7604 }, { "epoch": 
0.2002897023966289, "grad_norm": 2.034921169281006, "learning_rate": 3.999473268369765e-05, "loss": 2.065, "step": 7605 }, { "epoch": 0.20031603897814063, "grad_norm": 2.1698575019836426, "learning_rate": 3.9993415854622074e-05, "loss": 1.7837, "step": 7606 }, { "epoch": 0.20034237555965237, "grad_norm": 1.7990353107452393, "learning_rate": 3.999209902554648e-05, "loss": 1.7209, "step": 7607 }, { "epoch": 0.20036871214116408, "grad_norm": 3.398479461669922, "learning_rate": 3.99907821964709e-05, "loss": 1.4449, "step": 7608 }, { "epoch": 0.2003950487226758, "grad_norm": 1.9705686569213867, "learning_rate": 3.9989465367395315e-05, "loss": 1.5597, "step": 7609 }, { "epoch": 0.20042138530418752, "grad_norm": 5.314675807952881, "learning_rate": 3.998814853831973e-05, "loss": 1.1518, "step": 7610 }, { "epoch": 0.20044772188569923, "grad_norm": 4.319832801818848, "learning_rate": 3.9986831709244146e-05, "loss": 2.132, "step": 7611 }, { "epoch": 0.20047405846721095, "grad_norm": 1.8207014799118042, "learning_rate": 3.9985514880168555e-05, "loss": 1.807, "step": 7612 }, { "epoch": 0.20050039504872266, "grad_norm": 2.74749493598938, "learning_rate": 3.998419805109297e-05, "loss": 2.2016, "step": 7613 }, { "epoch": 0.2005267316302344, "grad_norm": 2.00268816947937, "learning_rate": 3.998288122201738e-05, "loss": 1.9807, "step": 7614 }, { "epoch": 0.20055306821174612, "grad_norm": 1.9017468690872192, "learning_rate": 3.99815643929418e-05, "loss": 1.917, "step": 7615 }, { "epoch": 0.20057940479325784, "grad_norm": 1.6493947505950928, "learning_rate": 3.998024756386621e-05, "loss": 1.3457, "step": 7616 }, { "epoch": 0.20060574137476955, "grad_norm": 1.6002165079116821, "learning_rate": 3.9978930734790626e-05, "loss": 2.2266, "step": 7617 }, { "epoch": 0.20063207795628127, "grad_norm": 3.303194522857666, "learning_rate": 3.997761390571504e-05, "loss": 2.2041, "step": 7618 }, { "epoch": 0.20065841453779298, "grad_norm": 3.5344648361206055, "learning_rate": 3.997629707663945e-05, 
"loss": 1.554, "step": 7619 }, { "epoch": 0.20068475111930473, "grad_norm": 4.115609169006348, "learning_rate": 3.997498024756387e-05, "loss": 2.4147, "step": 7620 }, { "epoch": 0.20071108770081644, "grad_norm": 2.985434055328369, "learning_rate": 3.997366341848828e-05, "loss": 0.844, "step": 7621 }, { "epoch": 0.20073742428232816, "grad_norm": 1.9391499757766724, "learning_rate": 3.99723465894127e-05, "loss": 1.9651, "step": 7622 }, { "epoch": 0.20076376086383987, "grad_norm": 3.0145184993743896, "learning_rate": 3.9971029760337106e-05, "loss": 1.1077, "step": 7623 }, { "epoch": 0.2007900974453516, "grad_norm": 2.0803065299987793, "learning_rate": 3.996971293126152e-05, "loss": 2.4723, "step": 7624 }, { "epoch": 0.2008164340268633, "grad_norm": 3.519578456878662, "learning_rate": 3.996839610218594e-05, "loss": 1.2927, "step": 7625 }, { "epoch": 0.20084277060837502, "grad_norm": 2.8513505458831787, "learning_rate": 3.996707927311035e-05, "loss": 0.4603, "step": 7626 }, { "epoch": 0.20086910718988676, "grad_norm": 1.9014256000518799, "learning_rate": 3.996576244403477e-05, "loss": 1.816, "step": 7627 }, { "epoch": 0.20089544377139848, "grad_norm": 1.4538400173187256, "learning_rate": 3.996444561495918e-05, "loss": 1.4964, "step": 7628 }, { "epoch": 0.2009217803529102, "grad_norm": 1.9181309938430786, "learning_rate": 3.99631287858836e-05, "loss": 1.6319, "step": 7629 }, { "epoch": 0.2009481169344219, "grad_norm": 2.9189956188201904, "learning_rate": 3.996181195680801e-05, "loss": 2.1589, "step": 7630 }, { "epoch": 0.20097445351593363, "grad_norm": 2.1787703037261963, "learning_rate": 3.9960495127732424e-05, "loss": 1.8131, "step": 7631 }, { "epoch": 0.20100079009744534, "grad_norm": 3.29821515083313, "learning_rate": 3.995917829865683e-05, "loss": 1.8631, "step": 7632 }, { "epoch": 0.20102712667895706, "grad_norm": 2.814399480819702, "learning_rate": 3.995786146958125e-05, "loss": 0.7122, "step": 7633 }, { "epoch": 0.2010534632604688, "grad_norm": 
2.0861523151397705, "learning_rate": 3.9956544640505664e-05, "loss": 2.0949, "step": 7634 }, { "epoch": 0.20107979984198052, "grad_norm": 2.0584568977355957, "learning_rate": 3.995522781143008e-05, "loss": 1.3592, "step": 7635 }, { "epoch": 0.20110613642349223, "grad_norm": 2.8228025436401367, "learning_rate": 3.9953910982354495e-05, "loss": 1.5879, "step": 7636 }, { "epoch": 0.20113247300500395, "grad_norm": 3.7035892009735107, "learning_rate": 3.9952594153278904e-05, "loss": 1.6861, "step": 7637 }, { "epoch": 0.20115880958651566, "grad_norm": 2.04569149017334, "learning_rate": 3.995127732420332e-05, "loss": 0.3637, "step": 7638 }, { "epoch": 0.20118514616802738, "grad_norm": 2.5820789337158203, "learning_rate": 3.9949960495127735e-05, "loss": 1.9954, "step": 7639 }, { "epoch": 0.20121148274953912, "grad_norm": 2.9092183113098145, "learning_rate": 3.994864366605215e-05, "loss": 2.0456, "step": 7640 }, { "epoch": 0.20123781933105084, "grad_norm": 2.382683038711548, "learning_rate": 3.994732683697656e-05, "loss": 0.9198, "step": 7641 }, { "epoch": 0.20126415591256255, "grad_norm": 2.3362839221954346, "learning_rate": 3.9946010007900975e-05, "loss": 2.4442, "step": 7642 }, { "epoch": 0.20129049249407427, "grad_norm": 2.193953275680542, "learning_rate": 3.994469317882539e-05, "loss": 0.476, "step": 7643 }, { "epoch": 0.20131682907558598, "grad_norm": 2.4420206546783447, "learning_rate": 3.9943376349749806e-05, "loss": 1.7504, "step": 7644 }, { "epoch": 0.2013431656570977, "grad_norm": 2.1141293048858643, "learning_rate": 3.994205952067422e-05, "loss": 1.151, "step": 7645 }, { "epoch": 0.20136950223860942, "grad_norm": 5.707983016967773, "learning_rate": 3.994074269159863e-05, "loss": 1.8366, "step": 7646 }, { "epoch": 0.20139583882012116, "grad_norm": 3.67509126663208, "learning_rate": 3.9939425862523046e-05, "loss": 1.8506, "step": 7647 }, { "epoch": 0.20142217540163287, "grad_norm": 2.4796736240386963, "learning_rate": 3.993810903344746e-05, "loss": 1.7999, "step": 
7648 }, { "epoch": 0.2014485119831446, "grad_norm": 1.6437618732452393, "learning_rate": 3.993679220437188e-05, "loss": 2.5569, "step": 7649 }, { "epoch": 0.2014748485646563, "grad_norm": 2.386502265930176, "learning_rate": 3.9935475375296286e-05, "loss": 1.4422, "step": 7650 }, { "epoch": 0.20150118514616802, "grad_norm": 1.9960157871246338, "learning_rate": 3.99341585462207e-05, "loss": 2.1284, "step": 7651 }, { "epoch": 0.20152752172767974, "grad_norm": 2.392915725708008, "learning_rate": 3.993284171714512e-05, "loss": 1.9481, "step": 7652 }, { "epoch": 0.20155385830919148, "grad_norm": 2.5523152351379395, "learning_rate": 3.993152488806953e-05, "loss": 2.005, "step": 7653 }, { "epoch": 0.2015801948907032, "grad_norm": 1.7755165100097656, "learning_rate": 3.993020805899395e-05, "loss": 2.0937, "step": 7654 }, { "epoch": 0.2016065314722149, "grad_norm": 1.5651800632476807, "learning_rate": 3.992889122991836e-05, "loss": 2.0953, "step": 7655 }, { "epoch": 0.20163286805372663, "grad_norm": 1.7528629302978516, "learning_rate": 3.992757440084277e-05, "loss": 2.5557, "step": 7656 }, { "epoch": 0.20165920463523834, "grad_norm": 2.3901150226593018, "learning_rate": 3.992625757176718e-05, "loss": 2.2694, "step": 7657 }, { "epoch": 0.20168554121675006, "grad_norm": 2.3250856399536133, "learning_rate": 3.9924940742691604e-05, "loss": 1.598, "step": 7658 }, { "epoch": 0.20171187779826177, "grad_norm": 1.883156657218933, "learning_rate": 3.992362391361601e-05, "loss": 1.973, "step": 7659 }, { "epoch": 0.20173821437977352, "grad_norm": 1.984703540802002, "learning_rate": 3.992230708454043e-05, "loss": 1.3489, "step": 7660 }, { "epoch": 0.20176455096128523, "grad_norm": 4.213223457336426, "learning_rate": 3.992099025546484e-05, "loss": 1.4312, "step": 7661 }, { "epoch": 0.20179088754279695, "grad_norm": 1.8153281211853027, "learning_rate": 3.991967342638926e-05, "loss": 1.92, "step": 7662 }, { "epoch": 0.20181722412430866, "grad_norm": 2.0963289737701416, "learning_rate": 
3.991835659731367e-05, "loss": 1.8755, "step": 7663 }, { "epoch": 0.20184356070582038, "grad_norm": 2.7666232585906982, "learning_rate": 3.9917039768238084e-05, "loss": 1.8736, "step": 7664 }, { "epoch": 0.2018698972873321, "grad_norm": 4.68382453918457, "learning_rate": 3.99157229391625e-05, "loss": 1.4773, "step": 7665 }, { "epoch": 0.2018962338688438, "grad_norm": 2.283851146697998, "learning_rate": 3.991440611008691e-05, "loss": 1.8943, "step": 7666 }, { "epoch": 0.20192257045035555, "grad_norm": 2.262950897216797, "learning_rate": 3.991308928101133e-05, "loss": 1.973, "step": 7667 }, { "epoch": 0.20194890703186727, "grad_norm": 1.9184603691101074, "learning_rate": 3.991177245193574e-05, "loss": 1.4776, "step": 7668 }, { "epoch": 0.20197524361337899, "grad_norm": 1.4635335206985474, "learning_rate": 3.9910455622860155e-05, "loss": 1.4053, "step": 7669 }, { "epoch": 0.2020015801948907, "grad_norm": 1.6465815305709839, "learning_rate": 3.9909138793784564e-05, "loss": 1.8378, "step": 7670 }, { "epoch": 0.20202791677640242, "grad_norm": 1.9513689279556274, "learning_rate": 3.990782196470898e-05, "loss": 1.1274, "step": 7671 }, { "epoch": 0.20205425335791413, "grad_norm": 4.086761474609375, "learning_rate": 3.9906505135633396e-05, "loss": 1.8083, "step": 7672 }, { "epoch": 0.20208058993942588, "grad_norm": 1.6377832889556885, "learning_rate": 3.990518830655781e-05, "loss": 1.4912, "step": 7673 }, { "epoch": 0.2021069265209376, "grad_norm": 1.731050729751587, "learning_rate": 3.990387147748223e-05, "loss": 1.8024, "step": 7674 }, { "epoch": 0.2021332631024493, "grad_norm": 3.2154223918914795, "learning_rate": 3.9902554648406636e-05, "loss": 1.5255, "step": 7675 }, { "epoch": 0.20215959968396102, "grad_norm": 2.671917200088501, "learning_rate": 3.990123781933106e-05, "loss": 1.1043, "step": 7676 }, { "epoch": 0.20218593626547274, "grad_norm": 3.1091525554656982, "learning_rate": 3.989992099025547e-05, "loss": 1.2368, "step": 7677 }, { "epoch": 0.20221227284698445, 
"grad_norm": 1.976817011833191, "learning_rate": 3.989860416117988e-05, "loss": 1.6252, "step": 7678 }, { "epoch": 0.20223860942849617, "grad_norm": 1.9005625247955322, "learning_rate": 3.989728733210429e-05, "loss": 1.9913, "step": 7679 }, { "epoch": 0.2022649460100079, "grad_norm": 2.9008219242095947, "learning_rate": 3.989597050302871e-05, "loss": 2.0195, "step": 7680 }, { "epoch": 0.20229128259151963, "grad_norm": 2.651066541671753, "learning_rate": 3.989465367395312e-05, "loss": 1.1277, "step": 7681 }, { "epoch": 0.20231761917303134, "grad_norm": 2.88639760017395, "learning_rate": 3.989333684487754e-05, "loss": 1.2632, "step": 7682 }, { "epoch": 0.20234395575454306, "grad_norm": 1.7234835624694824, "learning_rate": 3.9892020015801954e-05, "loss": 1.3508, "step": 7683 }, { "epoch": 0.20237029233605477, "grad_norm": 3.25190806388855, "learning_rate": 3.989070318672636e-05, "loss": 1.5945, "step": 7684 }, { "epoch": 0.2023966289175665, "grad_norm": 3.527798652648926, "learning_rate": 3.988938635765078e-05, "loss": 0.7888, "step": 7685 }, { "epoch": 0.20242296549907823, "grad_norm": 2.441800117492676, "learning_rate": 3.9888069528575194e-05, "loss": 0.9202, "step": 7686 }, { "epoch": 0.20244930208058995, "grad_norm": 2.2201015949249268, "learning_rate": 3.988675269949961e-05, "loss": 1.5023, "step": 7687 }, { "epoch": 0.20247563866210166, "grad_norm": 2.8298521041870117, "learning_rate": 3.988543587042402e-05, "loss": 2.0057, "step": 7688 }, { "epoch": 0.20250197524361338, "grad_norm": 4.628076076507568, "learning_rate": 3.9884119041348434e-05, "loss": 1.1235, "step": 7689 }, { "epoch": 0.2025283118251251, "grad_norm": 1.9111008644104004, "learning_rate": 3.988280221227285e-05, "loss": 1.3718, "step": 7690 }, { "epoch": 0.2025546484066368, "grad_norm": 1.8705377578735352, "learning_rate": 3.9881485383197265e-05, "loss": 1.6203, "step": 7691 }, { "epoch": 0.20258098498814853, "grad_norm": 1.9395408630371094, "learning_rate": 3.988016855412168e-05, "loss": 1.9583, 
"step": 7692 }, { "epoch": 0.20260732156966027, "grad_norm": 1.6386895179748535, "learning_rate": 3.987885172504609e-05, "loss": 1.5762, "step": 7693 }, { "epoch": 0.20263365815117199, "grad_norm": 1.8084867000579834, "learning_rate": 3.9877534895970505e-05, "loss": 1.6403, "step": 7694 }, { "epoch": 0.2026599947326837, "grad_norm": 1.8858562707901, "learning_rate": 3.987621806689492e-05, "loss": 1.6637, "step": 7695 }, { "epoch": 0.20268633131419542, "grad_norm": 2.83836030960083, "learning_rate": 3.9874901237819336e-05, "loss": 2.3678, "step": 7696 }, { "epoch": 0.20271266789570713, "grad_norm": 3.136502504348755, "learning_rate": 3.9873584408743745e-05, "loss": 1.5297, "step": 7697 }, { "epoch": 0.20273900447721885, "grad_norm": 2.024453639984131, "learning_rate": 3.987226757966816e-05, "loss": 1.895, "step": 7698 }, { "epoch": 0.20276534105873056, "grad_norm": 2.590121030807495, "learning_rate": 3.9870950750592576e-05, "loss": 1.9034, "step": 7699 }, { "epoch": 0.2027916776402423, "grad_norm": 1.5360984802246094, "learning_rate": 3.986963392151699e-05, "loss": 1.6613, "step": 7700 }, { "epoch": 0.20281801422175402, "grad_norm": 1.9278558492660522, "learning_rate": 3.986831709244141e-05, "loss": 1.6669, "step": 7701 }, { "epoch": 0.20284435080326574, "grad_norm": 2.4258224964141846, "learning_rate": 3.9867000263365816e-05, "loss": 1.9488, "step": 7702 }, { "epoch": 0.20287068738477745, "grad_norm": 2.2309935092926025, "learning_rate": 3.986568343429023e-05, "loss": 0.4837, "step": 7703 }, { "epoch": 0.20289702396628917, "grad_norm": 4.881872177124023, "learning_rate": 3.986436660521464e-05, "loss": 1.7573, "step": 7704 }, { "epoch": 0.20292336054780089, "grad_norm": 3.214388132095337, "learning_rate": 3.986304977613906e-05, "loss": 2.0867, "step": 7705 }, { "epoch": 0.20294969712931263, "grad_norm": 1.8366135358810425, "learning_rate": 3.986173294706347e-05, "loss": 1.807, "step": 7706 }, { "epoch": 0.20297603371082434, "grad_norm": 1.7630726099014282, 
"learning_rate": 3.986041611798789e-05, "loss": 1.189, "step": 7707 }, { "epoch": 0.20300237029233606, "grad_norm": 2.373157024383545, "learning_rate": 3.9859099288912296e-05, "loss": 1.0657, "step": 7708 }, { "epoch": 0.20302870687384778, "grad_norm": 2.849299430847168, "learning_rate": 3.985778245983672e-05, "loss": 1.8557, "step": 7709 }, { "epoch": 0.2030550434553595, "grad_norm": 1.7904765605926514, "learning_rate": 3.985646563076113e-05, "loss": 1.7583, "step": 7710 }, { "epoch": 0.2030813800368712, "grad_norm": 3.3243374824523926, "learning_rate": 3.985514880168554e-05, "loss": 1.4315, "step": 7711 }, { "epoch": 0.20310771661838292, "grad_norm": 2.1492202281951904, "learning_rate": 3.985383197260996e-05, "loss": 1.9562, "step": 7712 }, { "epoch": 0.20313405319989467, "grad_norm": 1.5599113702774048, "learning_rate": 3.985251514353437e-05, "loss": 1.7551, "step": 7713 }, { "epoch": 0.20316038978140638, "grad_norm": 1.7659612894058228, "learning_rate": 3.985119831445879e-05, "loss": 2.1388, "step": 7714 }, { "epoch": 0.2031867263629181, "grad_norm": 1.6247694492340088, "learning_rate": 3.98498814853832e-05, "loss": 1.5643, "step": 7715 }, { "epoch": 0.2032130629444298, "grad_norm": 3.0842278003692627, "learning_rate": 3.9848564656307614e-05, "loss": 0.759, "step": 7716 }, { "epoch": 0.20323939952594153, "grad_norm": 2.41835355758667, "learning_rate": 3.984724782723202e-05, "loss": 1.8493, "step": 7717 }, { "epoch": 0.20326573610745324, "grad_norm": 3.290679454803467, "learning_rate": 3.984593099815644e-05, "loss": 1.9082, "step": 7718 }, { "epoch": 0.20329207268896496, "grad_norm": 4.662559986114502, "learning_rate": 3.9844614169080854e-05, "loss": 1.4023, "step": 7719 }, { "epoch": 0.2033184092704767, "grad_norm": 3.381883144378662, "learning_rate": 3.984329734000527e-05, "loss": 1.71, "step": 7720 }, { "epoch": 0.20334474585198842, "grad_norm": 1.6710078716278076, "learning_rate": 3.9841980510929685e-05, "loss": 1.7233, "step": 7721 }, { "epoch": 
0.20337108243350013, "grad_norm": 2.7248570919036865, "learning_rate": 3.9840663681854094e-05, "loss": 2.2979, "step": 7722 }, { "epoch": 0.20339741901501185, "grad_norm": 1.873510479927063, "learning_rate": 3.983934685277851e-05, "loss": 1.6392, "step": 7723 }, { "epoch": 0.20342375559652356, "grad_norm": 2.1086153984069824, "learning_rate": 3.9838030023702925e-05, "loss": 1.5673, "step": 7724 }, { "epoch": 0.20345009217803528, "grad_norm": 4.5790581703186035, "learning_rate": 3.983671319462734e-05, "loss": 1.4436, "step": 7725 }, { "epoch": 0.20347642875954702, "grad_norm": 1.9357166290283203, "learning_rate": 3.983539636555175e-05, "loss": 1.6495, "step": 7726 }, { "epoch": 0.20350276534105874, "grad_norm": 1.9797827005386353, "learning_rate": 3.9834079536476165e-05, "loss": 1.9115, "step": 7727 }, { "epoch": 0.20352910192257045, "grad_norm": 3.078951120376587, "learning_rate": 3.983276270740058e-05, "loss": 2.25, "step": 7728 }, { "epoch": 0.20355543850408217, "grad_norm": 2.1000711917877197, "learning_rate": 3.9831445878324996e-05, "loss": 1.2215, "step": 7729 }, { "epoch": 0.20358177508559389, "grad_norm": 2.479434013366699, "learning_rate": 3.983012904924941e-05, "loss": 1.5245, "step": 7730 }, { "epoch": 0.2036081116671056, "grad_norm": 3.928635835647583, "learning_rate": 3.982881222017382e-05, "loss": 1.3264, "step": 7731 }, { "epoch": 0.20363444824861732, "grad_norm": 2.1148130893707275, "learning_rate": 3.9827495391098237e-05, "loss": 1.7587, "step": 7732 }, { "epoch": 0.20366078483012906, "grad_norm": 2.1748244762420654, "learning_rate": 3.982617856202265e-05, "loss": 2.2599, "step": 7733 }, { "epoch": 0.20368712141164078, "grad_norm": 4.344339847564697, "learning_rate": 3.982486173294707e-05, "loss": 0.4548, "step": 7734 }, { "epoch": 0.2037134579931525, "grad_norm": 1.735168218612671, "learning_rate": 3.9823544903871477e-05, "loss": 2.2722, "step": 7735 }, { "epoch": 0.2037397945746642, "grad_norm": 1.856628179550171, "learning_rate": 
3.982222807479589e-05, "loss": 2.2704, "step": 7736 }, { "epoch": 0.20376613115617592, "grad_norm": 1.8732333183288574, "learning_rate": 3.982091124572031e-05, "loss": 2.3249, "step": 7737 }, { "epoch": 0.20379246773768764, "grad_norm": 3.1309595108032227, "learning_rate": 3.981959441664472e-05, "loss": 1.7805, "step": 7738 }, { "epoch": 0.20381880431919938, "grad_norm": 3.8597004413604736, "learning_rate": 3.981827758756914e-05, "loss": 1.185, "step": 7739 }, { "epoch": 0.2038451409007111, "grad_norm": 1.5584781169891357, "learning_rate": 3.981696075849355e-05, "loss": 1.5417, "step": 7740 }, { "epoch": 0.2038714774822228, "grad_norm": 2.627574920654297, "learning_rate": 3.981564392941796e-05, "loss": 0.9549, "step": 7741 }, { "epoch": 0.20389781406373453, "grad_norm": 1.9498339891433716, "learning_rate": 3.981432710034238e-05, "loss": 1.8448, "step": 7742 }, { "epoch": 0.20392415064524624, "grad_norm": 2.078964948654175, "learning_rate": 3.9813010271266795e-05, "loss": 0.96, "step": 7743 }, { "epoch": 0.20395048722675796, "grad_norm": 2.1001758575439453, "learning_rate": 3.98116934421912e-05, "loss": 1.2231, "step": 7744 }, { "epoch": 0.20397682380826967, "grad_norm": 1.8574081659317017, "learning_rate": 3.981037661311562e-05, "loss": 2.3228, "step": 7745 }, { "epoch": 0.20400316038978142, "grad_norm": 3.739337205886841, "learning_rate": 3.9809059784040035e-05, "loss": 1.4, "step": 7746 }, { "epoch": 0.20402949697129313, "grad_norm": 3.632378578186035, "learning_rate": 3.980774295496445e-05, "loss": 0.3844, "step": 7747 }, { "epoch": 0.20405583355280485, "grad_norm": 2.4780163764953613, "learning_rate": 3.9806426125888866e-05, "loss": 1.9745, "step": 7748 }, { "epoch": 0.20408217013431657, "grad_norm": 3.5137109756469727, "learning_rate": 3.9805109296813275e-05, "loss": 1.6529, "step": 7749 }, { "epoch": 0.20410850671582828, "grad_norm": 3.2006072998046875, "learning_rate": 3.980379246773769e-05, "loss": 1.1761, "step": 7750 }, { "epoch": 0.20413484329734, 
"grad_norm": 1.951158046722412, "learning_rate": 3.98024756386621e-05, "loss": 0.6224, "step": 7751 }, { "epoch": 0.2041611798788517, "grad_norm": 1.973855972290039, "learning_rate": 3.980115880958652e-05, "loss": 1.6062, "step": 7752 }, { "epoch": 0.20418751646036346, "grad_norm": 2.5205283164978027, "learning_rate": 3.979984198051093e-05, "loss": 2.3939, "step": 7753 }, { "epoch": 0.20421385304187517, "grad_norm": 1.919022560119629, "learning_rate": 3.9798525151435346e-05, "loss": 1.5804, "step": 7754 }, { "epoch": 0.2042401896233869, "grad_norm": 3.5701918601989746, "learning_rate": 3.979720832235976e-05, "loss": 1.8725, "step": 7755 }, { "epoch": 0.2042665262048986, "grad_norm": 2.765096664428711, "learning_rate": 3.979589149328417e-05, "loss": 1.3745, "step": 7756 }, { "epoch": 0.20429286278641032, "grad_norm": 2.1180624961853027, "learning_rate": 3.979457466420859e-05, "loss": 1.9313, "step": 7757 }, { "epoch": 0.20431919936792203, "grad_norm": 2.988429307937622, "learning_rate": 3.9793257835133e-05, "loss": 1.2353, "step": 7758 }, { "epoch": 0.20434553594943378, "grad_norm": 3.308535099029541, "learning_rate": 3.979194100605742e-05, "loss": 1.2714, "step": 7759 }, { "epoch": 0.2043718725309455, "grad_norm": 2.4223952293395996, "learning_rate": 3.9790624176981826e-05, "loss": 2.1543, "step": 7760 }, { "epoch": 0.2043982091124572, "grad_norm": 2.9478893280029297, "learning_rate": 3.978930734790625e-05, "loss": 1.9599, "step": 7761 }, { "epoch": 0.20442454569396892, "grad_norm": 2.621234893798828, "learning_rate": 3.978799051883066e-05, "loss": 2.0541, "step": 7762 }, { "epoch": 0.20445088227548064, "grad_norm": 1.7383151054382324, "learning_rate": 3.978667368975507e-05, "loss": 1.8054, "step": 7763 }, { "epoch": 0.20447721885699235, "grad_norm": 1.9030951261520386, "learning_rate": 3.978535686067948e-05, "loss": 1.2585, "step": 7764 }, { "epoch": 0.20450355543850407, "grad_norm": 2.5771563053131104, "learning_rate": 3.97840400316039e-05, "loss": 0.3584, 
"step": 7765 }, { "epoch": 0.2045298920200158, "grad_norm": 2.9761757850646973, "learning_rate": 3.978272320252831e-05, "loss": 1.765, "step": 7766 }, { "epoch": 0.20455622860152753, "grad_norm": 2.0030739307403564, "learning_rate": 3.978140637345273e-05, "loss": 1.6391, "step": 7767 }, { "epoch": 0.20458256518303924, "grad_norm": 2.9255735874176025, "learning_rate": 3.9780089544377144e-05, "loss": 1.7632, "step": 7768 }, { "epoch": 0.20460890176455096, "grad_norm": 2.131049394607544, "learning_rate": 3.977877271530155e-05, "loss": 0.7539, "step": 7769 }, { "epoch": 0.20463523834606268, "grad_norm": 3.041555643081665, "learning_rate": 3.977745588622597e-05, "loss": 2.6063, "step": 7770 }, { "epoch": 0.2046615749275744, "grad_norm": 2.188349723815918, "learning_rate": 3.9776139057150384e-05, "loss": 1.5763, "step": 7771 }, { "epoch": 0.2046879115090861, "grad_norm": 1.6239274740219116, "learning_rate": 3.97748222280748e-05, "loss": 1.3054, "step": 7772 }, { "epoch": 0.20471424809059785, "grad_norm": 1.9158140420913696, "learning_rate": 3.977350539899921e-05, "loss": 1.7708, "step": 7773 }, { "epoch": 0.20474058467210957, "grad_norm": 2.2672629356384277, "learning_rate": 3.9772188569923624e-05, "loss": 1.8807, "step": 7774 }, { "epoch": 0.20476692125362128, "grad_norm": 1.7786533832550049, "learning_rate": 3.977087174084804e-05, "loss": 1.8365, "step": 7775 }, { "epoch": 0.204793257835133, "grad_norm": 3.6295759677886963, "learning_rate": 3.9769554911772455e-05, "loss": 1.4215, "step": 7776 }, { "epoch": 0.2048195944166447, "grad_norm": 1.8746525049209595, "learning_rate": 3.976823808269687e-05, "loss": 1.9446, "step": 7777 }, { "epoch": 0.20484593099815643, "grad_norm": 3.5790648460388184, "learning_rate": 3.976692125362128e-05, "loss": 1.2644, "step": 7778 }, { "epoch": 0.20487226757966817, "grad_norm": 4.819107532501221, "learning_rate": 3.9765604424545695e-05, "loss": 2.1637, "step": 7779 }, { "epoch": 0.2048986041611799, "grad_norm": 4.649570465087891, 
"learning_rate": 3.976428759547011e-05, "loss": 1.3861, "step": 7780 }, { "epoch": 0.2049249407426916, "grad_norm": 2.5128061771392822, "learning_rate": 3.9762970766394526e-05, "loss": 2.654, "step": 7781 }, { "epoch": 0.20495127732420332, "grad_norm": 2.1249327659606934, "learning_rate": 3.9761653937318935e-05, "loss": 1.5575, "step": 7782 }, { "epoch": 0.20497761390571503, "grad_norm": 3.7322750091552734, "learning_rate": 3.976033710824335e-05, "loss": 1.4085, "step": 7783 }, { "epoch": 0.20500395048722675, "grad_norm": 3.530750274658203, "learning_rate": 3.9759020279167766e-05, "loss": 2.4919, "step": 7784 }, { "epoch": 0.20503028706873846, "grad_norm": 3.203949213027954, "learning_rate": 3.975770345009218e-05, "loss": 1.3842, "step": 7785 }, { "epoch": 0.2050566236502502, "grad_norm": 3.26556134223938, "learning_rate": 3.97563866210166e-05, "loss": 0.3793, "step": 7786 }, { "epoch": 0.20508296023176192, "grad_norm": 2.6151974201202393, "learning_rate": 3.9755069791941006e-05, "loss": 2.2486, "step": 7787 }, { "epoch": 0.20510929681327364, "grad_norm": 1.9042385816574097, "learning_rate": 3.975375296286542e-05, "loss": 1.3336, "step": 7788 }, { "epoch": 0.20513563339478535, "grad_norm": 1.940236210823059, "learning_rate": 3.975243613378983e-05, "loss": 1.7221, "step": 7789 }, { "epoch": 0.20516196997629707, "grad_norm": 2.0851359367370605, "learning_rate": 3.975111930471425e-05, "loss": 1.5509, "step": 7790 }, { "epoch": 0.2051883065578088, "grad_norm": 1.7379056215286255, "learning_rate": 3.974980247563866e-05, "loss": 1.3931, "step": 7791 }, { "epoch": 0.20521464313932053, "grad_norm": 4.5370097160339355, "learning_rate": 3.974848564656308e-05, "loss": 1.0969, "step": 7792 }, { "epoch": 0.20524097972083225, "grad_norm": 1.7076642513275146, "learning_rate": 3.974716881748749e-05, "loss": 1.7391, "step": 7793 }, { "epoch": 0.20526731630234396, "grad_norm": 1.7618438005447388, "learning_rate": 3.974585198841191e-05, "loss": 1.5138, "step": 7794 }, { "epoch": 
0.20529365288385568, "grad_norm": 2.485245704650879, "learning_rate": 3.9744535159336324e-05, "loss": 1.6382, "step": 7795 }, { "epoch": 0.2053199894653674, "grad_norm": 1.801835060119629, "learning_rate": 3.974321833026073e-05, "loss": 1.5347, "step": 7796 }, { "epoch": 0.2053463260468791, "grad_norm": 2.680452585220337, "learning_rate": 3.974190150118515e-05, "loss": 0.6489, "step": 7797 }, { "epoch": 0.20537266262839082, "grad_norm": 3.282606363296509, "learning_rate": 3.974058467210956e-05, "loss": 1.2582, "step": 7798 }, { "epoch": 0.20539899920990257, "grad_norm": 4.64738655090332, "learning_rate": 3.973926784303398e-05, "loss": 2.0028, "step": 7799 }, { "epoch": 0.20542533579141428, "grad_norm": 2.5218048095703125, "learning_rate": 3.973795101395839e-05, "loss": 2.6483, "step": 7800 }, { "epoch": 0.205451672372926, "grad_norm": 3.784498929977417, "learning_rate": 3.9736634184882804e-05, "loss": 1.6081, "step": 7801 }, { "epoch": 0.2054780089544377, "grad_norm": 2.962722063064575, "learning_rate": 3.973531735580722e-05, "loss": 1.9634, "step": 7802 }, { "epoch": 0.20550434553594943, "grad_norm": 2.387471914291382, "learning_rate": 3.973400052673163e-05, "loss": 2.4667, "step": 7803 }, { "epoch": 0.20553068211746114, "grad_norm": 1.9236146211624146, "learning_rate": 3.973268369765605e-05, "loss": 2.0, "step": 7804 }, { "epoch": 0.20555701869897286, "grad_norm": 3.2074873447418213, "learning_rate": 3.973136686858046e-05, "loss": 1.2928, "step": 7805 }, { "epoch": 0.2055833552804846, "grad_norm": 3.463780403137207, "learning_rate": 3.9730050039504876e-05, "loss": 1.3878, "step": 7806 }, { "epoch": 0.20560969186199632, "grad_norm": 3.3297958374023438, "learning_rate": 3.9728733210429284e-05, "loss": 0.9296, "step": 7807 }, { "epoch": 0.20563602844350803, "grad_norm": 4.504313945770264, "learning_rate": 3.972741638135371e-05, "loss": 2.1668, "step": 7808 }, { "epoch": 0.20566236502501975, "grad_norm": 2.5805702209472656, "learning_rate": 3.9726099552278116e-05, 
"loss": 1.8362, "step": 7809 }, { "epoch": 0.20568870160653147, "grad_norm": 1.6617090702056885, "learning_rate": 3.972478272320253e-05, "loss": 2.0187, "step": 7810 }, { "epoch": 0.20571503818804318, "grad_norm": 1.7884222269058228, "learning_rate": 3.972346589412694e-05, "loss": 1.6353, "step": 7811 }, { "epoch": 0.20574137476955492, "grad_norm": 3.2227025032043457, "learning_rate": 3.9722149065051356e-05, "loss": 1.6051, "step": 7812 }, { "epoch": 0.20576771135106664, "grad_norm": 2.211688995361328, "learning_rate": 3.972083223597577e-05, "loss": 1.137, "step": 7813 }, { "epoch": 0.20579404793257836, "grad_norm": 1.8780994415283203, "learning_rate": 3.971951540690019e-05, "loss": 2.3147, "step": 7814 }, { "epoch": 0.20582038451409007, "grad_norm": 2.0965094566345215, "learning_rate": 3.97181985778246e-05, "loss": 2.4826, "step": 7815 }, { "epoch": 0.2058467210956018, "grad_norm": 2.779721260070801, "learning_rate": 3.971688174874901e-05, "loss": 1.6571, "step": 7816 }, { "epoch": 0.2058730576771135, "grad_norm": 1.896346092224121, "learning_rate": 3.971556491967343e-05, "loss": 1.7357, "step": 7817 }, { "epoch": 0.20589939425862522, "grad_norm": 2.545363664627075, "learning_rate": 3.971424809059784e-05, "loss": 1.9361, "step": 7818 }, { "epoch": 0.20592573084013696, "grad_norm": 1.7324621677398682, "learning_rate": 3.971293126152226e-05, "loss": 1.6786, "step": 7819 }, { "epoch": 0.20595206742164868, "grad_norm": 2.820807933807373, "learning_rate": 3.971161443244667e-05, "loss": 1.6343, "step": 7820 }, { "epoch": 0.2059784040031604, "grad_norm": 4.268813133239746, "learning_rate": 3.971029760337108e-05, "loss": 0.8925, "step": 7821 }, { "epoch": 0.2060047405846721, "grad_norm": 1.4191045761108398, "learning_rate": 3.97089807742955e-05, "loss": 1.7947, "step": 7822 }, { "epoch": 0.20603107716618382, "grad_norm": 4.610223293304443, "learning_rate": 3.9707663945219914e-05, "loss": 1.7113, "step": 7823 }, { "epoch": 0.20605741374769554, "grad_norm": 
6.960455417633057, "learning_rate": 3.970634711614433e-05, "loss": 1.22, "step": 7824 }, { "epoch": 0.20608375032920728, "grad_norm": 1.6945736408233643, "learning_rate": 3.970503028706874e-05, "loss": 2.01, "step": 7825 }, { "epoch": 0.206110086910719, "grad_norm": 1.7801034450531006, "learning_rate": 3.9703713457993154e-05, "loss": 1.7731, "step": 7826 }, { "epoch": 0.2061364234922307, "grad_norm": 1.8213963508605957, "learning_rate": 3.970239662891757e-05, "loss": 2.5041, "step": 7827 }, { "epoch": 0.20616276007374243, "grad_norm": 1.783178687095642, "learning_rate": 3.9701079799841985e-05, "loss": 2.6439, "step": 7828 }, { "epoch": 0.20618909665525414, "grad_norm": 1.5821774005889893, "learning_rate": 3.9699762970766394e-05, "loss": 1.7259, "step": 7829 }, { "epoch": 0.20621543323676586, "grad_norm": 1.9780511856079102, "learning_rate": 3.969844614169081e-05, "loss": 1.6204, "step": 7830 }, { "epoch": 0.20624176981827758, "grad_norm": 2.2103583812713623, "learning_rate": 3.9697129312615225e-05, "loss": 1.5859, "step": 7831 }, { "epoch": 0.20626810639978932, "grad_norm": 3.5597517490386963, "learning_rate": 3.969581248353964e-05, "loss": 1.6507, "step": 7832 }, { "epoch": 0.20629444298130103, "grad_norm": 1.7971757650375366, "learning_rate": 3.9694495654464056e-05, "loss": 1.845, "step": 7833 }, { "epoch": 0.20632077956281275, "grad_norm": 2.1414246559143066, "learning_rate": 3.9693178825388465e-05, "loss": 2.5907, "step": 7834 }, { "epoch": 0.20634711614432447, "grad_norm": 1.8185389041900635, "learning_rate": 3.969186199631288e-05, "loss": 1.9206, "step": 7835 }, { "epoch": 0.20637345272583618, "grad_norm": 2.236732006072998, "learning_rate": 3.969054516723729e-05, "loss": 2.1211, "step": 7836 }, { "epoch": 0.2063997893073479, "grad_norm": 2.784217357635498, "learning_rate": 3.968922833816171e-05, "loss": 2.3413, "step": 7837 }, { "epoch": 0.2064261258888596, "grad_norm": 1.8819929361343384, "learning_rate": 3.968791150908612e-05, "loss": 2.4825, "step": 7838 
}, { "epoch": 0.20645246247037136, "grad_norm": 2.5416150093078613, "learning_rate": 3.9686594680010536e-05, "loss": 2.0612, "step": 7839 }, { "epoch": 0.20647879905188307, "grad_norm": 2.7699124813079834, "learning_rate": 3.968527785093495e-05, "loss": 1.5596, "step": 7840 }, { "epoch": 0.2065051356333948, "grad_norm": 2.565701484680176, "learning_rate": 3.968396102185937e-05, "loss": 1.4777, "step": 7841 }, { "epoch": 0.2065314722149065, "grad_norm": 2.101482629776001, "learning_rate": 3.968264419278378e-05, "loss": 1.5001, "step": 7842 }, { "epoch": 0.20655780879641822, "grad_norm": 3.0457894802093506, "learning_rate": 3.968132736370819e-05, "loss": 1.7086, "step": 7843 }, { "epoch": 0.20658414537792993, "grad_norm": 1.9946099519729614, "learning_rate": 3.968001053463261e-05, "loss": 2.1921, "step": 7844 }, { "epoch": 0.20661048195944168, "grad_norm": 5.115975379943848, "learning_rate": 3.9678693705557016e-05, "loss": 1.0979, "step": 7845 }, { "epoch": 0.2066368185409534, "grad_norm": 1.4720896482467651, "learning_rate": 3.967737687648144e-05, "loss": 1.9008, "step": 7846 }, { "epoch": 0.2066631551224651, "grad_norm": 2.379565715789795, "learning_rate": 3.967606004740585e-05, "loss": 1.3649, "step": 7847 }, { "epoch": 0.20668949170397682, "grad_norm": 3.408848762512207, "learning_rate": 3.967474321833026e-05, "loss": 1.704, "step": 7848 }, { "epoch": 0.20671582828548854, "grad_norm": 2.993016004562378, "learning_rate": 3.967342638925468e-05, "loss": 2.2501, "step": 7849 }, { "epoch": 0.20674216486700026, "grad_norm": 2.8244590759277344, "learning_rate": 3.967210956017909e-05, "loss": 2.5329, "step": 7850 }, { "epoch": 0.20676850144851197, "grad_norm": 2.7097952365875244, "learning_rate": 3.967079273110351e-05, "loss": 0.5743, "step": 7851 }, { "epoch": 0.20679483803002371, "grad_norm": 2.438704252243042, "learning_rate": 3.966947590202792e-05, "loss": 1.8576, "step": 7852 }, { "epoch": 0.20682117461153543, "grad_norm": 2.6458921432495117, "learning_rate": 
3.9668159072952334e-05, "loss": 1.5375, "step": 7853 }, { "epoch": 0.20684751119304715, "grad_norm": 3.281848430633545, "learning_rate": 3.966684224387674e-05, "loss": 1.2269, "step": 7854 }, { "epoch": 0.20687384777455886, "grad_norm": 4.1413798332214355, "learning_rate": 3.9665525414801165e-05, "loss": 0.9242, "step": 7855 }, { "epoch": 0.20690018435607058, "grad_norm": 3.585641622543335, "learning_rate": 3.9664208585725574e-05, "loss": 1.3356, "step": 7856 }, { "epoch": 0.2069265209375823, "grad_norm": 2.10442852973938, "learning_rate": 3.966289175664999e-05, "loss": 2.0747, "step": 7857 }, { "epoch": 0.206952857519094, "grad_norm": 2.297427177429199, "learning_rate": 3.9661574927574405e-05, "loss": 1.764, "step": 7858 }, { "epoch": 0.20697919410060575, "grad_norm": 1.7722662687301636, "learning_rate": 3.9660258098498814e-05, "loss": 2.133, "step": 7859 }, { "epoch": 0.20700553068211747, "grad_norm": 1.921962857246399, "learning_rate": 3.9658941269423237e-05, "loss": 2.1509, "step": 7860 }, { "epoch": 0.20703186726362918, "grad_norm": 1.724496603012085, "learning_rate": 3.9657624440347645e-05, "loss": 0.4232, "step": 7861 }, { "epoch": 0.2070582038451409, "grad_norm": 1.533440113067627, "learning_rate": 3.965630761127206e-05, "loss": 1.4872, "step": 7862 }, { "epoch": 0.2070845404266526, "grad_norm": 1.7583832740783691, "learning_rate": 3.965499078219647e-05, "loss": 1.4856, "step": 7863 }, { "epoch": 0.20711087700816433, "grad_norm": 2.3645436763763428, "learning_rate": 3.9653673953120885e-05, "loss": 2.5735, "step": 7864 }, { "epoch": 0.20713721358967607, "grad_norm": 2.202500581741333, "learning_rate": 3.96523571240453e-05, "loss": 1.5335, "step": 7865 }, { "epoch": 0.2071635501711878, "grad_norm": 3.426295757293701, "learning_rate": 3.9651040294969717e-05, "loss": 1.2126, "step": 7866 }, { "epoch": 0.2071898867526995, "grad_norm": 2.2901694774627686, "learning_rate": 3.9649723465894125e-05, "loss": 1.9396, "step": 7867 }, { "epoch": 0.20721622333421122, 
"grad_norm": 1.7368117570877075, "learning_rate": 3.964840663681854e-05, "loss": 2.2809, "step": 7868 }, { "epoch": 0.20724255991572293, "grad_norm": 2.8945674896240234, "learning_rate": 3.9647089807742957e-05, "loss": 1.7752, "step": 7869 }, { "epoch": 0.20726889649723465, "grad_norm": 3.279308557510376, "learning_rate": 3.964577297866737e-05, "loss": 1.0625, "step": 7870 }, { "epoch": 0.20729523307874637, "grad_norm": 6.087759971618652, "learning_rate": 3.964445614959179e-05, "loss": 2.4217, "step": 7871 }, { "epoch": 0.2073215696602581, "grad_norm": 1.9429460763931274, "learning_rate": 3.9643139320516197e-05, "loss": 1.2704, "step": 7872 }, { "epoch": 0.20734790624176982, "grad_norm": 1.9781205654144287, "learning_rate": 3.964182249144061e-05, "loss": 2.1478, "step": 7873 }, { "epoch": 0.20737424282328154, "grad_norm": 2.5948643684387207, "learning_rate": 3.964050566236503e-05, "loss": 0.6268, "step": 7874 }, { "epoch": 0.20740057940479326, "grad_norm": 2.821392059326172, "learning_rate": 3.963918883328944e-05, "loss": 1.141, "step": 7875 }, { "epoch": 0.20742691598630497, "grad_norm": 1.7201229333877563, "learning_rate": 3.963787200421385e-05, "loss": 0.2368, "step": 7876 }, { "epoch": 0.2074532525678167, "grad_norm": 2.5808374881744385, "learning_rate": 3.963655517513827e-05, "loss": 1.6653, "step": 7877 }, { "epoch": 0.20747958914932843, "grad_norm": 1.9911696910858154, "learning_rate": 3.963523834606268e-05, "loss": 1.6758, "step": 7878 }, { "epoch": 0.20750592573084015, "grad_norm": 2.5546717643737793, "learning_rate": 3.96339215169871e-05, "loss": 2.1234, "step": 7879 }, { "epoch": 0.20753226231235186, "grad_norm": 3.1540091037750244, "learning_rate": 3.9632604687911515e-05, "loss": 0.481, "step": 7880 }, { "epoch": 0.20755859889386358, "grad_norm": 3.5949947834014893, "learning_rate": 3.9631287858835923e-05, "loss": 1.4144, "step": 7881 }, { "epoch": 0.2075849354753753, "grad_norm": 1.531070351600647, "learning_rate": 3.962997102976034e-05, "loss": 
0.3953, "step": 7882 }, { "epoch": 0.207611272056887, "grad_norm": 1.7296377420425415, "learning_rate": 3.962865420068475e-05, "loss": 1.5199, "step": 7883 }, { "epoch": 0.20763760863839872, "grad_norm": 3.0197765827178955, "learning_rate": 3.962733737160917e-05, "loss": 0.3168, "step": 7884 }, { "epoch": 0.20766394521991047, "grad_norm": 1.5494749546051025, "learning_rate": 3.962602054253358e-05, "loss": 1.657, "step": 7885 }, { "epoch": 0.20769028180142218, "grad_norm": 8.258563041687012, "learning_rate": 3.9624703713457995e-05, "loss": 1.8136, "step": 7886 }, { "epoch": 0.2077166183829339, "grad_norm": 3.0122323036193848, "learning_rate": 3.962338688438241e-05, "loss": 0.8083, "step": 7887 }, { "epoch": 0.20774295496444561, "grad_norm": 4.37794303894043, "learning_rate": 3.9622070055306826e-05, "loss": 2.3482, "step": 7888 }, { "epoch": 0.20776929154595733, "grad_norm": 1.7530187368392944, "learning_rate": 3.962075322623124e-05, "loss": 1.936, "step": 7889 }, { "epoch": 0.20779562812746905, "grad_norm": 2.242640972137451, "learning_rate": 3.961943639715565e-05, "loss": 0.9809, "step": 7890 }, { "epoch": 0.20782196470898076, "grad_norm": 3.2633426189422607, "learning_rate": 3.9618119568080066e-05, "loss": 1.6336, "step": 7891 }, { "epoch": 0.2078483012904925, "grad_norm": 2.079007148742676, "learning_rate": 3.9616802739004475e-05, "loss": 1.8929, "step": 7892 }, { "epoch": 0.20787463787200422, "grad_norm": 2.1454122066497803, "learning_rate": 3.96154859099289e-05, "loss": 1.1304, "step": 7893 }, { "epoch": 0.20790097445351594, "grad_norm": 3.15075421333313, "learning_rate": 3.9614169080853306e-05, "loss": 1.4001, "step": 7894 }, { "epoch": 0.20792731103502765, "grad_norm": 3.404799222946167, "learning_rate": 3.961285225177772e-05, "loss": 0.5493, "step": 7895 }, { "epoch": 0.20795364761653937, "grad_norm": 3.996750831604004, "learning_rate": 3.961153542270214e-05, "loss": 0.5825, "step": 7896 }, { "epoch": 0.20797998419805108, "grad_norm": 2.167515277862549, 
"learning_rate": 3.9610218593626546e-05, "loss": 1.8508, "step": 7897 }, { "epoch": 0.20800632077956283, "grad_norm": 2.107673168182373, "learning_rate": 3.960890176455097e-05, "loss": 0.6475, "step": 7898 }, { "epoch": 0.20803265736107454, "grad_norm": 2.8799149990081787, "learning_rate": 3.960758493547538e-05, "loss": 1.3423, "step": 7899 }, { "epoch": 0.20805899394258626, "grad_norm": 3.097374439239502, "learning_rate": 3.960626810639979e-05, "loss": 1.8045, "step": 7900 }, { "epoch": 0.20808533052409797, "grad_norm": 3.866426944732666, "learning_rate": 3.96049512773242e-05, "loss": 2.3912, "step": 7901 }, { "epoch": 0.2081116671056097, "grad_norm": 1.7553191184997559, "learning_rate": 3.960363444824862e-05, "loss": 1.6756, "step": 7902 }, { "epoch": 0.2081380036871214, "grad_norm": 6.046514511108398, "learning_rate": 3.960231761917303e-05, "loss": 1.7576, "step": 7903 }, { "epoch": 0.20816434026863312, "grad_norm": 3.3796393871307373, "learning_rate": 3.960100079009745e-05, "loss": 2.0495, "step": 7904 }, { "epoch": 0.20819067685014486, "grad_norm": 2.272343158721924, "learning_rate": 3.9599683961021864e-05, "loss": 1.525, "step": 7905 }, { "epoch": 0.20821701343165658, "grad_norm": 1.5682498216629028, "learning_rate": 3.959836713194627e-05, "loss": 1.3021, "step": 7906 }, { "epoch": 0.2082433500131683, "grad_norm": 3.398550033569336, "learning_rate": 3.9597050302870695e-05, "loss": 1.5152, "step": 7907 }, { "epoch": 0.20826968659468, "grad_norm": 2.3274986743927, "learning_rate": 3.9595733473795104e-05, "loss": 1.2908, "step": 7908 }, { "epoch": 0.20829602317619172, "grad_norm": 3.44852876663208, "learning_rate": 3.959441664471952e-05, "loss": 1.4496, "step": 7909 }, { "epoch": 0.20832235975770344, "grad_norm": 2.179391622543335, "learning_rate": 3.959309981564393e-05, "loss": 1.7392, "step": 7910 }, { "epoch": 0.20834869633921518, "grad_norm": 1.858309268951416, "learning_rate": 3.9591782986568344e-05, "loss": 1.9066, "step": 7911 }, { "epoch": 
0.2083750329207269, "grad_norm": 2.2308707237243652, "learning_rate": 3.959046615749276e-05, "loss": 2.0192, "step": 7912 }, { "epoch": 0.20840136950223861, "grad_norm": 3.1797797679901123, "learning_rate": 3.9589149328417175e-05, "loss": 1.8226, "step": 7913 }, { "epoch": 0.20842770608375033, "grad_norm": 2.7949371337890625, "learning_rate": 3.9587832499341584e-05, "loss": 1.8544, "step": 7914 }, { "epoch": 0.20845404266526205, "grad_norm": 4.285569190979004, "learning_rate": 3.9586515670266e-05, "loss": 1.4802, "step": 7915 }, { "epoch": 0.20848037924677376, "grad_norm": 1.9868905544281006, "learning_rate": 3.9585198841190415e-05, "loss": 1.9951, "step": 7916 }, { "epoch": 0.20850671582828548, "grad_norm": 2.244162082672119, "learning_rate": 3.958388201211483e-05, "loss": 1.7914, "step": 7917 }, { "epoch": 0.20853305240979722, "grad_norm": 4.924079418182373, "learning_rate": 3.9582565183039246e-05, "loss": 1.8412, "step": 7918 }, { "epoch": 0.20855938899130894, "grad_norm": 2.948178291320801, "learning_rate": 3.9581248353963655e-05, "loss": 1.5166, "step": 7919 }, { "epoch": 0.20858572557282065, "grad_norm": 2.1109559535980225, "learning_rate": 3.957993152488807e-05, "loss": 1.6516, "step": 7920 }, { "epoch": 0.20861206215433237, "grad_norm": 3.0030221939086914, "learning_rate": 3.9578614695812486e-05, "loss": 1.0304, "step": 7921 }, { "epoch": 0.20863839873584408, "grad_norm": 2.160588026046753, "learning_rate": 3.95772978667369e-05, "loss": 0.8921, "step": 7922 }, { "epoch": 0.2086647353173558, "grad_norm": 2.5613348484039307, "learning_rate": 3.957598103766131e-05, "loss": 1.8314, "step": 7923 }, { "epoch": 0.20869107189886751, "grad_norm": 2.6622626781463623, "learning_rate": 3.9574664208585726e-05, "loss": 2.0809, "step": 7924 }, { "epoch": 0.20871740848037926, "grad_norm": 5.511320114135742, "learning_rate": 3.957334737951014e-05, "loss": 1.8326, "step": 7925 }, { "epoch": 0.20874374506189097, "grad_norm": 1.4537285566329956, "learning_rate": 
3.957203055043456e-05, "loss": 1.9966, "step": 7926 }, { "epoch": 0.2087700816434027, "grad_norm": 2.2338857650756836, "learning_rate": 3.957071372135897e-05, "loss": 1.8528, "step": 7927 }, { "epoch": 0.2087964182249144, "grad_norm": 2.480236768722534, "learning_rate": 3.956939689228338e-05, "loss": 0.5629, "step": 7928 }, { "epoch": 0.20882275480642612, "grad_norm": 1.8498308658599854, "learning_rate": 3.95680800632078e-05, "loss": 1.9333, "step": 7929 }, { "epoch": 0.20884909138793784, "grad_norm": 5.344048023223877, "learning_rate": 3.9566763234132206e-05, "loss": 1.9673, "step": 7930 }, { "epoch": 0.20887542796944958, "grad_norm": 3.506174325942993, "learning_rate": 3.956544640505663e-05, "loss": 1.2029, "step": 7931 }, { "epoch": 0.2089017645509613, "grad_norm": 2.1873860359191895, "learning_rate": 3.956412957598104e-05, "loss": 1.3081, "step": 7932 }, { "epoch": 0.208928101132473, "grad_norm": 5.196494102478027, "learning_rate": 3.956281274690545e-05, "loss": 2.702, "step": 7933 }, { "epoch": 0.20895443771398473, "grad_norm": 2.246225595474243, "learning_rate": 3.956149591782987e-05, "loss": 1.9787, "step": 7934 }, { "epoch": 0.20898077429549644, "grad_norm": 4.003600597381592, "learning_rate": 3.956017908875428e-05, "loss": 1.4045, "step": 7935 }, { "epoch": 0.20900711087700816, "grad_norm": 2.243312120437622, "learning_rate": 3.95588622596787e-05, "loss": 1.6769, "step": 7936 }, { "epoch": 0.20903344745851987, "grad_norm": 3.09871244430542, "learning_rate": 3.955754543060311e-05, "loss": 1.3345, "step": 7937 }, { "epoch": 0.20905978404003162, "grad_norm": 1.5548056364059448, "learning_rate": 3.9556228601527524e-05, "loss": 1.7791, "step": 7938 }, { "epoch": 0.20908612062154333, "grad_norm": 1.826427698135376, "learning_rate": 3.955491177245193e-05, "loss": 1.4339, "step": 7939 }, { "epoch": 0.20911245720305505, "grad_norm": 2.251462459564209, "learning_rate": 3.9553594943376356e-05, "loss": 2.0785, "step": 7940 }, { "epoch": 0.20913879378456676, 
"grad_norm": 2.6527514457702637, "learning_rate": 3.9552278114300764e-05, "loss": 1.6428, "step": 7941 }, { "epoch": 0.20916513036607848, "grad_norm": 1.7091227769851685, "learning_rate": 3.955096128522518e-05, "loss": 1.4601, "step": 7942 }, { "epoch": 0.2091914669475902, "grad_norm": 2.1505069732666016, "learning_rate": 3.9549644456149596e-05, "loss": 1.8869, "step": 7943 }, { "epoch": 0.2092178035291019, "grad_norm": 2.5253753662109375, "learning_rate": 3.9548327627074004e-05, "loss": 1.8705, "step": 7944 }, { "epoch": 0.20924414011061365, "grad_norm": 2.7928519248962402, "learning_rate": 3.954701079799843e-05, "loss": 1.5282, "step": 7945 }, { "epoch": 0.20927047669212537, "grad_norm": 1.557199239730835, "learning_rate": 3.9545693968922836e-05, "loss": 1.8086, "step": 7946 }, { "epoch": 0.20929681327363708, "grad_norm": 2.3240275382995605, "learning_rate": 3.954437713984725e-05, "loss": 1.7553, "step": 7947 }, { "epoch": 0.2093231498551488, "grad_norm": 1.8232619762420654, "learning_rate": 3.954306031077166e-05, "loss": 1.5949, "step": 7948 }, { "epoch": 0.20934948643666051, "grad_norm": 1.6279993057250977, "learning_rate": 3.9541743481696076e-05, "loss": 1.0965, "step": 7949 }, { "epoch": 0.20937582301817223, "grad_norm": 1.9425480365753174, "learning_rate": 3.954042665262049e-05, "loss": 1.7829, "step": 7950 }, { "epoch": 0.20940215959968397, "grad_norm": 1.9660980701446533, "learning_rate": 3.953910982354491e-05, "loss": 1.6255, "step": 7951 }, { "epoch": 0.2094284961811957, "grad_norm": 2.075363874435425, "learning_rate": 3.953779299446932e-05, "loss": 2.031, "step": 7952 }, { "epoch": 0.2094548327627074, "grad_norm": 2.5722994804382324, "learning_rate": 3.953647616539373e-05, "loss": 2.08, "step": 7953 }, { "epoch": 0.20948116934421912, "grad_norm": 1.8071092367172241, "learning_rate": 3.9535159336318154e-05, "loss": 2.0004, "step": 7954 }, { "epoch": 0.20950750592573084, "grad_norm": 6.527217388153076, "learning_rate": 3.953384250724256e-05, "loss": 
2.1184, "step": 7955 }, { "epoch": 0.20953384250724255, "grad_norm": 5.131868839263916, "learning_rate": 3.953252567816698e-05, "loss": 2.0516, "step": 7956 }, { "epoch": 0.20956017908875427, "grad_norm": 2.3576133251190186, "learning_rate": 3.953120884909139e-05, "loss": 1.3078, "step": 7957 }, { "epoch": 0.209586515670266, "grad_norm": 3.3976054191589355, "learning_rate": 3.95298920200158e-05, "loss": 1.2705, "step": 7958 }, { "epoch": 0.20961285225177773, "grad_norm": 3.6096582412719727, "learning_rate": 3.952857519094022e-05, "loss": 1.1667, "step": 7959 }, { "epoch": 0.20963918883328944, "grad_norm": 3.5529160499572754, "learning_rate": 3.9527258361864634e-05, "loss": 1.494, "step": 7960 }, { "epoch": 0.20966552541480116, "grad_norm": 4.715931415557861, "learning_rate": 3.952594153278905e-05, "loss": 1.4576, "step": 7961 }, { "epoch": 0.20969186199631287, "grad_norm": 1.8498784303665161, "learning_rate": 3.952462470371346e-05, "loss": 1.6296, "step": 7962 }, { "epoch": 0.2097181985778246, "grad_norm": 2.778883457183838, "learning_rate": 3.9523307874637874e-05, "loss": 1.9152, "step": 7963 }, { "epoch": 0.20974453515933633, "grad_norm": 2.1728055477142334, "learning_rate": 3.952199104556229e-05, "loss": 1.5361, "step": 7964 }, { "epoch": 0.20977087174084805, "grad_norm": 2.0484607219696045, "learning_rate": 3.9520674216486705e-05, "loss": 1.5986, "step": 7965 }, { "epoch": 0.20979720832235976, "grad_norm": 1.9635207653045654, "learning_rate": 3.9519357387411114e-05, "loss": 2.329, "step": 7966 }, { "epoch": 0.20982354490387148, "grad_norm": 2.995798110961914, "learning_rate": 3.951804055833553e-05, "loss": 1.4193, "step": 7967 }, { "epoch": 0.2098498814853832, "grad_norm": 2.4451513290405273, "learning_rate": 3.951672372925994e-05, "loss": 1.6387, "step": 7968 }, { "epoch": 0.2098762180668949, "grad_norm": 3.6962287425994873, "learning_rate": 3.951540690018436e-05, "loss": 0.9997, "step": 7969 }, { "epoch": 0.20990255464840663, "grad_norm": 1.6026966571807861, 
"learning_rate": 3.951409007110877e-05, "loss": 1.3702, "step": 7970 }, { "epoch": 0.20992889122991837, "grad_norm": 3.512328863143921, "learning_rate": 3.9512773242033185e-05, "loss": 0.7588, "step": 7971 }, { "epoch": 0.20995522781143008, "grad_norm": 3.359097480773926, "learning_rate": 3.95114564129576e-05, "loss": 1.204, "step": 7972 }, { "epoch": 0.2099815643929418, "grad_norm": 1.5736089944839478, "learning_rate": 3.9510139583882016e-05, "loss": 1.8898, "step": 7973 }, { "epoch": 0.21000790097445352, "grad_norm": 2.251445770263672, "learning_rate": 3.950882275480643e-05, "loss": 1.6537, "step": 7974 }, { "epoch": 0.21003423755596523, "grad_norm": 1.9817070960998535, "learning_rate": 3.950750592573084e-05, "loss": 2.0569, "step": 7975 }, { "epoch": 0.21006057413747695, "grad_norm": 1.7193737030029297, "learning_rate": 3.9506189096655256e-05, "loss": 2.2624, "step": 7976 }, { "epoch": 0.21008691071898866, "grad_norm": 4.927498817443848, "learning_rate": 3.9504872267579665e-05, "loss": 2.5041, "step": 7977 }, { "epoch": 0.2101132473005004, "grad_norm": 3.848907232284546, "learning_rate": 3.950355543850409e-05, "loss": 1.2478, "step": 7978 }, { "epoch": 0.21013958388201212, "grad_norm": 4.467748641967773, "learning_rate": 3.9502238609428496e-05, "loss": 0.8114, "step": 7979 }, { "epoch": 0.21016592046352384, "grad_norm": 1.7255222797393799, "learning_rate": 3.950092178035291e-05, "loss": 1.8872, "step": 7980 }, { "epoch": 0.21019225704503555, "grad_norm": 1.6148625612258911, "learning_rate": 3.949960495127733e-05, "loss": 2.1682, "step": 7981 }, { "epoch": 0.21021859362654727, "grad_norm": 2.7554187774658203, "learning_rate": 3.9498288122201736e-05, "loss": 0.5923, "step": 7982 }, { "epoch": 0.21024493020805898, "grad_norm": 3.0917000770568848, "learning_rate": 3.949697129312616e-05, "loss": 2.0324, "step": 7983 }, { "epoch": 0.21027126678957073, "grad_norm": 2.207585573196411, "learning_rate": 3.949565446405057e-05, "loss": 2.1554, "step": 7984 }, { "epoch": 
0.21029760337108244, "grad_norm": 2.8638224601745605, "learning_rate": 3.949433763497498e-05, "loss": 2.3346, "step": 7985 }, { "epoch": 0.21032393995259416, "grad_norm": 2.264763832092285, "learning_rate": 3.949302080589939e-05, "loss": 0.598, "step": 7986 }, { "epoch": 0.21035027653410587, "grad_norm": 2.041144609451294, "learning_rate": 3.9491703976823814e-05, "loss": 1.193, "step": 7987 }, { "epoch": 0.2103766131156176, "grad_norm": 1.8995825052261353, "learning_rate": 3.949038714774822e-05, "loss": 1.4939, "step": 7988 }, { "epoch": 0.2104029496971293, "grad_norm": 2.838090181350708, "learning_rate": 3.948907031867264e-05, "loss": 1.4816, "step": 7989 }, { "epoch": 0.21042928627864102, "grad_norm": 1.7127673625946045, "learning_rate": 3.9487753489597054e-05, "loss": 2.0585, "step": 7990 }, { "epoch": 0.21045562286015276, "grad_norm": 3.825827121734619, "learning_rate": 3.948643666052146e-05, "loss": 1.0772, "step": 7991 }, { "epoch": 0.21048195944166448, "grad_norm": 3.1475493907928467, "learning_rate": 3.9485119831445885e-05, "loss": 0.6566, "step": 7992 }, { "epoch": 0.2105082960231762, "grad_norm": 3.4131553173065186, "learning_rate": 3.9483803002370294e-05, "loss": 1.3837, "step": 7993 }, { "epoch": 0.2105346326046879, "grad_norm": 2.033433437347412, "learning_rate": 3.948248617329471e-05, "loss": 2.53, "step": 7994 }, { "epoch": 0.21056096918619963, "grad_norm": 2.1205668449401855, "learning_rate": 3.948116934421912e-05, "loss": 0.4346, "step": 7995 }, { "epoch": 0.21058730576771134, "grad_norm": 4.05999755859375, "learning_rate": 3.9479852515143534e-05, "loss": 1.8416, "step": 7996 }, { "epoch": 0.21061364234922308, "grad_norm": 2.101485252380371, "learning_rate": 3.947853568606795e-05, "loss": 1.964, "step": 7997 }, { "epoch": 0.2106399789307348, "grad_norm": 3.6859042644500732, "learning_rate": 3.9477218856992365e-05, "loss": 1.0435, "step": 7998 }, { "epoch": 0.21066631551224652, "grad_norm": 2.1933062076568604, "learning_rate": 3.947590202791678e-05, 
"loss": 1.4206, "step": 7999 }, { "epoch": 0.21069265209375823, "grad_norm": 2.146085500717163, "learning_rate": 3.947458519884119e-05, "loss": 1.8722, "step": 8000 }, { "epoch": 0.21071898867526995, "grad_norm": 2.975377082824707, "learning_rate": 3.9473268369765605e-05, "loss": 1.5781, "step": 8001 }, { "epoch": 0.21074532525678166, "grad_norm": 2.0121936798095703, "learning_rate": 3.947195154069002e-05, "loss": 1.9954, "step": 8002 }, { "epoch": 0.21077166183829338, "grad_norm": 2.756566286087036, "learning_rate": 3.9470634711614437e-05, "loss": 1.8285, "step": 8003 }, { "epoch": 0.21079799841980512, "grad_norm": 4.1593017578125, "learning_rate": 3.9469317882538845e-05, "loss": 0.464, "step": 8004 }, { "epoch": 0.21082433500131684, "grad_norm": 1.49372398853302, "learning_rate": 3.946800105346326e-05, "loss": 2.0069, "step": 8005 }, { "epoch": 0.21085067158282855, "grad_norm": 5.051287651062012, "learning_rate": 3.9466684224387677e-05, "loss": 0.9049, "step": 8006 }, { "epoch": 0.21087700816434027, "grad_norm": 6.765331268310547, "learning_rate": 3.946536739531209e-05, "loss": 1.9278, "step": 8007 }, { "epoch": 0.21090334474585198, "grad_norm": 1.9707120656967163, "learning_rate": 3.946405056623651e-05, "loss": 1.7449, "step": 8008 }, { "epoch": 0.2109296813273637, "grad_norm": 2.769061803817749, "learning_rate": 3.946273373716092e-05, "loss": 1.6184, "step": 8009 }, { "epoch": 0.21095601790887542, "grad_norm": 2.0725924968719482, "learning_rate": 3.946141690808533e-05, "loss": 1.8683, "step": 8010 }, { "epoch": 0.21098235449038716, "grad_norm": 1.9264018535614014, "learning_rate": 3.946010007900975e-05, "loss": 2.1224, "step": 8011 }, { "epoch": 0.21100869107189887, "grad_norm": 1.8104604482650757, "learning_rate": 3.9458783249934163e-05, "loss": 1.8013, "step": 8012 }, { "epoch": 0.2110350276534106, "grad_norm": 2.7635090351104736, "learning_rate": 3.945746642085857e-05, "loss": 1.6869, "step": 8013 }, { "epoch": 0.2110613642349223, "grad_norm": 
4.2592387199401855, "learning_rate": 3.945614959178299e-05, "loss": 2.6423, "step": 8014 }, { "epoch": 0.21108770081643402, "grad_norm": 2.7908198833465576, "learning_rate": 3.94548327627074e-05, "loss": 1.6301, "step": 8015 }, { "epoch": 0.21111403739794574, "grad_norm": 3.105816602706909, "learning_rate": 3.945351593363182e-05, "loss": 2.0555, "step": 8016 }, { "epoch": 0.21114037397945748, "grad_norm": 1.9376375675201416, "learning_rate": 3.945219910455623e-05, "loss": 2.1331, "step": 8017 }, { "epoch": 0.2111667105609692, "grad_norm": 3.2177765369415283, "learning_rate": 3.9450882275480643e-05, "loss": 2.3851, "step": 8018 }, { "epoch": 0.2111930471424809, "grad_norm": 3.5235183238983154, "learning_rate": 3.944956544640506e-05, "loss": 1.1531, "step": 8019 }, { "epoch": 0.21121938372399263, "grad_norm": 1.6920013427734375, "learning_rate": 3.9448248617329475e-05, "loss": 1.6535, "step": 8020 }, { "epoch": 0.21124572030550434, "grad_norm": 1.7701884508132935, "learning_rate": 3.944693178825389e-05, "loss": 2.3294, "step": 8021 }, { "epoch": 0.21127205688701606, "grad_norm": 3.3554046154022217, "learning_rate": 3.94456149591783e-05, "loss": 1.0569, "step": 8022 }, { "epoch": 0.21129839346852777, "grad_norm": 1.471840739250183, "learning_rate": 3.9444298130102715e-05, "loss": 2.2983, "step": 8023 }, { "epoch": 0.21132473005003952, "grad_norm": 1.7614887952804565, "learning_rate": 3.9442981301027123e-05, "loss": 2.1428, "step": 8024 }, { "epoch": 0.21135106663155123, "grad_norm": 1.8810094594955444, "learning_rate": 3.9441664471951546e-05, "loss": 2.6995, "step": 8025 }, { "epoch": 0.21137740321306295, "grad_norm": 1.9735652208328247, "learning_rate": 3.9440347642875955e-05, "loss": 1.5631, "step": 8026 }, { "epoch": 0.21140373979457466, "grad_norm": 2.613091230392456, "learning_rate": 3.943903081380037e-05, "loss": 1.3443, "step": 8027 }, { "epoch": 0.21143007637608638, "grad_norm": 1.8289783000946045, "learning_rate": 3.9437713984724786e-05, "loss": 1.7056, 
"step": 8028 }, { "epoch": 0.2114564129575981, "grad_norm": 2.3008124828338623, "learning_rate": 3.9436397155649195e-05, "loss": 1.8188, "step": 8029 }, { "epoch": 0.2114827495391098, "grad_norm": 1.3884129524230957, "learning_rate": 3.943508032657362e-05, "loss": 1.3644, "step": 8030 }, { "epoch": 0.21150908612062155, "grad_norm": 2.093122959136963, "learning_rate": 3.9433763497498026e-05, "loss": 1.4573, "step": 8031 }, { "epoch": 0.21153542270213327, "grad_norm": 1.619066596031189, "learning_rate": 3.943244666842244e-05, "loss": 1.9759, "step": 8032 }, { "epoch": 0.21156175928364498, "grad_norm": 3.6559643745422363, "learning_rate": 3.943112983934685e-05, "loss": 1.308, "step": 8033 }, { "epoch": 0.2115880958651567, "grad_norm": 2.3009626865386963, "learning_rate": 3.9429813010271266e-05, "loss": 1.958, "step": 8034 }, { "epoch": 0.21161443244666842, "grad_norm": 1.6011055707931519, "learning_rate": 3.942849618119568e-05, "loss": 1.6162, "step": 8035 }, { "epoch": 0.21164076902818013, "grad_norm": 2.6110005378723145, "learning_rate": 3.94271793521201e-05, "loss": 1.6828, "step": 8036 }, { "epoch": 0.21166710560969187, "grad_norm": 2.4418509006500244, "learning_rate": 3.942586252304451e-05, "loss": 2.273, "step": 8037 }, { "epoch": 0.2116934421912036, "grad_norm": 1.865779995918274, "learning_rate": 3.942454569396892e-05, "loss": 1.5067, "step": 8038 }, { "epoch": 0.2117197787727153, "grad_norm": 2.686727285385132, "learning_rate": 3.9423228864893344e-05, "loss": 1.6904, "step": 8039 }, { "epoch": 0.21174611535422702, "grad_norm": 1.7998002767562866, "learning_rate": 3.942191203581775e-05, "loss": 1.8184, "step": 8040 }, { "epoch": 0.21177245193573874, "grad_norm": 5.24328088760376, "learning_rate": 3.942059520674217e-05, "loss": 1.3782, "step": 8041 }, { "epoch": 0.21179878851725045, "grad_norm": 2.5756797790527344, "learning_rate": 3.941927837766658e-05, "loss": 2.2707, "step": 8042 }, { "epoch": 0.21182512509876217, "grad_norm": 1.9890908002853394, 
"learning_rate": 3.941796154859099e-05, "loss": 0.9836, "step": 8043 }, { "epoch": 0.2118514616802739, "grad_norm": 2.926546096801758, "learning_rate": 3.941664471951541e-05, "loss": 2.0829, "step": 8044 }, { "epoch": 0.21187779826178563, "grad_norm": 3.032869815826416, "learning_rate": 3.9415327890439824e-05, "loss": 0.4195, "step": 8045 }, { "epoch": 0.21190413484329734, "grad_norm": 2.888871908187866, "learning_rate": 3.941401106136424e-05, "loss": 1.5956, "step": 8046 }, { "epoch": 0.21193047142480906, "grad_norm": 3.109415292739868, "learning_rate": 3.941269423228865e-05, "loss": 1.4676, "step": 8047 }, { "epoch": 0.21195680800632077, "grad_norm": 1.9078865051269531, "learning_rate": 3.9411377403213064e-05, "loss": 1.9799, "step": 8048 }, { "epoch": 0.2119831445878325, "grad_norm": 2.3028550148010254, "learning_rate": 3.941006057413748e-05, "loss": 1.7985, "step": 8049 }, { "epoch": 0.21200948116934423, "grad_norm": 4.7780280113220215, "learning_rate": 3.9408743745061895e-05, "loss": 0.7895, "step": 8050 }, { "epoch": 0.21203581775085595, "grad_norm": 1.716330885887146, "learning_rate": 3.9407426915986304e-05, "loss": 1.8136, "step": 8051 }, { "epoch": 0.21206215433236766, "grad_norm": 2.3866307735443115, "learning_rate": 3.940611008691072e-05, "loss": 1.8914, "step": 8052 }, { "epoch": 0.21208849091387938, "grad_norm": 2.0796773433685303, "learning_rate": 3.9404793257835135e-05, "loss": 1.6821, "step": 8053 }, { "epoch": 0.2121148274953911, "grad_norm": 3.524477005004883, "learning_rate": 3.940347642875955e-05, "loss": 0.962, "step": 8054 }, { "epoch": 0.2121411640769028, "grad_norm": 2.139411449432373, "learning_rate": 3.9402159599683966e-05, "loss": 0.2723, "step": 8055 }, { "epoch": 0.21216750065841453, "grad_norm": 1.8068301677703857, "learning_rate": 3.9400842770608375e-05, "loss": 1.7596, "step": 8056 }, { "epoch": 0.21219383723992627, "grad_norm": 1.8532053232192993, "learning_rate": 3.939952594153279e-05, "loss": 1.6732, "step": 8057 }, { "epoch": 
0.21222017382143799, "grad_norm": 2.1167242527008057, "learning_rate": 3.9398209112457206e-05, "loss": 1.9052, "step": 8058 }, { "epoch": 0.2122465104029497, "grad_norm": 2.0105865001678467, "learning_rate": 3.939689228338162e-05, "loss": 2.0846, "step": 8059 }, { "epoch": 0.21227284698446142, "grad_norm": 2.3318231105804443, "learning_rate": 3.939557545430603e-05, "loss": 1.567, "step": 8060 }, { "epoch": 0.21229918356597313, "grad_norm": 4.269582271575928, "learning_rate": 3.9394258625230446e-05, "loss": 1.2244, "step": 8061 }, { "epoch": 0.21232552014748485, "grad_norm": 3.0847957134246826, "learning_rate": 3.9392941796154855e-05, "loss": 1.5797, "step": 8062 }, { "epoch": 0.21235185672899656, "grad_norm": 2.812221050262451, "learning_rate": 3.939162496707928e-05, "loss": 2.1353, "step": 8063 }, { "epoch": 0.2123781933105083, "grad_norm": 2.608167886734009, "learning_rate": 3.939030813800369e-05, "loss": 2.238, "step": 8064 }, { "epoch": 0.21240452989202002, "grad_norm": 2.6146254539489746, "learning_rate": 3.93889913089281e-05, "loss": 1.3269, "step": 8065 }, { "epoch": 0.21243086647353174, "grad_norm": 3.309941053390503, "learning_rate": 3.938767447985252e-05, "loss": 0.8602, "step": 8066 }, { "epoch": 0.21245720305504345, "grad_norm": 4.590884208679199, "learning_rate": 3.9386357650776926e-05, "loss": 1.4454, "step": 8067 }, { "epoch": 0.21248353963655517, "grad_norm": 1.7693549394607544, "learning_rate": 3.938504082170135e-05, "loss": 1.9132, "step": 8068 }, { "epoch": 0.21250987621806688, "grad_norm": 3.7004294395446777, "learning_rate": 3.938372399262576e-05, "loss": 1.0606, "step": 8069 }, { "epoch": 0.21253621279957863, "grad_norm": 2.025315761566162, "learning_rate": 3.938240716355017e-05, "loss": 0.9544, "step": 8070 }, { "epoch": 0.21256254938109034, "grad_norm": 1.7604368925094604, "learning_rate": 3.938109033447458e-05, "loss": 2.0598, "step": 8071 }, { "epoch": 0.21258888596260206, "grad_norm": 1.6236159801483154, "learning_rate": 
3.9379773505399004e-05, "loss": 1.9443, "step": 8072 }, { "epoch": 0.21261522254411377, "grad_norm": 1.57721745967865, "learning_rate": 3.937845667632341e-05, "loss": 1.1893, "step": 8073 }, { "epoch": 0.2126415591256255, "grad_norm": 1.517897129058838, "learning_rate": 3.937713984724783e-05, "loss": 1.8691, "step": 8074 }, { "epoch": 0.2126678957071372, "grad_norm": 1.708858847618103, "learning_rate": 3.9375823018172244e-05, "loss": 1.4678, "step": 8075 }, { "epoch": 0.21269423228864892, "grad_norm": 3.3207497596740723, "learning_rate": 3.937450618909665e-05, "loss": 1.1941, "step": 8076 }, { "epoch": 0.21272056887016066, "grad_norm": 2.11263370513916, "learning_rate": 3.9373189360021076e-05, "loss": 0.6194, "step": 8077 }, { "epoch": 0.21274690545167238, "grad_norm": 2.0888655185699463, "learning_rate": 3.9371872530945484e-05, "loss": 1.9023, "step": 8078 }, { "epoch": 0.2127732420331841, "grad_norm": 3.2067151069641113, "learning_rate": 3.93705557018699e-05, "loss": 1.5088, "step": 8079 }, { "epoch": 0.2127995786146958, "grad_norm": 3.494105577468872, "learning_rate": 3.936923887279431e-05, "loss": 1.0594, "step": 8080 }, { "epoch": 0.21282591519620753, "grad_norm": 1.7514755725860596, "learning_rate": 3.9367922043718724e-05, "loss": 1.3005, "step": 8081 }, { "epoch": 0.21285225177771924, "grad_norm": 2.5412983894348145, "learning_rate": 3.936660521464314e-05, "loss": 2.2427, "step": 8082 }, { "epoch": 0.21287858835923096, "grad_norm": 3.1526293754577637, "learning_rate": 3.9365288385567556e-05, "loss": 0.5093, "step": 8083 }, { "epoch": 0.2129049249407427, "grad_norm": 1.6861717700958252, "learning_rate": 3.936397155649197e-05, "loss": 2.0373, "step": 8084 }, { "epoch": 0.21293126152225442, "grad_norm": 2.658473014831543, "learning_rate": 3.936265472741638e-05, "loss": 0.8306, "step": 8085 }, { "epoch": 0.21295759810376613, "grad_norm": 4.400993824005127, "learning_rate": 3.93613378983408e-05, "loss": 0.7366, "step": 8086 }, { "epoch": 0.21298393468527785, 
"grad_norm": 3.7876157760620117, "learning_rate": 3.936002106926521e-05, "loss": 0.705, "step": 8087 }, { "epoch": 0.21301027126678956, "grad_norm": 2.0813534259796143, "learning_rate": 3.935870424018963e-05, "loss": 0.7664, "step": 8088 }, { "epoch": 0.21303660784830128, "grad_norm": 3.0450165271759033, "learning_rate": 3.9357387411114036e-05, "loss": 1.3153, "step": 8089 }, { "epoch": 0.21306294442981302, "grad_norm": 2.5086708068847656, "learning_rate": 3.935607058203845e-05, "loss": 1.8139, "step": 8090 }, { "epoch": 0.21308928101132474, "grad_norm": 1.7112269401550293, "learning_rate": 3.935475375296287e-05, "loss": 1.5665, "step": 8091 }, { "epoch": 0.21311561759283645, "grad_norm": 1.685372233390808, "learning_rate": 3.935343692388728e-05, "loss": 1.7403, "step": 8092 }, { "epoch": 0.21314195417434817, "grad_norm": 3.176758289337158, "learning_rate": 3.93521200948117e-05, "loss": 1.1468, "step": 8093 }, { "epoch": 0.21316829075585988, "grad_norm": 3.393217086791992, "learning_rate": 3.935080326573611e-05, "loss": 1.772, "step": 8094 }, { "epoch": 0.2131946273373716, "grad_norm": 1.8444277048110962, "learning_rate": 3.934948643666052e-05, "loss": 2.1946, "step": 8095 }, { "epoch": 0.21322096391888332, "grad_norm": 3.4809162616729736, "learning_rate": 3.934816960758494e-05, "loss": 2.087, "step": 8096 }, { "epoch": 0.21324730050039506, "grad_norm": 1.779625415802002, "learning_rate": 3.9346852778509354e-05, "loss": 2.1212, "step": 8097 }, { "epoch": 0.21327363708190678, "grad_norm": 4.503268241882324, "learning_rate": 3.934553594943376e-05, "loss": 2.1692, "step": 8098 }, { "epoch": 0.2132999736634185, "grad_norm": 1.8279377222061157, "learning_rate": 3.934421912035818e-05, "loss": 1.3458, "step": 8099 }, { "epoch": 0.2133263102449302, "grad_norm": 2.2709789276123047, "learning_rate": 3.9342902291282594e-05, "loss": 1.43, "step": 8100 }, { "epoch": 0.21335264682644192, "grad_norm": 4.984404563903809, "learning_rate": 3.934158546220701e-05, "loss": 0.9168, 
"step": 8101 }, { "epoch": 0.21337898340795364, "grad_norm": 3.854466438293457, "learning_rate": 3.9340268633131425e-05, "loss": 1.6919, "step": 8102 }, { "epoch": 0.21340531998946538, "grad_norm": 2.469802141189575, "learning_rate": 3.9338951804055834e-05, "loss": 1.4695, "step": 8103 }, { "epoch": 0.2134316565709771, "grad_norm": 2.463916778564453, "learning_rate": 3.933763497498025e-05, "loss": 1.8117, "step": 8104 }, { "epoch": 0.2134579931524888, "grad_norm": 2.3769354820251465, "learning_rate": 3.9336318145904665e-05, "loss": 2.4983, "step": 8105 }, { "epoch": 0.21348432973400053, "grad_norm": 1.6960322856903076, "learning_rate": 3.933500131682908e-05, "loss": 1.7101, "step": 8106 }, { "epoch": 0.21351066631551224, "grad_norm": 2.715395450592041, "learning_rate": 3.933368448775349e-05, "loss": 1.855, "step": 8107 }, { "epoch": 0.21353700289702396, "grad_norm": 1.7114248275756836, "learning_rate": 3.9332367658677905e-05, "loss": 1.4534, "step": 8108 }, { "epoch": 0.21356333947853567, "grad_norm": 1.9994984865188599, "learning_rate": 3.933105082960232e-05, "loss": 2.0607, "step": 8109 }, { "epoch": 0.21358967606004742, "grad_norm": 2.062744379043579, "learning_rate": 3.9329734000526736e-05, "loss": 1.4558, "step": 8110 }, { "epoch": 0.21361601264155913, "grad_norm": 3.768462657928467, "learning_rate": 3.932841717145115e-05, "loss": 0.8915, "step": 8111 }, { "epoch": 0.21364234922307085, "grad_norm": 2.3410520553588867, "learning_rate": 3.932710034237556e-05, "loss": 0.4157, "step": 8112 }, { "epoch": 0.21366868580458256, "grad_norm": 1.735868215560913, "learning_rate": 3.9325783513299976e-05, "loss": 1.4064, "step": 8113 }, { "epoch": 0.21369502238609428, "grad_norm": 1.9412875175476074, "learning_rate": 3.9324466684224385e-05, "loss": 1.5671, "step": 8114 }, { "epoch": 0.213721358967606, "grad_norm": 1.9730874300003052, "learning_rate": 3.932314985514881e-05, "loss": 1.279, "step": 8115 }, { "epoch": 0.2137476955491177, "grad_norm": 3.5073163509368896, 
"learning_rate": 3.9321833026073216e-05, "loss": 1.6784, "step": 8116 }, { "epoch": 0.21377403213062945, "grad_norm": 2.093677520751953, "learning_rate": 3.932051619699763e-05, "loss": 0.8411, "step": 8117 }, { "epoch": 0.21380036871214117, "grad_norm": 1.8346515893936157, "learning_rate": 3.931919936792204e-05, "loss": 2.2679, "step": 8118 }, { "epoch": 0.21382670529365289, "grad_norm": 2.296050786972046, "learning_rate": 3.931788253884646e-05, "loss": 1.7204, "step": 8119 }, { "epoch": 0.2138530418751646, "grad_norm": 1.9171382188796997, "learning_rate": 3.931656570977087e-05, "loss": 1.5225, "step": 8120 }, { "epoch": 0.21387937845667632, "grad_norm": 2.800374746322632, "learning_rate": 3.931524888069529e-05, "loss": 1.4449, "step": 8121 }, { "epoch": 0.21390571503818803, "grad_norm": 3.167890787124634, "learning_rate": 3.93139320516197e-05, "loss": 2.0883, "step": 8122 }, { "epoch": 0.21393205161969978, "grad_norm": 2.944833993911743, "learning_rate": 3.931261522254411e-05, "loss": 2.0241, "step": 8123 }, { "epoch": 0.2139583882012115, "grad_norm": 2.202903985977173, "learning_rate": 3.9311298393468534e-05, "loss": 2.2159, "step": 8124 }, { "epoch": 0.2139847247827232, "grad_norm": 1.5188966989517212, "learning_rate": 3.930998156439294e-05, "loss": 1.3158, "step": 8125 }, { "epoch": 0.21401106136423492, "grad_norm": 3.3034634590148926, "learning_rate": 3.930866473531736e-05, "loss": 1.3759, "step": 8126 }, { "epoch": 0.21403739794574664, "grad_norm": 3.7458913326263428, "learning_rate": 3.930734790624177e-05, "loss": 1.1541, "step": 8127 }, { "epoch": 0.21406373452725835, "grad_norm": 2.579270362854004, "learning_rate": 3.930603107716618e-05, "loss": 1.4533, "step": 8128 }, { "epoch": 0.21409007110877007, "grad_norm": 2.903691530227661, "learning_rate": 3.93047142480906e-05, "loss": 2.4269, "step": 8129 }, { "epoch": 0.2141164076902818, "grad_norm": 2.269531488418579, "learning_rate": 3.9303397419015014e-05, "loss": 1.958, "step": 8130 }, { "epoch": 
0.21414274427179353, "grad_norm": 2.460794448852539, "learning_rate": 3.930208058993943e-05, "loss": 1.6406, "step": 8131 }, { "epoch": 0.21416908085330524, "grad_norm": 2.968259811401367, "learning_rate": 3.930076376086384e-05, "loss": 0.8086, "step": 8132 }, { "epoch": 0.21419541743481696, "grad_norm": 2.827868700027466, "learning_rate": 3.929944693178826e-05, "loss": 1.6039, "step": 8133 }, { "epoch": 0.21422175401632867, "grad_norm": 2.632143497467041, "learning_rate": 3.929813010271267e-05, "loss": 1.8037, "step": 8134 }, { "epoch": 0.2142480905978404, "grad_norm": 3.924879789352417, "learning_rate": 3.9296813273637085e-05, "loss": 1.2124, "step": 8135 }, { "epoch": 0.21427442717935213, "grad_norm": 2.677705764770508, "learning_rate": 3.9295496444561494e-05, "loss": 1.4815, "step": 8136 }, { "epoch": 0.21430076376086385, "grad_norm": 2.3418219089508057, "learning_rate": 3.929417961548591e-05, "loss": 1.3942, "step": 8137 }, { "epoch": 0.21432710034237556, "grad_norm": 1.8835002183914185, "learning_rate": 3.9292862786410325e-05, "loss": 1.9455, "step": 8138 }, { "epoch": 0.21435343692388728, "grad_norm": 1.9340204000473022, "learning_rate": 3.929154595733474e-05, "loss": 1.9129, "step": 8139 }, { "epoch": 0.214379773505399, "grad_norm": 1.8426212072372437, "learning_rate": 3.929022912825916e-05, "loss": 1.9917, "step": 8140 }, { "epoch": 0.2144061100869107, "grad_norm": 1.7304800748825073, "learning_rate": 3.9288912299183565e-05, "loss": 1.6529, "step": 8141 }, { "epoch": 0.21443244666842243, "grad_norm": 3.842853307723999, "learning_rate": 3.928759547010798e-05, "loss": 1.7706, "step": 8142 }, { "epoch": 0.21445878324993417, "grad_norm": 2.678192138671875, "learning_rate": 3.92862786410324e-05, "loss": 1.1649, "step": 8143 }, { "epoch": 0.2144851198314459, "grad_norm": 2.3395910263061523, "learning_rate": 3.928496181195681e-05, "loss": 2.067, "step": 8144 }, { "epoch": 0.2145114564129576, "grad_norm": 2.5694284439086914, "learning_rate": 3.928364498288122e-05, 
"loss": 2.1264, "step": 8145 }, { "epoch": 0.21453779299446932, "grad_norm": 1.923811912536621, "learning_rate": 3.928232815380564e-05, "loss": 1.6467, "step": 8146 }, { "epoch": 0.21456412957598103, "grad_norm": 1.795712947845459, "learning_rate": 3.928101132473005e-05, "loss": 2.1434, "step": 8147 }, { "epoch": 0.21459046615749275, "grad_norm": 3.4010791778564453, "learning_rate": 3.927969449565447e-05, "loss": 1.1761, "step": 8148 }, { "epoch": 0.21461680273900446, "grad_norm": 2.623258590698242, "learning_rate": 3.9278377666578883e-05, "loss": 2.1505, "step": 8149 }, { "epoch": 0.2146431393205162, "grad_norm": 1.5902289152145386, "learning_rate": 3.927706083750329e-05, "loss": 1.9774, "step": 8150 }, { "epoch": 0.21466947590202792, "grad_norm": 1.7851754426956177, "learning_rate": 3.927574400842771e-05, "loss": 1.9715, "step": 8151 }, { "epoch": 0.21469581248353964, "grad_norm": 2.2035019397735596, "learning_rate": 3.9274427179352123e-05, "loss": 1.2878, "step": 8152 }, { "epoch": 0.21472214906505135, "grad_norm": 2.0112054347991943, "learning_rate": 3.927311035027654e-05, "loss": 1.5258, "step": 8153 }, { "epoch": 0.21474848564656307, "grad_norm": 5.362493515014648, "learning_rate": 3.927179352120095e-05, "loss": 1.764, "step": 8154 }, { "epoch": 0.21477482222807479, "grad_norm": 3.442628860473633, "learning_rate": 3.9270476692125363e-05, "loss": 2.2062, "step": 8155 }, { "epoch": 0.21480115880958653, "grad_norm": 2.052248477935791, "learning_rate": 3.926915986304978e-05, "loss": 2.1628, "step": 8156 }, { "epoch": 0.21482749539109824, "grad_norm": 1.6592828035354614, "learning_rate": 3.9267843033974195e-05, "loss": 1.7517, "step": 8157 }, { "epoch": 0.21485383197260996, "grad_norm": 1.9546493291854858, "learning_rate": 3.926652620489861e-05, "loss": 1.1069, "step": 8158 }, { "epoch": 0.21488016855412168, "grad_norm": 1.9161245822906494, "learning_rate": 3.926520937582302e-05, "loss": 1.6825, "step": 8159 }, { "epoch": 0.2149065051356334, "grad_norm": 
1.6454392671585083, "learning_rate": 3.9263892546747435e-05, "loss": 1.9439, "step": 8160 }, { "epoch": 0.2149328417171451, "grad_norm": 3.8970978260040283, "learning_rate": 3.9262575717671844e-05, "loss": 1.0736, "step": 8161 }, { "epoch": 0.21495917829865682, "grad_norm": 1.952965259552002, "learning_rate": 3.9261258888596266e-05, "loss": 1.2462, "step": 8162 }, { "epoch": 0.21498551488016857, "grad_norm": 2.411151885986328, "learning_rate": 3.9259942059520675e-05, "loss": 1.919, "step": 8163 }, { "epoch": 0.21501185146168028, "grad_norm": 4.017030715942383, "learning_rate": 3.925862523044509e-05, "loss": 0.3093, "step": 8164 }, { "epoch": 0.215038188043192, "grad_norm": 1.7976256608963013, "learning_rate": 3.92573084013695e-05, "loss": 1.2207, "step": 8165 }, { "epoch": 0.2150645246247037, "grad_norm": 2.222109317779541, "learning_rate": 3.925599157229392e-05, "loss": 1.636, "step": 8166 }, { "epoch": 0.21509086120621543, "grad_norm": 2.583864688873291, "learning_rate": 3.925467474321833e-05, "loss": 0.7542, "step": 8167 }, { "epoch": 0.21511719778772714, "grad_norm": 1.7864383459091187, "learning_rate": 3.9253357914142746e-05, "loss": 1.9142, "step": 8168 }, { "epoch": 0.21514353436923886, "grad_norm": 1.547110915184021, "learning_rate": 3.925204108506716e-05, "loss": 2.2719, "step": 8169 }, { "epoch": 0.2151698709507506, "grad_norm": 4.676783561706543, "learning_rate": 3.925072425599157e-05, "loss": 1.7861, "step": 8170 }, { "epoch": 0.21519620753226232, "grad_norm": 3.0446035861968994, "learning_rate": 3.924940742691599e-05, "loss": 2.1101, "step": 8171 }, { "epoch": 0.21522254411377403, "grad_norm": 2.4542737007141113, "learning_rate": 3.92480905978404e-05, "loss": 1.2738, "step": 8172 }, { "epoch": 0.21524888069528575, "grad_norm": 2.255436658859253, "learning_rate": 3.924677376876482e-05, "loss": 0.5037, "step": 8173 }, { "epoch": 0.21527521727679746, "grad_norm": 4.145355701446533, "learning_rate": 3.9245456939689226e-05, "loss": 1.8512, "step": 8174 }, { 
"epoch": 0.21530155385830918, "grad_norm": 3.1572439670562744, "learning_rate": 3.924414011061364e-05, "loss": 2.0275, "step": 8175 }, { "epoch": 0.21532789043982092, "grad_norm": 4.270723819732666, "learning_rate": 3.924282328153806e-05, "loss": 1.8923, "step": 8176 }, { "epoch": 0.21535422702133264, "grad_norm": 3.3614985942840576, "learning_rate": 3.924150645246247e-05, "loss": 1.5709, "step": 8177 }, { "epoch": 0.21538056360284435, "grad_norm": 1.5842660665512085, "learning_rate": 3.924018962338689e-05, "loss": 1.5191, "step": 8178 }, { "epoch": 0.21540690018435607, "grad_norm": 2.3136115074157715, "learning_rate": 3.92388727943113e-05, "loss": 1.4973, "step": 8179 }, { "epoch": 0.21543323676586779, "grad_norm": 3.8716657161712646, "learning_rate": 3.923755596523571e-05, "loss": 1.9589, "step": 8180 }, { "epoch": 0.2154595733473795, "grad_norm": 2.2407655715942383, "learning_rate": 3.923623913616013e-05, "loss": 1.5226, "step": 8181 }, { "epoch": 0.21548590992889122, "grad_norm": 2.9864284992218018, "learning_rate": 3.9234922307084544e-05, "loss": 0.9025, "step": 8182 }, { "epoch": 0.21551224651040296, "grad_norm": 6.218202114105225, "learning_rate": 3.923360547800895e-05, "loss": 2.3504, "step": 8183 }, { "epoch": 0.21553858309191468, "grad_norm": 3.4184114933013916, "learning_rate": 3.923228864893337e-05, "loss": 1.9084, "step": 8184 }, { "epoch": 0.2155649196734264, "grad_norm": 3.4305033683776855, "learning_rate": 3.9230971819857784e-05, "loss": 1.4184, "step": 8185 }, { "epoch": 0.2155912562549381, "grad_norm": 3.723510503768921, "learning_rate": 3.92296549907822e-05, "loss": 2.2996, "step": 8186 }, { "epoch": 0.21561759283644982, "grad_norm": 2.1249070167541504, "learning_rate": 3.9228338161706615e-05, "loss": 1.625, "step": 8187 }, { "epoch": 0.21564392941796154, "grad_norm": 2.4870362281799316, "learning_rate": 3.9227021332631024e-05, "loss": 0.6118, "step": 8188 }, { "epoch": 0.21567026599947328, "grad_norm": 2.8394012451171875, "learning_rate": 
3.922570450355544e-05, "loss": 1.3732, "step": 8189 }, { "epoch": 0.215696602580985, "grad_norm": 2.797839403152466, "learning_rate": 3.9224387674479855e-05, "loss": 0.9737, "step": 8190 }, { "epoch": 0.2157229391624967, "grad_norm": 1.4362404346466064, "learning_rate": 3.922307084540427e-05, "loss": 0.3778, "step": 8191 }, { "epoch": 0.21574927574400843, "grad_norm": 2.275557041168213, "learning_rate": 3.922175401632868e-05, "loss": 2.0854, "step": 8192 }, { "epoch": 0.21577561232552014, "grad_norm": 1.8016016483306885, "learning_rate": 3.9220437187253095e-05, "loss": 1.6024, "step": 8193 }, { "epoch": 0.21580194890703186, "grad_norm": 4.864360332489014, "learning_rate": 3.921912035817751e-05, "loss": 1.9912, "step": 8194 }, { "epoch": 0.21582828548854358, "grad_norm": 3.0966172218322754, "learning_rate": 3.9217803529101926e-05, "loss": 1.7076, "step": 8195 }, { "epoch": 0.21585462207005532, "grad_norm": 2.613614559173584, "learning_rate": 3.921648670002634e-05, "loss": 1.8714, "step": 8196 }, { "epoch": 0.21588095865156703, "grad_norm": 1.6378625631332397, "learning_rate": 3.921516987095075e-05, "loss": 1.7882, "step": 8197 }, { "epoch": 0.21590729523307875, "grad_norm": 1.9100937843322754, "learning_rate": 3.9213853041875166e-05, "loss": 1.4412, "step": 8198 }, { "epoch": 0.21593363181459047, "grad_norm": 3.474773406982422, "learning_rate": 3.921253621279958e-05, "loss": 0.6582, "step": 8199 }, { "epoch": 0.21595996839610218, "grad_norm": 4.38875150680542, "learning_rate": 3.9211219383724e-05, "loss": 1.4912, "step": 8200 }, { "epoch": 0.2159863049776139, "grad_norm": 1.802634596824646, "learning_rate": 3.9209902554648406e-05, "loss": 1.9476, "step": 8201 }, { "epoch": 0.2160126415591256, "grad_norm": 4.4473676681518555, "learning_rate": 3.920858572557282e-05, "loss": 0.8807, "step": 8202 }, { "epoch": 0.21603897814063736, "grad_norm": 2.7201600074768066, "learning_rate": 3.920726889649724e-05, "loss": 2.1349, "step": 8203 }, { "epoch": 0.21606531472214907, 
"grad_norm": 1.725846529006958, "learning_rate": 3.920595206742165e-05, "loss": 2.3868, "step": 8204 }, { "epoch": 0.2160916513036608, "grad_norm": 1.6200461387634277, "learning_rate": 3.920463523834607e-05, "loss": 1.7024, "step": 8205 }, { "epoch": 0.2161179878851725, "grad_norm": 3.9600892066955566, "learning_rate": 3.920331840927048e-05, "loss": 1.2288, "step": 8206 }, { "epoch": 0.21614432446668422, "grad_norm": 3.557732105255127, "learning_rate": 3.920200158019489e-05, "loss": 0.3516, "step": 8207 }, { "epoch": 0.21617066104819593, "grad_norm": 1.9008049964904785, "learning_rate": 3.92006847511193e-05, "loss": 1.6975, "step": 8208 }, { "epoch": 0.21619699762970768, "grad_norm": 2.6943721771240234, "learning_rate": 3.9199367922043724e-05, "loss": 0.8698, "step": 8209 }, { "epoch": 0.2162233342112194, "grad_norm": 4.952938556671143, "learning_rate": 3.919805109296813e-05, "loss": 1.1114, "step": 8210 }, { "epoch": 0.2162496707927311, "grad_norm": 1.764625072479248, "learning_rate": 3.919673426389255e-05, "loss": 1.96, "step": 8211 }, { "epoch": 0.21627600737424282, "grad_norm": 2.2616498470306396, "learning_rate": 3.9195417434816964e-05, "loss": 1.4512, "step": 8212 }, { "epoch": 0.21630234395575454, "grad_norm": 3.3392961025238037, "learning_rate": 3.919410060574137e-05, "loss": 1.8557, "step": 8213 }, { "epoch": 0.21632868053726625, "grad_norm": 2.444782018661499, "learning_rate": 3.9192783776665796e-05, "loss": 2.4076, "step": 8214 }, { "epoch": 0.21635501711877797, "grad_norm": 1.9985350370407104, "learning_rate": 3.9191466947590204e-05, "loss": 1.8849, "step": 8215 }, { "epoch": 0.2163813537002897, "grad_norm": 4.963436126708984, "learning_rate": 3.919015011851462e-05, "loss": 1.7917, "step": 8216 }, { "epoch": 0.21640769028180143, "grad_norm": 4.0998101234436035, "learning_rate": 3.918883328943903e-05, "loss": 1.798, "step": 8217 }, { "epoch": 0.21643402686331314, "grad_norm": 1.7398664951324463, "learning_rate": 3.918751646036345e-05, "loss": 1.3166, 
"step": 8218 }, { "epoch": 0.21646036344482486, "grad_norm": 3.3675451278686523, "learning_rate": 3.918619963128786e-05, "loss": 1.3902, "step": 8219 }, { "epoch": 0.21648670002633658, "grad_norm": 2.7753496170043945, "learning_rate": 3.9184882802212276e-05, "loss": 1.4967, "step": 8220 }, { "epoch": 0.2165130366078483, "grad_norm": 1.997812032699585, "learning_rate": 3.9183565973136685e-05, "loss": 1.9883, "step": 8221 }, { "epoch": 0.21653937318936003, "grad_norm": 3.634472131729126, "learning_rate": 3.91822491440611e-05, "loss": 1.9869, "step": 8222 }, { "epoch": 0.21656570977087175, "grad_norm": 1.9734104871749878, "learning_rate": 3.9180932314985516e-05, "loss": 1.7723, "step": 8223 }, { "epoch": 0.21659204635238347, "grad_norm": 1.8444863557815552, "learning_rate": 3.917961548590993e-05, "loss": 1.7217, "step": 8224 }, { "epoch": 0.21661838293389518, "grad_norm": 5.061600685119629, "learning_rate": 3.917829865683435e-05, "loss": 1.4748, "step": 8225 }, { "epoch": 0.2166447195154069, "grad_norm": 3.497251272201538, "learning_rate": 3.9176981827758756e-05, "loss": 0.6821, "step": 8226 }, { "epoch": 0.2166710560969186, "grad_norm": 3.130624294281006, "learning_rate": 3.917566499868317e-05, "loss": 0.8868, "step": 8227 }, { "epoch": 0.21669739267843033, "grad_norm": 1.7078262567520142, "learning_rate": 3.917434816960759e-05, "loss": 1.4787, "step": 8228 }, { "epoch": 0.21672372925994207, "grad_norm": 2.04734468460083, "learning_rate": 3.9173031340532e-05, "loss": 0.2967, "step": 8229 }, { "epoch": 0.2167500658414538, "grad_norm": 3.667281150817871, "learning_rate": 3.917171451145641e-05, "loss": 1.3739, "step": 8230 }, { "epoch": 0.2167764024229655, "grad_norm": 2.5657331943511963, "learning_rate": 3.917039768238083e-05, "loss": 1.4296, "step": 8231 }, { "epoch": 0.21680273900447722, "grad_norm": 4.715991497039795, "learning_rate": 3.916908085330524e-05, "loss": 1.5589, "step": 8232 }, { "epoch": 0.21682907558598893, "grad_norm": 2.082846164703369, 
"learning_rate": 3.916776402422966e-05, "loss": 1.7433, "step": 8233 }, { "epoch": 0.21685541216750065, "grad_norm": 4.797408103942871, "learning_rate": 3.9166447195154074e-05, "loss": 1.0144, "step": 8234 }, { "epoch": 0.21688174874901237, "grad_norm": 2.381758451461792, "learning_rate": 3.916513036607848e-05, "loss": 1.5672, "step": 8235 }, { "epoch": 0.2169080853305241, "grad_norm": 1.805557131767273, "learning_rate": 3.91638135370029e-05, "loss": 1.6703, "step": 8236 }, { "epoch": 0.21693442191203582, "grad_norm": 1.748612403869629, "learning_rate": 3.9162496707927314e-05, "loss": 1.5996, "step": 8237 }, { "epoch": 0.21696075849354754, "grad_norm": 5.534695625305176, "learning_rate": 3.916117987885173e-05, "loss": 1.5567, "step": 8238 }, { "epoch": 0.21698709507505926, "grad_norm": 6.371755123138428, "learning_rate": 3.915986304977614e-05, "loss": 1.2917, "step": 8239 }, { "epoch": 0.21701343165657097, "grad_norm": 2.238318681716919, "learning_rate": 3.9158546220700554e-05, "loss": 0.422, "step": 8240 }, { "epoch": 0.2170397682380827, "grad_norm": 2.0366439819335938, "learning_rate": 3.915722939162497e-05, "loss": 1.8634, "step": 8241 }, { "epoch": 0.21706610481959443, "grad_norm": 2.263731002807617, "learning_rate": 3.9155912562549385e-05, "loss": 1.8592, "step": 8242 }, { "epoch": 0.21709244140110615, "grad_norm": 2.038421869277954, "learning_rate": 3.91545957334738e-05, "loss": 2.0266, "step": 8243 }, { "epoch": 0.21711877798261786, "grad_norm": 1.932921290397644, "learning_rate": 3.915327890439821e-05, "loss": 1.6988, "step": 8244 }, { "epoch": 0.21714511456412958, "grad_norm": 1.6675976514816284, "learning_rate": 3.9151962075322625e-05, "loss": 2.0794, "step": 8245 }, { "epoch": 0.2171714511456413, "grad_norm": 1.9884326457977295, "learning_rate": 3.9150645246247034e-05, "loss": 1.9962, "step": 8246 }, { "epoch": 0.217197787727153, "grad_norm": 2.5663902759552, "learning_rate": 3.9149328417171456e-05, "loss": 1.8525, "step": 8247 }, { "epoch": 
0.21722412430866472, "grad_norm": 2.4816417694091797, "learning_rate": 3.9148011588095865e-05, "loss": 1.9044, "step": 8248 }, { "epoch": 0.21725046089017647, "grad_norm": 2.0489895343780518, "learning_rate": 3.914669475902028e-05, "loss": 1.9175, "step": 8249 }, { "epoch": 0.21727679747168818, "grad_norm": 3.5776524543762207, "learning_rate": 3.9145377929944696e-05, "loss": 1.3965, "step": 8250 }, { "epoch": 0.2173031340531999, "grad_norm": 2.6768641471862793, "learning_rate": 3.914406110086911e-05, "loss": 1.0229, "step": 8251 }, { "epoch": 0.2173294706347116, "grad_norm": 7.188946723937988, "learning_rate": 3.914274427179353e-05, "loss": 2.6083, "step": 8252 }, { "epoch": 0.21735580721622333, "grad_norm": 1.8915756940841675, "learning_rate": 3.9141427442717936e-05, "loss": 2.0729, "step": 8253 }, { "epoch": 0.21738214379773504, "grad_norm": 3.2362358570098877, "learning_rate": 3.914011061364235e-05, "loss": 1.3204, "step": 8254 }, { "epoch": 0.21740848037924676, "grad_norm": 2.287172555923462, "learning_rate": 3.913879378456676e-05, "loss": 2.1033, "step": 8255 }, { "epoch": 0.2174348169607585, "grad_norm": 1.8927334547042847, "learning_rate": 3.913747695549118e-05, "loss": 2.0713, "step": 8256 }, { "epoch": 0.21746115354227022, "grad_norm": 2.418806314468384, "learning_rate": 3.913616012641559e-05, "loss": 1.1095, "step": 8257 }, { "epoch": 0.21748749012378193, "grad_norm": 3.6046488285064697, "learning_rate": 3.913484329734001e-05, "loss": 0.8562, "step": 8258 }, { "epoch": 0.21751382670529365, "grad_norm": 1.9790934324264526, "learning_rate": 3.913352646826442e-05, "loss": 2.047, "step": 8259 }, { "epoch": 0.21754016328680537, "grad_norm": 2.254774332046509, "learning_rate": 3.913220963918883e-05, "loss": 1.5768, "step": 8260 }, { "epoch": 0.21756649986831708, "grad_norm": 3.8717594146728516, "learning_rate": 3.9130892810113254e-05, "loss": 1.3509, "step": 8261 }, { "epoch": 0.21759283644982882, "grad_norm": 1.9925005435943604, "learning_rate": 
3.912957598103766e-05, "loss": 0.4052, "step": 8262 }, { "epoch": 0.21761917303134054, "grad_norm": 1.6390680074691772, "learning_rate": 3.912825915196208e-05, "loss": 1.64, "step": 8263 }, { "epoch": 0.21764550961285226, "grad_norm": 2.836636781692505, "learning_rate": 3.912694232288649e-05, "loss": 1.927, "step": 8264 }, { "epoch": 0.21767184619436397, "grad_norm": 1.683029294013977, "learning_rate": 3.912562549381091e-05, "loss": 1.6537, "step": 8265 }, { "epoch": 0.2176981827758757, "grad_norm": 3.8068337440490723, "learning_rate": 3.912430866473532e-05, "loss": 1.6014, "step": 8266 }, { "epoch": 0.2177245193573874, "grad_norm": 3.7345430850982666, "learning_rate": 3.9122991835659734e-05, "loss": 1.5089, "step": 8267 }, { "epoch": 0.21775085593889912, "grad_norm": 2.826350688934326, "learning_rate": 3.912167500658414e-05, "loss": 2.228, "step": 8268 }, { "epoch": 0.21777719252041086, "grad_norm": 1.8692281246185303, "learning_rate": 3.912035817750856e-05, "loss": 1.5176, "step": 8269 }, { "epoch": 0.21780352910192258, "grad_norm": 1.8478991985321045, "learning_rate": 3.9119041348432974e-05, "loss": 1.6985, "step": 8270 }, { "epoch": 0.2178298656834343, "grad_norm": 2.1795806884765625, "learning_rate": 3.911772451935739e-05, "loss": 2.0444, "step": 8271 }, { "epoch": 0.217856202264946, "grad_norm": 1.4706774950027466, "learning_rate": 3.9116407690281805e-05, "loss": 1.9017, "step": 8272 }, { "epoch": 0.21788253884645772, "grad_norm": 2.129595994949341, "learning_rate": 3.9115090861206214e-05, "loss": 1.246, "step": 8273 }, { "epoch": 0.21790887542796944, "grad_norm": 2.0203781127929688, "learning_rate": 3.911377403213063e-05, "loss": 1.4599, "step": 8274 }, { "epoch": 0.21793521200948118, "grad_norm": 1.4200843572616577, "learning_rate": 3.9112457203055045e-05, "loss": 1.4933, "step": 8275 }, { "epoch": 0.2179615485909929, "grad_norm": 2.4956321716308594, "learning_rate": 3.911114037397946e-05, "loss": 2.3448, "step": 8276 }, { "epoch": 0.21798788517250461, 
"grad_norm": 3.0443646907806396, "learning_rate": 3.910982354490387e-05, "loss": 2.114, "step": 8277 }, { "epoch": 0.21801422175401633, "grad_norm": 1.8393988609313965, "learning_rate": 3.9108506715828286e-05, "loss": 2.6578, "step": 8278 }, { "epoch": 0.21804055833552805, "grad_norm": 3.3731274604797363, "learning_rate": 3.91071898867527e-05, "loss": 2.4777, "step": 8279 }, { "epoch": 0.21806689491703976, "grad_norm": 2.2712483406066895, "learning_rate": 3.910587305767712e-05, "loss": 1.9861, "step": 8280 }, { "epoch": 0.21809323149855148, "grad_norm": 2.6521594524383545, "learning_rate": 3.910455622860153e-05, "loss": 1.853, "step": 8281 }, { "epoch": 0.21811956808006322, "grad_norm": 3.0817089080810547, "learning_rate": 3.910323939952594e-05, "loss": 0.445, "step": 8282 }, { "epoch": 0.21814590466157494, "grad_norm": 1.8929076194763184, "learning_rate": 3.910192257045036e-05, "loss": 1.9595, "step": 8283 }, { "epoch": 0.21817224124308665, "grad_norm": 5.388201713562012, "learning_rate": 3.910060574137477e-05, "loss": 2.1167, "step": 8284 }, { "epoch": 0.21819857782459837, "grad_norm": 1.7223021984100342, "learning_rate": 3.909928891229919e-05, "loss": 1.076, "step": 8285 }, { "epoch": 0.21822491440611008, "grad_norm": 3.4151270389556885, "learning_rate": 3.90979720832236e-05, "loss": 2.0, "step": 8286 }, { "epoch": 0.2182512509876218, "grad_norm": 2.442265510559082, "learning_rate": 3.909665525414801e-05, "loss": 1.4741, "step": 8287 }, { "epoch": 0.2182775875691335, "grad_norm": 2.2576863765716553, "learning_rate": 3.909533842507243e-05, "loss": 2.2601, "step": 8288 }, { "epoch": 0.21830392415064526, "grad_norm": 2.8746497631073, "learning_rate": 3.9094021595996844e-05, "loss": 0.8524, "step": 8289 }, { "epoch": 0.21833026073215697, "grad_norm": 2.574685573577881, "learning_rate": 3.909270476692126e-05, "loss": 1.7461, "step": 8290 }, { "epoch": 0.2183565973136687, "grad_norm": 1.8106601238250732, "learning_rate": 3.909138793784567e-05, "loss": 2.0349, "step": 
8291 }, { "epoch": 0.2183829338951804, "grad_norm": 1.9987387657165527, "learning_rate": 3.9090071108770084e-05, "loss": 2.3239, "step": 8292 }, { "epoch": 0.21840927047669212, "grad_norm": 5.2676801681518555, "learning_rate": 3.908875427969449e-05, "loss": 1.3818, "step": 8293 }, { "epoch": 0.21843560705820383, "grad_norm": 3.1931509971618652, "learning_rate": 3.9087437450618915e-05, "loss": 1.4199, "step": 8294 }, { "epoch": 0.21846194363971558, "grad_norm": 4.456040382385254, "learning_rate": 3.9086120621543324e-05, "loss": 2.0105, "step": 8295 }, { "epoch": 0.2184882802212273, "grad_norm": 2.5653867721557617, "learning_rate": 3.908480379246774e-05, "loss": 1.6507, "step": 8296 }, { "epoch": 0.218514616802739, "grad_norm": 5.016998767852783, "learning_rate": 3.9083486963392155e-05, "loss": 1.202, "step": 8297 }, { "epoch": 0.21854095338425072, "grad_norm": 1.8235292434692383, "learning_rate": 3.908217013431657e-05, "loss": 0.6868, "step": 8298 }, { "epoch": 0.21856728996576244, "grad_norm": 2.589324951171875, "learning_rate": 3.9080853305240986e-05, "loss": 1.5412, "step": 8299 }, { "epoch": 0.21859362654727416, "grad_norm": 1.9480090141296387, "learning_rate": 3.9079536476165395e-05, "loss": 1.5285, "step": 8300 }, { "epoch": 0.21861996312878587, "grad_norm": 3.0515410900115967, "learning_rate": 3.907821964708981e-05, "loss": 1.7817, "step": 8301 }, { "epoch": 0.21864629971029761, "grad_norm": 2.4323890209198, "learning_rate": 3.907690281801422e-05, "loss": 1.974, "step": 8302 }, { "epoch": 0.21867263629180933, "grad_norm": 2.8883697986602783, "learning_rate": 3.907558598893864e-05, "loss": 1.6774, "step": 8303 }, { "epoch": 0.21869897287332105, "grad_norm": 3.349668264389038, "learning_rate": 3.907426915986305e-05, "loss": 1.3268, "step": 8304 }, { "epoch": 0.21872530945483276, "grad_norm": 2.9275786876678467, "learning_rate": 3.9072952330787466e-05, "loss": 2.3916, "step": 8305 }, { "epoch": 0.21875164603634448, "grad_norm": 2.2405154705047607, 
"learning_rate": 3.907163550171188e-05, "loss": 1.4918, "step": 8306 }, { "epoch": 0.2187779826178562, "grad_norm": 2.03804087638855, "learning_rate": 3.907031867263629e-05, "loss": 1.431, "step": 8307 }, { "epoch": 0.2188043191993679, "grad_norm": 2.5511977672576904, "learning_rate": 3.906900184356071e-05, "loss": 1.6714, "step": 8308 }, { "epoch": 0.21883065578087965, "grad_norm": 1.6598297357559204, "learning_rate": 3.906768501448512e-05, "loss": 1.7592, "step": 8309 }, { "epoch": 0.21885699236239137, "grad_norm": 2.0941429138183594, "learning_rate": 3.906636818540954e-05, "loss": 1.936, "step": 8310 }, { "epoch": 0.21888332894390308, "grad_norm": 2.101958990097046, "learning_rate": 3.9065051356333946e-05, "loss": 2.4416, "step": 8311 }, { "epoch": 0.2189096655254148, "grad_norm": 2.879706859588623, "learning_rate": 3.906373452725836e-05, "loss": 1.453, "step": 8312 }, { "epoch": 0.2189360021069265, "grad_norm": 5.836144924163818, "learning_rate": 3.906241769818278e-05, "loss": 0.6958, "step": 8313 }, { "epoch": 0.21896233868843823, "grad_norm": 2.725416660308838, "learning_rate": 3.906110086910719e-05, "loss": 2.085, "step": 8314 }, { "epoch": 0.21898867526994997, "grad_norm": 4.340921878814697, "learning_rate": 3.905978404003161e-05, "loss": 1.7496, "step": 8315 }, { "epoch": 0.2190150118514617, "grad_norm": 2.4558308124542236, "learning_rate": 3.905846721095602e-05, "loss": 1.9737, "step": 8316 }, { "epoch": 0.2190413484329734, "grad_norm": 2.857482671737671, "learning_rate": 3.905715038188044e-05, "loss": 1.9177, "step": 8317 }, { "epoch": 0.21906768501448512, "grad_norm": 2.1477527618408203, "learning_rate": 3.905583355280485e-05, "loss": 1.4247, "step": 8318 }, { "epoch": 0.21909402159599684, "grad_norm": 3.0266008377075195, "learning_rate": 3.9054516723729264e-05, "loss": 1.608, "step": 8319 }, { "epoch": 0.21912035817750855, "grad_norm": 7.984438896179199, "learning_rate": 3.905319989465367e-05, "loss": 1.6013, "step": 8320 }, { "epoch": 
0.21914669475902027, "grad_norm": 3.805899143218994, "learning_rate": 3.905188306557809e-05, "loss": 0.1864, "step": 8321 }, { "epoch": 0.219173031340532, "grad_norm": 2.510054588317871, "learning_rate": 3.9050566236502504e-05, "loss": 1.7109, "step": 8322 }, { "epoch": 0.21919936792204373, "grad_norm": 2.0700831413269043, "learning_rate": 3.904924940742692e-05, "loss": 1.0419, "step": 8323 }, { "epoch": 0.21922570450355544, "grad_norm": 2.5530920028686523, "learning_rate": 3.904793257835133e-05, "loss": 1.6875, "step": 8324 }, { "epoch": 0.21925204108506716, "grad_norm": 2.2969350814819336, "learning_rate": 3.9046615749275744e-05, "loss": 1.2661, "step": 8325 }, { "epoch": 0.21927837766657887, "grad_norm": 3.7111880779266357, "learning_rate": 3.904529892020016e-05, "loss": 0.6292, "step": 8326 }, { "epoch": 0.2193047142480906, "grad_norm": 5.629997730255127, "learning_rate": 3.9043982091124575e-05, "loss": 1.0741, "step": 8327 }, { "epoch": 0.21933105082960233, "grad_norm": 1.9615687131881714, "learning_rate": 3.904266526204899e-05, "loss": 1.7914, "step": 8328 }, { "epoch": 0.21935738741111405, "grad_norm": 2.299870491027832, "learning_rate": 3.90413484329734e-05, "loss": 1.5477, "step": 8329 }, { "epoch": 0.21938372399262576, "grad_norm": 2.5918002128601074, "learning_rate": 3.9040031603897815e-05, "loss": 2.2188, "step": 8330 }, { "epoch": 0.21941006057413748, "grad_norm": 2.9231417179107666, "learning_rate": 3.903871477482223e-05, "loss": 1.2888, "step": 8331 }, { "epoch": 0.2194363971556492, "grad_norm": 2.1001057624816895, "learning_rate": 3.9037397945746646e-05, "loss": 1.4899, "step": 8332 }, { "epoch": 0.2194627337371609, "grad_norm": 1.4785897731781006, "learning_rate": 3.9036081116671055e-05, "loss": 2.1613, "step": 8333 }, { "epoch": 0.21948907031867262, "grad_norm": 1.7580170631408691, "learning_rate": 3.903476428759547e-05, "loss": 1.9895, "step": 8334 }, { "epoch": 0.21951540690018437, "grad_norm": 3.9516351222991943, "learning_rate": 
3.9033447458519886e-05, "loss": 1.3631, "step": 8335 }, { "epoch": 0.21954174348169608, "grad_norm": 2.0070955753326416, "learning_rate": 3.90321306294443e-05, "loss": 2.1751, "step": 8336 }, { "epoch": 0.2195680800632078, "grad_norm": 1.9762344360351562, "learning_rate": 3.903081380036872e-05, "loss": 1.8103, "step": 8337 }, { "epoch": 0.21959441664471951, "grad_norm": 2.8037970066070557, "learning_rate": 3.9029496971293126e-05, "loss": 2.0264, "step": 8338 }, { "epoch": 0.21962075322623123, "grad_norm": 1.8411526679992676, "learning_rate": 3.902818014221754e-05, "loss": 1.6505, "step": 8339 }, { "epoch": 0.21964708980774295, "grad_norm": 1.9780889749526978, "learning_rate": 3.902686331314195e-05, "loss": 1.597, "step": 8340 }, { "epoch": 0.21967342638925466, "grad_norm": 2.954498052597046, "learning_rate": 3.902554648406637e-05, "loss": 1.4721, "step": 8341 }, { "epoch": 0.2196997629707664, "grad_norm": 2.501983404159546, "learning_rate": 3.902422965499078e-05, "loss": 0.8074, "step": 8342 }, { "epoch": 0.21972609955227812, "grad_norm": 1.7199556827545166, "learning_rate": 3.90229128259152e-05, "loss": 2.0235, "step": 8343 }, { "epoch": 0.21975243613378984, "grad_norm": 2.8954038619995117, "learning_rate": 3.902159599683961e-05, "loss": 1.326, "step": 8344 }, { "epoch": 0.21977877271530155, "grad_norm": 4.732748031616211, "learning_rate": 3.902027916776402e-05, "loss": 1.4896, "step": 8345 }, { "epoch": 0.21980510929681327, "grad_norm": 3.54403018951416, "learning_rate": 3.9018962338688445e-05, "loss": 1.3631, "step": 8346 }, { "epoch": 0.21983144587832498, "grad_norm": 1.9328362941741943, "learning_rate": 3.901764550961285e-05, "loss": 1.6679, "step": 8347 }, { "epoch": 0.21985778245983673, "grad_norm": 1.5179873704910278, "learning_rate": 3.901632868053727e-05, "loss": 1.3858, "step": 8348 }, { "epoch": 0.21988411904134844, "grad_norm": 1.7189298868179321, "learning_rate": 3.901501185146168e-05, "loss": 1.5546, "step": 8349 }, { "epoch": 0.21991045562286016, 
"grad_norm": 1.4995951652526855, "learning_rate": 3.90136950223861e-05, "loss": 1.4979, "step": 8350 }, { "epoch": 0.21993679220437187, "grad_norm": 5.189961910247803, "learning_rate": 3.901237819331051e-05, "loss": 2.0618, "step": 8351 }, { "epoch": 0.2199631287858836, "grad_norm": 1.784239411354065, "learning_rate": 3.9011061364234925e-05, "loss": 2.2262, "step": 8352 }, { "epoch": 0.2199894653673953, "grad_norm": 1.5889970064163208, "learning_rate": 3.900974453515934e-05, "loss": 1.281, "step": 8353 }, { "epoch": 0.22001580194890702, "grad_norm": 2.6087942123413086, "learning_rate": 3.900842770608375e-05, "loss": 1.6153, "step": 8354 }, { "epoch": 0.22004213853041876, "grad_norm": 1.5636043548583984, "learning_rate": 3.900711087700817e-05, "loss": 1.7055, "step": 8355 }, { "epoch": 0.22006847511193048, "grad_norm": 2.0564708709716797, "learning_rate": 3.900579404793258e-05, "loss": 1.8526, "step": 8356 }, { "epoch": 0.2200948116934422, "grad_norm": 2.7637977600097656, "learning_rate": 3.9004477218856996e-05, "loss": 1.9168, "step": 8357 }, { "epoch": 0.2201211482749539, "grad_norm": 1.9751542806625366, "learning_rate": 3.9003160389781405e-05, "loss": 0.7221, "step": 8358 }, { "epoch": 0.22014748485646563, "grad_norm": 2.228431224822998, "learning_rate": 3.900184356070582e-05, "loss": 2.3343, "step": 8359 }, { "epoch": 0.22017382143797734, "grad_norm": 4.339964866638184, "learning_rate": 3.9000526731630236e-05, "loss": 1.872, "step": 8360 }, { "epoch": 0.22020015801948908, "grad_norm": 2.463357448577881, "learning_rate": 3.899920990255465e-05, "loss": 1.0659, "step": 8361 }, { "epoch": 0.2202264946010008, "grad_norm": 4.519904136657715, "learning_rate": 3.899789307347907e-05, "loss": 1.8219, "step": 8362 }, { "epoch": 0.22025283118251252, "grad_norm": 3.036019802093506, "learning_rate": 3.8996576244403476e-05, "loss": 1.8034, "step": 8363 }, { "epoch": 0.22027916776402423, "grad_norm": 3.402838945388794, "learning_rate": 3.89952594153279e-05, "loss": 1.917, 
"step": 8364 }, { "epoch": 0.22030550434553595, "grad_norm": 2.916759490966797, "learning_rate": 3.899394258625231e-05, "loss": 1.6904, "step": 8365 }, { "epoch": 0.22033184092704766, "grad_norm": 1.6138890981674194, "learning_rate": 3.899262575717672e-05, "loss": 1.7718, "step": 8366 }, { "epoch": 0.22035817750855938, "grad_norm": 5.818990230560303, "learning_rate": 3.899130892810113e-05, "loss": 1.5933, "step": 8367 }, { "epoch": 0.22038451409007112, "grad_norm": 2.0275235176086426, "learning_rate": 3.898999209902555e-05, "loss": 1.7, "step": 8368 }, { "epoch": 0.22041085067158284, "grad_norm": 2.8279902935028076, "learning_rate": 3.898867526994996e-05, "loss": 1.817, "step": 8369 }, { "epoch": 0.22043718725309455, "grad_norm": 2.10720157623291, "learning_rate": 3.898735844087438e-05, "loss": 1.232, "step": 8370 }, { "epoch": 0.22046352383460627, "grad_norm": 2.1118297576904297, "learning_rate": 3.898604161179879e-05, "loss": 1.6767, "step": 8371 }, { "epoch": 0.22048986041611798, "grad_norm": 3.26546049118042, "learning_rate": 3.89847247827232e-05, "loss": 1.4664, "step": 8372 }, { "epoch": 0.2205161969976297, "grad_norm": 1.9262683391571045, "learning_rate": 3.898340795364762e-05, "loss": 0.7762, "step": 8373 }, { "epoch": 0.22054253357914141, "grad_norm": 3.0245628356933594, "learning_rate": 3.8982091124572034e-05, "loss": 1.4516, "step": 8374 }, { "epoch": 0.22056887016065316, "grad_norm": 1.8919014930725098, "learning_rate": 3.898077429549645e-05, "loss": 0.6358, "step": 8375 }, { "epoch": 0.22059520674216487, "grad_norm": 3.9048259258270264, "learning_rate": 3.897945746642086e-05, "loss": 0.7696, "step": 8376 }, { "epoch": 0.2206215433236766, "grad_norm": 2.440701961517334, "learning_rate": 3.8978140637345274e-05, "loss": 0.4367, "step": 8377 }, { "epoch": 0.2206478799051883, "grad_norm": 2.389115571975708, "learning_rate": 3.897682380826968e-05, "loss": 1.5765, "step": 8378 }, { "epoch": 0.22067421648670002, "grad_norm": 2.5260136127471924, 
"learning_rate": 3.8975506979194105e-05, "loss": 1.9794, "step": 8379 }, { "epoch": 0.22070055306821174, "grad_norm": 2.118802785873413, "learning_rate": 3.8974190150118514e-05, "loss": 1.6562, "step": 8380 }, { "epoch": 0.22072688964972348, "grad_norm": 2.1320691108703613, "learning_rate": 3.897287332104293e-05, "loss": 1.4441, "step": 8381 }, { "epoch": 0.2207532262312352, "grad_norm": 1.8724281787872314, "learning_rate": 3.8971556491967345e-05, "loss": 2.1027, "step": 8382 }, { "epoch": 0.2207795628127469, "grad_norm": 1.9332189559936523, "learning_rate": 3.897023966289176e-05, "loss": 2.0203, "step": 8383 }, { "epoch": 0.22080589939425863, "grad_norm": 1.377746343612671, "learning_rate": 3.8968922833816176e-05, "loss": 0.9216, "step": 8384 }, { "epoch": 0.22083223597577034, "grad_norm": 1.7230174541473389, "learning_rate": 3.8967606004740585e-05, "loss": 1.5728, "step": 8385 }, { "epoch": 0.22085857255728206, "grad_norm": 3.3916029930114746, "learning_rate": 3.8966289175665e-05, "loss": 1.4796, "step": 8386 }, { "epoch": 0.22088490913879377, "grad_norm": 1.917340636253357, "learning_rate": 3.896497234658941e-05, "loss": 1.8327, "step": 8387 }, { "epoch": 0.22091124572030552, "grad_norm": 1.668746829032898, "learning_rate": 3.896365551751383e-05, "loss": 2.0111, "step": 8388 }, { "epoch": 0.22093758230181723, "grad_norm": 1.511021614074707, "learning_rate": 3.896233868843824e-05, "loss": 1.7179, "step": 8389 }, { "epoch": 0.22096391888332895, "grad_norm": 2.1119842529296875, "learning_rate": 3.8961021859362656e-05, "loss": 2.7975, "step": 8390 }, { "epoch": 0.22099025546484066, "grad_norm": 1.987365961074829, "learning_rate": 3.895970503028707e-05, "loss": 1.3163, "step": 8391 }, { "epoch": 0.22101659204635238, "grad_norm": 1.7661840915679932, "learning_rate": 3.895838820121148e-05, "loss": 2.0107, "step": 8392 }, { "epoch": 0.2210429286278641, "grad_norm": 1.6379493474960327, "learning_rate": 3.89570713721359e-05, "loss": 1.8292, "step": 8393 }, { "epoch": 
0.2210692652093758, "grad_norm": 4.291928768157959, "learning_rate": 3.895575454306031e-05, "loss": 1.6041, "step": 8394 }, { "epoch": 0.22109560179088755, "grad_norm": 2.297240972518921, "learning_rate": 3.895443771398473e-05, "loss": 1.5278, "step": 8395 }, { "epoch": 0.22112193837239927, "grad_norm": 1.8344429731369019, "learning_rate": 3.8953120884909136e-05, "loss": 1.9794, "step": 8396 }, { "epoch": 0.22114827495391098, "grad_norm": 3.064854860305786, "learning_rate": 3.895180405583356e-05, "loss": 2.056, "step": 8397 }, { "epoch": 0.2211746115354227, "grad_norm": 1.4964947700500488, "learning_rate": 3.895048722675797e-05, "loss": 2.024, "step": 8398 }, { "epoch": 0.22120094811693441, "grad_norm": 1.6163251399993896, "learning_rate": 3.894917039768238e-05, "loss": 1.2892, "step": 8399 }, { "epoch": 0.22122728469844613, "grad_norm": 2.7338666915893555, "learning_rate": 3.89478535686068e-05, "loss": 2.1958, "step": 8400 }, { "epoch": 0.22125362127995787, "grad_norm": 1.7635051012039185, "learning_rate": 3.894653673953121e-05, "loss": 2.1056, "step": 8401 }, { "epoch": 0.2212799578614696, "grad_norm": 2.8649826049804688, "learning_rate": 3.894521991045563e-05, "loss": 1.349, "step": 8402 }, { "epoch": 0.2213062944429813, "grad_norm": 5.887941837310791, "learning_rate": 3.894390308138004e-05, "loss": 1.3976, "step": 8403 }, { "epoch": 0.22133263102449302, "grad_norm": 4.322699069976807, "learning_rate": 3.8942586252304454e-05, "loss": 1.8634, "step": 8404 }, { "epoch": 0.22135896760600474, "grad_norm": 1.8168007135391235, "learning_rate": 3.894126942322886e-05, "loss": 1.2342, "step": 8405 }, { "epoch": 0.22138530418751645, "grad_norm": 2.0172464847564697, "learning_rate": 3.893995259415328e-05, "loss": 1.347, "step": 8406 }, { "epoch": 0.22141164076902817, "grad_norm": 1.9244657754898071, "learning_rate": 3.8938635765077694e-05, "loss": 1.7271, "step": 8407 }, { "epoch": 0.2214379773505399, "grad_norm": 1.7695342302322388, "learning_rate": 3.893731893600211e-05, 
"loss": 2.49, "step": 8408 }, { "epoch": 0.22146431393205163, "grad_norm": 2.9757330417633057, "learning_rate": 3.8936002106926526e-05, "loss": 1.6874, "step": 8409 }, { "epoch": 0.22149065051356334, "grad_norm": 1.7494467496871948, "learning_rate": 3.8934685277850934e-05, "loss": 2.6485, "step": 8410 }, { "epoch": 0.22151698709507506, "grad_norm": 2.0662641525268555, "learning_rate": 3.893336844877535e-05, "loss": 1.6856, "step": 8411 }, { "epoch": 0.22154332367658677, "grad_norm": 2.0488367080688477, "learning_rate": 3.8932051619699766e-05, "loss": 2.3371, "step": 8412 }, { "epoch": 0.2215696602580985, "grad_norm": 2.8280792236328125, "learning_rate": 3.893073479062418e-05, "loss": 2.217, "step": 8413 }, { "epoch": 0.22159599683961023, "grad_norm": 4.707918643951416, "learning_rate": 3.892941796154859e-05, "loss": 1.7576, "step": 8414 }, { "epoch": 0.22162233342112195, "grad_norm": 1.8562642335891724, "learning_rate": 3.8928101132473006e-05, "loss": 2.2244, "step": 8415 }, { "epoch": 0.22164867000263366, "grad_norm": 2.3995871543884277, "learning_rate": 3.892678430339742e-05, "loss": 1.3258, "step": 8416 }, { "epoch": 0.22167500658414538, "grad_norm": 2.1873204708099365, "learning_rate": 3.892546747432184e-05, "loss": 1.9068, "step": 8417 }, { "epoch": 0.2217013431656571, "grad_norm": 1.8489776849746704, "learning_rate": 3.892415064524625e-05, "loss": 1.674, "step": 8418 }, { "epoch": 0.2217276797471688, "grad_norm": 4.264925479888916, "learning_rate": 3.892283381617066e-05, "loss": 0.8233, "step": 8419 }, { "epoch": 0.22175401632868053, "grad_norm": 1.5581228733062744, "learning_rate": 3.892151698709508e-05, "loss": 2.2477, "step": 8420 }, { "epoch": 0.22178035291019227, "grad_norm": 2.964524269104004, "learning_rate": 3.892020015801949e-05, "loss": 1.5558, "step": 8421 }, { "epoch": 0.22180668949170398, "grad_norm": 2.144294023513794, "learning_rate": 3.891888332894391e-05, "loss": 1.9218, "step": 8422 }, { "epoch": 0.2218330260732157, "grad_norm": 
3.3459372520446777, "learning_rate": 3.891756649986832e-05, "loss": 1.964, "step": 8423 }, { "epoch": 0.22185936265472742, "grad_norm": 2.1398229598999023, "learning_rate": 3.891624967079273e-05, "loss": 0.6664, "step": 8424 }, { "epoch": 0.22188569923623913, "grad_norm": 4.393989562988281, "learning_rate": 3.891493284171714e-05, "loss": 1.631, "step": 8425 }, { "epoch": 0.22191203581775085, "grad_norm": 2.6729469299316406, "learning_rate": 3.8913616012641564e-05, "loss": 2.2048, "step": 8426 }, { "epoch": 0.22193837239926256, "grad_norm": 2.602407693862915, "learning_rate": 3.891229918356597e-05, "loss": 1.3373, "step": 8427 }, { "epoch": 0.2219647089807743, "grad_norm": 5.29533052444458, "learning_rate": 3.891098235449039e-05, "loss": 1.7386, "step": 8428 }, { "epoch": 0.22199104556228602, "grad_norm": 3.888007402420044, "learning_rate": 3.8909665525414804e-05, "loss": 0.9082, "step": 8429 }, { "epoch": 0.22201738214379774, "grad_norm": 1.585597038269043, "learning_rate": 3.890834869633922e-05, "loss": 1.7358, "step": 8430 }, { "epoch": 0.22204371872530945, "grad_norm": 1.3620662689208984, "learning_rate": 3.8907031867263635e-05, "loss": 1.7565, "step": 8431 }, { "epoch": 0.22207005530682117, "grad_norm": 1.9046671390533447, "learning_rate": 3.8905715038188044e-05, "loss": 1.4714, "step": 8432 }, { "epoch": 0.22209639188833288, "grad_norm": 1.5443295240402222, "learning_rate": 3.890439820911246e-05, "loss": 2.1616, "step": 8433 }, { "epoch": 0.22212272846984463, "grad_norm": 1.9211442470550537, "learning_rate": 3.890308138003687e-05, "loss": 1.8194, "step": 8434 }, { "epoch": 0.22214906505135634, "grad_norm": 3.39355731010437, "learning_rate": 3.890176455096129e-05, "loss": 1.2593, "step": 8435 }, { "epoch": 0.22217540163286806, "grad_norm": 2.3450522422790527, "learning_rate": 3.89004477218857e-05, "loss": 1.9045, "step": 8436 }, { "epoch": 0.22220173821437977, "grad_norm": 1.8722589015960693, "learning_rate": 3.8899130892810115e-05, "loss": 1.7518, "step": 8437 
}, { "epoch": 0.2222280747958915, "grad_norm": 4.835537433624268, "learning_rate": 3.889781406373453e-05, "loss": 2.1612, "step": 8438 }, { "epoch": 0.2222544113774032, "grad_norm": 3.000201463699341, "learning_rate": 3.889649723465894e-05, "loss": 2.1411, "step": 8439 }, { "epoch": 0.22228074795891492, "grad_norm": 1.4438332319259644, "learning_rate": 3.889518040558336e-05, "loss": 1.7441, "step": 8440 }, { "epoch": 0.22230708454042666, "grad_norm": 2.154694080352783, "learning_rate": 3.889386357650777e-05, "loss": 2.3642, "step": 8441 }, { "epoch": 0.22233342112193838, "grad_norm": 1.914270281791687, "learning_rate": 3.8892546747432186e-05, "loss": 1.8811, "step": 8442 }, { "epoch": 0.2223597577034501, "grad_norm": 2.5694048404693604, "learning_rate": 3.8891229918356595e-05, "loss": 1.3285, "step": 8443 }, { "epoch": 0.2223860942849618, "grad_norm": 2.4416675567626953, "learning_rate": 3.888991308928102e-05, "loss": 1.613, "step": 8444 }, { "epoch": 0.22241243086647353, "grad_norm": 1.5371558666229248, "learning_rate": 3.8888596260205426e-05, "loss": 1.4086, "step": 8445 }, { "epoch": 0.22243876744798524, "grad_norm": 1.6044889688491821, "learning_rate": 3.888727943112984e-05, "loss": 1.6043, "step": 8446 }, { "epoch": 0.22246510402949699, "grad_norm": 2.9604430198669434, "learning_rate": 3.888596260205426e-05, "loss": 2.1392, "step": 8447 }, { "epoch": 0.2224914406110087, "grad_norm": 2.8567779064178467, "learning_rate": 3.8884645772978666e-05, "loss": 2.2244, "step": 8448 }, { "epoch": 0.22251777719252042, "grad_norm": 2.0942060947418213, "learning_rate": 3.888332894390309e-05, "loss": 1.9639, "step": 8449 }, { "epoch": 0.22254411377403213, "grad_norm": 2.455822706222534, "learning_rate": 3.88820121148275e-05, "loss": 1.7601, "step": 8450 }, { "epoch": 0.22257045035554385, "grad_norm": 3.185560941696167, "learning_rate": 3.888069528575191e-05, "loss": 1.8894, "step": 8451 }, { "epoch": 0.22259678693705556, "grad_norm": 2.9746787548065186, "learning_rate": 
3.887937845667632e-05, "loss": 2.138, "step": 8452 }, { "epoch": 0.22262312351856728, "grad_norm": 2.757542610168457, "learning_rate": 3.887806162760074e-05, "loss": 1.5617, "step": 8453 }, { "epoch": 0.22264946010007902, "grad_norm": 3.7855117321014404, "learning_rate": 3.887674479852515e-05, "loss": 1.6345, "step": 8454 }, { "epoch": 0.22267579668159074, "grad_norm": 1.7267675399780273, "learning_rate": 3.887542796944957e-05, "loss": 1.8609, "step": 8455 }, { "epoch": 0.22270213326310245, "grad_norm": 1.4913760423660278, "learning_rate": 3.8874111140373984e-05, "loss": 1.6093, "step": 8456 }, { "epoch": 0.22272846984461417, "grad_norm": 3.7315099239349365, "learning_rate": 3.887279431129839e-05, "loss": 1.4499, "step": 8457 }, { "epoch": 0.22275480642612588, "grad_norm": 2.175516366958618, "learning_rate": 3.887147748222281e-05, "loss": 1.322, "step": 8458 }, { "epoch": 0.2227811430076376, "grad_norm": 3.672293186187744, "learning_rate": 3.8870160653147224e-05, "loss": 1.78, "step": 8459 }, { "epoch": 0.22280747958914932, "grad_norm": 3.357968807220459, "learning_rate": 3.886884382407164e-05, "loss": 1.387, "step": 8460 }, { "epoch": 0.22283381617066106, "grad_norm": 1.8350592851638794, "learning_rate": 3.886752699499605e-05, "loss": 1.474, "step": 8461 }, { "epoch": 0.22286015275217277, "grad_norm": 2.6834311485290527, "learning_rate": 3.8866210165920464e-05, "loss": 1.1394, "step": 8462 }, { "epoch": 0.2228864893336845, "grad_norm": 3.189242124557495, "learning_rate": 3.886489333684488e-05, "loss": 1.065, "step": 8463 }, { "epoch": 0.2229128259151962, "grad_norm": 2.0183005332946777, "learning_rate": 3.8863576507769295e-05, "loss": 1.9932, "step": 8464 }, { "epoch": 0.22293916249670792, "grad_norm": 3.5083160400390625, "learning_rate": 3.886225967869371e-05, "loss": 2.7922, "step": 8465 }, { "epoch": 0.22296549907821964, "grad_norm": 2.4788997173309326, "learning_rate": 3.886094284961812e-05, "loss": 1.4043, "step": 8466 }, { "epoch": 0.22299183565973138, 
"grad_norm": 2.37233829498291, "learning_rate": 3.8859626020542535e-05, "loss": 0.5077, "step": 8467 }, { "epoch": 0.2230181722412431, "grad_norm": 2.2201685905456543, "learning_rate": 3.885830919146695e-05, "loss": 1.2089, "step": 8468 }, { "epoch": 0.2230445088227548, "grad_norm": 2.6045634746551514, "learning_rate": 3.8856992362391367e-05, "loss": 1.7408, "step": 8469 }, { "epoch": 0.22307084540426653, "grad_norm": 1.5373564958572388, "learning_rate": 3.8855675533315775e-05, "loss": 2.3959, "step": 8470 }, { "epoch": 0.22309718198577824, "grad_norm": 1.8168038129806519, "learning_rate": 3.885435870424019e-05, "loss": 1.8689, "step": 8471 }, { "epoch": 0.22312351856728996, "grad_norm": 6.167013168334961, "learning_rate": 3.88530418751646e-05, "loss": 2.291, "step": 8472 }, { "epoch": 0.22314985514880167, "grad_norm": 2.320302963256836, "learning_rate": 3.885172504608902e-05, "loss": 1.5089, "step": 8473 }, { "epoch": 0.22317619173031342, "grad_norm": 2.7399489879608154, "learning_rate": 3.885040821701343e-05, "loss": 1.7874, "step": 8474 }, { "epoch": 0.22320252831182513, "grad_norm": 1.4955161809921265, "learning_rate": 3.8849091387937847e-05, "loss": 1.4719, "step": 8475 }, { "epoch": 0.22322886489333685, "grad_norm": 3.9908576011657715, "learning_rate": 3.884777455886226e-05, "loss": 1.4037, "step": 8476 }, { "epoch": 0.22325520147484856, "grad_norm": 2.6037309169769287, "learning_rate": 3.884645772978668e-05, "loss": 0.7771, "step": 8477 }, { "epoch": 0.22328153805636028, "grad_norm": 1.9858044385910034, "learning_rate": 3.884514090071109e-05, "loss": 2.0154, "step": 8478 }, { "epoch": 0.223307874637872, "grad_norm": 1.781132459640503, "learning_rate": 3.88438240716355e-05, "loss": 1.6182, "step": 8479 }, { "epoch": 0.2233342112193837, "grad_norm": 1.9039111137390137, "learning_rate": 3.884250724255992e-05, "loss": 2.3106, "step": 8480 }, { "epoch": 0.22336054780089545, "grad_norm": 2.432525396347046, "learning_rate": 3.8841190413484327e-05, "loss": 1.3043, 
"step": 8481 }, { "epoch": 0.22338688438240717, "grad_norm": 1.993685007095337, "learning_rate": 3.883987358440875e-05, "loss": 1.7583, "step": 8482 }, { "epoch": 0.22341322096391888, "grad_norm": 2.3282101154327393, "learning_rate": 3.883855675533316e-05, "loss": 1.4714, "step": 8483 }, { "epoch": 0.2234395575454306, "grad_norm": 3.4791910648345947, "learning_rate": 3.883723992625757e-05, "loss": 1.6807, "step": 8484 }, { "epoch": 0.22346589412694232, "grad_norm": 2.2094662189483643, "learning_rate": 3.883592309718199e-05, "loss": 2.3535, "step": 8485 }, { "epoch": 0.22349223070845403, "grad_norm": 1.927895188331604, "learning_rate": 3.88346062681064e-05, "loss": 1.6366, "step": 8486 }, { "epoch": 0.22351856728996577, "grad_norm": 2.2030980587005615, "learning_rate": 3.883328943903082e-05, "loss": 1.6209, "step": 8487 }, { "epoch": 0.2235449038714775, "grad_norm": 1.9624310731887817, "learning_rate": 3.883197260995523e-05, "loss": 2.0201, "step": 8488 }, { "epoch": 0.2235712404529892, "grad_norm": 1.8407219648361206, "learning_rate": 3.8830655780879645e-05, "loss": 1.8291, "step": 8489 }, { "epoch": 0.22359757703450092, "grad_norm": 3.4186699390411377, "learning_rate": 3.8829338951804053e-05, "loss": 1.4813, "step": 8490 }, { "epoch": 0.22362391361601264, "grad_norm": 2.8706843852996826, "learning_rate": 3.882802212272847e-05, "loss": 1.9512, "step": 8491 }, { "epoch": 0.22365025019752435, "grad_norm": 2.0186493396759033, "learning_rate": 3.8826705293652885e-05, "loss": 1.708, "step": 8492 }, { "epoch": 0.22367658677903607, "grad_norm": 2.8753669261932373, "learning_rate": 3.88253884645773e-05, "loss": 0.2223, "step": 8493 }, { "epoch": 0.2237029233605478, "grad_norm": 3.4306418895721436, "learning_rate": 3.8824071635501716e-05, "loss": 1.1844, "step": 8494 }, { "epoch": 0.22372925994205953, "grad_norm": 2.8431928157806396, "learning_rate": 3.8822754806426125e-05, "loss": 0.7607, "step": 8495 }, { "epoch": 0.22375559652357124, "grad_norm": 2.5781729221343994, 
"learning_rate": 3.882143797735055e-05, "loss": 1.8525, "step": 8496 }, { "epoch": 0.22378193310508296, "grad_norm": 6.239348411560059, "learning_rate": 3.8820121148274956e-05, "loss": 1.2585, "step": 8497 }, { "epoch": 0.22380826968659467, "grad_norm": 1.6619477272033691, "learning_rate": 3.881880431919937e-05, "loss": 2.1441, "step": 8498 }, { "epoch": 0.2238346062681064, "grad_norm": 3.3440001010894775, "learning_rate": 3.881748749012378e-05, "loss": 2.2624, "step": 8499 }, { "epoch": 0.22386094284961813, "grad_norm": 2.327094793319702, "learning_rate": 3.8816170661048196e-05, "loss": 2.2825, "step": 8500 }, { "epoch": 0.22388727943112985, "grad_norm": 2.0841317176818848, "learning_rate": 3.881485383197261e-05, "loss": 2.0829, "step": 8501 }, { "epoch": 0.22391361601264156, "grad_norm": 1.7484424114227295, "learning_rate": 3.881353700289703e-05, "loss": 0.6641, "step": 8502 }, { "epoch": 0.22393995259415328, "grad_norm": 1.9922446012496948, "learning_rate": 3.881222017382144e-05, "loss": 2.0024, "step": 8503 }, { "epoch": 0.223966289175665, "grad_norm": 2.3496694564819336, "learning_rate": 3.881090334474585e-05, "loss": 1.3751, "step": 8504 }, { "epoch": 0.2239926257571767, "grad_norm": 3.416658401489258, "learning_rate": 3.880958651567027e-05, "loss": 1.425, "step": 8505 }, { "epoch": 0.22401896233868843, "grad_norm": 2.39328670501709, "learning_rate": 3.880826968659468e-05, "loss": 1.3231, "step": 8506 }, { "epoch": 0.22404529892020017, "grad_norm": 2.422624111175537, "learning_rate": 3.88069528575191e-05, "loss": 1.6258, "step": 8507 }, { "epoch": 0.22407163550171189, "grad_norm": 4.1461334228515625, "learning_rate": 3.880563602844351e-05, "loss": 1.278, "step": 8508 }, { "epoch": 0.2240979720832236, "grad_norm": 2.1527082920074463, "learning_rate": 3.880431919936792e-05, "loss": 1.9502, "step": 8509 }, { "epoch": 0.22412430866473532, "grad_norm": 3.2467570304870605, "learning_rate": 3.880300237029234e-05, "loss": 1.6126, "step": 8510 }, { "epoch": 
0.22415064524624703, "grad_norm": 2.087463617324829, "learning_rate": 3.8801685541216754e-05, "loss": 2.4824, "step": 8511 }, { "epoch": 0.22417698182775875, "grad_norm": 1.4734315872192383, "learning_rate": 3.880036871214117e-05, "loss": 1.6918, "step": 8512 }, { "epoch": 0.22420331840927046, "grad_norm": 2.37542986869812, "learning_rate": 3.879905188306558e-05, "loss": 1.5313, "step": 8513 }, { "epoch": 0.2242296549907822, "grad_norm": 1.6531273126602173, "learning_rate": 3.8797735053989994e-05, "loss": 0.7821, "step": 8514 }, { "epoch": 0.22425599157229392, "grad_norm": 2.556692361831665, "learning_rate": 3.879641822491441e-05, "loss": 1.7999, "step": 8515 }, { "epoch": 0.22428232815380564, "grad_norm": 1.7149287462234497, "learning_rate": 3.8795101395838825e-05, "loss": 1.41, "step": 8516 }, { "epoch": 0.22430866473531735, "grad_norm": 1.8761473894119263, "learning_rate": 3.8793784566763234e-05, "loss": 1.8582, "step": 8517 }, { "epoch": 0.22433500131682907, "grad_norm": 2.844177722930908, "learning_rate": 3.879246773768765e-05, "loss": 1.6425, "step": 8518 }, { "epoch": 0.22436133789834078, "grad_norm": 4.278332233428955, "learning_rate": 3.879115090861206e-05, "loss": 1.5334, "step": 8519 }, { "epoch": 0.22438767447985253, "grad_norm": 4.926701545715332, "learning_rate": 3.878983407953648e-05, "loss": 1.7066, "step": 8520 }, { "epoch": 0.22441401106136424, "grad_norm": 2.12789249420166, "learning_rate": 3.8788517250460896e-05, "loss": 1.9511, "step": 8521 }, { "epoch": 0.22444034764287596, "grad_norm": 3.5499444007873535, "learning_rate": 3.8787200421385305e-05, "loss": 0.4712, "step": 8522 }, { "epoch": 0.22446668422438767, "grad_norm": 2.7263238430023193, "learning_rate": 3.878588359230972e-05, "loss": 1.6893, "step": 8523 }, { "epoch": 0.2244930208058994, "grad_norm": 1.685333251953125, "learning_rate": 3.878456676323413e-05, "loss": 1.1083, "step": 8524 }, { "epoch": 0.2245193573874111, "grad_norm": 2.1979739665985107, "learning_rate": 
3.878324993415855e-05, "loss": 1.131, "step": 8525 }, { "epoch": 0.22454569396892282, "grad_norm": 1.5049335956573486, "learning_rate": 3.878193310508296e-05, "loss": 1.7592, "step": 8526 }, { "epoch": 0.22457203055043456, "grad_norm": 1.8547112941741943, "learning_rate": 3.8780616276007376e-05, "loss": 1.7506, "step": 8527 }, { "epoch": 0.22459836713194628, "grad_norm": 1.5974327325820923, "learning_rate": 3.8779299446931785e-05, "loss": 1.9607, "step": 8528 }, { "epoch": 0.224624703713458, "grad_norm": 5.663325786590576, "learning_rate": 3.877798261785621e-05, "loss": 1.395, "step": 8529 }, { "epoch": 0.2246510402949697, "grad_norm": 2.4018030166625977, "learning_rate": 3.8776665788780616e-05, "loss": 1.5803, "step": 8530 }, { "epoch": 0.22467737687648143, "grad_norm": 2.81136417388916, "learning_rate": 3.877534895970503e-05, "loss": 1.9475, "step": 8531 }, { "epoch": 0.22470371345799314, "grad_norm": 8.86347484588623, "learning_rate": 3.877403213062945e-05, "loss": 2.1431, "step": 8532 }, { "epoch": 0.22473005003950489, "grad_norm": 4.538589954376221, "learning_rate": 3.8772715301553856e-05, "loss": 1.9601, "step": 8533 }, { "epoch": 0.2247563866210166, "grad_norm": 3.4627835750579834, "learning_rate": 3.877139847247828e-05, "loss": 2.2073, "step": 8534 }, { "epoch": 0.22478272320252832, "grad_norm": 1.9426106214523315, "learning_rate": 3.877008164340269e-05, "loss": 2.5875, "step": 8535 }, { "epoch": 0.22480905978404003, "grad_norm": 3.228882312774658, "learning_rate": 3.87687648143271e-05, "loss": 1.3448, "step": 8536 }, { "epoch": 0.22483539636555175, "grad_norm": 1.9041908979415894, "learning_rate": 3.876744798525151e-05, "loss": 1.8954, "step": 8537 }, { "epoch": 0.22486173294706346, "grad_norm": 2.3903872966766357, "learning_rate": 3.876613115617593e-05, "loss": 2.2403, "step": 8538 }, { "epoch": 0.22488806952857518, "grad_norm": 3.6657254695892334, "learning_rate": 3.876481432710034e-05, "loss": 2.3583, "step": 8539 }, { "epoch": 0.22491440611008692, 
"grad_norm": 2.3356516361236572, "learning_rate": 3.876349749802476e-05, "loss": 2.2189, "step": 8540 }, { "epoch": 0.22494074269159864, "grad_norm": 1.4591888189315796, "learning_rate": 3.8762180668949174e-05, "loss": 0.7034, "step": 8541 }, { "epoch": 0.22496707927311035, "grad_norm": 1.8052279949188232, "learning_rate": 3.876086383987358e-05, "loss": 0.9604, "step": 8542 }, { "epoch": 0.22499341585462207, "grad_norm": 2.158219814300537, "learning_rate": 3.8759547010798006e-05, "loss": 1.6281, "step": 8543 }, { "epoch": 0.22501975243613379, "grad_norm": 2.6899774074554443, "learning_rate": 3.8758230181722414e-05, "loss": 2.2567, "step": 8544 }, { "epoch": 0.2250460890176455, "grad_norm": 2.3808324337005615, "learning_rate": 3.875691335264683e-05, "loss": 1.771, "step": 8545 }, { "epoch": 0.22507242559915722, "grad_norm": 1.6924216747283936, "learning_rate": 3.875559652357124e-05, "loss": 0.634, "step": 8546 }, { "epoch": 0.22509876218066896, "grad_norm": 1.7309690713882446, "learning_rate": 3.8754279694495654e-05, "loss": 2.135, "step": 8547 }, { "epoch": 0.22512509876218068, "grad_norm": 1.836403489112854, "learning_rate": 3.875296286542007e-05, "loss": 1.7224, "step": 8548 }, { "epoch": 0.2251514353436924, "grad_norm": 1.7984205484390259, "learning_rate": 3.8751646036344486e-05, "loss": 1.8589, "step": 8549 }, { "epoch": 0.2251777719252041, "grad_norm": 2.807633876800537, "learning_rate": 3.87503292072689e-05, "loss": 2.1739, "step": 8550 }, { "epoch": 0.22520410850671582, "grad_norm": 2.8705146312713623, "learning_rate": 3.874901237819331e-05, "loss": 2.244, "step": 8551 }, { "epoch": 0.22523044508822754, "grad_norm": 2.9991888999938965, "learning_rate": 3.8747695549117726e-05, "loss": 2.9163, "step": 8552 }, { "epoch": 0.22525678166973928, "grad_norm": 1.6255139112472534, "learning_rate": 3.874637872004214e-05, "loss": 1.8206, "step": 8553 }, { "epoch": 0.225283118251251, "grad_norm": 1.4978634119033813, "learning_rate": 3.874506189096656e-05, "loss": 0.3102, 
"step": 8554 }, { "epoch": 0.2253094548327627, "grad_norm": 4.614594459533691, "learning_rate": 3.8743745061890966e-05, "loss": 1.5802, "step": 8555 }, { "epoch": 0.22533579141427443, "grad_norm": 1.7774407863616943, "learning_rate": 3.874242823281538e-05, "loss": 2.0975, "step": 8556 }, { "epoch": 0.22536212799578614, "grad_norm": 1.6214098930358887, "learning_rate": 3.87411114037398e-05, "loss": 1.718, "step": 8557 }, { "epoch": 0.22538846457729786, "grad_norm": 3.455246686935425, "learning_rate": 3.873979457466421e-05, "loss": 1.6606, "step": 8558 }, { "epoch": 0.22541480115880957, "grad_norm": 3.538031578063965, "learning_rate": 3.873847774558863e-05, "loss": 1.8543, "step": 8559 }, { "epoch": 0.22544113774032132, "grad_norm": 1.7402997016906738, "learning_rate": 3.873716091651304e-05, "loss": 1.7954, "step": 8560 }, { "epoch": 0.22546747432183303, "grad_norm": 2.8732120990753174, "learning_rate": 3.873584408743745e-05, "loss": 2.2023, "step": 8561 }, { "epoch": 0.22549381090334475, "grad_norm": 2.5810511112213135, "learning_rate": 3.873452725836187e-05, "loss": 1.1899, "step": 8562 }, { "epoch": 0.22552014748485646, "grad_norm": 5.389581680297852, "learning_rate": 3.8733210429286284e-05, "loss": 2.722, "step": 8563 }, { "epoch": 0.22554648406636818, "grad_norm": 2.085892677307129, "learning_rate": 3.873189360021069e-05, "loss": 1.722, "step": 8564 }, { "epoch": 0.2255728206478799, "grad_norm": 2.844398021697998, "learning_rate": 3.873057677113511e-05, "loss": 1.2333, "step": 8565 }, { "epoch": 0.2255991572293916, "grad_norm": 2.088977336883545, "learning_rate": 3.8729259942059524e-05, "loss": 1.0623, "step": 8566 }, { "epoch": 0.22562549381090335, "grad_norm": 4.0470757484436035, "learning_rate": 3.872794311298394e-05, "loss": 0.6148, "step": 8567 }, { "epoch": 0.22565183039241507, "grad_norm": 1.453710913658142, "learning_rate": 3.8726626283908355e-05, "loss": 2.4347, "step": 8568 }, { "epoch": 0.22567816697392679, "grad_norm": 1.9698339700698853, 
"learning_rate": 3.8725309454832764e-05, "loss": 2.1407, "step": 8569 }, { "epoch": 0.2257045035554385, "grad_norm": 2.1381795406341553, "learning_rate": 3.872399262575718e-05, "loss": 1.4423, "step": 8570 }, { "epoch": 0.22573084013695022, "grad_norm": 4.505320072174072, "learning_rate": 3.872267579668159e-05, "loss": 1.1319, "step": 8571 }, { "epoch": 0.22575717671846193, "grad_norm": 2.297389268875122, "learning_rate": 3.872135896760601e-05, "loss": 1.8162, "step": 8572 }, { "epoch": 0.22578351329997368, "grad_norm": 4.679892063140869, "learning_rate": 3.872004213853042e-05, "loss": 1.24, "step": 8573 }, { "epoch": 0.2258098498814854, "grad_norm": 2.213726043701172, "learning_rate": 3.8718725309454835e-05, "loss": 2.1396, "step": 8574 }, { "epoch": 0.2258361864629971, "grad_norm": 1.8329153060913086, "learning_rate": 3.8717408480379244e-05, "loss": 2.0536, "step": 8575 }, { "epoch": 0.22586252304450882, "grad_norm": 3.7163002490997314, "learning_rate": 3.8716091651303666e-05, "loss": 1.6194, "step": 8576 }, { "epoch": 0.22588885962602054, "grad_norm": 2.213752269744873, "learning_rate": 3.8714774822228075e-05, "loss": 2.2845, "step": 8577 }, { "epoch": 0.22591519620753225, "grad_norm": 2.993347644805908, "learning_rate": 3.871345799315249e-05, "loss": 1.2136, "step": 8578 }, { "epoch": 0.22594153278904397, "grad_norm": 2.7208147048950195, "learning_rate": 3.8712141164076906e-05, "loss": 1.3, "step": 8579 }, { "epoch": 0.2259678693705557, "grad_norm": 3.664532423019409, "learning_rate": 3.8710824335001315e-05, "loss": 1.4404, "step": 8580 }, { "epoch": 0.22599420595206743, "grad_norm": 3.191732168197632, "learning_rate": 3.870950750592574e-05, "loss": 1.2009, "step": 8581 }, { "epoch": 0.22602054253357914, "grad_norm": 2.4372830390930176, "learning_rate": 3.8708190676850146e-05, "loss": 1.9618, "step": 8582 }, { "epoch": 0.22604687911509086, "grad_norm": 1.9489986896514893, "learning_rate": 3.870687384777456e-05, "loss": 1.8435, "step": 8583 }, { "epoch": 
0.22607321569660258, "grad_norm": 2.5722455978393555, "learning_rate": 3.870555701869897e-05, "loss": 0.7802, "step": 8584 }, { "epoch": 0.2260995522781143, "grad_norm": 2.0546438694000244, "learning_rate": 3.8704240189623386e-05, "loss": 1.7552, "step": 8585 }, { "epoch": 0.22612588885962603, "grad_norm": 1.6043113470077515, "learning_rate": 3.87029233605478e-05, "loss": 1.2752, "step": 8586 }, { "epoch": 0.22615222544113775, "grad_norm": 1.9670912027359009, "learning_rate": 3.870160653147222e-05, "loss": 1.8038, "step": 8587 }, { "epoch": 0.22617856202264947, "grad_norm": 4.275622367858887, "learning_rate": 3.870028970239663e-05, "loss": 0.8754, "step": 8588 }, { "epoch": 0.22620489860416118, "grad_norm": 2.9276010990142822, "learning_rate": 3.869897287332104e-05, "loss": 1.949, "step": 8589 }, { "epoch": 0.2262312351856729, "grad_norm": 1.6066828966140747, "learning_rate": 3.869765604424546e-05, "loss": 2.0154, "step": 8590 }, { "epoch": 0.2262575717671846, "grad_norm": 3.8568646907806396, "learning_rate": 3.869633921516987e-05, "loss": 1.5891, "step": 8591 }, { "epoch": 0.22628390834869633, "grad_norm": 2.780968427658081, "learning_rate": 3.869502238609429e-05, "loss": 1.7727, "step": 8592 }, { "epoch": 0.22631024493020807, "grad_norm": 3.259148359298706, "learning_rate": 3.86937055570187e-05, "loss": 2.8154, "step": 8593 }, { "epoch": 0.2263365815117198, "grad_norm": 2.9252030849456787, "learning_rate": 3.869238872794311e-05, "loss": 1.9262, "step": 8594 }, { "epoch": 0.2263629180932315, "grad_norm": 3.966649293899536, "learning_rate": 3.869107189886753e-05, "loss": 1.4662, "step": 8595 }, { "epoch": 0.22638925467474322, "grad_norm": 2.670992612838745, "learning_rate": 3.8689755069791944e-05, "loss": 0.5004, "step": 8596 }, { "epoch": 0.22641559125625493, "grad_norm": 2.465890884399414, "learning_rate": 3.868843824071636e-05, "loss": 1.4256, "step": 8597 }, { "epoch": 0.22644192783776665, "grad_norm": 3.0254628658294678, "learning_rate": 3.868712141164077e-05, 
"loss": 1.8366, "step": 8598 }, { "epoch": 0.22646826441927836, "grad_norm": 4.357585430145264, "learning_rate": 3.8685804582565184e-05, "loss": 1.8118, "step": 8599 }, { "epoch": 0.2264946010007901, "grad_norm": 2.545929193496704, "learning_rate": 3.86844877534896e-05, "loss": 1.2503, "step": 8600 }, { "epoch": 0.22652093758230182, "grad_norm": 1.5786738395690918, "learning_rate": 3.8683170924414015e-05, "loss": 2.1923, "step": 8601 }, { "epoch": 0.22654727416381354, "grad_norm": 1.7433059215545654, "learning_rate": 3.8681854095338424e-05, "loss": 2.0981, "step": 8602 }, { "epoch": 0.22657361074532525, "grad_norm": 2.3527612686157227, "learning_rate": 3.868053726626284e-05, "loss": 1.7351, "step": 8603 }, { "epoch": 0.22659994732683697, "grad_norm": 6.146824359893799, "learning_rate": 3.8679220437187255e-05, "loss": 1.8199, "step": 8604 }, { "epoch": 0.22662628390834869, "grad_norm": 1.9697847366333008, "learning_rate": 3.867790360811167e-05, "loss": 1.2718, "step": 8605 }, { "epoch": 0.22665262048986043, "grad_norm": 2.5407605171203613, "learning_rate": 3.8676586779036087e-05, "loss": 2.0836, "step": 8606 }, { "epoch": 0.22667895707137214, "grad_norm": 2.063901662826538, "learning_rate": 3.8675269949960495e-05, "loss": 1.6355, "step": 8607 }, { "epoch": 0.22670529365288386, "grad_norm": 2.405883312225342, "learning_rate": 3.867395312088491e-05, "loss": 0.6567, "step": 8608 }, { "epoch": 0.22673163023439558, "grad_norm": 2.1857759952545166, "learning_rate": 3.8672636291809327e-05, "loss": 1.3335, "step": 8609 }, { "epoch": 0.2267579668159073, "grad_norm": 3.159747362136841, "learning_rate": 3.867131946273374e-05, "loss": 1.0167, "step": 8610 }, { "epoch": 0.226784303397419, "grad_norm": 1.9634112119674683, "learning_rate": 3.867000263365815e-05, "loss": 2.5962, "step": 8611 }, { "epoch": 0.22681063997893072, "grad_norm": 1.862107753753662, "learning_rate": 3.8668685804582567e-05, "loss": 1.3851, "step": 8612 }, { "epoch": 0.22683697656044247, "grad_norm": 
2.9085402488708496, "learning_rate": 3.866736897550698e-05, "loss": 0.8244, "step": 8613 }, { "epoch": 0.22686331314195418, "grad_norm": 1.575588583946228, "learning_rate": 3.86660521464314e-05, "loss": 1.224, "step": 8614 }, { "epoch": 0.2268896497234659, "grad_norm": 2.716517686843872, "learning_rate": 3.866473531735581e-05, "loss": 2.0725, "step": 8615 }, { "epoch": 0.2269159863049776, "grad_norm": 1.6224349737167358, "learning_rate": 3.866341848828022e-05, "loss": 1.5208, "step": 8616 }, { "epoch": 0.22694232288648933, "grad_norm": 2.7150113582611084, "learning_rate": 3.866210165920464e-05, "loss": 2.0537, "step": 8617 }, { "epoch": 0.22696865946800104, "grad_norm": 3.4237709045410156, "learning_rate": 3.866078483012905e-05, "loss": 1.7736, "step": 8618 }, { "epoch": 0.22699499604951276, "grad_norm": 1.5503649711608887, "learning_rate": 3.865946800105347e-05, "loss": 1.6825, "step": 8619 }, { "epoch": 0.2270213326310245, "grad_norm": 3.7680604457855225, "learning_rate": 3.865815117197788e-05, "loss": 1.5244, "step": 8620 }, { "epoch": 0.22704766921253622, "grad_norm": 2.461878538131714, "learning_rate": 3.8656834342902293e-05, "loss": 1.8361, "step": 8621 }, { "epoch": 0.22707400579404793, "grad_norm": 2.5986108779907227, "learning_rate": 3.86555175138267e-05, "loss": 1.7973, "step": 8622 }, { "epoch": 0.22710034237555965, "grad_norm": 1.5644572973251343, "learning_rate": 3.865420068475112e-05, "loss": 0.3525, "step": 8623 }, { "epoch": 0.22712667895707137, "grad_norm": 4.417360305786133, "learning_rate": 3.865288385567554e-05, "loss": 1.1063, "step": 8624 }, { "epoch": 0.22715301553858308, "grad_norm": 1.637245535850525, "learning_rate": 3.865156702659995e-05, "loss": 1.7159, "step": 8625 }, { "epoch": 0.22717935212009482, "grad_norm": 3.72438907623291, "learning_rate": 3.8650250197524365e-05, "loss": 1.7055, "step": 8626 }, { "epoch": 0.22720568870160654, "grad_norm": 2.523892641067505, "learning_rate": 3.8648933368448773e-05, "loss": 1.6392, "step": 8627 }, 
{ "epoch": 0.22723202528311826, "grad_norm": 4.000173091888428, "learning_rate": 3.8647616539373196e-05, "loss": 1.2853, "step": 8628 }, { "epoch": 0.22725836186462997, "grad_norm": 2.9876821041107178, "learning_rate": 3.8646299710297605e-05, "loss": 2.1374, "step": 8629 }, { "epoch": 0.2272846984461417, "grad_norm": 2.0255048274993896, "learning_rate": 3.864498288122202e-05, "loss": 1.5352, "step": 8630 }, { "epoch": 0.2273110350276534, "grad_norm": 2.19376802444458, "learning_rate": 3.864366605214643e-05, "loss": 1.8505, "step": 8631 }, { "epoch": 0.22733737160916512, "grad_norm": 3.0949249267578125, "learning_rate": 3.8642349223070845e-05, "loss": 1.5474, "step": 8632 }, { "epoch": 0.22736370819067686, "grad_norm": 2.001195192337036, "learning_rate": 3.864103239399526e-05, "loss": 1.9927, "step": 8633 }, { "epoch": 0.22739004477218858, "grad_norm": 2.0876452922821045, "learning_rate": 3.8639715564919676e-05, "loss": 1.933, "step": 8634 }, { "epoch": 0.2274163813537003, "grad_norm": 1.8517053127288818, "learning_rate": 3.863839873584409e-05, "loss": 2.1866, "step": 8635 }, { "epoch": 0.227442717935212, "grad_norm": 3.2314717769622803, "learning_rate": 3.86370819067685e-05, "loss": 1.1993, "step": 8636 }, { "epoch": 0.22746905451672372, "grad_norm": 1.7439364194869995, "learning_rate": 3.8635765077692916e-05, "loss": 2.1373, "step": 8637 }, { "epoch": 0.22749539109823544, "grad_norm": 1.895079255104065, "learning_rate": 3.863444824861733e-05, "loss": 1.349, "step": 8638 }, { "epoch": 0.22752172767974718, "grad_norm": 3.603242874145508, "learning_rate": 3.863313141954175e-05, "loss": 1.0467, "step": 8639 }, { "epoch": 0.2275480642612589, "grad_norm": 1.9533138275146484, "learning_rate": 3.8631814590466156e-05, "loss": 1.6837, "step": 8640 }, { "epoch": 0.2275744008427706, "grad_norm": 1.6531928777694702, "learning_rate": 3.863049776139057e-05, "loss": 1.247, "step": 8641 }, { "epoch": 0.22760073742428233, "grad_norm": 2.8849799633026123, "learning_rate": 
3.862918093231499e-05, "loss": 0.5407, "step": 8642 }, { "epoch": 0.22762707400579404, "grad_norm": 1.795291543006897, "learning_rate": 3.86278641032394e-05, "loss": 1.8623, "step": 8643 }, { "epoch": 0.22765341058730576, "grad_norm": 1.7819745540618896, "learning_rate": 3.862654727416382e-05, "loss": 1.6354, "step": 8644 }, { "epoch": 0.22767974716881748, "grad_norm": 2.4156219959259033, "learning_rate": 3.862523044508823e-05, "loss": 1.0683, "step": 8645 }, { "epoch": 0.22770608375032922, "grad_norm": 2.533712148666382, "learning_rate": 3.862391361601264e-05, "loss": 1.6396, "step": 8646 }, { "epoch": 0.22773242033184093, "grad_norm": 2.627272367477417, "learning_rate": 3.862259678693706e-05, "loss": 2.0669, "step": 8647 }, { "epoch": 0.22775875691335265, "grad_norm": 1.8279216289520264, "learning_rate": 3.8621279957861474e-05, "loss": 1.713, "step": 8648 }, { "epoch": 0.22778509349486437, "grad_norm": 3.0978190898895264, "learning_rate": 3.861996312878588e-05, "loss": 0.9317, "step": 8649 }, { "epoch": 0.22781143007637608, "grad_norm": 2.731104612350464, "learning_rate": 3.86186462997103e-05, "loss": 0.7556, "step": 8650 }, { "epoch": 0.2278377666578878, "grad_norm": 2.8023698329925537, "learning_rate": 3.8617329470634714e-05, "loss": 1.6117, "step": 8651 }, { "epoch": 0.2278641032393995, "grad_norm": 4.751079082489014, "learning_rate": 3.861601264155913e-05, "loss": 0.7975, "step": 8652 }, { "epoch": 0.22789043982091126, "grad_norm": 4.326756000518799, "learning_rate": 3.8614695812483545e-05, "loss": 0.9873, "step": 8653 }, { "epoch": 0.22791677640242297, "grad_norm": 2.7647929191589355, "learning_rate": 3.8613378983407954e-05, "loss": 1.5003, "step": 8654 }, { "epoch": 0.2279431129839347, "grad_norm": 2.266779899597168, "learning_rate": 3.861206215433237e-05, "loss": 1.2917, "step": 8655 }, { "epoch": 0.2279694495654464, "grad_norm": 1.672104001045227, "learning_rate": 3.861074532525678e-05, "loss": 2.2904, "step": 8656 }, { "epoch": 0.22799578614695812, 
"grad_norm": 9.445061683654785, "learning_rate": 3.86094284961812e-05, "loss": 1.6486, "step": 8657 }, { "epoch": 0.22802212272846983, "grad_norm": 2.426698923110962, "learning_rate": 3.860811166710561e-05, "loss": 1.8155, "step": 8658 }, { "epoch": 0.22804845930998158, "grad_norm": 2.0746910572052, "learning_rate": 3.8606794838030025e-05, "loss": 1.2942, "step": 8659 }, { "epoch": 0.2280747958914933, "grad_norm": 3.771878242492676, "learning_rate": 3.860547800895444e-05, "loss": 1.7471, "step": 8660 }, { "epoch": 0.228101132473005, "grad_norm": 2.3991341590881348, "learning_rate": 3.8604161179878856e-05, "loss": 2.0276, "step": 8661 }, { "epoch": 0.22812746905451672, "grad_norm": 3.5623087882995605, "learning_rate": 3.860284435080327e-05, "loss": 1.838, "step": 8662 }, { "epoch": 0.22815380563602844, "grad_norm": 2.001025676727295, "learning_rate": 3.860152752172768e-05, "loss": 2.0591, "step": 8663 }, { "epoch": 0.22818014221754016, "grad_norm": 1.6273208856582642, "learning_rate": 3.8600210692652096e-05, "loss": 1.4582, "step": 8664 }, { "epoch": 0.22820647879905187, "grad_norm": 2.0918726921081543, "learning_rate": 3.8598893863576505e-05, "loss": 1.5005, "step": 8665 }, { "epoch": 0.22823281538056361, "grad_norm": 4.101078033447266, "learning_rate": 3.859757703450093e-05, "loss": 1.5328, "step": 8666 }, { "epoch": 0.22825915196207533, "grad_norm": 2.197524070739746, "learning_rate": 3.8596260205425336e-05, "loss": 1.294, "step": 8667 }, { "epoch": 0.22828548854358705, "grad_norm": 1.9406253099441528, "learning_rate": 3.859494337634975e-05, "loss": 2.0219, "step": 8668 }, { "epoch": 0.22831182512509876, "grad_norm": 3.7249929904937744, "learning_rate": 3.859362654727417e-05, "loss": 2.7619, "step": 8669 }, { "epoch": 0.22833816170661048, "grad_norm": 2.5667500495910645, "learning_rate": 3.8592309718198576e-05, "loss": 1.7927, "step": 8670 }, { "epoch": 0.2283644982881222, "grad_norm": 1.8960399627685547, "learning_rate": 3.8590992889123e-05, "loss": 2.1342, 
"step": 8671 }, { "epoch": 0.22839083486963394, "grad_norm": 2.6433939933776855, "learning_rate": 3.858967606004741e-05, "loss": 1.6063, "step": 8672 }, { "epoch": 0.22841717145114565, "grad_norm": 3.4651176929473877, "learning_rate": 3.858835923097182e-05, "loss": 1.0563, "step": 8673 }, { "epoch": 0.22844350803265737, "grad_norm": 2.608092784881592, "learning_rate": 3.858704240189623e-05, "loss": 2.0434, "step": 8674 }, { "epoch": 0.22846984461416908, "grad_norm": 2.3010904788970947, "learning_rate": 3.8585725572820654e-05, "loss": 1.5795, "step": 8675 }, { "epoch": 0.2284961811956808, "grad_norm": 1.8976138830184937, "learning_rate": 3.858440874374506e-05, "loss": 1.5403, "step": 8676 }, { "epoch": 0.2285225177771925, "grad_norm": 2.553696632385254, "learning_rate": 3.858309191466948e-05, "loss": 1.9057, "step": 8677 }, { "epoch": 0.22854885435870423, "grad_norm": 2.6526734828948975, "learning_rate": 3.858177508559389e-05, "loss": 1.5814, "step": 8678 }, { "epoch": 0.22857519094021597, "grad_norm": 2.6710147857666016, "learning_rate": 3.85804582565183e-05, "loss": 1.928, "step": 8679 }, { "epoch": 0.2286015275217277, "grad_norm": 1.5780129432678223, "learning_rate": 3.857914142744272e-05, "loss": 1.3791, "step": 8680 }, { "epoch": 0.2286278641032394, "grad_norm": 1.8756920099258423, "learning_rate": 3.8577824598367134e-05, "loss": 1.6023, "step": 8681 }, { "epoch": 0.22865420068475112, "grad_norm": 2.1717419624328613, "learning_rate": 3.857650776929155e-05, "loss": 2.0012, "step": 8682 }, { "epoch": 0.22868053726626283, "grad_norm": 2.16422176361084, "learning_rate": 3.857519094021596e-05, "loss": 2.1048, "step": 8683 }, { "epoch": 0.22870687384777455, "grad_norm": 1.9747705459594727, "learning_rate": 3.8573874111140374e-05, "loss": 1.5198, "step": 8684 }, { "epoch": 0.22873321042928627, "grad_norm": 1.9603595733642578, "learning_rate": 3.857255728206479e-05, "loss": 1.9746, "step": 8685 }, { "epoch": 0.228759547010798, "grad_norm": 1.5197569131851196, 
"learning_rate": 3.8571240452989206e-05, "loss": 2.3079, "step": 8686 }, { "epoch": 0.22878588359230972, "grad_norm": 7.051483631134033, "learning_rate": 3.8569923623913614e-05, "loss": 1.4505, "step": 8687 }, { "epoch": 0.22881222017382144, "grad_norm": 1.56491219997406, "learning_rate": 3.856860679483803e-05, "loss": 1.7867, "step": 8688 }, { "epoch": 0.22883855675533316, "grad_norm": 2.5304079055786133, "learning_rate": 3.8567289965762446e-05, "loss": 0.6964, "step": 8689 }, { "epoch": 0.22886489333684487, "grad_norm": 2.102848529815674, "learning_rate": 3.856597313668686e-05, "loss": 1.6336, "step": 8690 }, { "epoch": 0.2288912299183566, "grad_norm": 3.069793224334717, "learning_rate": 3.856465630761128e-05, "loss": 1.8046, "step": 8691 }, { "epoch": 0.22891756649986833, "grad_norm": 3.1155624389648438, "learning_rate": 3.8563339478535686e-05, "loss": 1.6283, "step": 8692 }, { "epoch": 0.22894390308138005, "grad_norm": 1.3181477785110474, "learning_rate": 3.85620226494601e-05, "loss": 0.8697, "step": 8693 }, { "epoch": 0.22897023966289176, "grad_norm": 2.799595355987549, "learning_rate": 3.856070582038452e-05, "loss": 1.8261, "step": 8694 }, { "epoch": 0.22899657624440348, "grad_norm": 2.9487504959106445, "learning_rate": 3.855938899130893e-05, "loss": 2.2233, "step": 8695 }, { "epoch": 0.2290229128259152, "grad_norm": 1.8640750646591187, "learning_rate": 3.855807216223334e-05, "loss": 1.7792, "step": 8696 }, { "epoch": 0.2290492494074269, "grad_norm": 1.5914180278778076, "learning_rate": 3.855675533315776e-05, "loss": 2.6861, "step": 8697 }, { "epoch": 0.22907558598893862, "grad_norm": 2.3961830139160156, "learning_rate": 3.855543850408217e-05, "loss": 2.3185, "step": 8698 }, { "epoch": 0.22910192257045037, "grad_norm": 2.0692360401153564, "learning_rate": 3.855412167500659e-05, "loss": 1.9847, "step": 8699 }, { "epoch": 0.22912825915196208, "grad_norm": 3.728013277053833, "learning_rate": 3.8552804845931004e-05, "loss": 2.3875, "step": 8700 }, { "epoch": 
0.2291545957334738, "grad_norm": 3.812608003616333, "learning_rate": 3.855148801685541e-05, "loss": 0.7028, "step": 8701 }, { "epoch": 0.2291809323149855, "grad_norm": 1.8863166570663452, "learning_rate": 3.855017118777983e-05, "loss": 2.0994, "step": 8702 }, { "epoch": 0.22920726889649723, "grad_norm": 1.8751435279846191, "learning_rate": 3.854885435870424e-05, "loss": 1.5438, "step": 8703 }, { "epoch": 0.22923360547800894, "grad_norm": 2.2387068271636963, "learning_rate": 3.854753752962866e-05, "loss": 1.6201, "step": 8704 }, { "epoch": 0.22925994205952066, "grad_norm": 2.119868755340576, "learning_rate": 3.854622070055307e-05, "loss": 1.9271, "step": 8705 }, { "epoch": 0.2292862786410324, "grad_norm": 2.0415561199188232, "learning_rate": 3.8544903871477484e-05, "loss": 1.8325, "step": 8706 }, { "epoch": 0.22931261522254412, "grad_norm": 2.3197097778320312, "learning_rate": 3.85435870424019e-05, "loss": 2.1155, "step": 8707 }, { "epoch": 0.22933895180405584, "grad_norm": 1.7115092277526855, "learning_rate": 3.8542270213326315e-05, "loss": 0.441, "step": 8708 }, { "epoch": 0.22936528838556755, "grad_norm": 1.941003441810608, "learning_rate": 3.854095338425073e-05, "loss": 1.8441, "step": 8709 }, { "epoch": 0.22939162496707927, "grad_norm": 2.673499345779419, "learning_rate": 3.853963655517514e-05, "loss": 0.7984, "step": 8710 }, { "epoch": 0.22941796154859098, "grad_norm": 1.9680812358856201, "learning_rate": 3.8538319726099555e-05, "loss": 1.8498, "step": 8711 }, { "epoch": 0.22944429813010273, "grad_norm": 2.821601152420044, "learning_rate": 3.8537002897023964e-05, "loss": 1.9629, "step": 8712 }, { "epoch": 0.22947063471161444, "grad_norm": 3.7915849685668945, "learning_rate": 3.8535686067948386e-05, "loss": 1.6068, "step": 8713 }, { "epoch": 0.22949697129312616, "grad_norm": 2.261084794998169, "learning_rate": 3.8534369238872795e-05, "loss": 1.568, "step": 8714 }, { "epoch": 0.22952330787463787, "grad_norm": 1.4744764566421509, "learning_rate": 
3.853305240979721e-05, "loss": 1.5126, "step": 8715 }, { "epoch": 0.2295496444561496, "grad_norm": 2.1644668579101562, "learning_rate": 3.8531735580721626e-05, "loss": 1.5953, "step": 8716 }, { "epoch": 0.2295759810376613, "grad_norm": 2.81292462348938, "learning_rate": 3.8530418751646035e-05, "loss": 1.2637, "step": 8717 }, { "epoch": 0.22960231761917302, "grad_norm": 2.1572868824005127, "learning_rate": 3.852910192257046e-05, "loss": 1.9149, "step": 8718 }, { "epoch": 0.22962865420068476, "grad_norm": 2.567028522491455, "learning_rate": 3.8527785093494866e-05, "loss": 1.6274, "step": 8719 }, { "epoch": 0.22965499078219648, "grad_norm": 1.6461268663406372, "learning_rate": 3.852646826441928e-05, "loss": 1.7598, "step": 8720 }, { "epoch": 0.2296813273637082, "grad_norm": 1.711926817893982, "learning_rate": 3.852515143534369e-05, "loss": 1.0451, "step": 8721 }, { "epoch": 0.2297076639452199, "grad_norm": 2.110971689224243, "learning_rate": 3.8523834606268106e-05, "loss": 1.6453, "step": 8722 }, { "epoch": 0.22973400052673162, "grad_norm": 2.7735655307769775, "learning_rate": 3.852251777719252e-05, "loss": 1.7858, "step": 8723 }, { "epoch": 0.22976033710824334, "grad_norm": 1.9985178709030151, "learning_rate": 3.852120094811694e-05, "loss": 1.7008, "step": 8724 }, { "epoch": 0.22978667368975508, "grad_norm": 1.734743595123291, "learning_rate": 3.8519884119041346e-05, "loss": 0.6781, "step": 8725 }, { "epoch": 0.2298130102712668, "grad_norm": 2.0480659008026123, "learning_rate": 3.851856728996576e-05, "loss": 1.1985, "step": 8726 }, { "epoch": 0.22983934685277851, "grad_norm": 2.515310049057007, "learning_rate": 3.851725046089018e-05, "loss": 2.2086, "step": 8727 }, { "epoch": 0.22986568343429023, "grad_norm": 1.9793624877929688, "learning_rate": 3.851593363181459e-05, "loss": 1.4715, "step": 8728 }, { "epoch": 0.22989202001580195, "grad_norm": 1.7837162017822266, "learning_rate": 3.851461680273901e-05, "loss": 2.1454, "step": 8729 }, { "epoch": 0.22991835659731366, 
"grad_norm": 1.9805967807769775, "learning_rate": 3.851329997366342e-05, "loss": 1.6898, "step": 8730 }, { "epoch": 0.22994469317882538, "grad_norm": 2.5281834602355957, "learning_rate": 3.851198314458783e-05, "loss": 1.4195, "step": 8731 }, { "epoch": 0.22997102976033712, "grad_norm": 4.5295820236206055, "learning_rate": 3.851066631551225e-05, "loss": 1.5477, "step": 8732 }, { "epoch": 0.22999736634184884, "grad_norm": 5.615579605102539, "learning_rate": 3.8509349486436664e-05, "loss": 1.9838, "step": 8733 }, { "epoch": 0.23002370292336055, "grad_norm": 3.848259687423706, "learning_rate": 3.850803265736107e-05, "loss": 1.6755, "step": 8734 }, { "epoch": 0.23005003950487227, "grad_norm": 1.982571005821228, "learning_rate": 3.850671582828549e-05, "loss": 2.6243, "step": 8735 }, { "epoch": 0.23007637608638398, "grad_norm": 1.8228106498718262, "learning_rate": 3.8505398999209904e-05, "loss": 1.8012, "step": 8736 }, { "epoch": 0.2301027126678957, "grad_norm": 2.055089235305786, "learning_rate": 3.850408217013432e-05, "loss": 2.1505, "step": 8737 }, { "epoch": 0.2301290492494074, "grad_norm": 2.0300891399383545, "learning_rate": 3.8502765341058735e-05, "loss": 0.6957, "step": 8738 }, { "epoch": 0.23015538583091916, "grad_norm": 4.362094402313232, "learning_rate": 3.8501448511983144e-05, "loss": 1.9208, "step": 8739 }, { "epoch": 0.23018172241243087, "grad_norm": 3.210151195526123, "learning_rate": 3.850013168290756e-05, "loss": 1.6491, "step": 8740 }, { "epoch": 0.2302080589939426, "grad_norm": 1.9036731719970703, "learning_rate": 3.8498814853831975e-05, "loss": 1.7272, "step": 8741 }, { "epoch": 0.2302343955754543, "grad_norm": 4.6128082275390625, "learning_rate": 3.849749802475639e-05, "loss": 1.7663, "step": 8742 }, { "epoch": 0.23026073215696602, "grad_norm": 3.624314308166504, "learning_rate": 3.84961811956808e-05, "loss": 1.9732, "step": 8743 }, { "epoch": 0.23028706873847773, "grad_norm": 1.6349352598190308, "learning_rate": 3.8494864366605215e-05, "loss": 
1.2109, "step": 8744 }, { "epoch": 0.23031340531998948, "grad_norm": 3.401796817779541, "learning_rate": 3.849354753752963e-05, "loss": 0.9124, "step": 8745 }, { "epoch": 0.2303397419015012, "grad_norm": 1.9930672645568848, "learning_rate": 3.849223070845405e-05, "loss": 1.1927, "step": 8746 }, { "epoch": 0.2303660784830129, "grad_norm": 4.248035430908203, "learning_rate": 3.849091387937846e-05, "loss": 0.7357, "step": 8747 }, { "epoch": 0.23039241506452462, "grad_norm": 2.0215535163879395, "learning_rate": 3.848959705030287e-05, "loss": 1.3645, "step": 8748 }, { "epoch": 0.23041875164603634, "grad_norm": 2.5462646484375, "learning_rate": 3.848828022122729e-05, "loss": 1.1863, "step": 8749 }, { "epoch": 0.23044508822754806, "grad_norm": 2.440798282623291, "learning_rate": 3.8486963392151695e-05, "loss": 1.8985, "step": 8750 }, { "epoch": 0.23047142480905977, "grad_norm": 2.9896252155303955, "learning_rate": 3.848564656307612e-05, "loss": 1.5054, "step": 8751 }, { "epoch": 0.23049776139057152, "grad_norm": 1.8389194011688232, "learning_rate": 3.848432973400053e-05, "loss": 1.977, "step": 8752 }, { "epoch": 0.23052409797208323, "grad_norm": 1.918176531791687, "learning_rate": 3.848301290492494e-05, "loss": 1.731, "step": 8753 }, { "epoch": 0.23055043455359495, "grad_norm": 2.1689364910125732, "learning_rate": 3.848169607584936e-05, "loss": 2.4061, "step": 8754 }, { "epoch": 0.23057677113510666, "grad_norm": 2.393303632736206, "learning_rate": 3.8480379246773773e-05, "loss": 1.6514, "step": 8755 }, { "epoch": 0.23060310771661838, "grad_norm": 2.7109014987945557, "learning_rate": 3.847906241769819e-05, "loss": 1.9559, "step": 8756 }, { "epoch": 0.2306294442981301, "grad_norm": 3.7777557373046875, "learning_rate": 3.84777455886226e-05, "loss": 1.4828, "step": 8757 }, { "epoch": 0.23065578087964184, "grad_norm": 1.9608242511749268, "learning_rate": 3.8476428759547013e-05, "loss": 1.4569, "step": 8758 }, { "epoch": 0.23068211746115355, "grad_norm": 1.7105075120925903, 
"learning_rate": 3.847511193047142e-05, "loss": 2.0292, "step": 8759 }, { "epoch": 0.23070845404266527, "grad_norm": 1.7458996772766113, "learning_rate": 3.8473795101395845e-05, "loss": 1.7096, "step": 8760 }, { "epoch": 0.23073479062417698, "grad_norm": 1.5461111068725586, "learning_rate": 3.8472478272320253e-05, "loss": 1.824, "step": 8761 }, { "epoch": 0.2307611272056887, "grad_norm": 1.8432605266571045, "learning_rate": 3.847116144324467e-05, "loss": 2.0229, "step": 8762 }, { "epoch": 0.23078746378720041, "grad_norm": 1.6574029922485352, "learning_rate": 3.8469844614169085e-05, "loss": 1.8793, "step": 8763 }, { "epoch": 0.23081380036871213, "grad_norm": 2.9303579330444336, "learning_rate": 3.8468527785093493e-05, "loss": 0.7993, "step": 8764 }, { "epoch": 0.23084013695022387, "grad_norm": 2.927445650100708, "learning_rate": 3.8467210956017916e-05, "loss": 2.1572, "step": 8765 }, { "epoch": 0.2308664735317356, "grad_norm": 2.6877167224884033, "learning_rate": 3.8465894126942325e-05, "loss": 0.5014, "step": 8766 }, { "epoch": 0.2308928101132473, "grad_norm": 1.4402517080307007, "learning_rate": 3.846457729786674e-05, "loss": 1.0874, "step": 8767 }, { "epoch": 0.23091914669475902, "grad_norm": 1.9190280437469482, "learning_rate": 3.846326046879115e-05, "loss": 1.78, "step": 8768 }, { "epoch": 0.23094548327627074, "grad_norm": 6.018390655517578, "learning_rate": 3.8461943639715565e-05, "loss": 1.4677, "step": 8769 }, { "epoch": 0.23097181985778245, "grad_norm": 1.4859380722045898, "learning_rate": 3.846062681063998e-05, "loss": 2.048, "step": 8770 }, { "epoch": 0.23099815643929417, "grad_norm": 1.430658221244812, "learning_rate": 3.8459309981564396e-05, "loss": 1.9816, "step": 8771 }, { "epoch": 0.2310244930208059, "grad_norm": 7.866768836975098, "learning_rate": 3.845799315248881e-05, "loss": 2.1579, "step": 8772 }, { "epoch": 0.23105082960231763, "grad_norm": 2.8055260181427, "learning_rate": 3.845667632341322e-05, "loss": 1.7819, "step": 8773 }, { "epoch": 
0.23107716618382934, "grad_norm": 3.5743625164031982, "learning_rate": 3.845535949433764e-05, "loss": 1.326, "step": 8774 }, { "epoch": 0.23110350276534106, "grad_norm": 2.8899219036102295, "learning_rate": 3.845404266526205e-05, "loss": 0.8887, "step": 8775 }, { "epoch": 0.23112983934685277, "grad_norm": 2.2441341876983643, "learning_rate": 3.845272583618647e-05, "loss": 1.3165, "step": 8776 }, { "epoch": 0.2311561759283645, "grad_norm": 2.4311869144439697, "learning_rate": 3.8451409007110876e-05, "loss": 0.7973, "step": 8777 }, { "epoch": 0.23118251250987623, "grad_norm": 2.458587884902954, "learning_rate": 3.845009217803529e-05, "loss": 1.4253, "step": 8778 }, { "epoch": 0.23120884909138795, "grad_norm": 2.057854413986206, "learning_rate": 3.844877534895971e-05, "loss": 2.2464, "step": 8779 }, { "epoch": 0.23123518567289966, "grad_norm": 3.1313319206237793, "learning_rate": 3.844745851988412e-05, "loss": 1.7106, "step": 8780 }, { "epoch": 0.23126152225441138, "grad_norm": 2.7365825176239014, "learning_rate": 3.844614169080853e-05, "loss": 2.2137, "step": 8781 }, { "epoch": 0.2312878588359231, "grad_norm": 3.1917386054992676, "learning_rate": 3.844482486173295e-05, "loss": 2.1663, "step": 8782 }, { "epoch": 0.2313141954174348, "grad_norm": 4.121448993682861, "learning_rate": 3.844350803265736e-05, "loss": 0.5522, "step": 8783 }, { "epoch": 0.23134053199894652, "grad_norm": 4.569949626922607, "learning_rate": 3.844219120358178e-05, "loss": 1.788, "step": 8784 }, { "epoch": 0.23136686858045827, "grad_norm": 2.647071123123169, "learning_rate": 3.8440874374506194e-05, "loss": 2.181, "step": 8785 }, { "epoch": 0.23139320516196998, "grad_norm": 2.7681360244750977, "learning_rate": 3.84395575454306e-05, "loss": 1.3345, "step": 8786 }, { "epoch": 0.2314195417434817, "grad_norm": 3.2671942710876465, "learning_rate": 3.843824071635502e-05, "loss": 2.0853, "step": 8787 }, { "epoch": 0.23144587832499341, "grad_norm": 2.92760968208313, "learning_rate": 3.8436923887279434e-05, 
"loss": 0.8017, "step": 8788 }, { "epoch": 0.23147221490650513, "grad_norm": 2.052534580230713, "learning_rate": 3.843560705820385e-05, "loss": 0.7354, "step": 8789 }, { "epoch": 0.23149855148801685, "grad_norm": 2.1374247074127197, "learning_rate": 3.843429022912826e-05, "loss": 0.7031, "step": 8790 }, { "epoch": 0.23152488806952856, "grad_norm": 1.9190794229507446, "learning_rate": 3.8432973400052674e-05, "loss": 0.8202, "step": 8791 }, { "epoch": 0.2315512246510403, "grad_norm": 3.549086332321167, "learning_rate": 3.843165657097709e-05, "loss": 0.6385, "step": 8792 }, { "epoch": 0.23157756123255202, "grad_norm": 2.4704179763793945, "learning_rate": 3.8430339741901505e-05, "loss": 2.4266, "step": 8793 }, { "epoch": 0.23160389781406374, "grad_norm": 1.7736200094223022, "learning_rate": 3.842902291282592e-05, "loss": 1.9201, "step": 8794 }, { "epoch": 0.23163023439557545, "grad_norm": 4.950268745422363, "learning_rate": 3.842770608375033e-05, "loss": 1.6361, "step": 8795 }, { "epoch": 0.23165657097708717, "grad_norm": 2.5951664447784424, "learning_rate": 3.8426389254674745e-05, "loss": 1.0932, "step": 8796 }, { "epoch": 0.23168290755859888, "grad_norm": 2.1899523735046387, "learning_rate": 3.8425072425599154e-05, "loss": 1.1663, "step": 8797 }, { "epoch": 0.23170924414011063, "grad_norm": 2.146493911743164, "learning_rate": 3.8423755596523576e-05, "loss": 1.5842, "step": 8798 }, { "epoch": 0.23173558072162234, "grad_norm": 1.532976746559143, "learning_rate": 3.8422438767447985e-05, "loss": 0.4931, "step": 8799 }, { "epoch": 0.23176191730313406, "grad_norm": 2.102846145629883, "learning_rate": 3.84211219383724e-05, "loss": 1.738, "step": 8800 }, { "epoch": 0.23178825388464577, "grad_norm": 6.328239917755127, "learning_rate": 3.8419805109296816e-05, "loss": 2.3549, "step": 8801 }, { "epoch": 0.2318145904661575, "grad_norm": 3.091710329055786, "learning_rate": 3.8418488280221225e-05, "loss": 1.3087, "step": 8802 }, { "epoch": 0.2318409270476692, "grad_norm": 
2.3866326808929443, "learning_rate": 3.841717145114565e-05, "loss": 1.8067, "step": 8803 }, { "epoch": 0.23186726362918092, "grad_norm": 4.606212139129639, "learning_rate": 3.8415854622070056e-05, "loss": 2.209, "step": 8804 }, { "epoch": 0.23189360021069266, "grad_norm": 4.543942928314209, "learning_rate": 3.841453779299447e-05, "loss": 1.5549, "step": 8805 }, { "epoch": 0.23191993679220438, "grad_norm": 3.7005951404571533, "learning_rate": 3.841322096391888e-05, "loss": 1.0762, "step": 8806 }, { "epoch": 0.2319462733737161, "grad_norm": 3.2195284366607666, "learning_rate": 3.84119041348433e-05, "loss": 1.2201, "step": 8807 }, { "epoch": 0.2319726099552278, "grad_norm": 2.577655553817749, "learning_rate": 3.841058730576771e-05, "loss": 1.4712, "step": 8808 }, { "epoch": 0.23199894653673953, "grad_norm": 2.9648633003234863, "learning_rate": 3.840927047669213e-05, "loss": 1.5106, "step": 8809 }, { "epoch": 0.23202528311825124, "grad_norm": 1.9411506652832031, "learning_rate": 3.840795364761654e-05, "loss": 1.7886, "step": 8810 }, { "epoch": 0.23205161969976298, "grad_norm": 2.274669647216797, "learning_rate": 3.840663681854095e-05, "loss": 1.5786, "step": 8811 }, { "epoch": 0.2320779562812747, "grad_norm": 3.650491237640381, "learning_rate": 3.8405319989465374e-05, "loss": 2.1217, "step": 8812 }, { "epoch": 0.23210429286278642, "grad_norm": 1.8162002563476562, "learning_rate": 3.840400316038978e-05, "loss": 1.9206, "step": 8813 }, { "epoch": 0.23213062944429813, "grad_norm": 3.3469510078430176, "learning_rate": 3.84026863313142e-05, "loss": 1.7331, "step": 8814 }, { "epoch": 0.23215696602580985, "grad_norm": 7.383838653564453, "learning_rate": 3.840136950223861e-05, "loss": 1.7761, "step": 8815 }, { "epoch": 0.23218330260732156, "grad_norm": 3.184968948364258, "learning_rate": 3.840005267316302e-05, "loss": 1.1129, "step": 8816 }, { "epoch": 0.23220963918883328, "grad_norm": 2.9226291179656982, "learning_rate": 3.839873584408744e-05, "loss": 1.6157, "step": 8817 }, 
{ "epoch": 0.23223597577034502, "grad_norm": 1.9389110803604126, "learning_rate": 3.8397419015011854e-05, "loss": 2.3295, "step": 8818 }, { "epoch": 0.23226231235185674, "grad_norm": 2.777867078781128, "learning_rate": 3.839610218593627e-05, "loss": 2.1738, "step": 8819 }, { "epoch": 0.23228864893336845, "grad_norm": 2.0441317558288574, "learning_rate": 3.839478535686068e-05, "loss": 0.8182, "step": 8820 }, { "epoch": 0.23231498551488017, "grad_norm": 3.5684680938720703, "learning_rate": 3.83934685277851e-05, "loss": 2.0045, "step": 8821 }, { "epoch": 0.23234132209639188, "grad_norm": 2.815356492996216, "learning_rate": 3.839215169870951e-05, "loss": 1.831, "step": 8822 }, { "epoch": 0.2323676586779036, "grad_norm": 3.127657651901245, "learning_rate": 3.8390834869633926e-05, "loss": 0.5086, "step": 8823 }, { "epoch": 0.23239399525941531, "grad_norm": 1.9306000471115112, "learning_rate": 3.8389518040558334e-05, "loss": 1.7544, "step": 8824 }, { "epoch": 0.23242033184092706, "grad_norm": 1.8755202293395996, "learning_rate": 3.838820121148275e-05, "loss": 1.9621, "step": 8825 }, { "epoch": 0.23244666842243877, "grad_norm": 2.0463740825653076, "learning_rate": 3.8386884382407166e-05, "loss": 1.5315, "step": 8826 }, { "epoch": 0.2324730050039505, "grad_norm": 1.8066118955612183, "learning_rate": 3.838556755333158e-05, "loss": 2.4106, "step": 8827 }, { "epoch": 0.2324993415854622, "grad_norm": 2.7764689922332764, "learning_rate": 3.838425072425599e-05, "loss": 0.9899, "step": 8828 }, { "epoch": 0.23252567816697392, "grad_norm": 1.6359795331954956, "learning_rate": 3.8382933895180406e-05, "loss": 1.9047, "step": 8829 }, { "epoch": 0.23255201474848564, "grad_norm": 4.946788787841797, "learning_rate": 3.838161706610482e-05, "loss": 0.9244, "step": 8830 }, { "epoch": 0.23257835132999738, "grad_norm": 2.9737300872802734, "learning_rate": 3.838030023702924e-05, "loss": 1.648, "step": 8831 }, { "epoch": 0.2326046879115091, "grad_norm": 2.2024760246276855, "learning_rate": 
3.837898340795365e-05, "loss": 0.5833, "step": 8832 }, { "epoch": 0.2326310244930208, "grad_norm": 5.991247653961182, "learning_rate": 3.837766657887806e-05, "loss": 1.0519, "step": 8833 }, { "epoch": 0.23265736107453253, "grad_norm": 1.4719481468200684, "learning_rate": 3.837634974980248e-05, "loss": 1.001, "step": 8834 }, { "epoch": 0.23268369765604424, "grad_norm": 1.9669474363327026, "learning_rate": 3.8375032920726886e-05, "loss": 2.4314, "step": 8835 }, { "epoch": 0.23271003423755596, "grad_norm": 2.3021914958953857, "learning_rate": 3.837371609165131e-05, "loss": 1.7301, "step": 8836 }, { "epoch": 0.23273637081906767, "grad_norm": 5.052045822143555, "learning_rate": 3.837239926257572e-05, "loss": 1.4613, "step": 8837 }, { "epoch": 0.23276270740057942, "grad_norm": 2.0064573287963867, "learning_rate": 3.837108243350013e-05, "loss": 1.8329, "step": 8838 }, { "epoch": 0.23278904398209113, "grad_norm": 1.8239521980285645, "learning_rate": 3.836976560442455e-05, "loss": 1.5283, "step": 8839 }, { "epoch": 0.23281538056360285, "grad_norm": 2.086679220199585, "learning_rate": 3.8368448775348964e-05, "loss": 2.1112, "step": 8840 }, { "epoch": 0.23284171714511456, "grad_norm": 1.819541573524475, "learning_rate": 3.836713194627338e-05, "loss": 2.1888, "step": 8841 }, { "epoch": 0.23286805372662628, "grad_norm": 2.143944025039673, "learning_rate": 3.836581511719779e-05, "loss": 2.3739, "step": 8842 }, { "epoch": 0.232894390308138, "grad_norm": 2.3751306533813477, "learning_rate": 3.8364498288122204e-05, "loss": 1.367, "step": 8843 }, { "epoch": 0.2329207268896497, "grad_norm": 1.697487473487854, "learning_rate": 3.836318145904661e-05, "loss": 1.6871, "step": 8844 }, { "epoch": 0.23294706347116145, "grad_norm": 1.8505195379257202, "learning_rate": 3.8361864629971035e-05, "loss": 1.4418, "step": 8845 }, { "epoch": 0.23297340005267317, "grad_norm": 2.0387566089630127, "learning_rate": 3.8360547800895444e-05, "loss": 1.7394, "step": 8846 }, { "epoch": 0.23299973663418488, 
"grad_norm": 2.9100518226623535, "learning_rate": 3.835923097181986e-05, "loss": 1.4584, "step": 8847 }, { "epoch": 0.2330260732156966, "grad_norm": 4.852202415466309, "learning_rate": 3.8357914142744275e-05, "loss": 2.2022, "step": 8848 }, { "epoch": 0.23305240979720832, "grad_norm": 3.094252109527588, "learning_rate": 3.8356597313668684e-05, "loss": 1.3535, "step": 8849 }, { "epoch": 0.23307874637872003, "grad_norm": 1.699992299079895, "learning_rate": 3.8355280484593106e-05, "loss": 1.9854, "step": 8850 }, { "epoch": 0.23310508296023177, "grad_norm": 3.062504768371582, "learning_rate": 3.8353963655517515e-05, "loss": 0.6641, "step": 8851 }, { "epoch": 0.2331314195417435, "grad_norm": 2.3903961181640625, "learning_rate": 3.835264682644193e-05, "loss": 1.7965, "step": 8852 }, { "epoch": 0.2331577561232552, "grad_norm": 2.542442560195923, "learning_rate": 3.835132999736634e-05, "loss": 2.3973, "step": 8853 }, { "epoch": 0.23318409270476692, "grad_norm": 1.528419852256775, "learning_rate": 3.835001316829076e-05, "loss": 1.7073, "step": 8854 }, { "epoch": 0.23321042928627864, "grad_norm": 2.7661101818084717, "learning_rate": 3.834869633921517e-05, "loss": 2.2516, "step": 8855 }, { "epoch": 0.23323676586779035, "grad_norm": 4.172338962554932, "learning_rate": 3.8347379510139586e-05, "loss": 1.1142, "step": 8856 }, { "epoch": 0.23326310244930207, "grad_norm": 3.0811731815338135, "learning_rate": 3.8346062681064e-05, "loss": 1.9807, "step": 8857 }, { "epoch": 0.2332894390308138, "grad_norm": 1.539762020111084, "learning_rate": 3.834474585198841e-05, "loss": 1.7075, "step": 8858 }, { "epoch": 0.23331577561232553, "grad_norm": 2.513007879257202, "learning_rate": 3.834342902291283e-05, "loss": 0.744, "step": 8859 }, { "epoch": 0.23334211219383724, "grad_norm": 1.6982077360153198, "learning_rate": 3.834211219383724e-05, "loss": 1.8962, "step": 8860 }, { "epoch": 0.23336844877534896, "grad_norm": 4.641372203826904, "learning_rate": 3.834079536476166e-05, "loss": 1.4002, 
"step": 8861 }, { "epoch": 0.23339478535686067, "grad_norm": 1.8269281387329102, "learning_rate": 3.8339478535686066e-05, "loss": 1.7396, "step": 8862 }, { "epoch": 0.2334211219383724, "grad_norm": 2.211444139480591, "learning_rate": 3.833816170661048e-05, "loss": 1.7778, "step": 8863 }, { "epoch": 0.23344745851988413, "grad_norm": 1.9364101886749268, "learning_rate": 3.83368448775349e-05, "loss": 0.6063, "step": 8864 }, { "epoch": 0.23347379510139585, "grad_norm": 2.820455312728882, "learning_rate": 3.833552804845931e-05, "loss": 2.2119, "step": 8865 }, { "epoch": 0.23350013168290756, "grad_norm": 1.9101132154464722, "learning_rate": 3.833421121938373e-05, "loss": 0.9698, "step": 8866 }, { "epoch": 0.23352646826441928, "grad_norm": 1.4153937101364136, "learning_rate": 3.833289439030814e-05, "loss": 2.0112, "step": 8867 }, { "epoch": 0.233552804845931, "grad_norm": 1.9030933380126953, "learning_rate": 3.833157756123255e-05, "loss": 1.5937, "step": 8868 }, { "epoch": 0.2335791414274427, "grad_norm": 1.5382078886032104, "learning_rate": 3.833026073215697e-05, "loss": 1.6992, "step": 8869 }, { "epoch": 0.23360547800895443, "grad_norm": 1.5609915256500244, "learning_rate": 3.8328943903081384e-05, "loss": 1.6782, "step": 8870 }, { "epoch": 0.23363181459046617, "grad_norm": 3.965820789337158, "learning_rate": 3.832762707400579e-05, "loss": 1.6336, "step": 8871 }, { "epoch": 0.23365815117197788, "grad_norm": 3.165330648422241, "learning_rate": 3.832631024493021e-05, "loss": 1.5335, "step": 8872 }, { "epoch": 0.2336844877534896, "grad_norm": 2.4327380657196045, "learning_rate": 3.8324993415854624e-05, "loss": 1.2543, "step": 8873 }, { "epoch": 0.23371082433500132, "grad_norm": 2.084284543991089, "learning_rate": 3.832367658677904e-05, "loss": 2.1076, "step": 8874 }, { "epoch": 0.23373716091651303, "grad_norm": 1.7423182725906372, "learning_rate": 3.8322359757703455e-05, "loss": 2.0306, "step": 8875 }, { "epoch": 0.23376349749802475, "grad_norm": 2.029839515686035, 
"learning_rate": 3.8321042928627864e-05, "loss": 0.688, "step": 8876 }, { "epoch": 0.23378983407953646, "grad_norm": 1.9252088069915771, "learning_rate": 3.831972609955228e-05, "loss": 0.8262, "step": 8877 }, { "epoch": 0.2338161706610482, "grad_norm": 3.151832342147827, "learning_rate": 3.8318409270476695e-05, "loss": 1.7748, "step": 8878 }, { "epoch": 0.23384250724255992, "grad_norm": 2.617858409881592, "learning_rate": 3.831709244140111e-05, "loss": 2.2719, "step": 8879 }, { "epoch": 0.23386884382407164, "grad_norm": 2.3172693252563477, "learning_rate": 3.831577561232552e-05, "loss": 1.9707, "step": 8880 }, { "epoch": 0.23389518040558335, "grad_norm": 4.9011921882629395, "learning_rate": 3.8314458783249935e-05, "loss": 1.8664, "step": 8881 }, { "epoch": 0.23392151698709507, "grad_norm": 2.3205227851867676, "learning_rate": 3.8313141954174344e-05, "loss": 0.7283, "step": 8882 }, { "epoch": 0.23394785356860678, "grad_norm": 2.58687424659729, "learning_rate": 3.831182512509877e-05, "loss": 1.4471, "step": 8883 }, { "epoch": 0.23397419015011853, "grad_norm": 4.4518938064575195, "learning_rate": 3.8310508296023175e-05, "loss": 1.2096, "step": 8884 }, { "epoch": 0.23400052673163024, "grad_norm": 4.318954944610596, "learning_rate": 3.830919146694759e-05, "loss": 1.1394, "step": 8885 }, { "epoch": 0.23402686331314196, "grad_norm": 1.8930089473724365, "learning_rate": 3.830787463787201e-05, "loss": 1.5532, "step": 8886 }, { "epoch": 0.23405319989465367, "grad_norm": 1.6042455434799194, "learning_rate": 3.830655780879642e-05, "loss": 2.1812, "step": 8887 }, { "epoch": 0.2340795364761654, "grad_norm": 1.786065697669983, "learning_rate": 3.830524097972084e-05, "loss": 2.0149, "step": 8888 }, { "epoch": 0.2341058730576771, "grad_norm": 3.1769027709960938, "learning_rate": 3.830392415064525e-05, "loss": 2.0626, "step": 8889 }, { "epoch": 0.23413220963918882, "grad_norm": 3.0187506675720215, "learning_rate": 3.830260732156966e-05, "loss": 1.7158, "step": 8890 }, { "epoch": 
0.23415854622070056, "grad_norm": 2.120159387588501, "learning_rate": 3.830129049249407e-05, "loss": 0.3464, "step": 8891 }, { "epoch": 0.23418488280221228, "grad_norm": 4.124617576599121, "learning_rate": 3.8299973663418493e-05, "loss": 0.9884, "step": 8892 }, { "epoch": 0.234211219383724, "grad_norm": 1.733627438545227, "learning_rate": 3.82986568343429e-05, "loss": 0.7134, "step": 8893 }, { "epoch": 0.2342375559652357, "grad_norm": 2.665647268295288, "learning_rate": 3.829734000526732e-05, "loss": 1.8067, "step": 8894 }, { "epoch": 0.23426389254674743, "grad_norm": 1.5547395944595337, "learning_rate": 3.8296023176191734e-05, "loss": 0.5754, "step": 8895 }, { "epoch": 0.23429022912825914, "grad_norm": 1.867587685585022, "learning_rate": 3.829470634711614e-05, "loss": 2.0367, "step": 8896 }, { "epoch": 0.23431656570977089, "grad_norm": 3.2139203548431396, "learning_rate": 3.8293389518040565e-05, "loss": 0.7397, "step": 8897 }, { "epoch": 0.2343429022912826, "grad_norm": 1.5913513898849487, "learning_rate": 3.8292072688964974e-05, "loss": 0.8447, "step": 8898 }, { "epoch": 0.23436923887279432, "grad_norm": 1.602441668510437, "learning_rate": 3.829075585988939e-05, "loss": 1.5207, "step": 8899 }, { "epoch": 0.23439557545430603, "grad_norm": 1.5813241004943848, "learning_rate": 3.82894390308138e-05, "loss": 1.8338, "step": 8900 }, { "epoch": 0.23442191203581775, "grad_norm": 2.0932424068450928, "learning_rate": 3.8288122201738214e-05, "loss": 2.3471, "step": 8901 }, { "epoch": 0.23444824861732946, "grad_norm": 2.052216053009033, "learning_rate": 3.828680537266263e-05, "loss": 2.3069, "step": 8902 }, { "epoch": 0.23447458519884118, "grad_norm": 3.160872220993042, "learning_rate": 3.8285488543587045e-05, "loss": 1.7844, "step": 8903 }, { "epoch": 0.23450092178035292, "grad_norm": 3.706263780593872, "learning_rate": 3.828417171451146e-05, "loss": 1.4202, "step": 8904 }, { "epoch": 0.23452725836186464, "grad_norm": 2.0213115215301514, "learning_rate": 
3.828285488543587e-05, "loss": 2.0583, "step": 8905 }, { "epoch": 0.23455359494337635, "grad_norm": 2.1955199241638184, "learning_rate": 3.828153805636029e-05, "loss": 1.889, "step": 8906 }, { "epoch": 0.23457993152488807, "grad_norm": 1.8465187549591064, "learning_rate": 3.82802212272847e-05, "loss": 1.3723, "step": 8907 }, { "epoch": 0.23460626810639978, "grad_norm": 1.5337226390838623, "learning_rate": 3.8278904398209116e-05, "loss": 1.718, "step": 8908 }, { "epoch": 0.2346326046879115, "grad_norm": 2.3587608337402344, "learning_rate": 3.8277587569133525e-05, "loss": 2.153, "step": 8909 }, { "epoch": 0.23465894126942322, "grad_norm": 3.854475736618042, "learning_rate": 3.827627074005794e-05, "loss": 1.5479, "step": 8910 }, { "epoch": 0.23468527785093496, "grad_norm": 2.820453643798828, "learning_rate": 3.8274953910982356e-05, "loss": 0.7081, "step": 8911 }, { "epoch": 0.23471161443244667, "grad_norm": 1.948861002922058, "learning_rate": 3.827363708190677e-05, "loss": 2.0023, "step": 8912 }, { "epoch": 0.2347379510139584, "grad_norm": 2.404877185821533, "learning_rate": 3.827232025283119e-05, "loss": 2.1299, "step": 8913 }, { "epoch": 0.2347642875954701, "grad_norm": 2.2956411838531494, "learning_rate": 3.8271003423755596e-05, "loss": 1.4998, "step": 8914 }, { "epoch": 0.23479062417698182, "grad_norm": 3.2377097606658936, "learning_rate": 3.826968659468001e-05, "loss": 1.4843, "step": 8915 }, { "epoch": 0.23481696075849354, "grad_norm": 2.416879653930664, "learning_rate": 3.826836976560443e-05, "loss": 1.8884, "step": 8916 }, { "epoch": 0.23484329734000528, "grad_norm": 3.0116682052612305, "learning_rate": 3.826705293652884e-05, "loss": 2.5791, "step": 8917 }, { "epoch": 0.234869633921517, "grad_norm": 4.6304612159729, "learning_rate": 3.826573610745325e-05, "loss": 1.0188, "step": 8918 }, { "epoch": 0.2348959705030287, "grad_norm": 3.1109516620635986, "learning_rate": 3.826441927837767e-05, "loss": 2.2889, "step": 8919 }, { "epoch": 0.23492230708454043, 
"grad_norm": 2.2072911262512207, "learning_rate": 3.826310244930208e-05, "loss": 2.4439, "step": 8920 }, { "epoch": 0.23494864366605214, "grad_norm": 3.9602625370025635, "learning_rate": 3.82617856202265e-05, "loss": 1.5081, "step": 8921 }, { "epoch": 0.23497498024756386, "grad_norm": 2.291158437728882, "learning_rate": 3.8260468791150914e-05, "loss": 1.6707, "step": 8922 }, { "epoch": 0.23500131682907557, "grad_norm": 4.118983745574951, "learning_rate": 3.825915196207532e-05, "loss": 2.3479, "step": 8923 }, { "epoch": 0.23502765341058732, "grad_norm": 1.745460867881775, "learning_rate": 3.825783513299974e-05, "loss": 1.7726, "step": 8924 }, { "epoch": 0.23505398999209903, "grad_norm": 1.618269920349121, "learning_rate": 3.8256518303924154e-05, "loss": 1.6318, "step": 8925 }, { "epoch": 0.23508032657361075, "grad_norm": 4.956629276275635, "learning_rate": 3.825520147484857e-05, "loss": 2.3063, "step": 8926 }, { "epoch": 0.23510666315512246, "grad_norm": 3.23354434967041, "learning_rate": 3.825388464577298e-05, "loss": 1.2218, "step": 8927 }, { "epoch": 0.23513299973663418, "grad_norm": 1.6146546602249146, "learning_rate": 3.8252567816697394e-05, "loss": 1.593, "step": 8928 }, { "epoch": 0.2351593363181459, "grad_norm": 1.7741047143936157, "learning_rate": 3.82512509876218e-05, "loss": 1.7241, "step": 8929 }, { "epoch": 0.2351856728996576, "grad_norm": 2.6417601108551025, "learning_rate": 3.8249934158546225e-05, "loss": 0.5566, "step": 8930 }, { "epoch": 0.23521200948116935, "grad_norm": 1.7497538328170776, "learning_rate": 3.8248617329470634e-05, "loss": 1.2505, "step": 8931 }, { "epoch": 0.23523834606268107, "grad_norm": 6.498108386993408, "learning_rate": 3.824730050039505e-05, "loss": 1.7914, "step": 8932 }, { "epoch": 0.23526468264419279, "grad_norm": 3.6201765537261963, "learning_rate": 3.8245983671319465e-05, "loss": 1.905, "step": 8933 }, { "epoch": 0.2352910192257045, "grad_norm": 1.7686960697174072, "learning_rate": 3.8244666842243874e-05, "loss": 2.0151, 
"step": 8934 }, { "epoch": 0.23531735580721622, "grad_norm": 2.550863027572632, "learning_rate": 3.8243350013168296e-05, "loss": 1.9397, "step": 8935 }, { "epoch": 0.23534369238872793, "grad_norm": 2.5576789379119873, "learning_rate": 3.8242033184092705e-05, "loss": 0.429, "step": 8936 }, { "epoch": 0.23537002897023968, "grad_norm": 2.9477245807647705, "learning_rate": 3.824071635501712e-05, "loss": 1.4103, "step": 8937 }, { "epoch": 0.2353963655517514, "grad_norm": 4.0079498291015625, "learning_rate": 3.823939952594153e-05, "loss": 1.0337, "step": 8938 }, { "epoch": 0.2354227021332631, "grad_norm": 1.8713847398757935, "learning_rate": 3.823808269686595e-05, "loss": 2.0018, "step": 8939 }, { "epoch": 0.23544903871477482, "grad_norm": 1.8662177324295044, "learning_rate": 3.823676586779036e-05, "loss": 1.5008, "step": 8940 }, { "epoch": 0.23547537529628654, "grad_norm": 3.5579030513763428, "learning_rate": 3.8235449038714776e-05, "loss": 1.049, "step": 8941 }, { "epoch": 0.23550171187779825, "grad_norm": 1.962793231010437, "learning_rate": 3.823413220963919e-05, "loss": 1.3536, "step": 8942 }, { "epoch": 0.23552804845930997, "grad_norm": 3.0141546726226807, "learning_rate": 3.82328153805636e-05, "loss": 1.6883, "step": 8943 }, { "epoch": 0.2355543850408217, "grad_norm": 2.788313388824463, "learning_rate": 3.823149855148802e-05, "loss": 1.6545, "step": 8944 }, { "epoch": 0.23558072162233343, "grad_norm": 2.075899600982666, "learning_rate": 3.823018172241243e-05, "loss": 1.3079, "step": 8945 }, { "epoch": 0.23560705820384514, "grad_norm": 5.5302734375, "learning_rate": 3.822886489333685e-05, "loss": 1.0914, "step": 8946 }, { "epoch": 0.23563339478535686, "grad_norm": 5.843254566192627, "learning_rate": 3.8227548064261257e-05, "loss": 1.6165, "step": 8947 }, { "epoch": 0.23565973136686857, "grad_norm": 2.9635374546051025, "learning_rate": 3.822623123518567e-05, "loss": 1.4004, "step": 8948 }, { "epoch": 0.2356860679483803, "grad_norm": 2.420072317123413, 
"learning_rate": 3.822491440611009e-05, "loss": 1.0397, "step": 8949 }, { "epoch": 0.23571240452989203, "grad_norm": 2.560356855392456, "learning_rate": 3.82235975770345e-05, "loss": 1.0354, "step": 8950 }, { "epoch": 0.23573874111140375, "grad_norm": 3.587472438812256, "learning_rate": 3.822228074795892e-05, "loss": 0.5169, "step": 8951 }, { "epoch": 0.23576507769291546, "grad_norm": 3.9375946521759033, "learning_rate": 3.822096391888333e-05, "loss": 2.734, "step": 8952 }, { "epoch": 0.23579141427442718, "grad_norm": 2.2739546298980713, "learning_rate": 3.821964708980775e-05, "loss": 1.3848, "step": 8953 }, { "epoch": 0.2358177508559389, "grad_norm": 2.0366899967193604, "learning_rate": 3.821833026073216e-05, "loss": 1.4427, "step": 8954 }, { "epoch": 0.2358440874374506, "grad_norm": 1.8842791318893433, "learning_rate": 3.8217013431656575e-05, "loss": 1.8446, "step": 8955 }, { "epoch": 0.23587042401896233, "grad_norm": 1.8873313665390015, "learning_rate": 3.821569660258098e-05, "loss": 1.6234, "step": 8956 }, { "epoch": 0.23589676060047407, "grad_norm": 2.7381744384765625, "learning_rate": 3.82143797735054e-05, "loss": 0.8085, "step": 8957 }, { "epoch": 0.23592309718198579, "grad_norm": 3.809654712677002, "learning_rate": 3.8213062944429815e-05, "loss": 0.6761, "step": 8958 }, { "epoch": 0.2359494337634975, "grad_norm": 2.1754937171936035, "learning_rate": 3.821174611535423e-05, "loss": 1.44, "step": 8959 }, { "epoch": 0.23597577034500922, "grad_norm": 3.9207212924957275, "learning_rate": 3.8210429286278646e-05, "loss": 0.7976, "step": 8960 }, { "epoch": 0.23600210692652093, "grad_norm": 3.327483892440796, "learning_rate": 3.8209112457203055e-05, "loss": 0.4344, "step": 8961 }, { "epoch": 0.23602844350803265, "grad_norm": 3.8254613876342773, "learning_rate": 3.820779562812747e-05, "loss": 1.1983, "step": 8962 }, { "epoch": 0.23605478008954436, "grad_norm": 1.8120373487472534, "learning_rate": 3.8206478799051886e-05, "loss": 1.789, "step": 8963 }, { "epoch": 
0.2360811166710561, "grad_norm": 2.8455874919891357, "learning_rate": 3.82051619699763e-05, "loss": 1.2715, "step": 8964 }, { "epoch": 0.23610745325256782, "grad_norm": 2.352724552154541, "learning_rate": 3.820384514090071e-05, "loss": 1.5288, "step": 8965 }, { "epoch": 0.23613378983407954, "grad_norm": 3.0580079555511475, "learning_rate": 3.8202528311825126e-05, "loss": 2.1386, "step": 8966 }, { "epoch": 0.23616012641559125, "grad_norm": 3.5651791095733643, "learning_rate": 3.820121148274954e-05, "loss": 2.6831, "step": 8967 }, { "epoch": 0.23618646299710297, "grad_norm": 3.2723565101623535, "learning_rate": 3.819989465367396e-05, "loss": 1.5994, "step": 8968 }, { "epoch": 0.23621279957861469, "grad_norm": 1.4684089422225952, "learning_rate": 3.819857782459837e-05, "loss": 2.1179, "step": 8969 }, { "epoch": 0.23623913616012643, "grad_norm": 1.7993519306182861, "learning_rate": 3.819726099552278e-05, "loss": 1.7187, "step": 8970 }, { "epoch": 0.23626547274163814, "grad_norm": 2.4115402698516846, "learning_rate": 3.81959441664472e-05, "loss": 1.4155, "step": 8971 }, { "epoch": 0.23629180932314986, "grad_norm": 1.740483045578003, "learning_rate": 3.819462733737161e-05, "loss": 1.9074, "step": 8972 }, { "epoch": 0.23631814590466158, "grad_norm": 1.5675009489059448, "learning_rate": 3.819331050829603e-05, "loss": 1.7608, "step": 8973 }, { "epoch": 0.2363444824861733, "grad_norm": 2.571057081222534, "learning_rate": 3.819199367922044e-05, "loss": 1.5261, "step": 8974 }, { "epoch": 0.236370819067685, "grad_norm": 3.5776636600494385, "learning_rate": 3.819067685014485e-05, "loss": 0.8671, "step": 8975 }, { "epoch": 0.23639715564919672, "grad_norm": 2.386263132095337, "learning_rate": 3.818936002106927e-05, "loss": 2.2032, "step": 8976 }, { "epoch": 0.23642349223070847, "grad_norm": 1.9783543348312378, "learning_rate": 3.8188043191993684e-05, "loss": 1.4508, "step": 8977 }, { "epoch": 0.23644982881222018, "grad_norm": 1.8393594026565552, "learning_rate": 
3.81867263629181e-05, "loss": 0.7998, "step": 8978 }, { "epoch": 0.2364761653937319, "grad_norm": 2.0811891555786133, "learning_rate": 3.818540953384251e-05, "loss": 1.763, "step": 8979 }, { "epoch": 0.2365025019752436, "grad_norm": 3.1697185039520264, "learning_rate": 3.8184092704766924e-05, "loss": 0.9507, "step": 8980 }, { "epoch": 0.23652883855675533, "grad_norm": 2.105729579925537, "learning_rate": 3.818277587569133e-05, "loss": 1.6966, "step": 8981 }, { "epoch": 0.23655517513826704, "grad_norm": 3.137927770614624, "learning_rate": 3.8181459046615755e-05, "loss": 0.5815, "step": 8982 }, { "epoch": 0.2365815117197788, "grad_norm": 1.87382173538208, "learning_rate": 3.8180142217540164e-05, "loss": 1.9441, "step": 8983 }, { "epoch": 0.2366078483012905, "grad_norm": 1.5987637042999268, "learning_rate": 3.817882538846458e-05, "loss": 1.7637, "step": 8984 }, { "epoch": 0.23663418488280222, "grad_norm": 1.9208414554595947, "learning_rate": 3.817750855938899e-05, "loss": 1.9402, "step": 8985 }, { "epoch": 0.23666052146431393, "grad_norm": 1.628235936164856, "learning_rate": 3.817619173031341e-05, "loss": 0.525, "step": 8986 }, { "epoch": 0.23668685804582565, "grad_norm": 2.460887908935547, "learning_rate": 3.817487490123782e-05, "loss": 2.2777, "step": 8987 }, { "epoch": 0.23671319462733736, "grad_norm": 1.8031535148620605, "learning_rate": 3.8173558072162235e-05, "loss": 1.734, "step": 8988 }, { "epoch": 0.23673953120884908, "grad_norm": 2.9295225143432617, "learning_rate": 3.817224124308665e-05, "loss": 2.0813, "step": 8989 }, { "epoch": 0.23676586779036082, "grad_norm": 8.491218566894531, "learning_rate": 3.817092441401106e-05, "loss": 1.5099, "step": 8990 }, { "epoch": 0.23679220437187254, "grad_norm": 6.878724575042725, "learning_rate": 3.816960758493548e-05, "loss": 2.0645, "step": 8991 }, { "epoch": 0.23681854095338425, "grad_norm": 3.4732770919799805, "learning_rate": 3.816829075585989e-05, "loss": 1.4976, "step": 8992 }, { "epoch": 0.23684487753489597, 
"grad_norm": 3.7131786346435547, "learning_rate": 3.8166973926784306e-05, "loss": 2.8987, "step": 8993 }, { "epoch": 0.23687121411640769, "grad_norm": 1.7151122093200684, "learning_rate": 3.8165657097708715e-05, "loss": 0.5245, "step": 8994 }, { "epoch": 0.2368975506979194, "grad_norm": 3.2155251502990723, "learning_rate": 3.816434026863313e-05, "loss": 1.6148, "step": 8995 }, { "epoch": 0.23692388727943112, "grad_norm": 1.8275508880615234, "learning_rate": 3.8163023439557546e-05, "loss": 1.5996, "step": 8996 }, { "epoch": 0.23695022386094286, "grad_norm": 1.5089374780654907, "learning_rate": 3.816170661048196e-05, "loss": 1.6117, "step": 8997 }, { "epoch": 0.23697656044245458, "grad_norm": 1.5572309494018555, "learning_rate": 3.816038978140638e-05, "loss": 2.0907, "step": 8998 }, { "epoch": 0.2370028970239663, "grad_norm": 4.3987555503845215, "learning_rate": 3.8159072952330786e-05, "loss": 2.0462, "step": 8999 }, { "epoch": 0.237029233605478, "grad_norm": 3.0734729766845703, "learning_rate": 3.81577561232552e-05, "loss": 2.6626, "step": 9000 }, { "epoch": 0.23705557018698972, "grad_norm": 1.7914612293243408, "learning_rate": 3.815643929417962e-05, "loss": 1.7885, "step": 9001 }, { "epoch": 0.23708190676850144, "grad_norm": 1.5557209253311157, "learning_rate": 3.815512246510403e-05, "loss": 1.4949, "step": 9002 }, { "epoch": 0.23710824335001318, "grad_norm": 2.067309856414795, "learning_rate": 3.815380563602844e-05, "loss": 2.0708, "step": 9003 }, { "epoch": 0.2371345799315249, "grad_norm": 2.3373732566833496, "learning_rate": 3.815248880695286e-05, "loss": 2.0256, "step": 9004 }, { "epoch": 0.2371609165130366, "grad_norm": 4.035815238952637, "learning_rate": 3.815117197787727e-05, "loss": 1.4594, "step": 9005 }, { "epoch": 0.23718725309454833, "grad_norm": 1.917300820350647, "learning_rate": 3.814985514880169e-05, "loss": 0.3119, "step": 9006 }, { "epoch": 0.23721358967606004, "grad_norm": 1.7235420942306519, "learning_rate": 3.8148538319726104e-05, "loss": 
1.7185, "step": 9007 }, { "epoch": 0.23723992625757176, "grad_norm": 4.063268184661865, "learning_rate": 3.814722149065051e-05, "loss": 1.0906, "step": 9008 }, { "epoch": 0.23726626283908347, "grad_norm": 2.1704537868499756, "learning_rate": 3.814590466157493e-05, "loss": 1.584, "step": 9009 }, { "epoch": 0.23729259942059522, "grad_norm": 1.6469006538391113, "learning_rate": 3.8144587832499344e-05, "loss": 1.5391, "step": 9010 }, { "epoch": 0.23731893600210693, "grad_norm": 2.907432794570923, "learning_rate": 3.814327100342376e-05, "loss": 1.5396, "step": 9011 }, { "epoch": 0.23734527258361865, "grad_norm": 3.2695088386535645, "learning_rate": 3.814195417434817e-05, "loss": 1.2955, "step": 9012 }, { "epoch": 0.23737160916513037, "grad_norm": 5.305758953094482, "learning_rate": 3.8140637345272584e-05, "loss": 1.2816, "step": 9013 }, { "epoch": 0.23739794574664208, "grad_norm": 3.164250373840332, "learning_rate": 3.8139320516197e-05, "loss": 1.4329, "step": 9014 }, { "epoch": 0.2374242823281538, "grad_norm": 2.0373001098632812, "learning_rate": 3.8138003687121416e-05, "loss": 1.4984, "step": 9015 }, { "epoch": 0.2374506189096655, "grad_norm": 2.999027729034424, "learning_rate": 3.813668685804583e-05, "loss": 1.3634, "step": 9016 }, { "epoch": 0.23747695549117726, "grad_norm": 2.648401975631714, "learning_rate": 3.813537002897024e-05, "loss": 1.6166, "step": 9017 }, { "epoch": 0.23750329207268897, "grad_norm": 1.7565820217132568, "learning_rate": 3.8134053199894656e-05, "loss": 1.6777, "step": 9018 }, { "epoch": 0.2375296286542007, "grad_norm": 2.587958335876465, "learning_rate": 3.813273637081907e-05, "loss": 0.6778, "step": 9019 }, { "epoch": 0.2375559652357124, "grad_norm": 2.393542766571045, "learning_rate": 3.813141954174349e-05, "loss": 1.7176, "step": 9020 }, { "epoch": 0.23758230181722412, "grad_norm": 4.26826810836792, "learning_rate": 3.8130102712667896e-05, "loss": 1.0747, "step": 9021 }, { "epoch": 0.23760863839873583, "grad_norm": 1.5846703052520752, 
"learning_rate": 3.812878588359231e-05, "loss": 1.8606, "step": 9022 }, { "epoch": 0.23763497498024758, "grad_norm": 2.3236591815948486, "learning_rate": 3.812746905451673e-05, "loss": 0.696, "step": 9023 }, { "epoch": 0.2376613115617593, "grad_norm": 2.3510401248931885, "learning_rate": 3.812615222544114e-05, "loss": 1.832, "step": 9024 }, { "epoch": 0.237687648143271, "grad_norm": 2.3777177333831787, "learning_rate": 3.812483539636556e-05, "loss": 2.0763, "step": 9025 }, { "epoch": 0.23771398472478272, "grad_norm": 5.4839301109313965, "learning_rate": 3.812351856728997e-05, "loss": 2.0837, "step": 9026 }, { "epoch": 0.23774032130629444, "grad_norm": 2.2859816551208496, "learning_rate": 3.812220173821438e-05, "loss": 1.4909, "step": 9027 }, { "epoch": 0.23776665788780615, "grad_norm": 1.9150323867797852, "learning_rate": 3.812088490913879e-05, "loss": 1.8763, "step": 9028 }, { "epoch": 0.23779299446931787, "grad_norm": 3.0372586250305176, "learning_rate": 3.8119568080063214e-05, "loss": 1.7709, "step": 9029 }, { "epoch": 0.2378193310508296, "grad_norm": 3.4950637817382812, "learning_rate": 3.811825125098762e-05, "loss": 1.5326, "step": 9030 }, { "epoch": 0.23784566763234133, "grad_norm": 2.4495930671691895, "learning_rate": 3.811693442191204e-05, "loss": 1.5065, "step": 9031 }, { "epoch": 0.23787200421385304, "grad_norm": 1.6274694204330444, "learning_rate": 3.811561759283645e-05, "loss": 1.608, "step": 9032 }, { "epoch": 0.23789834079536476, "grad_norm": 3.8316566944122314, "learning_rate": 3.811430076376086e-05, "loss": 1.9117, "step": 9033 }, { "epoch": 0.23792467737687648, "grad_norm": 2.6330342292785645, "learning_rate": 3.811298393468528e-05, "loss": 1.8568, "step": 9034 }, { "epoch": 0.2379510139583882, "grad_norm": 1.8126193284988403, "learning_rate": 3.8111667105609694e-05, "loss": 2.1386, "step": 9035 }, { "epoch": 0.23797735053989993, "grad_norm": 2.749074697494507, "learning_rate": 3.811035027653411e-05, "loss": 0.6549, "step": 9036 }, { "epoch": 
0.23800368712141165, "grad_norm": 2.4599876403808594, "learning_rate": 3.810903344745852e-05, "loss": 1.6619, "step": 9037 }, { "epoch": 0.23803002370292337, "grad_norm": 2.7067179679870605, "learning_rate": 3.810771661838294e-05, "loss": 1.4006, "step": 9038 }, { "epoch": 0.23805636028443508, "grad_norm": 1.9752647876739502, "learning_rate": 3.810639978930735e-05, "loss": 0.3052, "step": 9039 }, { "epoch": 0.2380826968659468, "grad_norm": 1.9348949193954468, "learning_rate": 3.8105082960231765e-05, "loss": 2.0829, "step": 9040 }, { "epoch": 0.2381090334474585, "grad_norm": 11.67044448852539, "learning_rate": 3.8103766131156174e-05, "loss": 1.9523, "step": 9041 }, { "epoch": 0.23813537002897023, "grad_norm": 1.4356908798217773, "learning_rate": 3.810244930208059e-05, "loss": 0.6888, "step": 9042 }, { "epoch": 0.23816170661048197, "grad_norm": 3.7504501342773438, "learning_rate": 3.8101132473005005e-05, "loss": 1.1703, "step": 9043 }, { "epoch": 0.2381880431919937, "grad_norm": 3.1898488998413086, "learning_rate": 3.809981564392942e-05, "loss": 2.3678, "step": 9044 }, { "epoch": 0.2382143797735054, "grad_norm": 3.4472553730010986, "learning_rate": 3.8098498814853836e-05, "loss": 1.8964, "step": 9045 }, { "epoch": 0.23824071635501712, "grad_norm": 1.5964696407318115, "learning_rate": 3.8097181985778245e-05, "loss": 2.2231, "step": 9046 }, { "epoch": 0.23826705293652883, "grad_norm": 1.8006901741027832, "learning_rate": 3.809586515670266e-05, "loss": 1.6885, "step": 9047 }, { "epoch": 0.23829338951804055, "grad_norm": 2.045583963394165, "learning_rate": 3.8094548327627076e-05, "loss": 1.2975, "step": 9048 }, { "epoch": 0.23831972609955226, "grad_norm": 2.854395866394043, "learning_rate": 3.809323149855149e-05, "loss": 1.8723, "step": 9049 }, { "epoch": 0.238346062681064, "grad_norm": 1.7403329610824585, "learning_rate": 3.80919146694759e-05, "loss": 1.3867, "step": 9050 }, { "epoch": 0.23837239926257572, "grad_norm": 1.701168417930603, "learning_rate": 
3.8090597840400316e-05, "loss": 2.1915, "step": 9051 }, { "epoch": 0.23839873584408744, "grad_norm": 1.7723575830459595, "learning_rate": 3.808928101132473e-05, "loss": 1.3636, "step": 9052 }, { "epoch": 0.23842507242559915, "grad_norm": 2.355898380279541, "learning_rate": 3.808796418224915e-05, "loss": 1.6345, "step": 9053 }, { "epoch": 0.23845140900711087, "grad_norm": 2.2978508472442627, "learning_rate": 3.808664735317356e-05, "loss": 1.7493, "step": 9054 }, { "epoch": 0.23847774558862259, "grad_norm": 3.296550750732422, "learning_rate": 3.808533052409797e-05, "loss": 1.6683, "step": 9055 }, { "epoch": 0.23850408217013433, "grad_norm": 2.169276475906372, "learning_rate": 3.808401369502239e-05, "loss": 1.7903, "step": 9056 }, { "epoch": 0.23853041875164605, "grad_norm": 1.586709976196289, "learning_rate": 3.80826968659468e-05, "loss": 1.8071, "step": 9057 }, { "epoch": 0.23855675533315776, "grad_norm": 2.4321131706237793, "learning_rate": 3.808138003687122e-05, "loss": 1.2764, "step": 9058 }, { "epoch": 0.23858309191466948, "grad_norm": 4.890425682067871, "learning_rate": 3.808006320779563e-05, "loss": 1.5407, "step": 9059 }, { "epoch": 0.2386094284961812, "grad_norm": 1.9176816940307617, "learning_rate": 3.807874637872004e-05, "loss": 0.7178, "step": 9060 }, { "epoch": 0.2386357650776929, "grad_norm": 1.8805367946624756, "learning_rate": 3.807742954964446e-05, "loss": 1.3999, "step": 9061 }, { "epoch": 0.23866210165920462, "grad_norm": 2.8186519145965576, "learning_rate": 3.8076112720568874e-05, "loss": 2.0395, "step": 9062 }, { "epoch": 0.23868843824071637, "grad_norm": 2.288559913635254, "learning_rate": 3.807479589149329e-05, "loss": 1.4923, "step": 9063 }, { "epoch": 0.23871477482222808, "grad_norm": 4.442562580108643, "learning_rate": 3.80734790624177e-05, "loss": 0.6598, "step": 9064 }, { "epoch": 0.2387411114037398, "grad_norm": 5.586864948272705, "learning_rate": 3.8072162233342114e-05, "loss": 1.0739, "step": 9065 }, { "epoch": 0.2387674479852515, 
"grad_norm": 2.1113388538360596, "learning_rate": 3.807084540426653e-05, "loss": 1.4191, "step": 9066 }, { "epoch": 0.23879378456676323, "grad_norm": 2.2378933429718018, "learning_rate": 3.8069528575190945e-05, "loss": 2.125, "step": 9067 }, { "epoch": 0.23882012114827494, "grad_norm": 2.3209145069122314, "learning_rate": 3.8068211746115354e-05, "loss": 1.3652, "step": 9068 }, { "epoch": 0.2388464577297867, "grad_norm": 1.9900540113449097, "learning_rate": 3.806689491703977e-05, "loss": 2.5439, "step": 9069 }, { "epoch": 0.2388727943112984, "grad_norm": 2.5610764026641846, "learning_rate": 3.8065578087964185e-05, "loss": 0.5374, "step": 9070 }, { "epoch": 0.23889913089281012, "grad_norm": 2.2821385860443115, "learning_rate": 3.80642612588886e-05, "loss": 1.256, "step": 9071 }, { "epoch": 0.23892546747432183, "grad_norm": 3.5710813999176025, "learning_rate": 3.8062944429813016e-05, "loss": 0.9633, "step": 9072 }, { "epoch": 0.23895180405583355, "grad_norm": 1.5225094556808472, "learning_rate": 3.8061627600737425e-05, "loss": 0.3832, "step": 9073 }, { "epoch": 0.23897814063734527, "grad_norm": 3.0547895431518555, "learning_rate": 3.806031077166184e-05, "loss": 1.1273, "step": 9074 }, { "epoch": 0.23900447721885698, "grad_norm": 2.001650333404541, "learning_rate": 3.805899394258625e-05, "loss": 1.6785, "step": 9075 }, { "epoch": 0.23903081380036872, "grad_norm": 3.8463213443756104, "learning_rate": 3.805767711351067e-05, "loss": 1.4484, "step": 9076 }, { "epoch": 0.23905715038188044, "grad_norm": 1.5718679428100586, "learning_rate": 3.805636028443508e-05, "loss": 1.4768, "step": 9077 }, { "epoch": 0.23908348696339216, "grad_norm": 2.100315809249878, "learning_rate": 3.8055043455359497e-05, "loss": 1.626, "step": 9078 }, { "epoch": 0.23910982354490387, "grad_norm": 3.065241575241089, "learning_rate": 3.8053726626283905e-05, "loss": 1.6768, "step": 9079 }, { "epoch": 0.2391361601264156, "grad_norm": 6.284860134124756, "learning_rate": 3.805240979720832e-05, "loss": 
1.3801, "step": 9080 }, { "epoch": 0.2391624967079273, "grad_norm": 2.563067674636841, "learning_rate": 3.805109296813274e-05, "loss": 2.074, "step": 9081 }, { "epoch": 0.23918883328943902, "grad_norm": 2.1605827808380127, "learning_rate": 3.804977613905715e-05, "loss": 2.5389, "step": 9082 }, { "epoch": 0.23921516987095076, "grad_norm": 3.8266963958740234, "learning_rate": 3.804845930998157e-05, "loss": 1.9169, "step": 9083 }, { "epoch": 0.23924150645246248, "grad_norm": 4.3211188316345215, "learning_rate": 3.8047142480905977e-05, "loss": 1.9045, "step": 9084 }, { "epoch": 0.2392678430339742, "grad_norm": 2.3694863319396973, "learning_rate": 3.80458256518304e-05, "loss": 1.9605, "step": 9085 }, { "epoch": 0.2392941796154859, "grad_norm": 1.8015602827072144, "learning_rate": 3.804450882275481e-05, "loss": 1.8454, "step": 9086 }, { "epoch": 0.23932051619699762, "grad_norm": 2.8384194374084473, "learning_rate": 3.804319199367922e-05, "loss": 1.9281, "step": 9087 }, { "epoch": 0.23934685277850934, "grad_norm": 2.843345880508423, "learning_rate": 3.804187516460363e-05, "loss": 2.0674, "step": 9088 }, { "epoch": 0.23937318936002108, "grad_norm": 1.6297242641448975, "learning_rate": 3.804055833552805e-05, "loss": 2.4755, "step": 9089 }, { "epoch": 0.2393995259415328, "grad_norm": 3.193549633026123, "learning_rate": 3.803924150645246e-05, "loss": 1.8073, "step": 9090 }, { "epoch": 0.2394258625230445, "grad_norm": 3.4964730739593506, "learning_rate": 3.803792467737688e-05, "loss": 1.4154, "step": 9091 }, { "epoch": 0.23945219910455623, "grad_norm": 3.969574451446533, "learning_rate": 3.8036607848301295e-05, "loss": 0.9161, "step": 9092 }, { "epoch": 0.23947853568606794, "grad_norm": 1.7516297101974487, "learning_rate": 3.80352910192257e-05, "loss": 1.5023, "step": 9093 }, { "epoch": 0.23950487226757966, "grad_norm": 2.2985336780548096, "learning_rate": 3.803397419015012e-05, "loss": 2.0499, "step": 9094 }, { "epoch": 0.23953120884909138, "grad_norm": 2.7875607013702393, 
"learning_rate": 3.8032657361074535e-05, "loss": 1.8492, "step": 9095 }, { "epoch": 0.23955754543060312, "grad_norm": 1.9965405464172363, "learning_rate": 3.803134053199895e-05, "loss": 1.7533, "step": 9096 }, { "epoch": 0.23958388201211483, "grad_norm": 2.5655736923217773, "learning_rate": 3.803002370292336e-05, "loss": 1.4348, "step": 9097 }, { "epoch": 0.23961021859362655, "grad_norm": 2.612562656402588, "learning_rate": 3.8028706873847775e-05, "loss": 2.1023, "step": 9098 }, { "epoch": 0.23963655517513827, "grad_norm": 4.2543511390686035, "learning_rate": 3.802739004477219e-05, "loss": 1.4321, "step": 9099 }, { "epoch": 0.23966289175664998, "grad_norm": 2.7995800971984863, "learning_rate": 3.8026073215696606e-05, "loss": 1.1688, "step": 9100 }, { "epoch": 0.2396892283381617, "grad_norm": 3.423882484436035, "learning_rate": 3.802475638662102e-05, "loss": 2.5094, "step": 9101 }, { "epoch": 0.2397155649196734, "grad_norm": 3.353536605834961, "learning_rate": 3.802343955754543e-05, "loss": 1.7535, "step": 9102 }, { "epoch": 0.23974190150118516, "grad_norm": 1.7034701108932495, "learning_rate": 3.8022122728469846e-05, "loss": 0.8672, "step": 9103 }, { "epoch": 0.23976823808269687, "grad_norm": 3.1750571727752686, "learning_rate": 3.802080589939426e-05, "loss": 1.1865, "step": 9104 }, { "epoch": 0.2397945746642086, "grad_norm": 1.6175776720046997, "learning_rate": 3.801948907031868e-05, "loss": 1.6612, "step": 9105 }, { "epoch": 0.2398209112457203, "grad_norm": 3.0557985305786133, "learning_rate": 3.8018172241243086e-05, "loss": 0.9933, "step": 9106 }, { "epoch": 0.23984724782723202, "grad_norm": 2.8511931896209717, "learning_rate": 3.80168554121675e-05, "loss": 2.2315, "step": 9107 }, { "epoch": 0.23987358440874373, "grad_norm": 2.606637954711914, "learning_rate": 3.801553858309192e-05, "loss": 1.4886, "step": 9108 }, { "epoch": 0.23989992099025548, "grad_norm": 2.238206624984741, "learning_rate": 3.801422175401633e-05, "loss": 1.8346, "step": 9109 }, { "epoch": 
0.2399262575717672, "grad_norm": 2.12542724609375, "learning_rate": 3.801290492494075e-05, "loss": 1.8932, "step": 9110 }, { "epoch": 0.2399525941532789, "grad_norm": 2.5104942321777344, "learning_rate": 3.801158809586516e-05, "loss": 1.3672, "step": 9111 }, { "epoch": 0.23997893073479062, "grad_norm": 2.029500961303711, "learning_rate": 3.801027126678957e-05, "loss": 2.0438, "step": 9112 }, { "epoch": 0.24000526731630234, "grad_norm": 2.992079019546509, "learning_rate": 3.800895443771398e-05, "loss": 1.8737, "step": 9113 }, { "epoch": 0.24003160389781406, "grad_norm": 3.4098570346832275, "learning_rate": 3.8007637608638404e-05, "loss": 1.3028, "step": 9114 }, { "epoch": 0.24005794047932577, "grad_norm": 5.301133632659912, "learning_rate": 3.800632077956281e-05, "loss": 1.7078, "step": 9115 }, { "epoch": 0.24008427706083751, "grad_norm": 1.9357210397720337, "learning_rate": 3.800500395048723e-05, "loss": 2.0063, "step": 9116 }, { "epoch": 0.24011061364234923, "grad_norm": 2.0835912227630615, "learning_rate": 3.8003687121411644e-05, "loss": 2.0374, "step": 9117 }, { "epoch": 0.24013695022386095, "grad_norm": 1.8628278970718384, "learning_rate": 3.800237029233606e-05, "loss": 1.9094, "step": 9118 }, { "epoch": 0.24016328680537266, "grad_norm": 1.7916760444641113, "learning_rate": 3.8001053463260475e-05, "loss": 1.6695, "step": 9119 }, { "epoch": 0.24018962338688438, "grad_norm": 2.5060653686523438, "learning_rate": 3.7999736634184884e-05, "loss": 0.8932, "step": 9120 }, { "epoch": 0.2402159599683961, "grad_norm": 1.9011561870574951, "learning_rate": 3.79984198051093e-05, "loss": 1.7803, "step": 9121 }, { "epoch": 0.24024229654990784, "grad_norm": 3.5035581588745117, "learning_rate": 3.799710297603371e-05, "loss": 0.4986, "step": 9122 }, { "epoch": 0.24026863313141955, "grad_norm": 4.221654415130615, "learning_rate": 3.799578614695813e-05, "loss": 1.03, "step": 9123 }, { "epoch": 0.24029496971293127, "grad_norm": 7.845032691955566, "learning_rate": 
3.799446931788254e-05, "loss": 1.5312, "step": 9124 }, { "epoch": 0.24032130629444298, "grad_norm": 2.5314698219299316, "learning_rate": 3.7993152488806955e-05, "loss": 1.739, "step": 9125 }, { "epoch": 0.2403476428759547, "grad_norm": 1.9726166725158691, "learning_rate": 3.799183565973137e-05, "loss": 1.8529, "step": 9126 }, { "epoch": 0.2403739794574664, "grad_norm": 3.5129637718200684, "learning_rate": 3.799051883065578e-05, "loss": 1.5295, "step": 9127 }, { "epoch": 0.24040031603897813, "grad_norm": 2.7426881790161133, "learning_rate": 3.79892020015802e-05, "loss": 1.7939, "step": 9128 }, { "epoch": 0.24042665262048987, "grad_norm": 1.4880263805389404, "learning_rate": 3.798788517250461e-05, "loss": 1.3835, "step": 9129 }, { "epoch": 0.2404529892020016, "grad_norm": 2.7338085174560547, "learning_rate": 3.7986568343429026e-05, "loss": 0.5393, "step": 9130 }, { "epoch": 0.2404793257835133, "grad_norm": 1.945019245147705, "learning_rate": 3.7985251514353435e-05, "loss": 0.5236, "step": 9131 }, { "epoch": 0.24050566236502502, "grad_norm": 4.0086493492126465, "learning_rate": 3.798393468527786e-05, "loss": 2.0813, "step": 9132 }, { "epoch": 0.24053199894653673, "grad_norm": 2.984492301940918, "learning_rate": 3.7982617856202266e-05, "loss": 1.8398, "step": 9133 }, { "epoch": 0.24055833552804845, "grad_norm": 3.3578622341156006, "learning_rate": 3.798130102712668e-05, "loss": 0.9848, "step": 9134 }, { "epoch": 0.24058467210956017, "grad_norm": 2.6167993545532227, "learning_rate": 3.797998419805109e-05, "loss": 1.6624, "step": 9135 }, { "epoch": 0.2406110086910719, "grad_norm": 2.0487067699432373, "learning_rate": 3.7978667368975506e-05, "loss": 2.4291, "step": 9136 }, { "epoch": 0.24063734527258362, "grad_norm": 3.0543909072875977, "learning_rate": 3.797735053989992e-05, "loss": 1.7561, "step": 9137 }, { "epoch": 0.24066368185409534, "grad_norm": 5.187302112579346, "learning_rate": 3.797603371082434e-05, "loss": 1.6032, "step": 9138 }, { "epoch": 0.24069001843560706, 
"grad_norm": 1.904990553855896, "learning_rate": 3.797471688174875e-05, "loss": 1.9227, "step": 9139 }, { "epoch": 0.24071635501711877, "grad_norm": 1.8882147073745728, "learning_rate": 3.797340005267316e-05, "loss": 1.5641, "step": 9140 }, { "epoch": 0.2407426915986305, "grad_norm": 2.557469606399536, "learning_rate": 3.797208322359758e-05, "loss": 1.4592, "step": 9141 }, { "epoch": 0.24076902818014223, "grad_norm": 1.8143415451049805, "learning_rate": 3.797076639452199e-05, "loss": 1.685, "step": 9142 }, { "epoch": 0.24079536476165395, "grad_norm": 3.132756471633911, "learning_rate": 3.796944956544641e-05, "loss": 1.4478, "step": 9143 }, { "epoch": 0.24082170134316566, "grad_norm": 2.269442319869995, "learning_rate": 3.796813273637082e-05, "loss": 1.9974, "step": 9144 }, { "epoch": 0.24084803792467738, "grad_norm": 3.5727124214172363, "learning_rate": 3.796681590729523e-05, "loss": 0.6317, "step": 9145 }, { "epoch": 0.2408743745061891, "grad_norm": 3.152127504348755, "learning_rate": 3.796549907821965e-05, "loss": 1.6475, "step": 9146 }, { "epoch": 0.2409007110877008, "grad_norm": 2.6046741008758545, "learning_rate": 3.7964182249144064e-05, "loss": 1.1536, "step": 9147 }, { "epoch": 0.24092704766921252, "grad_norm": 1.647168517112732, "learning_rate": 3.796286542006848e-05, "loss": 1.7544, "step": 9148 }, { "epoch": 0.24095338425072427, "grad_norm": 3.2681217193603516, "learning_rate": 3.796154859099289e-05, "loss": 1.6074, "step": 9149 }, { "epoch": 0.24097972083223598, "grad_norm": 2.1700446605682373, "learning_rate": 3.7960231761917304e-05, "loss": 2.002, "step": 9150 }, { "epoch": 0.2410060574137477, "grad_norm": 3.1137824058532715, "learning_rate": 3.795891493284172e-05, "loss": 1.5384, "step": 9151 }, { "epoch": 0.24103239399525941, "grad_norm": 1.6794297695159912, "learning_rate": 3.7957598103766136e-05, "loss": 1.6418, "step": 9152 }, { "epoch": 0.24105873057677113, "grad_norm": 2.084293842315674, "learning_rate": 3.7956281274690544e-05, "loss": 0.8927, 
"step": 9153 }, { "epoch": 0.24108506715828285, "grad_norm": 1.805039644241333, "learning_rate": 3.795496444561496e-05, "loss": 1.6548, "step": 9154 }, { "epoch": 0.24111140373979456, "grad_norm": 1.7283471822738647, "learning_rate": 3.7953647616539376e-05, "loss": 1.5744, "step": 9155 }, { "epoch": 0.2411377403213063, "grad_norm": 1.9662374258041382, "learning_rate": 3.795233078746379e-05, "loss": 1.859, "step": 9156 }, { "epoch": 0.24116407690281802, "grad_norm": 2.372647523880005, "learning_rate": 3.795101395838821e-05, "loss": 1.816, "step": 9157 }, { "epoch": 0.24119041348432974, "grad_norm": 2.753685712814331, "learning_rate": 3.7949697129312616e-05, "loss": 1.5058, "step": 9158 }, { "epoch": 0.24121675006584145, "grad_norm": 1.8998364210128784, "learning_rate": 3.794838030023703e-05, "loss": 2.2558, "step": 9159 }, { "epoch": 0.24124308664735317, "grad_norm": 2.019319534301758, "learning_rate": 3.794706347116144e-05, "loss": 0.751, "step": 9160 }, { "epoch": 0.24126942322886488, "grad_norm": 1.8629621267318726, "learning_rate": 3.794574664208586e-05, "loss": 1.9928, "step": 9161 }, { "epoch": 0.24129575981037663, "grad_norm": 3.576493501663208, "learning_rate": 3.794442981301027e-05, "loss": 1.7328, "step": 9162 }, { "epoch": 0.24132209639188834, "grad_norm": 2.0093233585357666, "learning_rate": 3.794311298393469e-05, "loss": 1.6453, "step": 9163 }, { "epoch": 0.24134843297340006, "grad_norm": 1.61049222946167, "learning_rate": 3.79417961548591e-05, "loss": 0.5151, "step": 9164 }, { "epoch": 0.24137476955491177, "grad_norm": 4.014994144439697, "learning_rate": 3.794047932578352e-05, "loss": 0.8762, "step": 9165 }, { "epoch": 0.2414011061364235, "grad_norm": 1.7349263429641724, "learning_rate": 3.7939162496707934e-05, "loss": 1.9185, "step": 9166 }, { "epoch": 0.2414274427179352, "grad_norm": 2.0635013580322266, "learning_rate": 3.793784566763234e-05, "loss": 2.1196, "step": 9167 }, { "epoch": 0.24145377929944692, "grad_norm": 3.3577659130096436, 
"learning_rate": 3.793652883855676e-05, "loss": 1.9061, "step": 9168 }, { "epoch": 0.24148011588095866, "grad_norm": 7.884344100952148, "learning_rate": 3.793521200948117e-05, "loss": 1.7619, "step": 9169 }, { "epoch": 0.24150645246247038, "grad_norm": 3.6228582859039307, "learning_rate": 3.793389518040559e-05, "loss": 1.1273, "step": 9170 }, { "epoch": 0.2415327890439821, "grad_norm": 3.201294422149658, "learning_rate": 3.793257835133e-05, "loss": 2.2822, "step": 9171 }, { "epoch": 0.2415591256254938, "grad_norm": 2.4404683113098145, "learning_rate": 3.7931261522254414e-05, "loss": 1.8809, "step": 9172 }, { "epoch": 0.24158546220700552, "grad_norm": 1.6834548711776733, "learning_rate": 3.792994469317883e-05, "loss": 1.7893, "step": 9173 }, { "epoch": 0.24161179878851724, "grad_norm": 3.217729330062866, "learning_rate": 3.792862786410324e-05, "loss": 1.4802, "step": 9174 }, { "epoch": 0.24163813537002898, "grad_norm": 1.4206888675689697, "learning_rate": 3.792731103502766e-05, "loss": 0.2576, "step": 9175 }, { "epoch": 0.2416644719515407, "grad_norm": 1.8686909675598145, "learning_rate": 3.792599420595207e-05, "loss": 1.7188, "step": 9176 }, { "epoch": 0.24169080853305241, "grad_norm": 4.568958759307861, "learning_rate": 3.7924677376876485e-05, "loss": 1.3826, "step": 9177 }, { "epoch": 0.24171714511456413, "grad_norm": 2.1568028926849365, "learning_rate": 3.7923360547800894e-05, "loss": 1.4661, "step": 9178 }, { "epoch": 0.24174348169607585, "grad_norm": 2.121312379837036, "learning_rate": 3.792204371872531e-05, "loss": 1.9278, "step": 9179 }, { "epoch": 0.24176981827758756, "grad_norm": 2.118227243423462, "learning_rate": 3.7920726889649725e-05, "loss": 1.8126, "step": 9180 }, { "epoch": 0.24179615485909928, "grad_norm": 2.5075924396514893, "learning_rate": 3.791941006057414e-05, "loss": 0.182, "step": 9181 }, { "epoch": 0.24182249144061102, "grad_norm": 3.305026054382324, "learning_rate": 3.791809323149855e-05, "loss": 1.9408, "step": 9182 }, { "epoch": 
0.24184882802212274, "grad_norm": 1.909716248512268, "learning_rate": 3.7916776402422965e-05, "loss": 1.8562, "step": 9183 }, { "epoch": 0.24187516460363445, "grad_norm": 2.0505053997039795, "learning_rate": 3.791545957334739e-05, "loss": 1.9769, "step": 9184 }, { "epoch": 0.24190150118514617, "grad_norm": 1.7961729764938354, "learning_rate": 3.7914142744271796e-05, "loss": 0.5204, "step": 9185 }, { "epoch": 0.24192783776665788, "grad_norm": 2.0554816722869873, "learning_rate": 3.791282591519621e-05, "loss": 1.4629, "step": 9186 }, { "epoch": 0.2419541743481696, "grad_norm": 2.257201910018921, "learning_rate": 3.791150908612062e-05, "loss": 1.7419, "step": 9187 }, { "epoch": 0.24198051092968131, "grad_norm": 1.7493021488189697, "learning_rate": 3.7910192257045036e-05, "loss": 1.8869, "step": 9188 }, { "epoch": 0.24200684751119306, "grad_norm": 1.585355281829834, "learning_rate": 3.790887542796945e-05, "loss": 2.3149, "step": 9189 }, { "epoch": 0.24203318409270477, "grad_norm": 2.6028037071228027, "learning_rate": 3.790755859889387e-05, "loss": 1.4708, "step": 9190 }, { "epoch": 0.2420595206742165, "grad_norm": 2.316624402999878, "learning_rate": 3.7906241769818276e-05, "loss": 1.5252, "step": 9191 }, { "epoch": 0.2420858572557282, "grad_norm": 3.5405890941619873, "learning_rate": 3.790492494074269e-05, "loss": 0.8296, "step": 9192 }, { "epoch": 0.24211219383723992, "grad_norm": 1.8235653638839722, "learning_rate": 3.790360811166711e-05, "loss": 2.1218, "step": 9193 }, { "epoch": 0.24213853041875164, "grad_norm": 1.585139274597168, "learning_rate": 3.790229128259152e-05, "loss": 2.1037, "step": 9194 }, { "epoch": 0.24216486700026338, "grad_norm": 2.2083747386932373, "learning_rate": 3.790097445351594e-05, "loss": 0.8379, "step": 9195 }, { "epoch": 0.2421912035817751, "grad_norm": 1.646804690361023, "learning_rate": 3.789965762444035e-05, "loss": 1.8582, "step": 9196 }, { "epoch": 0.2422175401632868, "grad_norm": 4.820605278015137, "learning_rate": 
3.789834079536476e-05, "loss": 1.6665, "step": 9197 }, { "epoch": 0.24224387674479853, "grad_norm": 4.446354866027832, "learning_rate": 3.789702396628918e-05, "loss": 1.4955, "step": 9198 }, { "epoch": 0.24227021332631024, "grad_norm": 1.4681748151779175, "learning_rate": 3.7895707137213594e-05, "loss": 2.2789, "step": 9199 }, { "epoch": 0.24229654990782196, "grad_norm": 2.9111170768737793, "learning_rate": 3.7894390308138e-05, "loss": 2.0965, "step": 9200 }, { "epoch": 0.24232288648933367, "grad_norm": 3.720651865005493, "learning_rate": 3.789307347906242e-05, "loss": 2.1935, "step": 9201 }, { "epoch": 0.24234922307084542, "grad_norm": 2.03873872756958, "learning_rate": 3.7891756649986834e-05, "loss": 1.2195, "step": 9202 }, { "epoch": 0.24237555965235713, "grad_norm": 2.1937954425811768, "learning_rate": 3.789043982091125e-05, "loss": 2.0024, "step": 9203 }, { "epoch": 0.24240189623386885, "grad_norm": 1.8905243873596191, "learning_rate": 3.7889122991835665e-05, "loss": 1.5583, "step": 9204 }, { "epoch": 0.24242823281538056, "grad_norm": 1.5541455745697021, "learning_rate": 3.7887806162760074e-05, "loss": 2.2141, "step": 9205 }, { "epoch": 0.24245456939689228, "grad_norm": 3.704789400100708, "learning_rate": 3.788648933368449e-05, "loss": 0.8928, "step": 9206 }, { "epoch": 0.242480905978404, "grad_norm": 2.060554027557373, "learning_rate": 3.78851725046089e-05, "loss": 2.0226, "step": 9207 }, { "epoch": 0.24250724255991574, "grad_norm": 1.6431756019592285, "learning_rate": 3.788385567553332e-05, "loss": 2.1302, "step": 9208 }, { "epoch": 0.24253357914142745, "grad_norm": 3.4376893043518066, "learning_rate": 3.788253884645773e-05, "loss": 1.0592, "step": 9209 }, { "epoch": 0.24255991572293917, "grad_norm": 2.362325668334961, "learning_rate": 3.7881222017382145e-05, "loss": 1.1576, "step": 9210 }, { "epoch": 0.24258625230445088, "grad_norm": 2.5028932094573975, "learning_rate": 3.787990518830656e-05, "loss": 1.3836, "step": 9211 }, { "epoch": 0.2426125888859626, 
"grad_norm": 1.8605539798736572, "learning_rate": 3.787858835923097e-05, "loss": 2.4236, "step": 9212 }, { "epoch": 0.24263892546747431, "grad_norm": 4.813013076782227, "learning_rate": 3.787727153015539e-05, "loss": 1.0566, "step": 9213 }, { "epoch": 0.24266526204898603, "grad_norm": 2.4445302486419678, "learning_rate": 3.78759547010798e-05, "loss": 0.6525, "step": 9214 }, { "epoch": 0.24269159863049777, "grad_norm": 2.031694173812866, "learning_rate": 3.7874637872004217e-05, "loss": 1.6449, "step": 9215 }, { "epoch": 0.2427179352120095, "grad_norm": 1.2822418212890625, "learning_rate": 3.7873321042928625e-05, "loss": 1.7172, "step": 9216 }, { "epoch": 0.2427442717935212, "grad_norm": 2.2313852310180664, "learning_rate": 3.787200421385305e-05, "loss": 2.4019, "step": 9217 }, { "epoch": 0.24277060837503292, "grad_norm": 3.127763271331787, "learning_rate": 3.7870687384777457e-05, "loss": 1.0407, "step": 9218 }, { "epoch": 0.24279694495654464, "grad_norm": 2.701584815979004, "learning_rate": 3.786937055570187e-05, "loss": 1.8679, "step": 9219 }, { "epoch": 0.24282328153805635, "grad_norm": 2.3775875568389893, "learning_rate": 3.786805372662629e-05, "loss": 1.9762, "step": 9220 }, { "epoch": 0.24284961811956807, "grad_norm": 1.9174168109893799, "learning_rate": 3.7866736897550697e-05, "loss": 1.9555, "step": 9221 }, { "epoch": 0.2428759547010798, "grad_norm": 5.04189395904541, "learning_rate": 3.786542006847512e-05, "loss": 0.5557, "step": 9222 }, { "epoch": 0.24290229128259153, "grad_norm": 3.3648109436035156, "learning_rate": 3.786410323939953e-05, "loss": 1.2738, "step": 9223 }, { "epoch": 0.24292862786410324, "grad_norm": 3.704732894897461, "learning_rate": 3.7862786410323943e-05, "loss": 1.0398, "step": 9224 }, { "epoch": 0.24295496444561496, "grad_norm": 3.191092014312744, "learning_rate": 3.786146958124835e-05, "loss": 1.7896, "step": 9225 }, { "epoch": 0.24298130102712667, "grad_norm": 2.2654383182525635, "learning_rate": 3.786015275217277e-05, "loss": 2.048, 
"step": 9226 }, { "epoch": 0.2430076376086384, "grad_norm": 2.8884716033935547, "learning_rate": 3.7858835923097183e-05, "loss": 1.2261, "step": 9227 }, { "epoch": 0.24303397419015013, "grad_norm": 4.790567874908447, "learning_rate": 3.78575190940216e-05, "loss": 0.7408, "step": 9228 }, { "epoch": 0.24306031077166185, "grad_norm": 1.757062554359436, "learning_rate": 3.7856202264946015e-05, "loss": 2.1357, "step": 9229 }, { "epoch": 0.24308664735317356, "grad_norm": 2.4384217262268066, "learning_rate": 3.7854885435870423e-05, "loss": 1.7885, "step": 9230 }, { "epoch": 0.24311298393468528, "grad_norm": 1.7342662811279297, "learning_rate": 3.7853568606794846e-05, "loss": 1.6452, "step": 9231 }, { "epoch": 0.243139320516197, "grad_norm": 1.8292129039764404, "learning_rate": 3.7852251777719255e-05, "loss": 1.9006, "step": 9232 }, { "epoch": 0.2431656570977087, "grad_norm": 7.914848327636719, "learning_rate": 3.785093494864367e-05, "loss": 1.334, "step": 9233 }, { "epoch": 0.24319199367922043, "grad_norm": 2.721095085144043, "learning_rate": 3.784961811956808e-05, "loss": 0.6011, "step": 9234 }, { "epoch": 0.24321833026073217, "grad_norm": 2.089242458343506, "learning_rate": 3.7848301290492495e-05, "loss": 2.5129, "step": 9235 }, { "epoch": 0.24324466684224388, "grad_norm": 3.5150718688964844, "learning_rate": 3.784698446141691e-05, "loss": 0.7179, "step": 9236 }, { "epoch": 0.2432710034237556, "grad_norm": 1.794505000114441, "learning_rate": 3.7845667632341326e-05, "loss": 2.337, "step": 9237 }, { "epoch": 0.24329734000526732, "grad_norm": 2.0927414894104004, "learning_rate": 3.7844350803265735e-05, "loss": 1.1237, "step": 9238 }, { "epoch": 0.24332367658677903, "grad_norm": 1.8351795673370361, "learning_rate": 3.784303397419015e-05, "loss": 1.9189, "step": 9239 }, { "epoch": 0.24335001316829075, "grad_norm": 2.752796173095703, "learning_rate": 3.7841717145114566e-05, "loss": 2.0942, "step": 9240 }, { "epoch": 0.24337634974980246, "grad_norm": 3.8616297245025635, 
"learning_rate": 3.784040031603898e-05, "loss": 1.3218, "step": 9241 }, { "epoch": 0.2434026863313142, "grad_norm": 1.4050439596176147, "learning_rate": 3.78390834869634e-05, "loss": 1.6661, "step": 9242 }, { "epoch": 0.24342902291282592, "grad_norm": 6.63370943069458, "learning_rate": 3.7837766657887806e-05, "loss": 3.2083, "step": 9243 }, { "epoch": 0.24345535949433764, "grad_norm": 2.123335361480713, "learning_rate": 3.783644982881222e-05, "loss": 2.1397, "step": 9244 }, { "epoch": 0.24348169607584935, "grad_norm": 2.123399257659912, "learning_rate": 3.783513299973663e-05, "loss": 1.8308, "step": 9245 }, { "epoch": 0.24350803265736107, "grad_norm": 5.068130970001221, "learning_rate": 3.783381617066105e-05, "loss": 2.5819, "step": 9246 }, { "epoch": 0.24353436923887278, "grad_norm": 2.0146961212158203, "learning_rate": 3.783249934158546e-05, "loss": 0.486, "step": 9247 }, { "epoch": 0.24356070582038453, "grad_norm": 1.8296869993209839, "learning_rate": 3.783118251250988e-05, "loss": 1.52, "step": 9248 }, { "epoch": 0.24358704240189624, "grad_norm": 2.267693281173706, "learning_rate": 3.782986568343429e-05, "loss": 1.995, "step": 9249 }, { "epoch": 0.24361337898340796, "grad_norm": 2.441950798034668, "learning_rate": 3.782854885435871e-05, "loss": 1.3178, "step": 9250 }, { "epoch": 0.24363971556491967, "grad_norm": 1.6176522970199585, "learning_rate": 3.7827232025283124e-05, "loss": 1.6522, "step": 9251 }, { "epoch": 0.2436660521464314, "grad_norm": 2.1549603939056396, "learning_rate": 3.782591519620753e-05, "loss": 1.8736, "step": 9252 }, { "epoch": 0.2436923887279431, "grad_norm": 1.7482891082763672, "learning_rate": 3.782459836713195e-05, "loss": 1.644, "step": 9253 }, { "epoch": 0.24371872530945482, "grad_norm": 2.006420135498047, "learning_rate": 3.782328153805636e-05, "loss": 1.7266, "step": 9254 }, { "epoch": 0.24374506189096656, "grad_norm": 1.6657357215881348, "learning_rate": 3.782196470898078e-05, "loss": 1.2061, "step": 9255 }, { "epoch": 
0.24377139847247828, "grad_norm": 1.837896704673767, "learning_rate": 3.782064787990519e-05, "loss": 1.5926, "step": 9256 }, { "epoch": 0.24379773505399, "grad_norm": 2.1787383556365967, "learning_rate": 3.7819331050829604e-05, "loss": 2.0773, "step": 9257 }, { "epoch": 0.2438240716355017, "grad_norm": 1.453203558921814, "learning_rate": 3.781801422175402e-05, "loss": 2.2414, "step": 9258 }, { "epoch": 0.24385040821701343, "grad_norm": 3.399627447128296, "learning_rate": 3.781669739267843e-05, "loss": 1.6381, "step": 9259 }, { "epoch": 0.24387674479852514, "grad_norm": 3.2974464893341064, "learning_rate": 3.781538056360285e-05, "loss": 1.0649, "step": 9260 }, { "epoch": 0.24390308138003688, "grad_norm": 2.1594533920288086, "learning_rate": 3.781406373452726e-05, "loss": 0.9662, "step": 9261 }, { "epoch": 0.2439294179615486, "grad_norm": 1.795055627822876, "learning_rate": 3.7812746905451675e-05, "loss": 1.9675, "step": 9262 }, { "epoch": 0.24395575454306032, "grad_norm": 1.754931092262268, "learning_rate": 3.7811430076376084e-05, "loss": 2.0979, "step": 9263 }, { "epoch": 0.24398209112457203, "grad_norm": 2.881612777709961, "learning_rate": 3.7810113247300506e-05, "loss": 1.9986, "step": 9264 }, { "epoch": 0.24400842770608375, "grad_norm": 2.1526458263397217, "learning_rate": 3.7808796418224915e-05, "loss": 2.1643, "step": 9265 }, { "epoch": 0.24403476428759546, "grad_norm": 2.0660085678100586, "learning_rate": 3.780747958914933e-05, "loss": 1.6235, "step": 9266 }, { "epoch": 0.24406110086910718, "grad_norm": 2.211167335510254, "learning_rate": 3.7806162760073746e-05, "loss": 2.0696, "step": 9267 }, { "epoch": 0.24408743745061892, "grad_norm": 3.424025297164917, "learning_rate": 3.7804845930998155e-05, "loss": 1.6922, "step": 9268 }, { "epoch": 0.24411377403213064, "grad_norm": 2.800313949584961, "learning_rate": 3.780352910192258e-05, "loss": 1.008, "step": 9269 }, { "epoch": 0.24414011061364235, "grad_norm": 2.6103975772857666, "learning_rate": 
3.7802212272846986e-05, "loss": 2.2026, "step": 9270 }, { "epoch": 0.24416644719515407, "grad_norm": 1.4573440551757812, "learning_rate": 3.78008954437714e-05, "loss": 1.6872, "step": 9271 }, { "epoch": 0.24419278377666578, "grad_norm": 2.1108124256134033, "learning_rate": 3.779957861469581e-05, "loss": 1.2656, "step": 9272 }, { "epoch": 0.2442191203581775, "grad_norm": 1.9290087223052979, "learning_rate": 3.7798261785620226e-05, "loss": 0.4959, "step": 9273 }, { "epoch": 0.24424545693968922, "grad_norm": 1.559777021408081, "learning_rate": 3.779694495654464e-05, "loss": 1.5429, "step": 9274 }, { "epoch": 0.24427179352120096, "grad_norm": 2.850970983505249, "learning_rate": 3.779562812746906e-05, "loss": 1.7653, "step": 9275 }, { "epoch": 0.24429813010271267, "grad_norm": 1.8330388069152832, "learning_rate": 3.779431129839347e-05, "loss": 1.538, "step": 9276 }, { "epoch": 0.2443244666842244, "grad_norm": 2.406566619873047, "learning_rate": 3.779299446931788e-05, "loss": 1.8792, "step": 9277 }, { "epoch": 0.2443508032657361, "grad_norm": 1.6691175699234009, "learning_rate": 3.77916776402423e-05, "loss": 1.5142, "step": 9278 }, { "epoch": 0.24437713984724782, "grad_norm": 1.8278650045394897, "learning_rate": 3.779036081116671e-05, "loss": 1.9484, "step": 9279 }, { "epoch": 0.24440347642875954, "grad_norm": 1.9198180437088013, "learning_rate": 3.778904398209113e-05, "loss": 1.4031, "step": 9280 }, { "epoch": 0.24442981301027128, "grad_norm": 2.0104076862335205, "learning_rate": 3.778772715301554e-05, "loss": 1.44, "step": 9281 }, { "epoch": 0.244456149591783, "grad_norm": 1.4412245750427246, "learning_rate": 3.778641032393995e-05, "loss": 1.9661, "step": 9282 }, { "epoch": 0.2444824861732947, "grad_norm": 1.3967386484146118, "learning_rate": 3.778509349486437e-05, "loss": 1.5204, "step": 9283 }, { "epoch": 0.24450882275480643, "grad_norm": 3.0562007427215576, "learning_rate": 3.7783776665788784e-05, "loss": 0.5748, "step": 9284 }, { "epoch": 0.24453515933631814, 
"grad_norm": 2.1729013919830322, "learning_rate": 3.778245983671319e-05, "loss": 2.5757, "step": 9285 }, { "epoch": 0.24456149591782986, "grad_norm": 1.6921613216400146, "learning_rate": 3.778114300763761e-05, "loss": 1.3119, "step": 9286 }, { "epoch": 0.24458783249934157, "grad_norm": 1.8305718898773193, "learning_rate": 3.7779826178562024e-05, "loss": 1.7485, "step": 9287 }, { "epoch": 0.24461416908085332, "grad_norm": 3.1347134113311768, "learning_rate": 3.777850934948644e-05, "loss": 1.2699, "step": 9288 }, { "epoch": 0.24464050566236503, "grad_norm": 3.9561259746551514, "learning_rate": 3.7777192520410856e-05, "loss": 1.2056, "step": 9289 }, { "epoch": 0.24466684224387675, "grad_norm": 2.354530096054077, "learning_rate": 3.7775875691335264e-05, "loss": 2.2964, "step": 9290 }, { "epoch": 0.24469317882538846, "grad_norm": 3.5120456218719482, "learning_rate": 3.777455886225968e-05, "loss": 0.8609, "step": 9291 }, { "epoch": 0.24471951540690018, "grad_norm": 1.886204719543457, "learning_rate": 3.777324203318409e-05, "loss": 1.966, "step": 9292 }, { "epoch": 0.2447458519884119, "grad_norm": 2.966050148010254, "learning_rate": 3.777192520410851e-05, "loss": 1.5666, "step": 9293 }, { "epoch": 0.24477218856992364, "grad_norm": 1.7321619987487793, "learning_rate": 3.777060837503292e-05, "loss": 2.0653, "step": 9294 }, { "epoch": 0.24479852515143535, "grad_norm": 1.6937493085861206, "learning_rate": 3.7769291545957336e-05, "loss": 1.6134, "step": 9295 }, { "epoch": 0.24482486173294707, "grad_norm": 2.7914743423461914, "learning_rate": 3.776797471688175e-05, "loss": 0.3406, "step": 9296 }, { "epoch": 0.24485119831445878, "grad_norm": 2.6603357791900635, "learning_rate": 3.776665788780617e-05, "loss": 0.7411, "step": 9297 }, { "epoch": 0.2448775348959705, "grad_norm": 1.9323912858963013, "learning_rate": 3.776534105873058e-05, "loss": 2.189, "step": 9298 }, { "epoch": 0.24490387147748222, "grad_norm": 2.30798602104187, "learning_rate": 3.776402422965499e-05, "loss": 
1.8301, "step": 9299 }, { "epoch": 0.24493020805899393, "grad_norm": 3.0702922344207764, "learning_rate": 3.776270740057941e-05, "loss": 0.7602, "step": 9300 }, { "epoch": 0.24495654464050567, "grad_norm": 2.0642573833465576, "learning_rate": 3.7761390571503816e-05, "loss": 2.2123, "step": 9301 }, { "epoch": 0.2449828812220174, "grad_norm": 1.8380906581878662, "learning_rate": 3.776007374242824e-05, "loss": 1.4186, "step": 9302 }, { "epoch": 0.2450092178035291, "grad_norm": 1.7286680936813354, "learning_rate": 3.775875691335265e-05, "loss": 1.8775, "step": 9303 }, { "epoch": 0.24503555438504082, "grad_norm": 3.878495454788208, "learning_rate": 3.775744008427706e-05, "loss": 2.6115, "step": 9304 }, { "epoch": 0.24506189096655254, "grad_norm": 3.5993738174438477, "learning_rate": 3.775612325520148e-05, "loss": 0.9903, "step": 9305 }, { "epoch": 0.24508822754806425, "grad_norm": 3.8528695106506348, "learning_rate": 3.775480642612589e-05, "loss": 1.0769, "step": 9306 }, { "epoch": 0.24511456412957597, "grad_norm": 1.7006994485855103, "learning_rate": 3.775348959705031e-05, "loss": 1.5106, "step": 9307 }, { "epoch": 0.2451409007110877, "grad_norm": 2.118520736694336, "learning_rate": 3.775217276797472e-05, "loss": 1.5786, "step": 9308 }, { "epoch": 0.24516723729259943, "grad_norm": 1.5905060768127441, "learning_rate": 3.7750855938899134e-05, "loss": 2.0823, "step": 9309 }, { "epoch": 0.24519357387411114, "grad_norm": 2.3758137226104736, "learning_rate": 3.774953910982354e-05, "loss": 1.7364, "step": 9310 }, { "epoch": 0.24521991045562286, "grad_norm": 1.9911389350891113, "learning_rate": 3.774822228074796e-05, "loss": 1.4854, "step": 9311 }, { "epoch": 0.24524624703713457, "grad_norm": 3.24302339553833, "learning_rate": 3.7746905451672374e-05, "loss": 2.0166, "step": 9312 }, { "epoch": 0.2452725836186463, "grad_norm": 1.7609572410583496, "learning_rate": 3.774558862259679e-05, "loss": 2.0834, "step": 9313 }, { "epoch": 0.24529892020015803, "grad_norm": 
2.0814785957336426, "learning_rate": 3.7744271793521205e-05, "loss": 2.4209, "step": 9314 }, { "epoch": 0.24532525678166975, "grad_norm": 1.5561857223510742, "learning_rate": 3.7742954964445614e-05, "loss": 1.3433, "step": 9315 }, { "epoch": 0.24535159336318146, "grad_norm": 1.8540364503860474, "learning_rate": 3.7741638135370036e-05, "loss": 1.7081, "step": 9316 }, { "epoch": 0.24537792994469318, "grad_norm": 1.9510375261306763, "learning_rate": 3.7740321306294445e-05, "loss": 1.9614, "step": 9317 }, { "epoch": 0.2454042665262049, "grad_norm": 3.4084575176239014, "learning_rate": 3.773900447721886e-05, "loss": 1.8789, "step": 9318 }, { "epoch": 0.2454306031077166, "grad_norm": 2.46370792388916, "learning_rate": 3.773768764814327e-05, "loss": 1.4233, "step": 9319 }, { "epoch": 0.24545693968922833, "grad_norm": 1.9882310628890991, "learning_rate": 3.7736370819067685e-05, "loss": 1.8152, "step": 9320 }, { "epoch": 0.24548327627074007, "grad_norm": 2.6884074211120605, "learning_rate": 3.77350539899921e-05, "loss": 0.9255, "step": 9321 }, { "epoch": 0.24550961285225179, "grad_norm": 1.8098918199539185, "learning_rate": 3.7733737160916516e-05, "loss": 2.2152, "step": 9322 }, { "epoch": 0.2455359494337635, "grad_norm": 2.385812997817993, "learning_rate": 3.773242033184093e-05, "loss": 2.6968, "step": 9323 }, { "epoch": 0.24556228601527522, "grad_norm": 1.5554205179214478, "learning_rate": 3.773110350276534e-05, "loss": 1.531, "step": 9324 }, { "epoch": 0.24558862259678693, "grad_norm": 3.7166237831115723, "learning_rate": 3.7729786673689756e-05, "loss": 1.86, "step": 9325 }, { "epoch": 0.24561495917829865, "grad_norm": 1.9936190843582153, "learning_rate": 3.772846984461417e-05, "loss": 1.6313, "step": 9326 }, { "epoch": 0.24564129575981036, "grad_norm": 2.7776761054992676, "learning_rate": 3.772715301553859e-05, "loss": 1.0065, "step": 9327 }, { "epoch": 0.2456676323413221, "grad_norm": 1.987105369567871, "learning_rate": 3.7725836186462996e-05, "loss": 1.8912, "step": 
9328 }, { "epoch": 0.24569396892283382, "grad_norm": 2.3901798725128174, "learning_rate": 3.772451935738741e-05, "loss": 1.5387, "step": 9329 }, { "epoch": 0.24572030550434554, "grad_norm": 2.1114540100097656, "learning_rate": 3.772320252831183e-05, "loss": 0.4655, "step": 9330 }, { "epoch": 0.24574664208585725, "grad_norm": 2.5467770099639893, "learning_rate": 3.772188569923624e-05, "loss": 0.6255, "step": 9331 }, { "epoch": 0.24577297866736897, "grad_norm": 3.1845436096191406, "learning_rate": 3.772056887016066e-05, "loss": 1.7654, "step": 9332 }, { "epoch": 0.24579931524888068, "grad_norm": 3.4342520236968994, "learning_rate": 3.771925204108507e-05, "loss": 1.8115, "step": 9333 }, { "epoch": 0.24582565183039243, "grad_norm": 2.281667470932007, "learning_rate": 3.771793521200948e-05, "loss": 2.0688, "step": 9334 }, { "epoch": 0.24585198841190414, "grad_norm": 2.334261894226074, "learning_rate": 3.77166183829339e-05, "loss": 1.7786, "step": 9335 }, { "epoch": 0.24587832499341586, "grad_norm": 1.9963724613189697, "learning_rate": 3.7715301553858314e-05, "loss": 1.7811, "step": 9336 }, { "epoch": 0.24590466157492757, "grad_norm": 4.233668327331543, "learning_rate": 3.771398472478272e-05, "loss": 0.9255, "step": 9337 }, { "epoch": 0.2459309981564393, "grad_norm": 3.008275270462036, "learning_rate": 3.771266789570714e-05, "loss": 1.4952, "step": 9338 }, { "epoch": 0.245957334737951, "grad_norm": 3.359005928039551, "learning_rate": 3.771135106663155e-05, "loss": 1.4314, "step": 9339 }, { "epoch": 0.24598367131946272, "grad_norm": 2.820333242416382, "learning_rate": 3.771003423755597e-05, "loss": 1.3702, "step": 9340 }, { "epoch": 0.24601000790097446, "grad_norm": 1.56570303440094, "learning_rate": 3.770871740848038e-05, "loss": 2.0693, "step": 9341 }, { "epoch": 0.24603634448248618, "grad_norm": 2.0925402641296387, "learning_rate": 3.7707400579404794e-05, "loss": 1.7175, "step": 9342 }, { "epoch": 0.2460626810639979, "grad_norm": 2.139431953430176, "learning_rate": 
3.770608375032921e-05, "loss": 2.2563, "step": 9343 }, { "epoch": 0.2460890176455096, "grad_norm": 2.698939323425293, "learning_rate": 3.770476692125362e-05, "loss": 1.1589, "step": 9344 }, { "epoch": 0.24611535422702133, "grad_norm": 2.2857675552368164, "learning_rate": 3.770345009217804e-05, "loss": 2.3414, "step": 9345 }, { "epoch": 0.24614169080853304, "grad_norm": 2.204475164413452, "learning_rate": 3.770213326310245e-05, "loss": 1.7499, "step": 9346 }, { "epoch": 0.24616802739004479, "grad_norm": 1.9875195026397705, "learning_rate": 3.7700816434026865e-05, "loss": 2.0305, "step": 9347 }, { "epoch": 0.2461943639715565, "grad_norm": 1.8272719383239746, "learning_rate": 3.7699499604951274e-05, "loss": 2.3956, "step": 9348 }, { "epoch": 0.24622070055306822, "grad_norm": 2.7291312217712402, "learning_rate": 3.7698182775875697e-05, "loss": 0.7569, "step": 9349 }, { "epoch": 0.24624703713457993, "grad_norm": 1.694484829902649, "learning_rate": 3.7696865946800105e-05, "loss": 1.3337, "step": 9350 }, { "epoch": 0.24627337371609165, "grad_norm": 1.9520087242126465, "learning_rate": 3.769554911772452e-05, "loss": 1.8412, "step": 9351 }, { "epoch": 0.24629971029760336, "grad_norm": 1.5728037357330322, "learning_rate": 3.7694232288648937e-05, "loss": 1.6443, "step": 9352 }, { "epoch": 0.24632604687911508, "grad_norm": 2.5613174438476562, "learning_rate": 3.7692915459573345e-05, "loss": 2.3595, "step": 9353 }, { "epoch": 0.24635238346062682, "grad_norm": 2.300185203552246, "learning_rate": 3.769159863049777e-05, "loss": 1.8731, "step": 9354 }, { "epoch": 0.24637872004213854, "grad_norm": 1.8807735443115234, "learning_rate": 3.769028180142218e-05, "loss": 1.4396, "step": 9355 }, { "epoch": 0.24640505662365025, "grad_norm": 1.9244358539581299, "learning_rate": 3.768896497234659e-05, "loss": 1.6437, "step": 9356 }, { "epoch": 0.24643139320516197, "grad_norm": 1.8906482458114624, "learning_rate": 3.7687648143271e-05, "loss": 1.3937, "step": 9357 }, { "epoch": 
0.24645772978667368, "grad_norm": 1.834443211555481, "learning_rate": 3.768633131419542e-05, "loss": 2.2964, "step": 9358 }, { "epoch": 0.2464840663681854, "grad_norm": 1.5918519496917725, "learning_rate": 3.768501448511983e-05, "loss": 1.6159, "step": 9359 }, { "epoch": 0.24651040294969712, "grad_norm": 3.436182737350464, "learning_rate": 3.768369765604425e-05, "loss": 1.4593, "step": 9360 }, { "epoch": 0.24653673953120886, "grad_norm": 2.331357479095459, "learning_rate": 3.7682380826968663e-05, "loss": 1.7932, "step": 9361 }, { "epoch": 0.24656307611272058, "grad_norm": 3.6024861335754395, "learning_rate": 3.768106399789307e-05, "loss": 1.0316, "step": 9362 }, { "epoch": 0.2465894126942323, "grad_norm": 3.5238559246063232, "learning_rate": 3.7679747168817495e-05, "loss": 2.0608, "step": 9363 }, { "epoch": 0.246615749275744, "grad_norm": 1.7826991081237793, "learning_rate": 3.7678430339741903e-05, "loss": 1.5082, "step": 9364 }, { "epoch": 0.24664208585725572, "grad_norm": 1.9705603122711182, "learning_rate": 3.767711351066632e-05, "loss": 2.4939, "step": 9365 }, { "epoch": 0.24666842243876744, "grad_norm": 2.5866169929504395, "learning_rate": 3.767579668159073e-05, "loss": 0.7636, "step": 9366 }, { "epoch": 0.24669475902027918, "grad_norm": 1.7340705394744873, "learning_rate": 3.7674479852515143e-05, "loss": 2.1591, "step": 9367 }, { "epoch": 0.2467210956017909, "grad_norm": 4.0733232498168945, "learning_rate": 3.767316302343956e-05, "loss": 0.7121, "step": 9368 }, { "epoch": 0.2467474321833026, "grad_norm": 2.3712146282196045, "learning_rate": 3.7671846194363975e-05, "loss": 1.928, "step": 9369 }, { "epoch": 0.24677376876481433, "grad_norm": 3.0647618770599365, "learning_rate": 3.767052936528839e-05, "loss": 1.9315, "step": 9370 }, { "epoch": 0.24680010534632604, "grad_norm": 1.8452814817428589, "learning_rate": 3.76692125362128e-05, "loss": 1.6652, "step": 9371 }, { "epoch": 0.24682644192783776, "grad_norm": 2.4918529987335205, "learning_rate": 
3.7667895707137215e-05, "loss": 2.1092, "step": 9372 }, { "epoch": 0.24685277850934947, "grad_norm": 2.1917600631713867, "learning_rate": 3.766657887806163e-05, "loss": 1.9167, "step": 9373 }, { "epoch": 0.24687911509086122, "grad_norm": 2.591287136077881, "learning_rate": 3.7665262048986046e-05, "loss": 1.2811, "step": 9374 }, { "epoch": 0.24690545167237293, "grad_norm": 1.6492140293121338, "learning_rate": 3.7663945219910455e-05, "loss": 1.9454, "step": 9375 }, { "epoch": 0.24693178825388465, "grad_norm": 2.2528035640716553, "learning_rate": 3.766262839083487e-05, "loss": 1.9057, "step": 9376 }, { "epoch": 0.24695812483539636, "grad_norm": 2.1465160846710205, "learning_rate": 3.7661311561759286e-05, "loss": 1.4572, "step": 9377 }, { "epoch": 0.24698446141690808, "grad_norm": 1.577000617980957, "learning_rate": 3.76599947326837e-05, "loss": 1.4408, "step": 9378 }, { "epoch": 0.2470107979984198, "grad_norm": 2.171461343765259, "learning_rate": 3.765867790360812e-05, "loss": 2.0906, "step": 9379 }, { "epoch": 0.2470371345799315, "grad_norm": 2.246291399002075, "learning_rate": 3.7657361074532526e-05, "loss": 1.9148, "step": 9380 }, { "epoch": 0.24706347116144325, "grad_norm": 4.294859886169434, "learning_rate": 3.765604424545694e-05, "loss": 2.1422, "step": 9381 }, { "epoch": 0.24708980774295497, "grad_norm": 2.15989089012146, "learning_rate": 3.765472741638136e-05, "loss": 1.3629, "step": 9382 }, { "epoch": 0.24711614432446669, "grad_norm": 2.1761250495910645, "learning_rate": 3.765341058730577e-05, "loss": 1.8728, "step": 9383 }, { "epoch": 0.2471424809059784, "grad_norm": 3.8124423027038574, "learning_rate": 3.765209375823018e-05, "loss": 1.8839, "step": 9384 }, { "epoch": 0.24716881748749012, "grad_norm": 1.5608952045440674, "learning_rate": 3.76507769291546e-05, "loss": 1.9388, "step": 9385 }, { "epoch": 0.24719515406900183, "grad_norm": 2.71919584274292, "learning_rate": 3.7649460100079006e-05, "loss": 1.3599, "step": 9386 }, { "epoch": 0.24722149065051358, 
"grad_norm": 4.256839752197266, "learning_rate": 3.764814327100343e-05, "loss": 2.4145, "step": 9387 }, { "epoch": 0.2472478272320253, "grad_norm": 1.5942931175231934, "learning_rate": 3.764682644192784e-05, "loss": 1.742, "step": 9388 }, { "epoch": 0.247274163813537, "grad_norm": 2.4733927249908447, "learning_rate": 3.764550961285225e-05, "loss": 2.5946, "step": 9389 }, { "epoch": 0.24730050039504872, "grad_norm": 4.844635486602783, "learning_rate": 3.764419278377667e-05, "loss": 2.207, "step": 9390 }, { "epoch": 0.24732683697656044, "grad_norm": 1.9164692163467407, "learning_rate": 3.764287595470108e-05, "loss": 0.1823, "step": 9391 }, { "epoch": 0.24735317355807215, "grad_norm": 2.549802541732788, "learning_rate": 3.76415591256255e-05, "loss": 1.7088, "step": 9392 }, { "epoch": 0.24737951013958387, "grad_norm": 3.6922531127929688, "learning_rate": 3.764024229654991e-05, "loss": 1.4142, "step": 9393 }, { "epoch": 0.2474058467210956, "grad_norm": 2.3875904083251953, "learning_rate": 3.7638925467474324e-05, "loss": 2.0409, "step": 9394 }, { "epoch": 0.24743218330260733, "grad_norm": 3.4685134887695312, "learning_rate": 3.763760863839873e-05, "loss": 0.9303, "step": 9395 }, { "epoch": 0.24745851988411904, "grad_norm": 1.6364049911499023, "learning_rate": 3.7636291809323155e-05, "loss": 1.8949, "step": 9396 }, { "epoch": 0.24748485646563076, "grad_norm": 1.799635648727417, "learning_rate": 3.7634974980247564e-05, "loss": 1.9883, "step": 9397 }, { "epoch": 0.24751119304714247, "grad_norm": 2.8206584453582764, "learning_rate": 3.763365815117198e-05, "loss": 0.9572, "step": 9398 }, { "epoch": 0.2475375296286542, "grad_norm": 1.947456955909729, "learning_rate": 3.7632341322096395e-05, "loss": 1.4944, "step": 9399 }, { "epoch": 0.24756386621016593, "grad_norm": 1.634972095489502, "learning_rate": 3.7631024493020804e-05, "loss": 1.7038, "step": 9400 }, { "epoch": 0.24759020279167765, "grad_norm": 2.0548524856567383, "learning_rate": 3.7629707663945226e-05, "loss": 1.1977, 
"step": 9401 }, { "epoch": 0.24761653937318936, "grad_norm": 3.009627103805542, "learning_rate": 3.7628390834869635e-05, "loss": 1.3542, "step": 9402 }, { "epoch": 0.24764287595470108, "grad_norm": 2.9424219131469727, "learning_rate": 3.762707400579405e-05, "loss": 2.4282, "step": 9403 }, { "epoch": 0.2476692125362128, "grad_norm": 2.1092593669891357, "learning_rate": 3.762575717671846e-05, "loss": 1.0205, "step": 9404 }, { "epoch": 0.2476955491177245, "grad_norm": 2.342179536819458, "learning_rate": 3.7624440347642875e-05, "loss": 2.0314, "step": 9405 }, { "epoch": 0.24772188569923623, "grad_norm": 6.2738118171691895, "learning_rate": 3.762312351856729e-05, "loss": 1.4511, "step": 9406 }, { "epoch": 0.24774822228074797, "grad_norm": 3.217378854751587, "learning_rate": 3.7621806689491706e-05, "loss": 2.4851, "step": 9407 }, { "epoch": 0.2477745588622597, "grad_norm": 2.0456595420837402, "learning_rate": 3.762048986041612e-05, "loss": 1.6556, "step": 9408 }, { "epoch": 0.2478008954437714, "grad_norm": 2.476691961288452, "learning_rate": 3.761917303134053e-05, "loss": 1.9022, "step": 9409 }, { "epoch": 0.24782723202528312, "grad_norm": 1.6512302160263062, "learning_rate": 3.761785620226495e-05, "loss": 1.5487, "step": 9410 }, { "epoch": 0.24785356860679483, "grad_norm": 4.01853609085083, "learning_rate": 3.761653937318936e-05, "loss": 1.6983, "step": 9411 }, { "epoch": 0.24787990518830655, "grad_norm": 2.401923656463623, "learning_rate": 3.761522254411378e-05, "loss": 1.3442, "step": 9412 }, { "epoch": 0.24790624176981826, "grad_norm": 1.6326779127120972, "learning_rate": 3.7613905715038186e-05, "loss": 2.4835, "step": 9413 }, { "epoch": 0.24793257835133, "grad_norm": 1.719183325767517, "learning_rate": 3.76125888859626e-05, "loss": 2.3692, "step": 9414 }, { "epoch": 0.24795891493284172, "grad_norm": 3.0368611812591553, "learning_rate": 3.761127205688702e-05, "loss": 1.5506, "step": 9415 }, { "epoch": 0.24798525151435344, "grad_norm": 1.6918164491653442, 
"learning_rate": 3.760995522781143e-05, "loss": 1.6433, "step": 9416 }, { "epoch": 0.24801158809586515, "grad_norm": 3.1472530364990234, "learning_rate": 3.760863839873585e-05, "loss": 1.3656, "step": 9417 }, { "epoch": 0.24803792467737687, "grad_norm": 2.1720311641693115, "learning_rate": 3.760732156966026e-05, "loss": 1.9507, "step": 9418 }, { "epoch": 0.24806426125888859, "grad_norm": 2.212719440460205, "learning_rate": 3.760600474058467e-05, "loss": 2.0337, "step": 9419 }, { "epoch": 0.24809059784040033, "grad_norm": 3.5032010078430176, "learning_rate": 3.760468791150909e-05, "loss": 1.3772, "step": 9420 }, { "epoch": 0.24811693442191204, "grad_norm": 2.362844467163086, "learning_rate": 3.7603371082433504e-05, "loss": 1.7551, "step": 9421 }, { "epoch": 0.24814327100342376, "grad_norm": 3.3651692867279053, "learning_rate": 3.760205425335791e-05, "loss": 1.0103, "step": 9422 }, { "epoch": 0.24816960758493548, "grad_norm": 4.137308597564697, "learning_rate": 3.760073742428233e-05, "loss": 1.4831, "step": 9423 }, { "epoch": 0.2481959441664472, "grad_norm": 2.5527591705322266, "learning_rate": 3.7599420595206744e-05, "loss": 1.2846, "step": 9424 }, { "epoch": 0.2482222807479589, "grad_norm": 1.8308279514312744, "learning_rate": 3.759810376613116e-05, "loss": 2.2615, "step": 9425 }, { "epoch": 0.24824861732947062, "grad_norm": 3.7004990577697754, "learning_rate": 3.7596786937055576e-05, "loss": 1.0325, "step": 9426 }, { "epoch": 0.24827495391098237, "grad_norm": 2.8788669109344482, "learning_rate": 3.7595470107979984e-05, "loss": 0.7264, "step": 9427 }, { "epoch": 0.24830129049249408, "grad_norm": 1.7954556941986084, "learning_rate": 3.75941532789044e-05, "loss": 0.7045, "step": 9428 }, { "epoch": 0.2483276270740058, "grad_norm": 2.058328628540039, "learning_rate": 3.7592836449828816e-05, "loss": 1.9194, "step": 9429 }, { "epoch": 0.2483539636555175, "grad_norm": 2.114301919937134, "learning_rate": 3.759151962075323e-05, "loss": 1.9982, "step": 9430 }, { "epoch": 
0.24838030023702923, "grad_norm": 3.6203978061676025, "learning_rate": 3.759020279167764e-05, "loss": 0.8585, "step": 9431 }, { "epoch": 0.24840663681854094, "grad_norm": 2.9751482009887695, "learning_rate": 3.7588885962602056e-05, "loss": 1.8626, "step": 9432 }, { "epoch": 0.2484329734000527, "grad_norm": 1.9818267822265625, "learning_rate": 3.758756913352647e-05, "loss": 2.6741, "step": 9433 }, { "epoch": 0.2484593099815644, "grad_norm": 1.5844560861587524, "learning_rate": 3.758625230445089e-05, "loss": 1.3837, "step": 9434 }, { "epoch": 0.24848564656307612, "grad_norm": 1.6429064273834229, "learning_rate": 3.75849354753753e-05, "loss": 1.4993, "step": 9435 }, { "epoch": 0.24851198314458783, "grad_norm": 1.9596023559570312, "learning_rate": 3.758361864629971e-05, "loss": 1.4399, "step": 9436 }, { "epoch": 0.24853831972609955, "grad_norm": 3.07328462600708, "learning_rate": 3.758230181722413e-05, "loss": 1.4072, "step": 9437 }, { "epoch": 0.24856465630761126, "grad_norm": 2.0965077877044678, "learning_rate": 3.7580984988148536e-05, "loss": 1.9421, "step": 9438 }, { "epoch": 0.24859099288912298, "grad_norm": 3.9975225925445557, "learning_rate": 3.757966815907296e-05, "loss": 2.1747, "step": 9439 }, { "epoch": 0.24861732947063472, "grad_norm": 2.217804193496704, "learning_rate": 3.757835132999737e-05, "loss": 1.4465, "step": 9440 }, { "epoch": 0.24864366605214644, "grad_norm": 2.4609575271606445, "learning_rate": 3.757703450092178e-05, "loss": 0.7346, "step": 9441 }, { "epoch": 0.24867000263365815, "grad_norm": 3.542363166809082, "learning_rate": 3.757571767184619e-05, "loss": 1.7552, "step": 9442 }, { "epoch": 0.24869633921516987, "grad_norm": 1.7418606281280518, "learning_rate": 3.7574400842770614e-05, "loss": 1.722, "step": 9443 }, { "epoch": 0.24872267579668159, "grad_norm": 2.213932752609253, "learning_rate": 3.757308401369502e-05, "loss": 1.5757, "step": 9444 }, { "epoch": 0.2487490123781933, "grad_norm": 3.2281250953674316, "learning_rate": 
3.757176718461944e-05, "loss": 1.6851, "step": 9445 }, { "epoch": 0.24877534895970502, "grad_norm": 4.151186943054199, "learning_rate": 3.7570450355543854e-05, "loss": 1.1654, "step": 9446 }, { "epoch": 0.24880168554121676, "grad_norm": 4.256567001342773, "learning_rate": 3.756913352646826e-05, "loss": 1.1562, "step": 9447 }, { "epoch": 0.24882802212272848, "grad_norm": 2.671936273574829, "learning_rate": 3.7567816697392685e-05, "loss": 1.7515, "step": 9448 }, { "epoch": 0.2488543587042402, "grad_norm": 2.370760202407837, "learning_rate": 3.7566499868317094e-05, "loss": 2.0849, "step": 9449 }, { "epoch": 0.2488806952857519, "grad_norm": 2.531735420227051, "learning_rate": 3.756518303924151e-05, "loss": 1.6962, "step": 9450 }, { "epoch": 0.24890703186726362, "grad_norm": 1.6980692148208618, "learning_rate": 3.756386621016592e-05, "loss": 1.6919, "step": 9451 }, { "epoch": 0.24893336844877534, "grad_norm": 1.8127738237380981, "learning_rate": 3.7562549381090334e-05, "loss": 1.5787, "step": 9452 }, { "epoch": 0.24895970503028708, "grad_norm": 1.7807490825653076, "learning_rate": 3.756123255201475e-05, "loss": 1.33, "step": 9453 }, { "epoch": 0.2489860416117988, "grad_norm": 2.3240952491760254, "learning_rate": 3.7559915722939165e-05, "loss": 1.7208, "step": 9454 }, { "epoch": 0.2490123781933105, "grad_norm": 1.6140258312225342, "learning_rate": 3.755859889386358e-05, "loss": 1.3656, "step": 9455 }, { "epoch": 0.24903871477482223, "grad_norm": 3.2601892948150635, "learning_rate": 3.755728206478799e-05, "loss": 1.7914, "step": 9456 }, { "epoch": 0.24906505135633394, "grad_norm": 1.826035737991333, "learning_rate": 3.7555965235712405e-05, "loss": 1.5036, "step": 9457 }, { "epoch": 0.24909138793784566, "grad_norm": 1.7450965642929077, "learning_rate": 3.755464840663682e-05, "loss": 2.0786, "step": 9458 }, { "epoch": 0.24911772451935738, "grad_norm": 3.048736572265625, "learning_rate": 3.7553331577561236e-05, "loss": 2.1121, "step": 9459 }, { "epoch": 0.24914406110086912, 
"grad_norm": 2.082838773727417, "learning_rate": 3.7552014748485645e-05, "loss": 1.9109, "step": 9460 }, { "epoch": 0.24917039768238083, "grad_norm": 2.270730495452881, "learning_rate": 3.755069791941006e-05, "loss": 1.7991, "step": 9461 }, { "epoch": 0.24919673426389255, "grad_norm": 3.197336196899414, "learning_rate": 3.7549381090334476e-05, "loss": 1.6597, "step": 9462 }, { "epoch": 0.24922307084540427, "grad_norm": 1.5741242170333862, "learning_rate": 3.754806426125889e-05, "loss": 0.2953, "step": 9463 }, { "epoch": 0.24924940742691598, "grad_norm": 2.0129241943359375, "learning_rate": 3.754674743218331e-05, "loss": 2.0393, "step": 9464 }, { "epoch": 0.2492757440084277, "grad_norm": 3.4816834926605225, "learning_rate": 3.7545430603107716e-05, "loss": 1.2914, "step": 9465 }, { "epoch": 0.2493020805899394, "grad_norm": 3.7552454471588135, "learning_rate": 3.754411377403213e-05, "loss": 1.5201, "step": 9466 }, { "epoch": 0.24932841717145116, "grad_norm": 2.8539235591888428, "learning_rate": 3.754279694495655e-05, "loss": 1.7869, "step": 9467 }, { "epoch": 0.24935475375296287, "grad_norm": 2.0584511756896973, "learning_rate": 3.754148011588096e-05, "loss": 1.1935, "step": 9468 }, { "epoch": 0.2493810903344746, "grad_norm": 2.09688663482666, "learning_rate": 3.754016328680537e-05, "loss": 1.6549, "step": 9469 }, { "epoch": 0.2494074269159863, "grad_norm": 2.091367244720459, "learning_rate": 3.753884645772979e-05, "loss": 1.7702, "step": 9470 }, { "epoch": 0.24943376349749802, "grad_norm": 5.105950832366943, "learning_rate": 3.75375296286542e-05, "loss": 2.0232, "step": 9471 }, { "epoch": 0.24946010007900973, "grad_norm": 2.230776309967041, "learning_rate": 3.753621279957862e-05, "loss": 1.4869, "step": 9472 }, { "epoch": 0.24948643666052148, "grad_norm": 1.6570568084716797, "learning_rate": 3.7534895970503034e-05, "loss": 1.9204, "step": 9473 }, { "epoch": 0.2495127732420332, "grad_norm": 3.0584867000579834, "learning_rate": 3.753357914142744e-05, "loss": 1.9227, 
"step": 9474 }, { "epoch": 0.2495391098235449, "grad_norm": 4.87863302230835, "learning_rate": 3.753226231235186e-05, "loss": 1.4147, "step": 9475 }, { "epoch": 0.24956544640505662, "grad_norm": 1.9131696224212646, "learning_rate": 3.7530945483276274e-05, "loss": 0.7294, "step": 9476 }, { "epoch": 0.24959178298656834, "grad_norm": 1.7261526584625244, "learning_rate": 3.752962865420069e-05, "loss": 2.2689, "step": 9477 }, { "epoch": 0.24961811956808005, "grad_norm": 1.9611601829528809, "learning_rate": 3.75283118251251e-05, "loss": 1.9983, "step": 9478 }, { "epoch": 0.24964445614959177, "grad_norm": 1.7864912748336792, "learning_rate": 3.7526994996049514e-05, "loss": 1.8251, "step": 9479 }, { "epoch": 0.2496707927311035, "grad_norm": 2.0357465744018555, "learning_rate": 3.752567816697393e-05, "loss": 1.3445, "step": 9480 }, { "epoch": 0.24969712931261523, "grad_norm": 2.4738476276397705, "learning_rate": 3.7524361337898345e-05, "loss": 2.1068, "step": 9481 }, { "epoch": 0.24972346589412694, "grad_norm": 2.471388339996338, "learning_rate": 3.752304450882276e-05, "loss": 1.8538, "step": 9482 }, { "epoch": 0.24974980247563866, "grad_norm": 2.5244197845458984, "learning_rate": 3.752172767974717e-05, "loss": 0.9427, "step": 9483 }, { "epoch": 0.24977613905715038, "grad_norm": 2.4803152084350586, "learning_rate": 3.7520410850671585e-05, "loss": 1.8732, "step": 9484 }, { "epoch": 0.2498024756386621, "grad_norm": 1.862506628036499, "learning_rate": 3.7519094021595994e-05, "loss": 1.8955, "step": 9485 }, { "epoch": 0.24982881222017383, "grad_norm": 3.2220823764801025, "learning_rate": 3.751777719252042e-05, "loss": 0.7518, "step": 9486 }, { "epoch": 0.24985514880168555, "grad_norm": 1.572978138923645, "learning_rate": 3.7516460363444825e-05, "loss": 1.4945, "step": 9487 }, { "epoch": 0.24988148538319727, "grad_norm": 2.6795876026153564, "learning_rate": 3.751514353436924e-05, "loss": 1.8778, "step": 9488 }, { "epoch": 0.24990782196470898, "grad_norm": 3.7850356101989746, 
"learning_rate": 3.751382670529365e-05, "loss": 2.5065, "step": 9489 }, { "epoch": 0.2499341585462207, "grad_norm": 1.7539210319519043, "learning_rate": 3.7512509876218065e-05, "loss": 2.0139, "step": 9490 }, { "epoch": 0.2499604951277324, "grad_norm": 6.198868751525879, "learning_rate": 3.751119304714248e-05, "loss": 1.4639, "step": 9491 }, { "epoch": 0.24998683170924413, "grad_norm": 1.6415857076644897, "learning_rate": 3.75098762180669e-05, "loss": 2.0316, "step": 9492 }, { "epoch": 0.25001316829075587, "grad_norm": 4.67836856842041, "learning_rate": 3.750855938899131e-05, "loss": 0.8169, "step": 9493 }, { "epoch": 0.25003950487226756, "grad_norm": 1.7568237781524658, "learning_rate": 3.750724255991572e-05, "loss": 0.6639, "step": 9494 }, { "epoch": 0.2500658414537793, "grad_norm": 3.358640432357788, "learning_rate": 3.7505925730840143e-05, "loss": 1.4922, "step": 9495 }, { "epoch": 0.25009217803529105, "grad_norm": 2.1151347160339355, "learning_rate": 3.750460890176455e-05, "loss": 1.4312, "step": 9496 }, { "epoch": 0.25011851461680273, "grad_norm": 3.34977126121521, "learning_rate": 3.750329207268897e-05, "loss": 1.6327, "step": 9497 }, { "epoch": 0.2501448511983145, "grad_norm": 4.54311466217041, "learning_rate": 3.750197524361338e-05, "loss": 1.1835, "step": 9498 }, { "epoch": 0.25017118777982617, "grad_norm": 1.5967661142349243, "learning_rate": 3.750065841453779e-05, "loss": 1.7695, "step": 9499 }, { "epoch": 0.2501975243613379, "grad_norm": 2.2285587787628174, "learning_rate": 3.749934158546221e-05, "loss": 1.961, "step": 9500 }, { "epoch": 0.2502238609428496, "grad_norm": 3.322103500366211, "learning_rate": 3.7498024756386623e-05, "loss": 1.1913, "step": 9501 }, { "epoch": 0.25025019752436134, "grad_norm": 2.247872829437256, "learning_rate": 3.749670792731104e-05, "loss": 1.9805, "step": 9502 }, { "epoch": 0.2502765341058731, "grad_norm": 2.0715060234069824, "learning_rate": 3.749539109823545e-05, "loss": 1.7396, "step": 9503 }, { "epoch": 
0.25030287068738477, "grad_norm": 2.527590036392212, "learning_rate": 3.7494074269159864e-05, "loss": 1.3825, "step": 9504 }, { "epoch": 0.2503292072688965, "grad_norm": 2.449042797088623, "learning_rate": 3.749275744008428e-05, "loss": 1.204, "step": 9505 }, { "epoch": 0.2503555438504082, "grad_norm": 2.8567423820495605, "learning_rate": 3.7491440611008695e-05, "loss": 0.7857, "step": 9506 }, { "epoch": 0.25038188043191995, "grad_norm": 1.9735890626907349, "learning_rate": 3.7490123781933104e-05, "loss": 2.0595, "step": 9507 }, { "epoch": 0.25040821701343163, "grad_norm": 2.7401175498962402, "learning_rate": 3.748880695285752e-05, "loss": 1.4095, "step": 9508 }, { "epoch": 0.2504345535949434, "grad_norm": 1.7880487442016602, "learning_rate": 3.7487490123781935e-05, "loss": 0.6231, "step": 9509 }, { "epoch": 0.2504608901764551, "grad_norm": 2.08772873878479, "learning_rate": 3.748617329470635e-05, "loss": 1.8496, "step": 9510 }, { "epoch": 0.2504872267579668, "grad_norm": 2.7481024265289307, "learning_rate": 3.7484856465630766e-05, "loss": 1.524, "step": 9511 }, { "epoch": 0.25051356333947855, "grad_norm": 2.2460877895355225, "learning_rate": 3.7483539636555175e-05, "loss": 1.8958, "step": 9512 }, { "epoch": 0.25053989992099024, "grad_norm": 2.1388838291168213, "learning_rate": 3.748222280747959e-05, "loss": 1.9058, "step": 9513 }, { "epoch": 0.250566236502502, "grad_norm": 2.4700698852539062, "learning_rate": 3.7480905978404006e-05, "loss": 2.0323, "step": 9514 }, { "epoch": 0.25059257308401367, "grad_norm": 1.7383674383163452, "learning_rate": 3.747958914932842e-05, "loss": 1.5088, "step": 9515 }, { "epoch": 0.2506189096655254, "grad_norm": 1.8023300170898438, "learning_rate": 3.747827232025283e-05, "loss": 1.7283, "step": 9516 }, { "epoch": 0.25064524624703716, "grad_norm": 2.087075710296631, "learning_rate": 3.7476955491177246e-05, "loss": 0.9805, "step": 9517 }, { "epoch": 0.25067158282854884, "grad_norm": 6.363293170928955, "learning_rate": 
3.747563866210166e-05, "loss": 2.4237, "step": 9518 }, { "epoch": 0.2506979194100606, "grad_norm": 1.7582508325576782, "learning_rate": 3.747432183302608e-05, "loss": 1.9084, "step": 9519 }, { "epoch": 0.2507242559915723, "grad_norm": 1.5304358005523682, "learning_rate": 3.747300500395049e-05, "loss": 1.7855, "step": 9520 }, { "epoch": 0.250750592573084, "grad_norm": 1.6910958290100098, "learning_rate": 3.74716881748749e-05, "loss": 1.6028, "step": 9521 }, { "epoch": 0.2507769291545957, "grad_norm": 2.206517219543457, "learning_rate": 3.747037134579932e-05, "loss": 1.2626, "step": 9522 }, { "epoch": 0.25080326573610745, "grad_norm": 2.0970842838287354, "learning_rate": 3.7469054516723726e-05, "loss": 1.7463, "step": 9523 }, { "epoch": 0.2508296023176192, "grad_norm": 3.262721300125122, "learning_rate": 3.746773768764815e-05, "loss": 1.4458, "step": 9524 }, { "epoch": 0.2508559388991309, "grad_norm": 2.794424057006836, "learning_rate": 3.746642085857256e-05, "loss": 0.8474, "step": 9525 }, { "epoch": 0.2508822754806426, "grad_norm": 1.6718446016311646, "learning_rate": 3.746510402949697e-05, "loss": 1.8155, "step": 9526 }, { "epoch": 0.2509086120621543, "grad_norm": 2.027240753173828, "learning_rate": 3.746378720042139e-05, "loss": 2.0762, "step": 9527 }, { "epoch": 0.25093494864366606, "grad_norm": 3.090517520904541, "learning_rate": 3.7462470371345804e-05, "loss": 2.488, "step": 9528 }, { "epoch": 0.2509612852251778, "grad_norm": 2.1532089710235596, "learning_rate": 3.746115354227022e-05, "loss": 2.2541, "step": 9529 }, { "epoch": 0.2509876218066895, "grad_norm": 1.409540057182312, "learning_rate": 3.745983671319463e-05, "loss": 1.1668, "step": 9530 }, { "epoch": 0.25101395838820123, "grad_norm": 1.402719259262085, "learning_rate": 3.7458519884119044e-05, "loss": 1.6927, "step": 9531 }, { "epoch": 0.2510402949697129, "grad_norm": 1.6526787281036377, "learning_rate": 3.745720305504345e-05, "loss": 2.0521, "step": 9532 }, { "epoch": 0.25106663155122466, "grad_norm": 
1.7547125816345215, "learning_rate": 3.7455886225967875e-05, "loss": 1.0086, "step": 9533 }, { "epoch": 0.25109296813273635, "grad_norm": 1.8315263986587524, "learning_rate": 3.7454569396892284e-05, "loss": 1.2294, "step": 9534 }, { "epoch": 0.2511193047142481, "grad_norm": 2.541095018386841, "learning_rate": 3.74532525678167e-05, "loss": 0.7088, "step": 9535 }, { "epoch": 0.25114564129575984, "grad_norm": 3.2906458377838135, "learning_rate": 3.7451935738741115e-05, "loss": 1.787, "step": 9536 }, { "epoch": 0.2511719778772715, "grad_norm": 3.915858030319214, "learning_rate": 3.7450618909665524e-05, "loss": 1.5705, "step": 9537 }, { "epoch": 0.25119831445878327, "grad_norm": 2.6768863201141357, "learning_rate": 3.7449302080589946e-05, "loss": 1.7437, "step": 9538 }, { "epoch": 0.25122465104029496, "grad_norm": 1.751901626586914, "learning_rate": 3.7447985251514355e-05, "loss": 1.4041, "step": 9539 }, { "epoch": 0.2512509876218067, "grad_norm": 1.7146458625793457, "learning_rate": 3.744666842243877e-05, "loss": 1.5067, "step": 9540 }, { "epoch": 0.2512773242033184, "grad_norm": 2.255998373031616, "learning_rate": 3.744535159336318e-05, "loss": 2.035, "step": 9541 }, { "epoch": 0.25130366078483013, "grad_norm": 2.358640193939209, "learning_rate": 3.74440347642876e-05, "loss": 1.7264, "step": 9542 }, { "epoch": 0.2513299973663419, "grad_norm": 2.8867604732513428, "learning_rate": 3.744271793521201e-05, "loss": 1.3457, "step": 9543 }, { "epoch": 0.25135633394785356, "grad_norm": 3.680968999862671, "learning_rate": 3.7441401106136426e-05, "loss": 1.1702, "step": 9544 }, { "epoch": 0.2513826705293653, "grad_norm": 1.8887977600097656, "learning_rate": 3.7440084277060835e-05, "loss": 1.3718, "step": 9545 }, { "epoch": 0.251409007110877, "grad_norm": 1.5648424625396729, "learning_rate": 3.743876744798525e-05, "loss": 1.6228, "step": 9546 }, { "epoch": 0.25143534369238874, "grad_norm": 3.817568778991699, "learning_rate": 3.7437450618909666e-05, "loss": 1.0193, "step": 9547 }, 
{ "epoch": 0.2514616802739004, "grad_norm": 1.48291015625, "learning_rate": 3.743613378983408e-05, "loss": 1.6568, "step": 9548 }, { "epoch": 0.25148801685541217, "grad_norm": 1.7690905332565308, "learning_rate": 3.74348169607585e-05, "loss": 1.854, "step": 9549 }, { "epoch": 0.2515143534369239, "grad_norm": 2.000364303588867, "learning_rate": 3.7433500131682906e-05, "loss": 1.9702, "step": 9550 }, { "epoch": 0.2515406900184356, "grad_norm": 3.544632911682129, "learning_rate": 3.743218330260732e-05, "loss": 1.2276, "step": 9551 }, { "epoch": 0.25156702659994734, "grad_norm": 1.84476637840271, "learning_rate": 3.743086647353174e-05, "loss": 1.7679, "step": 9552 }, { "epoch": 0.25159336318145903, "grad_norm": 2.341996908187866, "learning_rate": 3.742954964445615e-05, "loss": 1.7968, "step": 9553 }, { "epoch": 0.25161969976297077, "grad_norm": 2.355987548828125, "learning_rate": 3.742823281538056e-05, "loss": 2.08, "step": 9554 }, { "epoch": 0.25164603634448246, "grad_norm": 1.776052713394165, "learning_rate": 3.742691598630498e-05, "loss": 2.1826, "step": 9555 }, { "epoch": 0.2516723729259942, "grad_norm": 2.5412628650665283, "learning_rate": 3.742559915722939e-05, "loss": 1.868, "step": 9556 }, { "epoch": 0.25169870950750595, "grad_norm": 1.9412487745285034, "learning_rate": 3.742428232815381e-05, "loss": 1.9086, "step": 9557 }, { "epoch": 0.25172504608901763, "grad_norm": 2.5037190914154053, "learning_rate": 3.7422965499078224e-05, "loss": 1.6867, "step": 9558 }, { "epoch": 0.2517513826705294, "grad_norm": 9.2487154006958, "learning_rate": 3.742164867000263e-05, "loss": 2.3271, "step": 9559 }, { "epoch": 0.25177771925204107, "grad_norm": 3.0891542434692383, "learning_rate": 3.742033184092705e-05, "loss": 1.0216, "step": 9560 }, { "epoch": 0.2518040558335528, "grad_norm": 2.2402870655059814, "learning_rate": 3.7419015011851464e-05, "loss": 1.182, "step": 9561 }, { "epoch": 0.25183039241506455, "grad_norm": 2.1816024780273438, "learning_rate": 3.741769818277588e-05, 
"loss": 1.3848, "step": 9562 }, { "epoch": 0.25185672899657624, "grad_norm": 6.280076026916504, "learning_rate": 3.741638135370029e-05, "loss": 2.5219, "step": 9563 }, { "epoch": 0.251883065578088, "grad_norm": 2.479802131652832, "learning_rate": 3.7415064524624705e-05, "loss": 1.175, "step": 9564 }, { "epoch": 0.25190940215959967, "grad_norm": 2.814124345779419, "learning_rate": 3.741374769554912e-05, "loss": 1.729, "step": 9565 }, { "epoch": 0.2519357387411114, "grad_norm": 2.2586817741394043, "learning_rate": 3.7412430866473536e-05, "loss": 1.9833, "step": 9566 }, { "epoch": 0.2519620753226231, "grad_norm": 2.9755196571350098, "learning_rate": 3.741111403739795e-05, "loss": 1.2308, "step": 9567 }, { "epoch": 0.25198841190413485, "grad_norm": 2.1252360343933105, "learning_rate": 3.740979720832236e-05, "loss": 2.1942, "step": 9568 }, { "epoch": 0.2520147484856466, "grad_norm": 2.2151808738708496, "learning_rate": 3.7408480379246776e-05, "loss": 1.3213, "step": 9569 }, { "epoch": 0.2520410850671583, "grad_norm": 1.918562889099121, "learning_rate": 3.7407163550171185e-05, "loss": 0.3259, "step": 9570 }, { "epoch": 0.25206742164867, "grad_norm": 4.135371208190918, "learning_rate": 3.740584672109561e-05, "loss": 1.8212, "step": 9571 }, { "epoch": 0.2520937582301817, "grad_norm": 1.9333744049072266, "learning_rate": 3.7404529892020016e-05, "loss": 1.7043, "step": 9572 }, { "epoch": 0.25212009481169345, "grad_norm": 1.8396936655044556, "learning_rate": 3.740321306294443e-05, "loss": 2.163, "step": 9573 }, { "epoch": 0.25214643139320514, "grad_norm": 1.914660096168518, "learning_rate": 3.740189623386885e-05, "loss": 1.4764, "step": 9574 }, { "epoch": 0.2521727679747169, "grad_norm": 1.869932770729065, "learning_rate": 3.740057940479326e-05, "loss": 2.1826, "step": 9575 }, { "epoch": 0.2521991045562286, "grad_norm": 1.817205548286438, "learning_rate": 3.739926257571768e-05, "loss": 1.437, "step": 9576 }, { "epoch": 0.2522254411377403, "grad_norm": 2.358226776123047, 
"learning_rate": 3.739794574664209e-05, "loss": 2.1461, "step": 9577 }, { "epoch": 0.25225177771925206, "grad_norm": 3.138655424118042, "learning_rate": 3.73966289175665e-05, "loss": 1.4892, "step": 9578 }, { "epoch": 0.25227811430076374, "grad_norm": 1.862527847290039, "learning_rate": 3.739531208849091e-05, "loss": 0.8819, "step": 9579 }, { "epoch": 0.2523044508822755, "grad_norm": 2.721653461456299, "learning_rate": 3.7393995259415334e-05, "loss": 2.4808, "step": 9580 }, { "epoch": 0.2523307874637872, "grad_norm": 2.8426930904388428, "learning_rate": 3.739267843033974e-05, "loss": 1.6447, "step": 9581 }, { "epoch": 0.2523571240452989, "grad_norm": 2.238645553588867, "learning_rate": 3.739136160126416e-05, "loss": 1.9963, "step": 9582 }, { "epoch": 0.25238346062681066, "grad_norm": 1.9386051893234253, "learning_rate": 3.7390044772188574e-05, "loss": 1.9467, "step": 9583 }, { "epoch": 0.25240979720832235, "grad_norm": 1.8041932582855225, "learning_rate": 3.738872794311298e-05, "loss": 1.802, "step": 9584 }, { "epoch": 0.2524361337898341, "grad_norm": 2.4384052753448486, "learning_rate": 3.7387411114037405e-05, "loss": 1.6788, "step": 9585 }, { "epoch": 0.2524624703713458, "grad_norm": 1.6500581502914429, "learning_rate": 3.7386094284961814e-05, "loss": 1.7741, "step": 9586 }, { "epoch": 0.2524888069528575, "grad_norm": 3.3525314331054688, "learning_rate": 3.738477745588623e-05, "loss": 1.2178, "step": 9587 }, { "epoch": 0.2525151435343692, "grad_norm": 2.4203553199768066, "learning_rate": 3.738346062681064e-05, "loss": 1.2048, "step": 9588 }, { "epoch": 0.25254148011588096, "grad_norm": 1.5846211910247803, "learning_rate": 3.7382143797735054e-05, "loss": 0.3042, "step": 9589 }, { "epoch": 0.2525678166973927, "grad_norm": 2.4709599018096924, "learning_rate": 3.738082696865947e-05, "loss": 1.898, "step": 9590 }, { "epoch": 0.2525941532789044, "grad_norm": 1.626758098602295, "learning_rate": 3.7379510139583885e-05, "loss": 1.7603, "step": 9591 }, { "epoch": 
0.25262048986041613, "grad_norm": 3.4821956157684326, "learning_rate": 3.7378193310508294e-05, "loss": 1.5059, "step": 9592 }, { "epoch": 0.2526468264419278, "grad_norm": 2.0917601585388184, "learning_rate": 3.737687648143271e-05, "loss": 1.7907, "step": 9593 }, { "epoch": 0.25267316302343956, "grad_norm": 2.327014207839966, "learning_rate": 3.7375559652357125e-05, "loss": 2.3821, "step": 9594 }, { "epoch": 0.25269949960495125, "grad_norm": 2.840378999710083, "learning_rate": 3.737424282328154e-05, "loss": 2.8007, "step": 9595 }, { "epoch": 0.252725836186463, "grad_norm": 1.7067112922668457, "learning_rate": 3.7372925994205956e-05, "loss": 1.6312, "step": 9596 }, { "epoch": 0.25275217276797474, "grad_norm": 1.8548580408096313, "learning_rate": 3.7371609165130365e-05, "loss": 1.5074, "step": 9597 }, { "epoch": 0.2527785093494864, "grad_norm": 2.1230478286743164, "learning_rate": 3.737029233605478e-05, "loss": 1.5026, "step": 9598 }, { "epoch": 0.25280484593099817, "grad_norm": 4.288404941558838, "learning_rate": 3.7368975506979196e-05, "loss": 0.8427, "step": 9599 }, { "epoch": 0.25283118251250986, "grad_norm": 2.0781381130218506, "learning_rate": 3.736765867790361e-05, "loss": 1.5195, "step": 9600 }, { "epoch": 0.2528575190940216, "grad_norm": 1.8319816589355469, "learning_rate": 3.736634184882802e-05, "loss": 1.9692, "step": 9601 }, { "epoch": 0.25288385567553334, "grad_norm": 3.4707813262939453, "learning_rate": 3.7365025019752436e-05, "loss": 1.1606, "step": 9602 }, { "epoch": 0.25291019225704503, "grad_norm": 5.787014007568359, "learning_rate": 3.736370819067685e-05, "loss": 1.603, "step": 9603 }, { "epoch": 0.2529365288385568, "grad_norm": 2.4565348625183105, "learning_rate": 3.736239136160127e-05, "loss": 1.6115, "step": 9604 }, { "epoch": 0.25296286542006846, "grad_norm": 2.6019861698150635, "learning_rate": 3.736107453252568e-05, "loss": 0.6824, "step": 9605 }, { "epoch": 0.2529892020015802, "grad_norm": 1.8150345087051392, "learning_rate": 
3.735975770345009e-05, "loss": 2.243, "step": 9606 }, { "epoch": 0.2530155385830919, "grad_norm": 5.225513458251953, "learning_rate": 3.735844087437451e-05, "loss": 1.6182, "step": 9607 }, { "epoch": 0.25304187516460364, "grad_norm": 2.257249116897583, "learning_rate": 3.735712404529892e-05, "loss": 1.953, "step": 9608 }, { "epoch": 0.2530682117461154, "grad_norm": 4.514675140380859, "learning_rate": 3.735580721622334e-05, "loss": 1.3993, "step": 9609 }, { "epoch": 0.25309454832762707, "grad_norm": 2.362719774246216, "learning_rate": 3.735449038714775e-05, "loss": 0.6299, "step": 9610 }, { "epoch": 0.2531208849091388, "grad_norm": 2.76245379447937, "learning_rate": 3.735317355807216e-05, "loss": 2.4718, "step": 9611 }, { "epoch": 0.2531472214906505, "grad_norm": 2.9407243728637695, "learning_rate": 3.735185672899658e-05, "loss": 1.8316, "step": 9612 }, { "epoch": 0.25317355807216224, "grad_norm": 6.190566539764404, "learning_rate": 3.7350539899920994e-05, "loss": 2.0521, "step": 9613 }, { "epoch": 0.25319989465367393, "grad_norm": 1.9451617002487183, "learning_rate": 3.734922307084541e-05, "loss": 1.8535, "step": 9614 }, { "epoch": 0.2532262312351857, "grad_norm": 1.9362940788269043, "learning_rate": 3.734790624176982e-05, "loss": 1.7743, "step": 9615 }, { "epoch": 0.2532525678166974, "grad_norm": 3.7016282081604004, "learning_rate": 3.7346589412694234e-05, "loss": 0.7037, "step": 9616 }, { "epoch": 0.2532789043982091, "grad_norm": 2.2456562519073486, "learning_rate": 3.734527258361864e-05, "loss": 1.8348, "step": 9617 }, { "epoch": 0.25330524097972085, "grad_norm": 1.6477816104888916, "learning_rate": 3.7343955754543065e-05, "loss": 1.7671, "step": 9618 }, { "epoch": 0.25333157756123253, "grad_norm": 1.6278709173202515, "learning_rate": 3.7342638925467474e-05, "loss": 1.5653, "step": 9619 }, { "epoch": 0.2533579141427443, "grad_norm": 2.5155160427093506, "learning_rate": 3.734132209639189e-05, "loss": 1.5245, "step": 9620 }, { "epoch": 0.25338425072425597, 
"grad_norm": 2.0082337856292725, "learning_rate": 3.7340005267316305e-05, "loss": 1.4992, "step": 9621 }, { "epoch": 0.2534105873057677, "grad_norm": 6.262524127960205, "learning_rate": 3.7338688438240714e-05, "loss": 1.6302, "step": 9622 }, { "epoch": 0.25343692388727945, "grad_norm": 3.1318280696868896, "learning_rate": 3.733737160916514e-05, "loss": 1.4374, "step": 9623 }, { "epoch": 0.25346326046879114, "grad_norm": 1.822563648223877, "learning_rate": 3.7336054780089546e-05, "loss": 1.2263, "step": 9624 }, { "epoch": 0.2534895970503029, "grad_norm": 2.0121610164642334, "learning_rate": 3.733473795101396e-05, "loss": 1.7846, "step": 9625 }, { "epoch": 0.25351593363181457, "grad_norm": 1.7135684490203857, "learning_rate": 3.733342112193837e-05, "loss": 2.3712, "step": 9626 }, { "epoch": 0.2535422702133263, "grad_norm": 1.620875358581543, "learning_rate": 3.733210429286279e-05, "loss": 1.4597, "step": 9627 }, { "epoch": 0.253568606794838, "grad_norm": 2.601332187652588, "learning_rate": 3.73307874637872e-05, "loss": 2.0782, "step": 9628 }, { "epoch": 0.25359494337634975, "grad_norm": 2.2117395401000977, "learning_rate": 3.732947063471162e-05, "loss": 2.0553, "step": 9629 }, { "epoch": 0.2536212799578615, "grad_norm": 1.6906989812850952, "learning_rate": 3.732815380563603e-05, "loss": 0.544, "step": 9630 }, { "epoch": 0.2536476165393732, "grad_norm": 3.0786561965942383, "learning_rate": 3.732683697656044e-05, "loss": 1.9324, "step": 9631 }, { "epoch": 0.2536739531208849, "grad_norm": 1.5395272970199585, "learning_rate": 3.7325520147484864e-05, "loss": 1.6829, "step": 9632 }, { "epoch": 0.2537002897023966, "grad_norm": 1.7805149555206299, "learning_rate": 3.732420331840927e-05, "loss": 1.6129, "step": 9633 }, { "epoch": 0.25372662628390835, "grad_norm": 2.0277090072631836, "learning_rate": 3.732288648933369e-05, "loss": 2.0794, "step": 9634 }, { "epoch": 0.2537529628654201, "grad_norm": 2.461071014404297, "learning_rate": 3.73215696602581e-05, "loss": 1.5116, 
"step": 9635 }, { "epoch": 0.2537792994469318, "grad_norm": 1.6589722633361816, "learning_rate": 3.732025283118251e-05, "loss": 1.9653, "step": 9636 }, { "epoch": 0.2538056360284435, "grad_norm": 4.968806266784668, "learning_rate": 3.731893600210693e-05, "loss": 2.5758, "step": 9637 }, { "epoch": 0.2538319726099552, "grad_norm": 5.118461608886719, "learning_rate": 3.7317619173031344e-05, "loss": 0.8312, "step": 9638 }, { "epoch": 0.25385830919146696, "grad_norm": 1.9708750247955322, "learning_rate": 3.731630234395575e-05, "loss": 1.8017, "step": 9639 }, { "epoch": 0.25388464577297865, "grad_norm": 1.9421560764312744, "learning_rate": 3.731498551488017e-05, "loss": 1.6465, "step": 9640 }, { "epoch": 0.2539109823544904, "grad_norm": 1.9702420234680176, "learning_rate": 3.731366868580459e-05, "loss": 0.9592, "step": 9641 }, { "epoch": 0.25393731893600213, "grad_norm": 1.6837055683135986, "learning_rate": 3.7312351856729e-05, "loss": 1.5235, "step": 9642 }, { "epoch": 0.2539636555175138, "grad_norm": 2.189059019088745, "learning_rate": 3.7311035027653415e-05, "loss": 1.7711, "step": 9643 }, { "epoch": 0.25398999209902556, "grad_norm": 2.7619576454162598, "learning_rate": 3.7309718198577824e-05, "loss": 2.23, "step": 9644 }, { "epoch": 0.25401632868053725, "grad_norm": 2.131391763687134, "learning_rate": 3.730840136950224e-05, "loss": 2.1333, "step": 9645 }, { "epoch": 0.254042665262049, "grad_norm": 4.068904399871826, "learning_rate": 3.7307084540426655e-05, "loss": 0.6892, "step": 9646 }, { "epoch": 0.2540690018435607, "grad_norm": 2.9153926372528076, "learning_rate": 3.730576771135107e-05, "loss": 1.6668, "step": 9647 }, { "epoch": 0.2540953384250724, "grad_norm": 3.7939934730529785, "learning_rate": 3.730445088227548e-05, "loss": 1.4415, "step": 9648 }, { "epoch": 0.25412167500658417, "grad_norm": 1.8799753189086914, "learning_rate": 3.7303134053199895e-05, "loss": 0.4689, "step": 9649 }, { "epoch": 0.25414801158809586, "grad_norm": 1.6384416818618774, 
"learning_rate": 3.730181722412431e-05, "loss": 1.6233, "step": 9650 }, { "epoch": 0.2541743481696076, "grad_norm": 4.40449857711792, "learning_rate": 3.7300500395048726e-05, "loss": 1.0271, "step": 9651 }, { "epoch": 0.2542006847511193, "grad_norm": 1.7981282472610474, "learning_rate": 3.729918356597314e-05, "loss": 2.2554, "step": 9652 }, { "epoch": 0.25422702133263103, "grad_norm": 3.553616523742676, "learning_rate": 3.729786673689755e-05, "loss": 1.516, "step": 9653 }, { "epoch": 0.2542533579141427, "grad_norm": 2.7820286750793457, "learning_rate": 3.7296549907821966e-05, "loss": 1.3957, "step": 9654 }, { "epoch": 0.25427969449565446, "grad_norm": 2.124394178390503, "learning_rate": 3.729523307874638e-05, "loss": 1.0345, "step": 9655 }, { "epoch": 0.2543060310771662, "grad_norm": 2.05969500541687, "learning_rate": 3.72939162496708e-05, "loss": 1.8036, "step": 9656 }, { "epoch": 0.2543323676586779, "grad_norm": 4.607055187225342, "learning_rate": 3.7292599420595206e-05, "loss": 1.8117, "step": 9657 }, { "epoch": 0.25435870424018964, "grad_norm": 2.608032464981079, "learning_rate": 3.729128259151962e-05, "loss": 1.6529, "step": 9658 }, { "epoch": 0.2543850408217013, "grad_norm": 1.6836975812911987, "learning_rate": 3.728996576244404e-05, "loss": 1.7342, "step": 9659 }, { "epoch": 0.25441137740321307, "grad_norm": 2.188162088394165, "learning_rate": 3.728864893336845e-05, "loss": 1.5757, "step": 9660 }, { "epoch": 0.25443771398472476, "grad_norm": 2.5714704990386963, "learning_rate": 3.728733210429287e-05, "loss": 1.5288, "step": 9661 }, { "epoch": 0.2544640505662365, "grad_norm": 1.8239662647247314, "learning_rate": 3.728601527521728e-05, "loss": 1.7063, "step": 9662 }, { "epoch": 0.25449038714774824, "grad_norm": 2.715965986251831, "learning_rate": 3.728469844614169e-05, "loss": 1.1891, "step": 9663 }, { "epoch": 0.25451672372925993, "grad_norm": 2.429906129837036, "learning_rate": 3.72833816170661e-05, "loss": 0.7726, "step": 9664 }, { "epoch": 
0.2545430603107717, "grad_norm": 1.5424458980560303, "learning_rate": 3.7282064787990524e-05, "loss": 1.3551, "step": 9665 }, { "epoch": 0.25456939689228336, "grad_norm": 2.020559549331665, "learning_rate": 3.728074795891493e-05, "loss": 1.5823, "step": 9666 }, { "epoch": 0.2545957334737951, "grad_norm": 1.7699886560440063, "learning_rate": 3.727943112983935e-05, "loss": 2.1829, "step": 9667 }, { "epoch": 0.25462207005530685, "grad_norm": 2.178121566772461, "learning_rate": 3.7278114300763764e-05, "loss": 1.5704, "step": 9668 }, { "epoch": 0.25464840663681854, "grad_norm": 6.326632976531982, "learning_rate": 3.727679747168817e-05, "loss": 0.6792, "step": 9669 }, { "epoch": 0.2546747432183303, "grad_norm": 2.2320096492767334, "learning_rate": 3.7275480642612595e-05, "loss": 1.7751, "step": 9670 }, { "epoch": 0.25470107979984197, "grad_norm": 1.9952192306518555, "learning_rate": 3.7274163813537004e-05, "loss": 1.7288, "step": 9671 }, { "epoch": 0.2547274163813537, "grad_norm": 2.917086362838745, "learning_rate": 3.727284698446142e-05, "loss": 1.3647, "step": 9672 }, { "epoch": 0.2547537529628654, "grad_norm": 1.6897929906845093, "learning_rate": 3.727153015538583e-05, "loss": 1.6752, "step": 9673 }, { "epoch": 0.25478008954437714, "grad_norm": 2.030916452407837, "learning_rate": 3.727021332631025e-05, "loss": 2.0869, "step": 9674 }, { "epoch": 0.2548064261258889, "grad_norm": 2.204450845718384, "learning_rate": 3.726889649723466e-05, "loss": 2.0832, "step": 9675 }, { "epoch": 0.2548327627074006, "grad_norm": 2.0757219791412354, "learning_rate": 3.7267579668159075e-05, "loss": 2.0643, "step": 9676 }, { "epoch": 0.2548590992889123, "grad_norm": 1.6636179685592651, "learning_rate": 3.726626283908349e-05, "loss": 2.0463, "step": 9677 }, { "epoch": 0.254885435870424, "grad_norm": 1.8233202695846558, "learning_rate": 3.72649460100079e-05, "loss": 2.1082, "step": 9678 }, { "epoch": 0.25491177245193575, "grad_norm": 3.6059629917144775, "learning_rate": 3.726362918093232e-05, 
"loss": 1.4597, "step": 9679 }, { "epoch": 0.25493810903344744, "grad_norm": 1.9130117893218994, "learning_rate": 3.726231235185673e-05, "loss": 1.839, "step": 9680 }, { "epoch": 0.2549644456149592, "grad_norm": 4.750220775604248, "learning_rate": 3.7260995522781146e-05, "loss": 0.6948, "step": 9681 }, { "epoch": 0.2549907821964709, "grad_norm": 4.482402801513672, "learning_rate": 3.7259678693705555e-05, "loss": 1.2409, "step": 9682 }, { "epoch": 0.2550171187779826, "grad_norm": 2.10050630569458, "learning_rate": 3.725836186462997e-05, "loss": 1.5929, "step": 9683 }, { "epoch": 0.25504345535949435, "grad_norm": 3.0294337272644043, "learning_rate": 3.7257045035554387e-05, "loss": 1.4726, "step": 9684 }, { "epoch": 0.25506979194100604, "grad_norm": 1.498418927192688, "learning_rate": 3.72557282064788e-05, "loss": 1.5774, "step": 9685 }, { "epoch": 0.2550961285225178, "grad_norm": 1.7631802558898926, "learning_rate": 3.725441137740322e-05, "loss": 2.1942, "step": 9686 }, { "epoch": 0.25512246510402947, "grad_norm": 1.6969068050384521, "learning_rate": 3.7253094548327627e-05, "loss": 2.3295, "step": 9687 }, { "epoch": 0.2551488016855412, "grad_norm": 1.7383668422698975, "learning_rate": 3.725177771925205e-05, "loss": 2.0288, "step": 9688 }, { "epoch": 0.25517513826705296, "grad_norm": 1.8575865030288696, "learning_rate": 3.725046089017646e-05, "loss": 1.8065, "step": 9689 }, { "epoch": 0.25520147484856465, "grad_norm": 2.979454278945923, "learning_rate": 3.724914406110087e-05, "loss": 1.1739, "step": 9690 }, { "epoch": 0.2552278114300764, "grad_norm": 1.554207682609558, "learning_rate": 3.724782723202528e-05, "loss": 1.743, "step": 9691 }, { "epoch": 0.2552541480115881, "grad_norm": 1.430321455001831, "learning_rate": 3.72465104029497e-05, "loss": 1.4131, "step": 9692 }, { "epoch": 0.2552804845930998, "grad_norm": 3.4111132621765137, "learning_rate": 3.724519357387411e-05, "loss": 1.4037, "step": 9693 }, { "epoch": 0.2553068211746115, "grad_norm": 5.329657554626465, 
"learning_rate": 3.724387674479853e-05, "loss": 0.9999, "step": 9694 }, { "epoch": 0.25533315775612325, "grad_norm": 5.334298133850098, "learning_rate": 3.724255991572294e-05, "loss": 1.6436, "step": 9695 }, { "epoch": 0.255359494337635, "grad_norm": 2.733109951019287, "learning_rate": 3.724124308664735e-05, "loss": 1.4872, "step": 9696 }, { "epoch": 0.2553858309191467, "grad_norm": 3.851780652999878, "learning_rate": 3.723992625757177e-05, "loss": 1.9459, "step": 9697 }, { "epoch": 0.2554121675006584, "grad_norm": 1.7495603561401367, "learning_rate": 3.7238609428496185e-05, "loss": 1.9351, "step": 9698 }, { "epoch": 0.2554385040821701, "grad_norm": 3.068164587020874, "learning_rate": 3.72372925994206e-05, "loss": 1.1152, "step": 9699 }, { "epoch": 0.25546484066368186, "grad_norm": 1.4387151002883911, "learning_rate": 3.723597577034501e-05, "loss": 2.1803, "step": 9700 }, { "epoch": 0.2554911772451936, "grad_norm": 1.874611735343933, "learning_rate": 3.7234658941269425e-05, "loss": 2.0498, "step": 9701 }, { "epoch": 0.2555175138267053, "grad_norm": 2.6465258598327637, "learning_rate": 3.723334211219383e-05, "loss": 1.3539, "step": 9702 }, { "epoch": 0.25554385040821703, "grad_norm": 1.9960482120513916, "learning_rate": 3.7232025283118256e-05, "loss": 0.4908, "step": 9703 }, { "epoch": 0.2555701869897287, "grad_norm": 1.5477030277252197, "learning_rate": 3.7230708454042665e-05, "loss": 2.0217, "step": 9704 }, { "epoch": 0.25559652357124046, "grad_norm": 2.119279146194458, "learning_rate": 3.722939162496708e-05, "loss": 0.4998, "step": 9705 }, { "epoch": 0.25562286015275215, "grad_norm": 3.4473531246185303, "learning_rate": 3.7228074795891496e-05, "loss": 1.502, "step": 9706 }, { "epoch": 0.2556491967342639, "grad_norm": 1.8358161449432373, "learning_rate": 3.722675796681591e-05, "loss": 0.7645, "step": 9707 }, { "epoch": 0.25567553331577564, "grad_norm": 1.765196442604065, "learning_rate": 3.722544113774033e-05, "loss": 1.8553, "step": 9708 }, { "epoch": 
0.2557018698972873, "grad_norm": 2.1057639122009277, "learning_rate": 3.7224124308664736e-05, "loss": 1.1446, "step": 9709 }, { "epoch": 0.25572820647879907, "grad_norm": 2.1482160091400146, "learning_rate": 3.722280747958915e-05, "loss": 1.4923, "step": 9710 }, { "epoch": 0.25575454306031076, "grad_norm": 4.367096900939941, "learning_rate": 3.722149065051356e-05, "loss": 0.7062, "step": 9711 }, { "epoch": 0.2557808796418225, "grad_norm": 2.0393383502960205, "learning_rate": 3.722017382143798e-05, "loss": 1.6342, "step": 9712 }, { "epoch": 0.2558072162233342, "grad_norm": 1.8395330905914307, "learning_rate": 3.721885699236239e-05, "loss": 1.5871, "step": 9713 }, { "epoch": 0.25583355280484593, "grad_norm": 1.9447764158248901, "learning_rate": 3.721754016328681e-05, "loss": 1.8653, "step": 9714 }, { "epoch": 0.2558598893863577, "grad_norm": 2.2410926818847656, "learning_rate": 3.721622333421122e-05, "loss": 0.4755, "step": 9715 }, { "epoch": 0.25588622596786936, "grad_norm": 2.066312074661255, "learning_rate": 3.721490650513563e-05, "loss": 1.0124, "step": 9716 }, { "epoch": 0.2559125625493811, "grad_norm": 2.3067197799682617, "learning_rate": 3.7213589676060054e-05, "loss": 1.1726, "step": 9717 }, { "epoch": 0.2559388991308928, "grad_norm": 2.2440619468688965, "learning_rate": 3.721227284698446e-05, "loss": 2.5235, "step": 9718 }, { "epoch": 0.25596523571240454, "grad_norm": 2.3411786556243896, "learning_rate": 3.721095601790888e-05, "loss": 2.0143, "step": 9719 }, { "epoch": 0.2559915722939162, "grad_norm": 1.8204071521759033, "learning_rate": 3.720963918883329e-05, "loss": 1.6909, "step": 9720 }, { "epoch": 0.25601790887542797, "grad_norm": 2.0097641944885254, "learning_rate": 3.720832235975771e-05, "loss": 1.0509, "step": 9721 }, { "epoch": 0.2560442454569397, "grad_norm": 1.885451316833496, "learning_rate": 3.720700553068212e-05, "loss": 1.2706, "step": 9722 }, { "epoch": 0.2560705820384514, "grad_norm": 2.780299425125122, "learning_rate": 
3.7205688701606534e-05, "loss": 1.2881, "step": 9723 }, { "epoch": 0.25609691861996314, "grad_norm": 4.1458420753479, "learning_rate": 3.720437187253095e-05, "loss": 0.8259, "step": 9724 }, { "epoch": 0.25612325520147483, "grad_norm": 1.5310566425323486, "learning_rate": 3.720305504345536e-05, "loss": 2.2724, "step": 9725 }, { "epoch": 0.2561495917829866, "grad_norm": 1.6194044351577759, "learning_rate": 3.720173821437978e-05, "loss": 1.6572, "step": 9726 }, { "epoch": 0.25617592836449826, "grad_norm": 3.186223030090332, "learning_rate": 3.720042138530419e-05, "loss": 0.6834, "step": 9727 }, { "epoch": 0.25620226494601, "grad_norm": 2.9181206226348877, "learning_rate": 3.7199104556228605e-05, "loss": 1.6049, "step": 9728 }, { "epoch": 0.25622860152752175, "grad_norm": 2.823756694793701, "learning_rate": 3.7197787727153014e-05, "loss": 0.7815, "step": 9729 }, { "epoch": 0.25625493810903344, "grad_norm": 1.807732105255127, "learning_rate": 3.719647089807743e-05, "loss": 1.7068, "step": 9730 }, { "epoch": 0.2562812746905452, "grad_norm": 2.2641496658325195, "learning_rate": 3.7195154069001845e-05, "loss": 1.9368, "step": 9731 }, { "epoch": 0.25630761127205687, "grad_norm": 3.063370943069458, "learning_rate": 3.719383723992626e-05, "loss": 1.5704, "step": 9732 }, { "epoch": 0.2563339478535686, "grad_norm": 1.7358959913253784, "learning_rate": 3.7192520410850676e-05, "loss": 2.2386, "step": 9733 }, { "epoch": 0.2563602844350803, "grad_norm": 1.8289241790771484, "learning_rate": 3.7191203581775085e-05, "loss": 2.0111, "step": 9734 }, { "epoch": 0.25638662101659204, "grad_norm": 3.409494161605835, "learning_rate": 3.71898867526995e-05, "loss": 1.0603, "step": 9735 }, { "epoch": 0.2564129575981038, "grad_norm": 1.4900327920913696, "learning_rate": 3.7188569923623916e-05, "loss": 1.4456, "step": 9736 }, { "epoch": 0.2564392941796155, "grad_norm": 3.1553726196289062, "learning_rate": 3.718725309454833e-05, "loss": 2.2029, "step": 9737 }, { "epoch": 0.2564656307611272, 
"grad_norm": 2.7145864963531494, "learning_rate": 3.718593626547274e-05, "loss": 0.7665, "step": 9738 }, { "epoch": 0.2564919673426389, "grad_norm": 2.8754632472991943, "learning_rate": 3.7184619436397156e-05, "loss": 0.9922, "step": 9739 }, { "epoch": 0.25651830392415065, "grad_norm": 1.6334422826766968, "learning_rate": 3.718330260732157e-05, "loss": 2.2661, "step": 9740 }, { "epoch": 0.2565446405056624, "grad_norm": 1.6821001768112183, "learning_rate": 3.718198577824599e-05, "loss": 2.0174, "step": 9741 }, { "epoch": 0.2565709770871741, "grad_norm": 2.5980312824249268, "learning_rate": 3.7180668949170396e-05, "loss": 1.797, "step": 9742 }, { "epoch": 0.2565973136686858, "grad_norm": 1.8130223751068115, "learning_rate": 3.717935212009481e-05, "loss": 1.6366, "step": 9743 }, { "epoch": 0.2566236502501975, "grad_norm": 2.012077569961548, "learning_rate": 3.717803529101923e-05, "loss": 1.8363, "step": 9744 }, { "epoch": 0.25664998683170925, "grad_norm": 2.2008514404296875, "learning_rate": 3.717671846194364e-05, "loss": 1.6171, "step": 9745 }, { "epoch": 0.25667632341322094, "grad_norm": 3.118563413619995, "learning_rate": 3.717540163286806e-05, "loss": 1.5961, "step": 9746 }, { "epoch": 0.2567026599947327, "grad_norm": 2.0176212787628174, "learning_rate": 3.717408480379247e-05, "loss": 1.6506, "step": 9747 }, { "epoch": 0.25672899657624443, "grad_norm": 3.3909292221069336, "learning_rate": 3.717276797471688e-05, "loss": 1.8123, "step": 9748 }, { "epoch": 0.2567553331577561, "grad_norm": 2.441117286682129, "learning_rate": 3.717145114564129e-05, "loss": 1.4192, "step": 9749 }, { "epoch": 0.25678166973926786, "grad_norm": 5.388453006744385, "learning_rate": 3.7170134316565714e-05, "loss": 1.8883, "step": 9750 }, { "epoch": 0.25680800632077955, "grad_norm": 2.9193570613861084, "learning_rate": 3.716881748749012e-05, "loss": 1.8319, "step": 9751 }, { "epoch": 0.2568343429022913, "grad_norm": 2.458540678024292, "learning_rate": 3.716750065841454e-05, "loss": 1.5926, 
"step": 9752 }, { "epoch": 0.256860679483803, "grad_norm": 2.311155080795288, "learning_rate": 3.7166183829338954e-05, "loss": 1.7621, "step": 9753 }, { "epoch": 0.2568870160653147, "grad_norm": 1.8875707387924194, "learning_rate": 3.716486700026337e-05, "loss": 1.7791, "step": 9754 }, { "epoch": 0.25691335264682646, "grad_norm": 3.3996996879577637, "learning_rate": 3.7163550171187786e-05, "loss": 2.1299, "step": 9755 }, { "epoch": 0.25693968922833815, "grad_norm": 2.3091018199920654, "learning_rate": 3.7162233342112194e-05, "loss": 1.4769, "step": 9756 }, { "epoch": 0.2569660258098499, "grad_norm": 4.962271690368652, "learning_rate": 3.716091651303661e-05, "loss": 1.5402, "step": 9757 }, { "epoch": 0.2569923623913616, "grad_norm": 3.1826913356781006, "learning_rate": 3.715959968396102e-05, "loss": 1.719, "step": 9758 }, { "epoch": 0.2570186989728733, "grad_norm": 1.8783457279205322, "learning_rate": 3.715828285488544e-05, "loss": 2.2721, "step": 9759 }, { "epoch": 0.257045035554385, "grad_norm": 6.030035495758057, "learning_rate": 3.715696602580985e-05, "loss": 1.4771, "step": 9760 }, { "epoch": 0.25707137213589676, "grad_norm": 1.5319576263427734, "learning_rate": 3.7155649196734266e-05, "loss": 1.1791, "step": 9761 }, { "epoch": 0.2570977087174085, "grad_norm": 2.60465407371521, "learning_rate": 3.715433236765868e-05, "loss": 1.9611, "step": 9762 }, { "epoch": 0.2571240452989202, "grad_norm": 3.9286725521087646, "learning_rate": 3.715301553858309e-05, "loss": 2.2252, "step": 9763 }, { "epoch": 0.25715038188043193, "grad_norm": 2.477926731109619, "learning_rate": 3.715169870950751e-05, "loss": 1.5371, "step": 9764 }, { "epoch": 0.2571767184619436, "grad_norm": 1.6478830575942993, "learning_rate": 3.715038188043192e-05, "loss": 1.765, "step": 9765 }, { "epoch": 0.25720305504345536, "grad_norm": 2.1479403972625732, "learning_rate": 3.714906505135634e-05, "loss": 0.5111, "step": 9766 }, { "epoch": 0.25722939162496705, "grad_norm": 4.6129326820373535, 
"learning_rate": 3.7147748222280746e-05, "loss": 0.6595, "step": 9767 }, { "epoch": 0.2572557282064788, "grad_norm": 1.7507762908935547, "learning_rate": 3.714643139320516e-05, "loss": 1.698, "step": 9768 }, { "epoch": 0.25728206478799054, "grad_norm": 1.6026253700256348, "learning_rate": 3.714511456412958e-05, "loss": 1.8894, "step": 9769 }, { "epoch": 0.2573084013695022, "grad_norm": 1.9273667335510254, "learning_rate": 3.714379773505399e-05, "loss": 1.9654, "step": 9770 }, { "epoch": 0.25733473795101397, "grad_norm": 2.0849626064300537, "learning_rate": 3.714248090597841e-05, "loss": 2.2426, "step": 9771 }, { "epoch": 0.25736107453252566, "grad_norm": 2.39615535736084, "learning_rate": 3.714116407690282e-05, "loss": 1.7556, "step": 9772 }, { "epoch": 0.2573874111140374, "grad_norm": 4.067844390869141, "learning_rate": 3.713984724782724e-05, "loss": 1.9581, "step": 9773 }, { "epoch": 0.25741374769554914, "grad_norm": 1.7253034114837646, "learning_rate": 3.713853041875165e-05, "loss": 2.5465, "step": 9774 }, { "epoch": 0.25744008427706083, "grad_norm": 2.9143927097320557, "learning_rate": 3.7137213589676064e-05, "loss": 2.1546, "step": 9775 }, { "epoch": 0.2574664208585726, "grad_norm": 3.3653507232666016, "learning_rate": 3.713589676060047e-05, "loss": 1.3208, "step": 9776 }, { "epoch": 0.25749275744008426, "grad_norm": 2.5108065605163574, "learning_rate": 3.713457993152489e-05, "loss": 1.6654, "step": 9777 }, { "epoch": 0.257519094021596, "grad_norm": 2.457394599914551, "learning_rate": 3.7133263102449304e-05, "loss": 1.9365, "step": 9778 }, { "epoch": 0.2575454306031077, "grad_norm": 1.4360820055007935, "learning_rate": 3.713194627337372e-05, "loss": 2.0449, "step": 9779 }, { "epoch": 0.25757176718461944, "grad_norm": 1.6703145503997803, "learning_rate": 3.7130629444298135e-05, "loss": 2.0669, "step": 9780 }, { "epoch": 0.2575981037661312, "grad_norm": 3.4239883422851562, "learning_rate": 3.7129312615222544e-05, "loss": 1.7407, "step": 9781 }, { "epoch": 
0.25762444034764287, "grad_norm": 1.9549875259399414, "learning_rate": 3.712799578614696e-05, "loss": 1.3951, "step": 9782 }, { "epoch": 0.2576507769291546, "grad_norm": 1.9713608026504517, "learning_rate": 3.7126678957071375e-05, "loss": 1.8081, "step": 9783 }, { "epoch": 0.2576771135106663, "grad_norm": 2.0435633659362793, "learning_rate": 3.712536212799579e-05, "loss": 2.6383, "step": 9784 }, { "epoch": 0.25770345009217804, "grad_norm": 1.9134286642074585, "learning_rate": 3.71240452989202e-05, "loss": 1.7348, "step": 9785 }, { "epoch": 0.25772978667368973, "grad_norm": 3.18192195892334, "learning_rate": 3.7122728469844615e-05, "loss": 1.7457, "step": 9786 }, { "epoch": 0.2577561232552015, "grad_norm": 2.622532844543457, "learning_rate": 3.712141164076903e-05, "loss": 1.011, "step": 9787 }, { "epoch": 0.2577824598367132, "grad_norm": 2.440190076828003, "learning_rate": 3.7120094811693446e-05, "loss": 1.1379, "step": 9788 }, { "epoch": 0.2578087964182249, "grad_norm": 2.6533925533294678, "learning_rate": 3.711877798261786e-05, "loss": 1.9678, "step": 9789 }, { "epoch": 0.25783513299973665, "grad_norm": 2.3086330890655518, "learning_rate": 3.711746115354227e-05, "loss": 3.0058, "step": 9790 }, { "epoch": 0.25786146958124834, "grad_norm": 1.8761622905731201, "learning_rate": 3.7116144324466686e-05, "loss": 2.3342, "step": 9791 }, { "epoch": 0.2578878061627601, "grad_norm": 2.0247251987457275, "learning_rate": 3.71148274953911e-05, "loss": 0.8121, "step": 9792 }, { "epoch": 0.25791414274427177, "grad_norm": 3.9546287059783936, "learning_rate": 3.711351066631552e-05, "loss": 1.0587, "step": 9793 }, { "epoch": 0.2579404793257835, "grad_norm": 3.985302209854126, "learning_rate": 3.7112193837239926e-05, "loss": 1.2702, "step": 9794 }, { "epoch": 0.25796681590729525, "grad_norm": 3.0532419681549072, "learning_rate": 3.711087700816434e-05, "loss": 2.1767, "step": 9795 }, { "epoch": 0.25799315248880694, "grad_norm": 2.437147378921509, "learning_rate": 
3.710956017908875e-05, "loss": 2.0605, "step": 9796 }, { "epoch": 0.2580194890703187, "grad_norm": 2.489968776702881, "learning_rate": 3.710824335001317e-05, "loss": 1.9573, "step": 9797 }, { "epoch": 0.2580458256518304, "grad_norm": 2.065798759460449, "learning_rate": 3.710692652093758e-05, "loss": 1.7157, "step": 9798 }, { "epoch": 0.2580721622333421, "grad_norm": 3.5356953144073486, "learning_rate": 3.7105609691862e-05, "loss": 2.182, "step": 9799 }, { "epoch": 0.2580984988148538, "grad_norm": 1.7834603786468506, "learning_rate": 3.710429286278641e-05, "loss": 1.7254, "step": 9800 }, { "epoch": 0.25812483539636555, "grad_norm": 1.8092585802078247, "learning_rate": 3.710297603371082e-05, "loss": 1.4736, "step": 9801 }, { "epoch": 0.2581511719778773, "grad_norm": 4.423865795135498, "learning_rate": 3.7101659204635244e-05, "loss": 1.285, "step": 9802 }, { "epoch": 0.258177508559389, "grad_norm": 2.351874589920044, "learning_rate": 3.710034237555965e-05, "loss": 1.6583, "step": 9803 }, { "epoch": 0.2582038451409007, "grad_norm": 3.509718418121338, "learning_rate": 3.709902554648407e-05, "loss": 1.4889, "step": 9804 }, { "epoch": 0.2582301817224124, "grad_norm": 2.1435463428497314, "learning_rate": 3.709770871740848e-05, "loss": 1.9123, "step": 9805 }, { "epoch": 0.25825651830392415, "grad_norm": 1.7862725257873535, "learning_rate": 3.70963918883329e-05, "loss": 1.6919, "step": 9806 }, { "epoch": 0.2582828548854359, "grad_norm": 3.4659645557403564, "learning_rate": 3.709507505925731e-05, "loss": 1.1544, "step": 9807 }, { "epoch": 0.2583091914669476, "grad_norm": 1.6393003463745117, "learning_rate": 3.7093758230181724e-05, "loss": 1.9384, "step": 9808 }, { "epoch": 0.25833552804845933, "grad_norm": 2.61045503616333, "learning_rate": 3.709244140110614e-05, "loss": 1.5775, "step": 9809 }, { "epoch": 0.258361864629971, "grad_norm": 2.4012460708618164, "learning_rate": 3.709112457203055e-05, "loss": 1.7477, "step": 9810 }, { "epoch": 0.25838820121148276, "grad_norm": 
2.9234864711761475, "learning_rate": 3.708980774295497e-05, "loss": 2.2103, "step": 9811 }, { "epoch": 0.25841453779299445, "grad_norm": 2.4255216121673584, "learning_rate": 3.708849091387938e-05, "loss": 1.7509, "step": 9812 }, { "epoch": 0.2584408743745062, "grad_norm": 3.6548266410827637, "learning_rate": 3.7087174084803795e-05, "loss": 1.0274, "step": 9813 }, { "epoch": 0.25846721095601793, "grad_norm": 4.084821701049805, "learning_rate": 3.7085857255728204e-05, "loss": 1.5399, "step": 9814 }, { "epoch": 0.2584935475375296, "grad_norm": 1.5028916597366333, "learning_rate": 3.708454042665262e-05, "loss": 1.7955, "step": 9815 }, { "epoch": 0.25851988411904137, "grad_norm": 2.013051986694336, "learning_rate": 3.7083223597577035e-05, "loss": 1.3864, "step": 9816 }, { "epoch": 0.25854622070055305, "grad_norm": 4.284978866577148, "learning_rate": 3.708190676850145e-05, "loss": 1.2436, "step": 9817 }, { "epoch": 0.2585725572820648, "grad_norm": 2.6555631160736084, "learning_rate": 3.7080589939425867e-05, "loss": 1.7561, "step": 9818 }, { "epoch": 0.2585988938635765, "grad_norm": 2.107290267944336, "learning_rate": 3.7079273110350275e-05, "loss": 1.793, "step": 9819 }, { "epoch": 0.25862523044508823, "grad_norm": 2.438192129135132, "learning_rate": 3.70779562812747e-05, "loss": 1.6924, "step": 9820 }, { "epoch": 0.25865156702659997, "grad_norm": 1.8615882396697998, "learning_rate": 3.7076639452199107e-05, "loss": 1.4206, "step": 9821 }, { "epoch": 0.25867790360811166, "grad_norm": 2.988931179046631, "learning_rate": 3.707532262312352e-05, "loss": 1.9875, "step": 9822 }, { "epoch": 0.2587042401896234, "grad_norm": 2.202460765838623, "learning_rate": 3.707400579404793e-05, "loss": 1.4759, "step": 9823 }, { "epoch": 0.2587305767711351, "grad_norm": 2.3205885887145996, "learning_rate": 3.7072688964972347e-05, "loss": 1.7492, "step": 9824 }, { "epoch": 0.25875691335264683, "grad_norm": 3.144115447998047, "learning_rate": 3.707137213589676e-05, "loss": 2.4225, "step": 9825 
}, { "epoch": 0.2587832499341585, "grad_norm": 1.8301434516906738, "learning_rate": 3.707005530682118e-05, "loss": 1.0252, "step": 9826 }, { "epoch": 0.25880958651567026, "grad_norm": 1.796115756034851, "learning_rate": 3.706873847774559e-05, "loss": 1.8776, "step": 9827 }, { "epoch": 0.258835923097182, "grad_norm": 2.254316806793213, "learning_rate": 3.706742164867e-05, "loss": 2.3335, "step": 9828 }, { "epoch": 0.2588622596786937, "grad_norm": 2.632594347000122, "learning_rate": 3.706610481959442e-05, "loss": 2.092, "step": 9829 }, { "epoch": 0.25888859626020544, "grad_norm": 1.641855239868164, "learning_rate": 3.706478799051883e-05, "loss": 1.7947, "step": 9830 }, { "epoch": 0.2589149328417171, "grad_norm": 1.7753918170928955, "learning_rate": 3.706347116144325e-05, "loss": 1.8587, "step": 9831 }, { "epoch": 0.25894126942322887, "grad_norm": 2.247600793838501, "learning_rate": 3.706215433236766e-05, "loss": 2.5807, "step": 9832 }, { "epoch": 0.25896760600474056, "grad_norm": 2.4937613010406494, "learning_rate": 3.7060837503292073e-05, "loss": 0.4096, "step": 9833 }, { "epoch": 0.2589939425862523, "grad_norm": 2.8936948776245117, "learning_rate": 3.705952067421649e-05, "loss": 1.5429, "step": 9834 }, { "epoch": 0.25902027916776404, "grad_norm": 1.5941886901855469, "learning_rate": 3.7058203845140905e-05, "loss": 2.0619, "step": 9835 }, { "epoch": 0.25904661574927573, "grad_norm": 1.6545512676239014, "learning_rate": 3.705688701606532e-05, "loss": 0.6942, "step": 9836 }, { "epoch": 0.2590729523307875, "grad_norm": 1.8232911825180054, "learning_rate": 3.705557018698973e-05, "loss": 2.2493, "step": 9837 }, { "epoch": 0.25909928891229916, "grad_norm": 2.0688045024871826, "learning_rate": 3.7054253357914145e-05, "loss": 1.8822, "step": 9838 }, { "epoch": 0.2591256254938109, "grad_norm": 1.9047807455062866, "learning_rate": 3.705293652883856e-05, "loss": 1.6492, "step": 9839 }, { "epoch": 0.25915196207532265, "grad_norm": 1.8137409687042236, "learning_rate": 
3.7051619699762976e-05, "loss": 1.9918, "step": 9840 }, { "epoch": 0.25917829865683434, "grad_norm": 2.571833848953247, "learning_rate": 3.7050302870687385e-05, "loss": 0.7357, "step": 9841 }, { "epoch": 0.2592046352383461, "grad_norm": 1.9742419719696045, "learning_rate": 3.70489860416118e-05, "loss": 2.6158, "step": 9842 }, { "epoch": 0.25923097181985777, "grad_norm": 1.8618266582489014, "learning_rate": 3.704766921253621e-05, "loss": 1.9506, "step": 9843 }, { "epoch": 0.2592573084013695, "grad_norm": 1.8881477117538452, "learning_rate": 3.704635238346063e-05, "loss": 1.622, "step": 9844 }, { "epoch": 0.2592836449828812, "grad_norm": 3.0669729709625244, "learning_rate": 3.704503555438504e-05, "loss": 2.5703, "step": 9845 }, { "epoch": 0.25930998156439294, "grad_norm": 2.1940395832061768, "learning_rate": 3.7043718725309456e-05, "loss": 0.7745, "step": 9846 }, { "epoch": 0.2593363181459047, "grad_norm": 1.9180564880371094, "learning_rate": 3.704240189623387e-05, "loss": 1.6913, "step": 9847 }, { "epoch": 0.2593626547274164, "grad_norm": 1.9764330387115479, "learning_rate": 3.704108506715828e-05, "loss": 1.8624, "step": 9848 }, { "epoch": 0.2593889913089281, "grad_norm": 2.088878631591797, "learning_rate": 3.70397682380827e-05, "loss": 1.8017, "step": 9849 }, { "epoch": 0.2594153278904398, "grad_norm": 1.4600499868392944, "learning_rate": 3.703845140900711e-05, "loss": 1.5805, "step": 9850 }, { "epoch": 0.25944166447195155, "grad_norm": 3.7565665245056152, "learning_rate": 3.703713457993153e-05, "loss": 2.0024, "step": 9851 }, { "epoch": 0.25946800105346324, "grad_norm": 3.5762574672698975, "learning_rate": 3.7035817750855936e-05, "loss": 1.1081, "step": 9852 }, { "epoch": 0.259494337634975, "grad_norm": 2.9315707683563232, "learning_rate": 3.703450092178036e-05, "loss": 2.3125, "step": 9853 }, { "epoch": 0.2595206742164867, "grad_norm": 2.9471585750579834, "learning_rate": 3.703318409270477e-05, "loss": 1.5932, "step": 9854 }, { "epoch": 0.2595470107979984, 
"grad_norm": 3.957176685333252, "learning_rate": 3.703186726362918e-05, "loss": 0.9993, "step": 9855 }, { "epoch": 0.25957334737951016, "grad_norm": 1.6149044036865234, "learning_rate": 3.70305504345536e-05, "loss": 1.3921, "step": 9856 }, { "epoch": 0.25959968396102184, "grad_norm": 2.1875369548797607, "learning_rate": 3.702923360547801e-05, "loss": 1.2329, "step": 9857 }, { "epoch": 0.2596260205425336, "grad_norm": 6.689393520355225, "learning_rate": 3.702791677640243e-05, "loss": 2.0037, "step": 9858 }, { "epoch": 0.2596523571240453, "grad_norm": 2.014817476272583, "learning_rate": 3.702659994732684e-05, "loss": 1.9978, "step": 9859 }, { "epoch": 0.259678693705557, "grad_norm": 3.453049659729004, "learning_rate": 3.7025283118251254e-05, "loss": 1.3455, "step": 9860 }, { "epoch": 0.25970503028706876, "grad_norm": 2.203376531600952, "learning_rate": 3.702396628917566e-05, "loss": 1.6716, "step": 9861 }, { "epoch": 0.25973136686858045, "grad_norm": 1.8605821132659912, "learning_rate": 3.702264946010008e-05, "loss": 2.3472, "step": 9862 }, { "epoch": 0.2597577034500922, "grad_norm": 3.7724006175994873, "learning_rate": 3.7021332631024494e-05, "loss": 1.4838, "step": 9863 }, { "epoch": 0.2597840400316039, "grad_norm": 2.3422861099243164, "learning_rate": 3.702001580194891e-05, "loss": 1.6953, "step": 9864 }, { "epoch": 0.2598103766131156, "grad_norm": 2.079458475112915, "learning_rate": 3.7018698972873325e-05, "loss": 1.4839, "step": 9865 }, { "epoch": 0.2598367131946273, "grad_norm": 1.7584192752838135, "learning_rate": 3.7017382143797734e-05, "loss": 1.9271, "step": 9866 }, { "epoch": 0.25986304977613905, "grad_norm": 1.5001235008239746, "learning_rate": 3.701606531472215e-05, "loss": 1.8783, "step": 9867 }, { "epoch": 0.2598893863576508, "grad_norm": 1.7222158908843994, "learning_rate": 3.7014748485646565e-05, "loss": 1.7547, "step": 9868 }, { "epoch": 0.2599157229391625, "grad_norm": 2.3868236541748047, "learning_rate": 3.701343165657098e-05, "loss": 1.7547, 
"step": 9869 }, { "epoch": 0.25994205952067423, "grad_norm": 1.9237710237503052, "learning_rate": 3.701211482749539e-05, "loss": 1.8095, "step": 9870 }, { "epoch": 0.2599683961021859, "grad_norm": 2.7741787433624268, "learning_rate": 3.7010797998419805e-05, "loss": 0.8023, "step": 9871 }, { "epoch": 0.25999473268369766, "grad_norm": 3.004370927810669, "learning_rate": 3.700948116934422e-05, "loss": 1.7289, "step": 9872 }, { "epoch": 0.2600210692652094, "grad_norm": 2.0840609073638916, "learning_rate": 3.7008164340268636e-05, "loss": 2.2154, "step": 9873 }, { "epoch": 0.2600474058467211, "grad_norm": 1.8691085577011108, "learning_rate": 3.700684751119305e-05, "loss": 1.9526, "step": 9874 }, { "epoch": 0.26007374242823283, "grad_norm": 4.453634738922119, "learning_rate": 3.700553068211746e-05, "loss": 1.2187, "step": 9875 }, { "epoch": 0.2601000790097445, "grad_norm": 1.4608912467956543, "learning_rate": 3.7004213853041876e-05, "loss": 1.92, "step": 9876 }, { "epoch": 0.26012641559125627, "grad_norm": 1.9461586475372314, "learning_rate": 3.700289702396629e-05, "loss": 1.1693, "step": 9877 }, { "epoch": 0.26015275217276795, "grad_norm": 2.532621145248413, "learning_rate": 3.700158019489071e-05, "loss": 2.1008, "step": 9878 }, { "epoch": 0.2601790887542797, "grad_norm": 2.151224374771118, "learning_rate": 3.7000263365815116e-05, "loss": 0.9723, "step": 9879 }, { "epoch": 0.26020542533579144, "grad_norm": 3.0210020542144775, "learning_rate": 3.699894653673953e-05, "loss": 1.7721, "step": 9880 }, { "epoch": 0.26023176191730313, "grad_norm": 2.443286418914795, "learning_rate": 3.699762970766395e-05, "loss": 1.8425, "step": 9881 }, { "epoch": 0.26025809849881487, "grad_norm": 6.711511611938477, "learning_rate": 3.699631287858836e-05, "loss": 1.5152, "step": 9882 }, { "epoch": 0.26028443508032656, "grad_norm": 2.5607032775878906, "learning_rate": 3.699499604951278e-05, "loss": 0.8706, "step": 9883 }, { "epoch": 0.2603107716618383, "grad_norm": 1.7296491861343384, 
"learning_rate": 3.699367922043719e-05, "loss": 2.0387, "step": 9884 }, { "epoch": 0.26033710824335, "grad_norm": 2.640047311782837, "learning_rate": 3.69923623913616e-05, "loss": 1.0993, "step": 9885 }, { "epoch": 0.26036344482486173, "grad_norm": 2.0113070011138916, "learning_rate": 3.699104556228602e-05, "loss": 1.2701, "step": 9886 }, { "epoch": 0.2603897814063735, "grad_norm": 2.4416544437408447, "learning_rate": 3.6989728733210434e-05, "loss": 0.801, "step": 9887 }, { "epoch": 0.26041611798788517, "grad_norm": 2.1388752460479736, "learning_rate": 3.698841190413484e-05, "loss": 1.9399, "step": 9888 }, { "epoch": 0.2604424545693969, "grad_norm": 1.9972481727600098, "learning_rate": 3.698709507505926e-05, "loss": 2.1499, "step": 9889 }, { "epoch": 0.2604687911509086, "grad_norm": 1.8717926740646362, "learning_rate": 3.6985778245983674e-05, "loss": 1.7889, "step": 9890 }, { "epoch": 0.26049512773242034, "grad_norm": 1.902442455291748, "learning_rate": 3.698446141690809e-05, "loss": 2.1941, "step": 9891 }, { "epoch": 0.260521464313932, "grad_norm": 2.221508741378784, "learning_rate": 3.6983144587832506e-05, "loss": 1.5553, "step": 9892 }, { "epoch": 0.26054780089544377, "grad_norm": 3.134248971939087, "learning_rate": 3.6981827758756914e-05, "loss": 1.6244, "step": 9893 }, { "epoch": 0.2605741374769555, "grad_norm": 3.5815107822418213, "learning_rate": 3.698051092968133e-05, "loss": 1.7655, "step": 9894 }, { "epoch": 0.2606004740584672, "grad_norm": 1.7182793617248535, "learning_rate": 3.697919410060574e-05, "loss": 1.4358, "step": 9895 }, { "epoch": 0.26062681063997895, "grad_norm": 1.9965176582336426, "learning_rate": 3.697787727153016e-05, "loss": 1.9596, "step": 9896 }, { "epoch": 0.26065314722149063, "grad_norm": 1.9375555515289307, "learning_rate": 3.697656044245457e-05, "loss": 1.4071, "step": 9897 }, { "epoch": 0.2606794838030024, "grad_norm": 4.147969722747803, "learning_rate": 3.6975243613378986e-05, "loss": 0.9175, "step": 9898 }, { "epoch": 
0.26070582038451406, "grad_norm": 1.8712466955184937, "learning_rate": 3.6973926784303394e-05, "loss": 2.114, "step": 9899 }, { "epoch": 0.2607321569660258, "grad_norm": 1.628216028213501, "learning_rate": 3.697260995522781e-05, "loss": 1.8131, "step": 9900 }, { "epoch": 0.26075849354753755, "grad_norm": 1.7184875011444092, "learning_rate": 3.6971293126152226e-05, "loss": 1.756, "step": 9901 }, { "epoch": 0.26078483012904924, "grad_norm": 2.2704854011535645, "learning_rate": 3.696997629707664e-05, "loss": 0.5708, "step": 9902 }, { "epoch": 0.260811166710561, "grad_norm": 3.0642383098602295, "learning_rate": 3.696865946800106e-05, "loss": 3.1794, "step": 9903 }, { "epoch": 0.26083750329207267, "grad_norm": 2.0807080268859863, "learning_rate": 3.6967342638925466e-05, "loss": 2.1727, "step": 9904 }, { "epoch": 0.2608638398735844, "grad_norm": 1.9923043251037598, "learning_rate": 3.696602580984989e-05, "loss": 1.7169, "step": 9905 }, { "epoch": 0.2608901764550961, "grad_norm": 4.368671417236328, "learning_rate": 3.69647089807743e-05, "loss": 1.3595, "step": 9906 }, { "epoch": 0.26091651303660784, "grad_norm": 1.7777026891708374, "learning_rate": 3.696339215169871e-05, "loss": 1.8276, "step": 9907 }, { "epoch": 0.2609428496181196, "grad_norm": 2.1066837310791016, "learning_rate": 3.696207532262312e-05, "loss": 1.9016, "step": 9908 }, { "epoch": 0.2609691861996313, "grad_norm": 5.21611213684082, "learning_rate": 3.696075849354754e-05, "loss": 1.5036, "step": 9909 }, { "epoch": 0.260995522781143, "grad_norm": 2.433588743209839, "learning_rate": 3.695944166447195e-05, "loss": 1.7911, "step": 9910 }, { "epoch": 0.2610218593626547, "grad_norm": 3.4112401008605957, "learning_rate": 3.695812483539637e-05, "loss": 1.7093, "step": 9911 }, { "epoch": 0.26104819594416645, "grad_norm": 2.103821039199829, "learning_rate": 3.6956808006320784e-05, "loss": 1.9149, "step": 9912 }, { "epoch": 0.2610745325256782, "grad_norm": 2.3162713050842285, "learning_rate": 3.695549117724519e-05, 
"loss": 1.6869, "step": 9913 }, { "epoch": 0.2611008691071899, "grad_norm": 2.0363857746124268, "learning_rate": 3.695417434816961e-05, "loss": 1.6803, "step": 9914 }, { "epoch": 0.2611272056887016, "grad_norm": 3.4832189083099365, "learning_rate": 3.6952857519094024e-05, "loss": 1.8491, "step": 9915 }, { "epoch": 0.2611535422702133, "grad_norm": 3.0477230548858643, "learning_rate": 3.695154069001844e-05, "loss": 1.2986, "step": 9916 }, { "epoch": 0.26117987885172506, "grad_norm": 1.7535345554351807, "learning_rate": 3.695022386094285e-05, "loss": 2.2733, "step": 9917 }, { "epoch": 0.26120621543323674, "grad_norm": 1.6236014366149902, "learning_rate": 3.6948907031867264e-05, "loss": 2.0663, "step": 9918 }, { "epoch": 0.2612325520147485, "grad_norm": 1.8621058464050293, "learning_rate": 3.694759020279168e-05, "loss": 1.5392, "step": 9919 }, { "epoch": 0.26125888859626023, "grad_norm": 1.4390201568603516, "learning_rate": 3.6946273373716095e-05, "loss": 1.9462, "step": 9920 }, { "epoch": 0.2612852251777719, "grad_norm": 3.6695492267608643, "learning_rate": 3.694495654464051e-05, "loss": 1.5147, "step": 9921 }, { "epoch": 0.26131156175928366, "grad_norm": 2.468250274658203, "learning_rate": 3.694363971556492e-05, "loss": 1.2001, "step": 9922 }, { "epoch": 0.26133789834079535, "grad_norm": 2.3285114765167236, "learning_rate": 3.6942322886489335e-05, "loss": 2.0223, "step": 9923 }, { "epoch": 0.2613642349223071, "grad_norm": 1.681993007659912, "learning_rate": 3.694100605741375e-05, "loss": 1.7715, "step": 9924 }, { "epoch": 0.2613905715038188, "grad_norm": 1.577872395515442, "learning_rate": 3.6939689228338166e-05, "loss": 1.9831, "step": 9925 }, { "epoch": 0.2614169080853305, "grad_norm": 3.7698237895965576, "learning_rate": 3.6938372399262575e-05, "loss": 1.3589, "step": 9926 }, { "epoch": 0.26144324466684227, "grad_norm": 2.0850603580474854, "learning_rate": 3.693705557018699e-05, "loss": 1.534, "step": 9927 }, { "epoch": 0.26146958124835395, "grad_norm": 
3.415149211883545, "learning_rate": 3.6935738741111406e-05, "loss": 1.3738, "step": 9928 }, { "epoch": 0.2614959178298657, "grad_norm": 2.7064318656921387, "learning_rate": 3.693442191203582e-05, "loss": 1.9847, "step": 9929 }, { "epoch": 0.2615222544113774, "grad_norm": 2.607041597366333, "learning_rate": 3.693310508296024e-05, "loss": 1.8455, "step": 9930 }, { "epoch": 0.26154859099288913, "grad_norm": 2.6907150745391846, "learning_rate": 3.6931788253884646e-05, "loss": 0.9775, "step": 9931 }, { "epoch": 0.2615749275744008, "grad_norm": 1.8635156154632568, "learning_rate": 3.693047142480906e-05, "loss": 1.5347, "step": 9932 }, { "epoch": 0.26160126415591256, "grad_norm": 2.091691732406616, "learning_rate": 3.692915459573347e-05, "loss": 2.0928, "step": 9933 }, { "epoch": 0.2616276007374243, "grad_norm": 2.8933329582214355, "learning_rate": 3.692783776665789e-05, "loss": 1.2597, "step": 9934 }, { "epoch": 0.261653937318936, "grad_norm": 1.788756012916565, "learning_rate": 3.69265209375823e-05, "loss": 1.6322, "step": 9935 }, { "epoch": 0.26168027390044774, "grad_norm": 5.010241508483887, "learning_rate": 3.692520410850672e-05, "loss": 1.2798, "step": 9936 }, { "epoch": 0.2617066104819594, "grad_norm": 2.166435480117798, "learning_rate": 3.692388727943113e-05, "loss": 2.1485, "step": 9937 }, { "epoch": 0.26173294706347117, "grad_norm": 1.651205062866211, "learning_rate": 3.692257045035555e-05, "loss": 1.5987, "step": 9938 }, { "epoch": 0.26175928364498285, "grad_norm": 2.5741209983825684, "learning_rate": 3.6921253621279964e-05, "loss": 1.7725, "step": 9939 }, { "epoch": 0.2617856202264946, "grad_norm": 1.998192310333252, "learning_rate": 3.691993679220437e-05, "loss": 2.0674, "step": 9940 }, { "epoch": 0.26181195680800634, "grad_norm": 2.596903085708618, "learning_rate": 3.691861996312879e-05, "loss": 2.3742, "step": 9941 }, { "epoch": 0.26183829338951803, "grad_norm": 2.1205623149871826, "learning_rate": 3.69173031340532e-05, "loss": 0.4241, "step": 9942 }, { 
"epoch": 0.26186462997102977, "grad_norm": 1.6825841665267944, "learning_rate": 3.691598630497762e-05, "loss": 1.4807, "step": 9943 }, { "epoch": 0.26189096655254146, "grad_norm": 1.6058809757232666, "learning_rate": 3.691466947590203e-05, "loss": 1.7782, "step": 9944 }, { "epoch": 0.2619173031340532, "grad_norm": 3.9677579402923584, "learning_rate": 3.6913352646826444e-05, "loss": 1.2704, "step": 9945 }, { "epoch": 0.26194363971556495, "grad_norm": 2.0853161811828613, "learning_rate": 3.691203581775085e-05, "loss": 2.0454, "step": 9946 }, { "epoch": 0.26196997629707663, "grad_norm": 2.0083141326904297, "learning_rate": 3.691071898867527e-05, "loss": 0.4336, "step": 9947 }, { "epoch": 0.2619963128785884, "grad_norm": 2.3993773460388184, "learning_rate": 3.6909402159599684e-05, "loss": 1.3382, "step": 9948 }, { "epoch": 0.26202264946010007, "grad_norm": 5.320145606994629, "learning_rate": 3.69080853305241e-05, "loss": 1.0417, "step": 9949 }, { "epoch": 0.2620489860416118, "grad_norm": 3.865410327911377, "learning_rate": 3.6906768501448515e-05, "loss": 0.7176, "step": 9950 }, { "epoch": 0.2620753226231235, "grad_norm": 4.403519153594971, "learning_rate": 3.6905451672372924e-05, "loss": 1.0294, "step": 9951 }, { "epoch": 0.26210165920463524, "grad_norm": 1.9168848991394043, "learning_rate": 3.6904134843297347e-05, "loss": 1.6702, "step": 9952 }, { "epoch": 0.262127995786147, "grad_norm": 1.7266138792037964, "learning_rate": 3.6902818014221755e-05, "loss": 2.1554, "step": 9953 }, { "epoch": 0.26215433236765867, "grad_norm": 3.792888641357422, "learning_rate": 3.690150118514617e-05, "loss": 0.8645, "step": 9954 }, { "epoch": 0.2621806689491704, "grad_norm": 2.359764337539673, "learning_rate": 3.690018435607058e-05, "loss": 1.6779, "step": 9955 }, { "epoch": 0.2622070055306821, "grad_norm": 2.330033302307129, "learning_rate": 3.6898867526994995e-05, "loss": 0.4269, "step": 9956 }, { "epoch": 0.26223334211219385, "grad_norm": 2.988826036453247, "learning_rate": 
3.689755069791941e-05, "loss": 1.5425, "step": 9957 }, { "epoch": 0.26225967869370553, "grad_norm": 2.079436779022217, "learning_rate": 3.6896233868843827e-05, "loss": 1.6766, "step": 9958 }, { "epoch": 0.2622860152752173, "grad_norm": 2.129084348678589, "learning_rate": 3.689491703976824e-05, "loss": 1.7665, "step": 9959 }, { "epoch": 0.262312351856729, "grad_norm": 2.360966920852661, "learning_rate": 3.689360021069265e-05, "loss": 1.7158, "step": 9960 }, { "epoch": 0.2623386884382407, "grad_norm": 1.8067656755447388, "learning_rate": 3.689228338161707e-05, "loss": 1.4482, "step": 9961 }, { "epoch": 0.26236502501975245, "grad_norm": 1.6358373165130615, "learning_rate": 3.689096655254148e-05, "loss": 2.3643, "step": 9962 }, { "epoch": 0.26239136160126414, "grad_norm": 5.185770034790039, "learning_rate": 3.68896497234659e-05, "loss": 1.3845, "step": 9963 }, { "epoch": 0.2624176981827759, "grad_norm": 2.6043448448181152, "learning_rate": 3.688833289439031e-05, "loss": 1.6341, "step": 9964 }, { "epoch": 0.26244403476428757, "grad_norm": 2.543100595474243, "learning_rate": 3.688701606531472e-05, "loss": 2.2213, "step": 9965 }, { "epoch": 0.2624703713457993, "grad_norm": 1.7823302745819092, "learning_rate": 3.688569923623914e-05, "loss": 0.9165, "step": 9966 }, { "epoch": 0.26249670792731106, "grad_norm": 1.6086454391479492, "learning_rate": 3.6884382407163553e-05, "loss": 1.8282, "step": 9967 }, { "epoch": 0.26252304450882274, "grad_norm": 1.7740914821624756, "learning_rate": 3.688306557808797e-05, "loss": 1.5867, "step": 9968 }, { "epoch": 0.2625493810903345, "grad_norm": 2.1103787422180176, "learning_rate": 3.688174874901238e-05, "loss": 1.982, "step": 9969 }, { "epoch": 0.2625757176718462, "grad_norm": 1.7448289394378662, "learning_rate": 3.6880431919936793e-05, "loss": 1.6368, "step": 9970 }, { "epoch": 0.2626020542533579, "grad_norm": 3.1486148834228516, "learning_rate": 3.687911509086121e-05, "loss": 1.4089, "step": 9971 }, { "epoch": 0.2626283908348696, 
"grad_norm": 2.2418668270111084, "learning_rate": 3.6877798261785625e-05, "loss": 1.7981, "step": 9972 }, { "epoch": 0.26265472741638135, "grad_norm": 1.9854317903518677, "learning_rate": 3.6876481432710033e-05, "loss": 1.9868, "step": 9973 }, { "epoch": 0.2626810639978931, "grad_norm": 2.909822463989258, "learning_rate": 3.687516460363445e-05, "loss": 1.4789, "step": 9974 }, { "epoch": 0.2627074005794048, "grad_norm": 1.796769142150879, "learning_rate": 3.6873847774558865e-05, "loss": 2.0325, "step": 9975 }, { "epoch": 0.2627337371609165, "grad_norm": 1.8284469842910767, "learning_rate": 3.687253094548328e-05, "loss": 1.7165, "step": 9976 }, { "epoch": 0.2627600737424282, "grad_norm": 1.6428061723709106, "learning_rate": 3.6871214116407696e-05, "loss": 1.6384, "step": 9977 }, { "epoch": 0.26278641032393996, "grad_norm": 1.7576966285705566, "learning_rate": 3.6869897287332105e-05, "loss": 0.9424, "step": 9978 }, { "epoch": 0.2628127469054517, "grad_norm": 2.577503204345703, "learning_rate": 3.686858045825652e-05, "loss": 2.1824, "step": 9979 }, { "epoch": 0.2628390834869634, "grad_norm": 1.9705294370651245, "learning_rate": 3.686726362918093e-05, "loss": 0.4674, "step": 9980 }, { "epoch": 0.26286542006847513, "grad_norm": 1.730053186416626, "learning_rate": 3.686594680010535e-05, "loss": 1.8857, "step": 9981 }, { "epoch": 0.2628917566499868, "grad_norm": 6.366098403930664, "learning_rate": 3.686462997102976e-05, "loss": 1.4864, "step": 9982 }, { "epoch": 0.26291809323149856, "grad_norm": 2.1583688259124756, "learning_rate": 3.6863313141954176e-05, "loss": 1.4697, "step": 9983 }, { "epoch": 0.26294442981301025, "grad_norm": 1.7725675106048584, "learning_rate": 3.686199631287859e-05, "loss": 2.431, "step": 9984 }, { "epoch": 0.262970766394522, "grad_norm": 5.0243072509765625, "learning_rate": 3.686067948380301e-05, "loss": 1.1403, "step": 9985 }, { "epoch": 0.26299710297603374, "grad_norm": 3.0046603679656982, "learning_rate": 3.685936265472742e-05, "loss": 0.8956, 
"step": 9986 }, { "epoch": 0.2630234395575454, "grad_norm": 3.7581191062927246, "learning_rate": 3.685804582565183e-05, "loss": 1.2484, "step": 9987 }, { "epoch": 0.26304977613905717, "grad_norm": 3.3439056873321533, "learning_rate": 3.685672899657625e-05, "loss": 1.5957, "step": 9988 }, { "epoch": 0.26307611272056886, "grad_norm": 5.87441349029541, "learning_rate": 3.6855412167500656e-05, "loss": 1.4431, "step": 9989 }, { "epoch": 0.2631024493020806, "grad_norm": 1.8784219026565552, "learning_rate": 3.685409533842508e-05, "loss": 1.949, "step": 9990 }, { "epoch": 0.2631287858835923, "grad_norm": 2.4301981925964355, "learning_rate": 3.685277850934949e-05, "loss": 2.1704, "step": 9991 }, { "epoch": 0.26315512246510403, "grad_norm": 7.405130386352539, "learning_rate": 3.68514616802739e-05, "loss": 1.0526, "step": 9992 }, { "epoch": 0.2631814590466158, "grad_norm": 3.237091064453125, "learning_rate": 3.685014485119832e-05, "loss": 1.5404, "step": 9993 }, { "epoch": 0.26320779562812746, "grad_norm": 3.4121577739715576, "learning_rate": 3.684882802212273e-05, "loss": 2.0441, "step": 9994 }, { "epoch": 0.2632341322096392, "grad_norm": 2.61460018157959, "learning_rate": 3.684751119304715e-05, "loss": 2.4158, "step": 9995 }, { "epoch": 0.2632604687911509, "grad_norm": 1.840182900428772, "learning_rate": 3.684619436397156e-05, "loss": 1.8731, "step": 9996 }, { "epoch": 0.26328680537266264, "grad_norm": 2.847264051437378, "learning_rate": 3.6844877534895974e-05, "loss": 1.2876, "step": 9997 }, { "epoch": 0.2633131419541743, "grad_norm": 3.600921869277954, "learning_rate": 3.684356070582038e-05, "loss": 2.1099, "step": 9998 }, { "epoch": 0.26333947853568607, "grad_norm": 1.7776530981063843, "learning_rate": 3.6842243876744805e-05, "loss": 1.6777, "step": 9999 }, { "epoch": 0.2633658151171978, "grad_norm": 2.220614194869995, "learning_rate": 3.6840927047669214e-05, "loss": 1.7253, "step": 10000 }, { "epoch": 0.2633921516987095, "grad_norm": 2.796560525894165, "learning_rate": 
3.683961021859363e-05, "loss": 1.5488, "step": 10001 }, { "epoch": 0.26341848828022124, "grad_norm": 2.200914144515991, "learning_rate": 3.683829338951804e-05, "loss": 2.0682, "step": 10002 }, { "epoch": 0.26344482486173293, "grad_norm": 2.2607386112213135, "learning_rate": 3.6836976560442454e-05, "loss": 1.8996, "step": 10003 }, { "epoch": 0.2634711614432447, "grad_norm": 1.764689564704895, "learning_rate": 3.683565973136687e-05, "loss": 2.3482, "step": 10004 }, { "epoch": 0.26349749802475636, "grad_norm": 2.6437742710113525, "learning_rate": 3.6834342902291285e-05, "loss": 1.8611, "step": 10005 }, { "epoch": 0.2635238346062681, "grad_norm": 3.04620099067688, "learning_rate": 3.68330260732157e-05, "loss": 1.2221, "step": 10006 }, { "epoch": 0.26355017118777985, "grad_norm": 2.4913136959075928, "learning_rate": 3.683170924414011e-05, "loss": 1.737, "step": 10007 }, { "epoch": 0.26357650776929153, "grad_norm": 2.2860565185546875, "learning_rate": 3.6830392415064525e-05, "loss": 1.5518, "step": 10008 }, { "epoch": 0.2636028443508033, "grad_norm": 2.7356841564178467, "learning_rate": 3.682907558598894e-05, "loss": 2.9802, "step": 10009 }, { "epoch": 0.26362918093231497, "grad_norm": 2.5983242988586426, "learning_rate": 3.6827758756913356e-05, "loss": 2.0625, "step": 10010 }, { "epoch": 0.2636555175138267, "grad_norm": 2.2959084510803223, "learning_rate": 3.6826441927837765e-05, "loss": 2.1491, "step": 10011 }, { "epoch": 0.26368185409533845, "grad_norm": 1.8262606859207153, "learning_rate": 3.682512509876218e-05, "loss": 1.7723, "step": 10012 }, { "epoch": 0.26370819067685014, "grad_norm": 2.623422145843506, "learning_rate": 3.6823808269686596e-05, "loss": 1.3331, "step": 10013 }, { "epoch": 0.2637345272583619, "grad_norm": 1.9574443101882935, "learning_rate": 3.682249144061101e-05, "loss": 1.6323, "step": 10014 }, { "epoch": 0.26376086383987357, "grad_norm": 5.081628799438477, "learning_rate": 3.682117461153543e-05, "loss": 1.25, "step": 10015 }, { "epoch": 
0.2637872004213853, "grad_norm": 4.234491348266602, "learning_rate": 3.6819857782459836e-05, "loss": 1.0458, "step": 10016 }, { "epoch": 0.263813537002897, "grad_norm": 1.6453907489776611, "learning_rate": 3.681854095338425e-05, "loss": 1.5962, "step": 10017 }, { "epoch": 0.26383987358440875, "grad_norm": 2.469261884689331, "learning_rate": 3.681722412430867e-05, "loss": 1.741, "step": 10018 }, { "epoch": 0.2638662101659205, "grad_norm": 4.05128812789917, "learning_rate": 3.681590729523308e-05, "loss": 0.7735, "step": 10019 }, { "epoch": 0.2638925467474322, "grad_norm": 2.9753005504608154, "learning_rate": 3.681459046615749e-05, "loss": 0.9617, "step": 10020 }, { "epoch": 0.2639188833289439, "grad_norm": 1.9553937911987305, "learning_rate": 3.681327363708191e-05, "loss": 1.8002, "step": 10021 }, { "epoch": 0.2639452199104556, "grad_norm": 1.9491946697235107, "learning_rate": 3.681195680800632e-05, "loss": 1.2068, "step": 10022 }, { "epoch": 0.26397155649196735, "grad_norm": 2.2591712474823, "learning_rate": 3.681063997893074e-05, "loss": 1.7033, "step": 10023 }, { "epoch": 0.26399789307347904, "grad_norm": 2.861011028289795, "learning_rate": 3.6809323149855154e-05, "loss": 1.6876, "step": 10024 }, { "epoch": 0.2640242296549908, "grad_norm": 1.7646710872650146, "learning_rate": 3.680800632077956e-05, "loss": 2.0335, "step": 10025 }, { "epoch": 0.2640505662365025, "grad_norm": 2.0951666831970215, "learning_rate": 3.680668949170398e-05, "loss": 2.422, "step": 10026 }, { "epoch": 0.2640769028180142, "grad_norm": 2.4043684005737305, "learning_rate": 3.680537266262839e-05, "loss": 1.8476, "step": 10027 }, { "epoch": 0.26410323939952596, "grad_norm": 2.605814218521118, "learning_rate": 3.680405583355281e-05, "loss": 1.9494, "step": 10028 }, { "epoch": 0.26412957598103765, "grad_norm": 2.503455400466919, "learning_rate": 3.680273900447722e-05, "loss": 0.2176, "step": 10029 }, { "epoch": 0.2641559125625494, "grad_norm": 1.784052848815918, "learning_rate": 
3.6801422175401634e-05, "loss": 1.5564, "step": 10030 }, { "epoch": 0.2641822491440611, "grad_norm": 1.4798592329025269, "learning_rate": 3.680010534632605e-05, "loss": 1.5415, "step": 10031 }, { "epoch": 0.2642085857255728, "grad_norm": 2.9827847480773926, "learning_rate": 3.6798788517250466e-05, "loss": 1.5892, "step": 10032 }, { "epoch": 0.26423492230708456, "grad_norm": 1.920401930809021, "learning_rate": 3.679747168817488e-05, "loss": 2.3112, "step": 10033 }, { "epoch": 0.26426125888859625, "grad_norm": 3.290771961212158, "learning_rate": 3.679615485909929e-05, "loss": 1.7108, "step": 10034 }, { "epoch": 0.264287595470108, "grad_norm": 2.365751266479492, "learning_rate": 3.6794838030023706e-05, "loss": 1.9305, "step": 10035 }, { "epoch": 0.2643139320516197, "grad_norm": 2.2461624145507812, "learning_rate": 3.6793521200948114e-05, "loss": 2.036, "step": 10036 }, { "epoch": 0.2643402686331314, "grad_norm": 2.9866960048675537, "learning_rate": 3.679220437187254e-05, "loss": 1.0285, "step": 10037 }, { "epoch": 0.2643666052146431, "grad_norm": 3.6891326904296875, "learning_rate": 3.6790887542796946e-05, "loss": 2.0806, "step": 10038 }, { "epoch": 0.26439294179615486, "grad_norm": 2.289691925048828, "learning_rate": 3.678957071372136e-05, "loss": 0.6964, "step": 10039 }, { "epoch": 0.2644192783776666, "grad_norm": 5.166062831878662, "learning_rate": 3.678825388464578e-05, "loss": 0.8084, "step": 10040 }, { "epoch": 0.2644456149591783, "grad_norm": 2.6089706420898438, "learning_rate": 3.6786937055570186e-05, "loss": 2.2339, "step": 10041 }, { "epoch": 0.26447195154069003, "grad_norm": 2.1294283866882324, "learning_rate": 3.678562022649461e-05, "loss": 2.0517, "step": 10042 }, { "epoch": 0.2644982881222017, "grad_norm": 1.9592230319976807, "learning_rate": 3.678430339741902e-05, "loss": 1.9269, "step": 10043 }, { "epoch": 0.26452462470371346, "grad_norm": 1.92788565158844, "learning_rate": 3.678298656834343e-05, "loss": 1.7954, "step": 10044 }, { "epoch": 
0.26455096128522515, "grad_norm": 3.0028562545776367, "learning_rate": 3.678166973926784e-05, "loss": 1.7689, "step": 10045 }, { "epoch": 0.2645772978667369, "grad_norm": 4.070221424102783, "learning_rate": 3.678035291019226e-05, "loss": 1.4069, "step": 10046 }, { "epoch": 0.26460363444824864, "grad_norm": 1.8371425867080688, "learning_rate": 3.677903608111667e-05, "loss": 2.8702, "step": 10047 }, { "epoch": 0.2646299710297603, "grad_norm": 1.7458940744400024, "learning_rate": 3.677771925204109e-05, "loss": 2.056, "step": 10048 }, { "epoch": 0.26465630761127207, "grad_norm": 9.552285194396973, "learning_rate": 3.67764024229655e-05, "loss": 2.6065, "step": 10049 }, { "epoch": 0.26468264419278376, "grad_norm": 1.8528327941894531, "learning_rate": 3.677508559388991e-05, "loss": 1.7531, "step": 10050 }, { "epoch": 0.2647089807742955, "grad_norm": 2.379115581512451, "learning_rate": 3.677376876481433e-05, "loss": 2.5959, "step": 10051 }, { "epoch": 0.26473531735580724, "grad_norm": 2.3762495517730713, "learning_rate": 3.6772451935738744e-05, "loss": 1.7078, "step": 10052 }, { "epoch": 0.26476165393731893, "grad_norm": 2.559075355529785, "learning_rate": 3.677113510666316e-05, "loss": 2.0787, "step": 10053 }, { "epoch": 0.2647879905188307, "grad_norm": 1.5611319541931152, "learning_rate": 3.676981827758757e-05, "loss": 1.6679, "step": 10054 }, { "epoch": 0.26481432710034236, "grad_norm": 2.640212059020996, "learning_rate": 3.6768501448511984e-05, "loss": 1.3131, "step": 10055 }, { "epoch": 0.2648406636818541, "grad_norm": 2.510906219482422, "learning_rate": 3.67671846194364e-05, "loss": 0.6519, "step": 10056 }, { "epoch": 0.2648670002633658, "grad_norm": 2.74783992767334, "learning_rate": 3.6765867790360815e-05, "loss": 2.6449, "step": 10057 }, { "epoch": 0.26489333684487754, "grad_norm": 1.6115412712097168, "learning_rate": 3.6764550961285224e-05, "loss": 1.4667, "step": 10058 }, { "epoch": 0.2649196734263893, "grad_norm": 2.5120208263397217, "learning_rate": 
3.676323413220964e-05, "loss": 0.5017, "step": 10059 }, { "epoch": 0.26494601000790097, "grad_norm": 4.157475471496582, "learning_rate": 3.6761917303134055e-05, "loss": 2.4723, "step": 10060 }, { "epoch": 0.2649723465894127, "grad_norm": 4.212089538574219, "learning_rate": 3.676060047405847e-05, "loss": 1.3918, "step": 10061 }, { "epoch": 0.2649986831709244, "grad_norm": 1.997915506362915, "learning_rate": 3.6759283644982886e-05, "loss": 1.2492, "step": 10062 }, { "epoch": 0.26502501975243614, "grad_norm": 2.1331331729888916, "learning_rate": 3.6757966815907295e-05, "loss": 2.1265, "step": 10063 }, { "epoch": 0.26505135633394783, "grad_norm": 2.222640037536621, "learning_rate": 3.675664998683171e-05, "loss": 1.0801, "step": 10064 }, { "epoch": 0.2650776929154596, "grad_norm": 2.3524866104125977, "learning_rate": 3.6755333157756126e-05, "loss": 2.6966, "step": 10065 }, { "epoch": 0.2651040294969713, "grad_norm": 2.105419158935547, "learning_rate": 3.675401632868054e-05, "loss": 1.1735, "step": 10066 }, { "epoch": 0.265130366078483, "grad_norm": 3.2529942989349365, "learning_rate": 3.675269949960495e-05, "loss": 1.5403, "step": 10067 }, { "epoch": 0.26515670265999475, "grad_norm": 1.942391037940979, "learning_rate": 3.6751382670529366e-05, "loss": 1.7382, "step": 10068 }, { "epoch": 0.26518303924150644, "grad_norm": 3.163064479827881, "learning_rate": 3.675006584145378e-05, "loss": 1.8874, "step": 10069 }, { "epoch": 0.2652093758230182, "grad_norm": 2.4406042098999023, "learning_rate": 3.67487490123782e-05, "loss": 1.7741, "step": 10070 }, { "epoch": 0.26523571240452987, "grad_norm": 1.6882086992263794, "learning_rate": 3.674743218330261e-05, "loss": 1.0368, "step": 10071 }, { "epoch": 0.2652620489860416, "grad_norm": 1.9586982727050781, "learning_rate": 3.674611535422702e-05, "loss": 2.183, "step": 10072 }, { "epoch": 0.26528838556755335, "grad_norm": 1.9157731533050537, "learning_rate": 3.674479852515144e-05, "loss": 1.4521, "step": 10073 }, { "epoch": 
0.26531472214906504, "grad_norm": 1.4000648260116577, "learning_rate": 3.6743481696075846e-05, "loss": 1.6729, "step": 10074 }, { "epoch": 0.2653410587305768, "grad_norm": 2.051919937133789, "learning_rate": 3.674216486700027e-05, "loss": 2.5286, "step": 10075 }, { "epoch": 0.26536739531208847, "grad_norm": 2.4728801250457764, "learning_rate": 3.674084803792468e-05, "loss": 1.3984, "step": 10076 }, { "epoch": 0.2653937318936002, "grad_norm": 1.9722304344177246, "learning_rate": 3.673953120884909e-05, "loss": 1.5037, "step": 10077 }, { "epoch": 0.2654200684751119, "grad_norm": 2.188192367553711, "learning_rate": 3.673821437977351e-05, "loss": 2.309, "step": 10078 }, { "epoch": 0.26544640505662365, "grad_norm": 5.090904712677002, "learning_rate": 3.673689755069792e-05, "loss": 0.7312, "step": 10079 }, { "epoch": 0.2654727416381354, "grad_norm": 3.69519305229187, "learning_rate": 3.673558072162234e-05, "loss": 0.7831, "step": 10080 }, { "epoch": 0.2654990782196471, "grad_norm": 2.031735897064209, "learning_rate": 3.673426389254675e-05, "loss": 1.2649, "step": 10081 }, { "epoch": 0.2655254148011588, "grad_norm": 2.5361666679382324, "learning_rate": 3.6732947063471164e-05, "loss": 1.8613, "step": 10082 }, { "epoch": 0.2655517513826705, "grad_norm": 2.226508378982544, "learning_rate": 3.673163023439557e-05, "loss": 0.9733, "step": 10083 }, { "epoch": 0.26557808796418225, "grad_norm": 3.9375691413879395, "learning_rate": 3.6730313405319995e-05, "loss": 0.7214, "step": 10084 }, { "epoch": 0.265604424545694, "grad_norm": 1.6749688386917114, "learning_rate": 3.6728996576244404e-05, "loss": 1.9015, "step": 10085 }, { "epoch": 0.2656307611272057, "grad_norm": 4.377026557922363, "learning_rate": 3.672767974716882e-05, "loss": 2.2142, "step": 10086 }, { "epoch": 0.2656570977087174, "grad_norm": 3.526095151901245, "learning_rate": 3.6726362918093235e-05, "loss": 2.3948, "step": 10087 }, { "epoch": 0.2656834342902291, "grad_norm": 4.275936603546143, "learning_rate": 
3.6725046089017644e-05, "loss": 1.2027, "step": 10088 }, { "epoch": 0.26570977087174086, "grad_norm": 2.2248125076293945, "learning_rate": 3.6723729259942067e-05, "loss": 1.8308, "step": 10089 }, { "epoch": 0.26573610745325255, "grad_norm": 2.8518710136413574, "learning_rate": 3.6722412430866475e-05, "loss": 1.6113, "step": 10090 }, { "epoch": 0.2657624440347643, "grad_norm": 2.782762050628662, "learning_rate": 3.672109560179089e-05, "loss": 1.4446, "step": 10091 }, { "epoch": 0.26578878061627603, "grad_norm": 2.3260200023651123, "learning_rate": 3.67197787727153e-05, "loss": 1.5937, "step": 10092 }, { "epoch": 0.2658151171977877, "grad_norm": 2.2114033699035645, "learning_rate": 3.6718461943639715e-05, "loss": 2.2396, "step": 10093 }, { "epoch": 0.26584145377929946, "grad_norm": 1.9129087924957275, "learning_rate": 3.671714511456413e-05, "loss": 1.4507, "step": 10094 }, { "epoch": 0.26586779036081115, "grad_norm": 1.6789244413375854, "learning_rate": 3.671582828548855e-05, "loss": 1.6753, "step": 10095 }, { "epoch": 0.2658941269423229, "grad_norm": 1.5938291549682617, "learning_rate": 3.671451145641296e-05, "loss": 1.8683, "step": 10096 }, { "epoch": 0.2659204635238346, "grad_norm": 4.494757652282715, "learning_rate": 3.671319462733737e-05, "loss": 1.4146, "step": 10097 }, { "epoch": 0.2659468001053463, "grad_norm": 1.9412486553192139, "learning_rate": 3.6711877798261793e-05, "loss": 1.3101, "step": 10098 }, { "epoch": 0.26597313668685807, "grad_norm": 8.31187915802002, "learning_rate": 3.67105609691862e-05, "loss": 2.1754, "step": 10099 }, { "epoch": 0.26599947326836976, "grad_norm": 2.717020273208618, "learning_rate": 3.670924414011062e-05, "loss": 1.5488, "step": 10100 }, { "epoch": 0.2660258098498815, "grad_norm": 1.9534482955932617, "learning_rate": 3.670792731103503e-05, "loss": 1.6723, "step": 10101 }, { "epoch": 0.2660521464313932, "grad_norm": 4.684081077575684, "learning_rate": 3.670661048195944e-05, "loss": 1.4647, "step": 10102 }, { "epoch": 
0.26607848301290493, "grad_norm": 2.3786094188690186, "learning_rate": 3.670529365288386e-05, "loss": 1.2828, "step": 10103 }, { "epoch": 0.2661048195944166, "grad_norm": 3.372189521789551, "learning_rate": 3.6703976823808273e-05, "loss": 0.991, "step": 10104 }, { "epoch": 0.26613115617592836, "grad_norm": 2.048107862472534, "learning_rate": 3.670265999473268e-05, "loss": 2.0632, "step": 10105 }, { "epoch": 0.2661574927574401, "grad_norm": 1.9173202514648438, "learning_rate": 3.67013431656571e-05, "loss": 1.7183, "step": 10106 }, { "epoch": 0.2661838293389518, "grad_norm": 2.5450634956359863, "learning_rate": 3.6700026336581513e-05, "loss": 1.9082, "step": 10107 }, { "epoch": 0.26621016592046354, "grad_norm": 2.0234475135803223, "learning_rate": 3.669870950750593e-05, "loss": 1.7987, "step": 10108 }, { "epoch": 0.2662365025019752, "grad_norm": 4.430846214294434, "learning_rate": 3.6697392678430345e-05, "loss": 1.1304, "step": 10109 }, { "epoch": 0.26626283908348697, "grad_norm": 2.909736156463623, "learning_rate": 3.6696075849354754e-05, "loss": 1.7499, "step": 10110 }, { "epoch": 0.26628917566499866, "grad_norm": 1.8816019296646118, "learning_rate": 3.669475902027917e-05, "loss": 0.3428, "step": 10111 }, { "epoch": 0.2663155122465104, "grad_norm": 4.846202373504639, "learning_rate": 3.669344219120358e-05, "loss": 0.7626, "step": 10112 }, { "epoch": 0.26634184882802214, "grad_norm": 1.4240412712097168, "learning_rate": 3.6692125362128e-05, "loss": 1.7212, "step": 10113 }, { "epoch": 0.26636818540953383, "grad_norm": 1.6693060398101807, "learning_rate": 3.669080853305241e-05, "loss": 1.2432, "step": 10114 }, { "epoch": 0.2663945219910456, "grad_norm": 2.1589486598968506, "learning_rate": 3.6689491703976825e-05, "loss": 0.5404, "step": 10115 }, { "epoch": 0.26642085857255726, "grad_norm": 2.379560947418213, "learning_rate": 3.668817487490124e-05, "loss": 2.0994, "step": 10116 }, { "epoch": 0.266447195154069, "grad_norm": 2.8508524894714355, "learning_rate": 
3.6686858045825656e-05, "loss": 1.7082, "step": 10117 }, { "epoch": 0.26647353173558075, "grad_norm": 2.458432912826538, "learning_rate": 3.668554121675007e-05, "loss": 2.1248, "step": 10118 }, { "epoch": 0.26649986831709244, "grad_norm": 3.4483277797698975, "learning_rate": 3.668422438767448e-05, "loss": 1.3698, "step": 10119 }, { "epoch": 0.2665262048986042, "grad_norm": 1.8100333213806152, "learning_rate": 3.6682907558598896e-05, "loss": 2.3127, "step": 10120 }, { "epoch": 0.26655254148011587, "grad_norm": 2.026071548461914, "learning_rate": 3.6681590729523305e-05, "loss": 1.992, "step": 10121 }, { "epoch": 0.2665788780616276, "grad_norm": 2.3289945125579834, "learning_rate": 3.668027390044773e-05, "loss": 1.6703, "step": 10122 }, { "epoch": 0.2666052146431393, "grad_norm": 1.7287365198135376, "learning_rate": 3.6678957071372136e-05, "loss": 2.3727, "step": 10123 }, { "epoch": 0.26663155122465104, "grad_norm": 1.7984932661056519, "learning_rate": 3.667764024229655e-05, "loss": 1.7627, "step": 10124 }, { "epoch": 0.2666578878061628, "grad_norm": 1.8620436191558838, "learning_rate": 3.667632341322097e-05, "loss": 2.183, "step": 10125 }, { "epoch": 0.2666842243876745, "grad_norm": 1.9804375171661377, "learning_rate": 3.6675006584145376e-05, "loss": 1.1698, "step": 10126 }, { "epoch": 0.2667105609691862, "grad_norm": 3.017799139022827, "learning_rate": 3.66736897550698e-05, "loss": 0.9595, "step": 10127 }, { "epoch": 0.2667368975506979, "grad_norm": 2.9736342430114746, "learning_rate": 3.667237292599421e-05, "loss": 1.3165, "step": 10128 }, { "epoch": 0.26676323413220965, "grad_norm": 1.9643712043762207, "learning_rate": 3.667105609691862e-05, "loss": 0.4521, "step": 10129 }, { "epoch": 0.26678957071372134, "grad_norm": 6.517406463623047, "learning_rate": 3.666973926784303e-05, "loss": 1.2143, "step": 10130 }, { "epoch": 0.2668159072952331, "grad_norm": 1.7456799745559692, "learning_rate": 3.6668422438767454e-05, "loss": 2.1619, "step": 10131 }, { "epoch": 
0.2668422438767448, "grad_norm": 2.0272860527038574, "learning_rate": 3.666710560969186e-05, "loss": 1.5795, "step": 10132 }, { "epoch": 0.2668685804582565, "grad_norm": 2.6654109954833984, "learning_rate": 3.666578878061628e-05, "loss": 1.6556, "step": 10133 }, { "epoch": 0.26689491703976825, "grad_norm": 3.296703338623047, "learning_rate": 3.6664471951540694e-05, "loss": 0.6678, "step": 10134 }, { "epoch": 0.26692125362127994, "grad_norm": 1.7900242805480957, "learning_rate": 3.66631551224651e-05, "loss": 1.8282, "step": 10135 }, { "epoch": 0.2669475902027917, "grad_norm": 1.3976291418075562, "learning_rate": 3.6661838293389525e-05, "loss": 1.3654, "step": 10136 }, { "epoch": 0.2669739267843034, "grad_norm": 2.2098259925842285, "learning_rate": 3.6660521464313934e-05, "loss": 0.9748, "step": 10137 }, { "epoch": 0.2670002633658151, "grad_norm": 3.5597519874572754, "learning_rate": 3.665920463523835e-05, "loss": 0.7587, "step": 10138 }, { "epoch": 0.26702659994732686, "grad_norm": 1.924920916557312, "learning_rate": 3.665788780616276e-05, "loss": 1.4954, "step": 10139 }, { "epoch": 0.26705293652883855, "grad_norm": 2.8917338848114014, "learning_rate": 3.6656570977087174e-05, "loss": 1.2898, "step": 10140 }, { "epoch": 0.2670792731103503, "grad_norm": 2.3523623943328857, "learning_rate": 3.665525414801159e-05, "loss": 1.8597, "step": 10141 }, { "epoch": 0.267105609691862, "grad_norm": 1.6046857833862305, "learning_rate": 3.6653937318936005e-05, "loss": 1.63, "step": 10142 }, { "epoch": 0.2671319462733737, "grad_norm": 3.940185546875, "learning_rate": 3.665262048986042e-05, "loss": 1.4117, "step": 10143 }, { "epoch": 0.2671582828548854, "grad_norm": 2.744216203689575, "learning_rate": 3.665130366078483e-05, "loss": 1.0407, "step": 10144 }, { "epoch": 0.26718461943639715, "grad_norm": 1.977405309677124, "learning_rate": 3.6649986831709245e-05, "loss": 1.4877, "step": 10145 }, { "epoch": 0.2672109560179089, "grad_norm": 1.750759243965149, "learning_rate": 
3.664867000263366e-05, "loss": 1.9072, "step": 10146 }, { "epoch": 0.2672372925994206, "grad_norm": 1.7603579759597778, "learning_rate": 3.6647353173558076e-05, "loss": 1.9183, "step": 10147 }, { "epoch": 0.2672636291809323, "grad_norm": 1.503250241279602, "learning_rate": 3.6646036344482485e-05, "loss": 1.9238, "step": 10148 }, { "epoch": 0.267289965762444, "grad_norm": 3.797058343887329, "learning_rate": 3.66447195154069e-05, "loss": 0.977, "step": 10149 }, { "epoch": 0.26731630234395576, "grad_norm": 2.008352756500244, "learning_rate": 3.6643402686331316e-05, "loss": 1.9535, "step": 10150 }, { "epoch": 0.2673426389254675, "grad_norm": 2.1281659603118896, "learning_rate": 3.664208585725573e-05, "loss": 2.8871, "step": 10151 }, { "epoch": 0.2673689755069792, "grad_norm": 2.482161521911621, "learning_rate": 3.664076902818014e-05, "loss": 1.6571, "step": 10152 }, { "epoch": 0.26739531208849093, "grad_norm": 2.4182276725769043, "learning_rate": 3.6639452199104556e-05, "loss": 0.7212, "step": 10153 }, { "epoch": 0.2674216486700026, "grad_norm": 2.4475228786468506, "learning_rate": 3.663813537002897e-05, "loss": 2.244, "step": 10154 }, { "epoch": 0.26744798525151436, "grad_norm": 2.8437564373016357, "learning_rate": 3.663681854095339e-05, "loss": 2.1432, "step": 10155 }, { "epoch": 0.26747432183302605, "grad_norm": 1.8764352798461914, "learning_rate": 3.66355017118778e-05, "loss": 2.2595, "step": 10156 }, { "epoch": 0.2675006584145378, "grad_norm": 1.4289652109146118, "learning_rate": 3.663418488280221e-05, "loss": 0.1806, "step": 10157 }, { "epoch": 0.26752699499604954, "grad_norm": 2.258349657058716, "learning_rate": 3.663286805372663e-05, "loss": 1.8163, "step": 10158 }, { "epoch": 0.2675533315775612, "grad_norm": 1.6824095249176025, "learning_rate": 3.6631551224651036e-05, "loss": 1.5649, "step": 10159 }, { "epoch": 0.26757966815907297, "grad_norm": 2.620978355407715, "learning_rate": 3.663023439557546e-05, "loss": 2.3284, "step": 10160 }, { "epoch": 
0.26760600474058466, "grad_norm": 1.7957783937454224, "learning_rate": 3.662891756649987e-05, "loss": 2.1384, "step": 10161 }, { "epoch": 0.2676323413220964, "grad_norm": 2.219515800476074, "learning_rate": 3.662760073742428e-05, "loss": 1.3335, "step": 10162 }, { "epoch": 0.2676586779036081, "grad_norm": 1.5960590839385986, "learning_rate": 3.66262839083487e-05, "loss": 1.7403, "step": 10163 }, { "epoch": 0.26768501448511983, "grad_norm": 2.2094027996063232, "learning_rate": 3.6624967079273114e-05, "loss": 1.9251, "step": 10164 }, { "epoch": 0.2677113510666316, "grad_norm": 1.57603120803833, "learning_rate": 3.662365025019753e-05, "loss": 2.1808, "step": 10165 }, { "epoch": 0.26773768764814326, "grad_norm": 2.3066632747650146, "learning_rate": 3.662233342112194e-05, "loss": 1.9552, "step": 10166 }, { "epoch": 0.267764024229655, "grad_norm": 1.8146836757659912, "learning_rate": 3.6621016592046354e-05, "loss": 2.3886, "step": 10167 }, { "epoch": 0.2677903608111667, "grad_norm": 2.3190245628356934, "learning_rate": 3.661969976297076e-05, "loss": 2.0307, "step": 10168 }, { "epoch": 0.26781669739267844, "grad_norm": 2.938628673553467, "learning_rate": 3.6618382933895186e-05, "loss": 1.5953, "step": 10169 }, { "epoch": 0.2678430339741901, "grad_norm": 2.054081439971924, "learning_rate": 3.6617066104819594e-05, "loss": 1.9068, "step": 10170 }, { "epoch": 0.26786937055570187, "grad_norm": 1.9563339948654175, "learning_rate": 3.661574927574401e-05, "loss": 1.6163, "step": 10171 }, { "epoch": 0.2678957071372136, "grad_norm": 3.8861870765686035, "learning_rate": 3.6614432446668426e-05, "loss": 0.9697, "step": 10172 }, { "epoch": 0.2679220437187253, "grad_norm": 2.57987642288208, "learning_rate": 3.6613115617592835e-05, "loss": 2.7066, "step": 10173 }, { "epoch": 0.26794838030023704, "grad_norm": 2.2081856727600098, "learning_rate": 3.661179878851726e-05, "loss": 1.7714, "step": 10174 }, { "epoch": 0.26797471688174873, "grad_norm": 2.464604616165161, "learning_rate": 
3.6610481959441666e-05, "loss": 0.6583, "step": 10175 }, { "epoch": 0.2680010534632605, "grad_norm": 1.5755434036254883, "learning_rate": 3.660916513036608e-05, "loss": 0.5501, "step": 10176 }, { "epoch": 0.26802739004477216, "grad_norm": 1.4400713443756104, "learning_rate": 3.660784830129049e-05, "loss": 1.4302, "step": 10177 }, { "epoch": 0.2680537266262839, "grad_norm": 2.777074098587036, "learning_rate": 3.6606531472214906e-05, "loss": 1.2679, "step": 10178 }, { "epoch": 0.26808006320779565, "grad_norm": 2.064556837081909, "learning_rate": 3.660521464313932e-05, "loss": 2.2791, "step": 10179 }, { "epoch": 0.26810639978930734, "grad_norm": 6.590086460113525, "learning_rate": 3.660389781406374e-05, "loss": 1.3977, "step": 10180 }, { "epoch": 0.2681327363708191, "grad_norm": 2.8759360313415527, "learning_rate": 3.660258098498815e-05, "loss": 1.6432, "step": 10181 }, { "epoch": 0.26815907295233077, "grad_norm": 2.0994532108306885, "learning_rate": 3.660126415591256e-05, "loss": 1.9579, "step": 10182 }, { "epoch": 0.2681854095338425, "grad_norm": 2.383129596710205, "learning_rate": 3.6599947326836984e-05, "loss": 1.7171, "step": 10183 }, { "epoch": 0.2682117461153542, "grad_norm": 3.364213466644287, "learning_rate": 3.659863049776139e-05, "loss": 0.6375, "step": 10184 }, { "epoch": 0.26823808269686594, "grad_norm": 4.401752948760986, "learning_rate": 3.659731366868581e-05, "loss": 2.4599, "step": 10185 }, { "epoch": 0.2682644192783777, "grad_norm": 2.8946151733398438, "learning_rate": 3.659599683961022e-05, "loss": 2.0729, "step": 10186 }, { "epoch": 0.2682907558598894, "grad_norm": 1.7243560552597046, "learning_rate": 3.659468001053463e-05, "loss": 1.4998, "step": 10187 }, { "epoch": 0.2683170924414011, "grad_norm": 1.4112474918365479, "learning_rate": 3.659336318145905e-05, "loss": 1.7683, "step": 10188 }, { "epoch": 0.2683434290229128, "grad_norm": 1.7918068170547485, "learning_rate": 3.6592046352383464e-05, "loss": 2.0734, "step": 10189 }, { "epoch": 
0.26836976560442455, "grad_norm": 1.8346103429794312, "learning_rate": 3.659072952330788e-05, "loss": 1.7252, "step": 10190 }, { "epoch": 0.2683961021859363, "grad_norm": 1.9054176807403564, "learning_rate": 3.658941269423229e-05, "loss": 2.2812, "step": 10191 }, { "epoch": 0.268422438767448, "grad_norm": 3.0302562713623047, "learning_rate": 3.6588095865156704e-05, "loss": 1.2557, "step": 10192 }, { "epoch": 0.2684487753489597, "grad_norm": 1.806452989578247, "learning_rate": 3.658677903608112e-05, "loss": 1.9694, "step": 10193 }, { "epoch": 0.2684751119304714, "grad_norm": 1.979327917098999, "learning_rate": 3.6585462207005535e-05, "loss": 1.3223, "step": 10194 }, { "epoch": 0.26850144851198315, "grad_norm": 2.0761165618896484, "learning_rate": 3.6584145377929944e-05, "loss": 2.0764, "step": 10195 }, { "epoch": 0.26852778509349484, "grad_norm": 3.167067527770996, "learning_rate": 3.658282854885436e-05, "loss": 2.0625, "step": 10196 }, { "epoch": 0.2685541216750066, "grad_norm": 1.328862190246582, "learning_rate": 3.6581511719778775e-05, "loss": 0.3699, "step": 10197 }, { "epoch": 0.26858045825651833, "grad_norm": 2.5438637733459473, "learning_rate": 3.658019489070319e-05, "loss": 1.339, "step": 10198 }, { "epoch": 0.26860679483803, "grad_norm": 2.776309013366699, "learning_rate": 3.65788780616276e-05, "loss": 1.8084, "step": 10199 }, { "epoch": 0.26863313141954176, "grad_norm": 1.5442655086517334, "learning_rate": 3.6577561232552015e-05, "loss": 1.6825, "step": 10200 }, { "epoch": 0.26865946800105345, "grad_norm": 1.637276291847229, "learning_rate": 3.657624440347643e-05, "loss": 1.7556, "step": 10201 }, { "epoch": 0.2686858045825652, "grad_norm": 1.8612775802612305, "learning_rate": 3.6574927574400846e-05, "loss": 1.9085, "step": 10202 }, { "epoch": 0.2687121411640769, "grad_norm": 1.9783895015716553, "learning_rate": 3.657361074532526e-05, "loss": 0.5545, "step": 10203 }, { "epoch": 0.2687384777455886, "grad_norm": 1.7216050624847412, "learning_rate": 
3.657229391624967e-05, "loss": 2.0621, "step": 10204 }, { "epoch": 0.26876481432710037, "grad_norm": 2.0586533546447754, "learning_rate": 3.6570977087174086e-05, "loss": 1.7193, "step": 10205 }, { "epoch": 0.26879115090861205, "grad_norm": 3.382467031478882, "learning_rate": 3.6569660258098495e-05, "loss": 1.0798, "step": 10206 }, { "epoch": 0.2688174874901238, "grad_norm": 1.9821090698242188, "learning_rate": 3.656834342902292e-05, "loss": 1.2546, "step": 10207 }, { "epoch": 0.2688438240716355, "grad_norm": 3.441862106323242, "learning_rate": 3.6567026599947326e-05, "loss": 1.7819, "step": 10208 }, { "epoch": 0.2688701606531472, "grad_norm": 1.9196189641952515, "learning_rate": 3.656570977087174e-05, "loss": 1.7627, "step": 10209 }, { "epoch": 0.2688964972346589, "grad_norm": 2.1342666149139404, "learning_rate": 3.656439294179616e-05, "loss": 1.842, "step": 10210 }, { "epoch": 0.26892283381617066, "grad_norm": 4.082106590270996, "learning_rate": 3.6563076112720566e-05, "loss": 1.4424, "step": 10211 }, { "epoch": 0.2689491703976824, "grad_norm": 1.8392866849899292, "learning_rate": 3.656175928364499e-05, "loss": 1.7364, "step": 10212 }, { "epoch": 0.2689755069791941, "grad_norm": 2.4836320877075195, "learning_rate": 3.65604424545694e-05, "loss": 1.3697, "step": 10213 }, { "epoch": 0.26900184356070583, "grad_norm": 2.3564136028289795, "learning_rate": 3.655912562549381e-05, "loss": 2.1738, "step": 10214 }, { "epoch": 0.2690281801422175, "grad_norm": 1.7465589046478271, "learning_rate": 3.655780879641822e-05, "loss": 1.643, "step": 10215 }, { "epoch": 0.26905451672372926, "grad_norm": 3.9097602367401123, "learning_rate": 3.6556491967342644e-05, "loss": 1.5084, "step": 10216 }, { "epoch": 0.26908085330524095, "grad_norm": 3.0084879398345947, "learning_rate": 3.655517513826705e-05, "loss": 1.6082, "step": 10217 }, { "epoch": 0.2691071898867527, "grad_norm": 1.3666701316833496, "learning_rate": 3.655385830919147e-05, "loss": 1.8943, "step": 10218 }, { "epoch": 
0.26913352646826444, "grad_norm": 1.8530930280685425, "learning_rate": 3.6552541480115884e-05, "loss": 1.7454, "step": 10219 }, { "epoch": 0.2691598630497761, "grad_norm": 2.229616165161133, "learning_rate": 3.655122465104029e-05, "loss": 1.8255, "step": 10220 }, { "epoch": 0.26918619963128787, "grad_norm": 3.463870048522949, "learning_rate": 3.6549907821964715e-05, "loss": 1.612, "step": 10221 }, { "epoch": 0.26921253621279956, "grad_norm": 3.262535572052002, "learning_rate": 3.6548590992889124e-05, "loss": 1.2556, "step": 10222 }, { "epoch": 0.2692388727943113, "grad_norm": 1.6866883039474487, "learning_rate": 3.654727416381354e-05, "loss": 1.618, "step": 10223 }, { "epoch": 0.26926520937582304, "grad_norm": 2.051145076751709, "learning_rate": 3.654595733473795e-05, "loss": 1.3316, "step": 10224 }, { "epoch": 0.26929154595733473, "grad_norm": 1.9978785514831543, "learning_rate": 3.6544640505662364e-05, "loss": 1.4455, "step": 10225 }, { "epoch": 0.2693178825388465, "grad_norm": 4.349775314331055, "learning_rate": 3.654332367658678e-05, "loss": 1.0462, "step": 10226 }, { "epoch": 0.26934421912035816, "grad_norm": 2.0261640548706055, "learning_rate": 3.6542006847511195e-05, "loss": 1.5558, "step": 10227 }, { "epoch": 0.2693705557018699, "grad_norm": 4.261337757110596, "learning_rate": 3.654069001843561e-05, "loss": 2.2034, "step": 10228 }, { "epoch": 0.2693968922833816, "grad_norm": 2.3293251991271973, "learning_rate": 3.653937318936002e-05, "loss": 0.9925, "step": 10229 }, { "epoch": 0.26942322886489334, "grad_norm": 2.065385580062866, "learning_rate": 3.653805636028444e-05, "loss": 1.5747, "step": 10230 }, { "epoch": 0.2694495654464051, "grad_norm": 1.7524702548980713, "learning_rate": 3.653673953120885e-05, "loss": 1.8194, "step": 10231 }, { "epoch": 0.26947590202791677, "grad_norm": 2.126185894012451, "learning_rate": 3.653542270213327e-05, "loss": 1.3905, "step": 10232 }, { "epoch": 0.2695022386094285, "grad_norm": 1.7913614511489868, "learning_rate": 
3.6534105873057676e-05, "loss": 1.561, "step": 10233 }, { "epoch": 0.2695285751909402, "grad_norm": 4.287265777587891, "learning_rate": 3.653278904398209e-05, "loss": 1.7555, "step": 10234 }, { "epoch": 0.26955491177245194, "grad_norm": 2.717137336730957, "learning_rate": 3.653147221490651e-05, "loss": 2.051, "step": 10235 }, { "epoch": 0.26958124835396363, "grad_norm": 2.6772007942199707, "learning_rate": 3.653015538583092e-05, "loss": 1.4684, "step": 10236 }, { "epoch": 0.2696075849354754, "grad_norm": 3.531998872756958, "learning_rate": 3.652883855675534e-05, "loss": 1.3407, "step": 10237 }, { "epoch": 0.2696339215169871, "grad_norm": 2.233687400817871, "learning_rate": 3.652752172767975e-05, "loss": 1.4213, "step": 10238 }, { "epoch": 0.2696602580984988, "grad_norm": 1.4807181358337402, "learning_rate": 3.652620489860416e-05, "loss": 1.6249, "step": 10239 }, { "epoch": 0.26968659468001055, "grad_norm": 2.244846820831299, "learning_rate": 3.652488806952858e-05, "loss": 0.6817, "step": 10240 }, { "epoch": 0.26971293126152224, "grad_norm": 1.7698038816452026, "learning_rate": 3.6523571240452994e-05, "loss": 1.9588, "step": 10241 }, { "epoch": 0.269739267843034, "grad_norm": 5.134593486785889, "learning_rate": 3.65222544113774e-05, "loss": 1.7539, "step": 10242 }, { "epoch": 0.26976560442454567, "grad_norm": 2.8440356254577637, "learning_rate": 3.652093758230182e-05, "loss": 2.2596, "step": 10243 }, { "epoch": 0.2697919410060574, "grad_norm": 2.1762497425079346, "learning_rate": 3.6519620753226234e-05, "loss": 1.9777, "step": 10244 }, { "epoch": 0.26981827758756916, "grad_norm": 3.0282039642333984, "learning_rate": 3.651830392415065e-05, "loss": 1.4478, "step": 10245 }, { "epoch": 0.26984461416908084, "grad_norm": 1.7023380994796753, "learning_rate": 3.6516987095075065e-05, "loss": 1.8943, "step": 10246 }, { "epoch": 0.2698709507505926, "grad_norm": 1.9163802862167358, "learning_rate": 3.6515670265999474e-05, "loss": 1.6845, "step": 10247 }, { "epoch": 
0.2698972873321043, "grad_norm": 1.7483388185501099, "learning_rate": 3.651435343692389e-05, "loss": 2.1314, "step": 10248 }, { "epoch": 0.269923623913616, "grad_norm": 1.972560167312622, "learning_rate": 3.6513036607848305e-05, "loss": 2.1892, "step": 10249 }, { "epoch": 0.2699499604951277, "grad_norm": 1.8761812448501587, "learning_rate": 3.651171977877272e-05, "loss": 1.6801, "step": 10250 }, { "epoch": 0.26997629707663945, "grad_norm": 1.4988170862197876, "learning_rate": 3.651040294969713e-05, "loss": 1.7104, "step": 10251 }, { "epoch": 0.2700026336581512, "grad_norm": 2.0416862964630127, "learning_rate": 3.6509086120621545e-05, "loss": 1.9879, "step": 10252 }, { "epoch": 0.2700289702396629, "grad_norm": 2.491778612136841, "learning_rate": 3.6507769291545954e-05, "loss": 1.3525, "step": 10253 }, { "epoch": 0.2700553068211746, "grad_norm": 2.2661430835723877, "learning_rate": 3.6506452462470376e-05, "loss": 1.8682, "step": 10254 }, { "epoch": 0.2700816434026863, "grad_norm": 2.217207431793213, "learning_rate": 3.6505135633394785e-05, "loss": 1.878, "step": 10255 }, { "epoch": 0.27010797998419805, "grad_norm": 1.5754505395889282, "learning_rate": 3.65038188043192e-05, "loss": 1.9861, "step": 10256 }, { "epoch": 0.2701343165657098, "grad_norm": 3.6967532634735107, "learning_rate": 3.6502501975243616e-05, "loss": 1.2931, "step": 10257 }, { "epoch": 0.2701606531472215, "grad_norm": 2.2997963428497314, "learning_rate": 3.6501185146168025e-05, "loss": 1.3636, "step": 10258 }, { "epoch": 0.27018698972873323, "grad_norm": 1.778559684753418, "learning_rate": 3.649986831709245e-05, "loss": 1.8104, "step": 10259 }, { "epoch": 0.2702133263102449, "grad_norm": 2.6709375381469727, "learning_rate": 3.6498551488016856e-05, "loss": 2.0059, "step": 10260 }, { "epoch": 0.27023966289175666, "grad_norm": 2.425659656524658, "learning_rate": 3.649723465894127e-05, "loss": 0.6497, "step": 10261 }, { "epoch": 0.27026599947326835, "grad_norm": 2.145305871963501, "learning_rate": 
3.649591782986568e-05, "loss": 2.8096, "step": 10262 }, { "epoch": 0.2702923360547801, "grad_norm": 1.9830477237701416, "learning_rate": 3.64946010007901e-05, "loss": 1.8591, "step": 10263 }, { "epoch": 0.27031867263629183, "grad_norm": 1.9636954069137573, "learning_rate": 3.649328417171451e-05, "loss": 1.9282, "step": 10264 }, { "epoch": 0.2703450092178035, "grad_norm": 2.2643651962280273, "learning_rate": 3.649196734263893e-05, "loss": 2.0527, "step": 10265 }, { "epoch": 0.27037134579931527, "grad_norm": 1.8713598251342773, "learning_rate": 3.649065051356334e-05, "loss": 2.5977, "step": 10266 }, { "epoch": 0.27039768238082695, "grad_norm": 1.8056583404541016, "learning_rate": 3.648933368448775e-05, "loss": 1.5701, "step": 10267 }, { "epoch": 0.2704240189623387, "grad_norm": 2.695420503616333, "learning_rate": 3.6488016855412174e-05, "loss": 1.2163, "step": 10268 }, { "epoch": 0.2704503555438504, "grad_norm": 4.3372931480407715, "learning_rate": 3.648670002633658e-05, "loss": 1.5018, "step": 10269 }, { "epoch": 0.27047669212536213, "grad_norm": 4.943918704986572, "learning_rate": 3.6485383197261e-05, "loss": 1.6296, "step": 10270 }, { "epoch": 0.27050302870687387, "grad_norm": 1.8845360279083252, "learning_rate": 3.648406636818541e-05, "loss": 2.0695, "step": 10271 }, { "epoch": 0.27052936528838556, "grad_norm": 4.754239559173584, "learning_rate": 3.648274953910982e-05, "loss": 0.8524, "step": 10272 }, { "epoch": 0.2705557018698973, "grad_norm": 1.9632288217544556, "learning_rate": 3.648143271003424e-05, "loss": 2.0543, "step": 10273 }, { "epoch": 0.270582038451409, "grad_norm": 1.8391855955123901, "learning_rate": 3.6480115880958654e-05, "loss": 1.5461, "step": 10274 }, { "epoch": 0.27060837503292073, "grad_norm": 4.078208923339844, "learning_rate": 3.647879905188307e-05, "loss": 0.8907, "step": 10275 }, { "epoch": 0.2706347116144324, "grad_norm": 2.712761640548706, "learning_rate": 3.647748222280748e-05, "loss": 2.1295, "step": 10276 }, { "epoch": 
0.27066104819594416, "grad_norm": 2.2774975299835205, "learning_rate": 3.64761653937319e-05, "loss": 1.0989, "step": 10277 }, { "epoch": 0.2706873847774559, "grad_norm": 2.394390106201172, "learning_rate": 3.647484856465631e-05, "loss": 1.8499, "step": 10278 }, { "epoch": 0.2707137213589676, "grad_norm": 1.5043600797653198, "learning_rate": 3.6473531735580725e-05, "loss": 0.4539, "step": 10279 }, { "epoch": 0.27074005794047934, "grad_norm": 2.093470335006714, "learning_rate": 3.6472214906505134e-05, "loss": 0.4458, "step": 10280 }, { "epoch": 0.270766394521991, "grad_norm": 1.9578824043273926, "learning_rate": 3.647089807742955e-05, "loss": 1.9961, "step": 10281 }, { "epoch": 0.27079273110350277, "grad_norm": 2.1867403984069824, "learning_rate": 3.6469581248353965e-05, "loss": 1.7453, "step": 10282 }, { "epoch": 0.27081906768501446, "grad_norm": 1.919355034828186, "learning_rate": 3.646826441927838e-05, "loss": 1.4913, "step": 10283 }, { "epoch": 0.2708454042665262, "grad_norm": 2.5337464809417725, "learning_rate": 3.6466947590202796e-05, "loss": 1.8079, "step": 10284 }, { "epoch": 0.27087174084803795, "grad_norm": 2.3828213214874268, "learning_rate": 3.6465630761127205e-05, "loss": 2.2451, "step": 10285 }, { "epoch": 0.27089807742954963, "grad_norm": 2.907728433609009, "learning_rate": 3.646431393205162e-05, "loss": 1.1044, "step": 10286 }, { "epoch": 0.2709244140110614, "grad_norm": 1.8900034427642822, "learning_rate": 3.6462997102976036e-05, "loss": 1.6356, "step": 10287 }, { "epoch": 0.27095075059257306, "grad_norm": 2.902811288833618, "learning_rate": 3.646168027390045e-05, "loss": 0.5479, "step": 10288 }, { "epoch": 0.2709770871740848, "grad_norm": 2.088362455368042, "learning_rate": 3.646036344482486e-05, "loss": 2.0231, "step": 10289 }, { "epoch": 0.27100342375559655, "grad_norm": 1.5732823610305786, "learning_rate": 3.6459046615749276e-05, "loss": 1.4694, "step": 10290 }, { "epoch": 0.27102976033710824, "grad_norm": 2.044257640838623, "learning_rate": 
3.645772978667369e-05, "loss": 1.2716, "step": 10291 }, { "epoch": 0.27105609691862, "grad_norm": 1.5156606435775757, "learning_rate": 3.645641295759811e-05, "loss": 2.5358, "step": 10292 }, { "epoch": 0.27108243350013167, "grad_norm": 2.078439474105835, "learning_rate": 3.645509612852252e-05, "loss": 1.5971, "step": 10293 }, { "epoch": 0.2711087700816434, "grad_norm": 2.194434642791748, "learning_rate": 3.645377929944693e-05, "loss": 1.6092, "step": 10294 }, { "epoch": 0.2711351066631551, "grad_norm": 3.3006484508514404, "learning_rate": 3.645246247037135e-05, "loss": 1.2254, "step": 10295 }, { "epoch": 0.27116144324466684, "grad_norm": 2.5422470569610596, "learning_rate": 3.645114564129576e-05, "loss": 1.8761, "step": 10296 }, { "epoch": 0.2711877798261786, "grad_norm": 2.4962961673736572, "learning_rate": 3.644982881222018e-05, "loss": 0.685, "step": 10297 }, { "epoch": 0.2712141164076903, "grad_norm": 2.2493808269500732, "learning_rate": 3.644851198314459e-05, "loss": 1.89, "step": 10298 }, { "epoch": 0.271240452989202, "grad_norm": 1.9526785612106323, "learning_rate": 3.6447195154069e-05, "loss": 2.5459, "step": 10299 }, { "epoch": 0.2712667895707137, "grad_norm": 1.5954350233078003, "learning_rate": 3.644587832499341e-05, "loss": 2.182, "step": 10300 }, { "epoch": 0.27129312615222545, "grad_norm": 2.677779197692871, "learning_rate": 3.6444561495917835e-05, "loss": 0.5432, "step": 10301 }, { "epoch": 0.27131946273373714, "grad_norm": 1.6161595582962036, "learning_rate": 3.644324466684224e-05, "loss": 0.3551, "step": 10302 }, { "epoch": 0.2713457993152489, "grad_norm": 2.9487502574920654, "learning_rate": 3.644192783776666e-05, "loss": 1.8901, "step": 10303 }, { "epoch": 0.2713721358967606, "grad_norm": 1.8856022357940674, "learning_rate": 3.6440611008691075e-05, "loss": 2.214, "step": 10304 }, { "epoch": 0.2713984724782723, "grad_norm": 2.3007776737213135, "learning_rate": 3.643929417961548e-05, "loss": 2.2138, "step": 10305 }, { "epoch": 0.27142480905978406, 
"grad_norm": 1.6632148027420044, "learning_rate": 3.6437977350539906e-05, "loss": 2.1418, "step": 10306 }, { "epoch": 0.27145114564129574, "grad_norm": 1.6475247144699097, "learning_rate": 3.6436660521464315e-05, "loss": 1.8145, "step": 10307 }, { "epoch": 0.2714774822228075, "grad_norm": 3.1381053924560547, "learning_rate": 3.643534369238873e-05, "loss": 0.7929, "step": 10308 }, { "epoch": 0.2715038188043192, "grad_norm": 1.8969988822937012, "learning_rate": 3.643402686331314e-05, "loss": 0.7786, "step": 10309 }, { "epoch": 0.2715301553858309, "grad_norm": 1.9219800233840942, "learning_rate": 3.643271003423756e-05, "loss": 1.5643, "step": 10310 }, { "epoch": 0.27155649196734266, "grad_norm": 2.1371917724609375, "learning_rate": 3.643139320516197e-05, "loss": 0.5427, "step": 10311 }, { "epoch": 0.27158282854885435, "grad_norm": 3.16784405708313, "learning_rate": 3.6430076376086386e-05, "loss": 1.7544, "step": 10312 }, { "epoch": 0.2716091651303661, "grad_norm": 2.1667733192443848, "learning_rate": 3.64287595470108e-05, "loss": 1.9134, "step": 10313 }, { "epoch": 0.2716355017118778, "grad_norm": 2.247311592102051, "learning_rate": 3.642744271793521e-05, "loss": 1.8834, "step": 10314 }, { "epoch": 0.2716618382933895, "grad_norm": 2.0783884525299072, "learning_rate": 3.642612588885963e-05, "loss": 1.6994, "step": 10315 }, { "epoch": 0.2716881748749012, "grad_norm": 1.5845547914505005, "learning_rate": 3.642480905978404e-05, "loss": 1.692, "step": 10316 }, { "epoch": 0.27171451145641295, "grad_norm": 2.274195909500122, "learning_rate": 3.642349223070846e-05, "loss": 1.1233, "step": 10317 }, { "epoch": 0.2717408480379247, "grad_norm": 2.503321647644043, "learning_rate": 3.6422175401632866e-05, "loss": 1.7993, "step": 10318 }, { "epoch": 0.2717671846194364, "grad_norm": 6.4886980056762695, "learning_rate": 3.642085857255728e-05, "loss": 1.2929, "step": 10319 }, { "epoch": 0.27179352120094813, "grad_norm": 1.6680946350097656, "learning_rate": 3.64195417434817e-05, "loss": 
1.0955, "step": 10320 }, { "epoch": 0.2718198577824598, "grad_norm": 4.5090789794921875, "learning_rate": 3.641822491440611e-05, "loss": 0.7915, "step": 10321 }, { "epoch": 0.27184619436397156, "grad_norm": 2.5213704109191895, "learning_rate": 3.641690808533053e-05, "loss": 2.5157, "step": 10322 }, { "epoch": 0.2718725309454833, "grad_norm": 1.5279197692871094, "learning_rate": 3.641559125625494e-05, "loss": 2.0742, "step": 10323 }, { "epoch": 0.271898867526995, "grad_norm": 2.4287710189819336, "learning_rate": 3.641427442717935e-05, "loss": 2.5385, "step": 10324 }, { "epoch": 0.27192520410850674, "grad_norm": 1.6410257816314697, "learning_rate": 3.641295759810377e-05, "loss": 1.9969, "step": 10325 }, { "epoch": 0.2719515406900184, "grad_norm": 1.8392621278762817, "learning_rate": 3.6411640769028184e-05, "loss": 0.6193, "step": 10326 }, { "epoch": 0.27197787727153017, "grad_norm": 5.369431495666504, "learning_rate": 3.641032393995259e-05, "loss": 1.2819, "step": 10327 }, { "epoch": 0.27200421385304185, "grad_norm": 2.5196642875671387, "learning_rate": 3.640900711087701e-05, "loss": 1.9757, "step": 10328 }, { "epoch": 0.2720305504345536, "grad_norm": 2.9400951862335205, "learning_rate": 3.6407690281801424e-05, "loss": 1.8495, "step": 10329 }, { "epoch": 0.27205688701606534, "grad_norm": 2.3373031616210938, "learning_rate": 3.640637345272584e-05, "loss": 1.5409, "step": 10330 }, { "epoch": 0.27208322359757703, "grad_norm": 3.901477813720703, "learning_rate": 3.6405056623650255e-05, "loss": 1.9812, "step": 10331 }, { "epoch": 0.27210956017908877, "grad_norm": 1.6351262331008911, "learning_rate": 3.6403739794574664e-05, "loss": 1.3154, "step": 10332 }, { "epoch": 0.27213589676060046, "grad_norm": 2.3115203380584717, "learning_rate": 3.640242296549908e-05, "loss": 1.4164, "step": 10333 }, { "epoch": 0.2721622333421122, "grad_norm": 2.477191209793091, "learning_rate": 3.6401106136423495e-05, "loss": 0.87, "step": 10334 }, { "epoch": 0.2721885699236239, "grad_norm": 
2.5607640743255615, "learning_rate": 3.639978930734791e-05, "loss": 1.7121, "step": 10335 }, { "epoch": 0.27221490650513563, "grad_norm": 2.4744653701782227, "learning_rate": 3.639847247827232e-05, "loss": 1.4515, "step": 10336 }, { "epoch": 0.2722412430866474, "grad_norm": 1.6541136503219604, "learning_rate": 3.6397155649196735e-05, "loss": 2.0111, "step": 10337 }, { "epoch": 0.27226757966815907, "grad_norm": 2.601785898208618, "learning_rate": 3.639583882012115e-05, "loss": 0.8639, "step": 10338 }, { "epoch": 0.2722939162496708, "grad_norm": 2.812194585800171, "learning_rate": 3.6394521991045566e-05, "loss": 1.4026, "step": 10339 }, { "epoch": 0.2723202528311825, "grad_norm": 2.294760227203369, "learning_rate": 3.639320516196998e-05, "loss": 1.4016, "step": 10340 }, { "epoch": 0.27234658941269424, "grad_norm": 1.8534947633743286, "learning_rate": 3.639188833289439e-05, "loss": 1.8126, "step": 10341 }, { "epoch": 0.27237292599420593, "grad_norm": 2.672706365585327, "learning_rate": 3.6390571503818806e-05, "loss": 2.0264, "step": 10342 }, { "epoch": 0.27239926257571767, "grad_norm": 3.424736738204956, "learning_rate": 3.638925467474322e-05, "loss": 0.6968, "step": 10343 }, { "epoch": 0.2724255991572294, "grad_norm": 2.4977211952209473, "learning_rate": 3.638793784566764e-05, "loss": 1.8002, "step": 10344 }, { "epoch": 0.2724519357387411, "grad_norm": 2.5923004150390625, "learning_rate": 3.6386621016592046e-05, "loss": 1.0333, "step": 10345 }, { "epoch": 0.27247827232025285, "grad_norm": 2.979314088821411, "learning_rate": 3.638530418751646e-05, "loss": 1.5285, "step": 10346 }, { "epoch": 0.27250460890176453, "grad_norm": 2.8747994899749756, "learning_rate": 3.638398735844088e-05, "loss": 1.6198, "step": 10347 }, { "epoch": 0.2725309454832763, "grad_norm": 2.0636353492736816, "learning_rate": 3.638267052936529e-05, "loss": 1.6727, "step": 10348 }, { "epoch": 0.27255728206478796, "grad_norm": 4.8102216720581055, "learning_rate": 3.638135370028971e-05, "loss": 1.1448, 
"step": 10349 }, { "epoch": 0.2725836186462997, "grad_norm": 1.7907400131225586, "learning_rate": 3.638003687121412e-05, "loss": 1.7525, "step": 10350 }, { "epoch": 0.27260995522781145, "grad_norm": 2.293001413345337, "learning_rate": 3.637872004213853e-05, "loss": 1.6077, "step": 10351 }, { "epoch": 0.27263629180932314, "grad_norm": 2.3834128379821777, "learning_rate": 3.637740321306294e-05, "loss": 1.5715, "step": 10352 }, { "epoch": 0.2726626283908349, "grad_norm": 1.9715545177459717, "learning_rate": 3.6376086383987364e-05, "loss": 1.3433, "step": 10353 }, { "epoch": 0.27268896497234657, "grad_norm": 2.0868146419525146, "learning_rate": 3.637476955491177e-05, "loss": 2.0491, "step": 10354 }, { "epoch": 0.2727153015538583, "grad_norm": 2.4551172256469727, "learning_rate": 3.637345272583619e-05, "loss": 0.9652, "step": 10355 }, { "epoch": 0.27274163813537, "grad_norm": 1.6090114116668701, "learning_rate": 3.63721358967606e-05, "loss": 1.5764, "step": 10356 }, { "epoch": 0.27276797471688174, "grad_norm": 2.094609260559082, "learning_rate": 3.637081906768501e-05, "loss": 0.3454, "step": 10357 }, { "epoch": 0.2727943112983935, "grad_norm": 3.1312239170074463, "learning_rate": 3.636950223860943e-05, "loss": 2.2632, "step": 10358 }, { "epoch": 0.2728206478799052, "grad_norm": 2.2778000831604004, "learning_rate": 3.6368185409533844e-05, "loss": 0.7167, "step": 10359 }, { "epoch": 0.2728469844614169, "grad_norm": 2.317643165588379, "learning_rate": 3.636686858045826e-05, "loss": 1.7029, "step": 10360 }, { "epoch": 0.2728733210429286, "grad_norm": 1.8217017650604248, "learning_rate": 3.636555175138267e-05, "loss": 1.6405, "step": 10361 }, { "epoch": 0.27289965762444035, "grad_norm": 2.6209917068481445, "learning_rate": 3.636423492230709e-05, "loss": 1.9569, "step": 10362 }, { "epoch": 0.2729259942059521, "grad_norm": 1.8491734266281128, "learning_rate": 3.63629180932315e-05, "loss": 1.6601, "step": 10363 }, { "epoch": 0.2729523307874638, "grad_norm": 5.69320821762085, 
"learning_rate": 3.6361601264155916e-05, "loss": 1.5579, "step": 10364 }, { "epoch": 0.2729786673689755, "grad_norm": 2.1832668781280518, "learning_rate": 3.6360284435080324e-05, "loss": 0.5198, "step": 10365 }, { "epoch": 0.2730050039504872, "grad_norm": 3.8296897411346436, "learning_rate": 3.635896760600474e-05, "loss": 0.942, "step": 10366 }, { "epoch": 0.27303134053199896, "grad_norm": 2.4550304412841797, "learning_rate": 3.6357650776929156e-05, "loss": 2.419, "step": 10367 }, { "epoch": 0.27305767711351064, "grad_norm": 2.2154898643493652, "learning_rate": 3.635633394785357e-05, "loss": 1.8225, "step": 10368 }, { "epoch": 0.2730840136950224, "grad_norm": 2.387238025665283, "learning_rate": 3.635501711877799e-05, "loss": 1.633, "step": 10369 }, { "epoch": 0.27311035027653413, "grad_norm": 1.885642647743225, "learning_rate": 3.6353700289702396e-05, "loss": 1.8477, "step": 10370 }, { "epoch": 0.2731366868580458, "grad_norm": 2.8946776390075684, "learning_rate": 3.635238346062681e-05, "loss": 1.5476, "step": 10371 }, { "epoch": 0.27316302343955756, "grad_norm": 3.3582749366760254, "learning_rate": 3.635106663155123e-05, "loss": 0.4198, "step": 10372 }, { "epoch": 0.27318936002106925, "grad_norm": 2.1126949787139893, "learning_rate": 3.634974980247564e-05, "loss": 2.3508, "step": 10373 }, { "epoch": 0.273215696602581, "grad_norm": 1.748124599456787, "learning_rate": 3.634843297340005e-05, "loss": 1.9212, "step": 10374 }, { "epoch": 0.2732420331840927, "grad_norm": 4.46715784072876, "learning_rate": 3.634711614432447e-05, "loss": 1.4425, "step": 10375 }, { "epoch": 0.2732683697656044, "grad_norm": 2.0045342445373535, "learning_rate": 3.634579931524888e-05, "loss": 1.6954, "step": 10376 }, { "epoch": 0.27329470634711617, "grad_norm": 2.654986619949341, "learning_rate": 3.63444824861733e-05, "loss": 1.2639, "step": 10377 }, { "epoch": 0.27332104292862786, "grad_norm": 1.6185369491577148, "learning_rate": 3.6343165657097714e-05, "loss": 2.3115, "step": 10378 }, { 
"epoch": 0.2733473795101396, "grad_norm": 3.293684959411621, "learning_rate": 3.634184882802212e-05, "loss": 1.452, "step": 10379 }, { "epoch": 0.2733737160916513, "grad_norm": 2.2743654251098633, "learning_rate": 3.634053199894654e-05, "loss": 1.8677, "step": 10380 }, { "epoch": 0.27340005267316303, "grad_norm": 1.528550386428833, "learning_rate": 3.6339215169870954e-05, "loss": 1.9407, "step": 10381 }, { "epoch": 0.2734263892546747, "grad_norm": 4.636200904846191, "learning_rate": 3.633789834079537e-05, "loss": 0.4386, "step": 10382 }, { "epoch": 0.27345272583618646, "grad_norm": 2.5900821685791016, "learning_rate": 3.633658151171978e-05, "loss": 1.9323, "step": 10383 }, { "epoch": 0.2734790624176982, "grad_norm": 1.6953577995300293, "learning_rate": 3.6335264682644194e-05, "loss": 1.9221, "step": 10384 }, { "epoch": 0.2735053989992099, "grad_norm": 1.9355050325393677, "learning_rate": 3.633394785356861e-05, "loss": 2.1167, "step": 10385 }, { "epoch": 0.27353173558072164, "grad_norm": 2.9548747539520264, "learning_rate": 3.6332631024493025e-05, "loss": 1.0196, "step": 10386 }, { "epoch": 0.2735580721622333, "grad_norm": 1.5879417657852173, "learning_rate": 3.633131419541744e-05, "loss": 1.5018, "step": 10387 }, { "epoch": 0.27358440874374507, "grad_norm": 3.798142194747925, "learning_rate": 3.632999736634185e-05, "loss": 1.1752, "step": 10388 }, { "epoch": 0.27361074532525675, "grad_norm": 2.2538506984710693, "learning_rate": 3.6328680537266265e-05, "loss": 2.4424, "step": 10389 }, { "epoch": 0.2736370819067685, "grad_norm": 2.5515694618225098, "learning_rate": 3.6327363708190674e-05, "loss": 0.5356, "step": 10390 }, { "epoch": 0.27366341848828024, "grad_norm": 1.8959494829177856, "learning_rate": 3.6326046879115096e-05, "loss": 1.8924, "step": 10391 }, { "epoch": 0.27368975506979193, "grad_norm": 4.369131088256836, "learning_rate": 3.6324730050039505e-05, "loss": 1.2374, "step": 10392 }, { "epoch": 0.2737160916513037, "grad_norm": 3.6731436252593994, 
"learning_rate": 3.632341322096392e-05, "loss": 1.5548, "step": 10393 }, { "epoch": 0.27374242823281536, "grad_norm": 2.345093250274658, "learning_rate": 3.6322096391888336e-05, "loss": 1.6417, "step": 10394 }, { "epoch": 0.2737687648143271, "grad_norm": 2.260816812515259, "learning_rate": 3.632077956281275e-05, "loss": 1.9425, "step": 10395 }, { "epoch": 0.27379510139583885, "grad_norm": 4.049269199371338, "learning_rate": 3.631946273373717e-05, "loss": 2.0169, "step": 10396 }, { "epoch": 0.27382143797735053, "grad_norm": 1.8645570278167725, "learning_rate": 3.6318145904661576e-05, "loss": 1.8006, "step": 10397 }, { "epoch": 0.2738477745588623, "grad_norm": 2.959125518798828, "learning_rate": 3.631682907558599e-05, "loss": 1.4961, "step": 10398 }, { "epoch": 0.27387411114037397, "grad_norm": 2.605584144592285, "learning_rate": 3.63155122465104e-05, "loss": 0.6815, "step": 10399 }, { "epoch": 0.2739004477218857, "grad_norm": 3.761110782623291, "learning_rate": 3.631419541743482e-05, "loss": 1.1962, "step": 10400 }, { "epoch": 0.2739267843033974, "grad_norm": 2.673065185546875, "learning_rate": 3.631287858835923e-05, "loss": 1.8779, "step": 10401 }, { "epoch": 0.27395312088490914, "grad_norm": 1.6883829832077026, "learning_rate": 3.631156175928365e-05, "loss": 2.304, "step": 10402 }, { "epoch": 0.2739794574664209, "grad_norm": 1.82489013671875, "learning_rate": 3.6310244930208056e-05, "loss": 2.2771, "step": 10403 }, { "epoch": 0.27400579404793257, "grad_norm": 2.5863540172576904, "learning_rate": 3.630892810113247e-05, "loss": 1.9069, "step": 10404 }, { "epoch": 0.2740321306294443, "grad_norm": 1.6233669519424438, "learning_rate": 3.630761127205689e-05, "loss": 1.8689, "step": 10405 }, { "epoch": 0.274058467210956, "grad_norm": 1.4105255603790283, "learning_rate": 3.63062944429813e-05, "loss": 2.0748, "step": 10406 }, { "epoch": 0.27408480379246775, "grad_norm": 3.1554150581359863, "learning_rate": 3.630497761390572e-05, "loss": 2.2471, "step": 10407 }, { "epoch": 
0.27411114037397943, "grad_norm": 2.0766801834106445, "learning_rate": 3.630366078483013e-05, "loss": 1.8686, "step": 10408 }, { "epoch": 0.2741374769554912, "grad_norm": 3.3039138317108154, "learning_rate": 3.630234395575455e-05, "loss": 1.3837, "step": 10409 }, { "epoch": 0.2741638135370029, "grad_norm": 1.975628137588501, "learning_rate": 3.630102712667896e-05, "loss": 1.6537, "step": 10410 }, { "epoch": 0.2741901501185146, "grad_norm": 1.6447898149490356, "learning_rate": 3.6299710297603374e-05, "loss": 1.7791, "step": 10411 }, { "epoch": 0.27421648670002635, "grad_norm": 1.8509248495101929, "learning_rate": 3.629839346852778e-05, "loss": 1.5306, "step": 10412 }, { "epoch": 0.27424282328153804, "grad_norm": 3.179069757461548, "learning_rate": 3.62970766394522e-05, "loss": 1.0179, "step": 10413 }, { "epoch": 0.2742691598630498, "grad_norm": 2.821903944015503, "learning_rate": 3.6295759810376614e-05, "loss": 1.6608, "step": 10414 }, { "epoch": 0.27429549644456147, "grad_norm": 1.7800081968307495, "learning_rate": 3.629444298130103e-05, "loss": 1.3441, "step": 10415 }, { "epoch": 0.2743218330260732, "grad_norm": 2.4287467002868652, "learning_rate": 3.6293126152225445e-05, "loss": 1.1843, "step": 10416 }, { "epoch": 0.27434816960758496, "grad_norm": 1.6981650590896606, "learning_rate": 3.6291809323149854e-05, "loss": 1.6747, "step": 10417 }, { "epoch": 0.27437450618909665, "grad_norm": 2.2687954902648926, "learning_rate": 3.629049249407427e-05, "loss": 1.5328, "step": 10418 }, { "epoch": 0.2744008427706084, "grad_norm": 4.247220516204834, "learning_rate": 3.6289175664998685e-05, "loss": 1.1176, "step": 10419 }, { "epoch": 0.2744271793521201, "grad_norm": 3.4652798175811768, "learning_rate": 3.62878588359231e-05, "loss": 0.9128, "step": 10420 }, { "epoch": 0.2744535159336318, "grad_norm": 1.9754316806793213, "learning_rate": 3.628654200684751e-05, "loss": 0.6499, "step": 10421 }, { "epoch": 0.2744798525151435, "grad_norm": 2.5423145294189453, "learning_rate": 
3.6285225177771925e-05, "loss": 1.5877, "step": 10422 }, { "epoch": 0.27450618909665525, "grad_norm": 2.2928049564361572, "learning_rate": 3.628390834869634e-05, "loss": 1.7479, "step": 10423 }, { "epoch": 0.274532525678167, "grad_norm": 1.8844287395477295, "learning_rate": 3.6282591519620757e-05, "loss": 2.2141, "step": 10424 }, { "epoch": 0.2745588622596787, "grad_norm": 1.8125797510147095, "learning_rate": 3.628127469054517e-05, "loss": 0.4839, "step": 10425 }, { "epoch": 0.2745851988411904, "grad_norm": 1.764673113822937, "learning_rate": 3.627995786146958e-05, "loss": 1.9221, "step": 10426 }, { "epoch": 0.2746115354227021, "grad_norm": 2.157508134841919, "learning_rate": 3.6278641032393997e-05, "loss": 1.8939, "step": 10427 }, { "epoch": 0.27463787200421386, "grad_norm": 3.0758440494537354, "learning_rate": 3.627732420331841e-05, "loss": 1.078, "step": 10428 }, { "epoch": 0.2746642085857256, "grad_norm": 1.9144856929779053, "learning_rate": 3.627600737424283e-05, "loss": 2.5218, "step": 10429 }, { "epoch": 0.2746905451672373, "grad_norm": 2.0303256511688232, "learning_rate": 3.6274690545167237e-05, "loss": 1.6567, "step": 10430 }, { "epoch": 0.27471688174874903, "grad_norm": 2.235045909881592, "learning_rate": 3.627337371609165e-05, "loss": 1.4615, "step": 10431 }, { "epoch": 0.2747432183302607, "grad_norm": 1.6566983461380005, "learning_rate": 3.627205688701607e-05, "loss": 0.9702, "step": 10432 }, { "epoch": 0.27476955491177246, "grad_norm": 2.245187759399414, "learning_rate": 3.627074005794048e-05, "loss": 1.8433, "step": 10433 }, { "epoch": 0.27479589149328415, "grad_norm": 1.9110386371612549, "learning_rate": 3.62694232288649e-05, "loss": 2.14, "step": 10434 }, { "epoch": 0.2748222280747959, "grad_norm": 2.1642751693725586, "learning_rate": 3.626810639978931e-05, "loss": 2.1116, "step": 10435 }, { "epoch": 0.27484856465630764, "grad_norm": 2.8699982166290283, "learning_rate": 3.626678957071372e-05, "loss": 1.4268, "step": 10436 }, { "epoch": 
0.2748749012378193, "grad_norm": 1.994243860244751, "learning_rate": 3.626547274163813e-05, "loss": 1.571, "step": 10437 }, { "epoch": 0.27490123781933107, "grad_norm": 1.787546992301941, "learning_rate": 3.6264155912562555e-05, "loss": 1.9877, "step": 10438 }, { "epoch": 0.27492757440084276, "grad_norm": 3.318443536758423, "learning_rate": 3.626283908348696e-05, "loss": 0.7593, "step": 10439 }, { "epoch": 0.2749539109823545, "grad_norm": 2.943997859954834, "learning_rate": 3.626152225441138e-05, "loss": 1.3951, "step": 10440 }, { "epoch": 0.2749802475638662, "grad_norm": 5.653837203979492, "learning_rate": 3.6260205425335795e-05, "loss": 1.4294, "step": 10441 }, { "epoch": 0.27500658414537793, "grad_norm": 1.8154033422470093, "learning_rate": 3.625888859626021e-05, "loss": 1.8327, "step": 10442 }, { "epoch": 0.2750329207268897, "grad_norm": 2.034886598587036, "learning_rate": 3.6257571767184626e-05, "loss": 2.5932, "step": 10443 }, { "epoch": 0.27505925730840136, "grad_norm": 1.9043903350830078, "learning_rate": 3.6256254938109035e-05, "loss": 1.3506, "step": 10444 }, { "epoch": 0.2750855938899131, "grad_norm": 1.687850832939148, "learning_rate": 3.625493810903345e-05, "loss": 2.2564, "step": 10445 }, { "epoch": 0.2751119304714248, "grad_norm": 1.6645928621292114, "learning_rate": 3.625362127995786e-05, "loss": 0.9856, "step": 10446 }, { "epoch": 0.27513826705293654, "grad_norm": 1.674831509590149, "learning_rate": 3.625230445088228e-05, "loss": 2.3101, "step": 10447 }, { "epoch": 0.2751646036344482, "grad_norm": 1.8255071640014648, "learning_rate": 3.625098762180669e-05, "loss": 2.1872, "step": 10448 }, { "epoch": 0.27519094021595997, "grad_norm": 1.874585509300232, "learning_rate": 3.6249670792731106e-05, "loss": 1.8736, "step": 10449 }, { "epoch": 0.2752172767974717, "grad_norm": 2.40224027633667, "learning_rate": 3.624835396365552e-05, "loss": 1.2941, "step": 10450 }, { "epoch": 0.2752436133789834, "grad_norm": 6.703125, "learning_rate": 3.624703713457993e-05, 
"loss": 1.2435, "step": 10451 }, { "epoch": 0.27526994996049514, "grad_norm": 4.762519836425781, "learning_rate": 3.624572030550435e-05, "loss": 1.3299, "step": 10452 }, { "epoch": 0.27529628654200683, "grad_norm": 2.813340902328491, "learning_rate": 3.624440347642876e-05, "loss": 1.7707, "step": 10453 }, { "epoch": 0.2753226231235186, "grad_norm": 2.2296957969665527, "learning_rate": 3.624308664735318e-05, "loss": 1.7691, "step": 10454 }, { "epoch": 0.27534895970503026, "grad_norm": 3.706146001815796, "learning_rate": 3.6241769818277586e-05, "loss": 2.2148, "step": 10455 }, { "epoch": 0.275375296286542, "grad_norm": 1.4004148244857788, "learning_rate": 3.6240452989202e-05, "loss": 1.8777, "step": 10456 }, { "epoch": 0.27540163286805375, "grad_norm": 2.524167537689209, "learning_rate": 3.623913616012642e-05, "loss": 2.1514, "step": 10457 }, { "epoch": 0.27542796944956544, "grad_norm": 2.8889214992523193, "learning_rate": 3.623781933105083e-05, "loss": 1.1664, "step": 10458 }, { "epoch": 0.2754543060310772, "grad_norm": 1.6348000764846802, "learning_rate": 3.623650250197524e-05, "loss": 1.3697, "step": 10459 }, { "epoch": 0.27548064261258887, "grad_norm": 1.8383204936981201, "learning_rate": 3.623518567289966e-05, "loss": 1.7364, "step": 10460 }, { "epoch": 0.2755069791941006, "grad_norm": 1.8291422128677368, "learning_rate": 3.623386884382407e-05, "loss": 1.5751, "step": 10461 }, { "epoch": 0.27553331577561235, "grad_norm": 5.297059059143066, "learning_rate": 3.623255201474849e-05, "loss": 1.4437, "step": 10462 }, { "epoch": 0.27555965235712404, "grad_norm": 2.6070845127105713, "learning_rate": 3.6231235185672904e-05, "loss": 1.3462, "step": 10463 }, { "epoch": 0.2755859889386358, "grad_norm": 3.5472006797790527, "learning_rate": 3.622991835659731e-05, "loss": 0.9255, "step": 10464 }, { "epoch": 0.27561232552014747, "grad_norm": 2.4515955448150635, "learning_rate": 3.622860152752173e-05, "loss": 1.8441, "step": 10465 }, { "epoch": 0.2756386621016592, "grad_norm": 
4.607582092285156, "learning_rate": 3.6227284698446144e-05, "loss": 1.3286, "step": 10466 }, { "epoch": 0.2756649986831709, "grad_norm": 1.7889039516448975, "learning_rate": 3.622596786937056e-05, "loss": 1.6902, "step": 10467 }, { "epoch": 0.27569133526468265, "grad_norm": 2.2339816093444824, "learning_rate": 3.622465104029497e-05, "loss": 1.9056, "step": 10468 }, { "epoch": 0.2757176718461944, "grad_norm": 1.987520456314087, "learning_rate": 3.6223334211219384e-05, "loss": 1.8437, "step": 10469 }, { "epoch": 0.2757440084277061, "grad_norm": 2.2537505626678467, "learning_rate": 3.62220173821438e-05, "loss": 1.652, "step": 10470 }, { "epoch": 0.2757703450092178, "grad_norm": 3.435530424118042, "learning_rate": 3.6220700553068215e-05, "loss": 0.672, "step": 10471 }, { "epoch": 0.2757966815907295, "grad_norm": 3.3695995807647705, "learning_rate": 3.621938372399263e-05, "loss": 1.9393, "step": 10472 }, { "epoch": 0.27582301817224125, "grad_norm": 3.0079569816589355, "learning_rate": 3.621806689491704e-05, "loss": 2.1282, "step": 10473 }, { "epoch": 0.27584935475375294, "grad_norm": 1.5797780752182007, "learning_rate": 3.6216750065841455e-05, "loss": 1.6722, "step": 10474 }, { "epoch": 0.2758756913352647, "grad_norm": 2.2939703464508057, "learning_rate": 3.621543323676587e-05, "loss": 1.8333, "step": 10475 }, { "epoch": 0.2759020279167764, "grad_norm": 1.8671458959579468, "learning_rate": 3.6214116407690286e-05, "loss": 1.8496, "step": 10476 }, { "epoch": 0.2759283644982881, "grad_norm": 2.9520442485809326, "learning_rate": 3.6212799578614695e-05, "loss": 1.837, "step": 10477 }, { "epoch": 0.27595470107979986, "grad_norm": 4.0135817527771, "learning_rate": 3.621148274953911e-05, "loss": 0.6895, "step": 10478 }, { "epoch": 0.27598103766131155, "grad_norm": 1.4256529808044434, "learning_rate": 3.6210165920463526e-05, "loss": 0.9646, "step": 10479 }, { "epoch": 0.2760073742428233, "grad_norm": 1.755150556564331, "learning_rate": 3.620884909138794e-05, "loss": 0.6211, 
"step": 10480 }, { "epoch": 0.276033710824335, "grad_norm": 3.8370206356048584, "learning_rate": 3.620753226231236e-05, "loss": 1.8618, "step": 10481 }, { "epoch": 0.2760600474058467, "grad_norm": 2.5078601837158203, "learning_rate": 3.6206215433236766e-05, "loss": 1.7979, "step": 10482 }, { "epoch": 0.27608638398735846, "grad_norm": 1.6675509214401245, "learning_rate": 3.620489860416118e-05, "loss": 1.9262, "step": 10483 }, { "epoch": 0.27611272056887015, "grad_norm": 3.0252974033355713, "learning_rate": 3.620358177508559e-05, "loss": 1.7034, "step": 10484 }, { "epoch": 0.2761390571503819, "grad_norm": 1.6886072158813477, "learning_rate": 3.620226494601001e-05, "loss": 1.5362, "step": 10485 }, { "epoch": 0.2761653937318936, "grad_norm": 3.0428547859191895, "learning_rate": 3.620094811693442e-05, "loss": 1.7356, "step": 10486 }, { "epoch": 0.2761917303134053, "grad_norm": 1.7481582164764404, "learning_rate": 3.619963128785884e-05, "loss": 2.1848, "step": 10487 }, { "epoch": 0.276218066894917, "grad_norm": 2.8296101093292236, "learning_rate": 3.619831445878325e-05, "loss": 2.1159, "step": 10488 }, { "epoch": 0.27624440347642876, "grad_norm": 2.57230544090271, "learning_rate": 3.619699762970766e-05, "loss": 0.7766, "step": 10489 }, { "epoch": 0.2762707400579405, "grad_norm": 1.5827943086624146, "learning_rate": 3.6195680800632084e-05, "loss": 1.4311, "step": 10490 }, { "epoch": 0.2762970766394522, "grad_norm": 2.5470101833343506, "learning_rate": 3.619436397155649e-05, "loss": 0.6682, "step": 10491 }, { "epoch": 0.27632341322096393, "grad_norm": 1.8786563873291016, "learning_rate": 3.619304714248091e-05, "loss": 1.3432, "step": 10492 }, { "epoch": 0.2763497498024756, "grad_norm": 1.6635288000106812, "learning_rate": 3.619173031340532e-05, "loss": 1.8164, "step": 10493 }, { "epoch": 0.27637608638398736, "grad_norm": 3.3679616451263428, "learning_rate": 3.619041348432974e-05, "loss": 1.8374, "step": 10494 }, { "epoch": 0.27640242296549905, "grad_norm": 
1.6528295278549194, "learning_rate": 3.618909665525415e-05, "loss": 1.8446, "step": 10495 }, { "epoch": 0.2764287595470108, "grad_norm": 2.233024835586548, "learning_rate": 3.6187779826178564e-05, "loss": 1.4532, "step": 10496 }, { "epoch": 0.27645509612852254, "grad_norm": 2.285409450531006, "learning_rate": 3.618646299710298e-05, "loss": 0.3484, "step": 10497 }, { "epoch": 0.2764814327100342, "grad_norm": 2.079171895980835, "learning_rate": 3.618514616802739e-05, "loss": 2.2279, "step": 10498 }, { "epoch": 0.27650776929154597, "grad_norm": 1.615371584892273, "learning_rate": 3.618382933895181e-05, "loss": 1.7252, "step": 10499 }, { "epoch": 0.27653410587305766, "grad_norm": 2.8617053031921387, "learning_rate": 3.618251250987622e-05, "loss": 1.9224, "step": 10500 }, { "epoch": 0.2765604424545694, "grad_norm": 2.6724143028259277, "learning_rate": 3.6181195680800636e-05, "loss": 1.9771, "step": 10501 }, { "epoch": 0.27658677903608114, "grad_norm": 2.234105110168457, "learning_rate": 3.6179878851725044e-05, "loss": 2.2792, "step": 10502 }, { "epoch": 0.27661311561759283, "grad_norm": 2.2478859424591064, "learning_rate": 3.617856202264946e-05, "loss": 2.338, "step": 10503 }, { "epoch": 0.2766394521991046, "grad_norm": 3.4104225635528564, "learning_rate": 3.6177245193573876e-05, "loss": 1.8354, "step": 10504 }, { "epoch": 0.27666578878061626, "grad_norm": 1.7714762687683105, "learning_rate": 3.617592836449829e-05, "loss": 1.8856, "step": 10505 }, { "epoch": 0.276692125362128, "grad_norm": 3.2018375396728516, "learning_rate": 3.61746115354227e-05, "loss": 0.7587, "step": 10506 }, { "epoch": 0.2767184619436397, "grad_norm": 2.6129369735717773, "learning_rate": 3.6173294706347116e-05, "loss": 1.5104, "step": 10507 }, { "epoch": 0.27674479852515144, "grad_norm": 1.854946494102478, "learning_rate": 3.617197787727153e-05, "loss": 1.8627, "step": 10508 }, { "epoch": 0.2767711351066632, "grad_norm": 1.9550607204437256, "learning_rate": 3.617066104819595e-05, "loss": 1.4634, 
"step": 10509 }, { "epoch": 0.27679747168817487, "grad_norm": 1.94491446018219, "learning_rate": 3.616934421912036e-05, "loss": 1.342, "step": 10510 }, { "epoch": 0.2768238082696866, "grad_norm": 2.0520408153533936, "learning_rate": 3.616802739004477e-05, "loss": 0.7579, "step": 10511 }, { "epoch": 0.2768501448511983, "grad_norm": 1.580329418182373, "learning_rate": 3.616671056096919e-05, "loss": 2.1161, "step": 10512 }, { "epoch": 0.27687648143271004, "grad_norm": 9.892873764038086, "learning_rate": 3.61653937318936e-05, "loss": 1.7196, "step": 10513 }, { "epoch": 0.27690281801422173, "grad_norm": 2.776784658432007, "learning_rate": 3.616407690281802e-05, "loss": 1.2163, "step": 10514 }, { "epoch": 0.2769291545957335, "grad_norm": 2.1026298999786377, "learning_rate": 3.616276007374243e-05, "loss": 0.7488, "step": 10515 }, { "epoch": 0.2769554911772452, "grad_norm": 3.374598741531372, "learning_rate": 3.616144324466684e-05, "loss": 1.1478, "step": 10516 }, { "epoch": 0.2769818277587569, "grad_norm": 2.0458824634552, "learning_rate": 3.616012641559126e-05, "loss": 0.8614, "step": 10517 }, { "epoch": 0.27700816434026865, "grad_norm": 2.6714377403259277, "learning_rate": 3.6158809586515674e-05, "loss": 0.7945, "step": 10518 }, { "epoch": 0.27703450092178034, "grad_norm": 3.6527814865112305, "learning_rate": 3.615749275744009e-05, "loss": 1.4755, "step": 10519 }, { "epoch": 0.2770608375032921, "grad_norm": 1.7627272605895996, "learning_rate": 3.61561759283645e-05, "loss": 1.8425, "step": 10520 }, { "epoch": 0.27708717408480377, "grad_norm": 2.1502063274383545, "learning_rate": 3.6154859099288914e-05, "loss": 1.6817, "step": 10521 }, { "epoch": 0.2771135106663155, "grad_norm": 3.362943172454834, "learning_rate": 3.615354227021332e-05, "loss": 1.3013, "step": 10522 }, { "epoch": 0.27713984724782725, "grad_norm": 2.4210638999938965, "learning_rate": 3.6152225441137745e-05, "loss": 1.6636, "step": 10523 }, { "epoch": 0.27716618382933894, "grad_norm": 2.529226064682007, 
"learning_rate": 3.6150908612062154e-05, "loss": 0.6731, "step": 10524 }, { "epoch": 0.2771925204108507, "grad_norm": 3.0127105712890625, "learning_rate": 3.614959178298657e-05, "loss": 1.0777, "step": 10525 }, { "epoch": 0.2772188569923624, "grad_norm": 3.4717442989349365, "learning_rate": 3.6148274953910985e-05, "loss": 1.0245, "step": 10526 }, { "epoch": 0.2772451935738741, "grad_norm": 3.9940237998962402, "learning_rate": 3.61469581248354e-05, "loss": 1.9003, "step": 10527 }, { "epoch": 0.2772715301553858, "grad_norm": 5.008260250091553, "learning_rate": 3.6145641295759816e-05, "loss": 1.633, "step": 10528 }, { "epoch": 0.27729786673689755, "grad_norm": 2.702284336090088, "learning_rate": 3.6144324466684225e-05, "loss": 1.9457, "step": 10529 }, { "epoch": 0.2773242033184093, "grad_norm": 1.7166951894760132, "learning_rate": 3.614300763760864e-05, "loss": 2.2659, "step": 10530 }, { "epoch": 0.277350539899921, "grad_norm": 1.9460575580596924, "learning_rate": 3.614169080853305e-05, "loss": 1.7085, "step": 10531 }, { "epoch": 0.2773768764814327, "grad_norm": 4.915445327758789, "learning_rate": 3.614037397945747e-05, "loss": 2.1869, "step": 10532 }, { "epoch": 0.2774032130629444, "grad_norm": 1.7240326404571533, "learning_rate": 3.613905715038188e-05, "loss": 1.8895, "step": 10533 }, { "epoch": 0.27742954964445615, "grad_norm": 2.6020450592041016, "learning_rate": 3.6137740321306296e-05, "loss": 1.6892, "step": 10534 }, { "epoch": 0.2774558862259679, "grad_norm": 3.6185905933380127, "learning_rate": 3.613642349223071e-05, "loss": 1.4694, "step": 10535 }, { "epoch": 0.2774822228074796, "grad_norm": 1.8450874090194702, "learning_rate": 3.613510666315512e-05, "loss": 1.9159, "step": 10536 }, { "epoch": 0.2775085593889913, "grad_norm": 2.4880099296569824, "learning_rate": 3.613378983407954e-05, "loss": 2.2643, "step": 10537 }, { "epoch": 0.277534895970503, "grad_norm": 1.7124220132827759, "learning_rate": 3.613247300500395e-05, "loss": 1.8497, "step": 10538 }, { 
"epoch": 0.27756123255201476, "grad_norm": 1.8947703838348389, "learning_rate": 3.613115617592837e-05, "loss": 1.4309, "step": 10539 }, { "epoch": 0.27758756913352645, "grad_norm": 1.5899467468261719, "learning_rate": 3.6129839346852776e-05, "loss": 1.5447, "step": 10540 }, { "epoch": 0.2776139057150382, "grad_norm": 1.6295113563537598, "learning_rate": 3.61285225177772e-05, "loss": 2.3948, "step": 10541 }, { "epoch": 0.27764024229654993, "grad_norm": 2.512416362762451, "learning_rate": 3.612720568870161e-05, "loss": 2.3674, "step": 10542 }, { "epoch": 0.2776665788780616, "grad_norm": 2.4679908752441406, "learning_rate": 3.612588885962602e-05, "loss": 1.2602, "step": 10543 }, { "epoch": 0.27769291545957336, "grad_norm": 1.6646476984024048, "learning_rate": 3.612457203055044e-05, "loss": 1.6468, "step": 10544 }, { "epoch": 0.27771925204108505, "grad_norm": 4.851602077484131, "learning_rate": 3.612325520147485e-05, "loss": 1.2795, "step": 10545 }, { "epoch": 0.2777455886225968, "grad_norm": 2.18587327003479, "learning_rate": 3.612193837239927e-05, "loss": 1.4798, "step": 10546 }, { "epoch": 0.2777719252041085, "grad_norm": 2.0642988681793213, "learning_rate": 3.612062154332368e-05, "loss": 1.6178, "step": 10547 }, { "epoch": 0.2777982617856202, "grad_norm": 1.6430569887161255, "learning_rate": 3.6119304714248094e-05, "loss": 2.2333, "step": 10548 }, { "epoch": 0.27782459836713197, "grad_norm": 2.343280076980591, "learning_rate": 3.61179878851725e-05, "loss": 1.7574, "step": 10549 }, { "epoch": 0.27785093494864366, "grad_norm": 2.0710694789886475, "learning_rate": 3.611667105609692e-05, "loss": 2.4, "step": 10550 }, { "epoch": 0.2778772715301554, "grad_norm": 2.84663987159729, "learning_rate": 3.6115354227021334e-05, "loss": 0.4437, "step": 10551 }, { "epoch": 0.2779036081116671, "grad_norm": 2.0948305130004883, "learning_rate": 3.611403739794575e-05, "loss": 1.502, "step": 10552 }, { "epoch": 0.27792994469317883, "grad_norm": 4.115591049194336, "learning_rate": 
3.6112720568870165e-05, "loss": 1.327, "step": 10553 }, { "epoch": 0.2779562812746905, "grad_norm": 2.811091184616089, "learning_rate": 3.6111403739794574e-05, "loss": 1.2381, "step": 10554 }, { "epoch": 0.27798261785620226, "grad_norm": 2.0517990589141846, "learning_rate": 3.611008691071899e-05, "loss": 1.5499, "step": 10555 }, { "epoch": 0.278008954437714, "grad_norm": 1.8026422262191772, "learning_rate": 3.6108770081643405e-05, "loss": 1.6637, "step": 10556 }, { "epoch": 0.2780352910192257, "grad_norm": 2.157827138900757, "learning_rate": 3.610745325256782e-05, "loss": 1.8581, "step": 10557 }, { "epoch": 0.27806162760073744, "grad_norm": 1.9836013317108154, "learning_rate": 3.610613642349223e-05, "loss": 1.9057, "step": 10558 }, { "epoch": 0.2780879641822491, "grad_norm": 1.6347705125808716, "learning_rate": 3.6104819594416645e-05, "loss": 1.8962, "step": 10559 }, { "epoch": 0.27811430076376087, "grad_norm": 2.470343589782715, "learning_rate": 3.610350276534106e-05, "loss": 1.8694, "step": 10560 }, { "epoch": 0.27814063734527256, "grad_norm": 1.6528599262237549, "learning_rate": 3.6102185936265477e-05, "loss": 2.3773, "step": 10561 }, { "epoch": 0.2781669739267843, "grad_norm": 1.9485795497894287, "learning_rate": 3.6100869107189885e-05, "loss": 1.8891, "step": 10562 }, { "epoch": 0.27819331050829604, "grad_norm": 3.1197283267974854, "learning_rate": 3.60995522781143e-05, "loss": 1.4587, "step": 10563 }, { "epoch": 0.27821964708980773, "grad_norm": 1.660825252532959, "learning_rate": 3.6098235449038717e-05, "loss": 1.7337, "step": 10564 }, { "epoch": 0.2782459836713195, "grad_norm": 5.408761501312256, "learning_rate": 3.609691861996313e-05, "loss": 1.3651, "step": 10565 }, { "epoch": 0.27827232025283116, "grad_norm": 2.2877254486083984, "learning_rate": 3.609560179088755e-05, "loss": 0.9732, "step": 10566 }, { "epoch": 0.2782986568343429, "grad_norm": 2.5487403869628906, "learning_rate": 3.6094284961811957e-05, "loss": 1.9014, "step": 10567 }, { "epoch": 
0.27832499341585465, "grad_norm": 2.2451066970825195, "learning_rate": 3.609296813273637e-05, "loss": 1.8042, "step": 10568 }, { "epoch": 0.27835132999736634, "grad_norm": 2.410404920578003, "learning_rate": 3.609165130366078e-05, "loss": 1.6118, "step": 10569 }, { "epoch": 0.2783776665788781, "grad_norm": 1.6133015155792236, "learning_rate": 3.6090334474585203e-05, "loss": 0.547, "step": 10570 }, { "epoch": 0.27840400316038977, "grad_norm": 3.749532699584961, "learning_rate": 3.608901764550961e-05, "loss": 0.6882, "step": 10571 }, { "epoch": 0.2784303397419015, "grad_norm": 3.7434544563293457, "learning_rate": 3.608770081643403e-05, "loss": 1.4323, "step": 10572 }, { "epoch": 0.2784566763234132, "grad_norm": 1.835457444190979, "learning_rate": 3.6086383987358443e-05, "loss": 1.9915, "step": 10573 }, { "epoch": 0.27848301290492494, "grad_norm": 3.5884530544281006, "learning_rate": 3.608506715828286e-05, "loss": 1.5782, "step": 10574 }, { "epoch": 0.2785093494864367, "grad_norm": 1.5033602714538574, "learning_rate": 3.6083750329207275e-05, "loss": 0.2876, "step": 10575 }, { "epoch": 0.2785356860679484, "grad_norm": 1.9221035242080688, "learning_rate": 3.6082433500131683e-05, "loss": 2.6736, "step": 10576 }, { "epoch": 0.2785620226494601, "grad_norm": 3.4220848083496094, "learning_rate": 3.60811166710561e-05, "loss": 1.6724, "step": 10577 }, { "epoch": 0.2785883592309718, "grad_norm": 4.6325578689575195, "learning_rate": 3.607979984198051e-05, "loss": 1.0641, "step": 10578 }, { "epoch": 0.27861469581248355, "grad_norm": 2.0484626293182373, "learning_rate": 3.607848301290493e-05, "loss": 1.754, "step": 10579 }, { "epoch": 0.27864103239399524, "grad_norm": 1.7742486000061035, "learning_rate": 3.607716618382934e-05, "loss": 1.252, "step": 10580 }, { "epoch": 0.278667368975507, "grad_norm": 3.1879234313964844, "learning_rate": 3.6075849354753755e-05, "loss": 1.6456, "step": 10581 }, { "epoch": 0.2786937055570187, "grad_norm": 1.864965558052063, "learning_rate": 
3.607453252567817e-05, "loss": 2.0505, "step": 10582 }, { "epoch": 0.2787200421385304, "grad_norm": 1.4988855123519897, "learning_rate": 3.607321569660258e-05, "loss": 1.4652, "step": 10583 }, { "epoch": 0.27874637872004215, "grad_norm": 1.9227646589279175, "learning_rate": 3.6071898867527e-05, "loss": 1.1357, "step": 10584 }, { "epoch": 0.27877271530155384, "grad_norm": 2.266540050506592, "learning_rate": 3.607058203845141e-05, "loss": 0.6786, "step": 10585 }, { "epoch": 0.2787990518830656, "grad_norm": 2.252044200897217, "learning_rate": 3.6069265209375826e-05, "loss": 1.9835, "step": 10586 }, { "epoch": 0.2788253884645773, "grad_norm": 1.7922183275222778, "learning_rate": 3.6067948380300235e-05, "loss": 1.6849, "step": 10587 }, { "epoch": 0.278851725046089, "grad_norm": 2.043391466140747, "learning_rate": 3.606663155122466e-05, "loss": 1.8441, "step": 10588 }, { "epoch": 0.27887806162760076, "grad_norm": 1.9932903051376343, "learning_rate": 3.6065314722149066e-05, "loss": 1.1982, "step": 10589 }, { "epoch": 0.27890439820911245, "grad_norm": 3.703014373779297, "learning_rate": 3.606399789307348e-05, "loss": 0.6268, "step": 10590 }, { "epoch": 0.2789307347906242, "grad_norm": 3.817866325378418, "learning_rate": 3.60626810639979e-05, "loss": 2.0868, "step": 10591 }, { "epoch": 0.2789570713721359, "grad_norm": 2.0294957160949707, "learning_rate": 3.6061364234922306e-05, "loss": 2.2436, "step": 10592 }, { "epoch": 0.2789834079536476, "grad_norm": 1.622313380241394, "learning_rate": 3.606004740584673e-05, "loss": 0.5224, "step": 10593 }, { "epoch": 0.2790097445351593, "grad_norm": 1.5825412273406982, "learning_rate": 3.605873057677114e-05, "loss": 2.2051, "step": 10594 }, { "epoch": 0.27903608111667105, "grad_norm": 1.7760380506515503, "learning_rate": 3.605741374769555e-05, "loss": 1.7089, "step": 10595 }, { "epoch": 0.2790624176981828, "grad_norm": 6.593071937561035, "learning_rate": 3.605609691861996e-05, "loss": 1.1579, "step": 10596 }, { "epoch": 
0.2790887542796945, "grad_norm": 2.3868892192840576, "learning_rate": 3.605478008954438e-05, "loss": 1.9262, "step": 10597 }, { "epoch": 0.2791150908612062, "grad_norm": 1.863279938697815, "learning_rate": 3.605346326046879e-05, "loss": 1.3981, "step": 10598 }, { "epoch": 0.2791414274427179, "grad_norm": 2.025031805038452, "learning_rate": 3.605214643139321e-05, "loss": 2.6291, "step": 10599 }, { "epoch": 0.27916776402422966, "grad_norm": 2.297893762588501, "learning_rate": 3.6050829602317624e-05, "loss": 1.5302, "step": 10600 }, { "epoch": 0.2791941006057414, "grad_norm": 3.0263586044311523, "learning_rate": 3.604951277324203e-05, "loss": 1.314, "step": 10601 }, { "epoch": 0.2792204371872531, "grad_norm": 1.8723514080047607, "learning_rate": 3.604819594416645e-05, "loss": 1.707, "step": 10602 }, { "epoch": 0.27924677376876483, "grad_norm": 4.323929786682129, "learning_rate": 3.6046879115090864e-05, "loss": 1.8483, "step": 10603 }, { "epoch": 0.2792731103502765, "grad_norm": 4.868648052215576, "learning_rate": 3.604556228601528e-05, "loss": 1.5361, "step": 10604 }, { "epoch": 0.27929944693178826, "grad_norm": 1.6330366134643555, "learning_rate": 3.604424545693969e-05, "loss": 1.955, "step": 10605 }, { "epoch": 0.27932578351329995, "grad_norm": 1.8268793821334839, "learning_rate": 3.6042928627864104e-05, "loss": 2.5499, "step": 10606 }, { "epoch": 0.2793521200948117, "grad_norm": 1.7831532955169678, "learning_rate": 3.604161179878852e-05, "loss": 1.7541, "step": 10607 }, { "epoch": 0.27937845667632344, "grad_norm": 1.713757038116455, "learning_rate": 3.6040294969712935e-05, "loss": 1.8245, "step": 10608 }, { "epoch": 0.2794047932578351, "grad_norm": 3.6432838439941406, "learning_rate": 3.6038978140637344e-05, "loss": 1.1547, "step": 10609 }, { "epoch": 0.27943112983934687, "grad_norm": 2.1600749492645264, "learning_rate": 3.603766131156176e-05, "loss": 1.5969, "step": 10610 }, { "epoch": 0.27945746642085856, "grad_norm": 2.707930564880371, "learning_rate": 
3.6036344482486175e-05, "loss": 2.5188, "step": 10611 }, { "epoch": 0.2794838030023703, "grad_norm": 2.9858036041259766, "learning_rate": 3.603502765341059e-05, "loss": 1.1827, "step": 10612 }, { "epoch": 0.279510139583882, "grad_norm": 1.812178373336792, "learning_rate": 3.6033710824335006e-05, "loss": 1.9033, "step": 10613 }, { "epoch": 0.27953647616539373, "grad_norm": 1.4573904275894165, "learning_rate": 3.6032393995259415e-05, "loss": 1.8184, "step": 10614 }, { "epoch": 0.2795628127469055, "grad_norm": 2.8986144065856934, "learning_rate": 3.603107716618383e-05, "loss": 1.4632, "step": 10615 }, { "epoch": 0.27958914932841716, "grad_norm": 3.6511080265045166, "learning_rate": 3.602976033710824e-05, "loss": 0.9638, "step": 10616 }, { "epoch": 0.2796154859099289, "grad_norm": 1.5710798501968384, "learning_rate": 3.602844350803266e-05, "loss": 2.1146, "step": 10617 }, { "epoch": 0.2796418224914406, "grad_norm": 4.432260513305664, "learning_rate": 3.602712667895707e-05, "loss": 1.0949, "step": 10618 }, { "epoch": 0.27966815907295234, "grad_norm": 2.828469753265381, "learning_rate": 3.6025809849881486e-05, "loss": 1.5586, "step": 10619 }, { "epoch": 0.279694495654464, "grad_norm": 1.6373722553253174, "learning_rate": 3.60244930208059e-05, "loss": 1.9336, "step": 10620 }, { "epoch": 0.27972083223597577, "grad_norm": 2.660008430480957, "learning_rate": 3.602317619173032e-05, "loss": 2.4598, "step": 10621 }, { "epoch": 0.2797471688174875, "grad_norm": 2.2494709491729736, "learning_rate": 3.602185936265473e-05, "loss": 1.9613, "step": 10622 }, { "epoch": 0.2797735053989992, "grad_norm": 3.685098886489868, "learning_rate": 3.602054253357914e-05, "loss": 1.0617, "step": 10623 }, { "epoch": 0.27979984198051094, "grad_norm": 1.8566356897354126, "learning_rate": 3.601922570450356e-05, "loss": 1.7472, "step": 10624 }, { "epoch": 0.27982617856202263, "grad_norm": 1.5898544788360596, "learning_rate": 3.6017908875427966e-05, "loss": 1.6229, "step": 10625 }, { "epoch": 
0.2798525151435344, "grad_norm": 3.930208921432495, "learning_rate": 3.601659204635239e-05, "loss": 1.2664, "step": 10626 }, { "epoch": 0.27987885172504606, "grad_norm": 2.7435531616210938, "learning_rate": 3.60152752172768e-05, "loss": 0.605, "step": 10627 }, { "epoch": 0.2799051883065578, "grad_norm": 1.8299710750579834, "learning_rate": 3.601395838820121e-05, "loss": 1.8009, "step": 10628 }, { "epoch": 0.27993152488806955, "grad_norm": 4.164587497711182, "learning_rate": 3.601264155912563e-05, "loss": 1.8989, "step": 10629 }, { "epoch": 0.27995786146958124, "grad_norm": 3.663940668106079, "learning_rate": 3.601132473005004e-05, "loss": 2.3288, "step": 10630 }, { "epoch": 0.279984198051093, "grad_norm": 1.454330325126648, "learning_rate": 3.601000790097446e-05, "loss": 1.584, "step": 10631 }, { "epoch": 0.28001053463260467, "grad_norm": 2.1807925701141357, "learning_rate": 3.600869107189887e-05, "loss": 2.5548, "step": 10632 }, { "epoch": 0.2800368712141164, "grad_norm": 1.6666271686553955, "learning_rate": 3.6007374242823284e-05, "loss": 1.5791, "step": 10633 }, { "epoch": 0.28006320779562816, "grad_norm": 2.1405491828918457, "learning_rate": 3.600605741374769e-05, "loss": 1.87, "step": 10634 }, { "epoch": 0.28008954437713984, "grad_norm": 2.001499652862549, "learning_rate": 3.600474058467211e-05, "loss": 1.5485, "step": 10635 }, { "epoch": 0.2801158809586516, "grad_norm": 3.800605535507202, "learning_rate": 3.6003423755596524e-05, "loss": 1.2924, "step": 10636 }, { "epoch": 0.2801422175401633, "grad_norm": 1.8137459754943848, "learning_rate": 3.600210692652094e-05, "loss": 1.8019, "step": 10637 }, { "epoch": 0.280168554121675, "grad_norm": 4.043032646179199, "learning_rate": 3.6000790097445356e-05, "loss": 1.1812, "step": 10638 }, { "epoch": 0.2801948907031867, "grad_norm": 2.0625429153442383, "learning_rate": 3.5999473268369764e-05, "loss": 0.9757, "step": 10639 }, { "epoch": 0.28022122728469845, "grad_norm": 2.455791711807251, "learning_rate": 
3.599815643929419e-05, "loss": 0.8719, "step": 10640 }, { "epoch": 0.2802475638662102, "grad_norm": 2.3219807147979736, "learning_rate": 3.5996839610218596e-05, "loss": 0.5165, "step": 10641 }, { "epoch": 0.2802739004477219, "grad_norm": 1.8683165311813354, "learning_rate": 3.599552278114301e-05, "loss": 0.33, "step": 10642 }, { "epoch": 0.2803002370292336, "grad_norm": 1.7563034296035767, "learning_rate": 3.599420595206742e-05, "loss": 1.75, "step": 10643 }, { "epoch": 0.2803265736107453, "grad_norm": 3.0634682178497314, "learning_rate": 3.5992889122991836e-05, "loss": 1.8466, "step": 10644 }, { "epoch": 0.28035291019225705, "grad_norm": 1.5442073345184326, "learning_rate": 3.599157229391625e-05, "loss": 2.156, "step": 10645 }, { "epoch": 0.28037924677376874, "grad_norm": 1.9861361980438232, "learning_rate": 3.599025546484067e-05, "loss": 1.5096, "step": 10646 }, { "epoch": 0.2804055833552805, "grad_norm": 3.358288288116455, "learning_rate": 3.598893863576508e-05, "loss": 1.0369, "step": 10647 }, { "epoch": 0.28043191993679223, "grad_norm": 2.8081891536712646, "learning_rate": 3.598762180668949e-05, "loss": 1.0704, "step": 10648 }, { "epoch": 0.2804582565183039, "grad_norm": 1.6818207502365112, "learning_rate": 3.598630497761391e-05, "loss": 2.0621, "step": 10649 }, { "epoch": 0.28048459309981566, "grad_norm": 2.4079747200012207, "learning_rate": 3.598498814853832e-05, "loss": 1.6631, "step": 10650 }, { "epoch": 0.28051092968132735, "grad_norm": 3.6856985092163086, "learning_rate": 3.598367131946274e-05, "loss": 2.029, "step": 10651 }, { "epoch": 0.2805372662628391, "grad_norm": 2.1202104091644287, "learning_rate": 3.598235449038715e-05, "loss": 1.9108, "step": 10652 }, { "epoch": 0.2805636028443508, "grad_norm": 7.725017547607422, "learning_rate": 3.598103766131156e-05, "loss": 1.2323, "step": 10653 }, { "epoch": 0.2805899394258625, "grad_norm": 1.7582473754882812, "learning_rate": 3.597972083223598e-05, "loss": 1.3198, "step": 10654 }, { "epoch": 
0.28061627600737427, "grad_norm": 2.8983075618743896, "learning_rate": 3.5978404003160394e-05, "loss": 1.9464, "step": 10655 }, { "epoch": 0.28064261258888595, "grad_norm": 1.5993150472640991, "learning_rate": 3.597708717408481e-05, "loss": 2.208, "step": 10656 }, { "epoch": 0.2806689491703977, "grad_norm": 1.9493886232376099, "learning_rate": 3.597577034500922e-05, "loss": 1.7803, "step": 10657 }, { "epoch": 0.2806952857519094, "grad_norm": 1.8038997650146484, "learning_rate": 3.5974453515933634e-05, "loss": 2.0923, "step": 10658 }, { "epoch": 0.28072162233342113, "grad_norm": 3.5540947914123535, "learning_rate": 3.597313668685805e-05, "loss": 0.386, "step": 10659 }, { "epoch": 0.2807479589149328, "grad_norm": 2.0793118476867676, "learning_rate": 3.5971819857782465e-05, "loss": 1.6538, "step": 10660 }, { "epoch": 0.28077429549644456, "grad_norm": 1.7483510971069336, "learning_rate": 3.5970503028706874e-05, "loss": 1.9041, "step": 10661 }, { "epoch": 0.2808006320779563, "grad_norm": 6.435232639312744, "learning_rate": 3.596918619963129e-05, "loss": 1.3221, "step": 10662 }, { "epoch": 0.280826968659468, "grad_norm": 2.415607213973999, "learning_rate": 3.59678693705557e-05, "loss": 1.6288, "step": 10663 }, { "epoch": 0.28085330524097973, "grad_norm": 1.8533782958984375, "learning_rate": 3.596655254148012e-05, "loss": 1.5603, "step": 10664 }, { "epoch": 0.2808796418224914, "grad_norm": 3.2497153282165527, "learning_rate": 3.596523571240453e-05, "loss": 1.237, "step": 10665 }, { "epoch": 0.28090597840400316, "grad_norm": 1.4536837339401245, "learning_rate": 3.5963918883328945e-05, "loss": 2.5447, "step": 10666 }, { "epoch": 0.28093231498551485, "grad_norm": 1.774584412574768, "learning_rate": 3.596260205425336e-05, "loss": 1.4216, "step": 10667 }, { "epoch": 0.2809586515670266, "grad_norm": 2.871354103088379, "learning_rate": 3.596128522517777e-05, "loss": 1.5716, "step": 10668 }, { "epoch": 0.28098498814853834, "grad_norm": 1.5408275127410889, "learning_rate": 
3.595996839610219e-05, "loss": 2.125, "step": 10669 }, { "epoch": 0.28101132473005, "grad_norm": 2.006460189819336, "learning_rate": 3.59586515670266e-05, "loss": 1.6612, "step": 10670 }, { "epoch": 0.28103766131156177, "grad_norm": 3.255465030670166, "learning_rate": 3.5957334737951016e-05, "loss": 1.4083, "step": 10671 }, { "epoch": 0.28106399789307346, "grad_norm": 4.274167537689209, "learning_rate": 3.5956017908875425e-05, "loss": 1.2642, "step": 10672 }, { "epoch": 0.2810903344745852, "grad_norm": 2.426962375640869, "learning_rate": 3.595470107979985e-05, "loss": 1.4912, "step": 10673 }, { "epoch": 0.28111667105609695, "grad_norm": 1.8696622848510742, "learning_rate": 3.5953384250724256e-05, "loss": 1.2084, "step": 10674 }, { "epoch": 0.28114300763760863, "grad_norm": 4.604039192199707, "learning_rate": 3.595206742164867e-05, "loss": 0.9946, "step": 10675 }, { "epoch": 0.2811693442191204, "grad_norm": 5.191089153289795, "learning_rate": 3.595075059257309e-05, "loss": 2.1446, "step": 10676 }, { "epoch": 0.28119568080063206, "grad_norm": 2.00488018989563, "learning_rate": 3.5949433763497496e-05, "loss": 1.9144, "step": 10677 }, { "epoch": 0.2812220173821438, "grad_norm": 2.6417410373687744, "learning_rate": 3.594811693442192e-05, "loss": 1.2435, "step": 10678 }, { "epoch": 0.2812483539636555, "grad_norm": 3.796337842941284, "learning_rate": 3.594680010534633e-05, "loss": 0.8128, "step": 10679 }, { "epoch": 0.28127469054516724, "grad_norm": 2.1222522258758545, "learning_rate": 3.594548327627074e-05, "loss": 2.1263, "step": 10680 }, { "epoch": 0.281301027126679, "grad_norm": 6.180479526519775, "learning_rate": 3.594416644719515e-05, "loss": 1.6803, "step": 10681 }, { "epoch": 0.28132736370819067, "grad_norm": 4.049415588378906, "learning_rate": 3.594284961811957e-05, "loss": 1.141, "step": 10682 }, { "epoch": 0.2813537002897024, "grad_norm": 1.708604335784912, "learning_rate": 3.594153278904398e-05, "loss": 2.5271, "step": 10683 }, { "epoch": 0.2813800368712141, 
"grad_norm": 2.8715932369232178, "learning_rate": 3.59402159599684e-05, "loss": 1.3198, "step": 10684 }, { "epoch": 0.28140637345272584, "grad_norm": 3.1334052085876465, "learning_rate": 3.5938899130892814e-05, "loss": 0.2673, "step": 10685 }, { "epoch": 0.28143271003423753, "grad_norm": 3.3107383251190186, "learning_rate": 3.593758230181722e-05, "loss": 1.4493, "step": 10686 }, { "epoch": 0.2814590466157493, "grad_norm": 1.8343831300735474, "learning_rate": 3.5936265472741645e-05, "loss": 2.1448, "step": 10687 }, { "epoch": 0.281485383197261, "grad_norm": 1.6674515008926392, "learning_rate": 3.5934948643666054e-05, "loss": 1.8204, "step": 10688 }, { "epoch": 0.2815117197787727, "grad_norm": 2.22741436958313, "learning_rate": 3.593363181459047e-05, "loss": 1.5256, "step": 10689 }, { "epoch": 0.28153805636028445, "grad_norm": 2.178539991378784, "learning_rate": 3.593231498551488e-05, "loss": 1.7154, "step": 10690 }, { "epoch": 0.28156439294179614, "grad_norm": 2.9959981441497803, "learning_rate": 3.5930998156439294e-05, "loss": 1.8, "step": 10691 }, { "epoch": 0.2815907295233079, "grad_norm": 1.6000972986221313, "learning_rate": 3.592968132736371e-05, "loss": 1.5372, "step": 10692 }, { "epoch": 0.28161706610481957, "grad_norm": 1.7989879846572876, "learning_rate": 3.5928364498288125e-05, "loss": 2.565, "step": 10693 }, { "epoch": 0.2816434026863313, "grad_norm": 1.635472059249878, "learning_rate": 3.592704766921254e-05, "loss": 1.303, "step": 10694 }, { "epoch": 0.28166973926784306, "grad_norm": 2.0246100425720215, "learning_rate": 3.592573084013695e-05, "loss": 1.5681, "step": 10695 }, { "epoch": 0.28169607584935474, "grad_norm": 1.5443994998931885, "learning_rate": 3.5924414011061365e-05, "loss": 2.0599, "step": 10696 }, { "epoch": 0.2817224124308665, "grad_norm": 3.6381704807281494, "learning_rate": 3.592309718198578e-05, "loss": 0.5802, "step": 10697 }, { "epoch": 0.2817487490123782, "grad_norm": 2.2468931674957275, "learning_rate": 3.59217803529102e-05, "loss": 
2.0681, "step": 10698 }, { "epoch": 0.2817750855938899, "grad_norm": 1.9977346658706665, "learning_rate": 3.5920463523834605e-05, "loss": 2.037, "step": 10699 }, { "epoch": 0.2818014221754016, "grad_norm": 1.7926998138427734, "learning_rate": 3.591914669475902e-05, "loss": 1.299, "step": 10700 }, { "epoch": 0.28182775875691335, "grad_norm": 1.8555229902267456, "learning_rate": 3.591782986568344e-05, "loss": 1.7055, "step": 10701 }, { "epoch": 0.2818540953384251, "grad_norm": 2.0828359127044678, "learning_rate": 3.591651303660785e-05, "loss": 1.5137, "step": 10702 }, { "epoch": 0.2818804319199368, "grad_norm": 2.4519834518432617, "learning_rate": 3.591519620753227e-05, "loss": 1.5815, "step": 10703 }, { "epoch": 0.2819067685014485, "grad_norm": 3.7882378101348877, "learning_rate": 3.591387937845668e-05, "loss": 2.279, "step": 10704 }, { "epoch": 0.2819331050829602, "grad_norm": 3.561875581741333, "learning_rate": 3.591256254938109e-05, "loss": 0.529, "step": 10705 }, { "epoch": 0.28195944166447195, "grad_norm": 2.8583076000213623, "learning_rate": 3.591124572030551e-05, "loss": 1.5987, "step": 10706 }, { "epoch": 0.2819857782459837, "grad_norm": 2.1120409965515137, "learning_rate": 3.5909928891229923e-05, "loss": 2.0727, "step": 10707 }, { "epoch": 0.2820121148274954, "grad_norm": 2.837987184524536, "learning_rate": 3.590861206215433e-05, "loss": 1.2557, "step": 10708 }, { "epoch": 0.28203845140900713, "grad_norm": 1.9793260097503662, "learning_rate": 3.590729523307875e-05, "loss": 2.0804, "step": 10709 }, { "epoch": 0.2820647879905188, "grad_norm": 2.5520782470703125, "learning_rate": 3.590597840400316e-05, "loss": 1.4869, "step": 10710 }, { "epoch": 0.28209112457203056, "grad_norm": 1.3926054239273071, "learning_rate": 3.590466157492758e-05, "loss": 1.2704, "step": 10711 }, { "epoch": 0.28211746115354225, "grad_norm": 2.701925277709961, "learning_rate": 3.590334474585199e-05, "loss": 1.4179, "step": 10712 }, { "epoch": 0.282143797735054, "grad_norm": 
2.1796493530273438, "learning_rate": 3.5902027916776403e-05, "loss": 1.6388, "step": 10713 }, { "epoch": 0.28217013431656573, "grad_norm": 3.534756660461426, "learning_rate": 3.590071108770082e-05, "loss": 0.7621, "step": 10714 }, { "epoch": 0.2821964708980774, "grad_norm": 3.0433273315429688, "learning_rate": 3.589939425862523e-05, "loss": 1.814, "step": 10715 }, { "epoch": 0.28222280747958917, "grad_norm": 1.9669334888458252, "learning_rate": 3.589807742954965e-05, "loss": 2.0966, "step": 10716 }, { "epoch": 0.28224914406110085, "grad_norm": 1.8439295291900635, "learning_rate": 3.589676060047406e-05, "loss": 2.0059, "step": 10717 }, { "epoch": 0.2822754806426126, "grad_norm": 1.5100206136703491, "learning_rate": 3.5895443771398475e-05, "loss": 1.4924, "step": 10718 }, { "epoch": 0.2823018172241243, "grad_norm": 2.1831393241882324, "learning_rate": 3.5894126942322884e-05, "loss": 1.3613, "step": 10719 }, { "epoch": 0.28232815380563603, "grad_norm": 2.080354928970337, "learning_rate": 3.5892810113247306e-05, "loss": 1.6664, "step": 10720 }, { "epoch": 0.28235449038714777, "grad_norm": 4.941184997558594, "learning_rate": 3.5891493284171715e-05, "loss": 1.6204, "step": 10721 }, { "epoch": 0.28238082696865946, "grad_norm": 2.2045881748199463, "learning_rate": 3.589017645509613e-05, "loss": 1.1392, "step": 10722 }, { "epoch": 0.2824071635501712, "grad_norm": 3.418728828430176, "learning_rate": 3.5888859626020546e-05, "loss": 1.6709, "step": 10723 }, { "epoch": 0.2824335001316829, "grad_norm": 1.8697012662887573, "learning_rate": 3.5887542796944955e-05, "loss": 2.127, "step": 10724 }, { "epoch": 0.28245983671319463, "grad_norm": 3.483039140701294, "learning_rate": 3.588622596786938e-05, "loss": 2.0922, "step": 10725 }, { "epoch": 0.2824861732947063, "grad_norm": 2.179659128189087, "learning_rate": 3.5884909138793786e-05, "loss": 1.1217, "step": 10726 }, { "epoch": 0.28251250987621807, "grad_norm": 1.5021837949752808, "learning_rate": 3.58835923097182e-05, "loss": 
1.3667, "step": 10727 }, { "epoch": 0.2825388464577298, "grad_norm": 1.7014353275299072, "learning_rate": 3.588227548064261e-05, "loss": 1.4668, "step": 10728 }, { "epoch": 0.2825651830392415, "grad_norm": 1.534012794494629, "learning_rate": 3.5880958651567026e-05, "loss": 1.7634, "step": 10729 }, { "epoch": 0.28259151962075324, "grad_norm": 1.7933578491210938, "learning_rate": 3.587964182249144e-05, "loss": 0.7792, "step": 10730 }, { "epoch": 0.2826178562022649, "grad_norm": 3.4233410358428955, "learning_rate": 3.587832499341586e-05, "loss": 0.8375, "step": 10731 }, { "epoch": 0.28264419278377667, "grad_norm": 3.7914533615112305, "learning_rate": 3.587700816434027e-05, "loss": 1.1995, "step": 10732 }, { "epoch": 0.28267052936528836, "grad_norm": 2.1551201343536377, "learning_rate": 3.587569133526468e-05, "loss": 1.1777, "step": 10733 }, { "epoch": 0.2826968659468001, "grad_norm": 3.0795228481292725, "learning_rate": 3.58743745061891e-05, "loss": 1.6719, "step": 10734 }, { "epoch": 0.28272320252831185, "grad_norm": 1.899985909461975, "learning_rate": 3.587305767711351e-05, "loss": 2.0629, "step": 10735 }, { "epoch": 0.28274953910982353, "grad_norm": 2.0340893268585205, "learning_rate": 3.587174084803793e-05, "loss": 1.375, "step": 10736 }, { "epoch": 0.2827758756913353, "grad_norm": 1.7357654571533203, "learning_rate": 3.587042401896234e-05, "loss": 1.704, "step": 10737 }, { "epoch": 0.28280221227284696, "grad_norm": 3.342672348022461, "learning_rate": 3.586910718988675e-05, "loss": 1.2899, "step": 10738 }, { "epoch": 0.2828285488543587, "grad_norm": 3.6308014392852783, "learning_rate": 3.586779036081117e-05, "loss": 1.6223, "step": 10739 }, { "epoch": 0.28285488543587045, "grad_norm": 3.73903226852417, "learning_rate": 3.5866473531735584e-05, "loss": 1.5266, "step": 10740 }, { "epoch": 0.28288122201738214, "grad_norm": 1.6772918701171875, "learning_rate": 3.586515670266e-05, "loss": 1.6978, "step": 10741 }, { "epoch": 0.2829075585988939, "grad_norm": 
1.7821944952011108, "learning_rate": 3.586383987358441e-05, "loss": 1.883, "step": 10742 }, { "epoch": 0.28293389518040557, "grad_norm": 4.070680618286133, "learning_rate": 3.5862523044508824e-05, "loss": 1.7298, "step": 10743 }, { "epoch": 0.2829602317619173, "grad_norm": 2.752892255783081, "learning_rate": 3.586120621543324e-05, "loss": 1.6368, "step": 10744 }, { "epoch": 0.282986568343429, "grad_norm": 2.187781572341919, "learning_rate": 3.5859889386357655e-05, "loss": 1.8481, "step": 10745 }, { "epoch": 0.28301290492494074, "grad_norm": 1.964159607887268, "learning_rate": 3.5858572557282064e-05, "loss": 1.5193, "step": 10746 }, { "epoch": 0.2830392415064525, "grad_norm": 2.1101324558258057, "learning_rate": 3.585725572820648e-05, "loss": 2.3428, "step": 10747 }, { "epoch": 0.2830655780879642, "grad_norm": 1.8688033819198608, "learning_rate": 3.5855938899130895e-05, "loss": 2.0471, "step": 10748 }, { "epoch": 0.2830919146694759, "grad_norm": 2.8875467777252197, "learning_rate": 3.585462207005531e-05, "loss": 1.9323, "step": 10749 }, { "epoch": 0.2831182512509876, "grad_norm": 1.721235990524292, "learning_rate": 3.5853305240979726e-05, "loss": 2.2385, "step": 10750 }, { "epoch": 0.28314458783249935, "grad_norm": 2.5938611030578613, "learning_rate": 3.5851988411904135e-05, "loss": 1.7843, "step": 10751 }, { "epoch": 0.28317092441401104, "grad_norm": 1.7995103597640991, "learning_rate": 3.585067158282855e-05, "loss": 1.9036, "step": 10752 }, { "epoch": 0.2831972609955228, "grad_norm": 3.8045501708984375, "learning_rate": 3.5849354753752966e-05, "loss": 2.0679, "step": 10753 }, { "epoch": 0.2832235975770345, "grad_norm": 2.0957834720611572, "learning_rate": 3.584803792467738e-05, "loss": 1.7508, "step": 10754 }, { "epoch": 0.2832499341585462, "grad_norm": 4.18516206741333, "learning_rate": 3.584672109560179e-05, "loss": 1.6536, "step": 10755 }, { "epoch": 0.28327627074005796, "grad_norm": 2.9924893379211426, "learning_rate": 3.5845404266526206e-05, "loss": 1.7985, 
"step": 10756 }, { "epoch": 0.28330260732156964, "grad_norm": 2.581116199493408, "learning_rate": 3.5844087437450615e-05, "loss": 1.4849, "step": 10757 }, { "epoch": 0.2833289439030814, "grad_norm": 2.794194221496582, "learning_rate": 3.584277060837504e-05, "loss": 1.2438, "step": 10758 }, { "epoch": 0.2833552804845931, "grad_norm": 3.0024542808532715, "learning_rate": 3.5841453779299446e-05, "loss": 1.3339, "step": 10759 }, { "epoch": 0.2833816170661048, "grad_norm": 2.0632169246673584, "learning_rate": 3.584013695022386e-05, "loss": 1.1334, "step": 10760 }, { "epoch": 0.28340795364761656, "grad_norm": 3.399566888809204, "learning_rate": 3.583882012114828e-05, "loss": 1.8846, "step": 10761 }, { "epoch": 0.28343429022912825, "grad_norm": 2.2777318954467773, "learning_rate": 3.5837503292072686e-05, "loss": 1.5551, "step": 10762 }, { "epoch": 0.28346062681064, "grad_norm": 1.8050422668457031, "learning_rate": 3.583618646299711e-05, "loss": 1.6948, "step": 10763 }, { "epoch": 0.2834869633921517, "grad_norm": 1.8940041065216064, "learning_rate": 3.583486963392152e-05, "loss": 1.2282, "step": 10764 }, { "epoch": 0.2835132999736634, "grad_norm": 2.728278398513794, "learning_rate": 3.583355280484593e-05, "loss": 1.2962, "step": 10765 }, { "epoch": 0.2835396365551751, "grad_norm": 4.125481605529785, "learning_rate": 3.583223597577034e-05, "loss": 2.2674, "step": 10766 }, { "epoch": 0.28356597313668686, "grad_norm": 3.0636467933654785, "learning_rate": 3.583091914669476e-05, "loss": 1.7247, "step": 10767 }, { "epoch": 0.2835923097181986, "grad_norm": 3.9943528175354004, "learning_rate": 3.582960231761917e-05, "loss": 1.2233, "step": 10768 }, { "epoch": 0.2836186462997103, "grad_norm": 5.200692653656006, "learning_rate": 3.582828548854359e-05, "loss": 1.1838, "step": 10769 }, { "epoch": 0.28364498288122203, "grad_norm": 6.776669025421143, "learning_rate": 3.5826968659468004e-05, "loss": 1.1258, "step": 10770 }, { "epoch": 0.2836713194627337, "grad_norm": 1.9097925424575806, 
"learning_rate": 3.582565183039241e-05, "loss": 1.449, "step": 10771 }, { "epoch": 0.28369765604424546, "grad_norm": 2.9563980102539062, "learning_rate": 3.5824335001316836e-05, "loss": 1.4752, "step": 10772 }, { "epoch": 0.2837239926257572, "grad_norm": 3.4084765911102295, "learning_rate": 3.5823018172241244e-05, "loss": 0.757, "step": 10773 }, { "epoch": 0.2837503292072689, "grad_norm": 2.4827938079833984, "learning_rate": 3.582170134316566e-05, "loss": 1.5668, "step": 10774 }, { "epoch": 0.28377666578878064, "grad_norm": 1.687890648841858, "learning_rate": 3.582038451409007e-05, "loss": 1.8184, "step": 10775 }, { "epoch": 0.2838030023702923, "grad_norm": 1.8881187438964844, "learning_rate": 3.5819067685014484e-05, "loss": 2.5232, "step": 10776 }, { "epoch": 0.28382933895180407, "grad_norm": 3.3049566745758057, "learning_rate": 3.58177508559389e-05, "loss": 1.678, "step": 10777 }, { "epoch": 0.28385567553331575, "grad_norm": 3.8362557888031006, "learning_rate": 3.5816434026863316e-05, "loss": 1.7891, "step": 10778 }, { "epoch": 0.2838820121148275, "grad_norm": 1.5089341402053833, "learning_rate": 3.581511719778773e-05, "loss": 2.3159, "step": 10779 }, { "epoch": 0.28390834869633924, "grad_norm": 1.741356611251831, "learning_rate": 3.581380036871214e-05, "loss": 1.8636, "step": 10780 }, { "epoch": 0.28393468527785093, "grad_norm": 1.5450695753097534, "learning_rate": 3.5812483539636556e-05, "loss": 0.3157, "step": 10781 }, { "epoch": 0.2839610218593627, "grad_norm": 1.453835129737854, "learning_rate": 3.581116671056097e-05, "loss": 2.0476, "step": 10782 }, { "epoch": 0.28398735844087436, "grad_norm": 3.3188846111297607, "learning_rate": 3.580984988148539e-05, "loss": 0.5036, "step": 10783 }, { "epoch": 0.2840136950223861, "grad_norm": 1.9738532304763794, "learning_rate": 3.5808533052409796e-05, "loss": 1.659, "step": 10784 }, { "epoch": 0.2840400316038978, "grad_norm": 1.987406611442566, "learning_rate": 3.580721622333421e-05, "loss": 1.9322, "step": 10785 }, { 
"epoch": 0.28406636818540953, "grad_norm": 3.094905138015747, "learning_rate": 3.580589939425863e-05, "loss": 1.2558, "step": 10786 }, { "epoch": 0.2840927047669213, "grad_norm": 2.3643505573272705, "learning_rate": 3.580458256518304e-05, "loss": 1.6724, "step": 10787 }, { "epoch": 0.28411904134843297, "grad_norm": 2.3726208209991455, "learning_rate": 3.580326573610746e-05, "loss": 1.7988, "step": 10788 }, { "epoch": 0.2841453779299447, "grad_norm": 2.0558271408081055, "learning_rate": 3.580194890703187e-05, "loss": 2.0569, "step": 10789 }, { "epoch": 0.2841717145114564, "grad_norm": 3.745953321456909, "learning_rate": 3.580063207795628e-05, "loss": 1.3069, "step": 10790 }, { "epoch": 0.28419805109296814, "grad_norm": 1.679540753364563, "learning_rate": 3.57993152488807e-05, "loss": 1.4655, "step": 10791 }, { "epoch": 0.28422438767447983, "grad_norm": 1.8753336668014526, "learning_rate": 3.5797998419805114e-05, "loss": 2.1195, "step": 10792 }, { "epoch": 0.28425072425599157, "grad_norm": 1.9696885347366333, "learning_rate": 3.579668159072952e-05, "loss": 1.4623, "step": 10793 }, { "epoch": 0.2842770608375033, "grad_norm": 11.313634872436523, "learning_rate": 3.579536476165394e-05, "loss": 2.0706, "step": 10794 }, { "epoch": 0.284303397419015, "grad_norm": 1.9421197175979614, "learning_rate": 3.5794047932578354e-05, "loss": 1.9291, "step": 10795 }, { "epoch": 0.28432973400052675, "grad_norm": 2.6556382179260254, "learning_rate": 3.579273110350277e-05, "loss": 0.9597, "step": 10796 }, { "epoch": 0.28435607058203843, "grad_norm": 2.0371716022491455, "learning_rate": 3.5791414274427185e-05, "loss": 1.846, "step": 10797 }, { "epoch": 0.2843824071635502, "grad_norm": 2.046513319015503, "learning_rate": 3.5790097445351594e-05, "loss": 1.5943, "step": 10798 }, { "epoch": 0.28440874374506186, "grad_norm": 1.6495250463485718, "learning_rate": 3.578878061627601e-05, "loss": 1.9194, "step": 10799 }, { "epoch": 0.2844350803265736, "grad_norm": 1.8406317234039307, 
"learning_rate": 3.578746378720042e-05, "loss": 1.9922, "step": 10800 }, { "epoch": 0.28446141690808535, "grad_norm": 3.196396589279175, "learning_rate": 3.578614695812484e-05, "loss": 2.1663, "step": 10801 }, { "epoch": 0.28448775348959704, "grad_norm": 2.3981986045837402, "learning_rate": 3.578483012904925e-05, "loss": 1.365, "step": 10802 }, { "epoch": 0.2845140900711088, "grad_norm": 1.6877681016921997, "learning_rate": 3.5783513299973665e-05, "loss": 1.7587, "step": 10803 }, { "epoch": 0.28454042665262047, "grad_norm": 1.5932350158691406, "learning_rate": 3.578219647089808e-05, "loss": 1.6025, "step": 10804 }, { "epoch": 0.2845667632341322, "grad_norm": 2.3335299491882324, "learning_rate": 3.5780879641822496e-05, "loss": 1.7822, "step": 10805 }, { "epoch": 0.2845930998156439, "grad_norm": 1.9196819067001343, "learning_rate": 3.577956281274691e-05, "loss": 1.8837, "step": 10806 }, { "epoch": 0.28461943639715565, "grad_norm": 1.398080825805664, "learning_rate": 3.577824598367132e-05, "loss": 1.9654, "step": 10807 }, { "epoch": 0.2846457729786674, "grad_norm": 5.5888261795043945, "learning_rate": 3.5776929154595736e-05, "loss": 1.0024, "step": 10808 }, { "epoch": 0.2846721095601791, "grad_norm": 1.979221224784851, "learning_rate": 3.5775612325520145e-05, "loss": 1.6808, "step": 10809 }, { "epoch": 0.2846984461416908, "grad_norm": 3.1664185523986816, "learning_rate": 3.577429549644457e-05, "loss": 1.4544, "step": 10810 }, { "epoch": 0.2847247827232025, "grad_norm": 2.1926774978637695, "learning_rate": 3.5772978667368976e-05, "loss": 1.1609, "step": 10811 }, { "epoch": 0.28475111930471425, "grad_norm": 1.938710331916809, "learning_rate": 3.577166183829339e-05, "loss": 0.4026, "step": 10812 }, { "epoch": 0.284777455886226, "grad_norm": 1.8479552268981934, "learning_rate": 3.57703450092178e-05, "loss": 1.0639, "step": 10813 }, { "epoch": 0.2848037924677377, "grad_norm": 4.910101413726807, "learning_rate": 3.5769028180142216e-05, "loss": 1.3656, "step": 10814 }, { 
"epoch": 0.2848301290492494, "grad_norm": 1.845170021057129, "learning_rate": 3.576771135106663e-05, "loss": 1.8049, "step": 10815 }, { "epoch": 0.2848564656307611, "grad_norm": 2.0903091430664062, "learning_rate": 3.576639452199105e-05, "loss": 2.0961, "step": 10816 }, { "epoch": 0.28488280221227286, "grad_norm": 4.3012003898620605, "learning_rate": 3.576507769291546e-05, "loss": 1.1164, "step": 10817 }, { "epoch": 0.28490913879378454, "grad_norm": 1.6772291660308838, "learning_rate": 3.576376086383987e-05, "loss": 1.9686, "step": 10818 }, { "epoch": 0.2849354753752963, "grad_norm": 1.6850790977478027, "learning_rate": 3.5762444034764294e-05, "loss": 1.7021, "step": 10819 }, { "epoch": 0.28496181195680803, "grad_norm": 1.6811139583587646, "learning_rate": 3.57611272056887e-05, "loss": 2.0023, "step": 10820 }, { "epoch": 0.2849881485383197, "grad_norm": 2.713277816772461, "learning_rate": 3.575981037661312e-05, "loss": 1.8991, "step": 10821 }, { "epoch": 0.28501448511983146, "grad_norm": 2.5310380458831787, "learning_rate": 3.575849354753753e-05, "loss": 1.5155, "step": 10822 }, { "epoch": 0.28504082170134315, "grad_norm": 4.494997024536133, "learning_rate": 3.575717671846194e-05, "loss": 1.2162, "step": 10823 }, { "epoch": 0.2850671582828549, "grad_norm": 2.0674829483032227, "learning_rate": 3.575585988938636e-05, "loss": 2.3371, "step": 10824 }, { "epoch": 0.2850934948643666, "grad_norm": 3.2719314098358154, "learning_rate": 3.5754543060310774e-05, "loss": 0.9209, "step": 10825 }, { "epoch": 0.2851198314458783, "grad_norm": 1.9471609592437744, "learning_rate": 3.575322623123519e-05, "loss": 2.8147, "step": 10826 }, { "epoch": 0.28514616802739007, "grad_norm": 2.836864709854126, "learning_rate": 3.57519094021596e-05, "loss": 1.7648, "step": 10827 }, { "epoch": 0.28517250460890176, "grad_norm": 2.6008498668670654, "learning_rate": 3.5750592573084014e-05, "loss": 1.9024, "step": 10828 }, { "epoch": 0.2851988411904135, "grad_norm": 2.5339577198028564, 
"learning_rate": 3.574927574400843e-05, "loss": 1.5394, "step": 10829 }, { "epoch": 0.2852251777719252, "grad_norm": 1.392446517944336, "learning_rate": 3.5747958914932845e-05, "loss": 1.4745, "step": 10830 }, { "epoch": 0.28525151435343693, "grad_norm": 2.5190680027008057, "learning_rate": 3.5746642085857254e-05, "loss": 1.4114, "step": 10831 }, { "epoch": 0.2852778509349486, "grad_norm": 3.5672900676727295, "learning_rate": 3.574532525678167e-05, "loss": 2.1517, "step": 10832 }, { "epoch": 0.28530418751646036, "grad_norm": 1.529719352722168, "learning_rate": 3.5744008427706085e-05, "loss": 1.8108, "step": 10833 }, { "epoch": 0.2853305240979721, "grad_norm": 3.915332555770874, "learning_rate": 3.57426915986305e-05, "loss": 1.2517, "step": 10834 }, { "epoch": 0.2853568606794838, "grad_norm": 2.5521485805511475, "learning_rate": 3.574137476955492e-05, "loss": 1.9467, "step": 10835 }, { "epoch": 0.28538319726099554, "grad_norm": 1.6555213928222656, "learning_rate": 3.5740057940479325e-05, "loss": 2.1273, "step": 10836 }, { "epoch": 0.2854095338425072, "grad_norm": 2.305870294570923, "learning_rate": 3.573874111140374e-05, "loss": 1.8347, "step": 10837 }, { "epoch": 0.28543587042401897, "grad_norm": 2.5898029804229736, "learning_rate": 3.573742428232816e-05, "loss": 1.5236, "step": 10838 }, { "epoch": 0.28546220700553065, "grad_norm": 1.9200682640075684, "learning_rate": 3.573610745325257e-05, "loss": 2.0033, "step": 10839 }, { "epoch": 0.2854885435870424, "grad_norm": 3.0526697635650635, "learning_rate": 3.573479062417698e-05, "loss": 2.4709, "step": 10840 }, { "epoch": 0.28551488016855414, "grad_norm": 1.7224936485290527, "learning_rate": 3.57334737951014e-05, "loss": 1.8446, "step": 10841 }, { "epoch": 0.28554121675006583, "grad_norm": 1.5152957439422607, "learning_rate": 3.573215696602581e-05, "loss": 1.792, "step": 10842 }, { "epoch": 0.2855675533315776, "grad_norm": 4.488541603088379, "learning_rate": 3.573084013695023e-05, "loss": 1.5066, "step": 10843 }, { 
"epoch": 0.28559388991308926, "grad_norm": 2.974428415298462, "learning_rate": 3.5729523307874643e-05, "loss": 1.7532, "step": 10844 }, { "epoch": 0.285620226494601, "grad_norm": 2.302385091781616, "learning_rate": 3.572820647879905e-05, "loss": 1.5413, "step": 10845 }, { "epoch": 0.28564656307611275, "grad_norm": 1.6653081178665161, "learning_rate": 3.572688964972347e-05, "loss": 1.825, "step": 10846 }, { "epoch": 0.28567289965762444, "grad_norm": 1.374388575553894, "learning_rate": 3.572557282064788e-05, "loss": 0.7493, "step": 10847 }, { "epoch": 0.2856992362391362, "grad_norm": 2.031874418258667, "learning_rate": 3.57242559915723e-05, "loss": 1.3097, "step": 10848 }, { "epoch": 0.28572557282064787, "grad_norm": 4.8944621086120605, "learning_rate": 3.572293916249671e-05, "loss": 2.3464, "step": 10849 }, { "epoch": 0.2857519094021596, "grad_norm": 2.4388692378997803, "learning_rate": 3.5721622333421124e-05, "loss": 1.7899, "step": 10850 }, { "epoch": 0.2857782459836713, "grad_norm": 3.541611909866333, "learning_rate": 3.572030550434554e-05, "loss": 0.405, "step": 10851 }, { "epoch": 0.28580458256518304, "grad_norm": 1.9741238355636597, "learning_rate": 3.5718988675269955e-05, "loss": 1.5981, "step": 10852 }, { "epoch": 0.2858309191466948, "grad_norm": 1.4928537607192993, "learning_rate": 3.571767184619437e-05, "loss": 0.6627, "step": 10853 }, { "epoch": 0.28585725572820647, "grad_norm": 2.9098763465881348, "learning_rate": 3.571635501711878e-05, "loss": 1.8585, "step": 10854 }, { "epoch": 0.2858835923097182, "grad_norm": 1.4445680379867554, "learning_rate": 3.5715038188043195e-05, "loss": 1.4236, "step": 10855 }, { "epoch": 0.2859099288912299, "grad_norm": 1.6740747690200806, "learning_rate": 3.5713721358967604e-05, "loss": 2.0825, "step": 10856 }, { "epoch": 0.28593626547274165, "grad_norm": 2.372706174850464, "learning_rate": 3.5712404529892026e-05, "loss": 1.7807, "step": 10857 }, { "epoch": 0.28596260205425333, "grad_norm": 1.9517550468444824, 
"learning_rate": 3.5711087700816435e-05, "loss": 0.7374, "step": 10858 }, { "epoch": 0.2859889386357651, "grad_norm": 1.688175916671753, "learning_rate": 3.570977087174085e-05, "loss": 2.0738, "step": 10859 }, { "epoch": 0.2860152752172768, "grad_norm": 4.115051746368408, "learning_rate": 3.570845404266526e-05, "loss": 0.9316, "step": 10860 }, { "epoch": 0.2860416117987885, "grad_norm": 2.710395574569702, "learning_rate": 3.5707137213589675e-05, "loss": 1.8353, "step": 10861 }, { "epoch": 0.28606794838030025, "grad_norm": 2.231415033340454, "learning_rate": 3.570582038451409e-05, "loss": 2.2586, "step": 10862 }, { "epoch": 0.28609428496181194, "grad_norm": 2.0620667934417725, "learning_rate": 3.5704503555438506e-05, "loss": 1.6772, "step": 10863 }, { "epoch": 0.2861206215433237, "grad_norm": 2.5980162620544434, "learning_rate": 3.570318672636292e-05, "loss": 1.7098, "step": 10864 }, { "epoch": 0.28614695812483537, "grad_norm": 1.8276180028915405, "learning_rate": 3.570186989728733e-05, "loss": 1.6537, "step": 10865 }, { "epoch": 0.2861732947063471, "grad_norm": 1.5904067754745483, "learning_rate": 3.5700553068211746e-05, "loss": 2.1786, "step": 10866 }, { "epoch": 0.28619963128785886, "grad_norm": 2.0989725589752197, "learning_rate": 3.569923623913616e-05, "loss": 1.2591, "step": 10867 }, { "epoch": 0.28622596786937055, "grad_norm": 2.1903600692749023, "learning_rate": 3.569791941006058e-05, "loss": 0.3586, "step": 10868 }, { "epoch": 0.2862523044508823, "grad_norm": 2.014195680618286, "learning_rate": 3.5696602580984986e-05, "loss": 1.6001, "step": 10869 }, { "epoch": 0.286278641032394, "grad_norm": 2.124044418334961, "learning_rate": 3.56952857519094e-05, "loss": 2.0115, "step": 10870 }, { "epoch": 0.2863049776139057, "grad_norm": 2.830047369003296, "learning_rate": 3.569396892283382e-05, "loss": 1.9418, "step": 10871 }, { "epoch": 0.2863313141954174, "grad_norm": 2.600907564163208, "learning_rate": 3.569265209375823e-05, "loss": 1.5323, "step": 10872 }, { 
"epoch": 0.28635765077692915, "grad_norm": 3.5687639713287354, "learning_rate": 3.569133526468265e-05, "loss": 1.0994, "step": 10873 }, { "epoch": 0.2863839873584409, "grad_norm": 1.9190971851348877, "learning_rate": 3.569001843560706e-05, "loss": 2.4625, "step": 10874 }, { "epoch": 0.2864103239399526, "grad_norm": 1.8622959852218628, "learning_rate": 3.568870160653147e-05, "loss": 2.3653, "step": 10875 }, { "epoch": 0.2864366605214643, "grad_norm": 1.8052875995635986, "learning_rate": 3.568738477745589e-05, "loss": 1.2829, "step": 10876 }, { "epoch": 0.286462997102976, "grad_norm": 1.6404985189437866, "learning_rate": 3.5686067948380304e-05, "loss": 1.8094, "step": 10877 }, { "epoch": 0.28648933368448776, "grad_norm": 4.276154518127441, "learning_rate": 3.568475111930471e-05, "loss": 1.5444, "step": 10878 }, { "epoch": 0.2865156702659995, "grad_norm": 2.249417543411255, "learning_rate": 3.568343429022913e-05, "loss": 1.2385, "step": 10879 }, { "epoch": 0.2865420068475112, "grad_norm": 2.065884590148926, "learning_rate": 3.5682117461153544e-05, "loss": 0.924, "step": 10880 }, { "epoch": 0.28656834342902293, "grad_norm": 3.236016273498535, "learning_rate": 3.568080063207796e-05, "loss": 2.198, "step": 10881 }, { "epoch": 0.2865946800105346, "grad_norm": 5.809149742126465, "learning_rate": 3.5679483803002375e-05, "loss": 1.4607, "step": 10882 }, { "epoch": 0.28662101659204636, "grad_norm": 4.300374984741211, "learning_rate": 3.5678166973926784e-05, "loss": 0.4194, "step": 10883 }, { "epoch": 0.28664735317355805, "grad_norm": 1.977638602256775, "learning_rate": 3.56768501448512e-05, "loss": 2.344, "step": 10884 }, { "epoch": 0.2866736897550698, "grad_norm": 4.997640609741211, "learning_rate": 3.5675533315775615e-05, "loss": 1.664, "step": 10885 }, { "epoch": 0.28670002633658154, "grad_norm": 3.3185009956359863, "learning_rate": 3.567421648670003e-05, "loss": 2.1228, "step": 10886 }, { "epoch": 0.2867263629180932, "grad_norm": 1.818388819694519, "learning_rate": 
3.567289965762444e-05, "loss": 2.1683, "step": 10887 }, { "epoch": 0.28675269949960497, "grad_norm": 1.9575581550598145, "learning_rate": 3.5671582828548855e-05, "loss": 1.1518, "step": 10888 }, { "epoch": 0.28677903608111666, "grad_norm": 3.258960008621216, "learning_rate": 3.567026599947327e-05, "loss": 0.9538, "step": 10889 }, { "epoch": 0.2868053726626284, "grad_norm": 2.2418830394744873, "learning_rate": 3.5668949170397686e-05, "loss": 1.5618, "step": 10890 }, { "epoch": 0.2868317092441401, "grad_norm": 2.235569715499878, "learning_rate": 3.56676323413221e-05, "loss": 1.2931, "step": 10891 }, { "epoch": 0.28685804582565183, "grad_norm": 6.3410749435424805, "learning_rate": 3.566631551224651e-05, "loss": 1.3158, "step": 10892 }, { "epoch": 0.2868843824071636, "grad_norm": 3.698495864868164, "learning_rate": 3.5664998683170926e-05, "loss": 1.4903, "step": 10893 }, { "epoch": 0.28691071898867526, "grad_norm": 1.6409622430801392, "learning_rate": 3.5663681854095335e-05, "loss": 1.9041, "step": 10894 }, { "epoch": 0.286937055570187, "grad_norm": 2.1133944988250732, "learning_rate": 3.566236502501976e-05, "loss": 1.7888, "step": 10895 }, { "epoch": 0.2869633921516987, "grad_norm": 1.7504150867462158, "learning_rate": 3.5661048195944166e-05, "loss": 1.6241, "step": 10896 }, { "epoch": 0.28698972873321044, "grad_norm": 3.844424247741699, "learning_rate": 3.565973136686858e-05, "loss": 1.9478, "step": 10897 }, { "epoch": 0.2870160653147221, "grad_norm": 2.768083333969116, "learning_rate": 3.5658414537793e-05, "loss": 1.5144, "step": 10898 }, { "epoch": 0.28704240189623387, "grad_norm": 1.9580880403518677, "learning_rate": 3.565709770871741e-05, "loss": 1.4769, "step": 10899 }, { "epoch": 0.2870687384777456, "grad_norm": 2.030640125274658, "learning_rate": 3.565578087964183e-05, "loss": 0.4653, "step": 10900 }, { "epoch": 0.2870950750592573, "grad_norm": 4.788317680358887, "learning_rate": 3.565446405056624e-05, "loss": 1.5075, "step": 10901 }, { "epoch": 
0.28712141164076904, "grad_norm": 4.070150375366211, "learning_rate": 3.565314722149065e-05, "loss": 1.6207, "step": 10902 }, { "epoch": 0.28714774822228073, "grad_norm": 2.3336422443389893, "learning_rate": 3.565183039241506e-05, "loss": 0.6337, "step": 10903 }, { "epoch": 0.2871740848037925, "grad_norm": 2.0194108486175537, "learning_rate": 3.5650513563339484e-05, "loss": 2.1127, "step": 10904 }, { "epoch": 0.28720042138530416, "grad_norm": 2.1101112365722656, "learning_rate": 3.564919673426389e-05, "loss": 2.5143, "step": 10905 }, { "epoch": 0.2872267579668159, "grad_norm": 1.6519547700881958, "learning_rate": 3.564787990518831e-05, "loss": 0.6376, "step": 10906 }, { "epoch": 0.28725309454832765, "grad_norm": 2.7773334980010986, "learning_rate": 3.5646563076112725e-05, "loss": 1.6727, "step": 10907 }, { "epoch": 0.28727943112983934, "grad_norm": 4.362887859344482, "learning_rate": 3.564524624703713e-05, "loss": 1.2566, "step": 10908 }, { "epoch": 0.2873057677113511, "grad_norm": 1.9870001077651978, "learning_rate": 3.5643929417961556e-05, "loss": 2.2809, "step": 10909 }, { "epoch": 0.28733210429286277, "grad_norm": 2.3629238605499268, "learning_rate": 3.5642612588885965e-05, "loss": 1.1103, "step": 10910 }, { "epoch": 0.2873584408743745, "grad_norm": 1.5843186378479004, "learning_rate": 3.564129575981038e-05, "loss": 1.4126, "step": 10911 }, { "epoch": 0.28738477745588625, "grad_norm": 2.0120108127593994, "learning_rate": 3.563997893073479e-05, "loss": 1.0796, "step": 10912 }, { "epoch": 0.28741111403739794, "grad_norm": 2.035494089126587, "learning_rate": 3.5638662101659205e-05, "loss": 1.4534, "step": 10913 }, { "epoch": 0.2874374506189097, "grad_norm": 1.856073021888733, "learning_rate": 3.563734527258362e-05, "loss": 1.6312, "step": 10914 }, { "epoch": 0.2874637872004214, "grad_norm": 3.239377737045288, "learning_rate": 3.5636028443508036e-05, "loss": 1.181, "step": 10915 }, { "epoch": 0.2874901237819331, "grad_norm": 1.8262280225753784, "learning_rate": 
3.5634711614432445e-05, "loss": 1.5209, "step": 10916 }, { "epoch": 0.2875164603634448, "grad_norm": 1.726887822151184, "learning_rate": 3.563339478535686e-05, "loss": 2.0076, "step": 10917 }, { "epoch": 0.28754279694495655, "grad_norm": 2.1747591495513916, "learning_rate": 3.5632077956281276e-05, "loss": 2.0095, "step": 10918 }, { "epoch": 0.2875691335264683, "grad_norm": 2.274733543395996, "learning_rate": 3.563076112720569e-05, "loss": 1.4245, "step": 10919 }, { "epoch": 0.28759547010798, "grad_norm": 2.9830405712127686, "learning_rate": 3.562944429813011e-05, "loss": 1.0625, "step": 10920 }, { "epoch": 0.2876218066894917, "grad_norm": 3.5884392261505127, "learning_rate": 3.5628127469054516e-05, "loss": 1.4427, "step": 10921 }, { "epoch": 0.2876481432710034, "grad_norm": 3.065458297729492, "learning_rate": 3.562681063997893e-05, "loss": 2.4932, "step": 10922 }, { "epoch": 0.28767447985251515, "grad_norm": 4.81726598739624, "learning_rate": 3.562549381090335e-05, "loss": 1.1851, "step": 10923 }, { "epoch": 0.28770081643402684, "grad_norm": 3.7395832538604736, "learning_rate": 3.562417698182776e-05, "loss": 0.9141, "step": 10924 }, { "epoch": 0.2877271530155386, "grad_norm": 1.4980483055114746, "learning_rate": 3.562286015275217e-05, "loss": 1.8753, "step": 10925 }, { "epoch": 0.2877534895970503, "grad_norm": 3.8506574630737305, "learning_rate": 3.562154332367659e-05, "loss": 0.9344, "step": 10926 }, { "epoch": 0.287779826178562, "grad_norm": 2.8683130741119385, "learning_rate": 3.5620226494601e-05, "loss": 1.1648, "step": 10927 }, { "epoch": 0.28780616276007376, "grad_norm": 2.4282519817352295, "learning_rate": 3.561890966552542e-05, "loss": 1.1954, "step": 10928 }, { "epoch": 0.28783249934158545, "grad_norm": 3.0133583545684814, "learning_rate": 3.5617592836449834e-05, "loss": 1.7148, "step": 10929 }, { "epoch": 0.2878588359230972, "grad_norm": 4.448825359344482, "learning_rate": 3.561627600737424e-05, "loss": 0.9899, "step": 10930 }, { "epoch": 
0.2878851725046089, "grad_norm": 4.573001861572266, "learning_rate": 3.561495917829866e-05, "loss": 1.8964, "step": 10931 }, { "epoch": 0.2879115090861206, "grad_norm": 1.802765130996704, "learning_rate": 3.5613642349223074e-05, "loss": 1.4969, "step": 10932 }, { "epoch": 0.28793784566763236, "grad_norm": 2.5864810943603516, "learning_rate": 3.561232552014749e-05, "loss": 0.7321, "step": 10933 }, { "epoch": 0.28796418224914405, "grad_norm": 3.4646875858306885, "learning_rate": 3.56110086910719e-05, "loss": 1.1532, "step": 10934 }, { "epoch": 0.2879905188306558, "grad_norm": 1.8657139539718628, "learning_rate": 3.5609691861996314e-05, "loss": 2.0382, "step": 10935 }, { "epoch": 0.2880168554121675, "grad_norm": 2.163668155670166, "learning_rate": 3.560837503292073e-05, "loss": 1.452, "step": 10936 }, { "epoch": 0.2880431919936792, "grad_norm": 1.600176215171814, "learning_rate": 3.5607058203845145e-05, "loss": 1.9442, "step": 10937 }, { "epoch": 0.2880695285751909, "grad_norm": 3.5224242210388184, "learning_rate": 3.560574137476956e-05, "loss": 1.7202, "step": 10938 }, { "epoch": 0.28809586515670266, "grad_norm": 3.597471237182617, "learning_rate": 3.560442454569397e-05, "loss": 1.6232, "step": 10939 }, { "epoch": 0.2881222017382144, "grad_norm": 3.762521266937256, "learning_rate": 3.5603107716618385e-05, "loss": 1.1984, "step": 10940 }, { "epoch": 0.2881485383197261, "grad_norm": 4.7362589836120605, "learning_rate": 3.5601790887542794e-05, "loss": 0.9751, "step": 10941 }, { "epoch": 0.28817487490123783, "grad_norm": 2.2351064682006836, "learning_rate": 3.5600474058467216e-05, "loss": 2.4552, "step": 10942 }, { "epoch": 0.2882012114827495, "grad_norm": 3.0252811908721924, "learning_rate": 3.5599157229391625e-05, "loss": 1.0504, "step": 10943 }, { "epoch": 0.28822754806426126, "grad_norm": 2.5757875442504883, "learning_rate": 3.559784040031604e-05, "loss": 1.4499, "step": 10944 }, { "epoch": 0.288253884645773, "grad_norm": 2.449920892715454, "learning_rate": 
3.5596523571240456e-05, "loss": 1.7631, "step": 10945 }, { "epoch": 0.2882802212272847, "grad_norm": 2.6157121658325195, "learning_rate": 3.5595206742164865e-05, "loss": 0.9906, "step": 10946 }, { "epoch": 0.28830655780879644, "grad_norm": 2.005641460418701, "learning_rate": 3.559388991308929e-05, "loss": 1.501, "step": 10947 }, { "epoch": 0.2883328943903081, "grad_norm": 2.2654950618743896, "learning_rate": 3.5592573084013696e-05, "loss": 1.2623, "step": 10948 }, { "epoch": 0.28835923097181987, "grad_norm": 3.0298023223876953, "learning_rate": 3.559125625493811e-05, "loss": 0.9493, "step": 10949 }, { "epoch": 0.28838556755333156, "grad_norm": 2.7169744968414307, "learning_rate": 3.558993942586252e-05, "loss": 1.6912, "step": 10950 }, { "epoch": 0.2884119041348433, "grad_norm": 5.633264064788818, "learning_rate": 3.558862259678694e-05, "loss": 1.1934, "step": 10951 }, { "epoch": 0.28843824071635504, "grad_norm": 1.7106952667236328, "learning_rate": 3.558730576771135e-05, "loss": 1.1364, "step": 10952 }, { "epoch": 0.28846457729786673, "grad_norm": 1.6821469068527222, "learning_rate": 3.558598893863577e-05, "loss": 2.2684, "step": 10953 }, { "epoch": 0.2884909138793785, "grad_norm": 5.259078502655029, "learning_rate": 3.558467210956018e-05, "loss": 1.0155, "step": 10954 }, { "epoch": 0.28851725046089016, "grad_norm": 1.5545750856399536, "learning_rate": 3.558335528048459e-05, "loss": 1.2766, "step": 10955 }, { "epoch": 0.2885435870424019, "grad_norm": 1.6793551445007324, "learning_rate": 3.5582038451409014e-05, "loss": 2.3053, "step": 10956 }, { "epoch": 0.2885699236239136, "grad_norm": 2.053560256958008, "learning_rate": 3.558072162233342e-05, "loss": 1.2424, "step": 10957 }, { "epoch": 0.28859626020542534, "grad_norm": 1.6040992736816406, "learning_rate": 3.557940479325784e-05, "loss": 1.8083, "step": 10958 }, { "epoch": 0.2886225967869371, "grad_norm": 2.1622772216796875, "learning_rate": 3.557808796418225e-05, "loss": 2.4501, "step": 10959 }, { "epoch": 
0.28864893336844877, "grad_norm": 2.141171932220459, "learning_rate": 3.557677113510666e-05, "loss": 1.5421, "step": 10960 }, { "epoch": 0.2886752699499605, "grad_norm": 3.0063464641571045, "learning_rate": 3.557545430603108e-05, "loss": 1.9009, "step": 10961 }, { "epoch": 0.2887016065314722, "grad_norm": 2.160053253173828, "learning_rate": 3.5574137476955494e-05, "loss": 0.685, "step": 10962 }, { "epoch": 0.28872794311298394, "grad_norm": 1.8533016443252563, "learning_rate": 3.55728206478799e-05, "loss": 1.9313, "step": 10963 }, { "epoch": 0.28875427969449563, "grad_norm": 2.5866363048553467, "learning_rate": 3.557150381880432e-05, "loss": 1.9037, "step": 10964 }, { "epoch": 0.2887806162760074, "grad_norm": 1.7641345262527466, "learning_rate": 3.5570186989728734e-05, "loss": 0.2565, "step": 10965 }, { "epoch": 0.2888069528575191, "grad_norm": 3.6756820678710938, "learning_rate": 3.556887016065315e-05, "loss": 1.713, "step": 10966 }, { "epoch": 0.2888332894390308, "grad_norm": 2.8689334392547607, "learning_rate": 3.5567553331577565e-05, "loss": 0.7, "step": 10967 }, { "epoch": 0.28885962602054255, "grad_norm": 2.9871444702148438, "learning_rate": 3.5566236502501974e-05, "loss": 1.5818, "step": 10968 }, { "epoch": 0.28888596260205424, "grad_norm": 1.6511298418045044, "learning_rate": 3.556491967342639e-05, "loss": 2.0819, "step": 10969 }, { "epoch": 0.288912299183566, "grad_norm": 2.019268274307251, "learning_rate": 3.5563602844350806e-05, "loss": 1.8302, "step": 10970 }, { "epoch": 0.28893863576507767, "grad_norm": 2.2264962196350098, "learning_rate": 3.556228601527522e-05, "loss": 1.5024, "step": 10971 }, { "epoch": 0.2889649723465894, "grad_norm": 1.9779112339019775, "learning_rate": 3.556096918619963e-05, "loss": 2.0727, "step": 10972 }, { "epoch": 0.28899130892810115, "grad_norm": 2.1315879821777344, "learning_rate": 3.5559652357124046e-05, "loss": 2.098, "step": 10973 }, { "epoch": 0.28901764550961284, "grad_norm": 3.3385040760040283, "learning_rate": 
3.555833552804846e-05, "loss": 1.0683, "step": 10974 }, { "epoch": 0.2890439820911246, "grad_norm": 2.2172985076904297, "learning_rate": 3.555701869897288e-05, "loss": 1.993, "step": 10975 }, { "epoch": 0.2890703186726363, "grad_norm": 2.3556089401245117, "learning_rate": 3.555570186989729e-05, "loss": 2.4113, "step": 10976 }, { "epoch": 0.289096655254148, "grad_norm": 1.7501848936080933, "learning_rate": 3.55543850408217e-05, "loss": 1.1491, "step": 10977 }, { "epoch": 0.2891229918356597, "grad_norm": 2.6829991340637207, "learning_rate": 3.555306821174612e-05, "loss": 1.0855, "step": 10978 }, { "epoch": 0.28914932841717145, "grad_norm": 2.8630950450897217, "learning_rate": 3.5551751382670526e-05, "loss": 2.0312, "step": 10979 }, { "epoch": 0.2891756649986832, "grad_norm": 2.9446446895599365, "learning_rate": 3.555043455359495e-05, "loss": 1.4823, "step": 10980 }, { "epoch": 0.2892020015801949, "grad_norm": 2.8820769786834717, "learning_rate": 3.554911772451936e-05, "loss": 0.581, "step": 10981 }, { "epoch": 0.2892283381617066, "grad_norm": 2.2731316089630127, "learning_rate": 3.554780089544377e-05, "loss": 1.4354, "step": 10982 }, { "epoch": 0.2892546747432183, "grad_norm": 2.1192312240600586, "learning_rate": 3.554648406636819e-05, "loss": 1.2348, "step": 10983 }, { "epoch": 0.28928101132473005, "grad_norm": 2.931610345840454, "learning_rate": 3.5545167237292604e-05, "loss": 1.7275, "step": 10984 }, { "epoch": 0.2893073479062418, "grad_norm": 1.4986823797225952, "learning_rate": 3.554385040821702e-05, "loss": 1.4245, "step": 10985 }, { "epoch": 0.2893336844877535, "grad_norm": 3.0312817096710205, "learning_rate": 3.554253357914143e-05, "loss": 1.4358, "step": 10986 }, { "epoch": 0.2893600210692652, "grad_norm": 1.7814475297927856, "learning_rate": 3.5541216750065844e-05, "loss": 1.8522, "step": 10987 }, { "epoch": 0.2893863576507769, "grad_norm": 1.8796284198760986, "learning_rate": 3.553989992099025e-05, "loss": 0.4629, "step": 10988 }, { "epoch": 
0.28941269423228866, "grad_norm": 5.068589687347412, "learning_rate": 3.5538583091914675e-05, "loss": 0.8754, "step": 10989 }, { "epoch": 0.28943903081380035, "grad_norm": 5.044485569000244, "learning_rate": 3.5537266262839084e-05, "loss": 1.4031, "step": 10990 }, { "epoch": 0.2894653673953121, "grad_norm": 3.1948459148406982, "learning_rate": 3.55359494337635e-05, "loss": 1.2049, "step": 10991 }, { "epoch": 0.28949170397682383, "grad_norm": 5.329652786254883, "learning_rate": 3.5534632604687915e-05, "loss": 2.9056, "step": 10992 }, { "epoch": 0.2895180405583355, "grad_norm": 1.6377066373825073, "learning_rate": 3.5533315775612324e-05, "loss": 1.7653, "step": 10993 }, { "epoch": 0.28954437713984726, "grad_norm": 3.317782163619995, "learning_rate": 3.5531998946536746e-05, "loss": 0.3472, "step": 10994 }, { "epoch": 0.28957071372135895, "grad_norm": 1.922645092010498, "learning_rate": 3.5530682117461155e-05, "loss": 1.3858, "step": 10995 }, { "epoch": 0.2895970503028707, "grad_norm": 4.561474800109863, "learning_rate": 3.552936528838557e-05, "loss": 0.8801, "step": 10996 }, { "epoch": 0.2896233868843824, "grad_norm": 2.3178837299346924, "learning_rate": 3.552804845930998e-05, "loss": 2.066, "step": 10997 }, { "epoch": 0.2896497234658941, "grad_norm": 2.91257905960083, "learning_rate": 3.55267316302344e-05, "loss": 1.4585, "step": 10998 }, { "epoch": 0.28967606004740587, "grad_norm": 2.5721254348754883, "learning_rate": 3.552541480115881e-05, "loss": 2.8255, "step": 10999 }, { "epoch": 0.28970239662891756, "grad_norm": 4.280350685119629, "learning_rate": 3.5524097972083226e-05, "loss": 1.5415, "step": 11000 }, { "epoch": 0.2897287332104293, "grad_norm": 2.1255195140838623, "learning_rate": 3.552278114300764e-05, "loss": 1.6239, "step": 11001 }, { "epoch": 0.289755069791941, "grad_norm": 1.4986419677734375, "learning_rate": 3.552146431393205e-05, "loss": 1.1223, "step": 11002 }, { "epoch": 0.28978140637345273, "grad_norm": 1.7992783784866333, "learning_rate": 
3.552014748485647e-05, "loss": 2.0119, "step": 11003 }, { "epoch": 0.2898077429549644, "grad_norm": 2.8025548458099365, "learning_rate": 3.551883065578088e-05, "loss": 1.935, "step": 11004 }, { "epoch": 0.28983407953647616, "grad_norm": 4.0590410232543945, "learning_rate": 3.55175138267053e-05, "loss": 1.0777, "step": 11005 }, { "epoch": 0.2898604161179879, "grad_norm": 3.287367105484009, "learning_rate": 3.5516196997629706e-05, "loss": 1.6116, "step": 11006 }, { "epoch": 0.2898867526994996, "grad_norm": 2.9920408725738525, "learning_rate": 3.551488016855412e-05, "loss": 0.858, "step": 11007 }, { "epoch": 0.28991308928101134, "grad_norm": 1.7735719680786133, "learning_rate": 3.551356333947854e-05, "loss": 1.7671, "step": 11008 }, { "epoch": 0.289939425862523, "grad_norm": 1.7227829694747925, "learning_rate": 3.551224651040295e-05, "loss": 1.6007, "step": 11009 }, { "epoch": 0.28996576244403477, "grad_norm": 1.7703499794006348, "learning_rate": 3.551092968132737e-05, "loss": 1.7369, "step": 11010 }, { "epoch": 0.28999209902554646, "grad_norm": 2.5169200897216797, "learning_rate": 3.550961285225178e-05, "loss": 1.6055, "step": 11011 }, { "epoch": 0.2900184356070582, "grad_norm": 2.7595176696777344, "learning_rate": 3.550829602317619e-05, "loss": 1.7946, "step": 11012 }, { "epoch": 0.29004477218856994, "grad_norm": 2.930305004119873, "learning_rate": 3.550697919410061e-05, "loss": 2.2925, "step": 11013 }, { "epoch": 0.29007110877008163, "grad_norm": 1.7983318567276, "learning_rate": 3.5505662365025024e-05, "loss": 0.4155, "step": 11014 }, { "epoch": 0.2900974453515934, "grad_norm": 3.9221348762512207, "learning_rate": 3.550434553594943e-05, "loss": 1.0817, "step": 11015 }, { "epoch": 0.29012378193310506, "grad_norm": 2.2868871688842773, "learning_rate": 3.550302870687385e-05, "loss": 2.1247, "step": 11016 }, { "epoch": 0.2901501185146168, "grad_norm": 7.007227897644043, "learning_rate": 3.5501711877798264e-05, "loss": 1.0384, "step": 11017 }, { "epoch": 
0.29017645509612855, "grad_norm": 2.3141703605651855, "learning_rate": 3.550039504872268e-05, "loss": 1.6293, "step": 11018 }, { "epoch": 0.29020279167764024, "grad_norm": 3.3522896766662598, "learning_rate": 3.549907821964709e-05, "loss": 1.8453, "step": 11019 }, { "epoch": 0.290229128259152, "grad_norm": 3.965550422668457, "learning_rate": 3.5497761390571504e-05, "loss": 1.1338, "step": 11020 }, { "epoch": 0.29025546484066367, "grad_norm": 6.852443218231201, "learning_rate": 3.549644456149592e-05, "loss": 0.908, "step": 11021 }, { "epoch": 0.2902818014221754, "grad_norm": 3.539928436279297, "learning_rate": 3.5495127732420335e-05, "loss": 1.3154, "step": 11022 }, { "epoch": 0.2903081380036871, "grad_norm": 3.4951250553131104, "learning_rate": 3.549381090334475e-05, "loss": 1.2941, "step": 11023 }, { "epoch": 0.29033447458519884, "grad_norm": 2.771512269973755, "learning_rate": 3.549249407426916e-05, "loss": 0.613, "step": 11024 }, { "epoch": 0.2903608111667106, "grad_norm": 2.6112723350524902, "learning_rate": 3.5491177245193575e-05, "loss": 2.3057, "step": 11025 }, { "epoch": 0.2903871477482223, "grad_norm": 2.004936695098877, "learning_rate": 3.5489860416117984e-05, "loss": 1.7175, "step": 11026 }, { "epoch": 0.290413484329734, "grad_norm": 1.655478835105896, "learning_rate": 3.5488543587042406e-05, "loss": 1.7248, "step": 11027 }, { "epoch": 0.2904398209112457, "grad_norm": 2.637791633605957, "learning_rate": 3.5487226757966815e-05, "loss": 1.5018, "step": 11028 }, { "epoch": 0.29046615749275745, "grad_norm": 1.5167251825332642, "learning_rate": 3.548590992889123e-05, "loss": 2.2552, "step": 11029 }, { "epoch": 0.29049249407426914, "grad_norm": 2.9432373046875, "learning_rate": 3.5484593099815647e-05, "loss": 1.998, "step": 11030 }, { "epoch": 0.2905188306557809, "grad_norm": 2.922551393508911, "learning_rate": 3.548327627074006e-05, "loss": 1.5541, "step": 11031 }, { "epoch": 0.2905451672372926, "grad_norm": 6.568065643310547, "learning_rate": 
3.548195944166448e-05, "loss": 1.1314, "step": 11032 }, { "epoch": 0.2905715038188043, "grad_norm": 2.2814013957977295, "learning_rate": 3.5480642612588887e-05, "loss": 2.2513, "step": 11033 }, { "epoch": 0.29059784040031605, "grad_norm": 2.166842222213745, "learning_rate": 3.54793257835133e-05, "loss": 1.9773, "step": 11034 }, { "epoch": 0.29062417698182774, "grad_norm": 1.8020366430282593, "learning_rate": 3.547800895443771e-05, "loss": 2.4285, "step": 11035 }, { "epoch": 0.2906505135633395, "grad_norm": 1.7066317796707153, "learning_rate": 3.547669212536213e-05, "loss": 2.1236, "step": 11036 }, { "epoch": 0.2906768501448512, "grad_norm": 2.9069323539733887, "learning_rate": 3.547537529628654e-05, "loss": 2.0676, "step": 11037 }, { "epoch": 0.2907031867263629, "grad_norm": 1.9107900857925415, "learning_rate": 3.547405846721096e-05, "loss": 1.7973, "step": 11038 }, { "epoch": 0.29072952330787466, "grad_norm": 2.988912343978882, "learning_rate": 3.547274163813537e-05, "loss": 0.4131, "step": 11039 }, { "epoch": 0.29075585988938635, "grad_norm": 1.850653052330017, "learning_rate": 3.547142480905978e-05, "loss": 1.9137, "step": 11040 }, { "epoch": 0.2907821964708981, "grad_norm": 1.9713221788406372, "learning_rate": 3.5470107979984205e-05, "loss": 2.2683, "step": 11041 }, { "epoch": 0.2908085330524098, "grad_norm": 2.0246012210845947, "learning_rate": 3.546879115090861e-05, "loss": 2.0447, "step": 11042 }, { "epoch": 0.2908348696339215, "grad_norm": 4.18183708190918, "learning_rate": 3.546747432183303e-05, "loss": 1.5515, "step": 11043 }, { "epoch": 0.2908612062154332, "grad_norm": 3.0166642665863037, "learning_rate": 3.546615749275744e-05, "loss": 1.4106, "step": 11044 }, { "epoch": 0.29088754279694495, "grad_norm": 2.317770481109619, "learning_rate": 3.546484066368185e-05, "loss": 0.3382, "step": 11045 }, { "epoch": 0.2909138793784567, "grad_norm": 2.7255659103393555, "learning_rate": 3.546352383460627e-05, "loss": 0.4388, "step": 11046 }, { "epoch": 
0.2909402159599684, "grad_norm": 1.7485480308532715, "learning_rate": 3.5462207005530685e-05, "loss": 1.8356, "step": 11047 }, { "epoch": 0.29096655254148013, "grad_norm": 3.4362640380859375, "learning_rate": 3.54608901764551e-05, "loss": 1.097, "step": 11048 }, { "epoch": 0.2909928891229918, "grad_norm": 2.0705902576446533, "learning_rate": 3.545957334737951e-05, "loss": 2.1396, "step": 11049 }, { "epoch": 0.29101922570450356, "grad_norm": 2.3216209411621094, "learning_rate": 3.545825651830393e-05, "loss": 1.3643, "step": 11050 }, { "epoch": 0.2910455622860153, "grad_norm": 2.6257174015045166, "learning_rate": 3.545693968922834e-05, "loss": 1.7138, "step": 11051 }, { "epoch": 0.291071898867527, "grad_norm": 2.5664923191070557, "learning_rate": 3.5455622860152756e-05, "loss": 1.2035, "step": 11052 }, { "epoch": 0.29109823544903873, "grad_norm": 2.6241204738616943, "learning_rate": 3.5454306031077165e-05, "loss": 2.0248, "step": 11053 }, { "epoch": 0.2911245720305504, "grad_norm": 2.7061100006103516, "learning_rate": 3.545298920200158e-05, "loss": 1.7668, "step": 11054 }, { "epoch": 0.29115090861206216, "grad_norm": 2.0117859840393066, "learning_rate": 3.5451672372925996e-05, "loss": 1.9369, "step": 11055 }, { "epoch": 0.29117724519357385, "grad_norm": 3.36904239654541, "learning_rate": 3.545035554385041e-05, "loss": 0.7704, "step": 11056 }, { "epoch": 0.2912035817750856, "grad_norm": 2.7240214347839355, "learning_rate": 3.544903871477483e-05, "loss": 1.8645, "step": 11057 }, { "epoch": 0.29122991835659734, "grad_norm": 3.428537607192993, "learning_rate": 3.5447721885699236e-05, "loss": 0.3247, "step": 11058 }, { "epoch": 0.291256254938109, "grad_norm": 2.109004497528076, "learning_rate": 3.544640505662365e-05, "loss": 2.179, "step": 11059 }, { "epoch": 0.29128259151962077, "grad_norm": 2.1439788341522217, "learning_rate": 3.544508822754807e-05, "loss": 1.7094, "step": 11060 }, { "epoch": 0.29130892810113246, "grad_norm": 3.4488956928253174, "learning_rate": 
3.544377139847248e-05, "loss": 2.4145, "step": 11061 }, { "epoch": 0.2913352646826442, "grad_norm": 5.650046348571777, "learning_rate": 3.544245456939689e-05, "loss": 1.8215, "step": 11062 }, { "epoch": 0.2913616012641559, "grad_norm": 1.7302626371383667, "learning_rate": 3.544113774032131e-05, "loss": 1.3273, "step": 11063 }, { "epoch": 0.29138793784566763, "grad_norm": 3.058371067047119, "learning_rate": 3.543982091124572e-05, "loss": 2.2555, "step": 11064 }, { "epoch": 0.2914142744271794, "grad_norm": 5.631766319274902, "learning_rate": 3.543850408217014e-05, "loss": 1.1217, "step": 11065 }, { "epoch": 0.29144061100869106, "grad_norm": 3.0049452781677246, "learning_rate": 3.543718725309455e-05, "loss": 1.5222, "step": 11066 }, { "epoch": 0.2914669475902028, "grad_norm": 2.9334356784820557, "learning_rate": 3.543587042401896e-05, "loss": 0.6939, "step": 11067 }, { "epoch": 0.2914932841717145, "grad_norm": 2.0152101516723633, "learning_rate": 3.543455359494338e-05, "loss": 2.1507, "step": 11068 }, { "epoch": 0.29151962075322624, "grad_norm": 2.8079895973205566, "learning_rate": 3.5433236765867794e-05, "loss": 1.7229, "step": 11069 }, { "epoch": 0.2915459573347379, "grad_norm": 5.658259391784668, "learning_rate": 3.543191993679221e-05, "loss": 1.9062, "step": 11070 }, { "epoch": 0.29157229391624967, "grad_norm": 3.0796990394592285, "learning_rate": 3.543060310771662e-05, "loss": 1.8644, "step": 11071 }, { "epoch": 0.2915986304977614, "grad_norm": 1.983187198638916, "learning_rate": 3.5429286278641034e-05, "loss": 0.4625, "step": 11072 }, { "epoch": 0.2916249670792731, "grad_norm": 1.7298763990402222, "learning_rate": 3.542796944956544e-05, "loss": 2.4248, "step": 11073 }, { "epoch": 0.29165130366078484, "grad_norm": 5.213994979858398, "learning_rate": 3.5426652620489865e-05, "loss": 1.5353, "step": 11074 }, { "epoch": 0.29167764024229653, "grad_norm": 2.2726612091064453, "learning_rate": 3.5425335791414274e-05, "loss": 1.3574, "step": 11075 }, { "epoch": 
0.2917039768238083, "grad_norm": 1.9792426824569702, "learning_rate": 3.542401896233869e-05, "loss": 1.6951, "step": 11076 }, { "epoch": 0.29173031340531996, "grad_norm": 2.554072141647339, "learning_rate": 3.5422702133263105e-05, "loss": 1.8157, "step": 11077 }, { "epoch": 0.2917566499868317, "grad_norm": 1.6620303392410278, "learning_rate": 3.5421385304187514e-05, "loss": 2.1404, "step": 11078 }, { "epoch": 0.29178298656834345, "grad_norm": 2.2123641967773438, "learning_rate": 3.5420068475111936e-05, "loss": 1.5199, "step": 11079 }, { "epoch": 0.29180932314985514, "grad_norm": 1.6105784177780151, "learning_rate": 3.5418751646036345e-05, "loss": 2.1827, "step": 11080 }, { "epoch": 0.2918356597313669, "grad_norm": 2.6802256107330322, "learning_rate": 3.541743481696076e-05, "loss": 1.2632, "step": 11081 }, { "epoch": 0.29186199631287857, "grad_norm": 2.152794361114502, "learning_rate": 3.541611798788517e-05, "loss": 0.2989, "step": 11082 }, { "epoch": 0.2918883328943903, "grad_norm": 2.596158504486084, "learning_rate": 3.541480115880959e-05, "loss": 1.2878, "step": 11083 }, { "epoch": 0.29191466947590206, "grad_norm": 3.5145933628082275, "learning_rate": 3.5413484329734e-05, "loss": 1.6906, "step": 11084 }, { "epoch": 0.29194100605741374, "grad_norm": 1.7567402124404907, "learning_rate": 3.5412167500658416e-05, "loss": 1.4828, "step": 11085 }, { "epoch": 0.2919673426389255, "grad_norm": 3.0997812747955322, "learning_rate": 3.541085067158283e-05, "loss": 0.6779, "step": 11086 }, { "epoch": 0.2919936792204372, "grad_norm": 1.9743634462356567, "learning_rate": 3.540953384250724e-05, "loss": 2.1128, "step": 11087 }, { "epoch": 0.2920200158019489, "grad_norm": 7.771797180175781, "learning_rate": 3.540821701343166e-05, "loss": 1.4228, "step": 11088 }, { "epoch": 0.2920463523834606, "grad_norm": 1.853283405303955, "learning_rate": 3.540690018435607e-05, "loss": 1.5985, "step": 11089 }, { "epoch": 0.29207268896497235, "grad_norm": 2.664031744003296, "learning_rate": 
3.540558335528049e-05, "loss": 1.308, "step": 11090 }, { "epoch": 0.2920990255464841, "grad_norm": 6.335636615753174, "learning_rate": 3.5404266526204896e-05, "loss": 2.1488, "step": 11091 }, { "epoch": 0.2921253621279958, "grad_norm": 1.7049521207809448, "learning_rate": 3.540294969712931e-05, "loss": 1.9945, "step": 11092 }, { "epoch": 0.2921516987095075, "grad_norm": 3.685929775238037, "learning_rate": 3.540163286805373e-05, "loss": 0.7921, "step": 11093 }, { "epoch": 0.2921780352910192, "grad_norm": 1.8595293760299683, "learning_rate": 3.540031603897814e-05, "loss": 0.6721, "step": 11094 }, { "epoch": 0.29220437187253095, "grad_norm": 3.9109089374542236, "learning_rate": 3.539899920990256e-05, "loss": 1.0977, "step": 11095 }, { "epoch": 0.29223070845404264, "grad_norm": 2.1876718997955322, "learning_rate": 3.539768238082697e-05, "loss": 1.4682, "step": 11096 }, { "epoch": 0.2922570450355544, "grad_norm": 3.5205602645874023, "learning_rate": 3.539636555175139e-05, "loss": 1.332, "step": 11097 }, { "epoch": 0.29228338161706613, "grad_norm": 2.544851064682007, "learning_rate": 3.53950487226758e-05, "loss": 2.1016, "step": 11098 }, { "epoch": 0.2923097181985778, "grad_norm": 2.9030494689941406, "learning_rate": 3.5393731893600214e-05, "loss": 1.1756, "step": 11099 }, { "epoch": 0.29233605478008956, "grad_norm": 2.0888679027557373, "learning_rate": 3.539241506452462e-05, "loss": 1.5785, "step": 11100 }, { "epoch": 0.29236239136160125, "grad_norm": 2.51741361618042, "learning_rate": 3.539109823544904e-05, "loss": 1.7483, "step": 11101 }, { "epoch": 0.292388727943113, "grad_norm": 2.9861435890197754, "learning_rate": 3.5389781406373454e-05, "loss": 1.9696, "step": 11102 }, { "epoch": 0.2924150645246247, "grad_norm": 1.9171050786972046, "learning_rate": 3.538846457729787e-05, "loss": 1.9575, "step": 11103 }, { "epoch": 0.2924414011061364, "grad_norm": 1.6457089185714722, "learning_rate": 3.5387147748222286e-05, "loss": 1.7112, "step": 11104 }, { "epoch": 
0.29246773768764817, "grad_norm": 2.636373996734619, "learning_rate": 3.5385830919146694e-05, "loss": 1.5134, "step": 11105 }, { "epoch": 0.29249407426915985, "grad_norm": 2.703622579574585, "learning_rate": 3.538451409007111e-05, "loss": 1.1645, "step": 11106 }, { "epoch": 0.2925204108506716, "grad_norm": 3.1340646743774414, "learning_rate": 3.5383197260995526e-05, "loss": 1.7501, "step": 11107 }, { "epoch": 0.2925467474321833, "grad_norm": 5.145357608795166, "learning_rate": 3.538188043191994e-05, "loss": 1.4914, "step": 11108 }, { "epoch": 0.29257308401369503, "grad_norm": 9.81971263885498, "learning_rate": 3.538056360284435e-05, "loss": 2.3546, "step": 11109 }, { "epoch": 0.2925994205952067, "grad_norm": 3.489922046661377, "learning_rate": 3.5379246773768766e-05, "loss": 1.9695, "step": 11110 }, { "epoch": 0.29262575717671846, "grad_norm": 2.2756972312927246, "learning_rate": 3.5377929944693174e-05, "loss": 2.0647, "step": 11111 }, { "epoch": 0.2926520937582302, "grad_norm": 4.781240940093994, "learning_rate": 3.53766131156176e-05, "loss": 2.4186, "step": 11112 }, { "epoch": 0.2926784303397419, "grad_norm": 1.6885876655578613, "learning_rate": 3.537529628654201e-05, "loss": 2.1346, "step": 11113 }, { "epoch": 0.29270476692125363, "grad_norm": 5.172010898590088, "learning_rate": 3.537397945746642e-05, "loss": 2.4652, "step": 11114 }, { "epoch": 0.2927311035027653, "grad_norm": 1.7945427894592285, "learning_rate": 3.537266262839084e-05, "loss": 2.0324, "step": 11115 }, { "epoch": 0.29275744008427707, "grad_norm": 2.261831283569336, "learning_rate": 3.537134579931525e-05, "loss": 2.0637, "step": 11116 }, { "epoch": 0.29278377666578875, "grad_norm": 1.7589129209518433, "learning_rate": 3.537002897023967e-05, "loss": 2.0809, "step": 11117 }, { "epoch": 0.2928101132473005, "grad_norm": 3.1029694080352783, "learning_rate": 3.536871214116408e-05, "loss": 1.7428, "step": 11118 }, { "epoch": 0.29283644982881224, "grad_norm": 3.1686432361602783, "learning_rate": 
3.536739531208849e-05, "loss": 0.6834, "step": 11119 }, { "epoch": 0.2928627864103239, "grad_norm": 10.900420188903809, "learning_rate": 3.53660784830129e-05, "loss": 1.2351, "step": 11120 }, { "epoch": 0.29288912299183567, "grad_norm": 2.672062635421753, "learning_rate": 3.5364761653937324e-05, "loss": 1.7464, "step": 11121 }, { "epoch": 0.29291545957334736, "grad_norm": 2.002274751663208, "learning_rate": 3.536344482486173e-05, "loss": 0.914, "step": 11122 }, { "epoch": 0.2929417961548591, "grad_norm": 1.8139182329177856, "learning_rate": 3.536212799578615e-05, "loss": 1.5855, "step": 11123 }, { "epoch": 0.29296813273637085, "grad_norm": 1.4487301111221313, "learning_rate": 3.5360811166710564e-05, "loss": 1.7363, "step": 11124 }, { "epoch": 0.29299446931788253, "grad_norm": 3.9074110984802246, "learning_rate": 3.535949433763497e-05, "loss": 2.0718, "step": 11125 }, { "epoch": 0.2930208058993943, "grad_norm": 1.735677719116211, "learning_rate": 3.5358177508559395e-05, "loss": 1.5249, "step": 11126 }, { "epoch": 0.29304714248090596, "grad_norm": 2.083416700363159, "learning_rate": 3.5356860679483804e-05, "loss": 1.5842, "step": 11127 }, { "epoch": 0.2930734790624177, "grad_norm": 2.062920331954956, "learning_rate": 3.535554385040822e-05, "loss": 1.8368, "step": 11128 }, { "epoch": 0.2930998156439294, "grad_norm": 3.423672914505005, "learning_rate": 3.535422702133263e-05, "loss": 2.1068, "step": 11129 }, { "epoch": 0.29312615222544114, "grad_norm": 2.391085624694824, "learning_rate": 3.535291019225705e-05, "loss": 2.1505, "step": 11130 }, { "epoch": 0.2931524888069529, "grad_norm": 1.9471317529678345, "learning_rate": 3.535159336318146e-05, "loss": 1.7257, "step": 11131 }, { "epoch": 0.29317882538846457, "grad_norm": 1.749572515487671, "learning_rate": 3.5350276534105875e-05, "loss": 1.8632, "step": 11132 }, { "epoch": 0.2932051619699763, "grad_norm": 1.7597603797912598, "learning_rate": 3.534895970503029e-05, "loss": 1.4161, "step": 11133 }, { "epoch": 
0.293231498551488, "grad_norm": 3.6342952251434326, "learning_rate": 3.53476428759547e-05, "loss": 2.1602, "step": 11134 }, { "epoch": 0.29325783513299974, "grad_norm": 3.2049062252044678, "learning_rate": 3.534632604687912e-05, "loss": 2.4722, "step": 11135 }, { "epoch": 0.29328417171451143, "grad_norm": 3.002648115158081, "learning_rate": 3.534500921780353e-05, "loss": 1.0007, "step": 11136 }, { "epoch": 0.2933105082960232, "grad_norm": 2.1872189044952393, "learning_rate": 3.5343692388727946e-05, "loss": 1.8473, "step": 11137 }, { "epoch": 0.2933368448775349, "grad_norm": 2.237682342529297, "learning_rate": 3.5342375559652355e-05, "loss": 1.7228, "step": 11138 }, { "epoch": 0.2933631814590466, "grad_norm": 2.02057147026062, "learning_rate": 3.534105873057677e-05, "loss": 1.8477, "step": 11139 }, { "epoch": 0.29338951804055835, "grad_norm": 1.8187834024429321, "learning_rate": 3.5339741901501186e-05, "loss": 2.1298, "step": 11140 }, { "epoch": 0.29341585462207004, "grad_norm": 1.865159273147583, "learning_rate": 3.53384250724256e-05, "loss": 1.5547, "step": 11141 }, { "epoch": 0.2934421912035818, "grad_norm": 2.249774932861328, "learning_rate": 3.533710824335002e-05, "loss": 1.7296, "step": 11142 }, { "epoch": 0.29346852778509347, "grad_norm": 2.927061080932617, "learning_rate": 3.5335791414274426e-05, "loss": 1.8314, "step": 11143 }, { "epoch": 0.2934948643666052, "grad_norm": 1.8893762826919556, "learning_rate": 3.533447458519884e-05, "loss": 1.5575, "step": 11144 }, { "epoch": 0.29352120094811696, "grad_norm": 6.156041622161865, "learning_rate": 3.533315775612326e-05, "loss": 1.0783, "step": 11145 }, { "epoch": 0.29354753752962864, "grad_norm": 2.1454193592071533, "learning_rate": 3.533184092704767e-05, "loss": 1.9329, "step": 11146 }, { "epoch": 0.2935738741111404, "grad_norm": 1.8228569030761719, "learning_rate": 3.533052409797208e-05, "loss": 1.8157, "step": 11147 }, { "epoch": 0.2936002106926521, "grad_norm": 2.6528375148773193, "learning_rate": 
3.53292072688965e-05, "loss": 1.8013, "step": 11148 }, { "epoch": 0.2936265472741638, "grad_norm": 2.364283800125122, "learning_rate": 3.532789043982091e-05, "loss": 1.6277, "step": 11149 }, { "epoch": 0.2936528838556755, "grad_norm": 2.68813157081604, "learning_rate": 3.532657361074533e-05, "loss": 1.704, "step": 11150 }, { "epoch": 0.29367922043718725, "grad_norm": 3.6759097576141357, "learning_rate": 3.5325256781669744e-05, "loss": 1.4004, "step": 11151 }, { "epoch": 0.293705557018699, "grad_norm": 1.7254774570465088, "learning_rate": 3.532393995259415e-05, "loss": 2.0627, "step": 11152 }, { "epoch": 0.2937318936002107, "grad_norm": 5.245553493499756, "learning_rate": 3.532262312351857e-05, "loss": 1.4824, "step": 11153 }, { "epoch": 0.2937582301817224, "grad_norm": 4.06008243560791, "learning_rate": 3.5321306294442984e-05, "loss": 1.9692, "step": 11154 }, { "epoch": 0.2937845667632341, "grad_norm": 2.40374755859375, "learning_rate": 3.53199894653674e-05, "loss": 2.3672, "step": 11155 }, { "epoch": 0.29381090334474586, "grad_norm": 3.136652708053589, "learning_rate": 3.531867263629181e-05, "loss": 1.3923, "step": 11156 }, { "epoch": 0.2938372399262576, "grad_norm": 2.8856682777404785, "learning_rate": 3.5317355807216224e-05, "loss": 1.6868, "step": 11157 }, { "epoch": 0.2938635765077693, "grad_norm": 3.481116771697998, "learning_rate": 3.531603897814064e-05, "loss": 2.2314, "step": 11158 }, { "epoch": 0.29388991308928103, "grad_norm": 2.4352495670318604, "learning_rate": 3.5314722149065055e-05, "loss": 1.2976, "step": 11159 }, { "epoch": 0.2939162496707927, "grad_norm": 1.915453314781189, "learning_rate": 3.531340531998947e-05, "loss": 1.7207, "step": 11160 }, { "epoch": 0.29394258625230446, "grad_norm": 3.60813045501709, "learning_rate": 3.531208849091388e-05, "loss": 2.2216, "step": 11161 }, { "epoch": 0.29396892283381615, "grad_norm": 3.326063871383667, "learning_rate": 3.5310771661838295e-05, "loss": 1.7641, "step": 11162 }, { "epoch": 0.2939952594153279, 
"grad_norm": 1.6828333139419556, "learning_rate": 3.530945483276271e-05, "loss": 2.0016, "step": 11163 }, { "epoch": 0.29402159599683964, "grad_norm": 2.586540937423706, "learning_rate": 3.5308138003687127e-05, "loss": 1.7729, "step": 11164 }, { "epoch": 0.2940479325783513, "grad_norm": 2.4852166175842285, "learning_rate": 3.5306821174611535e-05, "loss": 1.3307, "step": 11165 }, { "epoch": 0.29407426915986307, "grad_norm": 5.29385232925415, "learning_rate": 3.530550434553595e-05, "loss": 1.0516, "step": 11166 }, { "epoch": 0.29410060574137475, "grad_norm": 2.363166570663452, "learning_rate": 3.530418751646036e-05, "loss": 1.0703, "step": 11167 }, { "epoch": 0.2941269423228865, "grad_norm": 1.8245933055877686, "learning_rate": 3.530287068738478e-05, "loss": 2.0348, "step": 11168 }, { "epoch": 0.2941532789043982, "grad_norm": 2.159219264984131, "learning_rate": 3.530155385830919e-05, "loss": 1.4938, "step": 11169 }, { "epoch": 0.29417961548590993, "grad_norm": 2.0778892040252686, "learning_rate": 3.5300237029233607e-05, "loss": 0.7858, "step": 11170 }, { "epoch": 0.29420595206742167, "grad_norm": 1.5472893714904785, "learning_rate": 3.529892020015802e-05, "loss": 2.0901, "step": 11171 }, { "epoch": 0.29423228864893336, "grad_norm": 3.383136510848999, "learning_rate": 3.529760337108243e-05, "loss": 1.3084, "step": 11172 }, { "epoch": 0.2942586252304451, "grad_norm": 3.1431174278259277, "learning_rate": 3.529628654200685e-05, "loss": 1.3236, "step": 11173 }, { "epoch": 0.2942849618119568, "grad_norm": 1.9847277402877808, "learning_rate": 3.529496971293126e-05, "loss": 2.0408, "step": 11174 }, { "epoch": 0.29431129839346853, "grad_norm": 1.7892119884490967, "learning_rate": 3.529365288385568e-05, "loss": 2.1783, "step": 11175 }, { "epoch": 0.2943376349749802, "grad_norm": 2.5990209579467773, "learning_rate": 3.5292336054780087e-05, "loss": 2.0511, "step": 11176 }, { "epoch": 0.29436397155649197, "grad_norm": 1.5941860675811768, "learning_rate": 3.52910192257045e-05, 
"loss": 2.1411, "step": 11177 }, { "epoch": 0.2943903081380037, "grad_norm": 2.296689987182617, "learning_rate": 3.528970239662892e-05, "loss": 0.7347, "step": 11178 }, { "epoch": 0.2944166447195154, "grad_norm": 1.9193092584609985, "learning_rate": 3.5288385567553333e-05, "loss": 1.777, "step": 11179 }, { "epoch": 0.29444298130102714, "grad_norm": 1.9996116161346436, "learning_rate": 3.528706873847775e-05, "loss": 1.6853, "step": 11180 }, { "epoch": 0.29446931788253883, "grad_norm": 3.902935266494751, "learning_rate": 3.528575190940216e-05, "loss": 1.706, "step": 11181 }, { "epoch": 0.29449565446405057, "grad_norm": 2.9622862339019775, "learning_rate": 3.528443508032658e-05, "loss": 1.5944, "step": 11182 }, { "epoch": 0.29452199104556226, "grad_norm": 2.138038396835327, "learning_rate": 3.528311825125099e-05, "loss": 1.861, "step": 11183 }, { "epoch": 0.294548327627074, "grad_norm": 1.7757973670959473, "learning_rate": 3.5281801422175405e-05, "loss": 2.1649, "step": 11184 }, { "epoch": 0.29457466420858575, "grad_norm": 2.3491737842559814, "learning_rate": 3.5280484593099813e-05, "loss": 1.5502, "step": 11185 }, { "epoch": 0.29460100079009743, "grad_norm": 2.606473207473755, "learning_rate": 3.527916776402423e-05, "loss": 2.1967, "step": 11186 }, { "epoch": 0.2946273373716092, "grad_norm": 1.9916232824325562, "learning_rate": 3.5277850934948645e-05, "loss": 1.6064, "step": 11187 }, { "epoch": 0.29465367395312086, "grad_norm": 4.016951084136963, "learning_rate": 3.527653410587306e-05, "loss": 1.3203, "step": 11188 }, { "epoch": 0.2946800105346326, "grad_norm": 2.234654426574707, "learning_rate": 3.5275217276797476e-05, "loss": 1.8732, "step": 11189 }, { "epoch": 0.29470634711614435, "grad_norm": 3.873143196105957, "learning_rate": 3.5273900447721885e-05, "loss": 1.3686, "step": 11190 }, { "epoch": 0.29473268369765604, "grad_norm": 1.75237238407135, "learning_rate": 3.52725836186463e-05, "loss": 1.7468, "step": 11191 }, { "epoch": 0.2947590202791678, "grad_norm": 
1.793145775794983, "learning_rate": 3.5271266789570716e-05, "loss": 1.8607, "step": 11192 }, { "epoch": 0.29478535686067947, "grad_norm": 4.081301689147949, "learning_rate": 3.526994996049513e-05, "loss": 1.1937, "step": 11193 }, { "epoch": 0.2948116934421912, "grad_norm": 2.46317982673645, "learning_rate": 3.526863313141954e-05, "loss": 1.3486, "step": 11194 }, { "epoch": 0.2948380300237029, "grad_norm": 2.060487747192383, "learning_rate": 3.5267316302343956e-05, "loss": 1.6392, "step": 11195 }, { "epoch": 0.29486436660521465, "grad_norm": 2.104280471801758, "learning_rate": 3.526599947326837e-05, "loss": 2.046, "step": 11196 }, { "epoch": 0.2948907031867264, "grad_norm": 1.8553714752197266, "learning_rate": 3.526468264419279e-05, "loss": 1.4616, "step": 11197 }, { "epoch": 0.2949170397682381, "grad_norm": 2.332540988922119, "learning_rate": 3.52633658151172e-05, "loss": 2.0226, "step": 11198 }, { "epoch": 0.2949433763497498, "grad_norm": 2.6931471824645996, "learning_rate": 3.526204898604161e-05, "loss": 1.6319, "step": 11199 }, { "epoch": 0.2949697129312615, "grad_norm": 1.7424328327178955, "learning_rate": 3.526073215696603e-05, "loss": 1.5522, "step": 11200 }, { "epoch": 0.29499604951277325, "grad_norm": 3.1145408153533936, "learning_rate": 3.525941532789044e-05, "loss": 2.0246, "step": 11201 }, { "epoch": 0.29502238609428494, "grad_norm": 1.8936015367507935, "learning_rate": 3.525809849881486e-05, "loss": 2.3262, "step": 11202 }, { "epoch": 0.2950487226757967, "grad_norm": 1.7812623977661133, "learning_rate": 3.525678166973927e-05, "loss": 1.5768, "step": 11203 }, { "epoch": 0.2950750592573084, "grad_norm": 2.1300299167633057, "learning_rate": 3.525546484066368e-05, "loss": 2.3458, "step": 11204 }, { "epoch": 0.2951013958388201, "grad_norm": 1.9849748611450195, "learning_rate": 3.52541480115881e-05, "loss": 2.2947, "step": 11205 }, { "epoch": 0.29512773242033186, "grad_norm": 3.6287360191345215, "learning_rate": 3.5252831182512514e-05, "loss": 1.2135, "step": 
11206 }, { "epoch": 0.29515406900184354, "grad_norm": 2.8776333332061768, "learning_rate": 3.525151435343693e-05, "loss": 2.1561, "step": 11207 }, { "epoch": 0.2951804055833553, "grad_norm": 3.6309022903442383, "learning_rate": 3.525019752436134e-05, "loss": 1.3714, "step": 11208 }, { "epoch": 0.295206742164867, "grad_norm": 3.14775013923645, "learning_rate": 3.5248880695285754e-05, "loss": 1.3941, "step": 11209 }, { "epoch": 0.2952330787463787, "grad_norm": 3.3579788208007812, "learning_rate": 3.524756386621017e-05, "loss": 1.4462, "step": 11210 }, { "epoch": 0.29525941532789046, "grad_norm": 2.4440853595733643, "learning_rate": 3.5246247037134585e-05, "loss": 1.6734, "step": 11211 }, { "epoch": 0.29528575190940215, "grad_norm": 2.0921802520751953, "learning_rate": 3.5244930208058994e-05, "loss": 1.2276, "step": 11212 }, { "epoch": 0.2953120884909139, "grad_norm": 1.534582495689392, "learning_rate": 3.524361337898341e-05, "loss": 1.446, "step": 11213 }, { "epoch": 0.2953384250724256, "grad_norm": 4.487147808074951, "learning_rate": 3.524229654990782e-05, "loss": 0.9311, "step": 11214 }, { "epoch": 0.2953647616539373, "grad_norm": 1.7657101154327393, "learning_rate": 3.524097972083224e-05, "loss": 2.3719, "step": 11215 }, { "epoch": 0.295391098235449, "grad_norm": 2.163151741027832, "learning_rate": 3.5239662891756656e-05, "loss": 1.6507, "step": 11216 }, { "epoch": 0.29541743481696076, "grad_norm": 1.5511177778244019, "learning_rate": 3.5238346062681065e-05, "loss": 1.7734, "step": 11217 }, { "epoch": 0.2954437713984725, "grad_norm": 1.5488502979278564, "learning_rate": 3.523702923360548e-05, "loss": 0.7871, "step": 11218 }, { "epoch": 0.2954701079799842, "grad_norm": 8.044669151306152, "learning_rate": 3.523571240452989e-05, "loss": 0.8054, "step": 11219 }, { "epoch": 0.29549644456149593, "grad_norm": 1.952410101890564, "learning_rate": 3.523439557545431e-05, "loss": 1.7429, "step": 11220 }, { "epoch": 0.2955227811430076, "grad_norm": 3.4446659088134766, 
"learning_rate": 3.523307874637872e-05, "loss": 0.7818, "step": 11221 }, { "epoch": 0.29554911772451936, "grad_norm": 2.1014182567596436, "learning_rate": 3.5231761917303136e-05, "loss": 1.0132, "step": 11222 }, { "epoch": 0.2955754543060311, "grad_norm": 2.375545024871826, "learning_rate": 3.5230445088227545e-05, "loss": 2.0499, "step": 11223 }, { "epoch": 0.2956017908875428, "grad_norm": 1.7391877174377441, "learning_rate": 3.522912825915196e-05, "loss": 1.7864, "step": 11224 }, { "epoch": 0.29562812746905454, "grad_norm": 2.1989901065826416, "learning_rate": 3.5227811430076376e-05, "loss": 1.6099, "step": 11225 }, { "epoch": 0.2956544640505662, "grad_norm": 3.3491978645324707, "learning_rate": 3.522649460100079e-05, "loss": 2.3066, "step": 11226 }, { "epoch": 0.29568080063207797, "grad_norm": 2.848043441772461, "learning_rate": 3.522517777192521e-05, "loss": 0.337, "step": 11227 }, { "epoch": 0.29570713721358965, "grad_norm": 1.8994667530059814, "learning_rate": 3.5223860942849616e-05, "loss": 1.3694, "step": 11228 }, { "epoch": 0.2957334737951014, "grad_norm": 1.9781851768493652, "learning_rate": 3.522254411377404e-05, "loss": 1.9247, "step": 11229 }, { "epoch": 0.29575981037661314, "grad_norm": 2.438587188720703, "learning_rate": 3.522122728469845e-05, "loss": 1.4936, "step": 11230 }, { "epoch": 0.29578614695812483, "grad_norm": 4.01224422454834, "learning_rate": 3.521991045562286e-05, "loss": 0.8521, "step": 11231 }, { "epoch": 0.2958124835396366, "grad_norm": 3.047328233718872, "learning_rate": 3.521859362654727e-05, "loss": 2.0677, "step": 11232 }, { "epoch": 0.29583882012114826, "grad_norm": 2.571894645690918, "learning_rate": 3.521727679747169e-05, "loss": 0.4427, "step": 11233 }, { "epoch": 0.29586515670266, "grad_norm": 5.571075916290283, "learning_rate": 3.52159599683961e-05, "loss": 1.6175, "step": 11234 }, { "epoch": 0.2958914932841717, "grad_norm": 2.1518359184265137, "learning_rate": 3.521464313932052e-05, "loss": 1.9484, "step": 11235 }, { 
"epoch": 0.29591782986568343, "grad_norm": 1.8952051401138306, "learning_rate": 3.5213326310244934e-05, "loss": 1.6916, "step": 11236 }, { "epoch": 0.2959441664471952, "grad_norm": 1.8660380840301514, "learning_rate": 3.521200948116934e-05, "loss": 1.939, "step": 11237 }, { "epoch": 0.29597050302870687, "grad_norm": 2.507702350616455, "learning_rate": 3.521069265209376e-05, "loss": 0.5641, "step": 11238 }, { "epoch": 0.2959968396102186, "grad_norm": 3.8901422023773193, "learning_rate": 3.5209375823018174e-05, "loss": 2.1301, "step": 11239 }, { "epoch": 0.2960231761917303, "grad_norm": 1.9510704278945923, "learning_rate": 3.520805899394259e-05, "loss": 1.4203, "step": 11240 }, { "epoch": 0.29604951277324204, "grad_norm": 3.083888292312622, "learning_rate": 3.5206742164867e-05, "loss": 0.9464, "step": 11241 }, { "epoch": 0.29607584935475373, "grad_norm": 1.8966132402420044, "learning_rate": 3.5205425335791414e-05, "loss": 1.3417, "step": 11242 }, { "epoch": 0.29610218593626547, "grad_norm": 2.2648446559906006, "learning_rate": 3.520410850671583e-05, "loss": 1.8079, "step": 11243 }, { "epoch": 0.2961285225177772, "grad_norm": 1.9381766319274902, "learning_rate": 3.5202791677640246e-05, "loss": 2.2784, "step": 11244 }, { "epoch": 0.2961548590992889, "grad_norm": 2.3600974082946777, "learning_rate": 3.520147484856466e-05, "loss": 1.5315, "step": 11245 }, { "epoch": 0.29618119568080065, "grad_norm": 4.122308731079102, "learning_rate": 3.520015801948907e-05, "loss": 0.6987, "step": 11246 }, { "epoch": 0.29620753226231233, "grad_norm": 2.9262075424194336, "learning_rate": 3.5198841190413486e-05, "loss": 2.2166, "step": 11247 }, { "epoch": 0.2962338688438241, "grad_norm": 1.4721442461013794, "learning_rate": 3.51975243613379e-05, "loss": 0.9561, "step": 11248 }, { "epoch": 0.29626020542533577, "grad_norm": 1.8601629734039307, "learning_rate": 3.519620753226232e-05, "loss": 1.3197, "step": 11249 }, { "epoch": 0.2962865420068475, "grad_norm": 2.136586904525757, 
"learning_rate": 3.5194890703186726e-05, "loss": 2.053, "step": 11250 }, { "epoch": 0.29631287858835925, "grad_norm": 1.799536108970642, "learning_rate": 3.519357387411114e-05, "loss": 1.5529, "step": 11251 }, { "epoch": 0.29633921516987094, "grad_norm": 1.7599679231643677, "learning_rate": 3.519225704503556e-05, "loss": 1.5343, "step": 11252 }, { "epoch": 0.2963655517513827, "grad_norm": 1.6139698028564453, "learning_rate": 3.519094021595997e-05, "loss": 1.8092, "step": 11253 }, { "epoch": 0.29639188833289437, "grad_norm": 2.0157313346862793, "learning_rate": 3.518962338688439e-05, "loss": 1.721, "step": 11254 }, { "epoch": 0.2964182249144061, "grad_norm": 2.6079649925231934, "learning_rate": 3.51883065578088e-05, "loss": 2.1226, "step": 11255 }, { "epoch": 0.2964445614959178, "grad_norm": 1.6632548570632935, "learning_rate": 3.518698972873321e-05, "loss": 1.8434, "step": 11256 }, { "epoch": 0.29647089807742955, "grad_norm": 5.257874011993408, "learning_rate": 3.518567289965762e-05, "loss": 1.849, "step": 11257 }, { "epoch": 0.2964972346589413, "grad_norm": 3.5983164310455322, "learning_rate": 3.5184356070582044e-05, "loss": 1.6561, "step": 11258 }, { "epoch": 0.296523571240453, "grad_norm": 2.2889139652252197, "learning_rate": 3.518303924150645e-05, "loss": 2.0746, "step": 11259 }, { "epoch": 0.2965499078219647, "grad_norm": 1.7331256866455078, "learning_rate": 3.518172241243087e-05, "loss": 1.9603, "step": 11260 }, { "epoch": 0.2965762444034764, "grad_norm": 3.4799764156341553, "learning_rate": 3.5180405583355284e-05, "loss": 1.679, "step": 11261 }, { "epoch": 0.29660258098498815, "grad_norm": 1.8756428956985474, "learning_rate": 3.51790887542797e-05, "loss": 1.7471, "step": 11262 }, { "epoch": 0.2966289175664999, "grad_norm": 1.4509050846099854, "learning_rate": 3.5177771925204115e-05, "loss": 1.6872, "step": 11263 }, { "epoch": 0.2966552541480116, "grad_norm": 2.148665428161621, "learning_rate": 3.5176455096128524e-05, "loss": 1.1716, "step": 11264 }, { 
"epoch": 0.2966815907295233, "grad_norm": 2.5871989727020264, "learning_rate": 3.517513826705294e-05, "loss": 0.4398, "step": 11265 }, { "epoch": 0.296707927311035, "grad_norm": 3.0054564476013184, "learning_rate": 3.517382143797735e-05, "loss": 2.1549, "step": 11266 }, { "epoch": 0.29673426389254676, "grad_norm": 2.008307933807373, "learning_rate": 3.517250460890177e-05, "loss": 1.6396, "step": 11267 }, { "epoch": 0.29676060047405844, "grad_norm": 1.8543498516082764, "learning_rate": 3.517118777982618e-05, "loss": 1.9873, "step": 11268 }, { "epoch": 0.2967869370555702, "grad_norm": 1.8072952032089233, "learning_rate": 3.5169870950750595e-05, "loss": 1.7686, "step": 11269 }, { "epoch": 0.29681327363708193, "grad_norm": 3.243941307067871, "learning_rate": 3.5168554121675004e-05, "loss": 2.1289, "step": 11270 }, { "epoch": 0.2968396102185936, "grad_norm": 7.306673049926758, "learning_rate": 3.516723729259942e-05, "loss": 2.2979, "step": 11271 }, { "epoch": 0.29686594680010536, "grad_norm": 3.147047996520996, "learning_rate": 3.5165920463523835e-05, "loss": 1.3544, "step": 11272 }, { "epoch": 0.29689228338161705, "grad_norm": 1.3698394298553467, "learning_rate": 3.516460363444825e-05, "loss": 1.8454, "step": 11273 }, { "epoch": 0.2969186199631288, "grad_norm": 2.0408740043640137, "learning_rate": 3.5163286805372666e-05, "loss": 1.8977, "step": 11274 }, { "epoch": 0.2969449565446405, "grad_norm": 2.604503631591797, "learning_rate": 3.5161969976297075e-05, "loss": 1.2487, "step": 11275 }, { "epoch": 0.2969712931261522, "grad_norm": 1.969042181968689, "learning_rate": 3.51606531472215e-05, "loss": 1.9016, "step": 11276 }, { "epoch": 0.29699762970766397, "grad_norm": 1.6118701696395874, "learning_rate": 3.5159336318145906e-05, "loss": 1.5509, "step": 11277 }, { "epoch": 0.29702396628917566, "grad_norm": 4.527522563934326, "learning_rate": 3.515801948907032e-05, "loss": 1.781, "step": 11278 }, { "epoch": 0.2970503028706874, "grad_norm": 1.9608347415924072, "learning_rate": 
3.515670265999473e-05, "loss": 1.8426, "step": 11279 }, { "epoch": 0.2970766394521991, "grad_norm": 4.401960849761963, "learning_rate": 3.5155385830919146e-05, "loss": 2.2969, "step": 11280 }, { "epoch": 0.29710297603371083, "grad_norm": 2.0636041164398193, "learning_rate": 3.515406900184356e-05, "loss": 2.0198, "step": 11281 }, { "epoch": 0.2971293126152225, "grad_norm": 2.2625300884246826, "learning_rate": 3.515275217276798e-05, "loss": 2.2881, "step": 11282 }, { "epoch": 0.29715564919673426, "grad_norm": 1.8739567995071411, "learning_rate": 3.515143534369239e-05, "loss": 1.84, "step": 11283 }, { "epoch": 0.297181985778246, "grad_norm": 1.908544659614563, "learning_rate": 3.51501185146168e-05, "loss": 1.9209, "step": 11284 }, { "epoch": 0.2972083223597577, "grad_norm": 3.1405060291290283, "learning_rate": 3.514880168554122e-05, "loss": 0.639, "step": 11285 }, { "epoch": 0.29723465894126944, "grad_norm": 2.2767162322998047, "learning_rate": 3.514748485646563e-05, "loss": 1.676, "step": 11286 }, { "epoch": 0.2972609955227811, "grad_norm": 1.9440308809280396, "learning_rate": 3.514616802739005e-05, "loss": 1.5822, "step": 11287 }, { "epoch": 0.29728733210429287, "grad_norm": 1.9416247606277466, "learning_rate": 3.514485119831446e-05, "loss": 0.8407, "step": 11288 }, { "epoch": 0.29731366868580456, "grad_norm": 3.2492847442626953, "learning_rate": 3.514353436923887e-05, "loss": 2.1815, "step": 11289 }, { "epoch": 0.2973400052673163, "grad_norm": 4.336751937866211, "learning_rate": 3.514221754016329e-05, "loss": 1.885, "step": 11290 }, { "epoch": 0.29736634184882804, "grad_norm": 6.096614360809326, "learning_rate": 3.5140900711087704e-05, "loss": 1.5798, "step": 11291 }, { "epoch": 0.29739267843033973, "grad_norm": 2.607179641723633, "learning_rate": 3.513958388201212e-05, "loss": 2.0135, "step": 11292 }, { "epoch": 0.2974190150118515, "grad_norm": 3.0693907737731934, "learning_rate": 3.513826705293653e-05, "loss": 2.3704, "step": 11293 }, { "epoch": 
0.29744535159336316, "grad_norm": 1.8100945949554443, "learning_rate": 3.5136950223860944e-05, "loss": 1.7142, "step": 11294 }, { "epoch": 0.2974716881748749, "grad_norm": 1.720001459121704, "learning_rate": 3.513563339478536e-05, "loss": 1.6544, "step": 11295 }, { "epoch": 0.29749802475638665, "grad_norm": 2.7185206413269043, "learning_rate": 3.5134316565709775e-05, "loss": 1.7472, "step": 11296 }, { "epoch": 0.29752436133789834, "grad_norm": 2.925856590270996, "learning_rate": 3.5132999736634184e-05, "loss": 0.6453, "step": 11297 }, { "epoch": 0.2975506979194101, "grad_norm": 3.8374879360198975, "learning_rate": 3.51316829075586e-05, "loss": 1.6675, "step": 11298 }, { "epoch": 0.29757703450092177, "grad_norm": 1.8744838237762451, "learning_rate": 3.5130366078483015e-05, "loss": 1.0709, "step": 11299 }, { "epoch": 0.2976033710824335, "grad_norm": 3.5693914890289307, "learning_rate": 3.512904924940743e-05, "loss": 1.7182, "step": 11300 }, { "epoch": 0.2976297076639452, "grad_norm": 2.1889984607696533, "learning_rate": 3.5127732420331847e-05, "loss": 0.6038, "step": 11301 }, { "epoch": 0.29765604424545694, "grad_norm": 1.7235954999923706, "learning_rate": 3.5126415591256255e-05, "loss": 1.4082, "step": 11302 }, { "epoch": 0.2976823808269687, "grad_norm": 2.602802038192749, "learning_rate": 3.512509876218067e-05, "loss": 1.8461, "step": 11303 }, { "epoch": 0.29770871740848037, "grad_norm": 3.2257328033447266, "learning_rate": 3.512378193310508e-05, "loss": 0.7786, "step": 11304 }, { "epoch": 0.2977350539899921, "grad_norm": 6.53745174407959, "learning_rate": 3.51224651040295e-05, "loss": 1.0738, "step": 11305 }, { "epoch": 0.2977613905715038, "grad_norm": 2.5591909885406494, "learning_rate": 3.512114827495391e-05, "loss": 1.5284, "step": 11306 }, { "epoch": 0.29778772715301555, "grad_norm": 3.120382070541382, "learning_rate": 3.511983144587833e-05, "loss": 0.5894, "step": 11307 }, { "epoch": 0.29781406373452723, "grad_norm": 1.8904626369476318, "learning_rate": 
3.511851461680274e-05, "loss": 1.4343, "step": 11308 }, { "epoch": 0.297840400316039, "grad_norm": 2.502664566040039, "learning_rate": 3.511719778772716e-05, "loss": 0.897, "step": 11309 }, { "epoch": 0.2978667368975507, "grad_norm": 3.501429557800293, "learning_rate": 3.5115880958651573e-05, "loss": 0.6699, "step": 11310 }, { "epoch": 0.2978930734790624, "grad_norm": 2.1692235469818115, "learning_rate": 3.511456412957598e-05, "loss": 1.7393, "step": 11311 }, { "epoch": 0.29791941006057415, "grad_norm": 2.408330202102661, "learning_rate": 3.51132473005004e-05, "loss": 1.9903, "step": 11312 }, { "epoch": 0.29794574664208584, "grad_norm": 1.7968454360961914, "learning_rate": 3.511193047142481e-05, "loss": 1.9403, "step": 11313 }, { "epoch": 0.2979720832235976, "grad_norm": 1.9358646869659424, "learning_rate": 3.511061364234923e-05, "loss": 2.1813, "step": 11314 }, { "epoch": 0.29799841980510927, "grad_norm": 2.53141450881958, "learning_rate": 3.510929681327364e-05, "loss": 1.9958, "step": 11315 }, { "epoch": 0.298024756386621, "grad_norm": 1.9014556407928467, "learning_rate": 3.5107979984198053e-05, "loss": 1.1513, "step": 11316 }, { "epoch": 0.29805109296813276, "grad_norm": 2.8176589012145996, "learning_rate": 3.510666315512246e-05, "loss": 1.8123, "step": 11317 }, { "epoch": 0.29807742954964445, "grad_norm": 8.192499160766602, "learning_rate": 3.510534632604688e-05, "loss": 2.3802, "step": 11318 }, { "epoch": 0.2981037661311562, "grad_norm": 1.729529857635498, "learning_rate": 3.5104029496971293e-05, "loss": 1.5345, "step": 11319 }, { "epoch": 0.2981301027126679, "grad_norm": 2.0037455558776855, "learning_rate": 3.510271266789571e-05, "loss": 1.1211, "step": 11320 }, { "epoch": 0.2981564392941796, "grad_norm": 2.028219223022461, "learning_rate": 3.5101395838820125e-05, "loss": 1.751, "step": 11321 }, { "epoch": 0.2981827758756913, "grad_norm": 7.575572967529297, "learning_rate": 3.5100079009744533e-05, "loss": 1.3426, "step": 11322 }, { "epoch": 
0.29820911245720305, "grad_norm": 3.681105852127075, "learning_rate": 3.509876218066895e-05, "loss": 1.0907, "step": 11323 }, { "epoch": 0.2982354490387148, "grad_norm": 2.6946728229522705, "learning_rate": 3.5097445351593365e-05, "loss": 1.1082, "step": 11324 }, { "epoch": 0.2982617856202265, "grad_norm": 3.061182737350464, "learning_rate": 3.509612852251778e-05, "loss": 0.9191, "step": 11325 }, { "epoch": 0.2982881222017382, "grad_norm": 2.9638516902923584, "learning_rate": 3.509481169344219e-05, "loss": 1.4096, "step": 11326 }, { "epoch": 0.2983144587832499, "grad_norm": 1.582367181777954, "learning_rate": 3.5093494864366605e-05, "loss": 2.2101, "step": 11327 }, { "epoch": 0.29834079536476166, "grad_norm": 3.923387289047241, "learning_rate": 3.509217803529102e-05, "loss": 1.1766, "step": 11328 }, { "epoch": 0.2983671319462734, "grad_norm": 1.9604004621505737, "learning_rate": 3.5090861206215436e-05, "loss": 1.9074, "step": 11329 }, { "epoch": 0.2983934685277851, "grad_norm": 2.111985206604004, "learning_rate": 3.508954437713985e-05, "loss": 2.2323, "step": 11330 }, { "epoch": 0.29841980510929683, "grad_norm": 1.5968613624572754, "learning_rate": 3.508822754806426e-05, "loss": 1.7975, "step": 11331 }, { "epoch": 0.2984461416908085, "grad_norm": 3.5455071926116943, "learning_rate": 3.5086910718988676e-05, "loss": 1.7912, "step": 11332 }, { "epoch": 0.29847247827232026, "grad_norm": 2.2526659965515137, "learning_rate": 3.508559388991309e-05, "loss": 1.9117, "step": 11333 }, { "epoch": 0.29849881485383195, "grad_norm": 1.5859909057617188, "learning_rate": 3.508427706083751e-05, "loss": 1.2353, "step": 11334 }, { "epoch": 0.2985251514353437, "grad_norm": 1.991771936416626, "learning_rate": 3.5082960231761916e-05, "loss": 0.5767, "step": 11335 }, { "epoch": 0.29855148801685544, "grad_norm": 1.495442271232605, "learning_rate": 3.508164340268633e-05, "loss": 2.0683, "step": 11336 }, { "epoch": 0.2985778245983671, "grad_norm": 2.049715995788574, "learning_rate": 
3.508032657361075e-05, "loss": 1.6001, "step": 11337 }, { "epoch": 0.29860416117987887, "grad_norm": 1.6948968172073364, "learning_rate": 3.507900974453516e-05, "loss": 0.6071, "step": 11338 }, { "epoch": 0.29863049776139056, "grad_norm": 2.6026017665863037, "learning_rate": 3.507769291545958e-05, "loss": 2.0654, "step": 11339 }, { "epoch": 0.2986568343429023, "grad_norm": 2.0262014865875244, "learning_rate": 3.507637608638399e-05, "loss": 2.1178, "step": 11340 }, { "epoch": 0.298683170924414, "grad_norm": 1.9327361583709717, "learning_rate": 3.50750592573084e-05, "loss": 2.4644, "step": 11341 }, { "epoch": 0.29870950750592573, "grad_norm": 4.835659980773926, "learning_rate": 3.507374242823282e-05, "loss": 0.7732, "step": 11342 }, { "epoch": 0.2987358440874375, "grad_norm": 2.3807318210601807, "learning_rate": 3.5072425599157234e-05, "loss": 1.6574, "step": 11343 }, { "epoch": 0.29876218066894916, "grad_norm": 1.720497727394104, "learning_rate": 3.507110877008164e-05, "loss": 2.2618, "step": 11344 }, { "epoch": 0.2987885172504609, "grad_norm": 2.229938268661499, "learning_rate": 3.506979194100606e-05, "loss": 1.7065, "step": 11345 }, { "epoch": 0.2988148538319726, "grad_norm": 1.668868899345398, "learning_rate": 3.5068475111930474e-05, "loss": 2.1841, "step": 11346 }, { "epoch": 0.29884119041348434, "grad_norm": 1.6168829202651978, "learning_rate": 3.506715828285489e-05, "loss": 1.8651, "step": 11347 }, { "epoch": 0.298867526994996, "grad_norm": 2.2111268043518066, "learning_rate": 3.5065841453779305e-05, "loss": 2.1177, "step": 11348 }, { "epoch": 0.29889386357650777, "grad_norm": 3.8437047004699707, "learning_rate": 3.5064524624703714e-05, "loss": 1.2729, "step": 11349 }, { "epoch": 0.2989202001580195, "grad_norm": 2.1236634254455566, "learning_rate": 3.506320779562813e-05, "loss": 1.7316, "step": 11350 }, { "epoch": 0.2989465367395312, "grad_norm": 3.524595022201538, "learning_rate": 3.506189096655254e-05, "loss": 1.5795, "step": 11351 }, { "epoch": 
0.29897287332104294, "grad_norm": 3.1641995906829834, "learning_rate": 3.506057413747696e-05, "loss": 1.5867, "step": 11352 }, { "epoch": 0.29899920990255463, "grad_norm": 3.083679676055908, "learning_rate": 3.505925730840137e-05, "loss": 1.0551, "step": 11353 }, { "epoch": 0.2990255464840664, "grad_norm": 2.1841988563537598, "learning_rate": 3.5057940479325785e-05, "loss": 1.7922, "step": 11354 }, { "epoch": 0.29905188306557806, "grad_norm": 2.1207447052001953, "learning_rate": 3.50566236502502e-05, "loss": 1.8548, "step": 11355 }, { "epoch": 0.2990782196470898, "grad_norm": 2.3573505878448486, "learning_rate": 3.505530682117461e-05, "loss": 1.5995, "step": 11356 }, { "epoch": 0.29910455622860155, "grad_norm": 3.5008678436279297, "learning_rate": 3.505398999209903e-05, "loss": 1.7812, "step": 11357 }, { "epoch": 0.29913089281011324, "grad_norm": 3.3185739517211914, "learning_rate": 3.505267316302344e-05, "loss": 1.3111, "step": 11358 }, { "epoch": 0.299157229391625, "grad_norm": 2.8683111667633057, "learning_rate": 3.5051356333947856e-05, "loss": 0.6612, "step": 11359 }, { "epoch": 0.29918356597313667, "grad_norm": 2.66521954536438, "learning_rate": 3.5050039504872265e-05, "loss": 1.528, "step": 11360 }, { "epoch": 0.2992099025546484, "grad_norm": 2.139737367630005, "learning_rate": 3.504872267579669e-05, "loss": 2.2746, "step": 11361 }, { "epoch": 0.29923623913616015, "grad_norm": 1.8905680179595947, "learning_rate": 3.5047405846721096e-05, "loss": 1.9795, "step": 11362 }, { "epoch": 0.29926257571767184, "grad_norm": 1.9279958009719849, "learning_rate": 3.504608901764551e-05, "loss": 1.773, "step": 11363 }, { "epoch": 0.2992889122991836, "grad_norm": 1.6923837661743164, "learning_rate": 3.504477218856993e-05, "loss": 1.5279, "step": 11364 }, { "epoch": 0.2993152488806953, "grad_norm": 3.3340327739715576, "learning_rate": 3.5043455359494336e-05, "loss": 1.377, "step": 11365 }, { "epoch": 0.299341585462207, "grad_norm": 1.6432462930679321, "learning_rate": 
3.504213853041876e-05, "loss": 1.3982, "step": 11366 }, { "epoch": 0.2993679220437187, "grad_norm": 3.1528308391571045, "learning_rate": 3.504082170134317e-05, "loss": 2.1196, "step": 11367 }, { "epoch": 0.29939425862523045, "grad_norm": 3.964564085006714, "learning_rate": 3.503950487226758e-05, "loss": 1.2747, "step": 11368 }, { "epoch": 0.2994205952067422, "grad_norm": 1.9572641849517822, "learning_rate": 3.503818804319199e-05, "loss": 1.5417, "step": 11369 }, { "epoch": 0.2994469317882539, "grad_norm": 3.664025068283081, "learning_rate": 3.503687121411641e-05, "loss": 1.3096, "step": 11370 }, { "epoch": 0.2994732683697656, "grad_norm": 2.319117546081543, "learning_rate": 3.503555438504082e-05, "loss": 1.617, "step": 11371 }, { "epoch": 0.2994996049512773, "grad_norm": 2.221092462539673, "learning_rate": 3.503423755596524e-05, "loss": 1.9934, "step": 11372 }, { "epoch": 0.29952594153278905, "grad_norm": 2.0143775939941406, "learning_rate": 3.503292072688965e-05, "loss": 1.7052, "step": 11373 }, { "epoch": 0.29955227811430074, "grad_norm": 2.1504523754119873, "learning_rate": 3.503160389781406e-05, "loss": 1.4524, "step": 11374 }, { "epoch": 0.2995786146958125, "grad_norm": 2.070394992828369, "learning_rate": 3.503028706873848e-05, "loss": 1.5879, "step": 11375 }, { "epoch": 0.2996049512773242, "grad_norm": 2.0374763011932373, "learning_rate": 3.5028970239662894e-05, "loss": 1.7961, "step": 11376 }, { "epoch": 0.2996312878588359, "grad_norm": 4.845815181732178, "learning_rate": 3.502765341058731e-05, "loss": 1.8492, "step": 11377 }, { "epoch": 0.29965762444034766, "grad_norm": 2.439401865005493, "learning_rate": 3.502633658151172e-05, "loss": 1.8908, "step": 11378 }, { "epoch": 0.29968396102185935, "grad_norm": 2.191620111465454, "learning_rate": 3.5025019752436134e-05, "loss": 1.7195, "step": 11379 }, { "epoch": 0.2997102976033711, "grad_norm": 1.9873735904693604, "learning_rate": 3.502370292336055e-05, "loss": 1.6176, "step": 11380 }, { "epoch": 
0.2997366341848828, "grad_norm": 2.8773157596588135, "learning_rate": 3.5022386094284966e-05, "loss": 2.1566, "step": 11381 }, { "epoch": 0.2997629707663945, "grad_norm": 3.963364601135254, "learning_rate": 3.5021069265209374e-05, "loss": 0.8538, "step": 11382 }, { "epoch": 0.29978930734790626, "grad_norm": 3.418531656265259, "learning_rate": 3.501975243613379e-05, "loss": 0.6466, "step": 11383 }, { "epoch": 0.29981564392941795, "grad_norm": 1.8965864181518555, "learning_rate": 3.5018435607058206e-05, "loss": 1.0929, "step": 11384 }, { "epoch": 0.2998419805109297, "grad_norm": 3.509310722351074, "learning_rate": 3.501711877798262e-05, "loss": 1.364, "step": 11385 }, { "epoch": 0.2998683170924414, "grad_norm": 1.5227187871932983, "learning_rate": 3.501580194890704e-05, "loss": 1.4799, "step": 11386 }, { "epoch": 0.2998946536739531, "grad_norm": 1.956417202949524, "learning_rate": 3.5014485119831446e-05, "loss": 1.8624, "step": 11387 }, { "epoch": 0.2999209902554648, "grad_norm": 1.7138806581497192, "learning_rate": 3.501316829075586e-05, "loss": 2.2537, "step": 11388 }, { "epoch": 0.29994732683697656, "grad_norm": 1.9416273832321167, "learning_rate": 3.501185146168027e-05, "loss": 0.4365, "step": 11389 }, { "epoch": 0.2999736634184883, "grad_norm": 3.6158647537231445, "learning_rate": 3.501053463260469e-05, "loss": 1.436, "step": 11390 }, { "epoch": 0.3, "grad_norm": 2.3787429332733154, "learning_rate": 3.50092178035291e-05, "loss": 1.4393, "step": 11391 }, { "epoch": 0.30002633658151173, "grad_norm": 1.7575633525848389, "learning_rate": 3.500790097445352e-05, "loss": 2.5512, "step": 11392 }, { "epoch": 0.3000526731630234, "grad_norm": 2.107300281524658, "learning_rate": 3.500658414537793e-05, "loss": 2.4838, "step": 11393 }, { "epoch": 0.30007900974453516, "grad_norm": 1.9221919775009155, "learning_rate": 3.500526731630235e-05, "loss": 2.3892, "step": 11394 }, { "epoch": 0.3001053463260469, "grad_norm": 2.0634987354278564, "learning_rate": 3.5003950487226764e-05, 
"loss": 2.0733, "step": 11395 }, { "epoch": 0.3001316829075586, "grad_norm": 2.9131155014038086, "learning_rate": 3.500263365815117e-05, "loss": 1.36, "step": 11396 }, { "epoch": 0.30015801948907034, "grad_norm": 1.9818575382232666, "learning_rate": 3.500131682907559e-05, "loss": 1.6081, "step": 11397 }, { "epoch": 0.300184356070582, "grad_norm": 1.4716944694519043, "learning_rate": 3.5e-05, "loss": 1.8672, "step": 11398 }, { "epoch": 0.30021069265209377, "grad_norm": 2.087327241897583, "learning_rate": 3.499868317092442e-05, "loss": 2.099, "step": 11399 }, { "epoch": 0.30023702923360546, "grad_norm": 1.72110116481781, "learning_rate": 3.499736634184883e-05, "loss": 0.6496, "step": 11400 }, { "epoch": 0.3002633658151172, "grad_norm": 1.8490465879440308, "learning_rate": 3.4996049512773244e-05, "loss": 1.8095, "step": 11401 }, { "epoch": 0.30028970239662894, "grad_norm": 1.335369348526001, "learning_rate": 3.499473268369766e-05, "loss": 1.6903, "step": 11402 }, { "epoch": 0.30031603897814063, "grad_norm": 1.898796558380127, "learning_rate": 3.499341585462207e-05, "loss": 1.4037, "step": 11403 }, { "epoch": 0.3003423755596524, "grad_norm": 2.4721274375915527, "learning_rate": 3.499209902554649e-05, "loss": 0.3906, "step": 11404 }, { "epoch": 0.30036871214116406, "grad_norm": 4.560434341430664, "learning_rate": 3.49907821964709e-05, "loss": 1.9955, "step": 11405 }, { "epoch": 0.3003950487226758, "grad_norm": 1.8851643800735474, "learning_rate": 3.4989465367395315e-05, "loss": 1.9015, "step": 11406 }, { "epoch": 0.3004213853041875, "grad_norm": 3.1878209114074707, "learning_rate": 3.4988148538319724e-05, "loss": 1.7438, "step": 11407 }, { "epoch": 0.30044772188569924, "grad_norm": 2.026601791381836, "learning_rate": 3.4986831709244146e-05, "loss": 1.744, "step": 11408 }, { "epoch": 0.300474058467211, "grad_norm": 2.912034273147583, "learning_rate": 3.4985514880168555e-05, "loss": 1.7488, "step": 11409 }, { "epoch": 0.30050039504872267, "grad_norm": 2.195408821105957, 
"learning_rate": 3.498419805109297e-05, "loss": 1.8744, "step": 11410 }, { "epoch": 0.3005267316302344, "grad_norm": 1.757171630859375, "learning_rate": 3.4982881222017386e-05, "loss": 1.4281, "step": 11411 }, { "epoch": 0.3005530682117461, "grad_norm": 2.126519203186035, "learning_rate": 3.4981564392941795e-05, "loss": 2.2833, "step": 11412 }, { "epoch": 0.30057940479325784, "grad_norm": 3.181601047515869, "learning_rate": 3.498024756386622e-05, "loss": 1.4416, "step": 11413 }, { "epoch": 0.30060574137476953, "grad_norm": 3.573237419128418, "learning_rate": 3.4978930734790626e-05, "loss": 1.0448, "step": 11414 }, { "epoch": 0.3006320779562813, "grad_norm": 1.7598329782485962, "learning_rate": 3.497761390571504e-05, "loss": 1.4158, "step": 11415 }, { "epoch": 0.300658414537793, "grad_norm": 3.2319602966308594, "learning_rate": 3.497629707663945e-05, "loss": 0.6546, "step": 11416 }, { "epoch": 0.3006847511193047, "grad_norm": 2.052393674850464, "learning_rate": 3.4974980247563866e-05, "loss": 0.2748, "step": 11417 }, { "epoch": 0.30071108770081645, "grad_norm": 1.855228304862976, "learning_rate": 3.497366341848828e-05, "loss": 0.4704, "step": 11418 }, { "epoch": 0.30073742428232814, "grad_norm": 1.6941291093826294, "learning_rate": 3.49723465894127e-05, "loss": 1.2878, "step": 11419 }, { "epoch": 0.3007637608638399, "grad_norm": 2.1435251235961914, "learning_rate": 3.4971029760337106e-05, "loss": 1.9211, "step": 11420 }, { "epoch": 0.30079009744535157, "grad_norm": 4.7399210929870605, "learning_rate": 3.496971293126152e-05, "loss": 1.5094, "step": 11421 }, { "epoch": 0.3008164340268633, "grad_norm": 1.5705182552337646, "learning_rate": 3.496839610218594e-05, "loss": 2.2813, "step": 11422 }, { "epoch": 0.30084277060837505, "grad_norm": 3.903655529022217, "learning_rate": 3.496707927311035e-05, "loss": 2.0785, "step": 11423 }, { "epoch": 0.30086910718988674, "grad_norm": 2.515242576599121, "learning_rate": 3.496576244403477e-05, "loss": 1.3157, "step": 11424 }, { 
"epoch": 0.3008954437713985, "grad_norm": 3.384575843811035, "learning_rate": 3.496444561495918e-05, "loss": 1.4308, "step": 11425 }, { "epoch": 0.3009217803529102, "grad_norm": 1.9864667654037476, "learning_rate": 3.496312878588359e-05, "loss": 1.7673, "step": 11426 }, { "epoch": 0.3009481169344219, "grad_norm": 2.24808669090271, "learning_rate": 3.496181195680801e-05, "loss": 1.6765, "step": 11427 }, { "epoch": 0.3009744535159336, "grad_norm": 2.157534122467041, "learning_rate": 3.4960495127732424e-05, "loss": 1.6835, "step": 11428 }, { "epoch": 0.30100079009744535, "grad_norm": 1.9540029764175415, "learning_rate": 3.495917829865683e-05, "loss": 1.9228, "step": 11429 }, { "epoch": 0.3010271266789571, "grad_norm": 1.7175195217132568, "learning_rate": 3.495786146958125e-05, "loss": 1.1776, "step": 11430 }, { "epoch": 0.3010534632604688, "grad_norm": 1.51752769947052, "learning_rate": 3.4956544640505664e-05, "loss": 0.4166, "step": 11431 }, { "epoch": 0.3010797998419805, "grad_norm": 2.9118707180023193, "learning_rate": 3.495522781143008e-05, "loss": 1.5472, "step": 11432 }, { "epoch": 0.3011061364234922, "grad_norm": 1.7199506759643555, "learning_rate": 3.4953910982354495e-05, "loss": 1.9804, "step": 11433 }, { "epoch": 0.30113247300500395, "grad_norm": 1.5731666088104248, "learning_rate": 3.4952594153278904e-05, "loss": 1.6395, "step": 11434 }, { "epoch": 0.3011588095865157, "grad_norm": 3.308396816253662, "learning_rate": 3.495127732420332e-05, "loss": 1.7894, "step": 11435 }, { "epoch": 0.3011851461680274, "grad_norm": 1.9530525207519531, "learning_rate": 3.494996049512773e-05, "loss": 1.9274, "step": 11436 }, { "epoch": 0.30121148274953913, "grad_norm": 4.370994567871094, "learning_rate": 3.494864366605215e-05, "loss": 0.9156, "step": 11437 }, { "epoch": 0.3012378193310508, "grad_norm": 3.1108012199401855, "learning_rate": 3.494732683697656e-05, "loss": 2.279, "step": 11438 }, { "epoch": 0.30126415591256256, "grad_norm": 2.022583246231079, "learning_rate": 
3.4946010007900975e-05, "loss": 2.1374, "step": 11439 }, { "epoch": 0.30129049249407425, "grad_norm": 2.1375811100006104, "learning_rate": 3.494469317882539e-05, "loss": 1.7166, "step": 11440 }, { "epoch": 0.301316829075586, "grad_norm": 4.401586055755615, "learning_rate": 3.494337634974981e-05, "loss": 1.4899, "step": 11441 }, { "epoch": 0.30134316565709773, "grad_norm": 4.158323764801025, "learning_rate": 3.494205952067422e-05, "loss": 1.1617, "step": 11442 }, { "epoch": 0.3013695022386094, "grad_norm": 4.1364850997924805, "learning_rate": 3.494074269159863e-05, "loss": 1.7601, "step": 11443 }, { "epoch": 0.30139583882012116, "grad_norm": 2.1257357597351074, "learning_rate": 3.493942586252305e-05, "loss": 2.1702, "step": 11444 }, { "epoch": 0.30142217540163285, "grad_norm": 3.1927878856658936, "learning_rate": 3.4938109033447455e-05, "loss": 1.2641, "step": 11445 }, { "epoch": 0.3014485119831446, "grad_norm": 1.7009096145629883, "learning_rate": 3.493679220437188e-05, "loss": 1.7328, "step": 11446 }, { "epoch": 0.3014748485646563, "grad_norm": 6.142141342163086, "learning_rate": 3.493547537529629e-05, "loss": 1.3355, "step": 11447 }, { "epoch": 0.301501185146168, "grad_norm": 2.5633113384246826, "learning_rate": 3.49341585462207e-05, "loss": 1.9544, "step": 11448 }, { "epoch": 0.30152752172767977, "grad_norm": 1.8000307083129883, "learning_rate": 3.493284171714512e-05, "loss": 1.628, "step": 11449 }, { "epoch": 0.30155385830919146, "grad_norm": 3.10040020942688, "learning_rate": 3.493152488806953e-05, "loss": 0.6859, "step": 11450 }, { "epoch": 0.3015801948907032, "grad_norm": 2.016465425491333, "learning_rate": 3.493020805899395e-05, "loss": 0.3359, "step": 11451 }, { "epoch": 0.3016065314722149, "grad_norm": 2.095041036605835, "learning_rate": 3.492889122991836e-05, "loss": 1.3384, "step": 11452 }, { "epoch": 0.30163286805372663, "grad_norm": 2.615236520767212, "learning_rate": 3.4927574400842773e-05, "loss": 2.3088, "step": 11453 }, { "epoch": 
0.3016592046352383, "grad_norm": 4.078410625457764, "learning_rate": 3.492625757176718e-05, "loss": 1.2441, "step": 11454 }, { "epoch": 0.30168554121675006, "grad_norm": 1.917397141456604, "learning_rate": 3.49249407426916e-05, "loss": 2.6897, "step": 11455 }, { "epoch": 0.3017118777982618, "grad_norm": 2.176980972290039, "learning_rate": 3.4923623913616014e-05, "loss": 1.821, "step": 11456 }, { "epoch": 0.3017382143797735, "grad_norm": 2.618560314178467, "learning_rate": 3.492230708454043e-05, "loss": 0.8655, "step": 11457 }, { "epoch": 0.30176455096128524, "grad_norm": 1.825404167175293, "learning_rate": 3.4920990255464845e-05, "loss": 2.4378, "step": 11458 }, { "epoch": 0.3017908875427969, "grad_norm": 2.453624725341797, "learning_rate": 3.4919673426389254e-05, "loss": 2.4098, "step": 11459 }, { "epoch": 0.30181722412430867, "grad_norm": 3.267610549926758, "learning_rate": 3.4918356597313676e-05, "loss": 1.7119, "step": 11460 }, { "epoch": 0.30184356070582036, "grad_norm": 1.6376019716262817, "learning_rate": 3.4917039768238085e-05, "loss": 0.8118, "step": 11461 }, { "epoch": 0.3018698972873321, "grad_norm": 2.9108848571777344, "learning_rate": 3.49157229391625e-05, "loss": 2.3266, "step": 11462 }, { "epoch": 0.30189623386884384, "grad_norm": 2.1124589443206787, "learning_rate": 3.491440611008691e-05, "loss": 1.2451, "step": 11463 }, { "epoch": 0.30192257045035553, "grad_norm": 1.5624676942825317, "learning_rate": 3.4913089281011325e-05, "loss": 1.7329, "step": 11464 }, { "epoch": 0.3019489070318673, "grad_norm": 2.3640170097351074, "learning_rate": 3.491177245193574e-05, "loss": 2.6786, "step": 11465 }, { "epoch": 0.30197524361337896, "grad_norm": 3.3089945316314697, "learning_rate": 3.4910455622860156e-05, "loss": 0.4188, "step": 11466 }, { "epoch": 0.3020015801948907, "grad_norm": 1.6686619520187378, "learning_rate": 3.490913879378457e-05, "loss": 1.5781, "step": 11467 }, { "epoch": 0.30202791677640245, "grad_norm": 1.8771644830703735, "learning_rate": 
3.490782196470898e-05, "loss": 1.8105, "step": 11468 }, { "epoch": 0.30205425335791414, "grad_norm": 2.1818089485168457, "learning_rate": 3.4906505135633396e-05, "loss": 1.2544, "step": 11469 }, { "epoch": 0.3020805899394259, "grad_norm": 2.2085797786712646, "learning_rate": 3.490518830655781e-05, "loss": 2.5588, "step": 11470 }, { "epoch": 0.30210692652093757, "grad_norm": 3.314614772796631, "learning_rate": 3.490387147748223e-05, "loss": 2.4428, "step": 11471 }, { "epoch": 0.3021332631024493, "grad_norm": 1.92331063747406, "learning_rate": 3.4902554648406636e-05, "loss": 0.7612, "step": 11472 }, { "epoch": 0.302159599683961, "grad_norm": 2.9464011192321777, "learning_rate": 3.490123781933105e-05, "loss": 1.8368, "step": 11473 }, { "epoch": 0.30218593626547274, "grad_norm": 2.567378520965576, "learning_rate": 3.489992099025547e-05, "loss": 0.7124, "step": 11474 }, { "epoch": 0.3022122728469845, "grad_norm": 2.5734474658966064, "learning_rate": 3.489860416117988e-05, "loss": 0.7, "step": 11475 }, { "epoch": 0.3022386094284962, "grad_norm": 2.0721397399902344, "learning_rate": 3.489728733210429e-05, "loss": 1.7553, "step": 11476 }, { "epoch": 0.3022649460100079, "grad_norm": 3.214118003845215, "learning_rate": 3.489597050302871e-05, "loss": 1.8072, "step": 11477 }, { "epoch": 0.3022912825915196, "grad_norm": 1.5494662523269653, "learning_rate": 3.489465367395312e-05, "loss": 1.7095, "step": 11478 }, { "epoch": 0.30231761917303135, "grad_norm": 3.223907709121704, "learning_rate": 3.489333684487754e-05, "loss": 1.3393, "step": 11479 }, { "epoch": 0.30234395575454304, "grad_norm": 5.31834602355957, "learning_rate": 3.4892020015801954e-05, "loss": 1.1932, "step": 11480 }, { "epoch": 0.3023702923360548, "grad_norm": 1.904130220413208, "learning_rate": 3.489070318672636e-05, "loss": 1.8768, "step": 11481 }, { "epoch": 0.3023966289175665, "grad_norm": 2.699662208557129, "learning_rate": 3.488938635765078e-05, "loss": 1.8886, "step": 11482 }, { "epoch": 0.3024229654990782, 
"grad_norm": 1.994221806526184, "learning_rate": 3.488806952857519e-05, "loss": 2.1696, "step": 11483 }, { "epoch": 0.30244930208058995, "grad_norm": 2.02368426322937, "learning_rate": 3.488675269949961e-05, "loss": 1.2955, "step": 11484 }, { "epoch": 0.30247563866210164, "grad_norm": 3.2579987049102783, "learning_rate": 3.488543587042402e-05, "loss": 0.6286, "step": 11485 }, { "epoch": 0.3025019752436134, "grad_norm": 2.1361560821533203, "learning_rate": 3.4884119041348434e-05, "loss": 1.3654, "step": 11486 }, { "epoch": 0.3025283118251251, "grad_norm": 1.8205980062484741, "learning_rate": 3.488280221227285e-05, "loss": 1.7312, "step": 11487 }, { "epoch": 0.3025546484066368, "grad_norm": 1.6786627769470215, "learning_rate": 3.488148538319726e-05, "loss": 1.5492, "step": 11488 }, { "epoch": 0.30258098498814856, "grad_norm": 1.9154386520385742, "learning_rate": 3.488016855412168e-05, "loss": 1.5429, "step": 11489 }, { "epoch": 0.30260732156966025, "grad_norm": 1.7850189208984375, "learning_rate": 3.487885172504609e-05, "loss": 1.4754, "step": 11490 }, { "epoch": 0.302633658151172, "grad_norm": 2.176323652267456, "learning_rate": 3.4877534895970505e-05, "loss": 2.0591, "step": 11491 }, { "epoch": 0.3026599947326837, "grad_norm": 2.3130524158477783, "learning_rate": 3.4876218066894914e-05, "loss": 2.0783, "step": 11492 }, { "epoch": 0.3026863313141954, "grad_norm": 3.135699510574341, "learning_rate": 3.4874901237819336e-05, "loss": 0.4751, "step": 11493 }, { "epoch": 0.3027126678957071, "grad_norm": 3.852264881134033, "learning_rate": 3.4873584408743745e-05, "loss": 1.597, "step": 11494 }, { "epoch": 0.30273900447721885, "grad_norm": 2.207695245742798, "learning_rate": 3.487226757966816e-05, "loss": 1.9708, "step": 11495 }, { "epoch": 0.3027653410587306, "grad_norm": 3.5587363243103027, "learning_rate": 3.4870950750592576e-05, "loss": 1.8565, "step": 11496 }, { "epoch": 0.3027916776402423, "grad_norm": 4.040370464324951, "learning_rate": 3.4869633921516985e-05, 
"loss": 1.093, "step": 11497 }, { "epoch": 0.30281801422175403, "grad_norm": 1.7107130289077759, "learning_rate": 3.486831709244141e-05, "loss": 1.1887, "step": 11498 }, { "epoch": 0.3028443508032657, "grad_norm": 1.9254714250564575, "learning_rate": 3.4867000263365816e-05, "loss": 0.4478, "step": 11499 }, { "epoch": 0.30287068738477746, "grad_norm": 3.078185558319092, "learning_rate": 3.486568343429023e-05, "loss": 1.2184, "step": 11500 }, { "epoch": 0.3028970239662892, "grad_norm": 2.670498847961426, "learning_rate": 3.486436660521464e-05, "loss": 1.3868, "step": 11501 }, { "epoch": 0.3029233605478009, "grad_norm": 2.282266139984131, "learning_rate": 3.4863049776139056e-05, "loss": 1.7738, "step": 11502 }, { "epoch": 0.30294969712931263, "grad_norm": 2.90488600730896, "learning_rate": 3.486173294706347e-05, "loss": 1.4311, "step": 11503 }, { "epoch": 0.3029760337108243, "grad_norm": 1.6077455282211304, "learning_rate": 3.486041611798789e-05, "loss": 0.583, "step": 11504 }, { "epoch": 0.30300237029233607, "grad_norm": 5.220090866088867, "learning_rate": 3.48590992889123e-05, "loss": 1.1909, "step": 11505 }, { "epoch": 0.30302870687384775, "grad_norm": 1.885409951210022, "learning_rate": 3.485778245983671e-05, "loss": 2.0389, "step": 11506 }, { "epoch": 0.3030550434553595, "grad_norm": 2.345433473587036, "learning_rate": 3.4856465630761134e-05, "loss": 1.8258, "step": 11507 }, { "epoch": 0.30308138003687124, "grad_norm": 1.9141958951950073, "learning_rate": 3.485514880168554e-05, "loss": 1.9422, "step": 11508 }, { "epoch": 0.3031077166183829, "grad_norm": 2.8689074516296387, "learning_rate": 3.485383197260996e-05, "loss": 1.1462, "step": 11509 }, { "epoch": 0.30313405319989467, "grad_norm": 3.328822612762451, "learning_rate": 3.485251514353437e-05, "loss": 2.5067, "step": 11510 }, { "epoch": 0.30316038978140636, "grad_norm": 2.0886874198913574, "learning_rate": 3.485119831445878e-05, "loss": 1.7875, "step": 11511 }, { "epoch": 0.3031867263629181, "grad_norm": 
2.780932664871216, "learning_rate": 3.48498814853832e-05, "loss": 2.3524, "step": 11512 }, { "epoch": 0.3032130629444298, "grad_norm": 5.195681095123291, "learning_rate": 3.4848564656307614e-05, "loss": 1.1057, "step": 11513 }, { "epoch": 0.30323939952594153, "grad_norm": 2.395005464553833, "learning_rate": 3.484724782723203e-05, "loss": 1.874, "step": 11514 }, { "epoch": 0.3032657361074533, "grad_norm": 2.122614622116089, "learning_rate": 3.484593099815644e-05, "loss": 1.6429, "step": 11515 }, { "epoch": 0.30329207268896496, "grad_norm": 1.6674736738204956, "learning_rate": 3.4844614169080855e-05, "loss": 1.9288, "step": 11516 }, { "epoch": 0.3033184092704767, "grad_norm": 2.5792770385742188, "learning_rate": 3.484329734000527e-05, "loss": 1.6325, "step": 11517 }, { "epoch": 0.3033447458519884, "grad_norm": 3.0751078128814697, "learning_rate": 3.4841980510929686e-05, "loss": 0.8267, "step": 11518 }, { "epoch": 0.30337108243350014, "grad_norm": 2.639277219772339, "learning_rate": 3.4840663681854095e-05, "loss": 1.8914, "step": 11519 }, { "epoch": 0.3033974190150118, "grad_norm": 1.6220495700836182, "learning_rate": 3.483934685277851e-05, "loss": 2.0251, "step": 11520 }, { "epoch": 0.30342375559652357, "grad_norm": 1.768712043762207, "learning_rate": 3.4838030023702926e-05, "loss": 1.5173, "step": 11521 }, { "epoch": 0.3034500921780353, "grad_norm": 2.242807626724243, "learning_rate": 3.483671319462734e-05, "loss": 1.3552, "step": 11522 }, { "epoch": 0.303476428759547, "grad_norm": 3.2393174171447754, "learning_rate": 3.483539636555175e-05, "loss": 1.5405, "step": 11523 }, { "epoch": 0.30350276534105874, "grad_norm": 2.5753369331359863, "learning_rate": 3.4834079536476166e-05, "loss": 1.8293, "step": 11524 }, { "epoch": 0.30352910192257043, "grad_norm": 1.4630845785140991, "learning_rate": 3.483276270740058e-05, "loss": 2.2216, "step": 11525 }, { "epoch": 0.3035554385040822, "grad_norm": 2.8406453132629395, "learning_rate": 3.4831445878325e-05, "loss": 1.1713, 
"step": 11526 }, { "epoch": 0.30358177508559386, "grad_norm": 2.0083794593811035, "learning_rate": 3.483012904924941e-05, "loss": 1.4846, "step": 11527 }, { "epoch": 0.3036081116671056, "grad_norm": 2.4162118434906006, "learning_rate": 3.482881222017382e-05, "loss": 2.3402, "step": 11528 }, { "epoch": 0.30363444824861735, "grad_norm": 1.5709762573242188, "learning_rate": 3.482749539109824e-05, "loss": 2.131, "step": 11529 }, { "epoch": 0.30366078483012904, "grad_norm": 4.539622783660889, "learning_rate": 3.4826178562022646e-05, "loss": 1.4055, "step": 11530 }, { "epoch": 0.3036871214116408, "grad_norm": 3.038649559020996, "learning_rate": 3.482486173294707e-05, "loss": 1.8247, "step": 11531 }, { "epoch": 0.30371345799315247, "grad_norm": 3.3281667232513428, "learning_rate": 3.482354490387148e-05, "loss": 1.1562, "step": 11532 }, { "epoch": 0.3037397945746642, "grad_norm": 3.070957660675049, "learning_rate": 3.482222807479589e-05, "loss": 1.8494, "step": 11533 }, { "epoch": 0.30376613115617596, "grad_norm": 1.99111807346344, "learning_rate": 3.482091124572031e-05, "loss": 1.7051, "step": 11534 }, { "epoch": 0.30379246773768764, "grad_norm": 2.608184337615967, "learning_rate": 3.481959441664472e-05, "loss": 1.2814, "step": 11535 }, { "epoch": 0.3038188043191994, "grad_norm": 3.271303653717041, "learning_rate": 3.481827758756914e-05, "loss": 1.2071, "step": 11536 }, { "epoch": 0.3038451409007111, "grad_norm": 3.1633152961730957, "learning_rate": 3.481696075849355e-05, "loss": 1.5983, "step": 11537 }, { "epoch": 0.3038714774822228, "grad_norm": 2.2482879161834717, "learning_rate": 3.4815643929417964e-05, "loss": 1.8539, "step": 11538 }, { "epoch": 0.3038978140637345, "grad_norm": 3.257370948791504, "learning_rate": 3.481432710034237e-05, "loss": 0.2941, "step": 11539 }, { "epoch": 0.30392415064524625, "grad_norm": 3.31890606880188, "learning_rate": 3.4813010271266795e-05, "loss": 1.6001, "step": 11540 }, { "epoch": 0.303950487226758, "grad_norm": 1.7338519096374512, 
"learning_rate": 3.4811693442191204e-05, "loss": 1.3351, "step": 11541 }, { "epoch": 0.3039768238082697, "grad_norm": 1.7897071838378906, "learning_rate": 3.481037661311562e-05, "loss": 1.5936, "step": 11542 }, { "epoch": 0.3040031603897814, "grad_norm": 2.6925737857818604, "learning_rate": 3.4809059784040035e-05, "loss": 1.4094, "step": 11543 }, { "epoch": 0.3040294969712931, "grad_norm": 2.764449119567871, "learning_rate": 3.4807742954964444e-05, "loss": 0.5718, "step": 11544 }, { "epoch": 0.30405583355280486, "grad_norm": 2.2823641300201416, "learning_rate": 3.4806426125888866e-05, "loss": 1.9221, "step": 11545 }, { "epoch": 0.30408217013431654, "grad_norm": 1.7916207313537598, "learning_rate": 3.4805109296813275e-05, "loss": 2.0903, "step": 11546 }, { "epoch": 0.3041085067158283, "grad_norm": 2.2723875045776367, "learning_rate": 3.480379246773769e-05, "loss": 1.1967, "step": 11547 }, { "epoch": 0.30413484329734003, "grad_norm": 3.0824718475341797, "learning_rate": 3.48024756386621e-05, "loss": 1.0949, "step": 11548 }, { "epoch": 0.3041611798788517, "grad_norm": 2.6962482929229736, "learning_rate": 3.4801158809586515e-05, "loss": 2.256, "step": 11549 }, { "epoch": 0.30418751646036346, "grad_norm": 1.5778225660324097, "learning_rate": 3.479984198051093e-05, "loss": 1.3628, "step": 11550 }, { "epoch": 0.30421385304187515, "grad_norm": 1.6850782632827759, "learning_rate": 3.4798525151435346e-05, "loss": 0.9326, "step": 11551 }, { "epoch": 0.3042401896233869, "grad_norm": 3.4709932804107666, "learning_rate": 3.479720832235976e-05, "loss": 1.3931, "step": 11552 }, { "epoch": 0.3042665262048986, "grad_norm": 3.33630108833313, "learning_rate": 3.479589149328417e-05, "loss": 1.1745, "step": 11553 }, { "epoch": 0.3042928627864103, "grad_norm": 2.5795185565948486, "learning_rate": 3.479457466420859e-05, "loss": 1.827, "step": 11554 }, { "epoch": 0.30431919936792207, "grad_norm": 2.360293388366699, "learning_rate": 3.4793257835133e-05, "loss": 2.0577, "step": 11555 }, { 
"epoch": 0.30434553594943375, "grad_norm": 1.9041166305541992, "learning_rate": 3.479194100605742e-05, "loss": 1.77, "step": 11556 }, { "epoch": 0.3043718725309455, "grad_norm": 4.251124858856201, "learning_rate": 3.4790624176981826e-05, "loss": 0.6899, "step": 11557 }, { "epoch": 0.3043982091124572, "grad_norm": 4.271191596984863, "learning_rate": 3.478930734790624e-05, "loss": 0.4294, "step": 11558 }, { "epoch": 0.30442454569396893, "grad_norm": 2.47845721244812, "learning_rate": 3.478799051883066e-05, "loss": 0.8647, "step": 11559 }, { "epoch": 0.3044508822754806, "grad_norm": 2.1287505626678467, "learning_rate": 3.478667368975507e-05, "loss": 0.6819, "step": 11560 }, { "epoch": 0.30447721885699236, "grad_norm": 2.419715642929077, "learning_rate": 3.478535686067949e-05, "loss": 2.5048, "step": 11561 }, { "epoch": 0.3045035554385041, "grad_norm": 2.0600006580352783, "learning_rate": 3.47840400316039e-05, "loss": 1.7711, "step": 11562 }, { "epoch": 0.3045298920200158, "grad_norm": 1.9151105880737305, "learning_rate": 3.478272320252831e-05, "loss": 1.4715, "step": 11563 }, { "epoch": 0.30455622860152753, "grad_norm": 3.447761297225952, "learning_rate": 3.478140637345273e-05, "loss": 1.1135, "step": 11564 }, { "epoch": 0.3045825651830392, "grad_norm": 2.368546962738037, "learning_rate": 3.4780089544377144e-05, "loss": 2.203, "step": 11565 }, { "epoch": 0.30460890176455097, "grad_norm": 1.8937956094741821, "learning_rate": 3.477877271530155e-05, "loss": 1.6095, "step": 11566 }, { "epoch": 0.30463523834606265, "grad_norm": 1.9551299810409546, "learning_rate": 3.477745588622597e-05, "loss": 1.8678, "step": 11567 }, { "epoch": 0.3046615749275744, "grad_norm": 2.2647898197174072, "learning_rate": 3.4776139057150384e-05, "loss": 2.5686, "step": 11568 }, { "epoch": 0.30468791150908614, "grad_norm": 4.3162102699279785, "learning_rate": 3.47748222280748e-05, "loss": 1.5623, "step": 11569 }, { "epoch": 0.30471424809059783, "grad_norm": 3.346144437789917, "learning_rate": 
3.4773505398999215e-05, "loss": 1.1588, "step": 11570 }, { "epoch": 0.30474058467210957, "grad_norm": 3.9300930500030518, "learning_rate": 3.4772188569923624e-05, "loss": 0.7208, "step": 11571 }, { "epoch": 0.30476692125362126, "grad_norm": 1.9246171712875366, "learning_rate": 3.477087174084804e-05, "loss": 1.6903, "step": 11572 }, { "epoch": 0.304793257835133, "grad_norm": 3.4693009853363037, "learning_rate": 3.4769554911772455e-05, "loss": 1.4383, "step": 11573 }, { "epoch": 0.30481959441664475, "grad_norm": 2.248051404953003, "learning_rate": 3.476823808269687e-05, "loss": 1.1415, "step": 11574 }, { "epoch": 0.30484593099815643, "grad_norm": 4.649141311645508, "learning_rate": 3.476692125362128e-05, "loss": 1.0032, "step": 11575 }, { "epoch": 0.3048722675796682, "grad_norm": 4.173797130584717, "learning_rate": 3.4765604424545696e-05, "loss": 1.318, "step": 11576 }, { "epoch": 0.30489860416117986, "grad_norm": 1.853887677192688, "learning_rate": 3.4764287595470104e-05, "loss": 2.9107, "step": 11577 }, { "epoch": 0.3049249407426916, "grad_norm": 3.66133189201355, "learning_rate": 3.476297076639453e-05, "loss": 2.4301, "step": 11578 }, { "epoch": 0.3049512773242033, "grad_norm": 2.828031301498413, "learning_rate": 3.4761653937318936e-05, "loss": 1.98, "step": 11579 }, { "epoch": 0.30497761390571504, "grad_norm": 2.647550344467163, "learning_rate": 3.476033710824335e-05, "loss": 1.2626, "step": 11580 }, { "epoch": 0.3050039504872268, "grad_norm": 3.1275625228881836, "learning_rate": 3.475902027916777e-05, "loss": 1.2381, "step": 11581 }, { "epoch": 0.30503028706873847, "grad_norm": 2.6373543739318848, "learning_rate": 3.4757703450092176e-05, "loss": 0.8409, "step": 11582 }, { "epoch": 0.3050566236502502, "grad_norm": 2.427901029586792, "learning_rate": 3.47563866210166e-05, "loss": 1.2942, "step": 11583 }, { "epoch": 0.3050829602317619, "grad_norm": 2.0694851875305176, "learning_rate": 3.475506979194101e-05, "loss": 2.3386, "step": 11584 }, { "epoch": 
0.30510929681327364, "grad_norm": 5.559150695800781, "learning_rate": 3.475375296286542e-05, "loss": 1.0728, "step": 11585 }, { "epoch": 0.30513563339478533, "grad_norm": 1.7264204025268555, "learning_rate": 3.475243613378983e-05, "loss": 1.8654, "step": 11586 }, { "epoch": 0.3051619699762971, "grad_norm": 1.5841379165649414, "learning_rate": 3.4751119304714254e-05, "loss": 2.2088, "step": 11587 }, { "epoch": 0.3051883065578088, "grad_norm": 5.1678032875061035, "learning_rate": 3.474980247563866e-05, "loss": 1.5628, "step": 11588 }, { "epoch": 0.3052146431393205, "grad_norm": 2.632446765899658, "learning_rate": 3.474848564656308e-05, "loss": 2.0084, "step": 11589 }, { "epoch": 0.30524097972083225, "grad_norm": 3.017375946044922, "learning_rate": 3.4747168817487494e-05, "loss": 1.7709, "step": 11590 }, { "epoch": 0.30526731630234394, "grad_norm": 2.235577344894409, "learning_rate": 3.47458519884119e-05, "loss": 1.1135, "step": 11591 }, { "epoch": 0.3052936528838557, "grad_norm": 1.7186636924743652, "learning_rate": 3.4744535159336325e-05, "loss": 0.7838, "step": 11592 }, { "epoch": 0.30531998946536737, "grad_norm": 1.8294529914855957, "learning_rate": 3.4743218330260734e-05, "loss": 0.4327, "step": 11593 }, { "epoch": 0.3053463260468791, "grad_norm": 4.340508460998535, "learning_rate": 3.474190150118515e-05, "loss": 0.7785, "step": 11594 }, { "epoch": 0.30537266262839086, "grad_norm": 2.3846845626831055, "learning_rate": 3.474058467210956e-05, "loss": 1.774, "step": 11595 }, { "epoch": 0.30539899920990254, "grad_norm": 2.962155342102051, "learning_rate": 3.4739267843033974e-05, "loss": 1.6014, "step": 11596 }, { "epoch": 0.3054253357914143, "grad_norm": 1.905815839767456, "learning_rate": 3.473795101395839e-05, "loss": 1.3828, "step": 11597 }, { "epoch": 0.305451672372926, "grad_norm": 2.779789686203003, "learning_rate": 3.4736634184882805e-05, "loss": 0.5156, "step": 11598 }, { "epoch": 0.3054780089544377, "grad_norm": 1.7144635915756226, "learning_rate": 
3.473531735580722e-05, "loss": 1.6641, "step": 11599 }, { "epoch": 0.3055043455359494, "grad_norm": 2.321122169494629, "learning_rate": 3.473400052673163e-05, "loss": 1.8675, "step": 11600 }, { "epoch": 0.30553068211746115, "grad_norm": 1.7349375486373901, "learning_rate": 3.4732683697656045e-05, "loss": 1.6778, "step": 11601 }, { "epoch": 0.3055570186989729, "grad_norm": 3.526191473007202, "learning_rate": 3.473136686858046e-05, "loss": 1.0013, "step": 11602 }, { "epoch": 0.3055833552804846, "grad_norm": 3.1799354553222656, "learning_rate": 3.4730050039504876e-05, "loss": 1.6148, "step": 11603 }, { "epoch": 0.3056096918619963, "grad_norm": 5.7747368812561035, "learning_rate": 3.4728733210429285e-05, "loss": 1.5668, "step": 11604 }, { "epoch": 0.305636028443508, "grad_norm": 4.5626349449157715, "learning_rate": 3.47274163813537e-05, "loss": 1.7021, "step": 11605 }, { "epoch": 0.30566236502501976, "grad_norm": 3.611142635345459, "learning_rate": 3.4726099552278116e-05, "loss": 1.0385, "step": 11606 }, { "epoch": 0.3056887016065315, "grad_norm": 2.088200092315674, "learning_rate": 3.472478272320253e-05, "loss": 2.1357, "step": 11607 }, { "epoch": 0.3057150381880432, "grad_norm": 1.9994417428970337, "learning_rate": 3.472346589412695e-05, "loss": 0.4828, "step": 11608 }, { "epoch": 0.30574137476955493, "grad_norm": 2.1813161373138428, "learning_rate": 3.4722149065051356e-05, "loss": 2.2199, "step": 11609 }, { "epoch": 0.3057677113510666, "grad_norm": 2.301156520843506, "learning_rate": 3.472083223597577e-05, "loss": 0.5075, "step": 11610 }, { "epoch": 0.30579404793257836, "grad_norm": 3.0289649963378906, "learning_rate": 3.471951540690019e-05, "loss": 1.7263, "step": 11611 }, { "epoch": 0.30582038451409005, "grad_norm": 2.1083133220672607, "learning_rate": 3.47181985778246e-05, "loss": 2.0567, "step": 11612 }, { "epoch": 0.3058467210956018, "grad_norm": 3.003511905670166, "learning_rate": 3.471688174874901e-05, "loss": 1.9288, "step": 11613 }, { "epoch": 
0.30587305767711354, "grad_norm": 2.906557083129883, "learning_rate": 3.471556491967343e-05, "loss": 1.253, "step": 11614 }, { "epoch": 0.3058993942586252, "grad_norm": 2.2326910495758057, "learning_rate": 3.471424809059784e-05, "loss": 2.158, "step": 11615 }, { "epoch": 0.30592573084013697, "grad_norm": 2.362025260925293, "learning_rate": 3.471293126152226e-05, "loss": 1.4417, "step": 11616 }, { "epoch": 0.30595206742164865, "grad_norm": 6.059136867523193, "learning_rate": 3.4711614432446674e-05, "loss": 0.779, "step": 11617 }, { "epoch": 0.3059784040031604, "grad_norm": 1.6216179132461548, "learning_rate": 3.471029760337108e-05, "loss": 0.7152, "step": 11618 }, { "epoch": 0.3060047405846721, "grad_norm": 1.8468326330184937, "learning_rate": 3.47089807742955e-05, "loss": 1.939, "step": 11619 }, { "epoch": 0.30603107716618383, "grad_norm": 2.2648494243621826, "learning_rate": 3.4707663945219914e-05, "loss": 0.7205, "step": 11620 }, { "epoch": 0.3060574137476956, "grad_norm": 2.741974115371704, "learning_rate": 3.470634711614433e-05, "loss": 0.9894, "step": 11621 }, { "epoch": 0.30608375032920726, "grad_norm": 1.7147393226623535, "learning_rate": 3.470503028706874e-05, "loss": 1.8034, "step": 11622 }, { "epoch": 0.306110086910719, "grad_norm": 1.8979274034500122, "learning_rate": 3.4703713457993154e-05, "loss": 2.1463, "step": 11623 }, { "epoch": 0.3061364234922307, "grad_norm": 7.656367301940918, "learning_rate": 3.470239662891756e-05, "loss": 2.3868, "step": 11624 }, { "epoch": 0.30616276007374243, "grad_norm": 1.894753336906433, "learning_rate": 3.4701079799841985e-05, "loss": 1.416, "step": 11625 }, { "epoch": 0.3061890966552541, "grad_norm": 2.048351526260376, "learning_rate": 3.4699762970766394e-05, "loss": 1.7095, "step": 11626 }, { "epoch": 0.30621543323676587, "grad_norm": 3.1628544330596924, "learning_rate": 3.469844614169081e-05, "loss": 1.5224, "step": 11627 }, { "epoch": 0.3062417698182776, "grad_norm": 1.853131890296936, "learning_rate": 
3.4697129312615225e-05, "loss": 1.2695, "step": 11628 }, { "epoch": 0.3062681063997893, "grad_norm": 3.5873470306396484, "learning_rate": 3.4695812483539634e-05, "loss": 2.2628, "step": 11629 }, { "epoch": 0.30629444298130104, "grad_norm": 2.3380956649780273, "learning_rate": 3.4694495654464056e-05, "loss": 2.4333, "step": 11630 }, { "epoch": 0.30632077956281273, "grad_norm": 2.702810049057007, "learning_rate": 3.4693178825388465e-05, "loss": 1.7219, "step": 11631 }, { "epoch": 0.30634711614432447, "grad_norm": 1.5835646390914917, "learning_rate": 3.469186199631288e-05, "loss": 1.5005, "step": 11632 }, { "epoch": 0.30637345272583616, "grad_norm": 3.7774085998535156, "learning_rate": 3.469054516723729e-05, "loss": 1.76, "step": 11633 }, { "epoch": 0.3063997893073479, "grad_norm": 4.889803886413574, "learning_rate": 3.4689228338161705e-05, "loss": 0.8578, "step": 11634 }, { "epoch": 0.30642612588885965, "grad_norm": 2.427330732345581, "learning_rate": 3.468791150908612e-05, "loss": 1.3588, "step": 11635 }, { "epoch": 0.30645246247037133, "grad_norm": 1.8620880842208862, "learning_rate": 3.4686594680010536e-05, "loss": 1.6097, "step": 11636 }, { "epoch": 0.3064787990518831, "grad_norm": 1.6033748388290405, "learning_rate": 3.468527785093495e-05, "loss": 1.4988, "step": 11637 }, { "epoch": 0.30650513563339477, "grad_norm": 3.518514633178711, "learning_rate": 3.468396102185936e-05, "loss": 1.7217, "step": 11638 }, { "epoch": 0.3065314722149065, "grad_norm": 1.8962196111679077, "learning_rate": 3.468264419278378e-05, "loss": 1.7625, "step": 11639 }, { "epoch": 0.30655780879641825, "grad_norm": 2.0571370124816895, "learning_rate": 3.468132736370819e-05, "loss": 2.5235, "step": 11640 }, { "epoch": 0.30658414537792994, "grad_norm": 1.7881693840026855, "learning_rate": 3.468001053463261e-05, "loss": 1.7532, "step": 11641 }, { "epoch": 0.3066104819594417, "grad_norm": 7.386948108673096, "learning_rate": 3.4678693705557017e-05, "loss": 1.8551, "step": 11642 }, { "epoch": 
0.30663681854095337, "grad_norm": 1.7662712335586548, "learning_rate": 3.467737687648143e-05, "loss": 2.0711, "step": 11643 }, { "epoch": 0.3066631551224651, "grad_norm": 1.8915596008300781, "learning_rate": 3.467606004740585e-05, "loss": 1.4657, "step": 11644 }, { "epoch": 0.3066894917039768, "grad_norm": 4.346502780914307, "learning_rate": 3.467474321833026e-05, "loss": 1.7115, "step": 11645 }, { "epoch": 0.30671582828548855, "grad_norm": 4.3162031173706055, "learning_rate": 3.467342638925468e-05, "loss": 1.4795, "step": 11646 }, { "epoch": 0.3067421648670003, "grad_norm": 3.1514456272125244, "learning_rate": 3.467210956017909e-05, "loss": 1.6011, "step": 11647 }, { "epoch": 0.306768501448512, "grad_norm": 1.5485625267028809, "learning_rate": 3.46707927311035e-05, "loss": 1.8156, "step": 11648 }, { "epoch": 0.3067948380300237, "grad_norm": 1.799734115600586, "learning_rate": 3.466947590202792e-05, "loss": 1.9392, "step": 11649 }, { "epoch": 0.3068211746115354, "grad_norm": 3.8530821800231934, "learning_rate": 3.4668159072952335e-05, "loss": 1.6469, "step": 11650 }, { "epoch": 0.30684751119304715, "grad_norm": 1.5377084016799927, "learning_rate": 3.466684224387674e-05, "loss": 1.6315, "step": 11651 }, { "epoch": 0.30687384777455884, "grad_norm": 2.5824930667877197, "learning_rate": 3.466552541480116e-05, "loss": 1.3066, "step": 11652 }, { "epoch": 0.3069001843560706, "grad_norm": 1.6987619400024414, "learning_rate": 3.4664208585725575e-05, "loss": 1.6432, "step": 11653 }, { "epoch": 0.3069265209375823, "grad_norm": 1.9125723838806152, "learning_rate": 3.466289175664999e-05, "loss": 1.8698, "step": 11654 }, { "epoch": 0.306952857519094, "grad_norm": 2.4199650287628174, "learning_rate": 3.4661574927574406e-05, "loss": 1.7335, "step": 11655 }, { "epoch": 0.30697919410060576, "grad_norm": 1.516898512840271, "learning_rate": 3.4660258098498815e-05, "loss": 1.7069, "step": 11656 }, { "epoch": 0.30700553068211744, "grad_norm": 2.0510690212249756, "learning_rate": 
3.465894126942323e-05, "loss": 1.6581, "step": 11657 }, { "epoch": 0.3070318672636292, "grad_norm": 2.3611223697662354, "learning_rate": 3.4657624440347646e-05, "loss": 0.9748, "step": 11658 }, { "epoch": 0.3070582038451409, "grad_norm": 2.9073562622070312, "learning_rate": 3.465630761127206e-05, "loss": 1.6822, "step": 11659 }, { "epoch": 0.3070845404266526, "grad_norm": 2.41892409324646, "learning_rate": 3.465499078219647e-05, "loss": 0.5274, "step": 11660 }, { "epoch": 0.30711087700816436, "grad_norm": 2.74349045753479, "learning_rate": 3.4653673953120886e-05, "loss": 1.6235, "step": 11661 }, { "epoch": 0.30713721358967605, "grad_norm": 2.5569379329681396, "learning_rate": 3.46523571240453e-05, "loss": 1.648, "step": 11662 }, { "epoch": 0.3071635501711878, "grad_norm": 2.395657539367676, "learning_rate": 3.465104029496972e-05, "loss": 1.8136, "step": 11663 }, { "epoch": 0.3071898867526995, "grad_norm": 3.543126106262207, "learning_rate": 3.464972346589413e-05, "loss": 0.8016, "step": 11664 }, { "epoch": 0.3072162233342112, "grad_norm": 1.9433414936065674, "learning_rate": 3.464840663681854e-05, "loss": 2.0382, "step": 11665 }, { "epoch": 0.3072425599157229, "grad_norm": 4.567250728607178, "learning_rate": 3.464708980774296e-05, "loss": 1.3142, "step": 11666 }, { "epoch": 0.30726889649723466, "grad_norm": 2.566758871078491, "learning_rate": 3.4645772978667366e-05, "loss": 2.0811, "step": 11667 }, { "epoch": 0.3072952330787464, "grad_norm": 2.4919593334198, "learning_rate": 3.464445614959179e-05, "loss": 1.4652, "step": 11668 }, { "epoch": 0.3073215696602581, "grad_norm": 1.6978918313980103, "learning_rate": 3.46431393205162e-05, "loss": 2.0075, "step": 11669 }, { "epoch": 0.30734790624176983, "grad_norm": 3.3036160469055176, "learning_rate": 3.464182249144061e-05, "loss": 2.373, "step": 11670 }, { "epoch": 0.3073742428232815, "grad_norm": 3.3027424812316895, "learning_rate": 3.464050566236502e-05, "loss": 1.166, "step": 11671 }, { "epoch": 0.30740057940479326, 
"grad_norm": 1.8507441282272339, "learning_rate": 3.4639188833289444e-05, "loss": 1.8131, "step": 11672 }, { "epoch": 0.307426915986305, "grad_norm": 3.1358394622802734, "learning_rate": 3.463787200421386e-05, "loss": 1.8126, "step": 11673 }, { "epoch": 0.3074532525678167, "grad_norm": 3.907683849334717, "learning_rate": 3.463655517513827e-05, "loss": 1.7927, "step": 11674 }, { "epoch": 0.30747958914932844, "grad_norm": 2.0035784244537354, "learning_rate": 3.4635238346062684e-05, "loss": 1.9221, "step": 11675 }, { "epoch": 0.3075059257308401, "grad_norm": 1.9177042245864868, "learning_rate": 3.463392151698709e-05, "loss": 1.5778, "step": 11676 }, { "epoch": 0.30753226231235187, "grad_norm": 3.768989086151123, "learning_rate": 3.4632604687911515e-05, "loss": 0.9479, "step": 11677 }, { "epoch": 0.30755859889386356, "grad_norm": 3.2747578620910645, "learning_rate": 3.4631287858835924e-05, "loss": 1.4946, "step": 11678 }, { "epoch": 0.3075849354753753, "grad_norm": 1.6375569105148315, "learning_rate": 3.462997102976034e-05, "loss": 2.1016, "step": 11679 }, { "epoch": 0.30761127205688704, "grad_norm": 2.23073410987854, "learning_rate": 3.462865420068475e-05, "loss": 1.6804, "step": 11680 }, { "epoch": 0.30763760863839873, "grad_norm": 2.1874592304229736, "learning_rate": 3.4627337371609164e-05, "loss": 0.4643, "step": 11681 }, { "epoch": 0.3076639452199105, "grad_norm": 1.587695837020874, "learning_rate": 3.462602054253358e-05, "loss": 1.3702, "step": 11682 }, { "epoch": 0.30769028180142216, "grad_norm": 3.9395804405212402, "learning_rate": 3.4624703713457995e-05, "loss": 1.0728, "step": 11683 }, { "epoch": 0.3077166183829339, "grad_norm": 1.6745870113372803, "learning_rate": 3.462338688438241e-05, "loss": 0.7248, "step": 11684 }, { "epoch": 0.3077429549644456, "grad_norm": 2.6220033168792725, "learning_rate": 3.462207005530682e-05, "loss": 1.2804, "step": 11685 }, { "epoch": 0.30776929154595734, "grad_norm": 1.884389042854309, "learning_rate": 3.462075322623124e-05, 
"loss": 2.0532, "step": 11686 }, { "epoch": 0.3077956281274691, "grad_norm": 3.220917224884033, "learning_rate": 3.461943639715565e-05, "loss": 1.6703, "step": 11687 }, { "epoch": 0.30782196470898077, "grad_norm": 1.9681662321090698, "learning_rate": 3.4618119568080066e-05, "loss": 1.8496, "step": 11688 }, { "epoch": 0.3078483012904925, "grad_norm": 1.8144863843917847, "learning_rate": 3.4616802739004475e-05, "loss": 1.5183, "step": 11689 }, { "epoch": 0.3078746378720042, "grad_norm": 2.708662509918213, "learning_rate": 3.461548590992889e-05, "loss": 2.3922, "step": 11690 }, { "epoch": 0.30790097445351594, "grad_norm": 3.191736936569214, "learning_rate": 3.4614169080853306e-05, "loss": 1.3397, "step": 11691 }, { "epoch": 0.30792731103502763, "grad_norm": 1.8826956748962402, "learning_rate": 3.461285225177772e-05, "loss": 1.8414, "step": 11692 }, { "epoch": 0.30795364761653937, "grad_norm": 3.622114658355713, "learning_rate": 3.461153542270214e-05, "loss": 1.5846, "step": 11693 }, { "epoch": 0.3079799841980511, "grad_norm": 1.8765538930892944, "learning_rate": 3.4610218593626546e-05, "loss": 1.5439, "step": 11694 }, { "epoch": 0.3080063207795628, "grad_norm": 1.8673611879348755, "learning_rate": 3.460890176455096e-05, "loss": 2.4222, "step": 11695 }, { "epoch": 0.30803265736107455, "grad_norm": 1.5874181985855103, "learning_rate": 3.460758493547538e-05, "loss": 1.9228, "step": 11696 }, { "epoch": 0.30805899394258623, "grad_norm": 2.460047483444214, "learning_rate": 3.460626810639979e-05, "loss": 1.2548, "step": 11697 }, { "epoch": 0.308085330524098, "grad_norm": 2.4794962406158447, "learning_rate": 3.46049512773242e-05, "loss": 1.5382, "step": 11698 }, { "epoch": 0.30811166710560967, "grad_norm": 2.0717644691467285, "learning_rate": 3.460363444824862e-05, "loss": 1.8282, "step": 11699 }, { "epoch": 0.3081380036871214, "grad_norm": 2.1402463912963867, "learning_rate": 3.460231761917303e-05, "loss": 1.457, "step": 11700 }, { "epoch": 0.30816434026863315, "grad_norm": 
2.433180570602417, "learning_rate": 3.460100079009745e-05, "loss": 1.6213, "step": 11701 }, { "epoch": 0.30819067685014484, "grad_norm": 2.736400842666626, "learning_rate": 3.4599683961021864e-05, "loss": 0.968, "step": 11702 }, { "epoch": 0.3082170134316566, "grad_norm": 2.3250274658203125, "learning_rate": 3.459836713194627e-05, "loss": 1.9955, "step": 11703 }, { "epoch": 0.30824335001316827, "grad_norm": 1.7911103963851929, "learning_rate": 3.459705030287069e-05, "loss": 1.6743, "step": 11704 }, { "epoch": 0.30826968659468, "grad_norm": 2.7280426025390625, "learning_rate": 3.4595733473795104e-05, "loss": 1.3697, "step": 11705 }, { "epoch": 0.30829602317619176, "grad_norm": 3.187476873397827, "learning_rate": 3.459441664471952e-05, "loss": 0.7183, "step": 11706 }, { "epoch": 0.30832235975770345, "grad_norm": 2.6914868354797363, "learning_rate": 3.459309981564393e-05, "loss": 1.1978, "step": 11707 }, { "epoch": 0.3083486963392152, "grad_norm": 3.1083109378814697, "learning_rate": 3.4591782986568344e-05, "loss": 1.9836, "step": 11708 }, { "epoch": 0.3083750329207269, "grad_norm": 2.3045692443847656, "learning_rate": 3.459046615749276e-05, "loss": 1.3302, "step": 11709 }, { "epoch": 0.3084013695022386, "grad_norm": 3.2504324913024902, "learning_rate": 3.4589149328417176e-05, "loss": 1.9067, "step": 11710 }, { "epoch": 0.3084277060837503, "grad_norm": 1.6680676937103271, "learning_rate": 3.458783249934159e-05, "loss": 2.6122, "step": 11711 }, { "epoch": 0.30845404266526205, "grad_norm": 1.7473156452178955, "learning_rate": 3.4586515670266e-05, "loss": 2.3533, "step": 11712 }, { "epoch": 0.3084803792467738, "grad_norm": 3.5321526527404785, "learning_rate": 3.4585198841190416e-05, "loss": 1.2481, "step": 11713 }, { "epoch": 0.3085067158282855, "grad_norm": 2.50329852104187, "learning_rate": 3.4583882012114824e-05, "loss": 1.6297, "step": 11714 }, { "epoch": 0.3085330524097972, "grad_norm": 2.940936803817749, "learning_rate": 3.458256518303925e-05, "loss": 1.9327, 
"step": 11715 }, { "epoch": 0.3085593889913089, "grad_norm": 3.2888314723968506, "learning_rate": 3.4581248353963656e-05, "loss": 0.7863, "step": 11716 }, { "epoch": 0.30858572557282066, "grad_norm": 2.1054883003234863, "learning_rate": 3.457993152488807e-05, "loss": 2.4678, "step": 11717 }, { "epoch": 0.30861206215433235, "grad_norm": 3.3386595249176025, "learning_rate": 3.457861469581249e-05, "loss": 0.3001, "step": 11718 }, { "epoch": 0.3086383987358441, "grad_norm": 2.0013539791107178, "learning_rate": 3.45772978667369e-05, "loss": 2.0131, "step": 11719 }, { "epoch": 0.30866473531735583, "grad_norm": 2.938347339630127, "learning_rate": 3.457598103766132e-05, "loss": 0.8582, "step": 11720 }, { "epoch": 0.3086910718988675, "grad_norm": 1.8072912693023682, "learning_rate": 3.457466420858573e-05, "loss": 0.5579, "step": 11721 }, { "epoch": 0.30871740848037926, "grad_norm": 1.6535812616348267, "learning_rate": 3.457334737951014e-05, "loss": 2.0589, "step": 11722 }, { "epoch": 0.30874374506189095, "grad_norm": 2.8924241065979004, "learning_rate": 3.457203055043455e-05, "loss": 1.8899, "step": 11723 }, { "epoch": 0.3087700816434027, "grad_norm": 3.0967254638671875, "learning_rate": 3.4570713721358974e-05, "loss": 1.9702, "step": 11724 }, { "epoch": 0.3087964182249144, "grad_norm": 1.8100147247314453, "learning_rate": 3.456939689228338e-05, "loss": 1.3615, "step": 11725 }, { "epoch": 0.3088227548064261, "grad_norm": 3.3585965633392334, "learning_rate": 3.45680800632078e-05, "loss": 1.3715, "step": 11726 }, { "epoch": 0.30884909138793787, "grad_norm": 1.8501466512680054, "learning_rate": 3.456676323413221e-05, "loss": 1.6813, "step": 11727 }, { "epoch": 0.30887542796944956, "grad_norm": 1.6948282718658447, "learning_rate": 3.456544640505662e-05, "loss": 1.5062, "step": 11728 }, { "epoch": 0.3089017645509613, "grad_norm": 3.161344289779663, "learning_rate": 3.456412957598104e-05, "loss": 1.7149, "step": 11729 }, { "epoch": 0.308928101132473, "grad_norm": 
2.444887638092041, "learning_rate": 3.4562812746905454e-05, "loss": 1.9189, "step": 11730 }, { "epoch": 0.30895443771398473, "grad_norm": 1.7201979160308838, "learning_rate": 3.456149591782987e-05, "loss": 2.4303, "step": 11731 }, { "epoch": 0.3089807742954964, "grad_norm": 1.5144646167755127, "learning_rate": 3.456017908875428e-05, "loss": 1.898, "step": 11732 }, { "epoch": 0.30900711087700816, "grad_norm": 2.360762596130371, "learning_rate": 3.4558862259678694e-05, "loss": 2.447, "step": 11733 }, { "epoch": 0.3090334474585199, "grad_norm": 1.8019884824752808, "learning_rate": 3.455754543060311e-05, "loss": 2.1675, "step": 11734 }, { "epoch": 0.3090597840400316, "grad_norm": 2.088752269744873, "learning_rate": 3.4556228601527525e-05, "loss": 1.6127, "step": 11735 }, { "epoch": 0.30908612062154334, "grad_norm": null, "learning_rate": 3.4556228601527525e-05, "loss": 1.6811, "step": 11736 }, { "epoch": 0.309112457203055, "grad_norm": 1.9876090288162231, "learning_rate": 3.4554911772451934e-05, "loss": 1.4072, "step": 11737 }, { "epoch": 0.30913879378456677, "grad_norm": 1.7965960502624512, "learning_rate": 3.455359494337635e-05, "loss": 1.9114, "step": 11738 }, { "epoch": 0.30916513036607846, "grad_norm": 8.034951210021973, "learning_rate": 3.4552278114300765e-05, "loss": 1.8327, "step": 11739 }, { "epoch": 0.3091914669475902, "grad_norm": 1.7249491214752197, "learning_rate": 3.455096128522518e-05, "loss": 1.6869, "step": 11740 }, { "epoch": 0.30921780352910194, "grad_norm": 3.2184062004089355, "learning_rate": 3.4549644456149596e-05, "loss": 1.1492, "step": 11741 }, { "epoch": 0.30924414011061363, "grad_norm": 1.6110557317733765, "learning_rate": 3.4548327627074005e-05, "loss": 1.6734, "step": 11742 }, { "epoch": 0.3092704766921254, "grad_norm": 2.355536460876465, "learning_rate": 3.454701079799842e-05, "loss": 1.1599, "step": 11743 }, { "epoch": 0.30929681327363706, "grad_norm": 1.6065218448638916, "learning_rate": 3.4545693968922836e-05, "loss": 2.1427, "step": 
11744 }, { "epoch": 0.3093231498551488, "grad_norm": 2.143044948577881, "learning_rate": 3.454437713984725e-05, "loss": 1.647, "step": 11745 }, { "epoch": 0.30934948643666055, "grad_norm": 2.4601407051086426, "learning_rate": 3.454306031077166e-05, "loss": 1.7834, "step": 11746 }, { "epoch": 0.30937582301817224, "grad_norm": 2.9811441898345947, "learning_rate": 3.4541743481696076e-05, "loss": 1.9454, "step": 11747 }, { "epoch": 0.309402159599684, "grad_norm": 1.672289490699768, "learning_rate": 3.454042665262049e-05, "loss": 2.2854, "step": 11748 }, { "epoch": 0.30942849618119567, "grad_norm": 4.177724838256836, "learning_rate": 3.453910982354491e-05, "loss": 2.0358, "step": 11749 }, { "epoch": 0.3094548327627074, "grad_norm": 1.5616108179092407, "learning_rate": 3.453779299446932e-05, "loss": 2.1984, "step": 11750 }, { "epoch": 0.3094811693442191, "grad_norm": 3.5323145389556885, "learning_rate": 3.453647616539373e-05, "loss": 0.4883, "step": 11751 }, { "epoch": 0.30950750592573084, "grad_norm": 2.3145110607147217, "learning_rate": 3.453515933631815e-05, "loss": 0.9525, "step": 11752 }, { "epoch": 0.3095338425072426, "grad_norm": 1.8923053741455078, "learning_rate": 3.453384250724256e-05, "loss": 0.687, "step": 11753 }, { "epoch": 0.3095601790887543, "grad_norm": 2.0831992626190186, "learning_rate": 3.453252567816698e-05, "loss": 1.9662, "step": 11754 }, { "epoch": 0.309586515670266, "grad_norm": 3.045464515686035, "learning_rate": 3.453120884909139e-05, "loss": 1.7909, "step": 11755 }, { "epoch": 0.3096128522517777, "grad_norm": 2.918975591659546, "learning_rate": 3.45298920200158e-05, "loss": 0.8279, "step": 11756 }, { "epoch": 0.30963918883328945, "grad_norm": 6.363572120666504, "learning_rate": 3.452857519094022e-05, "loss": 2.8167, "step": 11757 }, { "epoch": 0.30966552541480113, "grad_norm": 2.8602516651153564, "learning_rate": 3.4527258361864634e-05, "loss": 0.9209, "step": 11758 }, { "epoch": 0.3096918619963129, "grad_norm": 2.3194499015808105, 
"learning_rate": 3.452594153278905e-05, "loss": 1.5712, "step": 11759 }, { "epoch": 0.3097181985778246, "grad_norm": 1.7898188829421997, "learning_rate": 3.452462470371346e-05, "loss": 1.7326, "step": 11760 }, { "epoch": 0.3097445351593363, "grad_norm": 2.186042308807373, "learning_rate": 3.4523307874637874e-05, "loss": 2.3141, "step": 11761 }, { "epoch": 0.30977087174084805, "grad_norm": 2.2598307132720947, "learning_rate": 3.452199104556228e-05, "loss": 1.4974, "step": 11762 }, { "epoch": 0.30979720832235974, "grad_norm": 2.0029029846191406, "learning_rate": 3.4520674216486705e-05, "loss": 1.0925, "step": 11763 }, { "epoch": 0.3098235449038715, "grad_norm": 2.267841339111328, "learning_rate": 3.4519357387411114e-05, "loss": 2.3699, "step": 11764 }, { "epoch": 0.30984988148538317, "grad_norm": 1.9859501123428345, "learning_rate": 3.451804055833553e-05, "loss": 2.1064, "step": 11765 }, { "epoch": 0.3098762180668949, "grad_norm": 1.9182748794555664, "learning_rate": 3.4516723729259945e-05, "loss": 2.3219, "step": 11766 }, { "epoch": 0.30990255464840666, "grad_norm": 1.7138694524765015, "learning_rate": 3.4515406900184354e-05, "loss": 1.3656, "step": 11767 }, { "epoch": 0.30992889122991835, "grad_norm": 2.314241886138916, "learning_rate": 3.4514090071108777e-05, "loss": 1.4219, "step": 11768 }, { "epoch": 0.3099552278114301, "grad_norm": 4.386459827423096, "learning_rate": 3.4512773242033185e-05, "loss": 1.7324, "step": 11769 }, { "epoch": 0.3099815643929418, "grad_norm": 1.7235097885131836, "learning_rate": 3.45114564129576e-05, "loss": 1.9367, "step": 11770 }, { "epoch": 0.3100079009744535, "grad_norm": 3.6874759197235107, "learning_rate": 3.451013958388201e-05, "loss": 1.3602, "step": 11771 }, { "epoch": 0.3100342375559652, "grad_norm": 2.5705721378326416, "learning_rate": 3.450882275480643e-05, "loss": 1.8409, "step": 11772 }, { "epoch": 0.31006057413747695, "grad_norm": 1.8111506700515747, "learning_rate": 3.450750592573084e-05, "loss": 1.9093, "step": 11773 }, 
{ "epoch": 0.3100869107189887, "grad_norm": 2.2576937675476074, "learning_rate": 3.4506189096655257e-05, "loss": 1.5432, "step": 11774 }, { "epoch": 0.3101132473005004, "grad_norm": 1.5894970893859863, "learning_rate": 3.4504872267579665e-05, "loss": 1.3978, "step": 11775 }, { "epoch": 0.3101395838820121, "grad_norm": 2.517726182937622, "learning_rate": 3.450355543850408e-05, "loss": 2.1917, "step": 11776 }, { "epoch": 0.3101659204635238, "grad_norm": 1.5919756889343262, "learning_rate": 3.4502238609428497e-05, "loss": 2.3847, "step": 11777 }, { "epoch": 0.31019225704503556, "grad_norm": 2.426539659500122, "learning_rate": 3.450092178035291e-05, "loss": 1.6726, "step": 11778 }, { "epoch": 0.3102185936265473, "grad_norm": 3.7654507160186768, "learning_rate": 3.449960495127733e-05, "loss": 1.026, "step": 11779 }, { "epoch": 0.310244930208059, "grad_norm": 1.7406479120254517, "learning_rate": 3.4498288122201737e-05, "loss": 2.3115, "step": 11780 }, { "epoch": 0.31027126678957073, "grad_norm": 3.847716808319092, "learning_rate": 3.449697129312615e-05, "loss": 1.7675, "step": 11781 }, { "epoch": 0.3102976033710824, "grad_norm": 2.8625152111053467, "learning_rate": 3.449565446405057e-05, "loss": 1.4775, "step": 11782 }, { "epoch": 0.31032393995259416, "grad_norm": 1.706165075302124, "learning_rate": 3.449433763497498e-05, "loss": 1.8195, "step": 11783 }, { "epoch": 0.31035027653410585, "grad_norm": 3.5676729679107666, "learning_rate": 3.449302080589939e-05, "loss": 1.9714, "step": 11784 }, { "epoch": 0.3103766131156176, "grad_norm": 1.5900192260742188, "learning_rate": 3.449170397682381e-05, "loss": 1.1815, "step": 11785 }, { "epoch": 0.31040294969712934, "grad_norm": 2.0018367767333984, "learning_rate": 3.449038714774822e-05, "loss": 1.9062, "step": 11786 }, { "epoch": 0.310429286278641, "grad_norm": 2.294731855392456, "learning_rate": 3.448907031867264e-05, "loss": 1.867, "step": 11787 }, { "epoch": 0.31045562286015277, "grad_norm": 2.2968533039093018, "learning_rate": 
3.4487753489597055e-05, "loss": 0.3297, "step": 11788 }, { "epoch": 0.31048195944166446, "grad_norm": 2.0658628940582275, "learning_rate": 3.4486436660521463e-05, "loss": 1.1412, "step": 11789 }, { "epoch": 0.3105082960231762, "grad_norm": 2.0782594680786133, "learning_rate": 3.448511983144588e-05, "loss": 1.4775, "step": 11790 }, { "epoch": 0.3105346326046879, "grad_norm": 3.3005943298339844, "learning_rate": 3.4483803002370295e-05, "loss": 0.987, "step": 11791 }, { "epoch": 0.31056096918619963, "grad_norm": 1.907914638519287, "learning_rate": 3.448248617329471e-05, "loss": 2.0493, "step": 11792 }, { "epoch": 0.3105873057677114, "grad_norm": 2.1023778915405273, "learning_rate": 3.448116934421912e-05, "loss": 2.0876, "step": 11793 }, { "epoch": 0.31061364234922306, "grad_norm": 3.929502487182617, "learning_rate": 3.4479852515143535e-05, "loss": 2.0217, "step": 11794 }, { "epoch": 0.3106399789307348, "grad_norm": 1.5751134157180786, "learning_rate": 3.447853568606795e-05, "loss": 1.2519, "step": 11795 }, { "epoch": 0.3106663155122465, "grad_norm": 3.5574424266815186, "learning_rate": 3.4477218856992366e-05, "loss": 1.86, "step": 11796 }, { "epoch": 0.31069265209375824, "grad_norm": 6.201610565185547, "learning_rate": 3.447590202791678e-05, "loss": 0.9862, "step": 11797 }, { "epoch": 0.3107189886752699, "grad_norm": 1.842467188835144, "learning_rate": 3.447458519884119e-05, "loss": 2.1066, "step": 11798 }, { "epoch": 0.31074532525678167, "grad_norm": 1.8609633445739746, "learning_rate": 3.4473268369765606e-05, "loss": 1.3968, "step": 11799 }, { "epoch": 0.3107716618382934, "grad_norm": 3.8730084896087646, "learning_rate": 3.447195154069002e-05, "loss": 1.5647, "step": 11800 }, { "epoch": 0.3107979984198051, "grad_norm": 2.2613658905029297, "learning_rate": 3.447063471161444e-05, "loss": 1.6678, "step": 11801 }, { "epoch": 0.31082433500131684, "grad_norm": 1.7192591428756714, "learning_rate": 3.4469317882538846e-05, "loss": 1.9186, "step": 11802 }, { "epoch": 
0.31085067158282853, "grad_norm": 1.6857656240463257, "learning_rate": 3.446800105346326e-05, "loss": 1.2556, "step": 11803 }, { "epoch": 0.3108770081643403, "grad_norm": 1.9221400022506714, "learning_rate": 3.446668422438768e-05, "loss": 2.2573, "step": 11804 }, { "epoch": 0.31090334474585196, "grad_norm": 2.819298505783081, "learning_rate": 3.446536739531209e-05, "loss": 1.0863, "step": 11805 }, { "epoch": 0.3109296813273637, "grad_norm": 2.186988115310669, "learning_rate": 3.446405056623651e-05, "loss": 2.1734, "step": 11806 }, { "epoch": 0.31095601790887545, "grad_norm": 2.2983505725860596, "learning_rate": 3.446273373716092e-05, "loss": 0.8218, "step": 11807 }, { "epoch": 0.31098235449038714, "grad_norm": 2.208716630935669, "learning_rate": 3.446141690808533e-05, "loss": 0.2124, "step": 11808 }, { "epoch": 0.3110086910718989, "grad_norm": 1.9391090869903564, "learning_rate": 3.446010007900974e-05, "loss": 2.4383, "step": 11809 }, { "epoch": 0.31103502765341057, "grad_norm": 2.9998319149017334, "learning_rate": 3.4458783249934164e-05, "loss": 1.4809, "step": 11810 }, { "epoch": 0.3110613642349223, "grad_norm": 2.185720682144165, "learning_rate": 3.445746642085857e-05, "loss": 1.7887, "step": 11811 }, { "epoch": 0.31108770081643405, "grad_norm": 3.1644480228424072, "learning_rate": 3.445614959178299e-05, "loss": 1.8198, "step": 11812 }, { "epoch": 0.31111403739794574, "grad_norm": 1.8772892951965332, "learning_rate": 3.4454832762707404e-05, "loss": 1.6651, "step": 11813 }, { "epoch": 0.3111403739794575, "grad_norm": 1.7570931911468506, "learning_rate": 3.445351593363181e-05, "loss": 1.8823, "step": 11814 }, { "epoch": 0.3111667105609692, "grad_norm": 3.7592968940734863, "learning_rate": 3.4452199104556235e-05, "loss": 1.3793, "step": 11815 }, { "epoch": 0.3111930471424809, "grad_norm": 3.3698315620422363, "learning_rate": 3.4450882275480644e-05, "loss": 0.8628, "step": 11816 }, { "epoch": 0.3112193837239926, "grad_norm": 1.9161858558654785, "learning_rate": 
3.444956544640506e-05, "loss": 1.4254, "step": 11817 }, { "epoch": 0.31124572030550435, "grad_norm": 2.1362297534942627, "learning_rate": 3.444824861732947e-05, "loss": 1.7526, "step": 11818 }, { "epoch": 0.3112720568870161, "grad_norm": 2.8416190147399902, "learning_rate": 3.444693178825389e-05, "loss": 0.9944, "step": 11819 }, { "epoch": 0.3112983934685278, "grad_norm": 3.3196158409118652, "learning_rate": 3.44456149591783e-05, "loss": 1.3767, "step": 11820 }, { "epoch": 0.3113247300500395, "grad_norm": 2.473939895629883, "learning_rate": 3.4444298130102715e-05, "loss": 1.2947, "step": 11821 }, { "epoch": 0.3113510666315512, "grad_norm": 3.148097276687622, "learning_rate": 3.444298130102713e-05, "loss": 1.1591, "step": 11822 }, { "epoch": 0.31137740321306295, "grad_norm": 1.6426162719726562, "learning_rate": 3.444166447195154e-05, "loss": 1.7937, "step": 11823 }, { "epoch": 0.31140373979457464, "grad_norm": 1.8858553171157837, "learning_rate": 3.444034764287596e-05, "loss": 1.8173, "step": 11824 }, { "epoch": 0.3114300763760864, "grad_norm": 2.10609769821167, "learning_rate": 3.443903081380037e-05, "loss": 0.9571, "step": 11825 }, { "epoch": 0.3114564129575981, "grad_norm": 2.3406460285186768, "learning_rate": 3.4437713984724786e-05, "loss": 1.3258, "step": 11826 }, { "epoch": 0.3114827495391098, "grad_norm": 1.8892433643341064, "learning_rate": 3.4436397155649195e-05, "loss": 1.8035, "step": 11827 }, { "epoch": 0.31150908612062156, "grad_norm": 5.53434419631958, "learning_rate": 3.443508032657361e-05, "loss": 1.5868, "step": 11828 }, { "epoch": 0.31153542270213325, "grad_norm": 3.3683269023895264, "learning_rate": 3.4433763497498026e-05, "loss": 1.6449, "step": 11829 }, { "epoch": 0.311561759283645, "grad_norm": 2.940948963165283, "learning_rate": 3.443244666842244e-05, "loss": 0.8559, "step": 11830 }, { "epoch": 0.3115880958651567, "grad_norm": 4.314951419830322, "learning_rate": 3.443112983934685e-05, "loss": 2.3859, "step": 11831 }, { "epoch": 
0.3116144324466684, "grad_norm": 3.335306167602539, "learning_rate": 3.4429813010271266e-05, "loss": 1.2916, "step": 11832 }, { "epoch": 0.31164076902818016, "grad_norm": 2.280775308609009, "learning_rate": 3.442849618119568e-05, "loss": 2.2159, "step": 11833 }, { "epoch": 0.31166710560969185, "grad_norm": 1.8198018074035645, "learning_rate": 3.44271793521201e-05, "loss": 1.974, "step": 11834 }, { "epoch": 0.3116934421912036, "grad_norm": 3.4338302612304688, "learning_rate": 3.442586252304451e-05, "loss": 1.9795, "step": 11835 }, { "epoch": 0.3117197787727153, "grad_norm": 2.032616138458252, "learning_rate": 3.442454569396892e-05, "loss": 1.5163, "step": 11836 }, { "epoch": 0.311746115354227, "grad_norm": 2.127558946609497, "learning_rate": 3.442322886489334e-05, "loss": 1.5309, "step": 11837 }, { "epoch": 0.3117724519357387, "grad_norm": 4.492937088012695, "learning_rate": 3.442191203581775e-05, "loss": 1.8677, "step": 11838 }, { "epoch": 0.31179878851725046, "grad_norm": 3.4926528930664062, "learning_rate": 3.442059520674217e-05, "loss": 1.7471, "step": 11839 }, { "epoch": 0.3118251250987622, "grad_norm": 2.8821794986724854, "learning_rate": 3.441927837766658e-05, "loss": 1.5013, "step": 11840 }, { "epoch": 0.3118514616802739, "grad_norm": 2.3584909439086914, "learning_rate": 3.441796154859099e-05, "loss": 1.8278, "step": 11841 }, { "epoch": 0.31187779826178563, "grad_norm": 2.28352427482605, "learning_rate": 3.441664471951541e-05, "loss": 1.4235, "step": 11842 }, { "epoch": 0.3119041348432973, "grad_norm": 2.3072593212127686, "learning_rate": 3.4415327890439824e-05, "loss": 2.0197, "step": 11843 }, { "epoch": 0.31193047142480906, "grad_norm": 3.4011669158935547, "learning_rate": 3.441401106136424e-05, "loss": 1.3139, "step": 11844 }, { "epoch": 0.3119568080063208, "grad_norm": 3.5458455085754395, "learning_rate": 3.441269423228865e-05, "loss": 1.8922, "step": 11845 }, { "epoch": 0.3119831445878325, "grad_norm": 2.366835355758667, "learning_rate": 
3.4411377403213064e-05, "loss": 1.7695, "step": 11846 }, { "epoch": 0.31200948116934424, "grad_norm": 2.1246747970581055, "learning_rate": 3.441006057413747e-05, "loss": 1.7576, "step": 11847 }, { "epoch": 0.3120358177508559, "grad_norm": 5.915126323699951, "learning_rate": 3.4408743745061896e-05, "loss": 2.3039, "step": 11848 }, { "epoch": 0.31206215433236767, "grad_norm": 2.065683364868164, "learning_rate": 3.4407426915986304e-05, "loss": 1.4285, "step": 11849 }, { "epoch": 0.31208849091387936, "grad_norm": 2.0415496826171875, "learning_rate": 3.440611008691072e-05, "loss": 1.6201, "step": 11850 }, { "epoch": 0.3121148274953911, "grad_norm": 3.2185983657836914, "learning_rate": 3.4404793257835136e-05, "loss": 0.7069, "step": 11851 }, { "epoch": 0.31214116407690284, "grad_norm": 2.1158273220062256, "learning_rate": 3.440347642875955e-05, "loss": 1.7578, "step": 11852 }, { "epoch": 0.31216750065841453, "grad_norm": 1.7559586763381958, "learning_rate": 3.440215959968397e-05, "loss": 1.6289, "step": 11853 }, { "epoch": 0.3121938372399263, "grad_norm": 3.821509838104248, "learning_rate": 3.4400842770608376e-05, "loss": 1.9091, "step": 11854 }, { "epoch": 0.31222017382143796, "grad_norm": 5.022719860076904, "learning_rate": 3.439952594153279e-05, "loss": 1.1796, "step": 11855 }, { "epoch": 0.3122465104029497, "grad_norm": 2.2521557807922363, "learning_rate": 3.43982091124572e-05, "loss": 1.6953, "step": 11856 }, { "epoch": 0.3122728469844614, "grad_norm": 3.792581081390381, "learning_rate": 3.439689228338162e-05, "loss": 1.7778, "step": 11857 }, { "epoch": 0.31229918356597314, "grad_norm": 1.7502951622009277, "learning_rate": 3.439557545430603e-05, "loss": 1.9176, "step": 11858 }, { "epoch": 0.3123255201474849, "grad_norm": 1.73971426486969, "learning_rate": 3.439425862523045e-05, "loss": 1.8883, "step": 11859 }, { "epoch": 0.31235185672899657, "grad_norm": 2.5772743225097656, "learning_rate": 3.439294179615486e-05, "loss": 1.3508, "step": 11860 }, { "epoch": 
0.3123781933105083, "grad_norm": 2.179870128631592, "learning_rate": 3.439162496707927e-05, "loss": 1.5847, "step": 11861 }, { "epoch": 0.31240452989202, "grad_norm": 2.0155560970306396, "learning_rate": 3.4390308138003694e-05, "loss": 1.4594, "step": 11862 }, { "epoch": 0.31243086647353174, "grad_norm": 3.4546289443969727, "learning_rate": 3.43889913089281e-05, "loss": 2.5174, "step": 11863 }, { "epoch": 0.31245720305504343, "grad_norm": 1.9563692808151245, "learning_rate": 3.438767447985252e-05, "loss": 1.3438, "step": 11864 }, { "epoch": 0.3124835396365552, "grad_norm": 1.6062841415405273, "learning_rate": 3.438635765077693e-05, "loss": 1.6144, "step": 11865 }, { "epoch": 0.3125098762180669, "grad_norm": 1.5504190921783447, "learning_rate": 3.438504082170135e-05, "loss": 1.5992, "step": 11866 }, { "epoch": 0.3125362127995786, "grad_norm": 2.0044147968292236, "learning_rate": 3.438372399262576e-05, "loss": 1.8003, "step": 11867 }, { "epoch": 0.31256254938109035, "grad_norm": 2.6076581478118896, "learning_rate": 3.4382407163550174e-05, "loss": 2.0047, "step": 11868 }, { "epoch": 0.31258888596260204, "grad_norm": 1.7118685245513916, "learning_rate": 3.438109033447459e-05, "loss": 1.7316, "step": 11869 }, { "epoch": 0.3126152225441138, "grad_norm": 2.617780923843384, "learning_rate": 3.4379773505399e-05, "loss": 1.9247, "step": 11870 }, { "epoch": 0.31264155912562547, "grad_norm": 1.7714011669158936, "learning_rate": 3.437845667632342e-05, "loss": 2.5514, "step": 11871 }, { "epoch": 0.3126678957071372, "grad_norm": 1.7230901718139648, "learning_rate": 3.437713984724783e-05, "loss": 2.4104, "step": 11872 }, { "epoch": 0.31269423228864895, "grad_norm": 2.9638564586639404, "learning_rate": 3.4375823018172245e-05, "loss": 2.4147, "step": 11873 }, { "epoch": 0.31272056887016064, "grad_norm": 4.746386528015137, "learning_rate": 3.4374506189096654e-05, "loss": 2.0206, "step": 11874 }, { "epoch": 0.3127469054516724, "grad_norm": 1.94407057762146, "learning_rate": 
3.437318936002107e-05, "loss": 2.5493, "step": 11875 }, { "epoch": 0.3127732420331841, "grad_norm": 1.7084060907363892, "learning_rate": 3.4371872530945485e-05, "loss": 1.3561, "step": 11876 }, { "epoch": 0.3127995786146958, "grad_norm": 2.9552388191223145, "learning_rate": 3.43705557018699e-05, "loss": 1.8076, "step": 11877 }, { "epoch": 0.3128259151962075, "grad_norm": 5.004602432250977, "learning_rate": 3.436923887279431e-05, "loss": 0.7048, "step": 11878 }, { "epoch": 0.31285225177771925, "grad_norm": 4.178142070770264, "learning_rate": 3.4367922043718725e-05, "loss": 0.4902, "step": 11879 }, { "epoch": 0.312878588359231, "grad_norm": 3.9015309810638428, "learning_rate": 3.436660521464314e-05, "loss": 0.5962, "step": 11880 }, { "epoch": 0.3129049249407427, "grad_norm": 2.054222345352173, "learning_rate": 3.4365288385567556e-05, "loss": 1.7808, "step": 11881 }, { "epoch": 0.3129312615222544, "grad_norm": 4.6398725509643555, "learning_rate": 3.436397155649197e-05, "loss": 2.1166, "step": 11882 }, { "epoch": 0.3129575981037661, "grad_norm": 2.2833259105682373, "learning_rate": 3.436265472741638e-05, "loss": 2.2202, "step": 11883 }, { "epoch": 0.31298393468527785, "grad_norm": 1.8960294723510742, "learning_rate": 3.4361337898340796e-05, "loss": 1.922, "step": 11884 }, { "epoch": 0.3130102712667896, "grad_norm": 2.1866676807403564, "learning_rate": 3.436002106926521e-05, "loss": 1.8204, "step": 11885 }, { "epoch": 0.3130366078483013, "grad_norm": 3.9158682823181152, "learning_rate": 3.435870424018963e-05, "loss": 0.6397, "step": 11886 }, { "epoch": 0.31306294442981303, "grad_norm": 1.6878708600997925, "learning_rate": 3.4357387411114036e-05, "loss": 1.9255, "step": 11887 }, { "epoch": 0.3130892810113247, "grad_norm": 1.8021827936172485, "learning_rate": 3.435607058203845e-05, "loss": 2.3026, "step": 11888 }, { "epoch": 0.31311561759283646, "grad_norm": 2.618227243423462, "learning_rate": 3.435475375296287e-05, "loss": 1.9953, "step": 11889 }, { "epoch": 
0.31314195417434815, "grad_norm": 3.2791929244995117, "learning_rate": 3.435343692388728e-05, "loss": 1.9422, "step": 11890 }, { "epoch": 0.3131682907558599, "grad_norm": 2.137345790863037, "learning_rate": 3.43521200948117e-05, "loss": 2.328, "step": 11891 }, { "epoch": 0.31319462733737163, "grad_norm": 3.2285099029541016, "learning_rate": 3.435080326573611e-05, "loss": 1.289, "step": 11892 }, { "epoch": 0.3132209639188833, "grad_norm": 1.742781639099121, "learning_rate": 3.434948643666052e-05, "loss": 2.2633, "step": 11893 }, { "epoch": 0.31324730050039507, "grad_norm": 1.6923943758010864, "learning_rate": 3.434816960758493e-05, "loss": 2.1714, "step": 11894 }, { "epoch": 0.31327363708190675, "grad_norm": 3.1474180221557617, "learning_rate": 3.4346852778509354e-05, "loss": 1.8829, "step": 11895 }, { "epoch": 0.3132999736634185, "grad_norm": 7.885355472564697, "learning_rate": 3.434553594943376e-05, "loss": 2.1793, "step": 11896 }, { "epoch": 0.3133263102449302, "grad_norm": 3.4797866344451904, "learning_rate": 3.434421912035818e-05, "loss": 0.8664, "step": 11897 }, { "epoch": 0.3133526468264419, "grad_norm": 2.1376543045043945, "learning_rate": 3.4342902291282594e-05, "loss": 1.5543, "step": 11898 }, { "epoch": 0.31337898340795367, "grad_norm": 1.9626778364181519, "learning_rate": 3.434158546220701e-05, "loss": 1.6245, "step": 11899 }, { "epoch": 0.31340531998946536, "grad_norm": 1.8312442302703857, "learning_rate": 3.4340268633131425e-05, "loss": 1.9471, "step": 11900 }, { "epoch": 0.3134316565709771, "grad_norm": 1.6652500629425049, "learning_rate": 3.4338951804055834e-05, "loss": 2.1866, "step": 11901 }, { "epoch": 0.3134579931524888, "grad_norm": 2.36041259765625, "learning_rate": 3.433763497498025e-05, "loss": 1.9075, "step": 11902 }, { "epoch": 0.31348432973400053, "grad_norm": 2.087564706802368, "learning_rate": 3.433631814590466e-05, "loss": 1.5581, "step": 11903 }, { "epoch": 0.3135106663155122, "grad_norm": 2.3994157314300537, "learning_rate": 
3.433500131682908e-05, "loss": 2.1008, "step": 11904 }, { "epoch": 0.31353700289702396, "grad_norm": 2.7171549797058105, "learning_rate": 3.433368448775349e-05, "loss": 2.3056, "step": 11905 }, { "epoch": 0.3135633394785357, "grad_norm": 3.2091329097747803, "learning_rate": 3.4332367658677905e-05, "loss": 1.7718, "step": 11906 }, { "epoch": 0.3135896760600474, "grad_norm": 2.0868232250213623, "learning_rate": 3.433105082960232e-05, "loss": 1.5704, "step": 11907 }, { "epoch": 0.31361601264155914, "grad_norm": 1.8223464488983154, "learning_rate": 3.432973400052673e-05, "loss": 1.6108, "step": 11908 }, { "epoch": 0.3136423492230708, "grad_norm": 1.64521324634552, "learning_rate": 3.432841717145115e-05, "loss": 1.6907, "step": 11909 }, { "epoch": 0.31366868580458257, "grad_norm": 3.0415732860565186, "learning_rate": 3.432710034237556e-05, "loss": 0.929, "step": 11910 }, { "epoch": 0.31369502238609426, "grad_norm": 4.400512218475342, "learning_rate": 3.4325783513299977e-05, "loss": 1.7874, "step": 11911 }, { "epoch": 0.313721358967606, "grad_norm": 1.7861953973770142, "learning_rate": 3.4324466684224385e-05, "loss": 2.0303, "step": 11912 }, { "epoch": 0.31374769554911774, "grad_norm": 1.600196361541748, "learning_rate": 3.43231498551488e-05, "loss": 1.6419, "step": 11913 }, { "epoch": 0.31377403213062943, "grad_norm": 2.017338991165161, "learning_rate": 3.4321833026073217e-05, "loss": 2.0561, "step": 11914 }, { "epoch": 0.3138003687121412, "grad_norm": 1.924303412437439, "learning_rate": 3.432051619699763e-05, "loss": 2.0066, "step": 11915 }, { "epoch": 0.31382670529365286, "grad_norm": 1.8211301565170288, "learning_rate": 3.431919936792205e-05, "loss": 0.4485, "step": 11916 }, { "epoch": 0.3138530418751646, "grad_norm": 2.6942644119262695, "learning_rate": 3.431788253884646e-05, "loss": 1.2617, "step": 11917 }, { "epoch": 0.31387937845667635, "grad_norm": 2.10634708404541, "learning_rate": 3.431656570977088e-05, "loss": 1.9003, "step": 11918 }, { "epoch": 
0.31390571503818804, "grad_norm": 2.9917962551116943, "learning_rate": 3.431524888069529e-05, "loss": 0.3855, "step": 11919 }, { "epoch": 0.3139320516196998, "grad_norm": 2.0381088256835938, "learning_rate": 3.4313932051619703e-05, "loss": 1.8381, "step": 11920 }, { "epoch": 0.31395838820121147, "grad_norm": 2.292893648147583, "learning_rate": 3.431261522254411e-05, "loss": 2.196, "step": 11921 }, { "epoch": 0.3139847247827232, "grad_norm": 3.7332489490509033, "learning_rate": 3.431129839346853e-05, "loss": 1.8974, "step": 11922 }, { "epoch": 0.3140110613642349, "grad_norm": 1.586669921875, "learning_rate": 3.4309981564392943e-05, "loss": 1.6117, "step": 11923 }, { "epoch": 0.31403739794574664, "grad_norm": 3.8536012172698975, "learning_rate": 3.430866473531736e-05, "loss": 1.6309, "step": 11924 }, { "epoch": 0.3140637345272584, "grad_norm": 2.281076669692993, "learning_rate": 3.4307347906241775e-05, "loss": 1.7698, "step": 11925 }, { "epoch": 0.3140900711087701, "grad_norm": 3.24711537361145, "learning_rate": 3.4306031077166183e-05, "loss": 2.044, "step": 11926 }, { "epoch": 0.3141164076902818, "grad_norm": 2.8330602645874023, "learning_rate": 3.43047142480906e-05, "loss": 2.0034, "step": 11927 }, { "epoch": 0.3141427442717935, "grad_norm": 2.4326963424682617, "learning_rate": 3.4303397419015015e-05, "loss": 0.4674, "step": 11928 }, { "epoch": 0.31416908085330525, "grad_norm": 1.8757840394973755, "learning_rate": 3.430208058993943e-05, "loss": 1.8203, "step": 11929 }, { "epoch": 0.31419541743481694, "grad_norm": 3.904343605041504, "learning_rate": 3.430076376086384e-05, "loss": 1.2149, "step": 11930 }, { "epoch": 0.3142217540163287, "grad_norm": 2.2272608280181885, "learning_rate": 3.4299446931788255e-05, "loss": 1.298, "step": 11931 }, { "epoch": 0.3142480905978404, "grad_norm": 2.268555164337158, "learning_rate": 3.429813010271267e-05, "loss": 2.1082, "step": 11932 }, { "epoch": 0.3142744271793521, "grad_norm": 2.5162017345428467, "learning_rate": 
3.4296813273637086e-05, "loss": 0.5128, "step": 11933 }, { "epoch": 0.31430076376086385, "grad_norm": 1.671085000038147, "learning_rate": 3.4295496444561495e-05, "loss": 2.1305, "step": 11934 }, { "epoch": 0.31432710034237554, "grad_norm": 2.587003231048584, "learning_rate": 3.429417961548591e-05, "loss": 2.2135, "step": 11935 }, { "epoch": 0.3143534369238873, "grad_norm": 3.799715757369995, "learning_rate": 3.4292862786410326e-05, "loss": 1.9254, "step": 11936 }, { "epoch": 0.314379773505399, "grad_norm": 2.330026388168335, "learning_rate": 3.429154595733474e-05, "loss": 2.338, "step": 11937 }, { "epoch": 0.3144061100869107, "grad_norm": 2.73518705368042, "learning_rate": 3.429022912825916e-05, "loss": 1.9821, "step": 11938 }, { "epoch": 0.31443244666842246, "grad_norm": 4.204094409942627, "learning_rate": 3.4288912299183566e-05, "loss": 1.0299, "step": 11939 }, { "epoch": 0.31445878324993415, "grad_norm": 2.7655587196350098, "learning_rate": 3.428759547010798e-05, "loss": 1.3402, "step": 11940 }, { "epoch": 0.3144851198314459, "grad_norm": 1.9417668581008911, "learning_rate": 3.428627864103239e-05, "loss": 1.8288, "step": 11941 }, { "epoch": 0.3145114564129576, "grad_norm": 1.6638494729995728, "learning_rate": 3.428496181195681e-05, "loss": 1.2098, "step": 11942 }, { "epoch": 0.3145377929944693, "grad_norm": 4.269766330718994, "learning_rate": 3.428364498288122e-05, "loss": 1.5998, "step": 11943 }, { "epoch": 0.314564129575981, "grad_norm": 1.4448368549346924, "learning_rate": 3.428232815380564e-05, "loss": 1.6042, "step": 11944 }, { "epoch": 0.31459046615749275, "grad_norm": 2.146402597427368, "learning_rate": 3.428101132473005e-05, "loss": 1.9682, "step": 11945 }, { "epoch": 0.3146168027390045, "grad_norm": 3.751169443130493, "learning_rate": 3.427969449565446e-05, "loss": 1.2187, "step": 11946 }, { "epoch": 0.3146431393205162, "grad_norm": 3.6288695335388184, "learning_rate": 3.4278377666578884e-05, "loss": 1.1246, "step": 11947 }, { "epoch": 
0.31466947590202793, "grad_norm": 2.1228652000427246, "learning_rate": 3.427706083750329e-05, "loss": 2.1966, "step": 11948 }, { "epoch": 0.3146958124835396, "grad_norm": 2.092362642288208, "learning_rate": 3.427574400842771e-05, "loss": 1.7767, "step": 11949 }, { "epoch": 0.31472214906505136, "grad_norm": 4.747443199157715, "learning_rate": 3.427442717935212e-05, "loss": 1.3056, "step": 11950 }, { "epoch": 0.3147484856465631, "grad_norm": 2.3654873371124268, "learning_rate": 3.427311035027654e-05, "loss": 1.5675, "step": 11951 }, { "epoch": 0.3147748222280748, "grad_norm": 1.8938640356063843, "learning_rate": 3.427179352120095e-05, "loss": 1.919, "step": 11952 }, { "epoch": 0.31480115880958653, "grad_norm": 1.851574420928955, "learning_rate": 3.4270476692125364e-05, "loss": 2.2255, "step": 11953 }, { "epoch": 0.3148274953910982, "grad_norm": 2.2016735076904297, "learning_rate": 3.426915986304978e-05, "loss": 1.7719, "step": 11954 }, { "epoch": 0.31485383197260997, "grad_norm": 2.4434311389923096, "learning_rate": 3.426784303397419e-05, "loss": 0.8764, "step": 11955 }, { "epoch": 0.31488016855412165, "grad_norm": 2.07375431060791, "learning_rate": 3.426652620489861e-05, "loss": 1.7581, "step": 11956 }, { "epoch": 0.3149065051356334, "grad_norm": 3.103745698928833, "learning_rate": 3.426520937582302e-05, "loss": 1.5745, "step": 11957 }, { "epoch": 0.31493284171714514, "grad_norm": 4.127291202545166, "learning_rate": 3.4263892546747435e-05, "loss": 1.4292, "step": 11958 }, { "epoch": 0.31495917829865683, "grad_norm": 2.5967373847961426, "learning_rate": 3.4262575717671844e-05, "loss": 1.8639, "step": 11959 }, { "epoch": 0.31498551488016857, "grad_norm": 3.1381006240844727, "learning_rate": 3.426125888859626e-05, "loss": 1.4107, "step": 11960 }, { "epoch": 0.31501185146168026, "grad_norm": 3.3036344051361084, "learning_rate": 3.4259942059520675e-05, "loss": 0.7293, "step": 11961 }, { "epoch": 0.315038188043192, "grad_norm": 1.9857488870620728, "learning_rate": 
3.425862523044509e-05, "loss": 0.3787, "step": 11962 }, { "epoch": 0.3150645246247037, "grad_norm": 3.5623269081115723, "learning_rate": 3.4257308401369506e-05, "loss": 1.3363, "step": 11963 }, { "epoch": 0.31509086120621543, "grad_norm": 1.5893114805221558, "learning_rate": 3.4255991572293915e-05, "loss": 1.3695, "step": 11964 }, { "epoch": 0.3151171977877272, "grad_norm": 1.6632758378982544, "learning_rate": 3.425467474321834e-05, "loss": 1.9521, "step": 11965 }, { "epoch": 0.31514353436923886, "grad_norm": 1.9022972583770752, "learning_rate": 3.4253357914142746e-05, "loss": 2.2308, "step": 11966 }, { "epoch": 0.3151698709507506, "grad_norm": 1.949738621711731, "learning_rate": 3.425204108506716e-05, "loss": 2.175, "step": 11967 }, { "epoch": 0.3151962075322623, "grad_norm": 1.9077039957046509, "learning_rate": 3.425072425599157e-05, "loss": 1.5597, "step": 11968 }, { "epoch": 0.31522254411377404, "grad_norm": 1.7058870792388916, "learning_rate": 3.4249407426915986e-05, "loss": 1.5192, "step": 11969 }, { "epoch": 0.3152488806952857, "grad_norm": 2.6986324787139893, "learning_rate": 3.42480905978404e-05, "loss": 0.3859, "step": 11970 }, { "epoch": 0.31527521727679747, "grad_norm": 1.5514413118362427, "learning_rate": 3.424677376876482e-05, "loss": 1.5685, "step": 11971 }, { "epoch": 0.3153015538583092, "grad_norm": 2.907836437225342, "learning_rate": 3.424545693968923e-05, "loss": 2.2402, "step": 11972 }, { "epoch": 0.3153278904398209, "grad_norm": 2.425527572631836, "learning_rate": 3.424414011061364e-05, "loss": 1.0907, "step": 11973 }, { "epoch": 0.31535422702133264, "grad_norm": 1.9716839790344238, "learning_rate": 3.424282328153806e-05, "loss": 0.5059, "step": 11974 }, { "epoch": 0.31538056360284433, "grad_norm": 2.9178848266601562, "learning_rate": 3.424150645246247e-05, "loss": 0.5166, "step": 11975 }, { "epoch": 0.3154069001843561, "grad_norm": 1.8250172138214111, "learning_rate": 3.424018962338689e-05, "loss": 1.7656, "step": 11976 }, { "epoch": 
0.31543323676586776, "grad_norm": 4.121735095977783, "learning_rate": 3.42388727943113e-05, "loss": 1.3751, "step": 11977 }, { "epoch": 0.3154595733473795, "grad_norm": 2.6907474994659424, "learning_rate": 3.423755596523571e-05, "loss": 1.9716, "step": 11978 }, { "epoch": 0.31548590992889125, "grad_norm": 3.6392195224761963, "learning_rate": 3.423623913616012e-05, "loss": 2.0938, "step": 11979 }, { "epoch": 0.31551224651040294, "grad_norm": 2.5202419757843018, "learning_rate": 3.4234922307084544e-05, "loss": 1.3766, "step": 11980 }, { "epoch": 0.3155385830919147, "grad_norm": 5.536961078643799, "learning_rate": 3.423360547800895e-05, "loss": 1.0434, "step": 11981 }, { "epoch": 0.31556491967342637, "grad_norm": 2.9225029945373535, "learning_rate": 3.423228864893337e-05, "loss": 0.6428, "step": 11982 }, { "epoch": 0.3155912562549381, "grad_norm": 1.761869192123413, "learning_rate": 3.4230971819857784e-05, "loss": 2.2562, "step": 11983 }, { "epoch": 0.31561759283644986, "grad_norm": 1.7251423597335815, "learning_rate": 3.42296549907822e-05, "loss": 1.9982, "step": 11984 }, { "epoch": 0.31564392941796154, "grad_norm": 1.9662408828735352, "learning_rate": 3.4228338161706616e-05, "loss": 1.976, "step": 11985 }, { "epoch": 0.3156702659994733, "grad_norm": 2.432746171951294, "learning_rate": 3.4227021332631024e-05, "loss": 1.0983, "step": 11986 }, { "epoch": 0.315696602580985, "grad_norm": 4.456958293914795, "learning_rate": 3.422570450355544e-05, "loss": 1.2475, "step": 11987 }, { "epoch": 0.3157229391624967, "grad_norm": 2.2281575202941895, "learning_rate": 3.422438767447985e-05, "loss": 1.646, "step": 11988 }, { "epoch": 0.3157492757440084, "grad_norm": 3.2517008781433105, "learning_rate": 3.422307084540427e-05, "loss": 0.9704, "step": 11989 }, { "epoch": 0.31577561232552015, "grad_norm": 3.9694466590881348, "learning_rate": 3.422175401632868e-05, "loss": 2.6364, "step": 11990 }, { "epoch": 0.3158019489070319, "grad_norm": 2.0546627044677734, "learning_rate": 
3.4220437187253096e-05, "loss": 1.3628, "step": 11991 }, { "epoch": 0.3158282854885436, "grad_norm": 2.669036388397217, "learning_rate": 3.421912035817751e-05, "loss": 1.8031, "step": 11992 }, { "epoch": 0.3158546220700553, "grad_norm": 1.6669118404388428, "learning_rate": 3.421780352910192e-05, "loss": 2.0616, "step": 11993 }, { "epoch": 0.315880958651567, "grad_norm": 2.085125207901001, "learning_rate": 3.421648670002634e-05, "loss": 1.5328, "step": 11994 }, { "epoch": 0.31590729523307876, "grad_norm": 3.967073917388916, "learning_rate": 3.421516987095075e-05, "loss": 1.2698, "step": 11995 }, { "epoch": 0.31593363181459044, "grad_norm": 3.3713738918304443, "learning_rate": 3.421385304187517e-05, "loss": 1.4386, "step": 11996 }, { "epoch": 0.3159599683961022, "grad_norm": 2.2872700691223145, "learning_rate": 3.4212536212799576e-05, "loss": 1.8076, "step": 11997 }, { "epoch": 0.31598630497761393, "grad_norm": 3.812849283218384, "learning_rate": 3.4211219383724e-05, "loss": 1.7878, "step": 11998 }, { "epoch": 0.3160126415591256, "grad_norm": 1.8914098739624023, "learning_rate": 3.420990255464841e-05, "loss": 1.7793, "step": 11999 }, { "epoch": 0.31603897814063736, "grad_norm": 2.5909225940704346, "learning_rate": 3.420858572557282e-05, "loss": 1.3282, "step": 12000 }, { "epoch": 0.31606531472214905, "grad_norm": 2.4622552394866943, "learning_rate": 3.420726889649724e-05, "loss": 1.7412, "step": 12001 }, { "epoch": 0.3160916513036608, "grad_norm": 2.063779354095459, "learning_rate": 3.420595206742165e-05, "loss": 1.5135, "step": 12002 }, { "epoch": 0.3161179878851725, "grad_norm": 2.2328691482543945, "learning_rate": 3.420463523834607e-05, "loss": 0.8068, "step": 12003 }, { "epoch": 0.3161443244666842, "grad_norm": 3.361863613128662, "learning_rate": 3.420331840927048e-05, "loss": 0.6206, "step": 12004 }, { "epoch": 0.31617066104819597, "grad_norm": 3.3704581260681152, "learning_rate": 3.4202001580194894e-05, "loss": 1.3984, "step": 12005 }, { "epoch": 
0.31619699762970765, "grad_norm": 2.329801082611084, "learning_rate": 3.42006847511193e-05, "loss": 0.8442, "step": 12006 }, { "epoch": 0.3162233342112194, "grad_norm": 1.9171693325042725, "learning_rate": 3.419936792204372e-05, "loss": 1.5431, "step": 12007 }, { "epoch": 0.3162496707927311, "grad_norm": 1.9286022186279297, "learning_rate": 3.4198051092968134e-05, "loss": 1.7693, "step": 12008 }, { "epoch": 0.31627600737424283, "grad_norm": 3.3286213874816895, "learning_rate": 3.419673426389255e-05, "loss": 0.8107, "step": 12009 }, { "epoch": 0.3163023439557545, "grad_norm": 1.7902685403823853, "learning_rate": 3.4195417434816965e-05, "loss": 1.6694, "step": 12010 }, { "epoch": 0.31632868053726626, "grad_norm": 3.36136794090271, "learning_rate": 3.4194100605741374e-05, "loss": 1.6546, "step": 12011 }, { "epoch": 0.316355017118778, "grad_norm": 1.8728545904159546, "learning_rate": 3.419278377666579e-05, "loss": 1.9547, "step": 12012 }, { "epoch": 0.3163813537002897, "grad_norm": 2.6027796268463135, "learning_rate": 3.4191466947590205e-05, "loss": 1.3122, "step": 12013 }, { "epoch": 0.31640769028180143, "grad_norm": 1.5420255661010742, "learning_rate": 3.419015011851462e-05, "loss": 2.2219, "step": 12014 }, { "epoch": 0.3164340268633131, "grad_norm": 1.9534305334091187, "learning_rate": 3.418883328943903e-05, "loss": 1.7044, "step": 12015 }, { "epoch": 0.31646036344482487, "grad_norm": 4.399682521820068, "learning_rate": 3.4187516460363445e-05, "loss": 1.5365, "step": 12016 }, { "epoch": 0.3164867000263366, "grad_norm": 2.8296594619750977, "learning_rate": 3.418619963128786e-05, "loss": 0.8142, "step": 12017 }, { "epoch": 0.3165130366078483, "grad_norm": 1.6229559183120728, "learning_rate": 3.4184882802212276e-05, "loss": 1.8636, "step": 12018 }, { "epoch": 0.31653937318936004, "grad_norm": 2.1545701026916504, "learning_rate": 3.418356597313669e-05, "loss": 1.4344, "step": 12019 }, { "epoch": 0.31656570977087173, "grad_norm": 1.4987447261810303, "learning_rate": 
3.41822491440611e-05, "loss": 0.4292, "step": 12020 }, { "epoch": 0.31659204635238347, "grad_norm": 1.6686434745788574, "learning_rate": 3.4180932314985516e-05, "loss": 1.278, "step": 12021 }, { "epoch": 0.31661838293389516, "grad_norm": 2.217627763748169, "learning_rate": 3.417961548590993e-05, "loss": 2.4288, "step": 12022 }, { "epoch": 0.3166447195154069, "grad_norm": 1.6432602405548096, "learning_rate": 3.417829865683435e-05, "loss": 1.5443, "step": 12023 }, { "epoch": 0.31667105609691865, "grad_norm": 2.990502119064331, "learning_rate": 3.4176981827758756e-05, "loss": 1.5581, "step": 12024 }, { "epoch": 0.31669739267843033, "grad_norm": 2.026674509048462, "learning_rate": 3.417566499868317e-05, "loss": 1.8248, "step": 12025 }, { "epoch": 0.3167237292599421, "grad_norm": 1.969262957572937, "learning_rate": 3.417434816960759e-05, "loss": 1.7076, "step": 12026 }, { "epoch": 0.31675006584145377, "grad_norm": 2.062495470046997, "learning_rate": 3.4173031340532e-05, "loss": 2.3502, "step": 12027 }, { "epoch": 0.3167764024229655, "grad_norm": 1.755882740020752, "learning_rate": 3.417171451145642e-05, "loss": 1.6885, "step": 12028 }, { "epoch": 0.3168027390044772, "grad_norm": 3.3108365535736084, "learning_rate": 3.417039768238083e-05, "loss": 0.4137, "step": 12029 }, { "epoch": 0.31682907558598894, "grad_norm": 6.628294944763184, "learning_rate": 3.416908085330524e-05, "loss": 1.0136, "step": 12030 }, { "epoch": 0.3168554121675007, "grad_norm": 1.9180766344070435, "learning_rate": 3.416776402422966e-05, "loss": 1.8115, "step": 12031 }, { "epoch": 0.31688174874901237, "grad_norm": 3.267291307449341, "learning_rate": 3.4166447195154074e-05, "loss": 1.4986, "step": 12032 }, { "epoch": 0.3169080853305241, "grad_norm": 2.7749853134155273, "learning_rate": 3.416513036607848e-05, "loss": 1.1238, "step": 12033 }, { "epoch": 0.3169344219120358, "grad_norm": 1.5342379808425903, "learning_rate": 3.41638135370029e-05, "loss": 1.2737, "step": 12034 }, { "epoch": 
0.31696075849354755, "grad_norm": 2.0451016426086426, "learning_rate": 3.416249670792731e-05, "loss": 1.9886, "step": 12035 }, { "epoch": 0.31698709507505923, "grad_norm": 2.403731107711792, "learning_rate": 3.416117987885173e-05, "loss": 1.4137, "step": 12036 }, { "epoch": 0.317013431656571, "grad_norm": 1.9101914167404175, "learning_rate": 3.415986304977614e-05, "loss": 1.4577, "step": 12037 }, { "epoch": 0.3170397682380827, "grad_norm": 2.213183641433716, "learning_rate": 3.4158546220700554e-05, "loss": 1.4521, "step": 12038 }, { "epoch": 0.3170661048195944, "grad_norm": 3.8314709663391113, "learning_rate": 3.415722939162497e-05, "loss": 2.6511, "step": 12039 }, { "epoch": 0.31709244140110615, "grad_norm": 3.5145654678344727, "learning_rate": 3.415591256254938e-05, "loss": 1.7947, "step": 12040 }, { "epoch": 0.31711877798261784, "grad_norm": 1.8209129571914673, "learning_rate": 3.41545957334738e-05, "loss": 1.7092, "step": 12041 }, { "epoch": 0.3171451145641296, "grad_norm": 2.524423837661743, "learning_rate": 3.415327890439821e-05, "loss": 1.6544, "step": 12042 }, { "epoch": 0.31717145114564127, "grad_norm": 1.9139906167984009, "learning_rate": 3.4151962075322625e-05, "loss": 1.6744, "step": 12043 }, { "epoch": 0.317197787727153, "grad_norm": 2.0966320037841797, "learning_rate": 3.4150645246247034e-05, "loss": 1.7232, "step": 12044 }, { "epoch": 0.31722412430866476, "grad_norm": 2.1557092666625977, "learning_rate": 3.414932841717145e-05, "loss": 1.8999, "step": 12045 }, { "epoch": 0.31725046089017644, "grad_norm": 4.261433124542236, "learning_rate": 3.4148011588095865e-05, "loss": 1.0017, "step": 12046 }, { "epoch": 0.3172767974716882, "grad_norm": 3.653890609741211, "learning_rate": 3.414669475902028e-05, "loss": 1.642, "step": 12047 }, { "epoch": 0.3173031340531999, "grad_norm": 1.5433707237243652, "learning_rate": 3.41453779299447e-05, "loss": 1.1799, "step": 12048 }, { "epoch": 0.3173294706347116, "grad_norm": 2.0163955688476562, "learning_rate": 
3.4144061100869105e-05, "loss": 1.7505, "step": 12049 }, { "epoch": 0.3173558072162233, "grad_norm": 2.1403865814208984, "learning_rate": 3.414274427179353e-05, "loss": 0.3914, "step": 12050 }, { "epoch": 0.31738214379773505, "grad_norm": 3.932182550430298, "learning_rate": 3.414142744271794e-05, "loss": 1.4982, "step": 12051 }, { "epoch": 0.3174084803792468, "grad_norm": 3.1841015815734863, "learning_rate": 3.414011061364235e-05, "loss": 1.6272, "step": 12052 }, { "epoch": 0.3174348169607585, "grad_norm": 1.9604413509368896, "learning_rate": 3.413879378456676e-05, "loss": 1.4913, "step": 12053 }, { "epoch": 0.3174611535422702, "grad_norm": 3.753220319747925, "learning_rate": 3.413747695549118e-05, "loss": 2.1964, "step": 12054 }, { "epoch": 0.3174874901237819, "grad_norm": 2.9770264625549316, "learning_rate": 3.413616012641559e-05, "loss": 1.4059, "step": 12055 }, { "epoch": 0.31751382670529366, "grad_norm": 2.0103933811187744, "learning_rate": 3.413484329734001e-05, "loss": 1.8425, "step": 12056 }, { "epoch": 0.3175401632868054, "grad_norm": 2.5481927394866943, "learning_rate": 3.4133526468264423e-05, "loss": 1.5227, "step": 12057 }, { "epoch": 0.3175664998683171, "grad_norm": 1.9941326379776, "learning_rate": 3.413220963918883e-05, "loss": 0.549, "step": 12058 }, { "epoch": 0.31759283644982883, "grad_norm": 2.102574586868286, "learning_rate": 3.413089281011325e-05, "loss": 1.8916, "step": 12059 }, { "epoch": 0.3176191730313405, "grad_norm": 2.3135499954223633, "learning_rate": 3.4129575981037663e-05, "loss": 2.2591, "step": 12060 }, { "epoch": 0.31764550961285226, "grad_norm": 2.7947845458984375, "learning_rate": 3.412825915196208e-05, "loss": 1.9145, "step": 12061 }, { "epoch": 0.31767184619436395, "grad_norm": 2.105344295501709, "learning_rate": 3.412694232288649e-05, "loss": 1.4037, "step": 12062 }, { "epoch": 0.3176981827758757, "grad_norm": 1.7814490795135498, "learning_rate": 3.4125625493810903e-05, "loss": 1.9385, "step": 12063 }, { "epoch": 
0.31772451935738744, "grad_norm": 3.6411960124969482, "learning_rate": 3.412430866473532e-05, "loss": 1.0843, "step": 12064 }, { "epoch": 0.3177508559388991, "grad_norm": 2.3417065143585205, "learning_rate": 3.4122991835659735e-05, "loss": 1.4668, "step": 12065 }, { "epoch": 0.31777719252041087, "grad_norm": 1.9404869079589844, "learning_rate": 3.412167500658415e-05, "loss": 1.8116, "step": 12066 }, { "epoch": 0.31780352910192256, "grad_norm": 1.628248929977417, "learning_rate": 3.412035817750856e-05, "loss": 0.3777, "step": 12067 }, { "epoch": 0.3178298656834343, "grad_norm": 3.219590663909912, "learning_rate": 3.4119041348432975e-05, "loss": 1.612, "step": 12068 }, { "epoch": 0.317856202264946, "grad_norm": 1.9707688093185425, "learning_rate": 3.411772451935739e-05, "loss": 1.6333, "step": 12069 }, { "epoch": 0.31788253884645773, "grad_norm": 3.203376293182373, "learning_rate": 3.4116407690281806e-05, "loss": 1.9735, "step": 12070 }, { "epoch": 0.3179088754279695, "grad_norm": 3.539792060852051, "learning_rate": 3.4115090861206215e-05, "loss": 0.2714, "step": 12071 }, { "epoch": 0.31793521200948116, "grad_norm": 2.378357410430908, "learning_rate": 3.411377403213063e-05, "loss": 1.9282, "step": 12072 }, { "epoch": 0.3179615485909929, "grad_norm": 2.246793746948242, "learning_rate": 3.4112457203055046e-05, "loss": 1.6198, "step": 12073 }, { "epoch": 0.3179878851725046, "grad_norm": 4.417208194732666, "learning_rate": 3.411114037397946e-05, "loss": 1.5774, "step": 12074 }, { "epoch": 0.31801422175401634, "grad_norm": 2.098876714706421, "learning_rate": 3.410982354490388e-05, "loss": 2.4549, "step": 12075 }, { "epoch": 0.318040558335528, "grad_norm": 2.2911365032196045, "learning_rate": 3.4108506715828286e-05, "loss": 2.2156, "step": 12076 }, { "epoch": 0.31806689491703977, "grad_norm": 1.9358553886413574, "learning_rate": 3.41071898867527e-05, "loss": 1.3755, "step": 12077 }, { "epoch": 0.3180932314985515, "grad_norm": 4.161898612976074, "learning_rate": 
3.410587305767711e-05, "loss": 1.6894, "step": 12078 }, { "epoch": 0.3181195680800632, "grad_norm": 2.434128999710083, "learning_rate": 3.410455622860153e-05, "loss": 1.7256, "step": 12079 }, { "epoch": 0.31814590466157494, "grad_norm": 1.7992404699325562, "learning_rate": 3.410323939952594e-05, "loss": 1.8533, "step": 12080 }, { "epoch": 0.31817224124308663, "grad_norm": 3.1368579864501953, "learning_rate": 3.410192257045036e-05, "loss": 0.865, "step": 12081 }, { "epoch": 0.31819857782459837, "grad_norm": 2.074144124984741, "learning_rate": 3.4100605741374766e-05, "loss": 2.236, "step": 12082 }, { "epoch": 0.31822491440611006, "grad_norm": 2.1697885990142822, "learning_rate": 3.409928891229919e-05, "loss": 1.3337, "step": 12083 }, { "epoch": 0.3182512509876218, "grad_norm": 4.024163246154785, "learning_rate": 3.40979720832236e-05, "loss": 0.4982, "step": 12084 }, { "epoch": 0.31827758756913355, "grad_norm": 1.7225251197814941, "learning_rate": 3.409665525414801e-05, "loss": 1.8736, "step": 12085 }, { "epoch": 0.31830392415064523, "grad_norm": 1.5807503461837769, "learning_rate": 3.409533842507243e-05, "loss": 1.2598, "step": 12086 }, { "epoch": 0.318330260732157, "grad_norm": 2.1448254585266113, "learning_rate": 3.409402159599684e-05, "loss": 1.9726, "step": 12087 }, { "epoch": 0.31835659731366867, "grad_norm": 2.3013803958892822, "learning_rate": 3.409270476692126e-05, "loss": 1.5291, "step": 12088 }, { "epoch": 0.3183829338951804, "grad_norm": 2.0223681926727295, "learning_rate": 3.409138793784567e-05, "loss": 1.9953, "step": 12089 }, { "epoch": 0.31840927047669215, "grad_norm": 2.174805164337158, "learning_rate": 3.4090071108770084e-05, "loss": 1.5407, "step": 12090 }, { "epoch": 0.31843560705820384, "grad_norm": 3.7747116088867188, "learning_rate": 3.408875427969449e-05, "loss": 0.6589, "step": 12091 }, { "epoch": 0.3184619436397156, "grad_norm": 3.278367042541504, "learning_rate": 3.408743745061891e-05, "loss": 1.9589, "step": 12092 }, { "epoch": 
0.31848828022122727, "grad_norm": 2.9114749431610107, "learning_rate": 3.4086120621543324e-05, "loss": 2.3347, "step": 12093 }, { "epoch": 0.318514616802739, "grad_norm": 1.9916917085647583, "learning_rate": 3.408480379246774e-05, "loss": 1.8849, "step": 12094 }, { "epoch": 0.3185409533842507, "grad_norm": 2.3800346851348877, "learning_rate": 3.4083486963392155e-05, "loss": 2.4327, "step": 12095 }, { "epoch": 0.31856728996576245, "grad_norm": 2.8650009632110596, "learning_rate": 3.4082170134316564e-05, "loss": 2.0588, "step": 12096 }, { "epoch": 0.3185936265472742, "grad_norm": 1.5561290979385376, "learning_rate": 3.4080853305240986e-05, "loss": 1.5646, "step": 12097 }, { "epoch": 0.3186199631287859, "grad_norm": 2.254415512084961, "learning_rate": 3.4079536476165395e-05, "loss": 1.4994, "step": 12098 }, { "epoch": 0.3186462997102976, "grad_norm": 1.9972496032714844, "learning_rate": 3.407821964708981e-05, "loss": 1.8567, "step": 12099 }, { "epoch": 0.3186726362918093, "grad_norm": 7.733311653137207, "learning_rate": 3.407690281801422e-05, "loss": 2.2662, "step": 12100 }, { "epoch": 0.31869897287332105, "grad_norm": 2.2288615703582764, "learning_rate": 3.4075585988938635e-05, "loss": 1.9325, "step": 12101 }, { "epoch": 0.31872530945483274, "grad_norm": 3.8718278408050537, "learning_rate": 3.407426915986305e-05, "loss": 1.1788, "step": 12102 }, { "epoch": 0.3187516460363445, "grad_norm": 2.4555227756500244, "learning_rate": 3.4072952330787466e-05, "loss": 1.8223, "step": 12103 }, { "epoch": 0.3187779826178562, "grad_norm": 1.5836929082870483, "learning_rate": 3.407163550171188e-05, "loss": 1.6193, "step": 12104 }, { "epoch": 0.3188043191993679, "grad_norm": 3.5536606311798096, "learning_rate": 3.407031867263629e-05, "loss": 0.9933, "step": 12105 }, { "epoch": 0.31883065578087966, "grad_norm": 2.322253942489624, "learning_rate": 3.4069001843560706e-05, "loss": 0.5696, "step": 12106 }, { "epoch": 0.31885699236239134, "grad_norm": 2.164285182952881, "learning_rate": 
3.406768501448512e-05, "loss": 2.3208, "step": 12107 }, { "epoch": 0.3188833289439031, "grad_norm": 1.9893524646759033, "learning_rate": 3.406636818540954e-05, "loss": 1.8024, "step": 12108 }, { "epoch": 0.3189096655254148, "grad_norm": 2.137017011642456, "learning_rate": 3.4065051356333946e-05, "loss": 1.5274, "step": 12109 }, { "epoch": 0.3189360021069265, "grad_norm": 1.9572263956069946, "learning_rate": 3.406373452725836e-05, "loss": 0.9218, "step": 12110 }, { "epoch": 0.31896233868843826, "grad_norm": 1.8778127431869507, "learning_rate": 3.406241769818278e-05, "loss": 1.9975, "step": 12111 }, { "epoch": 0.31898867526994995, "grad_norm": 2.643911361694336, "learning_rate": 3.406110086910719e-05, "loss": 1.7684, "step": 12112 }, { "epoch": 0.3190150118514617, "grad_norm": 1.903601884841919, "learning_rate": 3.405978404003161e-05, "loss": 1.8392, "step": 12113 }, { "epoch": 0.3190413484329734, "grad_norm": 1.7770432233810425, "learning_rate": 3.405846721095602e-05, "loss": 0.8937, "step": 12114 }, { "epoch": 0.3190676850144851, "grad_norm": 4.653463363647461, "learning_rate": 3.405715038188043e-05, "loss": 1.2333, "step": 12115 }, { "epoch": 0.3190940215959968, "grad_norm": 2.2294225692749023, "learning_rate": 3.405583355280485e-05, "loss": 1.3843, "step": 12116 }, { "epoch": 0.31912035817750856, "grad_norm": 1.8459888696670532, "learning_rate": 3.4054516723729264e-05, "loss": 0.6468, "step": 12117 }, { "epoch": 0.3191466947590203, "grad_norm": 2.8435428142547607, "learning_rate": 3.405319989465367e-05, "loss": 1.4508, "step": 12118 }, { "epoch": 0.319173031340532, "grad_norm": 1.8968877792358398, "learning_rate": 3.405188306557809e-05, "loss": 1.6528, "step": 12119 }, { "epoch": 0.31919936792204373, "grad_norm": 2.4548451900482178, "learning_rate": 3.4050566236502504e-05, "loss": 1.4825, "step": 12120 }, { "epoch": 0.3192257045035554, "grad_norm": 1.5156699419021606, "learning_rate": 3.404924940742692e-05, "loss": 1.7052, "step": 12121 }, { "epoch": 
0.31925204108506716, "grad_norm": 3.163053512573242, "learning_rate": 3.4047932578351336e-05, "loss": 1.839, "step": 12122 }, { "epoch": 0.3192783776665789, "grad_norm": 1.900566577911377, "learning_rate": 3.4046615749275744e-05, "loss": 1.8445, "step": 12123 }, { "epoch": 0.3193047142480906, "grad_norm": 2.2076849937438965, "learning_rate": 3.404529892020016e-05, "loss": 1.4097, "step": 12124 }, { "epoch": 0.31933105082960234, "grad_norm": 1.9737322330474854, "learning_rate": 3.404398209112457e-05, "loss": 1.607, "step": 12125 }, { "epoch": 0.319357387411114, "grad_norm": 3.2566566467285156, "learning_rate": 3.404266526204899e-05, "loss": 2.1212, "step": 12126 }, { "epoch": 0.31938372399262577, "grad_norm": 2.9648027420043945, "learning_rate": 3.40413484329734e-05, "loss": 1.6006, "step": 12127 }, { "epoch": 0.31941006057413746, "grad_norm": 1.5236337184906006, "learning_rate": 3.4040031603897816e-05, "loss": 1.8463, "step": 12128 }, { "epoch": 0.3194363971556492, "grad_norm": 2.531479835510254, "learning_rate": 3.403871477482223e-05, "loss": 1.1724, "step": 12129 }, { "epoch": 0.31946273373716094, "grad_norm": 4.001087188720703, "learning_rate": 3.403739794574665e-05, "loss": 1.6758, "step": 12130 }, { "epoch": 0.31948907031867263, "grad_norm": 2.445056915283203, "learning_rate": 3.403608111667106e-05, "loss": 2.7615, "step": 12131 }, { "epoch": 0.3195154069001844, "grad_norm": 2.243518829345703, "learning_rate": 3.403476428759547e-05, "loss": 1.4695, "step": 12132 }, { "epoch": 0.31954174348169606, "grad_norm": 1.8329102993011475, "learning_rate": 3.403344745851989e-05, "loss": 1.9909, "step": 12133 }, { "epoch": 0.3195680800632078, "grad_norm": 1.9460046291351318, "learning_rate": 3.4032130629444296e-05, "loss": 0.6969, "step": 12134 }, { "epoch": 0.3195944166447195, "grad_norm": 1.7553805112838745, "learning_rate": 3.403081380036872e-05, "loss": 2.4984, "step": 12135 }, { "epoch": 0.31962075322623124, "grad_norm": 2.204303741455078, "learning_rate": 
3.402949697129313e-05, "loss": 2.247, "step": 12136 }, { "epoch": 0.319647089807743, "grad_norm": 2.5163116455078125, "learning_rate": 3.402818014221754e-05, "loss": 1.3254, "step": 12137 }, { "epoch": 0.31967342638925467, "grad_norm": 2.1387217044830322, "learning_rate": 3.402686331314195e-05, "loss": 1.8045, "step": 12138 }, { "epoch": 0.3196997629707664, "grad_norm": 2.000959873199463, "learning_rate": 3.402554648406637e-05, "loss": 1.3126, "step": 12139 }, { "epoch": 0.3197260995522781, "grad_norm": 2.282318592071533, "learning_rate": 3.402422965499078e-05, "loss": 1.1203, "step": 12140 }, { "epoch": 0.31975243613378984, "grad_norm": 2.2452168464660645, "learning_rate": 3.40229128259152e-05, "loss": 2.0484, "step": 12141 }, { "epoch": 0.31977877271530153, "grad_norm": 1.4107005596160889, "learning_rate": 3.4021595996839614e-05, "loss": 2.1388, "step": 12142 }, { "epoch": 0.3198051092968133, "grad_norm": 4.617241382598877, "learning_rate": 3.402027916776402e-05, "loss": 1.0133, "step": 12143 }, { "epoch": 0.319831445878325, "grad_norm": 2.887406826019287, "learning_rate": 3.4018962338688445e-05, "loss": 1.8506, "step": 12144 }, { "epoch": 0.3198577824598367, "grad_norm": 2.859163522720337, "learning_rate": 3.4017645509612854e-05, "loss": 2.4007, "step": 12145 }, { "epoch": 0.31988411904134845, "grad_norm": 2.2629973888397217, "learning_rate": 3.401632868053727e-05, "loss": 1.2927, "step": 12146 }, { "epoch": 0.31991045562286013, "grad_norm": 2.6468231678009033, "learning_rate": 3.401501185146168e-05, "loss": 0.6667, "step": 12147 }, { "epoch": 0.3199367922043719, "grad_norm": 2.2851922512054443, "learning_rate": 3.4013695022386094e-05, "loss": 2.0294, "step": 12148 }, { "epoch": 0.31996312878588357, "grad_norm": 3.090280771255493, "learning_rate": 3.401237819331051e-05, "loss": 1.3353, "step": 12149 }, { "epoch": 0.3199894653673953, "grad_norm": 2.6776058673858643, "learning_rate": 3.4011061364234925e-05, "loss": 2.5322, "step": 12150 }, { "epoch": 
0.32001580194890705, "grad_norm": 1.7979923486709595, "learning_rate": 3.400974453515934e-05, "loss": 1.8427, "step": 12151 }, { "epoch": 0.32004213853041874, "grad_norm": 3.174229860305786, "learning_rate": 3.400842770608375e-05, "loss": 0.7254, "step": 12152 }, { "epoch": 0.3200684751119305, "grad_norm": 2.1720619201660156, "learning_rate": 3.4007110877008165e-05, "loss": 2.8202, "step": 12153 }, { "epoch": 0.32009481169344217, "grad_norm": 3.5753750801086426, "learning_rate": 3.400579404793258e-05, "loss": 1.0511, "step": 12154 }, { "epoch": 0.3201211482749539, "grad_norm": 2.3471946716308594, "learning_rate": 3.4004477218856996e-05, "loss": 1.0405, "step": 12155 }, { "epoch": 0.32014748485646566, "grad_norm": 2.4017467498779297, "learning_rate": 3.4003160389781405e-05, "loss": 2.0965, "step": 12156 }, { "epoch": 0.32017382143797735, "grad_norm": 2.4334700107574463, "learning_rate": 3.400184356070582e-05, "loss": 0.5958, "step": 12157 }, { "epoch": 0.3202001580194891, "grad_norm": 2.3893940448760986, "learning_rate": 3.4000526731630236e-05, "loss": 0.8703, "step": 12158 }, { "epoch": 0.3202264946010008, "grad_norm": 2.327883005142212, "learning_rate": 3.399920990255465e-05, "loss": 1.3811, "step": 12159 }, { "epoch": 0.3202528311825125, "grad_norm": 2.3480374813079834, "learning_rate": 3.399789307347907e-05, "loss": 1.6976, "step": 12160 }, { "epoch": 0.3202791677640242, "grad_norm": 1.9788191318511963, "learning_rate": 3.3996576244403476e-05, "loss": 1.4821, "step": 12161 }, { "epoch": 0.32030550434553595, "grad_norm": 2.0377748012542725, "learning_rate": 3.399525941532789e-05, "loss": 0.6055, "step": 12162 }, { "epoch": 0.3203318409270477, "grad_norm": 3.566192865371704, "learning_rate": 3.399394258625231e-05, "loss": 1.2174, "step": 12163 }, { "epoch": 0.3203581775085594, "grad_norm": 5.043156147003174, "learning_rate": 3.399262575717672e-05, "loss": 2.0138, "step": 12164 }, { "epoch": 0.3203845140900711, "grad_norm": 1.7911367416381836, "learning_rate": 
3.399130892810113e-05, "loss": 2.0229, "step": 12165 }, { "epoch": 0.3204108506715828, "grad_norm": 3.9641623497009277, "learning_rate": 3.398999209902555e-05, "loss": 1.1658, "step": 12166 }, { "epoch": 0.32043718725309456, "grad_norm": 1.9728283882141113, "learning_rate": 3.398867526994996e-05, "loss": 1.6782, "step": 12167 }, { "epoch": 0.32046352383460625, "grad_norm": 1.463543176651001, "learning_rate": 3.398735844087438e-05, "loss": 1.6103, "step": 12168 }, { "epoch": 0.320489860416118, "grad_norm": 3.910576343536377, "learning_rate": 3.3986041611798794e-05, "loss": 1.4771, "step": 12169 }, { "epoch": 0.32051619699762973, "grad_norm": 1.9064232110977173, "learning_rate": 3.39847247827232e-05, "loss": 1.6162, "step": 12170 }, { "epoch": 0.3205425335791414, "grad_norm": 2.864616632461548, "learning_rate": 3.398340795364762e-05, "loss": 2.2952, "step": 12171 }, { "epoch": 0.32056887016065316, "grad_norm": 3.8245649337768555, "learning_rate": 3.398209112457203e-05, "loss": 2.1377, "step": 12172 }, { "epoch": 0.32059520674216485, "grad_norm": 3.4233615398406982, "learning_rate": 3.398077429549645e-05, "loss": 2.2099, "step": 12173 }, { "epoch": 0.3206215433236766, "grad_norm": 1.4633548259735107, "learning_rate": 3.397945746642086e-05, "loss": 1.6924, "step": 12174 }, { "epoch": 0.3206478799051883, "grad_norm": 4.341440677642822, "learning_rate": 3.3978140637345274e-05, "loss": 1.1626, "step": 12175 }, { "epoch": 0.3206742164867, "grad_norm": 4.72980260848999, "learning_rate": 3.397682380826969e-05, "loss": 1.0856, "step": 12176 }, { "epoch": 0.32070055306821177, "grad_norm": 1.5803526639938354, "learning_rate": 3.3975506979194105e-05, "loss": 2.1048, "step": 12177 }, { "epoch": 0.32072688964972346, "grad_norm": 2.1748430728912354, "learning_rate": 3.397419015011852e-05, "loss": 1.7495, "step": 12178 }, { "epoch": 0.3207532262312352, "grad_norm": 1.723378300666809, "learning_rate": 3.397287332104293e-05, "loss": 2.1792, "step": 12179 }, { "epoch": 
0.3207795628127469, "grad_norm": 2.184858560562134, "learning_rate": 3.3971556491967345e-05, "loss": 1.9408, "step": 12180 }, { "epoch": 0.32080589939425863, "grad_norm": 1.7641090154647827, "learning_rate": 3.3970239662891754e-05, "loss": 2.3994, "step": 12181 }, { "epoch": 0.3208322359757703, "grad_norm": 2.993551254272461, "learning_rate": 3.396892283381618e-05, "loss": 1.1394, "step": 12182 }, { "epoch": 0.32085857255728206, "grad_norm": 1.6945013999938965, "learning_rate": 3.3967606004740585e-05, "loss": 1.6169, "step": 12183 }, { "epoch": 0.3208849091387938, "grad_norm": 1.5391260385513306, "learning_rate": 3.3966289175665e-05, "loss": 2.2243, "step": 12184 }, { "epoch": 0.3209112457203055, "grad_norm": 4.776205539703369, "learning_rate": 3.396497234658941e-05, "loss": 0.7905, "step": 12185 }, { "epoch": 0.32093758230181724, "grad_norm": 1.5860819816589355, "learning_rate": 3.3963655517513826e-05, "loss": 1.9358, "step": 12186 }, { "epoch": 0.3209639188833289, "grad_norm": 1.9016315937042236, "learning_rate": 3.396233868843824e-05, "loss": 1.6194, "step": 12187 }, { "epoch": 0.32099025546484067, "grad_norm": 1.9745495319366455, "learning_rate": 3.396102185936266e-05, "loss": 2.0536, "step": 12188 }, { "epoch": 0.32101659204635236, "grad_norm": 2.164306402206421, "learning_rate": 3.395970503028707e-05, "loss": 2.4298, "step": 12189 }, { "epoch": 0.3210429286278641, "grad_norm": 2.338559627532959, "learning_rate": 3.395838820121148e-05, "loss": 0.7754, "step": 12190 }, { "epoch": 0.32106926520937584, "grad_norm": 1.7641993761062622, "learning_rate": 3.39570713721359e-05, "loss": 1.7563, "step": 12191 }, { "epoch": 0.32109560179088753, "grad_norm": 2.2831690311431885, "learning_rate": 3.395575454306031e-05, "loss": 1.8925, "step": 12192 }, { "epoch": 0.3211219383723993, "grad_norm": 2.13026762008667, "learning_rate": 3.395443771398473e-05, "loss": 1.8684, "step": 12193 }, { "epoch": 0.32114827495391096, "grad_norm": 2.231293201446533, "learning_rate": 
3.395312088490914e-05, "loss": 0.8055, "step": 12194 }, { "epoch": 0.3211746115354227, "grad_norm": 1.9471639394760132, "learning_rate": 3.395180405583355e-05, "loss": 1.7911, "step": 12195 }, { "epoch": 0.32120094811693445, "grad_norm": 1.8353394269943237, "learning_rate": 3.395048722675797e-05, "loss": 2.17, "step": 12196 }, { "epoch": 0.32122728469844614, "grad_norm": 2.2292640209198, "learning_rate": 3.3949170397682384e-05, "loss": 0.5162, "step": 12197 }, { "epoch": 0.3212536212799579, "grad_norm": 1.4795243740081787, "learning_rate": 3.39478535686068e-05, "loss": 1.4076, "step": 12198 }, { "epoch": 0.32127995786146957, "grad_norm": 3.0761191844940186, "learning_rate": 3.394653673953121e-05, "loss": 1.4909, "step": 12199 }, { "epoch": 0.3213062944429813, "grad_norm": 1.548272967338562, "learning_rate": 3.3945219910455624e-05, "loss": 1.3293, "step": 12200 }, { "epoch": 0.321332631024493, "grad_norm": 1.8872591257095337, "learning_rate": 3.394390308138004e-05, "loss": 1.068, "step": 12201 }, { "epoch": 0.32135896760600474, "grad_norm": 1.4373579025268555, "learning_rate": 3.3942586252304455e-05, "loss": 1.4413, "step": 12202 }, { "epoch": 0.3213853041875165, "grad_norm": 4.177943706512451, "learning_rate": 3.3941269423228864e-05, "loss": 2.1391, "step": 12203 }, { "epoch": 0.3214116407690282, "grad_norm": 1.7268850803375244, "learning_rate": 3.393995259415328e-05, "loss": 1.5692, "step": 12204 }, { "epoch": 0.3214379773505399, "grad_norm": 1.9290614128112793, "learning_rate": 3.3938635765077695e-05, "loss": 1.8553, "step": 12205 }, { "epoch": 0.3214643139320516, "grad_norm": 2.2076988220214844, "learning_rate": 3.393731893600211e-05, "loss": 1.4893, "step": 12206 }, { "epoch": 0.32149065051356335, "grad_norm": 3.3436062335968018, "learning_rate": 3.3936002106926526e-05, "loss": 1.4433, "step": 12207 }, { "epoch": 0.32151698709507504, "grad_norm": 1.653065800666809, "learning_rate": 3.3934685277850935e-05, "loss": 0.6151, "step": 12208 }, { "epoch": 
0.3215433236765868, "grad_norm": 3.446596145629883, "learning_rate": 3.393336844877535e-05, "loss": 2.0906, "step": 12209 }, { "epoch": 0.3215696602580985, "grad_norm": 2.523874521255493, "learning_rate": 3.3932051619699766e-05, "loss": 1.2767, "step": 12210 }, { "epoch": 0.3215959968396102, "grad_norm": 2.230555772781372, "learning_rate": 3.393073479062418e-05, "loss": 2.0091, "step": 12211 }, { "epoch": 0.32162233342112195, "grad_norm": 3.2698090076446533, "learning_rate": 3.392941796154859e-05, "loss": 1.847, "step": 12212 }, { "epoch": 0.32164867000263364, "grad_norm": 2.6900603771209717, "learning_rate": 3.3928101132473006e-05, "loss": 1.9483, "step": 12213 }, { "epoch": 0.3216750065841454, "grad_norm": 2.081861734390259, "learning_rate": 3.392678430339742e-05, "loss": 0.761, "step": 12214 }, { "epoch": 0.32170134316565707, "grad_norm": 1.7848742008209229, "learning_rate": 3.392546747432184e-05, "loss": 0.7695, "step": 12215 }, { "epoch": 0.3217276797471688, "grad_norm": 2.8648509979248047, "learning_rate": 3.392415064524625e-05, "loss": 2.0384, "step": 12216 }, { "epoch": 0.32175401632868056, "grad_norm": 8.558164596557617, "learning_rate": 3.392283381617066e-05, "loss": 1.4706, "step": 12217 }, { "epoch": 0.32178035291019225, "grad_norm": 1.7810420989990234, "learning_rate": 3.392151698709508e-05, "loss": 1.9489, "step": 12218 }, { "epoch": 0.321806689491704, "grad_norm": 3.3097095489501953, "learning_rate": 3.3920200158019486e-05, "loss": 1.6992, "step": 12219 }, { "epoch": 0.3218330260732157, "grad_norm": 3.6095333099365234, "learning_rate": 3.391888332894391e-05, "loss": 0.9989, "step": 12220 }, { "epoch": 0.3218593626547274, "grad_norm": 2.969943046569824, "learning_rate": 3.391756649986832e-05, "loss": 1.9503, "step": 12221 }, { "epoch": 0.3218856992362391, "grad_norm": 1.9665263891220093, "learning_rate": 3.391624967079273e-05, "loss": 1.8913, "step": 12222 }, { "epoch": 0.32191203581775085, "grad_norm": 4.242003917694092, "learning_rate": 
3.391493284171715e-05, "loss": 1.1605, "step": 12223 }, { "epoch": 0.3219383723992626, "grad_norm": 1.7454184293746948, "learning_rate": 3.391361601264156e-05, "loss": 1.9505, "step": 12224 }, { "epoch": 0.3219647089807743, "grad_norm": 2.203702211380005, "learning_rate": 3.391229918356598e-05, "loss": 1.9243, "step": 12225 }, { "epoch": 0.321991045562286, "grad_norm": 5.079842567443848, "learning_rate": 3.391098235449039e-05, "loss": 1.8182, "step": 12226 }, { "epoch": 0.3220173821437977, "grad_norm": 2.0906500816345215, "learning_rate": 3.3909665525414804e-05, "loss": 2.1544, "step": 12227 }, { "epoch": 0.32204371872530946, "grad_norm": 3.0349137783050537, "learning_rate": 3.390834869633921e-05, "loss": 1.5002, "step": 12228 }, { "epoch": 0.3220700553068212, "grad_norm": 2.0170960426330566, "learning_rate": 3.3907031867263635e-05, "loss": 1.4028, "step": 12229 }, { "epoch": 0.3220963918883329, "grad_norm": 1.68658447265625, "learning_rate": 3.3905715038188044e-05, "loss": 0.5158, "step": 12230 }, { "epoch": 0.32212272846984463, "grad_norm": 4.443417549133301, "learning_rate": 3.390439820911246e-05, "loss": 1.3005, "step": 12231 }, { "epoch": 0.3221490650513563, "grad_norm": 2.4294207096099854, "learning_rate": 3.390308138003687e-05, "loss": 0.3338, "step": 12232 }, { "epoch": 0.32217540163286806, "grad_norm": 2.0338082313537598, "learning_rate": 3.3901764550961284e-05, "loss": 1.6438, "step": 12233 }, { "epoch": 0.32220173821437975, "grad_norm": 2.186739206314087, "learning_rate": 3.3900447721885706e-05, "loss": 2.1927, "step": 12234 }, { "epoch": 0.3222280747958915, "grad_norm": 1.724195957183838, "learning_rate": 3.3899130892810115e-05, "loss": 1.8484, "step": 12235 }, { "epoch": 0.32225441137740324, "grad_norm": 1.8746455907821655, "learning_rate": 3.389781406373453e-05, "loss": 1.6226, "step": 12236 }, { "epoch": 0.3222807479589149, "grad_norm": 2.949380397796631, "learning_rate": 3.389649723465894e-05, "loss": 1.676, "step": 12237 }, { "epoch": 
0.32230708454042667, "grad_norm": 2.543724775314331, "learning_rate": 3.3895180405583355e-05, "loss": 0.8616, "step": 12238 }, { "epoch": 0.32233342112193836, "grad_norm": 2.8032801151275635, "learning_rate": 3.389386357650777e-05, "loss": 1.4834, "step": 12239 }, { "epoch": 0.3223597577034501, "grad_norm": 1.9766201972961426, "learning_rate": 3.3892546747432186e-05, "loss": 2.1883, "step": 12240 }, { "epoch": 0.3223860942849618, "grad_norm": 1.6406527757644653, "learning_rate": 3.3891229918356595e-05, "loss": 2.2855, "step": 12241 }, { "epoch": 0.32241243086647353, "grad_norm": 1.7102583646774292, "learning_rate": 3.388991308928101e-05, "loss": 1.3, "step": 12242 }, { "epoch": 0.3224387674479853, "grad_norm": 2.1417951583862305, "learning_rate": 3.3888596260205426e-05, "loss": 1.8918, "step": 12243 }, { "epoch": 0.32246510402949696, "grad_norm": 1.9551180601119995, "learning_rate": 3.388727943112984e-05, "loss": 1.672, "step": 12244 }, { "epoch": 0.3224914406110087, "grad_norm": 2.5408904552459717, "learning_rate": 3.388596260205426e-05, "loss": 1.7678, "step": 12245 }, { "epoch": 0.3225177771925204, "grad_norm": 2.758303165435791, "learning_rate": 3.3884645772978667e-05, "loss": 1.9766, "step": 12246 }, { "epoch": 0.32254411377403214, "grad_norm": 3.7941858768463135, "learning_rate": 3.388332894390308e-05, "loss": 1.5693, "step": 12247 }, { "epoch": 0.3225704503555438, "grad_norm": 2.059492826461792, "learning_rate": 3.38820121148275e-05, "loss": 1.3157, "step": 12248 }, { "epoch": 0.32259678693705557, "grad_norm": 1.7243353128433228, "learning_rate": 3.388069528575191e-05, "loss": 1.6336, "step": 12249 }, { "epoch": 0.3226231235185673, "grad_norm": 2.2909388542175293, "learning_rate": 3.387937845667632e-05, "loss": 2.0044, "step": 12250 }, { "epoch": 0.322649460100079, "grad_norm": 3.018120050430298, "learning_rate": 3.387806162760074e-05, "loss": 1.943, "step": 12251 }, { "epoch": 0.32267579668159074, "grad_norm": 1.9069595336914062, "learning_rate": 
3.387674479852515e-05, "loss": 1.9083, "step": 12252 }, { "epoch": 0.32270213326310243, "grad_norm": 1.864722490310669, "learning_rate": 3.387542796944957e-05, "loss": 1.5301, "step": 12253 }, { "epoch": 0.3227284698446142, "grad_norm": 1.5729719400405884, "learning_rate": 3.3874111140373985e-05, "loss": 0.2737, "step": 12254 }, { "epoch": 0.32275480642612586, "grad_norm": 1.5862400531768799, "learning_rate": 3.387279431129839e-05, "loss": 1.8645, "step": 12255 }, { "epoch": 0.3227811430076376, "grad_norm": 3.648317813873291, "learning_rate": 3.387147748222281e-05, "loss": 1.9888, "step": 12256 }, { "epoch": 0.32280747958914935, "grad_norm": 2.1134192943573, "learning_rate": 3.387016065314722e-05, "loss": 1.7019, "step": 12257 }, { "epoch": 0.32283381617066104, "grad_norm": 1.546311378479004, "learning_rate": 3.386884382407164e-05, "loss": 2.0524, "step": 12258 }, { "epoch": 0.3228601527521728, "grad_norm": 3.0315825939178467, "learning_rate": 3.386752699499605e-05, "loss": 1.0399, "step": 12259 }, { "epoch": 0.32288648933368447, "grad_norm": 3.750859260559082, "learning_rate": 3.3866210165920465e-05, "loss": 1.6299, "step": 12260 }, { "epoch": 0.3229128259151962, "grad_norm": 1.4505350589752197, "learning_rate": 3.386489333684488e-05, "loss": 2.1018, "step": 12261 }, { "epoch": 0.32293916249670795, "grad_norm": 2.239183187484741, "learning_rate": 3.3863576507769296e-05, "loss": 2.2628, "step": 12262 }, { "epoch": 0.32296549907821964, "grad_norm": 2.9473445415496826, "learning_rate": 3.386225967869371e-05, "loss": 1.3276, "step": 12263 }, { "epoch": 0.3229918356597314, "grad_norm": 2.2948169708251953, "learning_rate": 3.386094284961812e-05, "loss": 2.0559, "step": 12264 }, { "epoch": 0.3230181722412431, "grad_norm": 4.848300933837891, "learning_rate": 3.3859626020542536e-05, "loss": 0.8816, "step": 12265 }, { "epoch": 0.3230445088227548, "grad_norm": 1.973707675933838, "learning_rate": 3.3858309191466945e-05, "loss": 1.7563, "step": 12266 }, { "epoch": 
0.3230708454042665, "grad_norm": 1.7574723958969116, "learning_rate": 3.385699236239137e-05, "loss": 1.714, "step": 12267 }, { "epoch": 0.32309718198577825, "grad_norm": 2.4838573932647705, "learning_rate": 3.3855675533315776e-05, "loss": 1.6102, "step": 12268 }, { "epoch": 0.32312351856729, "grad_norm": 1.9730017185211182, "learning_rate": 3.385435870424019e-05, "loss": 1.2705, "step": 12269 }, { "epoch": 0.3231498551488017, "grad_norm": 2.1752588748931885, "learning_rate": 3.385304187516461e-05, "loss": 1.6559, "step": 12270 }, { "epoch": 0.3231761917303134, "grad_norm": 2.166586399078369, "learning_rate": 3.3851725046089016e-05, "loss": 1.5732, "step": 12271 }, { "epoch": 0.3232025283118251, "grad_norm": 2.059168815612793, "learning_rate": 3.385040821701344e-05, "loss": 2.0538, "step": 12272 }, { "epoch": 0.32322886489333685, "grad_norm": 2.9020776748657227, "learning_rate": 3.384909138793785e-05, "loss": 1.8277, "step": 12273 }, { "epoch": 0.32325520147484854, "grad_norm": 1.9672114849090576, "learning_rate": 3.384777455886226e-05, "loss": 2.0089, "step": 12274 }, { "epoch": 0.3232815380563603, "grad_norm": 3.504380941390991, "learning_rate": 3.384645772978667e-05, "loss": 1.1946, "step": 12275 }, { "epoch": 0.32330787463787203, "grad_norm": 3.9589881896972656, "learning_rate": 3.3845140900711094e-05, "loss": 0.9693, "step": 12276 }, { "epoch": 0.3233342112193837, "grad_norm": 3.597632884979248, "learning_rate": 3.38438240716355e-05, "loss": 1.7431, "step": 12277 }, { "epoch": 0.32336054780089546, "grad_norm": 2.0312702655792236, "learning_rate": 3.384250724255992e-05, "loss": 1.4212, "step": 12278 }, { "epoch": 0.32338688438240715, "grad_norm": 1.6893987655639648, "learning_rate": 3.3841190413484334e-05, "loss": 1.8204, "step": 12279 }, { "epoch": 0.3234132209639189, "grad_norm": 2.1441118717193604, "learning_rate": 3.383987358440874e-05, "loss": 1.303, "step": 12280 }, { "epoch": 0.3234395575454306, "grad_norm": 1.5230028629302979, "learning_rate": 
3.3838556755333165e-05, "loss": 1.7299, "step": 12281 }, { "epoch": 0.3234658941269423, "grad_norm": 2.374310255050659, "learning_rate": 3.3837239926257574e-05, "loss": 1.6029, "step": 12282 }, { "epoch": 0.32349223070845406, "grad_norm": 2.3975625038146973, "learning_rate": 3.383592309718199e-05, "loss": 2.222, "step": 12283 }, { "epoch": 0.32351856728996575, "grad_norm": 5.678077220916748, "learning_rate": 3.38346062681064e-05, "loss": 1.4825, "step": 12284 }, { "epoch": 0.3235449038714775, "grad_norm": 3.0202133655548096, "learning_rate": 3.3833289439030814e-05, "loss": 2.6372, "step": 12285 }, { "epoch": 0.3235712404529892, "grad_norm": 4.9063239097595215, "learning_rate": 3.383197260995523e-05, "loss": 1.6143, "step": 12286 }, { "epoch": 0.3235975770345009, "grad_norm": 1.5931549072265625, "learning_rate": 3.3830655780879645e-05, "loss": 1.6286, "step": 12287 }, { "epoch": 0.3236239136160126, "grad_norm": 5.184411525726318, "learning_rate": 3.3829338951804054e-05, "loss": 1.4513, "step": 12288 }, { "epoch": 0.32365025019752436, "grad_norm": 2.0110678672790527, "learning_rate": 3.382802212272847e-05, "loss": 1.8901, "step": 12289 }, { "epoch": 0.3236765867790361, "grad_norm": 2.3237550258636475, "learning_rate": 3.3826705293652885e-05, "loss": 1.7888, "step": 12290 }, { "epoch": 0.3237029233605478, "grad_norm": 2.883796215057373, "learning_rate": 3.38253884645773e-05, "loss": 2.0913, "step": 12291 }, { "epoch": 0.32372925994205953, "grad_norm": 2.086744785308838, "learning_rate": 3.3824071635501716e-05, "loss": 1.5644, "step": 12292 }, { "epoch": 0.3237555965235712, "grad_norm": 1.946567416191101, "learning_rate": 3.3822754806426125e-05, "loss": 2.0751, "step": 12293 }, { "epoch": 0.32378193310508296, "grad_norm": 2.670642137527466, "learning_rate": 3.382143797735054e-05, "loss": 1.8773, "step": 12294 }, { "epoch": 0.3238082696865947, "grad_norm": 1.9451168775558472, "learning_rate": 3.3820121148274956e-05, "loss": 2.8557, "step": 12295 }, { "epoch": 
0.3238346062681064, "grad_norm": 1.9541621208190918, "learning_rate": 3.381880431919937e-05, "loss": 1.9104, "step": 12296 }, { "epoch": 0.32386094284961814, "grad_norm": 1.5282248258590698, "learning_rate": 3.381748749012378e-05, "loss": 0.558, "step": 12297 }, { "epoch": 0.3238872794311298, "grad_norm": 1.6153168678283691, "learning_rate": 3.3816170661048196e-05, "loss": 1.8507, "step": 12298 }, { "epoch": 0.32391361601264157, "grad_norm": 2.093909502029419, "learning_rate": 3.381485383197261e-05, "loss": 0.7208, "step": 12299 }, { "epoch": 0.32393995259415326, "grad_norm": 2.114145517349243, "learning_rate": 3.381353700289703e-05, "loss": 1.6762, "step": 12300 }, { "epoch": 0.323966289175665, "grad_norm": 3.022909164428711, "learning_rate": 3.381222017382144e-05, "loss": 1.3887, "step": 12301 }, { "epoch": 0.32399262575717674, "grad_norm": 1.9545685052871704, "learning_rate": 3.381090334474585e-05, "loss": 1.7816, "step": 12302 }, { "epoch": 0.32401896233868843, "grad_norm": 2.194612741470337, "learning_rate": 3.380958651567027e-05, "loss": 2.0078, "step": 12303 }, { "epoch": 0.3240452989202002, "grad_norm": 2.803537368774414, "learning_rate": 3.3808269686594676e-05, "loss": 1.0695, "step": 12304 }, { "epoch": 0.32407163550171186, "grad_norm": 2.8283658027648926, "learning_rate": 3.38069528575191e-05, "loss": 1.2953, "step": 12305 }, { "epoch": 0.3240979720832236, "grad_norm": 2.443341016769409, "learning_rate": 3.380563602844351e-05, "loss": 1.5769, "step": 12306 }, { "epoch": 0.3241243086647353, "grad_norm": 2.552309513092041, "learning_rate": 3.380431919936792e-05, "loss": 1.56, "step": 12307 }, { "epoch": 0.32415064524624704, "grad_norm": 1.6677271127700806, "learning_rate": 3.380300237029234e-05, "loss": 1.9866, "step": 12308 }, { "epoch": 0.3241769818277588, "grad_norm": 4.023653030395508, "learning_rate": 3.3801685541216754e-05, "loss": 1.5099, "step": 12309 }, { "epoch": 0.32420331840927047, "grad_norm": 1.7729556560516357, "learning_rate": 
3.380036871214117e-05, "loss": 1.9922, "step": 12310 }, { "epoch": 0.3242296549907822, "grad_norm": 1.8447456359863281, "learning_rate": 3.379905188306558e-05, "loss": 1.8368, "step": 12311 }, { "epoch": 0.3242559915722939, "grad_norm": 1.5462827682495117, "learning_rate": 3.3797735053989994e-05, "loss": 1.727, "step": 12312 }, { "epoch": 0.32428232815380564, "grad_norm": 1.7580639123916626, "learning_rate": 3.37964182249144e-05, "loss": 2.4199, "step": 12313 }, { "epoch": 0.32430866473531733, "grad_norm": 2.0613253116607666, "learning_rate": 3.3795101395838826e-05, "loss": 1.3943, "step": 12314 }, { "epoch": 0.3243350013168291, "grad_norm": 2.076853036880493, "learning_rate": 3.3793784566763234e-05, "loss": 1.8899, "step": 12315 }, { "epoch": 0.3243613378983408, "grad_norm": 2.2645184993743896, "learning_rate": 3.379246773768765e-05, "loss": 2.3585, "step": 12316 }, { "epoch": 0.3243876744798525, "grad_norm": 1.675258994102478, "learning_rate": 3.3791150908612066e-05, "loss": 2.2064, "step": 12317 }, { "epoch": 0.32441401106136425, "grad_norm": 3.983132839202881, "learning_rate": 3.3789834079536474e-05, "loss": 1.1352, "step": 12318 }, { "epoch": 0.32444034764287594, "grad_norm": 2.144740581512451, "learning_rate": 3.37885172504609e-05, "loss": 1.8053, "step": 12319 }, { "epoch": 0.3244666842243877, "grad_norm": 1.438317894935608, "learning_rate": 3.3787200421385306e-05, "loss": 1.9145, "step": 12320 }, { "epoch": 0.32449302080589937, "grad_norm": 1.8782727718353271, "learning_rate": 3.378588359230972e-05, "loss": 1.996, "step": 12321 }, { "epoch": 0.3245193573874111, "grad_norm": 2.22731876373291, "learning_rate": 3.378456676323413e-05, "loss": 1.0348, "step": 12322 }, { "epoch": 0.32454569396892285, "grad_norm": 3.2880945205688477, "learning_rate": 3.3783249934158546e-05, "loss": 1.9547, "step": 12323 }, { "epoch": 0.32457203055043454, "grad_norm": 2.9851460456848145, "learning_rate": 3.378193310508296e-05, "loss": 1.8034, "step": 12324 }, { "epoch": 
0.3245983671319463, "grad_norm": 5.274731636047363, "learning_rate": 3.378061627600738e-05, "loss": 1.7607, "step": 12325 }, { "epoch": 0.324624703713458, "grad_norm": 1.842307209968567, "learning_rate": 3.377929944693179e-05, "loss": 1.4218, "step": 12326 }, { "epoch": 0.3246510402949697, "grad_norm": 2.4799489974975586, "learning_rate": 3.37779826178562e-05, "loss": 1.7938, "step": 12327 }, { "epoch": 0.3246773768764814, "grad_norm": 2.7611265182495117, "learning_rate": 3.3776665788780624e-05, "loss": 1.3737, "step": 12328 }, { "epoch": 0.32470371345799315, "grad_norm": 1.8372855186462402, "learning_rate": 3.377534895970503e-05, "loss": 1.2665, "step": 12329 }, { "epoch": 0.3247300500395049, "grad_norm": 1.8387396335601807, "learning_rate": 3.377403213062945e-05, "loss": 2.0953, "step": 12330 }, { "epoch": 0.3247563866210166, "grad_norm": 2.6778085231781006, "learning_rate": 3.377271530155386e-05, "loss": 1.7535, "step": 12331 }, { "epoch": 0.3247827232025283, "grad_norm": 1.7880159616470337, "learning_rate": 3.377139847247827e-05, "loss": 1.8497, "step": 12332 }, { "epoch": 0.32480905978404, "grad_norm": 9.613354682922363, "learning_rate": 3.377008164340269e-05, "loss": 1.4307, "step": 12333 }, { "epoch": 0.32483539636555175, "grad_norm": 2.215589761734009, "learning_rate": 3.3768764814327104e-05, "loss": 1.9135, "step": 12334 }, { "epoch": 0.3248617329470635, "grad_norm": 2.5530459880828857, "learning_rate": 3.376744798525151e-05, "loss": 1.6609, "step": 12335 }, { "epoch": 0.3248880695285752, "grad_norm": 4.462213039398193, "learning_rate": 3.376613115617593e-05, "loss": 1.2865, "step": 12336 }, { "epoch": 0.32491440611008693, "grad_norm": 1.934516191482544, "learning_rate": 3.3764814327100344e-05, "loss": 1.1295, "step": 12337 }, { "epoch": 0.3249407426915986, "grad_norm": 2.748586654663086, "learning_rate": 3.376349749802476e-05, "loss": 1.414, "step": 12338 }, { "epoch": 0.32496707927311036, "grad_norm": 1.7129416465759277, "learning_rate": 
3.3762180668949175e-05, "loss": 1.7636, "step": 12339 }, { "epoch": 0.32499341585462205, "grad_norm": 2.5688493251800537, "learning_rate": 3.3760863839873584e-05, "loss": 2.1608, "step": 12340 }, { "epoch": 0.3250197524361338, "grad_norm": 2.878887414932251, "learning_rate": 3.3759547010798e-05, "loss": 1.1483, "step": 12341 }, { "epoch": 0.32504608901764553, "grad_norm": 3.2131569385528564, "learning_rate": 3.3758230181722415e-05, "loss": 2.0589, "step": 12342 }, { "epoch": 0.3250724255991572, "grad_norm": 1.6457082033157349, "learning_rate": 3.375691335264683e-05, "loss": 1.6136, "step": 12343 }, { "epoch": 0.32509876218066897, "grad_norm": 2.799499034881592, "learning_rate": 3.375559652357124e-05, "loss": 1.3579, "step": 12344 }, { "epoch": 0.32512509876218065, "grad_norm": 1.765594482421875, "learning_rate": 3.3754279694495655e-05, "loss": 0.9123, "step": 12345 }, { "epoch": 0.3251514353436924, "grad_norm": 3.843245506286621, "learning_rate": 3.375296286542007e-05, "loss": 1.3026, "step": 12346 }, { "epoch": 0.3251777719252041, "grad_norm": 2.574646234512329, "learning_rate": 3.3751646036344486e-05, "loss": 1.84, "step": 12347 }, { "epoch": 0.3252041085067158, "grad_norm": 2.4024720191955566, "learning_rate": 3.37503292072689e-05, "loss": 2.2107, "step": 12348 }, { "epoch": 0.32523044508822757, "grad_norm": 1.7035294771194458, "learning_rate": 3.374901237819331e-05, "loss": 1.3948, "step": 12349 }, { "epoch": 0.32525678166973926, "grad_norm": 2.154402017593384, "learning_rate": 3.3747695549117726e-05, "loss": 2.227, "step": 12350 }, { "epoch": 0.325283118251251, "grad_norm": 2.6465022563934326, "learning_rate": 3.3746378720042135e-05, "loss": 1.0617, "step": 12351 }, { "epoch": 0.3253094548327627, "grad_norm": 1.896052598953247, "learning_rate": 3.374506189096656e-05, "loss": 1.5736, "step": 12352 }, { "epoch": 0.32533579141427443, "grad_norm": 1.764878273010254, "learning_rate": 3.3743745061890966e-05, "loss": 1.9243, "step": 12353 }, { "epoch": 
0.3253621279957861, "grad_norm": 1.9510146379470825, "learning_rate": 3.374242823281538e-05, "loss": 1.4257, "step": 12354 }, { "epoch": 0.32538846457729786, "grad_norm": 2.0460031032562256, "learning_rate": 3.37411114037398e-05, "loss": 1.4265, "step": 12355 }, { "epoch": 0.3254148011588096, "grad_norm": 1.5084552764892578, "learning_rate": 3.3739794574664206e-05, "loss": 1.3017, "step": 12356 }, { "epoch": 0.3254411377403213, "grad_norm": 2.178532600402832, "learning_rate": 3.373847774558863e-05, "loss": 2.0115, "step": 12357 }, { "epoch": 0.32546747432183304, "grad_norm": 2.2687771320343018, "learning_rate": 3.373716091651304e-05, "loss": 0.5165, "step": 12358 }, { "epoch": 0.3254938109033447, "grad_norm": 1.6928813457489014, "learning_rate": 3.373584408743745e-05, "loss": 0.3237, "step": 12359 }, { "epoch": 0.32552014748485647, "grad_norm": 2.705045223236084, "learning_rate": 3.373452725836186e-05, "loss": 1.531, "step": 12360 }, { "epoch": 0.32554648406636816, "grad_norm": 3.2414040565490723, "learning_rate": 3.3733210429286284e-05, "loss": 1.9866, "step": 12361 }, { "epoch": 0.3255728206478799, "grad_norm": 2.9641237258911133, "learning_rate": 3.373189360021069e-05, "loss": 0.8239, "step": 12362 }, { "epoch": 0.32559915722939164, "grad_norm": 2.5464887619018555, "learning_rate": 3.373057677113511e-05, "loss": 1.5524, "step": 12363 }, { "epoch": 0.32562549381090333, "grad_norm": 1.8479719161987305, "learning_rate": 3.3729259942059524e-05, "loss": 1.626, "step": 12364 }, { "epoch": 0.3256518303924151, "grad_norm": 1.687949538230896, "learning_rate": 3.372794311298393e-05, "loss": 1.6007, "step": 12365 }, { "epoch": 0.32567816697392676, "grad_norm": 1.8811193704605103, "learning_rate": 3.3726626283908355e-05, "loss": 1.191, "step": 12366 }, { "epoch": 0.3257045035554385, "grad_norm": 2.518146514892578, "learning_rate": 3.3725309454832764e-05, "loss": 1.7585, "step": 12367 }, { "epoch": 0.32573084013695025, "grad_norm": 2.4291203022003174, "learning_rate": 
3.372399262575718e-05, "loss": 1.6238, "step": 12368 }, { "epoch": 0.32575717671846194, "grad_norm": 2.936344861984253, "learning_rate": 3.372267579668159e-05, "loss": 1.6533, "step": 12369 }, { "epoch": 0.3257835132999737, "grad_norm": 2.00290846824646, "learning_rate": 3.3721358967606004e-05, "loss": 1.9891, "step": 12370 }, { "epoch": 0.32580984988148537, "grad_norm": 1.9928607940673828, "learning_rate": 3.372004213853042e-05, "loss": 1.72, "step": 12371 }, { "epoch": 0.3258361864629971, "grad_norm": 1.8765567541122437, "learning_rate": 3.3718725309454835e-05, "loss": 1.6647, "step": 12372 }, { "epoch": 0.3258625230445088, "grad_norm": 1.8526932001113892, "learning_rate": 3.371740848037925e-05, "loss": 1.9192, "step": 12373 }, { "epoch": 0.32588885962602054, "grad_norm": 2.6315598487854004, "learning_rate": 3.371609165130366e-05, "loss": 1.6672, "step": 12374 }, { "epoch": 0.3259151962075323, "grad_norm": 2.2314915657043457, "learning_rate": 3.371477482222808e-05, "loss": 1.392, "step": 12375 }, { "epoch": 0.325941532789044, "grad_norm": 1.7097437381744385, "learning_rate": 3.371345799315249e-05, "loss": 1.8734, "step": 12376 }, { "epoch": 0.3259678693705557, "grad_norm": 3.08762264251709, "learning_rate": 3.3712141164076907e-05, "loss": 2.1931, "step": 12377 }, { "epoch": 0.3259942059520674, "grad_norm": 3.167793035507202, "learning_rate": 3.3710824335001315e-05, "loss": 2.2833, "step": 12378 }, { "epoch": 0.32602054253357915, "grad_norm": 2.146003484725952, "learning_rate": 3.370950750592573e-05, "loss": 1.7115, "step": 12379 }, { "epoch": 0.32604687911509084, "grad_norm": 2.3912789821624756, "learning_rate": 3.3708190676850147e-05, "loss": 1.6485, "step": 12380 }, { "epoch": 0.3260732156966026, "grad_norm": 2.0407657623291016, "learning_rate": 3.370687384777456e-05, "loss": 1.1698, "step": 12381 }, { "epoch": 0.3260995522781143, "grad_norm": 1.600154161453247, "learning_rate": 3.370555701869898e-05, "loss": 1.6273, "step": 12382 }, { "epoch": 
0.326125888859626, "grad_norm": 1.9794992208480835, "learning_rate": 3.3704240189623387e-05, "loss": 1.8946, "step": 12383 }, { "epoch": 0.32615222544113776, "grad_norm": 2.033773899078369, "learning_rate": 3.37029233605478e-05, "loss": 1.9258, "step": 12384 }, { "epoch": 0.32617856202264944, "grad_norm": 2.7881083488464355, "learning_rate": 3.370160653147222e-05, "loss": 1.7056, "step": 12385 }, { "epoch": 0.3262048986041612, "grad_norm": 2.3700952529907227, "learning_rate": 3.370028970239663e-05, "loss": 2.0705, "step": 12386 }, { "epoch": 0.3262312351856729, "grad_norm": 2.2574398517608643, "learning_rate": 3.369897287332104e-05, "loss": 1.6727, "step": 12387 }, { "epoch": 0.3262575717671846, "grad_norm": 1.82600736618042, "learning_rate": 3.369765604424546e-05, "loss": 1.4281, "step": 12388 }, { "epoch": 0.32628390834869636, "grad_norm": 1.817922830581665, "learning_rate": 3.3696339215169867e-05, "loss": 2.0027, "step": 12389 }, { "epoch": 0.32631024493020805, "grad_norm": 3.3472657203674316, "learning_rate": 3.369502238609429e-05, "loss": 0.8953, "step": 12390 }, { "epoch": 0.3263365815117198, "grad_norm": 3.2613322734832764, "learning_rate": 3.36937055570187e-05, "loss": 1.3292, "step": 12391 }, { "epoch": 0.3263629180932315, "grad_norm": 3.29406476020813, "learning_rate": 3.369238872794311e-05, "loss": 0.7488, "step": 12392 }, { "epoch": 0.3263892546747432, "grad_norm": 1.9919109344482422, "learning_rate": 3.369107189886753e-05, "loss": 1.9706, "step": 12393 }, { "epoch": 0.3264155912562549, "grad_norm": 2.754307508468628, "learning_rate": 3.3689755069791945e-05, "loss": 1.9736, "step": 12394 }, { "epoch": 0.32644192783776665, "grad_norm": 1.8472378253936768, "learning_rate": 3.368843824071636e-05, "loss": 1.8427, "step": 12395 }, { "epoch": 0.3264682644192784, "grad_norm": 6.103476047515869, "learning_rate": 3.368712141164077e-05, "loss": 1.8262, "step": 12396 }, { "epoch": 0.3264946010007901, "grad_norm": 1.5487394332885742, "learning_rate": 
3.3685804582565185e-05, "loss": 2.1407, "step": 12397 }, { "epoch": 0.32652093758230183, "grad_norm": 1.9438989162445068, "learning_rate": 3.3684487753489593e-05, "loss": 1.1456, "step": 12398 }, { "epoch": 0.3265472741638135, "grad_norm": 2.440084457397461, "learning_rate": 3.3683170924414016e-05, "loss": 1.3676, "step": 12399 }, { "epoch": 0.32657361074532526, "grad_norm": 1.7957576513290405, "learning_rate": 3.3681854095338425e-05, "loss": 2.1561, "step": 12400 }, { "epoch": 0.326599947326837, "grad_norm": 7.39895486831665, "learning_rate": 3.368053726626284e-05, "loss": 1.3212, "step": 12401 }, { "epoch": 0.3266262839083487, "grad_norm": 3.1179237365722656, "learning_rate": 3.3679220437187256e-05, "loss": 1.9624, "step": 12402 }, { "epoch": 0.32665262048986043, "grad_norm": 1.9140419960021973, "learning_rate": 3.3677903608111665e-05, "loss": 0.7599, "step": 12403 }, { "epoch": 0.3266789570713721, "grad_norm": 2.248711347579956, "learning_rate": 3.367658677903609e-05, "loss": 0.7234, "step": 12404 }, { "epoch": 0.32670529365288387, "grad_norm": 3.0314104557037354, "learning_rate": 3.3675269949960496e-05, "loss": 0.3289, "step": 12405 }, { "epoch": 0.32673163023439555, "grad_norm": 2.7231507301330566, "learning_rate": 3.367395312088491e-05, "loss": 1.764, "step": 12406 }, { "epoch": 0.3267579668159073, "grad_norm": 2.1077165603637695, "learning_rate": 3.367263629180932e-05, "loss": 1.4074, "step": 12407 }, { "epoch": 0.32678430339741904, "grad_norm": 3.67710542678833, "learning_rate": 3.367131946273374e-05, "loss": 1.6472, "step": 12408 }, { "epoch": 0.32681063997893073, "grad_norm": 2.667994499206543, "learning_rate": 3.367000263365815e-05, "loss": 1.3321, "step": 12409 }, { "epoch": 0.32683697656044247, "grad_norm": 4.413286209106445, "learning_rate": 3.366868580458257e-05, "loss": 1.6661, "step": 12410 }, { "epoch": 0.32686331314195416, "grad_norm": 3.7115375995635986, "learning_rate": 3.366736897550698e-05, "loss": 2.1462, "step": 12411 }, { "epoch": 
0.3268896497234659, "grad_norm": 1.8407247066497803, "learning_rate": 3.366605214643139e-05, "loss": 1.9583, "step": 12412 }, { "epoch": 0.3269159863049776, "grad_norm": 2.2176520824432373, "learning_rate": 3.3664735317355814e-05, "loss": 1.7093, "step": 12413 }, { "epoch": 0.32694232288648933, "grad_norm": 2.8115150928497314, "learning_rate": 3.366341848828022e-05, "loss": 1.3918, "step": 12414 }, { "epoch": 0.3269686594680011, "grad_norm": 4.907742977142334, "learning_rate": 3.366210165920464e-05, "loss": 1.931, "step": 12415 }, { "epoch": 0.32699499604951276, "grad_norm": 3.043381929397583, "learning_rate": 3.366078483012905e-05, "loss": 1.5289, "step": 12416 }, { "epoch": 0.3270213326310245, "grad_norm": 1.6548579931259155, "learning_rate": 3.365946800105346e-05, "loss": 1.6375, "step": 12417 }, { "epoch": 0.3270476692125362, "grad_norm": 1.8676625490188599, "learning_rate": 3.365815117197788e-05, "loss": 1.711, "step": 12418 }, { "epoch": 0.32707400579404794, "grad_norm": 2.144073247909546, "learning_rate": 3.3656834342902294e-05, "loss": 1.778, "step": 12419 }, { "epoch": 0.3271003423755596, "grad_norm": 1.729630708694458, "learning_rate": 3.365551751382671e-05, "loss": 1.9528, "step": 12420 }, { "epoch": 0.32712667895707137, "grad_norm": 3.874634027481079, "learning_rate": 3.365420068475112e-05, "loss": 1.094, "step": 12421 }, { "epoch": 0.3271530155385831, "grad_norm": 1.9382306337356567, "learning_rate": 3.365288385567554e-05, "loss": 1.9941, "step": 12422 }, { "epoch": 0.3271793521200948, "grad_norm": 1.4532071352005005, "learning_rate": 3.365156702659995e-05, "loss": 1.6583, "step": 12423 }, { "epoch": 0.32720568870160655, "grad_norm": 2.770235300064087, "learning_rate": 3.3650250197524365e-05, "loss": 2.3272, "step": 12424 }, { "epoch": 0.32723202528311823, "grad_norm": 4.797533988952637, "learning_rate": 3.3648933368448774e-05, "loss": 1.2025, "step": 12425 }, { "epoch": 0.32725836186463, "grad_norm": 1.8548167943954468, "learning_rate": 
3.364761653937319e-05, "loss": 2.0565, "step": 12426 }, { "epoch": 0.32728469844614166, "grad_norm": 1.925743579864502, "learning_rate": 3.3646299710297605e-05, "loss": 2.0501, "step": 12427 }, { "epoch": 0.3273110350276534, "grad_norm": 1.7694215774536133, "learning_rate": 3.364498288122202e-05, "loss": 1.8351, "step": 12428 }, { "epoch": 0.32733737160916515, "grad_norm": 2.0810256004333496, "learning_rate": 3.3643666052146436e-05, "loss": 1.4491, "step": 12429 }, { "epoch": 0.32736370819067684, "grad_norm": 1.7464072704315186, "learning_rate": 3.3642349223070845e-05, "loss": 2.1283, "step": 12430 }, { "epoch": 0.3273900447721886, "grad_norm": 2.4301838874816895, "learning_rate": 3.364103239399526e-05, "loss": 2.4133, "step": 12431 }, { "epoch": 0.32741638135370027, "grad_norm": 3.1562137603759766, "learning_rate": 3.3639715564919676e-05, "loss": 1.9187, "step": 12432 }, { "epoch": 0.327442717935212, "grad_norm": 2.2507240772247314, "learning_rate": 3.363839873584409e-05, "loss": 1.6733, "step": 12433 }, { "epoch": 0.32746905451672376, "grad_norm": 4.765883922576904, "learning_rate": 3.36370819067685e-05, "loss": 1.9338, "step": 12434 }, { "epoch": 0.32749539109823544, "grad_norm": 2.0596113204956055, "learning_rate": 3.3635765077692916e-05, "loss": 2.1882, "step": 12435 }, { "epoch": 0.3275217276797472, "grad_norm": 1.7240318059921265, "learning_rate": 3.3634448248617325e-05, "loss": 1.4548, "step": 12436 }, { "epoch": 0.3275480642612589, "grad_norm": 3.7175097465515137, "learning_rate": 3.363313141954175e-05, "loss": 1.1278, "step": 12437 }, { "epoch": 0.3275744008427706, "grad_norm": 1.616626262664795, "learning_rate": 3.3631814590466156e-05, "loss": 1.6143, "step": 12438 }, { "epoch": 0.3276007374242823, "grad_norm": 2.5229551792144775, "learning_rate": 3.363049776139057e-05, "loss": 2.9027, "step": 12439 }, { "epoch": 0.32762707400579405, "grad_norm": 1.7705267667770386, "learning_rate": 3.362918093231499e-05, "loss": 1.5911, "step": 12440 }, { "epoch": 
0.3276534105873058, "grad_norm": 2.192378044128418, "learning_rate": 3.36278641032394e-05, "loss": 2.8516, "step": 12441 }, { "epoch": 0.3276797471688175, "grad_norm": 3.0754435062408447, "learning_rate": 3.362654727416382e-05, "loss": 1.01, "step": 12442 }, { "epoch": 0.3277060837503292, "grad_norm": 2.4538142681121826, "learning_rate": 3.362523044508823e-05, "loss": 1.5274, "step": 12443 }, { "epoch": 0.3277324203318409, "grad_norm": 3.1316940784454346, "learning_rate": 3.362391361601264e-05, "loss": 1.5736, "step": 12444 }, { "epoch": 0.32775875691335266, "grad_norm": 2.8405120372772217, "learning_rate": 3.362259678693705e-05, "loss": 1.0018, "step": 12445 }, { "epoch": 0.32778509349486434, "grad_norm": 1.8347320556640625, "learning_rate": 3.3621279957861474e-05, "loss": 1.2207, "step": 12446 }, { "epoch": 0.3278114300763761, "grad_norm": 5.300283432006836, "learning_rate": 3.361996312878588e-05, "loss": 2.0342, "step": 12447 }, { "epoch": 0.32783776665788783, "grad_norm": 2.5827488899230957, "learning_rate": 3.36186462997103e-05, "loss": 2.1086, "step": 12448 }, { "epoch": 0.3278641032393995, "grad_norm": 3.9513463973999023, "learning_rate": 3.3617329470634714e-05, "loss": 1.9389, "step": 12449 }, { "epoch": 0.32789043982091126, "grad_norm": 3.1131837368011475, "learning_rate": 3.361601264155912e-05, "loss": 1.8678, "step": 12450 }, { "epoch": 0.32791677640242295, "grad_norm": 6.7721848487854, "learning_rate": 3.3614695812483546e-05, "loss": 1.4235, "step": 12451 }, { "epoch": 0.3279431129839347, "grad_norm": 2.0119690895080566, "learning_rate": 3.3613378983407954e-05, "loss": 1.7649, "step": 12452 }, { "epoch": 0.3279694495654464, "grad_norm": 1.8033760786056519, "learning_rate": 3.361206215433237e-05, "loss": 1.6084, "step": 12453 }, { "epoch": 0.3279957861469581, "grad_norm": 1.9612101316452026, "learning_rate": 3.361074532525678e-05, "loss": 1.449, "step": 12454 }, { "epoch": 0.32802212272846987, "grad_norm": 4.113655090332031, "learning_rate": 
3.36094284961812e-05, "loss": 1.3775, "step": 12455 }, { "epoch": 0.32804845930998155, "grad_norm": 1.8795216083526611, "learning_rate": 3.360811166710561e-05, "loss": 1.908, "step": 12456 }, { "epoch": 0.3280747958914933, "grad_norm": 2.2812535762786865, "learning_rate": 3.3606794838030026e-05, "loss": 2.4844, "step": 12457 }, { "epoch": 0.328101132473005, "grad_norm": 2.2446975708007812, "learning_rate": 3.360547800895444e-05, "loss": 0.8677, "step": 12458 }, { "epoch": 0.32812746905451673, "grad_norm": 2.650233268737793, "learning_rate": 3.360416117987885e-05, "loss": 1.5683, "step": 12459 }, { "epoch": 0.3281538056360284, "grad_norm": 1.758612036705017, "learning_rate": 3.360284435080327e-05, "loss": 1.8904, "step": 12460 }, { "epoch": 0.32818014221754016, "grad_norm": 2.0016775131225586, "learning_rate": 3.360152752172768e-05, "loss": 1.5744, "step": 12461 }, { "epoch": 0.3282064787990519, "grad_norm": 2.1309597492218018, "learning_rate": 3.36002106926521e-05, "loss": 1.4482, "step": 12462 }, { "epoch": 0.3282328153805636, "grad_norm": 1.9692991971969604, "learning_rate": 3.3598893863576506e-05, "loss": 1.775, "step": 12463 }, { "epoch": 0.32825915196207534, "grad_norm": 2.5524942874908447, "learning_rate": 3.359757703450092e-05, "loss": 1.7822, "step": 12464 }, { "epoch": 0.328285488543587, "grad_norm": 1.9405978918075562, "learning_rate": 3.359626020542534e-05, "loss": 1.4305, "step": 12465 }, { "epoch": 0.32831182512509877, "grad_norm": 1.656632423400879, "learning_rate": 3.359494337634975e-05, "loss": 1.082, "step": 12466 }, { "epoch": 0.3283381617066105, "grad_norm": 3.533543825149536, "learning_rate": 3.359362654727417e-05, "loss": 2.1386, "step": 12467 }, { "epoch": 0.3283644982881222, "grad_norm": 2.101546049118042, "learning_rate": 3.359230971819858e-05, "loss": 1.5001, "step": 12468 }, { "epoch": 0.32839083486963394, "grad_norm": 2.174426555633545, "learning_rate": 3.359099288912299e-05, "loss": 1.789, "step": 12469 }, { "epoch": 0.32841717145114563, 
"grad_norm": 7.761478424072266, "learning_rate": 3.358967606004741e-05, "loss": 1.0197, "step": 12470 }, { "epoch": 0.32844350803265737, "grad_norm": 2.63983154296875, "learning_rate": 3.3588359230971824e-05, "loss": 2.0716, "step": 12471 }, { "epoch": 0.32846984461416906, "grad_norm": 2.8965344429016113, "learning_rate": 3.358704240189623e-05, "loss": 1.2944, "step": 12472 }, { "epoch": 0.3284961811956808, "grad_norm": 5.850398540496826, "learning_rate": 3.358572557282065e-05, "loss": 1.8968, "step": 12473 }, { "epoch": 0.32852251777719255, "grad_norm": 2.1706478595733643, "learning_rate": 3.3584408743745064e-05, "loss": 2.2624, "step": 12474 }, { "epoch": 0.32854885435870423, "grad_norm": 2.218282461166382, "learning_rate": 3.358309191466948e-05, "loss": 1.8951, "step": 12475 }, { "epoch": 0.328575190940216, "grad_norm": 2.908212661743164, "learning_rate": 3.3581775085593895e-05, "loss": 1.4327, "step": 12476 }, { "epoch": 0.32860152752172767, "grad_norm": 1.7072484493255615, "learning_rate": 3.3580458256518304e-05, "loss": 1.9014, "step": 12477 }, { "epoch": 0.3286278641032394, "grad_norm": 1.9307185411453247, "learning_rate": 3.357914142744272e-05, "loss": 0.7581, "step": 12478 }, { "epoch": 0.3286542006847511, "grad_norm": 1.5613802671432495, "learning_rate": 3.3577824598367135e-05, "loss": 1.4153, "step": 12479 }, { "epoch": 0.32868053726626284, "grad_norm": 1.8219326734542847, "learning_rate": 3.357650776929155e-05, "loss": 1.937, "step": 12480 }, { "epoch": 0.3287068738477746, "grad_norm": 2.997546672821045, "learning_rate": 3.357519094021596e-05, "loss": 1.003, "step": 12481 }, { "epoch": 0.32873321042928627, "grad_norm": 1.6718472242355347, "learning_rate": 3.3573874111140375e-05, "loss": 0.8359, "step": 12482 }, { "epoch": 0.328759547010798, "grad_norm": 1.7890629768371582, "learning_rate": 3.357255728206479e-05, "loss": 0.386, "step": 12483 }, { "epoch": 0.3287858835923097, "grad_norm": 1.9585483074188232, "learning_rate": 3.3571240452989206e-05, 
"loss": 1.9386, "step": 12484 }, { "epoch": 0.32881222017382145, "grad_norm": 1.6718919277191162, "learning_rate": 3.356992362391362e-05, "loss": 1.7794, "step": 12485 }, { "epoch": 0.32883855675533313, "grad_norm": 2.9827446937561035, "learning_rate": 3.356860679483803e-05, "loss": 1.6119, "step": 12486 }, { "epoch": 0.3288648933368449, "grad_norm": 1.8520004749298096, "learning_rate": 3.3567289965762446e-05, "loss": 1.3068, "step": 12487 }, { "epoch": 0.3288912299183566, "grad_norm": 1.957453966140747, "learning_rate": 3.356597313668686e-05, "loss": 2.1151, "step": 12488 }, { "epoch": 0.3289175664998683, "grad_norm": 2.7240259647369385, "learning_rate": 3.356465630761128e-05, "loss": 1.6568, "step": 12489 }, { "epoch": 0.32894390308138005, "grad_norm": 1.7372244596481323, "learning_rate": 3.3563339478535686e-05, "loss": 1.5207, "step": 12490 }, { "epoch": 0.32897023966289174, "grad_norm": 1.7322607040405273, "learning_rate": 3.35620226494601e-05, "loss": 2.1428, "step": 12491 }, { "epoch": 0.3289965762444035, "grad_norm": 1.8577086925506592, "learning_rate": 3.356070582038451e-05, "loss": 1.2703, "step": 12492 }, { "epoch": 0.32902291282591517, "grad_norm": 1.6735259294509888, "learning_rate": 3.355938899130893e-05, "loss": 1.4099, "step": 12493 }, { "epoch": 0.3290492494074269, "grad_norm": 3.109797954559326, "learning_rate": 3.355807216223334e-05, "loss": 1.8465, "step": 12494 }, { "epoch": 0.32907558598893866, "grad_norm": 1.9641128778457642, "learning_rate": 3.355675533315776e-05, "loss": 1.2254, "step": 12495 }, { "epoch": 0.32910192257045034, "grad_norm": 3.574244737625122, "learning_rate": 3.355543850408217e-05, "loss": 1.7763, "step": 12496 }, { "epoch": 0.3291282591519621, "grad_norm": 1.7841097116470337, "learning_rate": 3.355412167500658e-05, "loss": 1.7523, "step": 12497 }, { "epoch": 0.3291545957334738, "grad_norm": 2.0619144439697266, "learning_rate": 3.3552804845931004e-05, "loss": 1.8517, "step": 12498 }, { "epoch": 0.3291809323149855, 
"grad_norm": 2.235921621322632, "learning_rate": 3.355148801685541e-05, "loss": 1.2338, "step": 12499 }, { "epoch": 0.3292072688964972, "grad_norm": 4.301616191864014, "learning_rate": 3.355017118777983e-05, "loss": 1.2537, "step": 12500 }, { "epoch": 0.32923360547800895, "grad_norm": 3.540292739868164, "learning_rate": 3.354885435870424e-05, "loss": 2.1768, "step": 12501 }, { "epoch": 0.3292599420595207, "grad_norm": 1.493011713027954, "learning_rate": 3.354753752962865e-05, "loss": 0.552, "step": 12502 }, { "epoch": 0.3292862786410324, "grad_norm": 4.220021724700928, "learning_rate": 3.354622070055307e-05, "loss": 1.2234, "step": 12503 }, { "epoch": 0.3293126152225441, "grad_norm": 1.9487255811691284, "learning_rate": 3.3544903871477484e-05, "loss": 1.1783, "step": 12504 }, { "epoch": 0.3293389518040558, "grad_norm": 1.8226747512817383, "learning_rate": 3.35435870424019e-05, "loss": 1.3528, "step": 12505 }, { "epoch": 0.32936528838556756, "grad_norm": 1.8626196384429932, "learning_rate": 3.354227021332631e-05, "loss": 2.0449, "step": 12506 }, { "epoch": 0.3293916249670793, "grad_norm": 1.6120344400405884, "learning_rate": 3.354095338425073e-05, "loss": 1.9424, "step": 12507 }, { "epoch": 0.329417961548591, "grad_norm": 3.8325703144073486, "learning_rate": 3.353963655517514e-05, "loss": 1.5654, "step": 12508 }, { "epoch": 0.32944429813010273, "grad_norm": 1.9195082187652588, "learning_rate": 3.3538319726099555e-05, "loss": 2.3664, "step": 12509 }, { "epoch": 0.3294706347116144, "grad_norm": 1.8883514404296875, "learning_rate": 3.3537002897023964e-05, "loss": 1.9992, "step": 12510 }, { "epoch": 0.32949697129312616, "grad_norm": 3.385572671890259, "learning_rate": 3.353568606794838e-05, "loss": 1.2949, "step": 12511 }, { "epoch": 0.32952330787463785, "grad_norm": 2.470580816268921, "learning_rate": 3.3534369238872795e-05, "loss": 2.3398, "step": 12512 }, { "epoch": 0.3295496444561496, "grad_norm": 1.830094337463379, "learning_rate": 3.353305240979721e-05, "loss": 
1.8316, "step": 12513 }, { "epoch": 0.32957598103766134, "grad_norm": 3.9431777000427246, "learning_rate": 3.3531735580721627e-05, "loss": 1.1087, "step": 12514 }, { "epoch": 0.329602317619173, "grad_norm": 1.9579212665557861, "learning_rate": 3.3530418751646035e-05, "loss": 1.177, "step": 12515 }, { "epoch": 0.32962865420068477, "grad_norm": 1.8958979845046997, "learning_rate": 3.352910192257045e-05, "loss": 0.5646, "step": 12516 }, { "epoch": 0.32965499078219646, "grad_norm": 2.0430240631103516, "learning_rate": 3.3527785093494867e-05, "loss": 1.7158, "step": 12517 }, { "epoch": 0.3296813273637082, "grad_norm": 4.056602954864502, "learning_rate": 3.352646826441928e-05, "loss": 1.1088, "step": 12518 }, { "epoch": 0.3297076639452199, "grad_norm": 2.469667434692383, "learning_rate": 3.352515143534369e-05, "loss": 1.8398, "step": 12519 }, { "epoch": 0.32973400052673163, "grad_norm": 4.454177379608154, "learning_rate": 3.3523834606268107e-05, "loss": 2.3589, "step": 12520 }, { "epoch": 0.3297603371082434, "grad_norm": 4.095808029174805, "learning_rate": 3.352251777719252e-05, "loss": 1.4604, "step": 12521 }, { "epoch": 0.32978667368975506, "grad_norm": 1.9336916208267212, "learning_rate": 3.352120094811694e-05, "loss": 1.6101, "step": 12522 }, { "epoch": 0.3298130102712668, "grad_norm": 2.4520530700683594, "learning_rate": 3.3519884119041353e-05, "loss": 2.602, "step": 12523 }, { "epoch": 0.3298393468527785, "grad_norm": 2.2900748252868652, "learning_rate": 3.351856728996576e-05, "loss": 1.3786, "step": 12524 }, { "epoch": 0.32986568343429024, "grad_norm": 2.832521438598633, "learning_rate": 3.351725046089018e-05, "loss": 1.1313, "step": 12525 }, { "epoch": 0.3298920200158019, "grad_norm": 1.9307314157485962, "learning_rate": 3.3515933631814593e-05, "loss": 0.5179, "step": 12526 }, { "epoch": 0.32991835659731367, "grad_norm": 1.689806580543518, "learning_rate": 3.351461680273901e-05, "loss": 1.8735, "step": 12527 }, { "epoch": 0.3299446931788254, "grad_norm": 
1.8709040880203247, "learning_rate": 3.351329997366342e-05, "loss": 2.2646, "step": 12528 }, { "epoch": 0.3299710297603371, "grad_norm": 4.464748859405518, "learning_rate": 3.3511983144587833e-05, "loss": 1.6649, "step": 12529 }, { "epoch": 0.32999736634184884, "grad_norm": 5.140900611877441, "learning_rate": 3.351066631551225e-05, "loss": 1.8141, "step": 12530 }, { "epoch": 0.33002370292336053, "grad_norm": 2.42061710357666, "learning_rate": 3.3509349486436665e-05, "loss": 1.9346, "step": 12531 }, { "epoch": 0.3300500395048723, "grad_norm": 2.0868144035339355, "learning_rate": 3.350803265736108e-05, "loss": 2.2933, "step": 12532 }, { "epoch": 0.33007637608638396, "grad_norm": 1.780080795288086, "learning_rate": 3.350671582828549e-05, "loss": 2.1702, "step": 12533 }, { "epoch": 0.3301027126678957, "grad_norm": 1.5264103412628174, "learning_rate": 3.3505398999209905e-05, "loss": 1.8848, "step": 12534 }, { "epoch": 0.33012904924940745, "grad_norm": 1.7465996742248535, "learning_rate": 3.3504082170134313e-05, "loss": 2.0102, "step": 12535 }, { "epoch": 0.33015538583091913, "grad_norm": 1.7347049713134766, "learning_rate": 3.3502765341058736e-05, "loss": 2.2163, "step": 12536 }, { "epoch": 0.3301817224124309, "grad_norm": 6.342837810516357, "learning_rate": 3.3501448511983145e-05, "loss": 1.8344, "step": 12537 }, { "epoch": 0.33020805899394257, "grad_norm": 3.3314433097839355, "learning_rate": 3.350013168290756e-05, "loss": 1.1838, "step": 12538 }, { "epoch": 0.3302343955754543, "grad_norm": 1.7712961435317993, "learning_rate": 3.349881485383197e-05, "loss": 1.7312, "step": 12539 }, { "epoch": 0.33026073215696605, "grad_norm": 2.7138397693634033, "learning_rate": 3.349749802475639e-05, "loss": 2.2097, "step": 12540 }, { "epoch": 0.33028706873847774, "grad_norm": 1.8526287078857422, "learning_rate": 3.34961811956808e-05, "loss": 1.7366, "step": 12541 }, { "epoch": 0.3303134053199895, "grad_norm": 2.1880836486816406, "learning_rate": 3.3494864366605216e-05, "loss": 
1.5198, "step": 12542 }, { "epoch": 0.33033974190150117, "grad_norm": 2.667555809020996, "learning_rate": 3.349354753752963e-05, "loss": 1.75, "step": 12543 }, { "epoch": 0.3303660784830129, "grad_norm": 3.640123128890991, "learning_rate": 3.349223070845404e-05, "loss": 2.2227, "step": 12544 }, { "epoch": 0.3303924150645246, "grad_norm": 3.579524517059326, "learning_rate": 3.349091387937846e-05, "loss": 1.6633, "step": 12545 }, { "epoch": 0.33041875164603635, "grad_norm": 2.2585041522979736, "learning_rate": 3.348959705030287e-05, "loss": 0.4713, "step": 12546 }, { "epoch": 0.3304450882275481, "grad_norm": 2.007910966873169, "learning_rate": 3.348828022122729e-05, "loss": 2.0308, "step": 12547 }, { "epoch": 0.3304714248090598, "grad_norm": 2.683708906173706, "learning_rate": 3.3486963392151696e-05, "loss": 1.2735, "step": 12548 }, { "epoch": 0.3304977613905715, "grad_norm": 2.4560959339141846, "learning_rate": 3.348564656307611e-05, "loss": 0.7222, "step": 12549 }, { "epoch": 0.3305240979720832, "grad_norm": 4.675262451171875, "learning_rate": 3.348432973400053e-05, "loss": 1.3754, "step": 12550 }, { "epoch": 0.33055043455359495, "grad_norm": 1.6472126245498657, "learning_rate": 3.348301290492494e-05, "loss": 1.6764, "step": 12551 }, { "epoch": 0.33057677113510664, "grad_norm": 4.038247108459473, "learning_rate": 3.348169607584936e-05, "loss": 1.5739, "step": 12552 }, { "epoch": 0.3306031077166184, "grad_norm": 2.0009679794311523, "learning_rate": 3.348037924677377e-05, "loss": 2.3286, "step": 12553 }, { "epoch": 0.3306294442981301, "grad_norm": 2.8953588008880615, "learning_rate": 3.347906241769819e-05, "loss": 1.573, "step": 12554 }, { "epoch": 0.3306557808796418, "grad_norm": 4.2981977462768555, "learning_rate": 3.34777455886226e-05, "loss": 1.5264, "step": 12555 }, { "epoch": 0.33068211746115356, "grad_norm": 1.6999446153640747, "learning_rate": 3.3476428759547014e-05, "loss": 0.4752, "step": 12556 }, { "epoch": 0.33070845404266525, "grad_norm": 
1.5039056539535522, "learning_rate": 3.347511193047142e-05, "loss": 1.8579, "step": 12557 }, { "epoch": 0.330734790624177, "grad_norm": 1.9936712980270386, "learning_rate": 3.347379510139584e-05, "loss": 2.0047, "step": 12558 }, { "epoch": 0.3307611272056887, "grad_norm": 2.062723398208618, "learning_rate": 3.3472478272320254e-05, "loss": 2.0916, "step": 12559 }, { "epoch": 0.3307874637872004, "grad_norm": 2.9814114570617676, "learning_rate": 3.347116144324467e-05, "loss": 1.3651, "step": 12560 }, { "epoch": 0.33081380036871216, "grad_norm": 1.6131727695465088, "learning_rate": 3.3469844614169085e-05, "loss": 2.3453, "step": 12561 }, { "epoch": 0.33084013695022385, "grad_norm": 1.433284044265747, "learning_rate": 3.3468527785093494e-05, "loss": 2.0605, "step": 12562 }, { "epoch": 0.3308664735317356, "grad_norm": 1.5372118949890137, "learning_rate": 3.346721095601791e-05, "loss": 1.7518, "step": 12563 }, { "epoch": 0.3308928101132473, "grad_norm": 3.2360894680023193, "learning_rate": 3.3465894126942325e-05, "loss": 1.3049, "step": 12564 }, { "epoch": 0.330919146694759, "grad_norm": 1.6522250175476074, "learning_rate": 3.346457729786674e-05, "loss": 1.4266, "step": 12565 }, { "epoch": 0.3309454832762707, "grad_norm": 1.8428375720977783, "learning_rate": 3.346326046879115e-05, "loss": 2.0878, "step": 12566 }, { "epoch": 0.33097181985778246, "grad_norm": 2.827117681503296, "learning_rate": 3.3461943639715565e-05, "loss": 1.0205, "step": 12567 }, { "epoch": 0.3309981564392942, "grad_norm": 1.3901176452636719, "learning_rate": 3.346062681063998e-05, "loss": 1.7708, "step": 12568 }, { "epoch": 0.3310244930208059, "grad_norm": 1.8480952978134155, "learning_rate": 3.3459309981564396e-05, "loss": 1.5716, "step": 12569 }, { "epoch": 0.33105082960231763, "grad_norm": 3.0842599868774414, "learning_rate": 3.345799315248881e-05, "loss": 0.6009, "step": 12570 }, { "epoch": 0.3310771661838293, "grad_norm": 3.3772425651550293, "learning_rate": 3.345667632341322e-05, "loss": 1.7679, 
"step": 12571 }, { "epoch": 0.33110350276534106, "grad_norm": 2.100574016571045, "learning_rate": 3.3455359494337636e-05, "loss": 1.5663, "step": 12572 }, { "epoch": 0.3311298393468528, "grad_norm": 1.4141217470169067, "learning_rate": 3.345404266526205e-05, "loss": 1.4185, "step": 12573 }, { "epoch": 0.3311561759283645, "grad_norm": 1.649835467338562, "learning_rate": 3.345272583618647e-05, "loss": 2.0746, "step": 12574 }, { "epoch": 0.33118251250987624, "grad_norm": 1.8346282243728638, "learning_rate": 3.3451409007110876e-05, "loss": 1.5286, "step": 12575 }, { "epoch": 0.3312088490913879, "grad_norm": 1.6538931131362915, "learning_rate": 3.345009217803529e-05, "loss": 2.0253, "step": 12576 }, { "epoch": 0.33123518567289967, "grad_norm": 2.7198596000671387, "learning_rate": 3.344877534895971e-05, "loss": 1.4781, "step": 12577 }, { "epoch": 0.33126152225441136, "grad_norm": 2.6430201530456543, "learning_rate": 3.344745851988412e-05, "loss": 1.7649, "step": 12578 }, { "epoch": 0.3312878588359231, "grad_norm": 2.3806519508361816, "learning_rate": 3.344614169080854e-05, "loss": 1.5382, "step": 12579 }, { "epoch": 0.33131419541743484, "grad_norm": 1.9562166929244995, "learning_rate": 3.344482486173295e-05, "loss": 1.769, "step": 12580 }, { "epoch": 0.33134053199894653, "grad_norm": 2.0368337631225586, "learning_rate": 3.344350803265736e-05, "loss": 1.2347, "step": 12581 }, { "epoch": 0.3313668685804583, "grad_norm": 1.9362303018569946, "learning_rate": 3.344219120358177e-05, "loss": 1.8617, "step": 12582 }, { "epoch": 0.33139320516196996, "grad_norm": 2.699650764465332, "learning_rate": 3.3440874374506194e-05, "loss": 1.3428, "step": 12583 }, { "epoch": 0.3314195417434817, "grad_norm": 1.5381124019622803, "learning_rate": 3.34395575454306e-05, "loss": 2.0631, "step": 12584 }, { "epoch": 0.3314458783249934, "grad_norm": 2.6243417263031006, "learning_rate": 3.343824071635502e-05, "loss": 1.845, "step": 12585 }, { "epoch": 0.33147221490650514, "grad_norm": 
2.563018798828125, "learning_rate": 3.3436923887279434e-05, "loss": 1.5551, "step": 12586 }, { "epoch": 0.3314985514880169, "grad_norm": 1.8089511394500732, "learning_rate": 3.343560705820385e-05, "loss": 2.6125, "step": 12587 }, { "epoch": 0.33152488806952857, "grad_norm": 1.701754093170166, "learning_rate": 3.3434290229128266e-05, "loss": 1.2412, "step": 12588 }, { "epoch": 0.3315512246510403, "grad_norm": 1.690680742263794, "learning_rate": 3.3432973400052674e-05, "loss": 0.5706, "step": 12589 }, { "epoch": 0.331577561232552, "grad_norm": 3.051246404647827, "learning_rate": 3.343165657097709e-05, "loss": 1.0539, "step": 12590 }, { "epoch": 0.33160389781406374, "grad_norm": 5.108091354370117, "learning_rate": 3.34303397419015e-05, "loss": 0.8685, "step": 12591 }, { "epoch": 0.33163023439557543, "grad_norm": 1.8559662103652954, "learning_rate": 3.342902291282592e-05, "loss": 2.1733, "step": 12592 }, { "epoch": 0.3316565709770872, "grad_norm": 1.5760260820388794, "learning_rate": 3.342770608375033e-05, "loss": 2.0728, "step": 12593 }, { "epoch": 0.3316829075585989, "grad_norm": 1.580976128578186, "learning_rate": 3.3426389254674746e-05, "loss": 1.4601, "step": 12594 }, { "epoch": 0.3317092441401106, "grad_norm": 1.7760361433029175, "learning_rate": 3.3425072425599154e-05, "loss": 0.6615, "step": 12595 }, { "epoch": 0.33173558072162235, "grad_norm": 1.6805483102798462, "learning_rate": 3.342375559652357e-05, "loss": 1.7424, "step": 12596 }, { "epoch": 0.33176191730313404, "grad_norm": 2.085524082183838, "learning_rate": 3.3422438767447986e-05, "loss": 1.8532, "step": 12597 }, { "epoch": 0.3317882538846458, "grad_norm": 1.7246835231781006, "learning_rate": 3.34211219383724e-05, "loss": 1.7175, "step": 12598 }, { "epoch": 0.33181459046615747, "grad_norm": 2.244180679321289, "learning_rate": 3.341980510929682e-05, "loss": 0.5694, "step": 12599 }, { "epoch": 0.3318409270476692, "grad_norm": 5.295844554901123, "learning_rate": 3.3418488280221226e-05, "loss": 2.1871, 
"step": 12600 }, { "epoch": 0.33186726362918095, "grad_norm": 1.8949546813964844, "learning_rate": 3.341717145114564e-05, "loss": 1.3858, "step": 12601 }, { "epoch": 0.33189360021069264, "grad_norm": 1.990843415260315, "learning_rate": 3.341585462207006e-05, "loss": 1.1522, "step": 12602 }, { "epoch": 0.3319199367922044, "grad_norm": null, "learning_rate": 3.341585462207006e-05, "loss": 2.7428, "step": 12603 }, { "epoch": 0.33194627337371607, "grad_norm": 3.1444976329803467, "learning_rate": 3.341453779299447e-05, "loss": 0.9988, "step": 12604 }, { "epoch": 0.3319726099552278, "grad_norm": 2.7017078399658203, "learning_rate": 3.341322096391888e-05, "loss": 1.7436, "step": 12605 }, { "epoch": 0.33199894653673956, "grad_norm": 1.9363417625427246, "learning_rate": 3.34119041348433e-05, "loss": 1.3717, "step": 12606 }, { "epoch": 0.33202528311825125, "grad_norm": 2.046461820602417, "learning_rate": 3.341058730576771e-05, "loss": 1.6626, "step": 12607 }, { "epoch": 0.332051619699763, "grad_norm": 2.164912462234497, "learning_rate": 3.340927047669213e-05, "loss": 1.0039, "step": 12608 }, { "epoch": 0.3320779562812747, "grad_norm": 1.5574190616607666, "learning_rate": 3.3407953647616544e-05, "loss": 2.043, "step": 12609 }, { "epoch": 0.3321042928627864, "grad_norm": 2.184595823287964, "learning_rate": 3.340663681854095e-05, "loss": 2.1146, "step": 12610 }, { "epoch": 0.3321306294442981, "grad_norm": 4.399532794952393, "learning_rate": 3.340531998946537e-05, "loss": 0.994, "step": 12611 }, { "epoch": 0.33215696602580985, "grad_norm": 2.892519950866699, "learning_rate": 3.3404003160389784e-05, "loss": 1.4885, "step": 12612 }, { "epoch": 0.3321833026073216, "grad_norm": 2.8012919425964355, "learning_rate": 3.34026863313142e-05, "loss": 1.4831, "step": 12613 }, { "epoch": 0.3322096391888333, "grad_norm": 2.3918423652648926, "learning_rate": 3.340136950223861e-05, "loss": 1.512, "step": 12614 }, { "epoch": 0.332235975770345, "grad_norm": 2.1756582260131836, "learning_rate": 
3.3400052673163024e-05, "loss": 0.7221, "step": 12615 }, { "epoch": 0.3322623123518567, "grad_norm": 1.7098534107208252, "learning_rate": 3.339873584408744e-05, "loss": 2.1871, "step": 12616 }, { "epoch": 0.33228864893336846, "grad_norm": 2.1105356216430664, "learning_rate": 3.3397419015011855e-05, "loss": 2.1751, "step": 12617 }, { "epoch": 0.33231498551488015, "grad_norm": 1.6752023696899414, "learning_rate": 3.339610218593627e-05, "loss": 1.6417, "step": 12618 }, { "epoch": 0.3323413220963919, "grad_norm": 3.664382219314575, "learning_rate": 3.339478535686068e-05, "loss": 1.3972, "step": 12619 }, { "epoch": 0.33236765867790363, "grad_norm": 1.8466389179229736, "learning_rate": 3.3393468527785095e-05, "loss": 1.7655, "step": 12620 }, { "epoch": 0.3323939952594153, "grad_norm": 2.00946307182312, "learning_rate": 3.339215169870951e-05, "loss": 2.0125, "step": 12621 }, { "epoch": 0.33242033184092706, "grad_norm": 2.6447982788085938, "learning_rate": 3.3390834869633926e-05, "loss": 0.6782, "step": 12622 }, { "epoch": 0.33244666842243875, "grad_norm": 2.420147180557251, "learning_rate": 3.3389518040558335e-05, "loss": 1.4998, "step": 12623 }, { "epoch": 0.3324730050039505, "grad_norm": 3.5737648010253906, "learning_rate": 3.338820121148275e-05, "loss": 2.6596, "step": 12624 }, { "epoch": 0.3324993415854622, "grad_norm": 3.727470874786377, "learning_rate": 3.3386884382407166e-05, "loss": 1.3438, "step": 12625 }, { "epoch": 0.3325256781669739, "grad_norm": 1.6507397890090942, "learning_rate": 3.338556755333158e-05, "loss": 0.3542, "step": 12626 }, { "epoch": 0.33255201474848567, "grad_norm": 1.7301124334335327, "learning_rate": 3.3384250724256e-05, "loss": 2.1822, "step": 12627 }, { "epoch": 0.33257835132999736, "grad_norm": 3.38923716545105, "learning_rate": 3.3382933895180406e-05, "loss": 0.9428, "step": 12628 }, { "epoch": 0.3326046879115091, "grad_norm": 3.6878015995025635, "learning_rate": 3.338161706610482e-05, "loss": 1.372, "step": 12629 }, { "epoch": 
0.3326310244930208, "grad_norm": 1.808009147644043, "learning_rate": 3.338030023702923e-05, "loss": 1.4246, "step": 12630 }, { "epoch": 0.33265736107453253, "grad_norm": 1.6601111888885498, "learning_rate": 3.337898340795365e-05, "loss": 2.0032, "step": 12631 }, { "epoch": 0.3326836976560442, "grad_norm": 1.7675055265426636, "learning_rate": 3.337766657887806e-05, "loss": 1.874, "step": 12632 }, { "epoch": 0.33271003423755596, "grad_norm": 1.8963303565979004, "learning_rate": 3.337634974980248e-05, "loss": 1.7249, "step": 12633 }, { "epoch": 0.3327363708190677, "grad_norm": 2.998598337173462, "learning_rate": 3.337503292072689e-05, "loss": 1.7926, "step": 12634 }, { "epoch": 0.3327627074005794, "grad_norm": 2.936476945877075, "learning_rate": 3.33737160916513e-05, "loss": 1.5406, "step": 12635 }, { "epoch": 0.33278904398209114, "grad_norm": 2.736581325531006, "learning_rate": 3.3372399262575724e-05, "loss": 1.3174, "step": 12636 }, { "epoch": 0.3328153805636028, "grad_norm": 3.6287879943847656, "learning_rate": 3.337108243350013e-05, "loss": 1.6855, "step": 12637 }, { "epoch": 0.33284171714511457, "grad_norm": 1.944210171699524, "learning_rate": 3.336976560442455e-05, "loss": 1.9133, "step": 12638 }, { "epoch": 0.33286805372662626, "grad_norm": 1.5569289922714233, "learning_rate": 3.336844877534896e-05, "loss": 1.8468, "step": 12639 }, { "epoch": 0.332894390308138, "grad_norm": 1.9692211151123047, "learning_rate": 3.336713194627338e-05, "loss": 1.8621, "step": 12640 }, { "epoch": 0.33292072688964974, "grad_norm": 4.389628887176514, "learning_rate": 3.336581511719779e-05, "loss": 0.577, "step": 12641 }, { "epoch": 0.33294706347116143, "grad_norm": 2.817185163497925, "learning_rate": 3.3364498288122204e-05, "loss": 1.8293, "step": 12642 }, { "epoch": 0.3329734000526732, "grad_norm": 2.639174222946167, "learning_rate": 3.336318145904661e-05, "loss": 1.4784, "step": 12643 }, { "epoch": 0.33299973663418486, "grad_norm": 2.9981637001037598, "learning_rate": 
3.336186462997103e-05, "loss": 2.342, "step": 12644 }, { "epoch": 0.3330260732156966, "grad_norm": 1.6784089803695679, "learning_rate": 3.3360547800895444e-05, "loss": 1.8669, "step": 12645 }, { "epoch": 0.33305240979720835, "grad_norm": 1.9349316358566284, "learning_rate": 3.335923097181986e-05, "loss": 1.801, "step": 12646 }, { "epoch": 0.33307874637872004, "grad_norm": 1.6820112466812134, "learning_rate": 3.3357914142744275e-05, "loss": 1.9513, "step": 12647 }, { "epoch": 0.3331050829602318, "grad_norm": 2.608950614929199, "learning_rate": 3.3356597313668684e-05, "loss": 1.0613, "step": 12648 }, { "epoch": 0.33313141954174347, "grad_norm": 2.393916606903076, "learning_rate": 3.33552804845931e-05, "loss": 1.0479, "step": 12649 }, { "epoch": 0.3331577561232552, "grad_norm": 2.3476123809814453, "learning_rate": 3.3353963655517515e-05, "loss": 2.2433, "step": 12650 }, { "epoch": 0.3331840927047669, "grad_norm": 1.54328191280365, "learning_rate": 3.335264682644193e-05, "loss": 2.1673, "step": 12651 }, { "epoch": 0.33321042928627864, "grad_norm": 1.5280935764312744, "learning_rate": 3.335132999736634e-05, "loss": 1.7752, "step": 12652 }, { "epoch": 0.3332367658677904, "grad_norm": 2.336001396179199, "learning_rate": 3.3350013168290755e-05, "loss": 1.8214, "step": 12653 }, { "epoch": 0.3332631024493021, "grad_norm": 2.140209436416626, "learning_rate": 3.334869633921517e-05, "loss": 1.7811, "step": 12654 }, { "epoch": 0.3332894390308138, "grad_norm": 2.073690891265869, "learning_rate": 3.334737951013959e-05, "loss": 2.1738, "step": 12655 }, { "epoch": 0.3333157756123255, "grad_norm": 3.458463430404663, "learning_rate": 3.3346062681064e-05, "loss": 1.7545, "step": 12656 }, { "epoch": 0.33334211219383725, "grad_norm": 2.605024814605713, "learning_rate": 3.334474585198841e-05, "loss": 1.2952, "step": 12657 }, { "epoch": 0.33336844877534894, "grad_norm": 2.0049028396606445, "learning_rate": 3.334342902291283e-05, "loss": 1.8393, "step": 12658 }, { "epoch": 
0.3333947853568607, "grad_norm": 2.1910781860351562, "learning_rate": 3.334211219383724e-05, "loss": 1.6451, "step": 12659 }, { "epoch": 0.3334211219383724, "grad_norm": 1.8754687309265137, "learning_rate": 3.334079536476166e-05, "loss": 2.0493, "step": 12660 }, { "epoch": 0.3334474585198841, "grad_norm": 1.9820432662963867, "learning_rate": 3.333947853568607e-05, "loss": 1.6025, "step": 12661 }, { "epoch": 0.33347379510139585, "grad_norm": 1.8536455631256104, "learning_rate": 3.333816170661048e-05, "loss": 1.8685, "step": 12662 }, { "epoch": 0.33350013168290754, "grad_norm": 1.7496771812438965, "learning_rate": 3.33368448775349e-05, "loss": 1.4563, "step": 12663 }, { "epoch": 0.3335264682644193, "grad_norm": 1.5448086261749268, "learning_rate": 3.3335528048459313e-05, "loss": 2.2682, "step": 12664 }, { "epoch": 0.333552804845931, "grad_norm": 1.804081916809082, "learning_rate": 3.333421121938373e-05, "loss": 1.9859, "step": 12665 }, { "epoch": 0.3335791414274427, "grad_norm": 2.6542768478393555, "learning_rate": 3.333289439030814e-05, "loss": 2.5179, "step": 12666 }, { "epoch": 0.33360547800895446, "grad_norm": 3.9531445503234863, "learning_rate": 3.3331577561232553e-05, "loss": 1.6019, "step": 12667 }, { "epoch": 0.33363181459046615, "grad_norm": 2.3091623783111572, "learning_rate": 3.333026073215696e-05, "loss": 1.3007, "step": 12668 }, { "epoch": 0.3336581511719779, "grad_norm": 2.5285756587982178, "learning_rate": 3.3328943903081385e-05, "loss": 2.961, "step": 12669 }, { "epoch": 0.3336844877534896, "grad_norm": 2.592228889465332, "learning_rate": 3.3327627074005793e-05, "loss": 1.9922, "step": 12670 }, { "epoch": 0.3337108243350013, "grad_norm": 1.453342080116272, "learning_rate": 3.332631024493021e-05, "loss": 1.1381, "step": 12671 }, { "epoch": 0.333737160916513, "grad_norm": 4.903116703033447, "learning_rate": 3.3324993415854625e-05, "loss": 1.6986, "step": 12672 }, { "epoch": 0.33376349749802475, "grad_norm": 2.249630928039551, "learning_rate": 
3.332367658677904e-05, "loss": 1.9945, "step": 12673 }, { "epoch": 0.3337898340795365, "grad_norm": 3.2702884674072266, "learning_rate": 3.3322359757703456e-05, "loss": 2.5891, "step": 12674 }, { "epoch": 0.3338161706610482, "grad_norm": 2.378082752227783, "learning_rate": 3.3321042928627865e-05, "loss": 2.0394, "step": 12675 }, { "epoch": 0.3338425072425599, "grad_norm": 1.7774821519851685, "learning_rate": 3.331972609955228e-05, "loss": 0.6657, "step": 12676 }, { "epoch": 0.3338688438240716, "grad_norm": 1.9021217823028564, "learning_rate": 3.331840927047669e-05, "loss": 1.6685, "step": 12677 }, { "epoch": 0.33389518040558336, "grad_norm": 1.69480562210083, "learning_rate": 3.331709244140111e-05, "loss": 0.2684, "step": 12678 }, { "epoch": 0.3339215169870951, "grad_norm": 1.8669898509979248, "learning_rate": 3.331577561232552e-05, "loss": 0.7269, "step": 12679 }, { "epoch": 0.3339478535686068, "grad_norm": 1.9938433170318604, "learning_rate": 3.3314458783249936e-05, "loss": 1.1318, "step": 12680 }, { "epoch": 0.33397419015011853, "grad_norm": 1.5870530605316162, "learning_rate": 3.331314195417435e-05, "loss": 1.9258, "step": 12681 }, { "epoch": 0.3340005267316302, "grad_norm": 1.573981523513794, "learning_rate": 3.331182512509876e-05, "loss": 2.4933, "step": 12682 }, { "epoch": 0.33402686331314196, "grad_norm": 2.3521769046783447, "learning_rate": 3.331050829602318e-05, "loss": 0.5474, "step": 12683 }, { "epoch": 0.33405319989465365, "grad_norm": 3.06538462638855, "learning_rate": 3.330919146694759e-05, "loss": 2.3508, "step": 12684 }, { "epoch": 0.3340795364761654, "grad_norm": 2.0984601974487305, "learning_rate": 3.330787463787201e-05, "loss": 2.1414, "step": 12685 }, { "epoch": 0.33410587305767714, "grad_norm": 2.98837947845459, "learning_rate": 3.3306557808796416e-05, "loss": 1.2802, "step": 12686 }, { "epoch": 0.3341322096391888, "grad_norm": 2.247215747833252, "learning_rate": 3.330524097972084e-05, "loss": 1.7084, "step": 12687 }, { "epoch": 
0.33415854622070057, "grad_norm": 3.222853899002075, "learning_rate": 3.330392415064525e-05, "loss": 1.945, "step": 12688 }, { "epoch": 0.33418488280221226, "grad_norm": 1.94883394241333, "learning_rate": 3.330260732156966e-05, "loss": 1.546, "step": 12689 }, { "epoch": 0.334211219383724, "grad_norm": 3.241529941558838, "learning_rate": 3.330129049249408e-05, "loss": 1.9507, "step": 12690 }, { "epoch": 0.3342375559652357, "grad_norm": 2.0178792476654053, "learning_rate": 3.329997366341849e-05, "loss": 1.9945, "step": 12691 }, { "epoch": 0.33426389254674743, "grad_norm": 3.6998403072357178, "learning_rate": 3.329865683434291e-05, "loss": 1.37, "step": 12692 }, { "epoch": 0.3342902291282592, "grad_norm": 2.0556843280792236, "learning_rate": 3.329734000526732e-05, "loss": 2.6426, "step": 12693 }, { "epoch": 0.33431656570977086, "grad_norm": 1.4851425886154175, "learning_rate": 3.3296023176191734e-05, "loss": 1.4306, "step": 12694 }, { "epoch": 0.3343429022912826, "grad_norm": 1.6809499263763428, "learning_rate": 3.329470634711614e-05, "loss": 1.8683, "step": 12695 }, { "epoch": 0.3343692388727943, "grad_norm": 2.6594858169555664, "learning_rate": 3.329338951804056e-05, "loss": 0.3499, "step": 12696 }, { "epoch": 0.33439557545430604, "grad_norm": 2.2146317958831787, "learning_rate": 3.3292072688964974e-05, "loss": 2.2095, "step": 12697 }, { "epoch": 0.3344219120358177, "grad_norm": 2.6948370933532715, "learning_rate": 3.329075585988939e-05, "loss": 1.5676, "step": 12698 }, { "epoch": 0.33444824861732947, "grad_norm": 2.5437676906585693, "learning_rate": 3.32894390308138e-05, "loss": 2.3722, "step": 12699 }, { "epoch": 0.3344745851988412, "grad_norm": 1.865189790725708, "learning_rate": 3.3288122201738214e-05, "loss": 0.5159, "step": 12700 }, { "epoch": 0.3345009217803529, "grad_norm": 1.8187694549560547, "learning_rate": 3.328680537266263e-05, "loss": 1.9236, "step": 12701 }, { "epoch": 0.33452725836186464, "grad_norm": 1.5275969505310059, "learning_rate": 
3.3285488543587045e-05, "loss": 1.9205, "step": 12702 }, { "epoch": 0.33455359494337633, "grad_norm": 1.937986135482788, "learning_rate": 3.328417171451146e-05, "loss": 0.8848, "step": 12703 }, { "epoch": 0.3345799315248881, "grad_norm": 3.11193585395813, "learning_rate": 3.328285488543587e-05, "loss": 1.464, "step": 12704 }, { "epoch": 0.33460626810639976, "grad_norm": 2.118119716644287, "learning_rate": 3.3281538056360285e-05, "loss": 2.4367, "step": 12705 }, { "epoch": 0.3346326046879115, "grad_norm": 1.848755121231079, "learning_rate": 3.32802212272847e-05, "loss": 1.8896, "step": 12706 }, { "epoch": 0.33465894126942325, "grad_norm": 1.2457447052001953, "learning_rate": 3.3278904398209116e-05, "loss": 1.2816, "step": 12707 }, { "epoch": 0.33468527785093494, "grad_norm": 2.7649874687194824, "learning_rate": 3.3277587569133525e-05, "loss": 1.7536, "step": 12708 }, { "epoch": 0.3347116144324467, "grad_norm": 2.5353243350982666, "learning_rate": 3.327627074005794e-05, "loss": 0.3203, "step": 12709 }, { "epoch": 0.33473795101395837, "grad_norm": 1.5999507904052734, "learning_rate": 3.3274953910982356e-05, "loss": 1.6913, "step": 12710 }, { "epoch": 0.3347642875954701, "grad_norm": 2.764409303665161, "learning_rate": 3.327363708190677e-05, "loss": 1.5894, "step": 12711 }, { "epoch": 0.33479062417698185, "grad_norm": 2.7095680236816406, "learning_rate": 3.327232025283119e-05, "loss": 0.4185, "step": 12712 }, { "epoch": 0.33481696075849354, "grad_norm": 2.2557263374328613, "learning_rate": 3.3271003423755596e-05, "loss": 0.5209, "step": 12713 }, { "epoch": 0.3348432973400053, "grad_norm": 2.720989942550659, "learning_rate": 3.326968659468001e-05, "loss": 0.6882, "step": 12714 }, { "epoch": 0.334869633921517, "grad_norm": 1.8580718040466309, "learning_rate": 3.326836976560442e-05, "loss": 1.4242, "step": 12715 }, { "epoch": 0.3348959705030287, "grad_norm": 2.1879920959472656, "learning_rate": 3.326705293652884e-05, "loss": 0.566, "step": 12716 }, { "epoch": 
0.3349223070845404, "grad_norm": 1.8712615966796875, "learning_rate": 3.326573610745325e-05, "loss": 1.7497, "step": 12717 }, { "epoch": 0.33494864366605215, "grad_norm": 3.7341501712799072, "learning_rate": 3.326441927837767e-05, "loss": 0.9295, "step": 12718 }, { "epoch": 0.3349749802475639, "grad_norm": 2.814065933227539, "learning_rate": 3.326310244930208e-05, "loss": 1.5899, "step": 12719 }, { "epoch": 0.3350013168290756, "grad_norm": 1.93490469455719, "learning_rate": 3.32617856202265e-05, "loss": 2.1805, "step": 12720 }, { "epoch": 0.3350276534105873, "grad_norm": 2.3104476928710938, "learning_rate": 3.3260468791150914e-05, "loss": 1.876, "step": 12721 }, { "epoch": 0.335053989992099, "grad_norm": 2.130648136138916, "learning_rate": 3.325915196207532e-05, "loss": 1.8649, "step": 12722 }, { "epoch": 0.33508032657361075, "grad_norm": 1.7028783559799194, "learning_rate": 3.325783513299974e-05, "loss": 2.3934, "step": 12723 }, { "epoch": 0.33510666315512244, "grad_norm": 1.3325281143188477, "learning_rate": 3.325651830392415e-05, "loss": 0.2996, "step": 12724 }, { "epoch": 0.3351329997366342, "grad_norm": 2.4049007892608643, "learning_rate": 3.325520147484857e-05, "loss": 2.0552, "step": 12725 }, { "epoch": 0.33515933631814593, "grad_norm": 3.0834882259368896, "learning_rate": 3.325388464577298e-05, "loss": 2.4281, "step": 12726 }, { "epoch": 0.3351856728996576, "grad_norm": 1.9483907222747803, "learning_rate": 3.3252567816697394e-05, "loss": 1.7043, "step": 12727 }, { "epoch": 0.33521200948116936, "grad_norm": 1.6540095806121826, "learning_rate": 3.325125098762181e-05, "loss": 1.7751, "step": 12728 }, { "epoch": 0.33523834606268105, "grad_norm": 2.1088151931762695, "learning_rate": 3.324993415854622e-05, "loss": 1.3019, "step": 12729 }, { "epoch": 0.3352646826441928, "grad_norm": 3.1819138526916504, "learning_rate": 3.324861732947064e-05, "loss": 1.6342, "step": 12730 }, { "epoch": 0.3352910192257045, "grad_norm": 1.47464919090271, "learning_rate": 
3.324730050039505e-05, "loss": 0.8946, "step": 12731 }, { "epoch": 0.3353173558072162, "grad_norm": 2.3514928817749023, "learning_rate": 3.3245983671319466e-05, "loss": 1.2741, "step": 12732 }, { "epoch": 0.33534369238872797, "grad_norm": 1.9602787494659424, "learning_rate": 3.3244666842243874e-05, "loss": 1.7248, "step": 12733 }, { "epoch": 0.33537002897023965, "grad_norm": 3.0614047050476074, "learning_rate": 3.32433500131683e-05, "loss": 1.7016, "step": 12734 }, { "epoch": 0.3353963655517514, "grad_norm": 2.4778575897216797, "learning_rate": 3.3242033184092706e-05, "loss": 1.4643, "step": 12735 }, { "epoch": 0.3354227021332631, "grad_norm": 2.4843504428863525, "learning_rate": 3.324071635501712e-05, "loss": 1.8201, "step": 12736 }, { "epoch": 0.3354490387147748, "grad_norm": 3.3799288272857666, "learning_rate": 3.323939952594154e-05, "loss": 1.8345, "step": 12737 }, { "epoch": 0.3354753752962865, "grad_norm": 3.314253330230713, "learning_rate": 3.3238082696865946e-05, "loss": 0.7715, "step": 12738 }, { "epoch": 0.33550171187779826, "grad_norm": 2.94140887260437, "learning_rate": 3.323676586779037e-05, "loss": 1.6291, "step": 12739 }, { "epoch": 0.33552804845931, "grad_norm": 1.8520805835723877, "learning_rate": 3.323544903871478e-05, "loss": 1.6528, "step": 12740 }, { "epoch": 0.3355543850408217, "grad_norm": 2.0638961791992188, "learning_rate": 3.323413220963919e-05, "loss": 1.5233, "step": 12741 }, { "epoch": 0.33558072162233343, "grad_norm": 3.3796169757843018, "learning_rate": 3.32328153805636e-05, "loss": 1.8255, "step": 12742 }, { "epoch": 0.3356070582038451, "grad_norm": 2.9376111030578613, "learning_rate": 3.323149855148802e-05, "loss": 1.4259, "step": 12743 }, { "epoch": 0.33563339478535686, "grad_norm": 2.060811758041382, "learning_rate": 3.323018172241243e-05, "loss": 1.9522, "step": 12744 }, { "epoch": 0.3356597313668686, "grad_norm": 2.9156436920166016, "learning_rate": 3.322886489333685e-05, "loss": 0.3796, "step": 12745 }, { "epoch": 
0.3356860679483803, "grad_norm": 1.841123104095459, "learning_rate": 3.322754806426126e-05, "loss": 1.648, "step": 12746 }, { "epoch": 0.33571240452989204, "grad_norm": 3.8256890773773193, "learning_rate": 3.322623123518567e-05, "loss": 1.8899, "step": 12747 }, { "epoch": 0.3357387411114037, "grad_norm": 2.747293472290039, "learning_rate": 3.322491440611009e-05, "loss": 1.8156, "step": 12748 }, { "epoch": 0.33576507769291547, "grad_norm": 2.571321964263916, "learning_rate": 3.3223597577034504e-05, "loss": 2.0543, "step": 12749 }, { "epoch": 0.33579141427442716, "grad_norm": 3.315554618835449, "learning_rate": 3.322228074795892e-05, "loss": 0.8903, "step": 12750 }, { "epoch": 0.3358177508559389, "grad_norm": 1.8580601215362549, "learning_rate": 3.322096391888333e-05, "loss": 2.3449, "step": 12751 }, { "epoch": 0.33584408743745064, "grad_norm": 1.6815662384033203, "learning_rate": 3.3219647089807744e-05, "loss": 1.8054, "step": 12752 }, { "epoch": 0.33587042401896233, "grad_norm": 1.826073169708252, "learning_rate": 3.321833026073216e-05, "loss": 1.8914, "step": 12753 }, { "epoch": 0.3358967606004741, "grad_norm": 4.068779468536377, "learning_rate": 3.3217013431656575e-05, "loss": 1.1452, "step": 12754 }, { "epoch": 0.33592309718198576, "grad_norm": 2.070976495742798, "learning_rate": 3.3215696602580984e-05, "loss": 2.1186, "step": 12755 }, { "epoch": 0.3359494337634975, "grad_norm": 2.1486213207244873, "learning_rate": 3.32143797735054e-05, "loss": 2.2322, "step": 12756 }, { "epoch": 0.3359757703450092, "grad_norm": 1.9530872106552124, "learning_rate": 3.3213062944429815e-05, "loss": 1.9658, "step": 12757 }, { "epoch": 0.33600210692652094, "grad_norm": 2.8226709365844727, "learning_rate": 3.321174611535423e-05, "loss": 0.8045, "step": 12758 }, { "epoch": 0.3360284435080327, "grad_norm": 2.085674524307251, "learning_rate": 3.3210429286278646e-05, "loss": 1.0863, "step": 12759 }, { "epoch": 0.33605478008954437, "grad_norm": 2.4771549701690674, "learning_rate": 
3.3209112457203055e-05, "loss": 0.2493, "step": 12760 }, { "epoch": 0.3360811166710561, "grad_norm": 1.5374163389205933, "learning_rate": 3.320779562812747e-05, "loss": 1.4568, "step": 12761 }, { "epoch": 0.3361074532525678, "grad_norm": 3.3248648643493652, "learning_rate": 3.320647879905188e-05, "loss": 0.7358, "step": 12762 }, { "epoch": 0.33613378983407954, "grad_norm": 2.836918830871582, "learning_rate": 3.32051619699763e-05, "loss": 0.4637, "step": 12763 }, { "epoch": 0.33616012641559123, "grad_norm": 1.866237759590149, "learning_rate": 3.320384514090071e-05, "loss": 1.9141, "step": 12764 }, { "epoch": 0.336186462997103, "grad_norm": 1.6651567220687866, "learning_rate": 3.3202528311825126e-05, "loss": 2.2732, "step": 12765 }, { "epoch": 0.3362127995786147, "grad_norm": 2.9306986331939697, "learning_rate": 3.320121148274954e-05, "loss": 0.9799, "step": 12766 }, { "epoch": 0.3362391361601264, "grad_norm": 2.177807569503784, "learning_rate": 3.319989465367396e-05, "loss": 1.3731, "step": 12767 }, { "epoch": 0.33626547274163815, "grad_norm": 3.618196964263916, "learning_rate": 3.319857782459837e-05, "loss": 0.7953, "step": 12768 }, { "epoch": 0.33629180932314984, "grad_norm": 1.4932246208190918, "learning_rate": 3.319726099552278e-05, "loss": 1.4672, "step": 12769 }, { "epoch": 0.3363181459046616, "grad_norm": 4.007235527038574, "learning_rate": 3.31959441664472e-05, "loss": 1.9454, "step": 12770 }, { "epoch": 0.33634448248617327, "grad_norm": 1.438978910446167, "learning_rate": 3.3194627337371606e-05, "loss": 1.199, "step": 12771 }, { "epoch": 0.336370819067685, "grad_norm": 3.1330626010894775, "learning_rate": 3.319331050829603e-05, "loss": 0.8972, "step": 12772 }, { "epoch": 0.33639715564919676, "grad_norm": 2.868407964706421, "learning_rate": 3.319199367922044e-05, "loss": 1.3196, "step": 12773 }, { "epoch": 0.33642349223070844, "grad_norm": 1.7732014656066895, "learning_rate": 3.319067685014485e-05, "loss": 1.654, "step": 12774 }, { "epoch": 
0.3364498288122202, "grad_norm": 2.6505112648010254, "learning_rate": 3.318936002106927e-05, "loss": 1.5214, "step": 12775 }, { "epoch": 0.3364761653937319, "grad_norm": 2.1598734855651855, "learning_rate": 3.318804319199368e-05, "loss": 1.652, "step": 12776 }, { "epoch": 0.3365025019752436, "grad_norm": 1.454651117324829, "learning_rate": 3.31867263629181e-05, "loss": 2.1821, "step": 12777 }, { "epoch": 0.33652883855675536, "grad_norm": 1.8590773344039917, "learning_rate": 3.318540953384251e-05, "loss": 0.7731, "step": 12778 }, { "epoch": 0.33655517513826705, "grad_norm": 2.3035006523132324, "learning_rate": 3.3184092704766924e-05, "loss": 1.4454, "step": 12779 }, { "epoch": 0.3365815117197788, "grad_norm": 2.9017720222473145, "learning_rate": 3.318277587569133e-05, "loss": 1.3172, "step": 12780 }, { "epoch": 0.3366078483012905, "grad_norm": 2.5573019981384277, "learning_rate": 3.318145904661575e-05, "loss": 2.1342, "step": 12781 }, { "epoch": 0.3366341848828022, "grad_norm": 5.44240665435791, "learning_rate": 3.3180142217540164e-05, "loss": 1.5762, "step": 12782 }, { "epoch": 0.3366605214643139, "grad_norm": 2.197305679321289, "learning_rate": 3.317882538846458e-05, "loss": 1.4593, "step": 12783 }, { "epoch": 0.33668685804582565, "grad_norm": 1.8757997751235962, "learning_rate": 3.3177508559388995e-05, "loss": 2.5188, "step": 12784 }, { "epoch": 0.3367131946273374, "grad_norm": 2.2119293212890625, "learning_rate": 3.3176191730313404e-05, "loss": 1.7756, "step": 12785 }, { "epoch": 0.3367395312088491, "grad_norm": 2.237448215484619, "learning_rate": 3.317487490123783e-05, "loss": 2.3963, "step": 12786 }, { "epoch": 0.33676586779036083, "grad_norm": 1.9436910152435303, "learning_rate": 3.3173558072162235e-05, "loss": 1.5204, "step": 12787 }, { "epoch": 0.3367922043718725, "grad_norm": 2.5121445655822754, "learning_rate": 3.317224124308665e-05, "loss": 2.4555, "step": 12788 }, { "epoch": 0.33681854095338426, "grad_norm": 3.137997627258301, "learning_rate": 
3.317092441401106e-05, "loss": 1.0162, "step": 12789 }, { "epoch": 0.33684487753489595, "grad_norm": 2.0975558757781982, "learning_rate": 3.3169607584935475e-05, "loss": 1.5044, "step": 12790 }, { "epoch": 0.3368712141164077, "grad_norm": 3.707554817199707, "learning_rate": 3.316829075585989e-05, "loss": 1.8194, "step": 12791 }, { "epoch": 0.33689755069791943, "grad_norm": 2.3909502029418945, "learning_rate": 3.316697392678431e-05, "loss": 0.722, "step": 12792 }, { "epoch": 0.3369238872794311, "grad_norm": 3.9681878089904785, "learning_rate": 3.3165657097708715e-05, "loss": 1.4723, "step": 12793 }, { "epoch": 0.33695022386094287, "grad_norm": 3.120326519012451, "learning_rate": 3.316434026863313e-05, "loss": 1.7973, "step": 12794 }, { "epoch": 0.33697656044245455, "grad_norm": 2.3361856937408447, "learning_rate": 3.316302343955755e-05, "loss": 1.8561, "step": 12795 }, { "epoch": 0.3370028970239663, "grad_norm": 1.982590913772583, "learning_rate": 3.316170661048196e-05, "loss": 1.5747, "step": 12796 }, { "epoch": 0.337029233605478, "grad_norm": 2.360163927078247, "learning_rate": 3.316038978140638e-05, "loss": 1.8484, "step": 12797 }, { "epoch": 0.33705557018698973, "grad_norm": 1.897743582725525, "learning_rate": 3.315907295233079e-05, "loss": 1.6949, "step": 12798 }, { "epoch": 0.33708190676850147, "grad_norm": 2.4473464488983154, "learning_rate": 3.31577561232552e-05, "loss": 1.4171, "step": 12799 }, { "epoch": 0.33710824335001316, "grad_norm": 2.8679211139678955, "learning_rate": 3.315643929417962e-05, "loss": 0.4834, "step": 12800 }, { "epoch": 0.3371345799315249, "grad_norm": 3.1247572898864746, "learning_rate": 3.3155122465104033e-05, "loss": 0.6327, "step": 12801 }, { "epoch": 0.3371609165130366, "grad_norm": 1.7025243043899536, "learning_rate": 3.315380563602844e-05, "loss": 1.6993, "step": 12802 }, { "epoch": 0.33718725309454833, "grad_norm": 2.5768539905548096, "learning_rate": 3.315248880695286e-05, "loss": 1.9954, "step": 12803 }, { "epoch": 
0.33721358967606, "grad_norm": 1.8112659454345703, "learning_rate": 3.3151171977877274e-05, "loss": 0.6745, "step": 12804 }, { "epoch": 0.33723992625757176, "grad_norm": 2.1577470302581787, "learning_rate": 3.314985514880169e-05, "loss": 1.6175, "step": 12805 }, { "epoch": 0.3372662628390835, "grad_norm": 4.229220390319824, "learning_rate": 3.3148538319726105e-05, "loss": 0.9116, "step": 12806 }, { "epoch": 0.3372925994205952, "grad_norm": 1.7240034341812134, "learning_rate": 3.3147221490650514e-05, "loss": 1.8792, "step": 12807 }, { "epoch": 0.33731893600210694, "grad_norm": 2.0040340423583984, "learning_rate": 3.314590466157493e-05, "loss": 1.8932, "step": 12808 }, { "epoch": 0.3373452725836186, "grad_norm": 1.5207792520523071, "learning_rate": 3.314458783249934e-05, "loss": 1.4416, "step": 12809 }, { "epoch": 0.33737160916513037, "grad_norm": 2.0664150714874268, "learning_rate": 3.314327100342376e-05, "loss": 1.7887, "step": 12810 }, { "epoch": 0.33739794574664206, "grad_norm": 1.918363332748413, "learning_rate": 3.314195417434817e-05, "loss": 2.1563, "step": 12811 }, { "epoch": 0.3374242823281538, "grad_norm": 1.9444149732589722, "learning_rate": 3.3140637345272585e-05, "loss": 1.494, "step": 12812 }, { "epoch": 0.33745061890966555, "grad_norm": 2.98043155670166, "learning_rate": 3.3139320516197e-05, "loss": 2.9948, "step": 12813 }, { "epoch": 0.33747695549117723, "grad_norm": 4.101751804351807, "learning_rate": 3.313800368712141e-05, "loss": 2.0274, "step": 12814 }, { "epoch": 0.337503292072689, "grad_norm": 3.0297694206237793, "learning_rate": 3.313668685804583e-05, "loss": 0.8011, "step": 12815 }, { "epoch": 0.33752962865420066, "grad_norm": 1.9750598669052124, "learning_rate": 3.313537002897024e-05, "loss": 1.5386, "step": 12816 }, { "epoch": 0.3375559652357124, "grad_norm": 1.8156180381774902, "learning_rate": 3.3134053199894656e-05, "loss": 1.6884, "step": 12817 }, { "epoch": 0.33758230181722415, "grad_norm": 1.7295427322387695, "learning_rate": 
3.3132736370819065e-05, "loss": 1.3958, "step": 12818 }, { "epoch": 0.33760863839873584, "grad_norm": 3.0939669609069824, "learning_rate": 3.313141954174349e-05, "loss": 1.1396, "step": 12819 }, { "epoch": 0.3376349749802476, "grad_norm": 2.344482421875, "learning_rate": 3.3130102712667896e-05, "loss": 2.5968, "step": 12820 }, { "epoch": 0.33766131156175927, "grad_norm": 1.9602240324020386, "learning_rate": 3.312878588359231e-05, "loss": 1.378, "step": 12821 }, { "epoch": 0.337687648143271, "grad_norm": 2.8866045475006104, "learning_rate": 3.312746905451673e-05, "loss": 0.6235, "step": 12822 }, { "epoch": 0.3377139847247827, "grad_norm": 2.1267950534820557, "learning_rate": 3.3126152225441136e-05, "loss": 1.7626, "step": 12823 }, { "epoch": 0.33774032130629444, "grad_norm": 2.3368029594421387, "learning_rate": 3.312483539636556e-05, "loss": 1.2872, "step": 12824 }, { "epoch": 0.3377666578878062, "grad_norm": 2.0095088481903076, "learning_rate": 3.312351856728997e-05, "loss": 1.5743, "step": 12825 }, { "epoch": 0.3377929944693179, "grad_norm": 2.897247552871704, "learning_rate": 3.312220173821438e-05, "loss": 0.8913, "step": 12826 }, { "epoch": 0.3378193310508296, "grad_norm": 1.7282084226608276, "learning_rate": 3.312088490913879e-05, "loss": 1.7159, "step": 12827 }, { "epoch": 0.3378456676323413, "grad_norm": 2.563190460205078, "learning_rate": 3.311956808006321e-05, "loss": 1.0071, "step": 12828 }, { "epoch": 0.33787200421385305, "grad_norm": 3.6421115398406982, "learning_rate": 3.311825125098762e-05, "loss": 0.8396, "step": 12829 }, { "epoch": 0.33789834079536474, "grad_norm": 2.631117820739746, "learning_rate": 3.311693442191204e-05, "loss": 1.3745, "step": 12830 }, { "epoch": 0.3379246773768765, "grad_norm": 2.5396296977996826, "learning_rate": 3.3115617592836454e-05, "loss": 2.5701, "step": 12831 }, { "epoch": 0.3379510139583882, "grad_norm": 1.7202781438827515, "learning_rate": 3.311430076376086e-05, "loss": 0.6105, "step": 12832 }, { "epoch": 
0.3379773505398999, "grad_norm": 3.588066816329956, "learning_rate": 3.3112983934685285e-05, "loss": 2.2954, "step": 12833 }, { "epoch": 0.33800368712141166, "grad_norm": 3.7804176807403564, "learning_rate": 3.3111667105609694e-05, "loss": 1.9091, "step": 12834 }, { "epoch": 0.33803002370292334, "grad_norm": 2.6650660037994385, "learning_rate": 3.311035027653411e-05, "loss": 2.4609, "step": 12835 }, { "epoch": 0.3380563602844351, "grad_norm": 1.6852717399597168, "learning_rate": 3.310903344745852e-05, "loss": 1.9774, "step": 12836 }, { "epoch": 0.3380826968659468, "grad_norm": 4.781483173370361, "learning_rate": 3.3107716618382934e-05, "loss": 3.3361, "step": 12837 }, { "epoch": 0.3381090334474585, "grad_norm": 2.265838146209717, "learning_rate": 3.310639978930735e-05, "loss": 1.8377, "step": 12838 }, { "epoch": 0.33813537002897026, "grad_norm": 3.260784864425659, "learning_rate": 3.3105082960231765e-05, "loss": 1.0938, "step": 12839 }, { "epoch": 0.33816170661048195, "grad_norm": 2.5565221309661865, "learning_rate": 3.310376613115618e-05, "loss": 1.9097, "step": 12840 }, { "epoch": 0.3381880431919937, "grad_norm": 1.9164501428604126, "learning_rate": 3.310244930208059e-05, "loss": 0.5112, "step": 12841 }, { "epoch": 0.3382143797735054, "grad_norm": 2.4099740982055664, "learning_rate": 3.3101132473005005e-05, "loss": 2.0523, "step": 12842 }, { "epoch": 0.3382407163550171, "grad_norm": 2.934231758117676, "learning_rate": 3.309981564392942e-05, "loss": 0.776, "step": 12843 }, { "epoch": 0.3382670529365288, "grad_norm": 2.5678024291992188, "learning_rate": 3.3098498814853836e-05, "loss": 2.0246, "step": 12844 }, { "epoch": 0.33829338951804055, "grad_norm": 2.3322393894195557, "learning_rate": 3.3097181985778245e-05, "loss": 1.0182, "step": 12845 }, { "epoch": 0.3383197260995523, "grad_norm": 4.788005352020264, "learning_rate": 3.309586515670266e-05, "loss": 1.0697, "step": 12846 }, { "epoch": 0.338346062681064, "grad_norm": 1.8777328729629517, "learning_rate": 
3.309454832762707e-05, "loss": 1.9729, "step": 12847 }, { "epoch": 0.33837239926257573, "grad_norm": 2.1383659839630127, "learning_rate": 3.309323149855149e-05, "loss": 0.5294, "step": 12848 }, { "epoch": 0.3383987358440874, "grad_norm": 2.4611804485321045, "learning_rate": 3.30919146694759e-05, "loss": 2.306, "step": 12849 }, { "epoch": 0.33842507242559916, "grad_norm": 2.3517186641693115, "learning_rate": 3.3090597840400316e-05, "loss": 1.616, "step": 12850 }, { "epoch": 0.3384514090071109, "grad_norm": 1.985917568206787, "learning_rate": 3.308928101132473e-05, "loss": 2.3605, "step": 12851 }, { "epoch": 0.3384777455886226, "grad_norm": 1.65116286277771, "learning_rate": 3.308796418224915e-05, "loss": 2.601, "step": 12852 }, { "epoch": 0.33850408217013433, "grad_norm": 2.4410359859466553, "learning_rate": 3.308664735317356e-05, "loss": 2.4911, "step": 12853 }, { "epoch": 0.338530418751646, "grad_norm": 2.6291885375976562, "learning_rate": 3.308533052409797e-05, "loss": 1.7102, "step": 12854 }, { "epoch": 0.33855675533315777, "grad_norm": 2.732045888900757, "learning_rate": 3.308401369502239e-05, "loss": 0.7609, "step": 12855 }, { "epoch": 0.33858309191466945, "grad_norm": 3.9982266426086426, "learning_rate": 3.3082696865946797e-05, "loss": 0.9718, "step": 12856 }, { "epoch": 0.3386094284961812, "grad_norm": 1.8279403448104858, "learning_rate": 3.308138003687122e-05, "loss": 1.5876, "step": 12857 }, { "epoch": 0.33863576507769294, "grad_norm": 4.195092678070068, "learning_rate": 3.308006320779563e-05, "loss": 1.9992, "step": 12858 }, { "epoch": 0.33866210165920463, "grad_norm": 2.385598659515381, "learning_rate": 3.307874637872004e-05, "loss": 0.604, "step": 12859 }, { "epoch": 0.33868843824071637, "grad_norm": 2.4315998554229736, "learning_rate": 3.307742954964446e-05, "loss": 1.0191, "step": 12860 }, { "epoch": 0.33871477482222806, "grad_norm": 1.9641562700271606, "learning_rate": 3.307611272056887e-05, "loss": 1.3106, "step": 12861 }, { "epoch": 
0.3387411114037398, "grad_norm": 2.0438058376312256, "learning_rate": 3.307479589149329e-05, "loss": 1.4516, "step": 12862 }, { "epoch": 0.3387674479852515, "grad_norm": 2.5008726119995117, "learning_rate": 3.30734790624177e-05, "loss": 0.714, "step": 12863 }, { "epoch": 0.33879378456676323, "grad_norm": 1.8285330533981323, "learning_rate": 3.3072162233342115e-05, "loss": 1.5515, "step": 12864 }, { "epoch": 0.338820121148275, "grad_norm": 3.1687991619110107, "learning_rate": 3.307084540426652e-05, "loss": 1.5192, "step": 12865 }, { "epoch": 0.33884645772978667, "grad_norm": 4.436827182769775, "learning_rate": 3.3069528575190946e-05, "loss": 1.5977, "step": 12866 }, { "epoch": 0.3388727943112984, "grad_norm": 2.423713445663452, "learning_rate": 3.3068211746115355e-05, "loss": 1.837, "step": 12867 }, { "epoch": 0.3388991308928101, "grad_norm": 2.350836992263794, "learning_rate": 3.306689491703977e-05, "loss": 2.1863, "step": 12868 }, { "epoch": 0.33892546747432184, "grad_norm": 1.7617946863174438, "learning_rate": 3.3065578087964186e-05, "loss": 1.1963, "step": 12869 }, { "epoch": 0.3389518040558335, "grad_norm": 1.6627624034881592, "learning_rate": 3.3064261258888595e-05, "loss": 1.8442, "step": 12870 }, { "epoch": 0.33897814063734527, "grad_norm": 3.7690541744232178, "learning_rate": 3.306294442981302e-05, "loss": 1.418, "step": 12871 }, { "epoch": 0.339004477218857, "grad_norm": 1.5701606273651123, "learning_rate": 3.3061627600737426e-05, "loss": 1.0721, "step": 12872 }, { "epoch": 0.3390308138003687, "grad_norm": 1.9976402521133423, "learning_rate": 3.306031077166184e-05, "loss": 0.2003, "step": 12873 }, { "epoch": 0.33905715038188045, "grad_norm": 2.4352684020996094, "learning_rate": 3.305899394258625e-05, "loss": 2.0669, "step": 12874 }, { "epoch": 0.33908348696339213, "grad_norm": 1.9757521152496338, "learning_rate": 3.3057677113510666e-05, "loss": 1.5744, "step": 12875 }, { "epoch": 0.3391098235449039, "grad_norm": 1.7175980806350708, "learning_rate": 
3.305636028443508e-05, "loss": 2.5, "step": 12876 }, { "epoch": 0.33913616012641556, "grad_norm": 1.584818959236145, "learning_rate": 3.30550434553595e-05, "loss": 0.3951, "step": 12877 }, { "epoch": 0.3391624967079273, "grad_norm": 2.2938072681427, "learning_rate": 3.305372662628391e-05, "loss": 2.0871, "step": 12878 }, { "epoch": 0.33918883328943905, "grad_norm": 1.6901754140853882, "learning_rate": 3.305240979720832e-05, "loss": 1.5823, "step": 12879 }, { "epoch": 0.33921516987095074, "grad_norm": 1.4461736679077148, "learning_rate": 3.305109296813274e-05, "loss": 1.4325, "step": 12880 }, { "epoch": 0.3392415064524625, "grad_norm": 3.1961100101470947, "learning_rate": 3.304977613905715e-05, "loss": 1.4385, "step": 12881 }, { "epoch": 0.33926784303397417, "grad_norm": 1.6787663698196411, "learning_rate": 3.304845930998157e-05, "loss": 1.052, "step": 12882 }, { "epoch": 0.3392941796154859, "grad_norm": 3.1201679706573486, "learning_rate": 3.304714248090598e-05, "loss": 2.0348, "step": 12883 }, { "epoch": 0.33932051619699766, "grad_norm": 1.6835204362869263, "learning_rate": 3.304582565183039e-05, "loss": 1.5206, "step": 12884 }, { "epoch": 0.33934685277850934, "grad_norm": 2.4557645320892334, "learning_rate": 3.304450882275481e-05, "loss": 0.5015, "step": 12885 }, { "epoch": 0.3393731893600211, "grad_norm": 4.528027534484863, "learning_rate": 3.3043191993679224e-05, "loss": 1.3313, "step": 12886 }, { "epoch": 0.3393995259415328, "grad_norm": 2.5903267860412598, "learning_rate": 3.304187516460364e-05, "loss": 1.3941, "step": 12887 }, { "epoch": 0.3394258625230445, "grad_norm": 3.5780093669891357, "learning_rate": 3.304055833552805e-05, "loss": 0.9176, "step": 12888 }, { "epoch": 0.3394521991045562, "grad_norm": 1.750388741493225, "learning_rate": 3.3039241506452464e-05, "loss": 2.0672, "step": 12889 }, { "epoch": 0.33947853568606795, "grad_norm": 2.0618300437927246, "learning_rate": 3.303792467737688e-05, "loss": 2.1176, "step": 12890 }, { "epoch": 
0.3395048722675797, "grad_norm": 5.245141983032227, "learning_rate": 3.3036607848301295e-05, "loss": 0.8398, "step": 12891 }, { "epoch": 0.3395312088490914, "grad_norm": 4.576908588409424, "learning_rate": 3.3035291019225704e-05, "loss": 1.6884, "step": 12892 }, { "epoch": 0.3395575454306031, "grad_norm": 2.666538953781128, "learning_rate": 3.303397419015012e-05, "loss": 0.3919, "step": 12893 }, { "epoch": 0.3395838820121148, "grad_norm": 2.1481711864471436, "learning_rate": 3.303265736107453e-05, "loss": 2.3544, "step": 12894 }, { "epoch": 0.33961021859362656, "grad_norm": 1.6148473024368286, "learning_rate": 3.303134053199895e-05, "loss": 2.0484, "step": 12895 }, { "epoch": 0.33963655517513824, "grad_norm": 4.5628862380981445, "learning_rate": 3.303002370292336e-05, "loss": 1.7455, "step": 12896 }, { "epoch": 0.33966289175665, "grad_norm": 4.540353775024414, "learning_rate": 3.3028706873847775e-05, "loss": 0.854, "step": 12897 }, { "epoch": 0.33968922833816173, "grad_norm": 2.170725107192993, "learning_rate": 3.302739004477219e-05, "loss": 1.9693, "step": 12898 }, { "epoch": 0.3397155649196734, "grad_norm": 1.7289457321166992, "learning_rate": 3.3026073215696606e-05, "loss": 1.6487, "step": 12899 }, { "epoch": 0.33974190150118516, "grad_norm": 3.3022546768188477, "learning_rate": 3.302475638662102e-05, "loss": 0.3773, "step": 12900 }, { "epoch": 0.33976823808269685, "grad_norm": 1.7960892915725708, "learning_rate": 3.302343955754543e-05, "loss": 1.6454, "step": 12901 }, { "epoch": 0.3397945746642086, "grad_norm": 2.1280105113983154, "learning_rate": 3.3022122728469846e-05, "loss": 0.6364, "step": 12902 }, { "epoch": 0.3398209112457203, "grad_norm": 5.191806316375732, "learning_rate": 3.3020805899394255e-05, "loss": 2.316, "step": 12903 }, { "epoch": 0.339847247827232, "grad_norm": 5.268148422241211, "learning_rate": 3.301948907031868e-05, "loss": 2.0061, "step": 12904 }, { "epoch": 0.33987358440874377, "grad_norm": 3.1319453716278076, "learning_rate": 
3.3018172241243086e-05, "loss": 1.4343, "step": 12905 }, { "epoch": 0.33989992099025546, "grad_norm": 2.618544578552246, "learning_rate": 3.30168554121675e-05, "loss": 2.0187, "step": 12906 }, { "epoch": 0.3399262575717672, "grad_norm": 3.03944730758667, "learning_rate": 3.301553858309192e-05, "loss": 1.7853, "step": 12907 }, { "epoch": 0.3399525941532789, "grad_norm": 1.9733080863952637, "learning_rate": 3.3014221754016326e-05, "loss": 1.9372, "step": 12908 }, { "epoch": 0.33997893073479063, "grad_norm": 1.8079891204833984, "learning_rate": 3.301290492494075e-05, "loss": 2.2751, "step": 12909 }, { "epoch": 0.3400052673163023, "grad_norm": 2.9763898849487305, "learning_rate": 3.301158809586516e-05, "loss": 2.1679, "step": 12910 }, { "epoch": 0.34003160389781406, "grad_norm": 3.666808605194092, "learning_rate": 3.301027126678957e-05, "loss": 1.2256, "step": 12911 }, { "epoch": 0.3400579404793258, "grad_norm": 4.210350513458252, "learning_rate": 3.300895443771398e-05, "loss": 1.3845, "step": 12912 }, { "epoch": 0.3400842770608375, "grad_norm": 5.638673305511475, "learning_rate": 3.30076376086384e-05, "loss": 2.2161, "step": 12913 }, { "epoch": 0.34011061364234924, "grad_norm": 1.4734793901443481, "learning_rate": 3.300632077956281e-05, "loss": 1.9166, "step": 12914 }, { "epoch": 0.3401369502238609, "grad_norm": 3.058187246322632, "learning_rate": 3.300500395048723e-05, "loss": 1.6315, "step": 12915 }, { "epoch": 0.34016328680537267, "grad_norm": 2.6017966270446777, "learning_rate": 3.3003687121411644e-05, "loss": 1.0321, "step": 12916 }, { "epoch": 0.3401896233868844, "grad_norm": 2.0909831523895264, "learning_rate": 3.300237029233605e-05, "loss": 0.3767, "step": 12917 }, { "epoch": 0.3402159599683961, "grad_norm": 3.3884739875793457, "learning_rate": 3.3001053463260475e-05, "loss": 0.8194, "step": 12918 }, { "epoch": 0.34024229654990784, "grad_norm": 3.3136556148529053, "learning_rate": 3.2999736634184884e-05, "loss": 1.5631, "step": 12919 }, { "epoch": 
0.34026863313141953, "grad_norm": 1.5209213495254517, "learning_rate": 3.29984198051093e-05, "loss": 1.633, "step": 12920 }, { "epoch": 0.3402949697129313, "grad_norm": 1.856662392616272, "learning_rate": 3.299710297603371e-05, "loss": 2.6551, "step": 12921 }, { "epoch": 0.34032130629444296, "grad_norm": 3.5940425395965576, "learning_rate": 3.2995786146958124e-05, "loss": 1.5755, "step": 12922 }, { "epoch": 0.3403476428759547, "grad_norm": 1.6247870922088623, "learning_rate": 3.299446931788254e-05, "loss": 1.5426, "step": 12923 }, { "epoch": 0.34037397945746645, "grad_norm": 2.991795539855957, "learning_rate": 3.2993152488806956e-05, "loss": 1.6865, "step": 12924 }, { "epoch": 0.34040031603897813, "grad_norm": 2.3574752807617188, "learning_rate": 3.299183565973137e-05, "loss": 1.4322, "step": 12925 }, { "epoch": 0.3404266526204899, "grad_norm": 2.4562690258026123, "learning_rate": 3.299051883065578e-05, "loss": 2.1818, "step": 12926 }, { "epoch": 0.34045298920200157, "grad_norm": 1.8044371604919434, "learning_rate": 3.2989202001580196e-05, "loss": 1.235, "step": 12927 }, { "epoch": 0.3404793257835133, "grad_norm": 1.6219490766525269, "learning_rate": 3.298788517250461e-05, "loss": 1.9105, "step": 12928 }, { "epoch": 0.340505662365025, "grad_norm": 1.761940360069275, "learning_rate": 3.298656834342903e-05, "loss": 1.2498, "step": 12929 }, { "epoch": 0.34053199894653674, "grad_norm": 1.5885579586029053, "learning_rate": 3.2985251514353436e-05, "loss": 1.6395, "step": 12930 }, { "epoch": 0.3405583355280485, "grad_norm": 2.0496907234191895, "learning_rate": 3.298393468527785e-05, "loss": 2.1072, "step": 12931 }, { "epoch": 0.34058467210956017, "grad_norm": 2.022467851638794, "learning_rate": 3.298261785620227e-05, "loss": 1.5395, "step": 12932 }, { "epoch": 0.3406110086910719, "grad_norm": 3.4611968994140625, "learning_rate": 3.298130102712668e-05, "loss": 1.7434, "step": 12933 }, { "epoch": 0.3406373452725836, "grad_norm": 2.2028777599334717, "learning_rate": 
3.29799841980511e-05, "loss": 2.047, "step": 12934 }, { "epoch": 0.34066368185409535, "grad_norm": 2.633039712905884, "learning_rate": 3.297866736897551e-05, "loss": 0.4042, "step": 12935 }, { "epoch": 0.34069001843560703, "grad_norm": 2.170102596282959, "learning_rate": 3.297735053989992e-05, "loss": 1.5421, "step": 12936 }, { "epoch": 0.3407163550171188, "grad_norm": 1.722725749015808, "learning_rate": 3.297603371082434e-05, "loss": 1.4871, "step": 12937 }, { "epoch": 0.3407426915986305, "grad_norm": 2.009075880050659, "learning_rate": 3.2974716881748754e-05, "loss": 1.5152, "step": 12938 }, { "epoch": 0.3407690281801422, "grad_norm": 2.443164825439453, "learning_rate": 3.297340005267316e-05, "loss": 2.1061, "step": 12939 }, { "epoch": 0.34079536476165395, "grad_norm": 1.4557195901870728, "learning_rate": 3.297208322359758e-05, "loss": 1.889, "step": 12940 }, { "epoch": 0.34082170134316564, "grad_norm": 2.3596839904785156, "learning_rate": 3.2970766394521994e-05, "loss": 2.5858, "step": 12941 }, { "epoch": 0.3408480379246774, "grad_norm": 2.858344793319702, "learning_rate": 3.296944956544641e-05, "loss": 1.8894, "step": 12942 }, { "epoch": 0.34087437450618907, "grad_norm": 2.448979377746582, "learning_rate": 3.2968132736370825e-05, "loss": 1.77, "step": 12943 }, { "epoch": 0.3409007110877008, "grad_norm": 2.9496917724609375, "learning_rate": 3.2966815907295234e-05, "loss": 2.45, "step": 12944 }, { "epoch": 0.34092704766921256, "grad_norm": 1.8532429933547974, "learning_rate": 3.296549907821965e-05, "loss": 1.4375, "step": 12945 }, { "epoch": 0.34095338425072425, "grad_norm": 2.1792612075805664, "learning_rate": 3.296418224914406e-05, "loss": 2.1459, "step": 12946 }, { "epoch": 0.340979720832236, "grad_norm": 2.0391368865966797, "learning_rate": 3.296286542006848e-05, "loss": 2.8481, "step": 12947 }, { "epoch": 0.3410060574137477, "grad_norm": 2.3514368534088135, "learning_rate": 3.296154859099289e-05, "loss": 2.0533, "step": 12948 }, { "epoch": 
0.3410323939952594, "grad_norm": 4.717340469360352, "learning_rate": 3.2960231761917305e-05, "loss": 2.1282, "step": 12949 }, { "epoch": 0.3410587305767711, "grad_norm": 1.5506004095077515, "learning_rate": 3.2958914932841714e-05, "loss": 0.3485, "step": 12950 }, { "epoch": 0.34108506715828285, "grad_norm": 5.972497463226318, "learning_rate": 3.2957598103766136e-05, "loss": 1.8649, "step": 12951 }, { "epoch": 0.3411114037397946, "grad_norm": 1.9077333211898804, "learning_rate": 3.2956281274690545e-05, "loss": 1.9687, "step": 12952 }, { "epoch": 0.3411377403213063, "grad_norm": 2.513373851776123, "learning_rate": 3.295496444561496e-05, "loss": 0.8289, "step": 12953 }, { "epoch": 0.341164076902818, "grad_norm": 2.042729616165161, "learning_rate": 3.2953647616539376e-05, "loss": 1.48, "step": 12954 }, { "epoch": 0.3411904134843297, "grad_norm": 1.7733979225158691, "learning_rate": 3.2952330787463785e-05, "loss": 1.4375, "step": 12955 }, { "epoch": 0.34121675006584146, "grad_norm": 2.3418757915496826, "learning_rate": 3.295101395838821e-05, "loss": 1.6959, "step": 12956 }, { "epoch": 0.3412430866473532, "grad_norm": 2.2886176109313965, "learning_rate": 3.2949697129312616e-05, "loss": 1.5131, "step": 12957 }, { "epoch": 0.3412694232288649, "grad_norm": 2.2938649654388428, "learning_rate": 3.294838030023703e-05, "loss": 1.4211, "step": 12958 }, { "epoch": 0.34129575981037663, "grad_norm": 1.7915611267089844, "learning_rate": 3.294706347116144e-05, "loss": 1.9845, "step": 12959 }, { "epoch": 0.3413220963918883, "grad_norm": 1.5601946115493774, "learning_rate": 3.2945746642085856e-05, "loss": 2.1526, "step": 12960 }, { "epoch": 0.34134843297340006, "grad_norm": 4.8775715827941895, "learning_rate": 3.294442981301027e-05, "loss": 1.3841, "step": 12961 }, { "epoch": 0.34137476955491175, "grad_norm": 4.573395729064941, "learning_rate": 3.294311298393469e-05, "loss": 1.3154, "step": 12962 }, { "epoch": 0.3414011061364235, "grad_norm": 1.9860544204711914, "learning_rate": 
3.29417961548591e-05, "loss": 1.0699, "step": 12963 }, { "epoch": 0.34142744271793524, "grad_norm": 2.873884677886963, "learning_rate": 3.294047932578351e-05, "loss": 1.3907, "step": 12964 }, { "epoch": 0.3414537792994469, "grad_norm": 3.7345573902130127, "learning_rate": 3.2939162496707934e-05, "loss": 2.4449, "step": 12965 }, { "epoch": 0.34148011588095867, "grad_norm": 4.145462989807129, "learning_rate": 3.293784566763234e-05, "loss": 1.8259, "step": 12966 }, { "epoch": 0.34150645246247036, "grad_norm": 1.9677534103393555, "learning_rate": 3.293652883855676e-05, "loss": 1.6432, "step": 12967 }, { "epoch": 0.3415327890439821, "grad_norm": 1.9247239828109741, "learning_rate": 3.293521200948117e-05, "loss": 2.0941, "step": 12968 }, { "epoch": 0.3415591256254938, "grad_norm": 2.2761850357055664, "learning_rate": 3.293389518040558e-05, "loss": 2.1237, "step": 12969 }, { "epoch": 0.34158546220700553, "grad_norm": 4.019107341766357, "learning_rate": 3.293257835133e-05, "loss": 1.6666, "step": 12970 }, { "epoch": 0.3416117987885173, "grad_norm": 3.608370304107666, "learning_rate": 3.2931261522254414e-05, "loss": 2.2006, "step": 12971 }, { "epoch": 0.34163813537002896, "grad_norm": 2.49223256111145, "learning_rate": 3.292994469317883e-05, "loss": 2.1709, "step": 12972 }, { "epoch": 0.3416644719515407, "grad_norm": 2.9999496936798096, "learning_rate": 3.292862786410324e-05, "loss": 1.9288, "step": 12973 }, { "epoch": 0.3416908085330524, "grad_norm": 2.16973876953125, "learning_rate": 3.2927311035027654e-05, "loss": 1.7177, "step": 12974 }, { "epoch": 0.34171714511456414, "grad_norm": 1.6832624673843384, "learning_rate": 3.292599420595207e-05, "loss": 1.9217, "step": 12975 }, { "epoch": 0.3417434816960758, "grad_norm": 2.6180667877197266, "learning_rate": 3.2924677376876485e-05, "loss": 1.8813, "step": 12976 }, { "epoch": 0.34176981827758757, "grad_norm": 3.1550745964050293, "learning_rate": 3.2923360547800894e-05, "loss": 1.1645, "step": 12977 }, { "epoch": 
0.3417961548590993, "grad_norm": 1.835385799407959, "learning_rate": 3.292204371872531e-05, "loss": 1.2515, "step": 12978 }, { "epoch": 0.341822491440611, "grad_norm": 3.1698458194732666, "learning_rate": 3.2920726889649725e-05, "loss": 1.7964, "step": 12979 }, { "epoch": 0.34184882802212274, "grad_norm": 1.6796510219573975, "learning_rate": 3.291941006057414e-05, "loss": 1.0998, "step": 12980 }, { "epoch": 0.34187516460363443, "grad_norm": 3.3053088188171387, "learning_rate": 3.2918093231498556e-05, "loss": 0.6363, "step": 12981 }, { "epoch": 0.3419015011851462, "grad_norm": 3.5661163330078125, "learning_rate": 3.2916776402422965e-05, "loss": 1.1938, "step": 12982 }, { "epoch": 0.34192783776665786, "grad_norm": 1.6379643678665161, "learning_rate": 3.291545957334738e-05, "loss": 1.4223, "step": 12983 }, { "epoch": 0.3419541743481696, "grad_norm": 1.9944275617599487, "learning_rate": 3.2914142744271797e-05, "loss": 1.5809, "step": 12984 }, { "epoch": 0.34198051092968135, "grad_norm": 4.442276477813721, "learning_rate": 3.291282591519621e-05, "loss": 1.8037, "step": 12985 }, { "epoch": 0.34200684751119304, "grad_norm": 1.8749949932098389, "learning_rate": 3.291150908612062e-05, "loss": 1.6713, "step": 12986 }, { "epoch": 0.3420331840927048, "grad_norm": 4.132696628570557, "learning_rate": 3.2910192257045037e-05, "loss": 1.8504, "step": 12987 }, { "epoch": 0.34205952067421647, "grad_norm": 4.653345108032227, "learning_rate": 3.290887542796945e-05, "loss": 1.1259, "step": 12988 }, { "epoch": 0.3420858572557282, "grad_norm": 2.014456272125244, "learning_rate": 3.290755859889387e-05, "loss": 1.4112, "step": 12989 }, { "epoch": 0.34211219383723995, "grad_norm": 2.1527702808380127, "learning_rate": 3.290624176981828e-05, "loss": 1.6946, "step": 12990 }, { "epoch": 0.34213853041875164, "grad_norm": 2.077350616455078, "learning_rate": 3.290492494074269e-05, "loss": 1.5587, "step": 12991 }, { "epoch": 0.3421648670002634, "grad_norm": 3.211123466491699, "learning_rate": 
3.290360811166711e-05, "loss": 2.392, "step": 12992 }, { "epoch": 0.34219120358177507, "grad_norm": 1.8776613473892212, "learning_rate": 3.2902291282591517e-05, "loss": 1.6357, "step": 12993 }, { "epoch": 0.3422175401632868, "grad_norm": 1.6870720386505127, "learning_rate": 3.290097445351594e-05, "loss": 0.5568, "step": 12994 }, { "epoch": 0.3422438767447985, "grad_norm": 2.017930030822754, "learning_rate": 3.289965762444035e-05, "loss": 1.4112, "step": 12995 }, { "epoch": 0.34227021332631025, "grad_norm": 4.67117977142334, "learning_rate": 3.289834079536476e-05, "loss": 1.3809, "step": 12996 }, { "epoch": 0.342296549907822, "grad_norm": 3.131514549255371, "learning_rate": 3.289702396628917e-05, "loss": 2.1468, "step": 12997 }, { "epoch": 0.3423228864893337, "grad_norm": 1.802449107170105, "learning_rate": 3.2895707137213595e-05, "loss": 2.4584, "step": 12998 }, { "epoch": 0.3423492230708454, "grad_norm": 2.2986903190612793, "learning_rate": 3.2894390308138e-05, "loss": 2.6267, "step": 12999 }, { "epoch": 0.3423755596523571, "grad_norm": 1.7511192560195923, "learning_rate": 3.289307347906242e-05, "loss": 2.051, "step": 13000 }, { "epoch": 0.34240189623386885, "grad_norm": 1.713340401649475, "learning_rate": 3.2891756649986835e-05, "loss": 2.172, "step": 13001 }, { "epoch": 0.34242823281538054, "grad_norm": 3.6396396160125732, "learning_rate": 3.289043982091124e-05, "loss": 1.2792, "step": 13002 }, { "epoch": 0.3424545693968923, "grad_norm": 1.5632052421569824, "learning_rate": 3.2889122991835666e-05, "loss": 1.8273, "step": 13003 }, { "epoch": 0.342480905978404, "grad_norm": 1.641810417175293, "learning_rate": 3.2887806162760075e-05, "loss": 1.4613, "step": 13004 }, { "epoch": 0.3425072425599157, "grad_norm": 1.4302617311477661, "learning_rate": 3.288648933368449e-05, "loss": 1.393, "step": 13005 }, { "epoch": 0.34253357914142746, "grad_norm": 1.902970314025879, "learning_rate": 3.28851725046089e-05, "loss": 1.6965, "step": 13006 }, { "epoch": 0.34255991572293915, 
"grad_norm": 1.6893277168273926, "learning_rate": 3.2883855675533315e-05, "loss": 2.3678, "step": 13007 }, { "epoch": 0.3425862523044509, "grad_norm": 1.9441275596618652, "learning_rate": 3.288253884645773e-05, "loss": 0.4138, "step": 13008 }, { "epoch": 0.3426125888859626, "grad_norm": 3.6069068908691406, "learning_rate": 3.2881222017382146e-05, "loss": 1.3208, "step": 13009 }, { "epoch": 0.3426389254674743, "grad_norm": 4.586077690124512, "learning_rate": 3.287990518830656e-05, "loss": 1.0181, "step": 13010 }, { "epoch": 0.34266526204898606, "grad_norm": 4.205984115600586, "learning_rate": 3.287858835923097e-05, "loss": 2.0801, "step": 13011 }, { "epoch": 0.34269159863049775, "grad_norm": 1.795978307723999, "learning_rate": 3.2877271530155386e-05, "loss": 1.7916, "step": 13012 }, { "epoch": 0.3427179352120095, "grad_norm": 2.2747647762298584, "learning_rate": 3.28759547010798e-05, "loss": 0.5522, "step": 13013 }, { "epoch": 0.3427442717935212, "grad_norm": 2.388296604156494, "learning_rate": 3.287463787200422e-05, "loss": 2.6569, "step": 13014 }, { "epoch": 0.3427706083750329, "grad_norm": 3.4760310649871826, "learning_rate": 3.2873321042928626e-05, "loss": 1.2178, "step": 13015 }, { "epoch": 0.3427969449565446, "grad_norm": 2.434558868408203, "learning_rate": 3.287200421385304e-05, "loss": 1.6785, "step": 13016 }, { "epoch": 0.34282328153805636, "grad_norm": 1.4341470003128052, "learning_rate": 3.287068738477746e-05, "loss": 1.9739, "step": 13017 }, { "epoch": 0.3428496181195681, "grad_norm": 1.779070496559143, "learning_rate": 3.286937055570187e-05, "loss": 2.2729, "step": 13018 }, { "epoch": 0.3428759547010798, "grad_norm": 1.7738845348358154, "learning_rate": 3.286805372662629e-05, "loss": 2.2236, "step": 13019 }, { "epoch": 0.34290229128259153, "grad_norm": 1.6878130435943604, "learning_rate": 3.28667368975507e-05, "loss": 1.8886, "step": 13020 }, { "epoch": 0.3429286278641032, "grad_norm": 2.979458808898926, "learning_rate": 3.286542006847511e-05, "loss": 
1.8621, "step": 13021 }, { "epoch": 0.34295496444561496, "grad_norm": 2.4134817123413086, "learning_rate": 3.286410323939953e-05, "loss": 1.8015, "step": 13022 }, { "epoch": 0.3429813010271267, "grad_norm": 1.909686803817749, "learning_rate": 3.2862786410323944e-05, "loss": 1.6881, "step": 13023 }, { "epoch": 0.3430076376086384, "grad_norm": 2.2279653549194336, "learning_rate": 3.286146958124835e-05, "loss": 2.0882, "step": 13024 }, { "epoch": 0.34303397419015014, "grad_norm": 2.4747092723846436, "learning_rate": 3.286015275217277e-05, "loss": 1.9472, "step": 13025 }, { "epoch": 0.3430603107716618, "grad_norm": 1.6904834508895874, "learning_rate": 3.2858835923097184e-05, "loss": 2.2355, "step": 13026 }, { "epoch": 0.34308664735317357, "grad_norm": 1.9961947202682495, "learning_rate": 3.28575190940216e-05, "loss": 2.1338, "step": 13027 }, { "epoch": 0.34311298393468526, "grad_norm": 3.1862409114837646, "learning_rate": 3.2856202264946015e-05, "loss": 1.42, "step": 13028 }, { "epoch": 0.343139320516197, "grad_norm": 2.469804286956787, "learning_rate": 3.2854885435870424e-05, "loss": 2.1905, "step": 13029 }, { "epoch": 0.34316565709770874, "grad_norm": 1.7831059694290161, "learning_rate": 3.285356860679484e-05, "loss": 1.8759, "step": 13030 }, { "epoch": 0.34319199367922043, "grad_norm": 3.725809335708618, "learning_rate": 3.2852251777719255e-05, "loss": 2.3442, "step": 13031 }, { "epoch": 0.3432183302607322, "grad_norm": 1.7289234399795532, "learning_rate": 3.285093494864367e-05, "loss": 0.5565, "step": 13032 }, { "epoch": 0.34324466684224386, "grad_norm": 1.9075827598571777, "learning_rate": 3.284961811956808e-05, "loss": 1.2642, "step": 13033 }, { "epoch": 0.3432710034237556, "grad_norm": 2.122783899307251, "learning_rate": 3.2848301290492495e-05, "loss": 1.82, "step": 13034 }, { "epoch": 0.3432973400052673, "grad_norm": 2.0404770374298096, "learning_rate": 3.284698446141691e-05, "loss": 1.8464, "step": 13035 }, { "epoch": 0.34332367658677904, "grad_norm": 
5.143828868865967, "learning_rate": 3.2845667632341326e-05, "loss": 1.7895, "step": 13036 }, { "epoch": 0.3433500131682908, "grad_norm": 2.072608470916748, "learning_rate": 3.284435080326574e-05, "loss": 1.6936, "step": 13037 }, { "epoch": 0.34337634974980247, "grad_norm": 2.0791525840759277, "learning_rate": 3.284303397419015e-05, "loss": 1.8769, "step": 13038 }, { "epoch": 0.3434026863313142, "grad_norm": 1.8462088108062744, "learning_rate": 3.2841717145114566e-05, "loss": 1.6844, "step": 13039 }, { "epoch": 0.3434290229128259, "grad_norm": 2.5627951622009277, "learning_rate": 3.2840400316038975e-05, "loss": 2.3559, "step": 13040 }, { "epoch": 0.34345535949433764, "grad_norm": 2.5939695835113525, "learning_rate": 3.28390834869634e-05, "loss": 1.325, "step": 13041 }, { "epoch": 0.34348169607584933, "grad_norm": 2.1293795108795166, "learning_rate": 3.2837766657887806e-05, "loss": 1.7575, "step": 13042 }, { "epoch": 0.3435080326573611, "grad_norm": 2.22837233543396, "learning_rate": 3.283644982881222e-05, "loss": 1.5616, "step": 13043 }, { "epoch": 0.3435343692388728, "grad_norm": 2.1316654682159424, "learning_rate": 3.283513299973664e-05, "loss": 1.1524, "step": 13044 }, { "epoch": 0.3435607058203845, "grad_norm": 3.323835849761963, "learning_rate": 3.283381617066105e-05, "loss": 1.7337, "step": 13045 }, { "epoch": 0.34358704240189625, "grad_norm": 2.016371488571167, "learning_rate": 3.283249934158547e-05, "loss": 1.7525, "step": 13046 }, { "epoch": 0.34361337898340794, "grad_norm": 1.7369242906570435, "learning_rate": 3.283118251250988e-05, "loss": 1.5029, "step": 13047 }, { "epoch": 0.3436397155649197, "grad_norm": 3.1583664417266846, "learning_rate": 3.282986568343429e-05, "loss": 1.9056, "step": 13048 }, { "epoch": 0.34366605214643137, "grad_norm": 1.6618739366531372, "learning_rate": 3.28285488543587e-05, "loss": 2.0452, "step": 13049 }, { "epoch": 0.3436923887279431, "grad_norm": 3.042189598083496, "learning_rate": 3.2827232025283124e-05, "loss": 1.2496, 
"step": 13050 }, { "epoch": 0.34371872530945485, "grad_norm": 1.8863723278045654, "learning_rate": 3.282591519620753e-05, "loss": 1.9603, "step": 13051 }, { "epoch": 0.34374506189096654, "grad_norm": 4.606906890869141, "learning_rate": 3.282459836713195e-05, "loss": 2.0634, "step": 13052 }, { "epoch": 0.3437713984724783, "grad_norm": 2.1368930339813232, "learning_rate": 3.282328153805636e-05, "loss": 2.0722, "step": 13053 }, { "epoch": 0.34379773505399, "grad_norm": 4.334184169769287, "learning_rate": 3.282196470898077e-05, "loss": 2.0015, "step": 13054 }, { "epoch": 0.3438240716355017, "grad_norm": 4.264140605926514, "learning_rate": 3.282064787990519e-05, "loss": 1.1041, "step": 13055 }, { "epoch": 0.34385040821701346, "grad_norm": 1.9202227592468262, "learning_rate": 3.2819331050829604e-05, "loss": 1.6563, "step": 13056 }, { "epoch": 0.34387674479852515, "grad_norm": 1.792994737625122, "learning_rate": 3.281801422175402e-05, "loss": 1.588, "step": 13057 }, { "epoch": 0.3439030813800369, "grad_norm": 1.7406229972839355, "learning_rate": 3.281669739267843e-05, "loss": 2.0877, "step": 13058 }, { "epoch": 0.3439294179615486, "grad_norm": 2.530524253845215, "learning_rate": 3.2815380563602844e-05, "loss": 1.5121, "step": 13059 }, { "epoch": 0.3439557545430603, "grad_norm": 1.6033681631088257, "learning_rate": 3.281406373452726e-05, "loss": 1.7437, "step": 13060 }, { "epoch": 0.343982091124572, "grad_norm": 3.5794901847839355, "learning_rate": 3.2812746905451676e-05, "loss": 1.6998, "step": 13061 }, { "epoch": 0.34400842770608375, "grad_norm": 1.7232699394226074, "learning_rate": 3.2811430076376084e-05, "loss": 1.8842, "step": 13062 }, { "epoch": 0.3440347642875955, "grad_norm": 2.4835448265075684, "learning_rate": 3.28101132473005e-05, "loss": 2.1897, "step": 13063 }, { "epoch": 0.3440611008691072, "grad_norm": 4.697007179260254, "learning_rate": 3.2808796418224916e-05, "loss": 1.6079, "step": 13064 }, { "epoch": 0.3440874374506189, "grad_norm": 1.7171131372451782, 
"learning_rate": 3.280747958914933e-05, "loss": 2.024, "step": 13065 }, { "epoch": 0.3441137740321306, "grad_norm": 4.3096022605896, "learning_rate": 3.280616276007375e-05, "loss": 1.5266, "step": 13066 }, { "epoch": 0.34414011061364236, "grad_norm": 3.385385751724243, "learning_rate": 3.2804845930998156e-05, "loss": 0.9625, "step": 13067 }, { "epoch": 0.34416644719515405, "grad_norm": 1.8404408693313599, "learning_rate": 3.280352910192257e-05, "loss": 1.7764, "step": 13068 }, { "epoch": 0.3441927837766658, "grad_norm": 1.7703425884246826, "learning_rate": 3.280221227284699e-05, "loss": 1.8662, "step": 13069 }, { "epoch": 0.34421912035817753, "grad_norm": 2.0454976558685303, "learning_rate": 3.28008954437714e-05, "loss": 2.2252, "step": 13070 }, { "epoch": 0.3442454569396892, "grad_norm": 2.457709312438965, "learning_rate": 3.279957861469581e-05, "loss": 2.1304, "step": 13071 }, { "epoch": 0.34427179352120096, "grad_norm": 1.8219228982925415, "learning_rate": 3.279826178562023e-05, "loss": 1.4442, "step": 13072 }, { "epoch": 0.34429813010271265, "grad_norm": 1.6668879985809326, "learning_rate": 3.279694495654464e-05, "loss": 2.169, "step": 13073 }, { "epoch": 0.3443244666842244, "grad_norm": 1.4907307624816895, "learning_rate": 3.279562812746906e-05, "loss": 1.8519, "step": 13074 }, { "epoch": 0.3443508032657361, "grad_norm": 1.9547961950302124, "learning_rate": 3.2794311298393474e-05, "loss": 1.5461, "step": 13075 }, { "epoch": 0.3443771398472478, "grad_norm": 2.242504596710205, "learning_rate": 3.279299446931788e-05, "loss": 2.4091, "step": 13076 }, { "epoch": 0.34440347642875957, "grad_norm": 4.072727203369141, "learning_rate": 3.27916776402423e-05, "loss": 2.1891, "step": 13077 }, { "epoch": 0.34442981301027126, "grad_norm": 1.9525781869888306, "learning_rate": 3.2790360811166714e-05, "loss": 1.9789, "step": 13078 }, { "epoch": 0.344456149591783, "grad_norm": 1.893464207649231, "learning_rate": 3.278904398209113e-05, "loss": 0.7665, "step": 13079 }, { "epoch": 
0.3444824861732947, "grad_norm": 3.1126601696014404, "learning_rate": 3.278772715301554e-05, "loss": 1.3905, "step": 13080 }, { "epoch": 0.34450882275480643, "grad_norm": 1.7459403276443481, "learning_rate": 3.2786410323939954e-05, "loss": 1.6161, "step": 13081 }, { "epoch": 0.3445351593363181, "grad_norm": 1.9930229187011719, "learning_rate": 3.278509349486437e-05, "loss": 1.4954, "step": 13082 }, { "epoch": 0.34456149591782986, "grad_norm": 1.9269921779632568, "learning_rate": 3.2783776665788785e-05, "loss": 1.9001, "step": 13083 }, { "epoch": 0.3445878324993416, "grad_norm": 3.482120990753174, "learning_rate": 3.27824598367132e-05, "loss": 1.5318, "step": 13084 }, { "epoch": 0.3446141690808533, "grad_norm": 1.725644588470459, "learning_rate": 3.278114300763761e-05, "loss": 1.8467, "step": 13085 }, { "epoch": 0.34464050566236504, "grad_norm": 1.6535245180130005, "learning_rate": 3.2779826178562025e-05, "loss": 2.6959, "step": 13086 }, { "epoch": 0.3446668422438767, "grad_norm": 3.432035446166992, "learning_rate": 3.2778509349486434e-05, "loss": 1.0838, "step": 13087 }, { "epoch": 0.34469317882538847, "grad_norm": 2.724940061569214, "learning_rate": 3.2777192520410856e-05, "loss": 2.7897, "step": 13088 }, { "epoch": 0.3447195154069002, "grad_norm": 2.3714377880096436, "learning_rate": 3.2775875691335265e-05, "loss": 1.0903, "step": 13089 }, { "epoch": 0.3447458519884119, "grad_norm": 1.5452818870544434, "learning_rate": 3.277455886225968e-05, "loss": 2.132, "step": 13090 }, { "epoch": 0.34477218856992364, "grad_norm": 1.769982099533081, "learning_rate": 3.2773242033184096e-05, "loss": 1.8803, "step": 13091 }, { "epoch": 0.34479852515143533, "grad_norm": 2.5638909339904785, "learning_rate": 3.2771925204108505e-05, "loss": 1.83, "step": 13092 }, { "epoch": 0.3448248617329471, "grad_norm": 1.5546481609344482, "learning_rate": 3.277060837503293e-05, "loss": 1.5173, "step": 13093 }, { "epoch": 0.34485119831445876, "grad_norm": 6.6082305908203125, "learning_rate": 
3.2769291545957336e-05, "loss": 1.6051, "step": 13094 }, { "epoch": 0.3448775348959705, "grad_norm": 2.1726670265197754, "learning_rate": 3.276797471688175e-05, "loss": 1.6503, "step": 13095 }, { "epoch": 0.34490387147748225, "grad_norm": 2.1224615573883057, "learning_rate": 3.276665788780616e-05, "loss": 1.6912, "step": 13096 }, { "epoch": 0.34493020805899394, "grad_norm": 4.337820529937744, "learning_rate": 3.276534105873058e-05, "loss": 1.7426, "step": 13097 }, { "epoch": 0.3449565446405057, "grad_norm": 3.636662244796753, "learning_rate": 3.276402422965499e-05, "loss": 1.1134, "step": 13098 }, { "epoch": 0.34498288122201737, "grad_norm": 2.017798900604248, "learning_rate": 3.276270740057941e-05, "loss": 1.544, "step": 13099 }, { "epoch": 0.3450092178035291, "grad_norm": 2.245283365249634, "learning_rate": 3.2761390571503816e-05, "loss": 0.43, "step": 13100 }, { "epoch": 0.3450355543850408, "grad_norm": 1.9564303159713745, "learning_rate": 3.276007374242823e-05, "loss": 2.0878, "step": 13101 }, { "epoch": 0.34506189096655254, "grad_norm": 1.8316985368728638, "learning_rate": 3.275875691335265e-05, "loss": 1.8981, "step": 13102 }, { "epoch": 0.3450882275480643, "grad_norm": 2.384197950363159, "learning_rate": 3.275744008427706e-05, "loss": 2.1968, "step": 13103 }, { "epoch": 0.345114564129576, "grad_norm": 1.8945741653442383, "learning_rate": 3.275612325520148e-05, "loss": 0.467, "step": 13104 }, { "epoch": 0.3451409007110877, "grad_norm": 4.219059467315674, "learning_rate": 3.275480642612589e-05, "loss": 1.385, "step": 13105 }, { "epoch": 0.3451672372925994, "grad_norm": 1.6160128116607666, "learning_rate": 3.27534895970503e-05, "loss": 1.9738, "step": 13106 }, { "epoch": 0.34519357387411115, "grad_norm": 2.2300801277160645, "learning_rate": 3.275217276797472e-05, "loss": 1.9553, "step": 13107 }, { "epoch": 0.34521991045562284, "grad_norm": 2.761461019515991, "learning_rate": 3.2750855938899134e-05, "loss": 1.8339, "step": 13108 }, { "epoch": 0.3452462470371346, 
"grad_norm": 1.5213525295257568, "learning_rate": 3.274953910982354e-05, "loss": 1.2781, "step": 13109 }, { "epoch": 0.3452725836186463, "grad_norm": 4.686323165893555, "learning_rate": 3.274822228074796e-05, "loss": 1.2551, "step": 13110 }, { "epoch": 0.345298920200158, "grad_norm": 1.545095682144165, "learning_rate": 3.2746905451672374e-05, "loss": 1.1122, "step": 13111 }, { "epoch": 0.34532525678166975, "grad_norm": 2.2858619689941406, "learning_rate": 3.274558862259679e-05, "loss": 1.7444, "step": 13112 }, { "epoch": 0.34535159336318144, "grad_norm": 5.111452102661133, "learning_rate": 3.2744271793521205e-05, "loss": 1.7006, "step": 13113 }, { "epoch": 0.3453779299446932, "grad_norm": 1.594421625137329, "learning_rate": 3.2742954964445614e-05, "loss": 1.4181, "step": 13114 }, { "epoch": 0.3454042665262049, "grad_norm": 1.8380615711212158, "learning_rate": 3.274163813537003e-05, "loss": 1.8114, "step": 13115 }, { "epoch": 0.3454306031077166, "grad_norm": 2.3619723320007324, "learning_rate": 3.2740321306294445e-05, "loss": 1.2619, "step": 13116 }, { "epoch": 0.34545693968922836, "grad_norm": 2.5805907249450684, "learning_rate": 3.273900447721886e-05, "loss": 1.5314, "step": 13117 }, { "epoch": 0.34548327627074005, "grad_norm": 2.029003858566284, "learning_rate": 3.273768764814327e-05, "loss": 1.6718, "step": 13118 }, { "epoch": 0.3455096128522518, "grad_norm": 3.640721321105957, "learning_rate": 3.2736370819067685e-05, "loss": 0.8545, "step": 13119 }, { "epoch": 0.3455359494337635, "grad_norm": 2.7304906845092773, "learning_rate": 3.27350539899921e-05, "loss": 1.6673, "step": 13120 }, { "epoch": 0.3455622860152752, "grad_norm": 2.343400001525879, "learning_rate": 3.2733737160916517e-05, "loss": 1.9146, "step": 13121 }, { "epoch": 0.3455886225967869, "grad_norm": 3.9495620727539062, "learning_rate": 3.273242033184093e-05, "loss": 0.7853, "step": 13122 }, { "epoch": 0.34561495917829865, "grad_norm": 2.723336935043335, "learning_rate": 3.273110350276534e-05, "loss": 
2.1949, "step": 13123 }, { "epoch": 0.3456412957598104, "grad_norm": 2.9893717765808105, "learning_rate": 3.2729786673689757e-05, "loss": 0.6294, "step": 13124 }, { "epoch": 0.3456676323413221, "grad_norm": 3.2053024768829346, "learning_rate": 3.2728469844614165e-05, "loss": 1.748, "step": 13125 }, { "epoch": 0.3456939689228338, "grad_norm": 2.148850917816162, "learning_rate": 3.272715301553859e-05, "loss": 1.8449, "step": 13126 }, { "epoch": 0.3457203055043455, "grad_norm": 1.597330093383789, "learning_rate": 3.2725836186462997e-05, "loss": 2.1249, "step": 13127 }, { "epoch": 0.34574664208585726, "grad_norm": 2.352307081222534, "learning_rate": 3.272451935738741e-05, "loss": 1.9722, "step": 13128 }, { "epoch": 0.345772978667369, "grad_norm": 1.9829521179199219, "learning_rate": 3.272320252831183e-05, "loss": 1.6498, "step": 13129 }, { "epoch": 0.3457993152488807, "grad_norm": 2.9146358966827393, "learning_rate": 3.272188569923624e-05, "loss": 1.5053, "step": 13130 }, { "epoch": 0.34582565183039243, "grad_norm": 2.101445436477661, "learning_rate": 3.272056887016066e-05, "loss": 1.8205, "step": 13131 }, { "epoch": 0.3458519884119041, "grad_norm": 2.021803617477417, "learning_rate": 3.271925204108507e-05, "loss": 1.5883, "step": 13132 }, { "epoch": 0.34587832499341586, "grad_norm": 2.4753849506378174, "learning_rate": 3.2717935212009483e-05, "loss": 1.9541, "step": 13133 }, { "epoch": 0.34590466157492755, "grad_norm": 2.9785354137420654, "learning_rate": 3.271661838293389e-05, "loss": 1.5607, "step": 13134 }, { "epoch": 0.3459309981564393, "grad_norm": 2.653076410293579, "learning_rate": 3.2715301553858315e-05, "loss": 0.7222, "step": 13135 }, { "epoch": 0.34595733473795104, "grad_norm": 4.669103622436523, "learning_rate": 3.2713984724782723e-05, "loss": 0.6319, "step": 13136 }, { "epoch": 0.3459836713194627, "grad_norm": 2.5796475410461426, "learning_rate": 3.271266789570714e-05, "loss": 1.6803, "step": 13137 }, { "epoch": 0.34601000790097447, "grad_norm": 
1.9838327169418335, "learning_rate": 3.2711351066631555e-05, "loss": 2.0145, "step": 13138 }, { "epoch": 0.34603634448248616, "grad_norm": 2.317049503326416, "learning_rate": 3.2710034237555963e-05, "loss": 1.7199, "step": 13139 }, { "epoch": 0.3460626810639979, "grad_norm": 1.638168215751648, "learning_rate": 3.2708717408480386e-05, "loss": 1.2904, "step": 13140 }, { "epoch": 0.3460890176455096, "grad_norm": 1.910266399383545, "learning_rate": 3.2707400579404795e-05, "loss": 0.8098, "step": 13141 }, { "epoch": 0.34611535422702133, "grad_norm": 2.1312108039855957, "learning_rate": 3.270608375032921e-05, "loss": 1.7403, "step": 13142 }, { "epoch": 0.3461416908085331, "grad_norm": 2.5649542808532715, "learning_rate": 3.270476692125362e-05, "loss": 1.4819, "step": 13143 }, { "epoch": 0.34616802739004476, "grad_norm": 3.8648250102996826, "learning_rate": 3.270345009217804e-05, "loss": 1.5417, "step": 13144 }, { "epoch": 0.3461943639715565, "grad_norm": 3.1407723426818848, "learning_rate": 3.270213326310245e-05, "loss": 1.0769, "step": 13145 }, { "epoch": 0.3462207005530682, "grad_norm": 3.0411038398742676, "learning_rate": 3.2700816434026866e-05, "loss": 1.1421, "step": 13146 }, { "epoch": 0.34624703713457994, "grad_norm": 3.575984239578247, "learning_rate": 3.269949960495128e-05, "loss": 2.2352, "step": 13147 }, { "epoch": 0.3462733737160916, "grad_norm": 2.0217156410217285, "learning_rate": 3.269818277587569e-05, "loss": 0.9092, "step": 13148 }, { "epoch": 0.34629971029760337, "grad_norm": 1.5097116231918335, "learning_rate": 3.269686594680011e-05, "loss": 1.5321, "step": 13149 }, { "epoch": 0.3463260468791151, "grad_norm": 2.2257752418518066, "learning_rate": 3.269554911772452e-05, "loss": 2.223, "step": 13150 }, { "epoch": 0.3463523834606268, "grad_norm": 1.7590159177780151, "learning_rate": 3.269423228864894e-05, "loss": 0.8483, "step": 13151 }, { "epoch": 0.34637872004213854, "grad_norm": 2.7122015953063965, "learning_rate": 3.2692915459573346e-05, "loss": 
1.8246, "step": 13152 }, { "epoch": 0.34640505662365023, "grad_norm": 4.413259983062744, "learning_rate": 3.269159863049776e-05, "loss": 1.0559, "step": 13153 }, { "epoch": 0.346431393205162, "grad_norm": 5.453820705413818, "learning_rate": 3.269028180142218e-05, "loss": 1.1155, "step": 13154 }, { "epoch": 0.34645772978667366, "grad_norm": 3.453749656677246, "learning_rate": 3.268896497234659e-05, "loss": 2.0343, "step": 13155 }, { "epoch": 0.3464840663681854, "grad_norm": 1.9833474159240723, "learning_rate": 3.2687648143271e-05, "loss": 0.4347, "step": 13156 }, { "epoch": 0.34651040294969715, "grad_norm": 2.256859540939331, "learning_rate": 3.268633131419542e-05, "loss": 0.6473, "step": 13157 }, { "epoch": 0.34653673953120884, "grad_norm": 1.7505801916122437, "learning_rate": 3.268501448511983e-05, "loss": 1.6198, "step": 13158 }, { "epoch": 0.3465630761127206, "grad_norm": 1.9900546073913574, "learning_rate": 3.268369765604425e-05, "loss": 1.9354, "step": 13159 }, { "epoch": 0.34658941269423227, "grad_norm": 2.2289633750915527, "learning_rate": 3.2682380826968664e-05, "loss": 1.5528, "step": 13160 }, { "epoch": 0.346615749275744, "grad_norm": 1.8603487014770508, "learning_rate": 3.268106399789307e-05, "loss": 2.3699, "step": 13161 }, { "epoch": 0.34664208585725576, "grad_norm": 3.21364164352417, "learning_rate": 3.267974716881749e-05, "loss": 1.2129, "step": 13162 }, { "epoch": 0.34666842243876744, "grad_norm": 3.918067455291748, "learning_rate": 3.2678430339741904e-05, "loss": 1.1659, "step": 13163 }, { "epoch": 0.3466947590202792, "grad_norm": 1.5441951751708984, "learning_rate": 3.267711351066632e-05, "loss": 1.3983, "step": 13164 }, { "epoch": 0.3467210956017909, "grad_norm": 2.0096254348754883, "learning_rate": 3.267579668159073e-05, "loss": 0.4344, "step": 13165 }, { "epoch": 0.3467474321833026, "grad_norm": 2.4570610523223877, "learning_rate": 3.2674479852515144e-05, "loss": 1.6607, "step": 13166 }, { "epoch": 0.3467737687648143, "grad_norm": 
2.5036914348602295, "learning_rate": 3.267316302343956e-05, "loss": 0.5084, "step": 13167 }, { "epoch": 0.34680010534632605, "grad_norm": 3.401029348373413, "learning_rate": 3.2671846194363975e-05, "loss": 2.0058, "step": 13168 }, { "epoch": 0.3468264419278378, "grad_norm": 1.9968043565750122, "learning_rate": 3.267052936528839e-05, "loss": 2.4615, "step": 13169 }, { "epoch": 0.3468527785093495, "grad_norm": 1.5256257057189941, "learning_rate": 3.26692125362128e-05, "loss": 1.7771, "step": 13170 }, { "epoch": 0.3468791150908612, "grad_norm": 1.498011827468872, "learning_rate": 3.2667895707137215e-05, "loss": 1.7604, "step": 13171 }, { "epoch": 0.3469054516723729, "grad_norm": 1.7540767192840576, "learning_rate": 3.2666578878061624e-05, "loss": 1.9506, "step": 13172 }, { "epoch": 0.34693178825388465, "grad_norm": 4.443070411682129, "learning_rate": 3.2665262048986046e-05, "loss": 1.3594, "step": 13173 }, { "epoch": 0.34695812483539634, "grad_norm": 1.8055349588394165, "learning_rate": 3.2663945219910455e-05, "loss": 1.6866, "step": 13174 }, { "epoch": 0.3469844614169081, "grad_norm": 3.2847204208374023, "learning_rate": 3.266262839083487e-05, "loss": 0.4526, "step": 13175 }, { "epoch": 0.34701079799841983, "grad_norm": 1.8152081966400146, "learning_rate": 3.2661311561759286e-05, "loss": 2.342, "step": 13176 }, { "epoch": 0.3470371345799315, "grad_norm": 4.156172275543213, "learning_rate": 3.26599947326837e-05, "loss": 1.5455, "step": 13177 }, { "epoch": 0.34706347116144326, "grad_norm": 3.141244888305664, "learning_rate": 3.265867790360812e-05, "loss": 2.1393, "step": 13178 }, { "epoch": 0.34708980774295495, "grad_norm": 1.998428463935852, "learning_rate": 3.2657361074532526e-05, "loss": 0.3482, "step": 13179 }, { "epoch": 0.3471161443244667, "grad_norm": 1.8900747299194336, "learning_rate": 3.265604424545694e-05, "loss": 2.0578, "step": 13180 }, { "epoch": 0.3471424809059784, "grad_norm": 1.4871665239334106, "learning_rate": 3.265472741638135e-05, "loss": 0.9511, 
"step": 13181 }, { "epoch": 0.3471688174874901, "grad_norm": 1.6186096668243408, "learning_rate": 3.265341058730577e-05, "loss": 2.3809, "step": 13182 }, { "epoch": 0.34719515406900187, "grad_norm": 2.7033965587615967, "learning_rate": 3.265209375823018e-05, "loss": 1.4537, "step": 13183 }, { "epoch": 0.34722149065051355, "grad_norm": 2.3884329795837402, "learning_rate": 3.26507769291546e-05, "loss": 2.3727, "step": 13184 }, { "epoch": 0.3472478272320253, "grad_norm": 2.401170492172241, "learning_rate": 3.264946010007901e-05, "loss": 2.3979, "step": 13185 }, { "epoch": 0.347274163813537, "grad_norm": 1.9125306606292725, "learning_rate": 3.264814327100342e-05, "loss": 1.0401, "step": 13186 }, { "epoch": 0.34730050039504873, "grad_norm": 3.3054792881011963, "learning_rate": 3.2646826441927844e-05, "loss": 1.7221, "step": 13187 }, { "epoch": 0.3473268369765604, "grad_norm": 1.8396989107131958, "learning_rate": 3.264550961285225e-05, "loss": 2.0625, "step": 13188 }, { "epoch": 0.34735317355807216, "grad_norm": 3.386190891265869, "learning_rate": 3.264419278377667e-05, "loss": 1.1385, "step": 13189 }, { "epoch": 0.3473795101395839, "grad_norm": 2.0150368213653564, "learning_rate": 3.264287595470108e-05, "loss": 1.9855, "step": 13190 }, { "epoch": 0.3474058467210956, "grad_norm": 4.199228286743164, "learning_rate": 3.264155912562549e-05, "loss": 0.7868, "step": 13191 }, { "epoch": 0.34743218330260733, "grad_norm": 1.7155773639678955, "learning_rate": 3.264024229654991e-05, "loss": 2.4113, "step": 13192 }, { "epoch": 0.347458519884119, "grad_norm": 4.085933208465576, "learning_rate": 3.2638925467474324e-05, "loss": 1.4366, "step": 13193 }, { "epoch": 0.34748485646563076, "grad_norm": 1.725295901298523, "learning_rate": 3.263760863839874e-05, "loss": 1.8667, "step": 13194 }, { "epoch": 0.3475111930471425, "grad_norm": 2.126094341278076, "learning_rate": 3.263629180932315e-05, "loss": 1.7663, "step": 13195 }, { "epoch": 0.3475375296286542, "grad_norm": 2.895192861557007, 
"learning_rate": 3.263497498024757e-05, "loss": 1.2456, "step": 13196 }, { "epoch": 0.34756386621016594, "grad_norm": 4.887364387512207, "learning_rate": 3.263365815117198e-05, "loss": 2.3179, "step": 13197 }, { "epoch": 0.3475902027916776, "grad_norm": 2.686774253845215, "learning_rate": 3.2632341322096396e-05, "loss": 1.737, "step": 13198 }, { "epoch": 0.34761653937318937, "grad_norm": 1.8099535703659058, "learning_rate": 3.2631024493020804e-05, "loss": 1.4543, "step": 13199 }, { "epoch": 0.34764287595470106, "grad_norm": 2.709017038345337, "learning_rate": 3.262970766394522e-05, "loss": 1.5653, "step": 13200 }, { "epoch": 0.3476692125362128, "grad_norm": 2.230703592300415, "learning_rate": 3.2628390834869636e-05, "loss": 1.7813, "step": 13201 }, { "epoch": 0.34769554911772454, "grad_norm": 2.315885543823242, "learning_rate": 3.262707400579405e-05, "loss": 1.67, "step": 13202 }, { "epoch": 0.34772188569923623, "grad_norm": 2.4082462787628174, "learning_rate": 3.262575717671846e-05, "loss": 1.6513, "step": 13203 }, { "epoch": 0.347748222280748, "grad_norm": 7.442232131958008, "learning_rate": 3.2624440347642876e-05, "loss": 1.8802, "step": 13204 }, { "epoch": 0.34777455886225966, "grad_norm": 1.9536820650100708, "learning_rate": 3.262312351856729e-05, "loss": 2.2042, "step": 13205 }, { "epoch": 0.3478008954437714, "grad_norm": 3.228766679763794, "learning_rate": 3.262180668949171e-05, "loss": 1.863, "step": 13206 }, { "epoch": 0.3478272320252831, "grad_norm": 1.9316649436950684, "learning_rate": 3.262048986041612e-05, "loss": 2.1574, "step": 13207 }, { "epoch": 0.34785356860679484, "grad_norm": 1.660380244255066, "learning_rate": 3.261917303134053e-05, "loss": 1.8578, "step": 13208 }, { "epoch": 0.3478799051883066, "grad_norm": 1.4779479503631592, "learning_rate": 3.261785620226495e-05, "loss": 0.3396, "step": 13209 }, { "epoch": 0.34790624176981827, "grad_norm": 2.8942437171936035, "learning_rate": 3.261653937318936e-05, "loss": 0.5271, "step": 13210 }, { 
"epoch": 0.34793257835133, "grad_norm": 2.0104780197143555, "learning_rate": 3.261522254411378e-05, "loss": 2.1426, "step": 13211 }, { "epoch": 0.3479589149328417, "grad_norm": 1.9063493013381958, "learning_rate": 3.261390571503819e-05, "loss": 0.3851, "step": 13212 }, { "epoch": 0.34798525151435344, "grad_norm": 2.0151333808898926, "learning_rate": 3.26125888859626e-05, "loss": 1.5941, "step": 13213 }, { "epoch": 0.34801158809586513, "grad_norm": 1.9707542657852173, "learning_rate": 3.261127205688702e-05, "loss": 1.3774, "step": 13214 }, { "epoch": 0.3480379246773769, "grad_norm": 2.3949368000030518, "learning_rate": 3.2609955227811434e-05, "loss": 1.7263, "step": 13215 }, { "epoch": 0.3480642612588886, "grad_norm": 1.9590846300125122, "learning_rate": 3.260863839873585e-05, "loss": 1.8277, "step": 13216 }, { "epoch": 0.3480905978404003, "grad_norm": 2.1931588649749756, "learning_rate": 3.260732156966026e-05, "loss": 1.124, "step": 13217 }, { "epoch": 0.34811693442191205, "grad_norm": 2.7168567180633545, "learning_rate": 3.2606004740584674e-05, "loss": 1.2126, "step": 13218 }, { "epoch": 0.34814327100342374, "grad_norm": 2.025538206100464, "learning_rate": 3.260468791150908e-05, "loss": 1.4572, "step": 13219 }, { "epoch": 0.3481696075849355, "grad_norm": 3.5675971508026123, "learning_rate": 3.2603371082433505e-05, "loss": 1.3672, "step": 13220 }, { "epoch": 0.34819594416644717, "grad_norm": 1.7756645679473877, "learning_rate": 3.2602054253357914e-05, "loss": 1.5728, "step": 13221 }, { "epoch": 0.3482222807479589, "grad_norm": 1.990281581878662, "learning_rate": 3.260073742428233e-05, "loss": 1.4883, "step": 13222 }, { "epoch": 0.34824861732947066, "grad_norm": 1.7624545097351074, "learning_rate": 3.2599420595206745e-05, "loss": 1.85, "step": 13223 }, { "epoch": 0.34827495391098234, "grad_norm": 3.2606406211853027, "learning_rate": 3.2598103766131154e-05, "loss": 1.872, "step": 13224 }, { "epoch": 0.3483012904924941, "grad_norm": 3.6914162635803223, 
"learning_rate": 3.2596786937055576e-05, "loss": 1.2663, "step": 13225 }, { "epoch": 0.3483276270740058, "grad_norm": 2.4079458713531494, "learning_rate": 3.2595470107979985e-05, "loss": 1.5693, "step": 13226 }, { "epoch": 0.3483539636555175, "grad_norm": 3.0475223064422607, "learning_rate": 3.25941532789044e-05, "loss": 2.0814, "step": 13227 }, { "epoch": 0.34838030023702926, "grad_norm": 2.0033042430877686, "learning_rate": 3.259283644982881e-05, "loss": 1.902, "step": 13228 }, { "epoch": 0.34840663681854095, "grad_norm": 2.124267101287842, "learning_rate": 3.259151962075323e-05, "loss": 0.7724, "step": 13229 }, { "epoch": 0.3484329734000527, "grad_norm": 3.014657974243164, "learning_rate": 3.259020279167764e-05, "loss": 1.4377, "step": 13230 }, { "epoch": 0.3484593099815644, "grad_norm": 2.3430721759796143, "learning_rate": 3.2588885962602056e-05, "loss": 1.7488, "step": 13231 }, { "epoch": 0.3484856465630761, "grad_norm": 1.9165147542953491, "learning_rate": 3.258756913352647e-05, "loss": 2.018, "step": 13232 }, { "epoch": 0.3485119831445878, "grad_norm": 2.3940341472625732, "learning_rate": 3.258625230445088e-05, "loss": 1.2182, "step": 13233 }, { "epoch": 0.34853831972609955, "grad_norm": 2.1990294456481934, "learning_rate": 3.25849354753753e-05, "loss": 1.9036, "step": 13234 }, { "epoch": 0.3485646563076113, "grad_norm": 2.697700262069702, "learning_rate": 3.258361864629971e-05, "loss": 1.5049, "step": 13235 }, { "epoch": 0.348590992889123, "grad_norm": 3.199383497238159, "learning_rate": 3.258230181722413e-05, "loss": 1.9841, "step": 13236 }, { "epoch": 0.34861732947063473, "grad_norm": 2.8425440788269043, "learning_rate": 3.2580984988148536e-05, "loss": 2.4673, "step": 13237 }, { "epoch": 0.3486436660521464, "grad_norm": 2.4211385250091553, "learning_rate": 3.257966815907295e-05, "loss": 1.9615, "step": 13238 }, { "epoch": 0.34867000263365816, "grad_norm": 3.0204195976257324, "learning_rate": 3.257835132999737e-05, "loss": 0.7215, "step": 13239 }, { 
"epoch": 0.34869633921516985, "grad_norm": 1.7180991172790527, "learning_rate": 3.257703450092178e-05, "loss": 2.0885, "step": 13240 }, { "epoch": 0.3487226757966816, "grad_norm": 9.043501853942871, "learning_rate": 3.25757176718462e-05, "loss": 2.3929, "step": 13241 }, { "epoch": 0.34874901237819333, "grad_norm": 1.855078101158142, "learning_rate": 3.257440084277061e-05, "loss": 1.5952, "step": 13242 }, { "epoch": 0.348775348959705, "grad_norm": 2.160182237625122, "learning_rate": 3.257308401369503e-05, "loss": 0.4405, "step": 13243 }, { "epoch": 0.34880168554121677, "grad_norm": 1.6421520709991455, "learning_rate": 3.257176718461944e-05, "loss": 1.5546, "step": 13244 }, { "epoch": 0.34882802212272845, "grad_norm": 3.1143510341644287, "learning_rate": 3.2570450355543854e-05, "loss": 1.4497, "step": 13245 }, { "epoch": 0.3488543587042402, "grad_norm": 3.018958330154419, "learning_rate": 3.256913352646826e-05, "loss": 1.6898, "step": 13246 }, { "epoch": 0.3488806952857519, "grad_norm": 5.543989658355713, "learning_rate": 3.256781669739268e-05, "loss": 0.9067, "step": 13247 }, { "epoch": 0.34890703186726363, "grad_norm": 1.92924964427948, "learning_rate": 3.2566499868317094e-05, "loss": 1.7132, "step": 13248 }, { "epoch": 0.34893336844877537, "grad_norm": 1.6289355754852295, "learning_rate": 3.256518303924151e-05, "loss": 1.461, "step": 13249 }, { "epoch": 0.34895970503028706, "grad_norm": 1.9468270540237427, "learning_rate": 3.2563866210165925e-05, "loss": 1.6564, "step": 13250 }, { "epoch": 0.3489860416117988, "grad_norm": 1.8982024192810059, "learning_rate": 3.2562549381090334e-05, "loss": 1.5647, "step": 13251 }, { "epoch": 0.3490123781933105, "grad_norm": 3.0467891693115234, "learning_rate": 3.256123255201475e-05, "loss": 1.7748, "step": 13252 }, { "epoch": 0.34903871477482223, "grad_norm": 1.782957911491394, "learning_rate": 3.2559915722939165e-05, "loss": 2.0367, "step": 13253 }, { "epoch": 0.3490650513563339, "grad_norm": 10.435483932495117, "learning_rate": 
3.255859889386358e-05, "loss": 3.0346, "step": 13254 }, { "epoch": 0.34909138793784567, "grad_norm": 2.1778833866119385, "learning_rate": 3.255728206478799e-05, "loss": 1.4659, "step": 13255 }, { "epoch": 0.3491177245193574, "grad_norm": 2.911656141281128, "learning_rate": 3.2555965235712405e-05, "loss": 0.8559, "step": 13256 }, { "epoch": 0.3491440611008691, "grad_norm": 1.9022785425186157, "learning_rate": 3.2554648406636814e-05, "loss": 1.4912, "step": 13257 }, { "epoch": 0.34917039768238084, "grad_norm": 2.2376620769500732, "learning_rate": 3.2553331577561237e-05, "loss": 2.1282, "step": 13258 }, { "epoch": 0.3491967342638925, "grad_norm": 2.574781894683838, "learning_rate": 3.2552014748485645e-05, "loss": 1.3146, "step": 13259 }, { "epoch": 0.34922307084540427, "grad_norm": 2.566833019256592, "learning_rate": 3.255069791941006e-05, "loss": 1.9038, "step": 13260 }, { "epoch": 0.34924940742691596, "grad_norm": 3.688613176345825, "learning_rate": 3.254938109033448e-05, "loss": 1.5275, "step": 13261 }, { "epoch": 0.3492757440084277, "grad_norm": 2.2053143978118896, "learning_rate": 3.254806426125889e-05, "loss": 1.5929, "step": 13262 }, { "epoch": 0.34930208058993945, "grad_norm": 1.8571240901947021, "learning_rate": 3.254674743218331e-05, "loss": 1.4883, "step": 13263 }, { "epoch": 0.34932841717145113, "grad_norm": 1.7839723825454712, "learning_rate": 3.254543060310772e-05, "loss": 0.7762, "step": 13264 }, { "epoch": 0.3493547537529629, "grad_norm": 1.915321707725525, "learning_rate": 3.254411377403213e-05, "loss": 2.0091, "step": 13265 }, { "epoch": 0.34938109033447456, "grad_norm": 1.7027623653411865, "learning_rate": 3.254279694495654e-05, "loss": 1.6055, "step": 13266 }, { "epoch": 0.3494074269159863, "grad_norm": 1.6140773296356201, "learning_rate": 3.2541480115880963e-05, "loss": 1.0359, "step": 13267 }, { "epoch": 0.34943376349749805, "grad_norm": 2.3103713989257812, "learning_rate": 3.254016328680537e-05, "loss": 1.8346, "step": 13268 }, { "epoch": 
0.34946010007900974, "grad_norm": 2.0320658683776855, "learning_rate": 3.253884645772979e-05, "loss": 1.7831, "step": 13269 }, { "epoch": 0.3494864366605215, "grad_norm": 1.7528281211853027, "learning_rate": 3.2537529628654203e-05, "loss": 1.5211, "step": 13270 }, { "epoch": 0.34951277324203317, "grad_norm": 4.402647018432617, "learning_rate": 3.253621279957861e-05, "loss": 2.6433, "step": 13271 }, { "epoch": 0.3495391098235449, "grad_norm": 5.450698375701904, "learning_rate": 3.2534895970503035e-05, "loss": 1.4586, "step": 13272 }, { "epoch": 0.3495654464050566, "grad_norm": 1.3967461585998535, "learning_rate": 3.2533579141427443e-05, "loss": 1.3089, "step": 13273 }, { "epoch": 0.34959178298656834, "grad_norm": 2.8539626598358154, "learning_rate": 3.253226231235186e-05, "loss": 1.6664, "step": 13274 }, { "epoch": 0.3496181195680801, "grad_norm": 2.060029983520508, "learning_rate": 3.253094548327627e-05, "loss": 0.2958, "step": 13275 }, { "epoch": 0.3496444561495918, "grad_norm": 5.360213279724121, "learning_rate": 3.252962865420069e-05, "loss": 1.3407, "step": 13276 }, { "epoch": 0.3496707927311035, "grad_norm": 1.8764805793762207, "learning_rate": 3.25283118251251e-05, "loss": 2.253, "step": 13277 }, { "epoch": 0.3496971293126152, "grad_norm": 2.882615089416504, "learning_rate": 3.2526994996049515e-05, "loss": 2.2276, "step": 13278 }, { "epoch": 0.34972346589412695, "grad_norm": 2.2269115447998047, "learning_rate": 3.252567816697393e-05, "loss": 2.0075, "step": 13279 }, { "epoch": 0.34974980247563864, "grad_norm": 2.1476926803588867, "learning_rate": 3.252436133789834e-05, "loss": 0.8737, "step": 13280 }, { "epoch": 0.3497761390571504, "grad_norm": 2.3212385177612305, "learning_rate": 3.252304450882276e-05, "loss": 1.7872, "step": 13281 }, { "epoch": 0.3498024756386621, "grad_norm": 1.9313105344772339, "learning_rate": 3.252172767974717e-05, "loss": 2.3175, "step": 13282 }, { "epoch": 0.3498288122201738, "grad_norm": 5.24793815612793, "learning_rate": 
3.2520410850671586e-05, "loss": 1.6308, "step": 13283 }, { "epoch": 0.34985514880168556, "grad_norm": 5.203807830810547, "learning_rate": 3.2519094021595995e-05, "loss": 1.5655, "step": 13284 }, { "epoch": 0.34988148538319724, "grad_norm": 1.610917329788208, "learning_rate": 3.251777719252041e-05, "loss": 1.7317, "step": 13285 }, { "epoch": 0.349907821964709, "grad_norm": 3.397705316543579, "learning_rate": 3.2516460363444826e-05, "loss": 0.3327, "step": 13286 }, { "epoch": 0.3499341585462207, "grad_norm": 2.4374136924743652, "learning_rate": 3.251514353436924e-05, "loss": 2.3435, "step": 13287 }, { "epoch": 0.3499604951277324, "grad_norm": 1.9728240966796875, "learning_rate": 3.251382670529366e-05, "loss": 1.6025, "step": 13288 }, { "epoch": 0.34998683170924416, "grad_norm": 1.6892491579055786, "learning_rate": 3.2512509876218066e-05, "loss": 2.0315, "step": 13289 }, { "epoch": 0.35001316829075585, "grad_norm": 3.261169195175171, "learning_rate": 3.251119304714248e-05, "loss": 2.3634, "step": 13290 }, { "epoch": 0.3500395048722676, "grad_norm": 2.568793296813965, "learning_rate": 3.25098762180669e-05, "loss": 1.3906, "step": 13291 }, { "epoch": 0.3500658414537793, "grad_norm": 2.0497589111328125, "learning_rate": 3.250855938899131e-05, "loss": 2.0864, "step": 13292 }, { "epoch": 0.350092178035291, "grad_norm": 2.225813627243042, "learning_rate": 3.250724255991572e-05, "loss": 0.2089, "step": 13293 }, { "epoch": 0.3501185146168027, "grad_norm": 2.0244300365448, "learning_rate": 3.250592573084014e-05, "loss": 1.4985, "step": 13294 }, { "epoch": 0.35014485119831446, "grad_norm": 2.8434624671936035, "learning_rate": 3.250460890176455e-05, "loss": 0.8789, "step": 13295 }, { "epoch": 0.3501711877798262, "grad_norm": 1.7693883180618286, "learning_rate": 3.250329207268897e-05, "loss": 2.1361, "step": 13296 }, { "epoch": 0.3501975243613379, "grad_norm": 2.94120454788208, "learning_rate": 3.2501975243613384e-05, "loss": 2.1523, "step": 13297 }, { "epoch": 
0.35022386094284963, "grad_norm": 2.399862766265869, "learning_rate": 3.250065841453779e-05, "loss": 1.3294, "step": 13298 }, { "epoch": 0.3502501975243613, "grad_norm": 2.7700185775756836, "learning_rate": 3.249934158546221e-05, "loss": 1.2198, "step": 13299 }, { "epoch": 0.35027653410587306, "grad_norm": 2.1279079914093018, "learning_rate": 3.2498024756386624e-05, "loss": 1.683, "step": 13300 }, { "epoch": 0.3503028706873848, "grad_norm": 3.119107723236084, "learning_rate": 3.249670792731104e-05, "loss": 0.5042, "step": 13301 }, { "epoch": 0.3503292072688965, "grad_norm": 3.5494678020477295, "learning_rate": 3.249539109823545e-05, "loss": 1.184, "step": 13302 }, { "epoch": 0.35035554385040824, "grad_norm": 2.842054843902588, "learning_rate": 3.2494074269159864e-05, "loss": 1.5409, "step": 13303 }, { "epoch": 0.3503818804319199, "grad_norm": 1.8012070655822754, "learning_rate": 3.249275744008427e-05, "loss": 2.1819, "step": 13304 }, { "epoch": 0.35040821701343167, "grad_norm": 3.4269027709960938, "learning_rate": 3.2491440611008695e-05, "loss": 1.6984, "step": 13305 }, { "epoch": 0.35043455359494335, "grad_norm": 1.6999813318252563, "learning_rate": 3.2490123781933104e-05, "loss": 1.9599, "step": 13306 }, { "epoch": 0.3504608901764551, "grad_norm": 2.2925491333007812, "learning_rate": 3.248880695285752e-05, "loss": 2.3996, "step": 13307 }, { "epoch": 0.35048722675796684, "grad_norm": 2.1514554023742676, "learning_rate": 3.2487490123781935e-05, "loss": 1.7487, "step": 13308 }, { "epoch": 0.35051356333947853, "grad_norm": 3.124458074569702, "learning_rate": 3.248617329470635e-05, "loss": 1.66, "step": 13309 }, { "epoch": 0.35053989992099027, "grad_norm": 2.718674659729004, "learning_rate": 3.2484856465630766e-05, "loss": 1.3637, "step": 13310 }, { "epoch": 0.35056623650250196, "grad_norm": 4.2831196784973145, "learning_rate": 3.2483539636555175e-05, "loss": 1.4355, "step": 13311 }, { "epoch": 0.3505925730840137, "grad_norm": 2.0692555904388428, "learning_rate": 
3.248222280747959e-05, "loss": 2.0315, "step": 13312 }, { "epoch": 0.3506189096655254, "grad_norm": 3.712114095687866, "learning_rate": 3.2480905978404e-05, "loss": 1.8928, "step": 13313 }, { "epoch": 0.35064524624703713, "grad_norm": 1.9922786951065063, "learning_rate": 3.247958914932842e-05, "loss": 1.3376, "step": 13314 }, { "epoch": 0.3506715828285489, "grad_norm": 3.8773305416107178, "learning_rate": 3.247827232025283e-05, "loss": 1.6715, "step": 13315 }, { "epoch": 0.35069791941006057, "grad_norm": 3.6027517318725586, "learning_rate": 3.2476955491177246e-05, "loss": 1.9208, "step": 13316 }, { "epoch": 0.3507242559915723, "grad_norm": 3.8815882205963135, "learning_rate": 3.247563866210166e-05, "loss": 1.7572, "step": 13317 }, { "epoch": 0.350750592573084, "grad_norm": 2.51364803314209, "learning_rate": 3.247432183302607e-05, "loss": 1.6966, "step": 13318 }, { "epoch": 0.35077692915459574, "grad_norm": 1.8374290466308594, "learning_rate": 3.247300500395049e-05, "loss": 1.0813, "step": 13319 }, { "epoch": 0.35080326573610743, "grad_norm": 2.029656171798706, "learning_rate": 3.24716881748749e-05, "loss": 2.0684, "step": 13320 }, { "epoch": 0.35082960231761917, "grad_norm": 1.9576358795166016, "learning_rate": 3.247037134579932e-05, "loss": 1.8653, "step": 13321 }, { "epoch": 0.3508559388991309, "grad_norm": 6.42474889755249, "learning_rate": 3.2469054516723726e-05, "loss": 0.8853, "step": 13322 }, { "epoch": 0.3508822754806426, "grad_norm": 1.848995327949524, "learning_rate": 3.246773768764814e-05, "loss": 1.4848, "step": 13323 }, { "epoch": 0.35090861206215435, "grad_norm": 1.9281425476074219, "learning_rate": 3.246642085857256e-05, "loss": 1.9347, "step": 13324 }, { "epoch": 0.35093494864366603, "grad_norm": 4.627933502197266, "learning_rate": 3.246510402949697e-05, "loss": 2.0178, "step": 13325 }, { "epoch": 0.3509612852251778, "grad_norm": 3.9753177165985107, "learning_rate": 3.246378720042139e-05, "loss": 2.6702, "step": 13326 }, { "epoch": 
0.35098762180668946, "grad_norm": 2.228876829147339, "learning_rate": 3.24624703713458e-05, "loss": 0.9303, "step": 13327 }, { "epoch": 0.3510139583882012, "grad_norm": 4.0302629470825195, "learning_rate": 3.246115354227022e-05, "loss": 1.5857, "step": 13328 }, { "epoch": 0.35104029496971295, "grad_norm": 6.6096510887146, "learning_rate": 3.245983671319463e-05, "loss": 1.6103, "step": 13329 }, { "epoch": 0.35106663155122464, "grad_norm": 2.4697418212890625, "learning_rate": 3.2458519884119044e-05, "loss": 2.5915, "step": 13330 }, { "epoch": 0.3510929681327364, "grad_norm": 2.167027711868286, "learning_rate": 3.245720305504345e-05, "loss": 0.9266, "step": 13331 }, { "epoch": 0.35111930471424807, "grad_norm": 2.2277779579162598, "learning_rate": 3.245588622596787e-05, "loss": 0.7662, "step": 13332 }, { "epoch": 0.3511456412957598, "grad_norm": 5.894300937652588, "learning_rate": 3.2454569396892284e-05, "loss": 3.318, "step": 13333 }, { "epoch": 0.35117197787727156, "grad_norm": 1.594135046005249, "learning_rate": 3.24532525678167e-05, "loss": 1.6331, "step": 13334 }, { "epoch": 0.35119831445878325, "grad_norm": 2.1567468643188477, "learning_rate": 3.2451935738741116e-05, "loss": 1.8917, "step": 13335 }, { "epoch": 0.351224651040295, "grad_norm": 1.9755929708480835, "learning_rate": 3.2450618909665524e-05, "loss": 1.5999, "step": 13336 }, { "epoch": 0.3512509876218067, "grad_norm": 4.384850025177002, "learning_rate": 3.244930208058994e-05, "loss": 1.1528, "step": 13337 }, { "epoch": 0.3512773242033184, "grad_norm": 2.446580648422241, "learning_rate": 3.2447985251514356e-05, "loss": 1.6648, "step": 13338 }, { "epoch": 0.3513036607848301, "grad_norm": 3.4179165363311768, "learning_rate": 3.244666842243877e-05, "loss": 0.9763, "step": 13339 }, { "epoch": 0.35132999736634185, "grad_norm": 2.2655434608459473, "learning_rate": 3.244535159336318e-05, "loss": 1.1891, "step": 13340 }, { "epoch": 0.3513563339478536, "grad_norm": 1.9768743515014648, "learning_rate": 
3.2444034764287596e-05, "loss": 2.0416, "step": 13341 }, { "epoch": 0.3513826705293653, "grad_norm": 2.30531644821167, "learning_rate": 3.244271793521201e-05, "loss": 2.5406, "step": 13342 }, { "epoch": 0.351409007110877, "grad_norm": 2.2749857902526855, "learning_rate": 3.244140110613643e-05, "loss": 0.9018, "step": 13343 }, { "epoch": 0.3514353436923887, "grad_norm": 1.62916100025177, "learning_rate": 3.244008427706084e-05, "loss": 1.388, "step": 13344 }, { "epoch": 0.35146168027390046, "grad_norm": 2.0928359031677246, "learning_rate": 3.243876744798525e-05, "loss": 2.0932, "step": 13345 }, { "epoch": 0.35148801685541214, "grad_norm": 3.6840946674346924, "learning_rate": 3.243745061890967e-05, "loss": 1.6484, "step": 13346 }, { "epoch": 0.3515143534369239, "grad_norm": 3.200409173965454, "learning_rate": 3.243613378983408e-05, "loss": 1.8889, "step": 13347 }, { "epoch": 0.35154069001843563, "grad_norm": 2.4166016578674316, "learning_rate": 3.24348169607585e-05, "loss": 1.3953, "step": 13348 }, { "epoch": 0.3515670265999473, "grad_norm": 2.2247650623321533, "learning_rate": 3.243350013168291e-05, "loss": 1.7617, "step": 13349 }, { "epoch": 0.35159336318145906, "grad_norm": 1.8270072937011719, "learning_rate": 3.243218330260732e-05, "loss": 1.7973, "step": 13350 }, { "epoch": 0.35161969976297075, "grad_norm": 2.3578460216522217, "learning_rate": 3.243086647353173e-05, "loss": 1.7287, "step": 13351 }, { "epoch": 0.3516460363444825, "grad_norm": 2.202893018722534, "learning_rate": 3.2429549644456154e-05, "loss": 1.8195, "step": 13352 }, { "epoch": 0.3516723729259942, "grad_norm": 2.8357491493225098, "learning_rate": 3.242823281538056e-05, "loss": 1.8782, "step": 13353 }, { "epoch": 0.3516987095075059, "grad_norm": 1.8396583795547485, "learning_rate": 3.242691598630498e-05, "loss": 2.3974, "step": 13354 }, { "epoch": 0.35172504608901767, "grad_norm": 2.1672394275665283, "learning_rate": 3.2425599157229394e-05, "loss": 1.7462, "step": 13355 }, { "epoch": 
0.35175138267052936, "grad_norm": 2.935796022415161, "learning_rate": 3.242428232815381e-05, "loss": 1.8066, "step": 13356 }, { "epoch": 0.3517777192520411, "grad_norm": 2.407139301300049, "learning_rate": 3.2422965499078225e-05, "loss": 1.7109, "step": 13357 }, { "epoch": 0.3518040558335528, "grad_norm": 1.8942136764526367, "learning_rate": 3.2421648670002634e-05, "loss": 1.4957, "step": 13358 }, { "epoch": 0.35183039241506453, "grad_norm": 1.977776288986206, "learning_rate": 3.242033184092705e-05, "loss": 1.5155, "step": 13359 }, { "epoch": 0.3518567289965762, "grad_norm": 4.4377570152282715, "learning_rate": 3.241901501185146e-05, "loss": 1.1665, "step": 13360 }, { "epoch": 0.35188306557808796, "grad_norm": 3.2606704235076904, "learning_rate": 3.241769818277588e-05, "loss": 1.2178, "step": 13361 }, { "epoch": 0.3519094021595997, "grad_norm": 2.151022434234619, "learning_rate": 3.241638135370029e-05, "loss": 1.5879, "step": 13362 }, { "epoch": 0.3519357387411114, "grad_norm": 2.694943428039551, "learning_rate": 3.2415064524624705e-05, "loss": 0.9485, "step": 13363 }, { "epoch": 0.35196207532262314, "grad_norm": 1.9729725122451782, "learning_rate": 3.241374769554912e-05, "loss": 2.0231, "step": 13364 }, { "epoch": 0.3519884119041348, "grad_norm": 4.62639856338501, "learning_rate": 3.241243086647353e-05, "loss": 1.5547, "step": 13365 }, { "epoch": 0.35201474848564657, "grad_norm": 1.7988792657852173, "learning_rate": 3.241111403739795e-05, "loss": 0.3154, "step": 13366 }, { "epoch": 0.3520410850671583, "grad_norm": 3.6884405612945557, "learning_rate": 3.240979720832236e-05, "loss": 1.5824, "step": 13367 }, { "epoch": 0.35206742164867, "grad_norm": 1.5911521911621094, "learning_rate": 3.2408480379246776e-05, "loss": 0.899, "step": 13368 }, { "epoch": 0.35209375823018174, "grad_norm": 1.6808979511260986, "learning_rate": 3.2407163550171185e-05, "loss": 1.6173, "step": 13369 }, { "epoch": 0.35212009481169343, "grad_norm": 2.386752128601074, "learning_rate": 
3.24058467210956e-05, "loss": 0.4426, "step": 13370 }, { "epoch": 0.3521464313932052, "grad_norm": 3.679684638977051, "learning_rate": 3.2404529892020016e-05, "loss": 1.5691, "step": 13371 }, { "epoch": 0.35217276797471686, "grad_norm": 2.225318670272827, "learning_rate": 3.240321306294443e-05, "loss": 1.3965, "step": 13372 }, { "epoch": 0.3521991045562286, "grad_norm": 1.9934961795806885, "learning_rate": 3.240189623386885e-05, "loss": 1.4888, "step": 13373 }, { "epoch": 0.35222544113774035, "grad_norm": 1.7122739553451538, "learning_rate": 3.2400579404793256e-05, "loss": 1.6821, "step": 13374 }, { "epoch": 0.35225177771925203, "grad_norm": 2.938758134841919, "learning_rate": 3.239926257571768e-05, "loss": 1.1511, "step": 13375 }, { "epoch": 0.3522781143007638, "grad_norm": 2.356916666030884, "learning_rate": 3.239794574664209e-05, "loss": 2.2471, "step": 13376 }, { "epoch": 0.35230445088227547, "grad_norm": 1.8385820388793945, "learning_rate": 3.23966289175665e-05, "loss": 1.7183, "step": 13377 }, { "epoch": 0.3523307874637872, "grad_norm": 1.902010202407837, "learning_rate": 3.239531208849091e-05, "loss": 1.8149, "step": 13378 }, { "epoch": 0.3523571240452989, "grad_norm": 2.9832451343536377, "learning_rate": 3.239399525941533e-05, "loss": 1.2653, "step": 13379 }, { "epoch": 0.35238346062681064, "grad_norm": 1.6726583242416382, "learning_rate": 3.239267843033974e-05, "loss": 1.8035, "step": 13380 }, { "epoch": 0.3524097972083224, "grad_norm": 2.710875988006592, "learning_rate": 3.239136160126416e-05, "loss": 1.2807, "step": 13381 }, { "epoch": 0.35243613378983407, "grad_norm": 2.777984380722046, "learning_rate": 3.2390044772188574e-05, "loss": 1.5081, "step": 13382 }, { "epoch": 0.3524624703713458, "grad_norm": 2.4720728397369385, "learning_rate": 3.238872794311298e-05, "loss": 1.9083, "step": 13383 }, { "epoch": 0.3524888069528575, "grad_norm": 1.59549081325531, "learning_rate": 3.23874111140374e-05, "loss": 0.672, "step": 13384 }, { "epoch": 
0.35251514353436925, "grad_norm": 2.0250182151794434, "learning_rate": 3.2386094284961814e-05, "loss": 1.736, "step": 13385 }, { "epoch": 0.35254148011588093, "grad_norm": 1.8447422981262207, "learning_rate": 3.238477745588623e-05, "loss": 1.2377, "step": 13386 }, { "epoch": 0.3525678166973927, "grad_norm": 2.266187906265259, "learning_rate": 3.238346062681064e-05, "loss": 2.0352, "step": 13387 }, { "epoch": 0.3525941532789044, "grad_norm": 3.264073133468628, "learning_rate": 3.2382143797735054e-05, "loss": 1.4058, "step": 13388 }, { "epoch": 0.3526204898604161, "grad_norm": 3.0453848838806152, "learning_rate": 3.238082696865947e-05, "loss": 1.6684, "step": 13389 }, { "epoch": 0.35264682644192785, "grad_norm": 1.4842275381088257, "learning_rate": 3.2379510139583885e-05, "loss": 1.7708, "step": 13390 }, { "epoch": 0.35267316302343954, "grad_norm": 2.915254592895508, "learning_rate": 3.23781933105083e-05, "loss": 0.908, "step": 13391 }, { "epoch": 0.3526994996049513, "grad_norm": 1.4451037645339966, "learning_rate": 3.237687648143271e-05, "loss": 1.8519, "step": 13392 }, { "epoch": 0.35272583618646297, "grad_norm": 1.965159296989441, "learning_rate": 3.2375559652357125e-05, "loss": 1.0986, "step": 13393 }, { "epoch": 0.3527521727679747, "grad_norm": 2.003401041030884, "learning_rate": 3.237424282328154e-05, "loss": 2.0036, "step": 13394 }, { "epoch": 0.35277850934948646, "grad_norm": 2.3906521797180176, "learning_rate": 3.237292599420596e-05, "loss": 1.1264, "step": 13395 }, { "epoch": 0.35280484593099815, "grad_norm": 2.345811367034912, "learning_rate": 3.2371609165130365e-05, "loss": 1.8041, "step": 13396 }, { "epoch": 0.3528311825125099, "grad_norm": 4.376338958740234, "learning_rate": 3.237029233605478e-05, "loss": 1.7419, "step": 13397 }, { "epoch": 0.3528575190940216, "grad_norm": 1.6952074766159058, "learning_rate": 3.23689755069792e-05, "loss": 1.8855, "step": 13398 }, { "epoch": 0.3528838556755333, "grad_norm": 2.1415939331054688, "learning_rate": 
3.236765867790361e-05, "loss": 2.4641, "step": 13399 }, { "epoch": 0.352910192257045, "grad_norm": 4.881741523742676, "learning_rate": 3.236634184882803e-05, "loss": 1.8107, "step": 13400 }, { "epoch": 0.35293652883855675, "grad_norm": 1.3757063150405884, "learning_rate": 3.236502501975244e-05, "loss": 1.6012, "step": 13401 }, { "epoch": 0.3529628654200685, "grad_norm": 2.28267502784729, "learning_rate": 3.236370819067685e-05, "loss": 2.0571, "step": 13402 }, { "epoch": 0.3529892020015802, "grad_norm": 2.4455177783966064, "learning_rate": 3.236239136160126e-05, "loss": 2.2408, "step": 13403 }, { "epoch": 0.3530155385830919, "grad_norm": 3.009761333465576, "learning_rate": 3.2361074532525683e-05, "loss": 1.973, "step": 13404 }, { "epoch": 0.3530418751646036, "grad_norm": 1.7002278566360474, "learning_rate": 3.235975770345009e-05, "loss": 1.9107, "step": 13405 }, { "epoch": 0.35306821174611536, "grad_norm": 2.9433672428131104, "learning_rate": 3.235844087437451e-05, "loss": 1.2209, "step": 13406 }, { "epoch": 0.3530945483276271, "grad_norm": 3.264383554458618, "learning_rate": 3.235712404529892e-05, "loss": 1.9347, "step": 13407 }, { "epoch": 0.3531208849091388, "grad_norm": 2.463010311126709, "learning_rate": 3.235580721622334e-05, "loss": 1.6167, "step": 13408 }, { "epoch": 0.35314722149065053, "grad_norm": 4.239632606506348, "learning_rate": 3.235449038714775e-05, "loss": 1.0254, "step": 13409 }, { "epoch": 0.3531735580721622, "grad_norm": 2.923922300338745, "learning_rate": 3.2353173558072164e-05, "loss": 0.6323, "step": 13410 }, { "epoch": 0.35319989465367396, "grad_norm": 2.098973035812378, "learning_rate": 3.235185672899658e-05, "loss": 1.5639, "step": 13411 }, { "epoch": 0.35322623123518565, "grad_norm": 3.2833895683288574, "learning_rate": 3.235053989992099e-05, "loss": 1.7127, "step": 13412 }, { "epoch": 0.3532525678166974, "grad_norm": 5.73862886428833, "learning_rate": 3.234922307084541e-05, "loss": 1.5556, "step": 13413 }, { "epoch": 0.35327890439820914, 
"grad_norm": 2.7117037773132324, "learning_rate": 3.234790624176982e-05, "loss": 1.4845, "step": 13414 }, { "epoch": 0.3533052409797208, "grad_norm": 1.8839472532272339, "learning_rate": 3.2346589412694235e-05, "loss": 1.406, "step": 13415 }, { "epoch": 0.35333157756123257, "grad_norm": 3.145193338394165, "learning_rate": 3.2345272583618644e-05, "loss": 1.8111, "step": 13416 }, { "epoch": 0.35335791414274426, "grad_norm": 2.4471664428710938, "learning_rate": 3.234395575454306e-05, "loss": 2.0764, "step": 13417 }, { "epoch": 0.353384250724256, "grad_norm": 1.9531912803649902, "learning_rate": 3.2342638925467475e-05, "loss": 1.7331, "step": 13418 }, { "epoch": 0.3534105873057677, "grad_norm": 2.8200695514678955, "learning_rate": 3.234132209639189e-05, "loss": 1.9612, "step": 13419 }, { "epoch": 0.35343692388727943, "grad_norm": 2.060945749282837, "learning_rate": 3.2340005267316306e-05, "loss": 1.7328, "step": 13420 }, { "epoch": 0.3534632604687912, "grad_norm": 5.0167341232299805, "learning_rate": 3.2338688438240715e-05, "loss": 1.8001, "step": 13421 }, { "epoch": 0.35348959705030286, "grad_norm": 2.379786729812622, "learning_rate": 3.233737160916514e-05, "loss": 1.9806, "step": 13422 }, { "epoch": 0.3535159336318146, "grad_norm": 1.8490166664123535, "learning_rate": 3.2336054780089546e-05, "loss": 1.4596, "step": 13423 }, { "epoch": 0.3535422702133263, "grad_norm": 1.505871057510376, "learning_rate": 3.233473795101396e-05, "loss": 1.5093, "step": 13424 }, { "epoch": 0.35356860679483804, "grad_norm": 1.8197870254516602, "learning_rate": 3.233342112193837e-05, "loss": 1.9049, "step": 13425 }, { "epoch": 0.3535949433763497, "grad_norm": 2.3923511505126953, "learning_rate": 3.2332104292862786e-05, "loss": 0.9344, "step": 13426 }, { "epoch": 0.35362127995786147, "grad_norm": 1.7724586725234985, "learning_rate": 3.23307874637872e-05, "loss": 2.396, "step": 13427 }, { "epoch": 0.3536476165393732, "grad_norm": 1.9437782764434814, "learning_rate": 3.232947063471162e-05, 
"loss": 2.1122, "step": 13428 }, { "epoch": 0.3536739531208849, "grad_norm": 3.4790189266204834, "learning_rate": 3.232815380563603e-05, "loss": 0.9541, "step": 13429 }, { "epoch": 0.35370028970239664, "grad_norm": 1.8331210613250732, "learning_rate": 3.232683697656044e-05, "loss": 1.9549, "step": 13430 }, { "epoch": 0.35372662628390833, "grad_norm": 1.7642755508422852, "learning_rate": 3.232552014748486e-05, "loss": 1.6784, "step": 13431 }, { "epoch": 0.3537529628654201, "grad_norm": 1.832125186920166, "learning_rate": 3.232420331840927e-05, "loss": 0.401, "step": 13432 }, { "epoch": 0.35377929944693176, "grad_norm": 3.800145149230957, "learning_rate": 3.232288648933369e-05, "loss": 1.004, "step": 13433 }, { "epoch": 0.3538056360284435, "grad_norm": 3.660468578338623, "learning_rate": 3.23215696602581e-05, "loss": 0.9384, "step": 13434 }, { "epoch": 0.35383197260995525, "grad_norm": 2.0878841876983643, "learning_rate": 3.232025283118251e-05, "loss": 1.5391, "step": 13435 }, { "epoch": 0.35385830919146694, "grad_norm": 1.6763434410095215, "learning_rate": 3.231893600210693e-05, "loss": 1.56, "step": 13436 }, { "epoch": 0.3538846457729787, "grad_norm": 3.052415609359741, "learning_rate": 3.2317619173031344e-05, "loss": 1.2428, "step": 13437 }, { "epoch": 0.35391098235449037, "grad_norm": 5.878122806549072, "learning_rate": 3.231630234395576e-05, "loss": 1.4001, "step": 13438 }, { "epoch": 0.3539373189360021, "grad_norm": 2.6253037452697754, "learning_rate": 3.231498551488017e-05, "loss": 1.9999, "step": 13439 }, { "epoch": 0.35396365551751385, "grad_norm": 4.330442905426025, "learning_rate": 3.2313668685804584e-05, "loss": 1.801, "step": 13440 }, { "epoch": 0.35398999209902554, "grad_norm": 1.8195282220840454, "learning_rate": 3.2312351856729e-05, "loss": 1.954, "step": 13441 }, { "epoch": 0.3540163286805373, "grad_norm": 2.3423333168029785, "learning_rate": 3.2311035027653415e-05, "loss": 1.455, "step": 13442 }, { "epoch": 0.354042665262049, "grad_norm": 
1.7573970556259155, "learning_rate": 3.2309718198577824e-05, "loss": 2.0738, "step": 13443 }, { "epoch": 0.3540690018435607, "grad_norm": 2.465963840484619, "learning_rate": 3.230840136950224e-05, "loss": 0.6359, "step": 13444 }, { "epoch": 0.3540953384250724, "grad_norm": 2.3648228645324707, "learning_rate": 3.2307084540426655e-05, "loss": 1.4879, "step": 13445 }, { "epoch": 0.35412167500658415, "grad_norm": 1.4378904104232788, "learning_rate": 3.230576771135107e-05, "loss": 1.4814, "step": 13446 }, { "epoch": 0.3541480115880959, "grad_norm": 2.2449519634246826, "learning_rate": 3.2304450882275486e-05, "loss": 1.6078, "step": 13447 }, { "epoch": 0.3541743481696076, "grad_norm": 1.7108471393585205, "learning_rate": 3.2303134053199895e-05, "loss": 0.6002, "step": 13448 }, { "epoch": 0.3542006847511193, "grad_norm": 1.500854253768921, "learning_rate": 3.230181722412431e-05, "loss": 2.1894, "step": 13449 }, { "epoch": 0.354227021332631, "grad_norm": 1.9202920198440552, "learning_rate": 3.230050039504872e-05, "loss": 1.1398, "step": 13450 }, { "epoch": 0.35425335791414275, "grad_norm": 2.657386064529419, "learning_rate": 3.229918356597314e-05, "loss": 1.8775, "step": 13451 }, { "epoch": 0.35427969449565444, "grad_norm": 1.9549400806427002, "learning_rate": 3.229786673689755e-05, "loss": 1.6835, "step": 13452 }, { "epoch": 0.3543060310771662, "grad_norm": 2.1215004920959473, "learning_rate": 3.2296549907821966e-05, "loss": 1.6879, "step": 13453 }, { "epoch": 0.3543323676586779, "grad_norm": 2.3544883728027344, "learning_rate": 3.2295233078746375e-05, "loss": 1.9081, "step": 13454 }, { "epoch": 0.3543587042401896, "grad_norm": 2.0533502101898193, "learning_rate": 3.22939162496708e-05, "loss": 2.0292, "step": 13455 }, { "epoch": 0.35438504082170136, "grad_norm": 3.74015212059021, "learning_rate": 3.2292599420595206e-05, "loss": 2.3287, "step": 13456 }, { "epoch": 0.35441137740321305, "grad_norm": 4.024369716644287, "learning_rate": 3.229128259151962e-05, "loss": 1.5144, 
"step": 13457 }, { "epoch": 0.3544377139847248, "grad_norm": 1.64836585521698, "learning_rate": 3.228996576244404e-05, "loss": 2.3711, "step": 13458 }, { "epoch": 0.3544640505662365, "grad_norm": 2.312168836593628, "learning_rate": 3.2288648933368446e-05, "loss": 1.7675, "step": 13459 }, { "epoch": 0.3544903871477482, "grad_norm": 2.635197401046753, "learning_rate": 3.228733210429287e-05, "loss": 1.2045, "step": 13460 }, { "epoch": 0.35451672372925996, "grad_norm": 5.81008768081665, "learning_rate": 3.228601527521728e-05, "loss": 1.6074, "step": 13461 }, { "epoch": 0.35454306031077165, "grad_norm": 2.237865447998047, "learning_rate": 3.228469844614169e-05, "loss": 1.7721, "step": 13462 }, { "epoch": 0.3545693968922834, "grad_norm": 2.20149564743042, "learning_rate": 3.22833816170661e-05, "loss": 1.8338, "step": 13463 }, { "epoch": 0.3545957334737951, "grad_norm": 3.2823715209960938, "learning_rate": 3.228206478799052e-05, "loss": 1.0056, "step": 13464 }, { "epoch": 0.3546220700553068, "grad_norm": 4.911948204040527, "learning_rate": 3.228074795891493e-05, "loss": 1.4288, "step": 13465 }, { "epoch": 0.3546484066368185, "grad_norm": 2.457892656326294, "learning_rate": 3.227943112983935e-05, "loss": 1.6921, "step": 13466 }, { "epoch": 0.35467474321833026, "grad_norm": 1.9747319221496582, "learning_rate": 3.2278114300763764e-05, "loss": 1.6734, "step": 13467 }, { "epoch": 0.354701079799842, "grad_norm": 1.8965075016021729, "learning_rate": 3.227679747168817e-05, "loss": 1.1413, "step": 13468 }, { "epoch": 0.3547274163813537, "grad_norm": 2.6162514686584473, "learning_rate": 3.227548064261259e-05, "loss": 1.8204, "step": 13469 }, { "epoch": 0.35475375296286543, "grad_norm": 1.559047818183899, "learning_rate": 3.2274163813537004e-05, "loss": 1.8301, "step": 13470 }, { "epoch": 0.3547800895443771, "grad_norm": 2.322930097579956, "learning_rate": 3.227284698446142e-05, "loss": 0.8137, "step": 13471 }, { "epoch": 0.35480642612588886, "grad_norm": 2.4971282482147217, 
"learning_rate": 3.227153015538583e-05, "loss": 2.2915, "step": 13472 }, { "epoch": 0.3548327627074006, "grad_norm": 1.7717419862747192, "learning_rate": 3.2270213326310245e-05, "loss": 1.28, "step": 13473 }, { "epoch": 0.3548590992889123, "grad_norm": 4.11823034286499, "learning_rate": 3.226889649723466e-05, "loss": 2.0423, "step": 13474 }, { "epoch": 0.35488543587042404, "grad_norm": 1.5041719675064087, "learning_rate": 3.2267579668159076e-05, "loss": 2.2459, "step": 13475 }, { "epoch": 0.3549117724519357, "grad_norm": 2.930955648422241, "learning_rate": 3.226626283908349e-05, "loss": 2.0029, "step": 13476 }, { "epoch": 0.35493810903344747, "grad_norm": 10.966865539550781, "learning_rate": 3.22649460100079e-05, "loss": 2.6103, "step": 13477 }, { "epoch": 0.35496444561495916, "grad_norm": 1.669223666191101, "learning_rate": 3.2263629180932316e-05, "loss": 2.2197, "step": 13478 }, { "epoch": 0.3549907821964709, "grad_norm": 1.4289343357086182, "learning_rate": 3.226231235185673e-05, "loss": 1.3207, "step": 13479 }, { "epoch": 0.35501711877798264, "grad_norm": 2.8188369274139404, "learning_rate": 3.226099552278115e-05, "loss": 2.0416, "step": 13480 }, { "epoch": 0.35504345535949433, "grad_norm": 2.122403144836426, "learning_rate": 3.2259678693705556e-05, "loss": 0.5008, "step": 13481 }, { "epoch": 0.3550697919410061, "grad_norm": 1.9776302576065063, "learning_rate": 3.225836186462997e-05, "loss": 0.3941, "step": 13482 }, { "epoch": 0.35509612852251776, "grad_norm": 2.0202600955963135, "learning_rate": 3.225704503555439e-05, "loss": 1.5958, "step": 13483 }, { "epoch": 0.3551224651040295, "grad_norm": 2.663630485534668, "learning_rate": 3.22557282064788e-05, "loss": 1.3616, "step": 13484 }, { "epoch": 0.3551488016855412, "grad_norm": 2.960670232772827, "learning_rate": 3.225441137740322e-05, "loss": 1.6209, "step": 13485 }, { "epoch": 0.35517513826705294, "grad_norm": 2.3142905235290527, "learning_rate": 3.225309454832763e-05, "loss": 2.1586, "step": 13486 }, { 
"epoch": 0.3552014748485647, "grad_norm": 1.773917555809021, "learning_rate": 3.225177771925204e-05, "loss": 2.2527, "step": 13487 }, { "epoch": 0.35522781143007637, "grad_norm": 2.314061164855957, "learning_rate": 3.225046089017646e-05, "loss": 1.4456, "step": 13488 }, { "epoch": 0.3552541480115881, "grad_norm": 1.6005955934524536, "learning_rate": 3.2249144061100874e-05, "loss": 1.9263, "step": 13489 }, { "epoch": 0.3552804845930998, "grad_norm": 2.636345863342285, "learning_rate": 3.224782723202528e-05, "loss": 1.9443, "step": 13490 }, { "epoch": 0.35530682117461154, "grad_norm": 2.1058764457702637, "learning_rate": 3.22465104029497e-05, "loss": 1.413, "step": 13491 }, { "epoch": 0.35533315775612323, "grad_norm": 5.849967002868652, "learning_rate": 3.2245193573874114e-05, "loss": 1.8406, "step": 13492 }, { "epoch": 0.355359494337635, "grad_norm": 2.232043981552124, "learning_rate": 3.224387674479853e-05, "loss": 1.6915, "step": 13493 }, { "epoch": 0.3553858309191467, "grad_norm": 2.2010385990142822, "learning_rate": 3.2242559915722945e-05, "loss": 1.5191, "step": 13494 }, { "epoch": 0.3554121675006584, "grad_norm": 2.755608558654785, "learning_rate": 3.2241243086647354e-05, "loss": 1.1885, "step": 13495 }, { "epoch": 0.35543850408217015, "grad_norm": 2.041734457015991, "learning_rate": 3.223992625757177e-05, "loss": 1.5228, "step": 13496 }, { "epoch": 0.35546484066368184, "grad_norm": 1.7045328617095947, "learning_rate": 3.223860942849618e-05, "loss": 1.7614, "step": 13497 }, { "epoch": 0.3554911772451936, "grad_norm": 3.127450704574585, "learning_rate": 3.22372925994206e-05, "loss": 2.3074, "step": 13498 }, { "epoch": 0.35551751382670527, "grad_norm": 2.6444132328033447, "learning_rate": 3.223597577034501e-05, "loss": 1.9232, "step": 13499 }, { "epoch": 0.355543850408217, "grad_norm": 6.129952430725098, "learning_rate": 3.2234658941269425e-05, "loss": 1.7753, "step": 13500 }, { "epoch": 0.35557018698972875, "grad_norm": 4.689990997314453, "learning_rate": 
3.223334211219384e-05, "loss": 1.7396, "step": 13501 }, { "epoch": 0.35559652357124044, "grad_norm": 2.0375030040740967, "learning_rate": 3.223202528311825e-05, "loss": 1.8252, "step": 13502 }, { "epoch": 0.3556228601527522, "grad_norm": 2.710348129272461, "learning_rate": 3.223070845404267e-05, "loss": 1.4784, "step": 13503 }, { "epoch": 0.3556491967342639, "grad_norm": 2.265514373779297, "learning_rate": 3.222939162496708e-05, "loss": 1.9078, "step": 13504 }, { "epoch": 0.3556755333157756, "grad_norm": 3.0866858959198, "learning_rate": 3.2228074795891496e-05, "loss": 0.9979, "step": 13505 }, { "epoch": 0.35570186989728736, "grad_norm": 2.3354218006134033, "learning_rate": 3.2226757966815905e-05, "loss": 2.048, "step": 13506 }, { "epoch": 0.35572820647879905, "grad_norm": 2.2681877613067627, "learning_rate": 3.222544113774033e-05, "loss": 1.1664, "step": 13507 }, { "epoch": 0.3557545430603108, "grad_norm": 1.4610751867294312, "learning_rate": 3.2224124308664736e-05, "loss": 1.7744, "step": 13508 }, { "epoch": 0.3557808796418225, "grad_norm": 2.241502285003662, "learning_rate": 3.222280747958915e-05, "loss": 1.7951, "step": 13509 }, { "epoch": 0.3558072162233342, "grad_norm": 2.4376156330108643, "learning_rate": 3.222149065051356e-05, "loss": 1.3366, "step": 13510 }, { "epoch": 0.3558335528048459, "grad_norm": 1.910470962524414, "learning_rate": 3.2220173821437976e-05, "loss": 1.5958, "step": 13511 }, { "epoch": 0.35585988938635765, "grad_norm": 1.63845956325531, "learning_rate": 3.221885699236239e-05, "loss": 1.2993, "step": 13512 }, { "epoch": 0.3558862259678694, "grad_norm": 1.8612556457519531, "learning_rate": 3.221754016328681e-05, "loss": 0.7126, "step": 13513 }, { "epoch": 0.3559125625493811, "grad_norm": 2.365784168243408, "learning_rate": 3.221622333421122e-05, "loss": 1.0563, "step": 13514 }, { "epoch": 0.3559388991308928, "grad_norm": 2.4441957473754883, "learning_rate": 3.221490650513563e-05, "loss": 1.5295, "step": 13515 }, { "epoch": 
0.3559652357124045, "grad_norm": 1.5101608037948608, "learning_rate": 3.221358967606005e-05, "loss": 1.4965, "step": 13516 }, { "epoch": 0.35599157229391626, "grad_norm": 2.502692699432373, "learning_rate": 3.221227284698446e-05, "loss": 1.4537, "step": 13517 }, { "epoch": 0.35601790887542795, "grad_norm": 3.574384927749634, "learning_rate": 3.221095601790888e-05, "loss": 1.475, "step": 13518 }, { "epoch": 0.3560442454569397, "grad_norm": 1.7567169666290283, "learning_rate": 3.220963918883329e-05, "loss": 1.9212, "step": 13519 }, { "epoch": 0.35607058203845143, "grad_norm": 4.268300533294678, "learning_rate": 3.22083223597577e-05, "loss": 1.5846, "step": 13520 }, { "epoch": 0.3560969186199631, "grad_norm": 1.6878823041915894, "learning_rate": 3.220700553068212e-05, "loss": 1.8979, "step": 13521 }, { "epoch": 0.35612325520147486, "grad_norm": 1.9981175661087036, "learning_rate": 3.2205688701606534e-05, "loss": 1.15, "step": 13522 }, { "epoch": 0.35614959178298655, "grad_norm": 2.7628042697906494, "learning_rate": 3.220437187253095e-05, "loss": 0.8069, "step": 13523 }, { "epoch": 0.3561759283644983, "grad_norm": 3.2204749584198, "learning_rate": 3.220305504345536e-05, "loss": 1.7214, "step": 13524 }, { "epoch": 0.35620226494601, "grad_norm": 1.607557773590088, "learning_rate": 3.2201738214379774e-05, "loss": 1.8681, "step": 13525 }, { "epoch": 0.3562286015275217, "grad_norm": 3.5377190113067627, "learning_rate": 3.220042138530419e-05, "loss": 2.5758, "step": 13526 }, { "epoch": 0.35625493810903347, "grad_norm": 2.097262382507324, "learning_rate": 3.2199104556228605e-05, "loss": 1.076, "step": 13527 }, { "epoch": 0.35628127469054516, "grad_norm": 3.9482004642486572, "learning_rate": 3.2197787727153014e-05, "loss": 1.8825, "step": 13528 }, { "epoch": 0.3563076112720569, "grad_norm": 2.0969722270965576, "learning_rate": 3.219647089807743e-05, "loss": 1.5867, "step": 13529 }, { "epoch": 0.3563339478535686, "grad_norm": 1.8546539545059204, "learning_rate": 
3.2195154069001845e-05, "loss": 0.797, "step": 13530 }, { "epoch": 0.35636028443508033, "grad_norm": 1.6689138412475586, "learning_rate": 3.219383723992626e-05, "loss": 1.7193, "step": 13531 }, { "epoch": 0.356386621016592, "grad_norm": 1.855383276939392, "learning_rate": 3.219252041085068e-05, "loss": 2.3078, "step": 13532 }, { "epoch": 0.35641295759810376, "grad_norm": 1.8257544040679932, "learning_rate": 3.2191203581775086e-05, "loss": 1.8635, "step": 13533 }, { "epoch": 0.3564392941796155, "grad_norm": 2.4283852577209473, "learning_rate": 3.21898867526995e-05, "loss": 2.4823, "step": 13534 }, { "epoch": 0.3564656307611272, "grad_norm": 1.6496526002883911, "learning_rate": 3.218856992362391e-05, "loss": 1.5239, "step": 13535 }, { "epoch": 0.35649196734263894, "grad_norm": 1.8665874004364014, "learning_rate": 3.218725309454833e-05, "loss": 2.0893, "step": 13536 }, { "epoch": 0.3565183039241506, "grad_norm": 2.8739356994628906, "learning_rate": 3.218593626547274e-05, "loss": 1.8981, "step": 13537 }, { "epoch": 0.35654464050566237, "grad_norm": 2.7292630672454834, "learning_rate": 3.218461943639716e-05, "loss": 1.0954, "step": 13538 }, { "epoch": 0.3565709770871741, "grad_norm": 3.0815908908843994, "learning_rate": 3.218330260732157e-05, "loss": 1.4115, "step": 13539 }, { "epoch": 0.3565973136686858, "grad_norm": 1.9336557388305664, "learning_rate": 3.218198577824599e-05, "loss": 2.1785, "step": 13540 }, { "epoch": 0.35662365025019754, "grad_norm": 1.9636908769607544, "learning_rate": 3.2180668949170404e-05, "loss": 1.5667, "step": 13541 }, { "epoch": 0.35664998683170923, "grad_norm": 1.776019811630249, "learning_rate": 3.217935212009481e-05, "loss": 1.7945, "step": 13542 }, { "epoch": 0.356676323413221, "grad_norm": 1.9169420003890991, "learning_rate": 3.217803529101923e-05, "loss": 2.4759, "step": 13543 }, { "epoch": 0.35670265999473266, "grad_norm": 1.8773916959762573, "learning_rate": 3.217671846194364e-05, "loss": 1.7936, "step": 13544 }, { "epoch": 
0.3567289965762444, "grad_norm": 1.571679949760437, "learning_rate": 3.217540163286806e-05, "loss": 1.4563, "step": 13545 }, { "epoch": 0.35675533315775615, "grad_norm": 1.9549353122711182, "learning_rate": 3.217408480379247e-05, "loss": 2.6193, "step": 13546 }, { "epoch": 0.35678166973926784, "grad_norm": 1.5232690572738647, "learning_rate": 3.2172767974716884e-05, "loss": 1.4649, "step": 13547 }, { "epoch": 0.3568080063207796, "grad_norm": 2.7277729511260986, "learning_rate": 3.21714511456413e-05, "loss": 2.143, "step": 13548 }, { "epoch": 0.35683434290229127, "grad_norm": 1.6067876815795898, "learning_rate": 3.217013431656571e-05, "loss": 1.8885, "step": 13549 }, { "epoch": 0.356860679483803, "grad_norm": 1.8268053531646729, "learning_rate": 3.216881748749013e-05, "loss": 0.7139, "step": 13550 }, { "epoch": 0.3568870160653147, "grad_norm": 2.4645025730133057, "learning_rate": 3.216750065841454e-05, "loss": 1.6641, "step": 13551 }, { "epoch": 0.35691335264682644, "grad_norm": 2.83951473236084, "learning_rate": 3.2166183829338955e-05, "loss": 1.5956, "step": 13552 }, { "epoch": 0.3569396892283382, "grad_norm": 2.5665435791015625, "learning_rate": 3.2164867000263364e-05, "loss": 0.8501, "step": 13553 }, { "epoch": 0.3569660258098499, "grad_norm": 2.968637466430664, "learning_rate": 3.2163550171187786e-05, "loss": 0.7681, "step": 13554 }, { "epoch": 0.3569923623913616, "grad_norm": 2.0003342628479004, "learning_rate": 3.2162233342112195e-05, "loss": 1.7505, "step": 13555 }, { "epoch": 0.3570186989728733, "grad_norm": 3.052992343902588, "learning_rate": 3.216091651303661e-05, "loss": 0.6985, "step": 13556 }, { "epoch": 0.35704503555438505, "grad_norm": 7.121495723724365, "learning_rate": 3.215959968396102e-05, "loss": 1.5725, "step": 13557 }, { "epoch": 0.35707137213589674, "grad_norm": 4.353183269500732, "learning_rate": 3.2158282854885435e-05, "loss": 0.9106, "step": 13558 }, { "epoch": 0.3570977087174085, "grad_norm": 3.0545105934143066, "learning_rate": 
3.215696602580985e-05, "loss": 1.7404, "step": 13559 }, { "epoch": 0.3571240452989202, "grad_norm": 3.831552505493164, "learning_rate": 3.2155649196734266e-05, "loss": 1.7135, "step": 13560 }, { "epoch": 0.3571503818804319, "grad_norm": 3.9955251216888428, "learning_rate": 3.215433236765868e-05, "loss": 1.7484, "step": 13561 }, { "epoch": 0.35717671846194365, "grad_norm": 2.0099432468414307, "learning_rate": 3.215301553858309e-05, "loss": 1.9631, "step": 13562 }, { "epoch": 0.35720305504345534, "grad_norm": 2.879105806350708, "learning_rate": 3.2151698709507506e-05, "loss": 1.8359, "step": 13563 }, { "epoch": 0.3572293916249671, "grad_norm": 2.9097330570220947, "learning_rate": 3.215038188043192e-05, "loss": 0.4684, "step": 13564 }, { "epoch": 0.3572557282064788, "grad_norm": 2.0938796997070312, "learning_rate": 3.214906505135634e-05, "loss": 1.5081, "step": 13565 }, { "epoch": 0.3572820647879905, "grad_norm": 2.119171619415283, "learning_rate": 3.2147748222280746e-05, "loss": 1.6377, "step": 13566 }, { "epoch": 0.35730840136950226, "grad_norm": 1.9012625217437744, "learning_rate": 3.214643139320516e-05, "loss": 1.5652, "step": 13567 }, { "epoch": 0.35733473795101395, "grad_norm": 4.844491004943848, "learning_rate": 3.214511456412958e-05, "loss": 1.0606, "step": 13568 }, { "epoch": 0.3573610745325257, "grad_norm": 2.415804862976074, "learning_rate": 3.214379773505399e-05, "loss": 1.7434, "step": 13569 }, { "epoch": 0.3573874111140374, "grad_norm": 2.3577330112457275, "learning_rate": 3.214248090597841e-05, "loss": 0.9716, "step": 13570 }, { "epoch": 0.3574137476955491, "grad_norm": 2.1850931644439697, "learning_rate": 3.214116407690282e-05, "loss": 2.2315, "step": 13571 }, { "epoch": 0.3574400842770608, "grad_norm": 1.663676142692566, "learning_rate": 3.213984724782723e-05, "loss": 2.1599, "step": 13572 }, { "epoch": 0.35746642085857255, "grad_norm": 2.945284366607666, "learning_rate": 3.213853041875165e-05, "loss": 2.4107, "step": 13573 }, { "epoch": 
0.3574927574400843, "grad_norm": 3.5954642295837402, "learning_rate": 3.2137213589676064e-05, "loss": 1.3837, "step": 13574 }, { "epoch": 0.357519094021596, "grad_norm": 1.7816054821014404, "learning_rate": 3.213589676060047e-05, "loss": 1.7309, "step": 13575 }, { "epoch": 0.35754543060310773, "grad_norm": 1.815504789352417, "learning_rate": 3.213457993152489e-05, "loss": 1.7403, "step": 13576 }, { "epoch": 0.3575717671846194, "grad_norm": 4.519070148468018, "learning_rate": 3.2133263102449304e-05, "loss": 1.2178, "step": 13577 }, { "epoch": 0.35759810376613116, "grad_norm": 1.7884514331817627, "learning_rate": 3.213194627337372e-05, "loss": 1.8143, "step": 13578 }, { "epoch": 0.3576244403476429, "grad_norm": 2.0682077407836914, "learning_rate": 3.2130629444298135e-05, "loss": 1.939, "step": 13579 }, { "epoch": 0.3576507769291546, "grad_norm": 1.8512648344039917, "learning_rate": 3.2129312615222544e-05, "loss": 1.9758, "step": 13580 }, { "epoch": 0.35767711351066633, "grad_norm": 1.7880021333694458, "learning_rate": 3.212799578614696e-05, "loss": 1.6864, "step": 13581 }, { "epoch": 0.357703450092178, "grad_norm": 2.221271514892578, "learning_rate": 3.212667895707137e-05, "loss": 2.2582, "step": 13582 }, { "epoch": 0.35772978667368976, "grad_norm": 2.561495065689087, "learning_rate": 3.212536212799579e-05, "loss": 1.4875, "step": 13583 }, { "epoch": 0.35775612325520145, "grad_norm": 3.4658031463623047, "learning_rate": 3.21240452989202e-05, "loss": 0.9863, "step": 13584 }, { "epoch": 0.3577824598367132, "grad_norm": 1.603253960609436, "learning_rate": 3.2122728469844615e-05, "loss": 2.396, "step": 13585 }, { "epoch": 0.35780879641822494, "grad_norm": 2.470778465270996, "learning_rate": 3.212141164076903e-05, "loss": 1.7577, "step": 13586 }, { "epoch": 0.3578351329997366, "grad_norm": 1.4860738515853882, "learning_rate": 3.2120094811693446e-05, "loss": 2.0019, "step": 13587 }, { "epoch": 0.35786146958124837, "grad_norm": 4.009979724884033, "learning_rate": 
3.211877798261786e-05, "loss": 1.8871, "step": 13588 }, { "epoch": 0.35788780616276006, "grad_norm": 3.2963204383850098, "learning_rate": 3.211746115354227e-05, "loss": 1.2485, "step": 13589 }, { "epoch": 0.3579141427442718, "grad_norm": 1.6084405183792114, "learning_rate": 3.2116144324466686e-05, "loss": 1.7061, "step": 13590 }, { "epoch": 0.3579404793257835, "grad_norm": 2.6024277210235596, "learning_rate": 3.2114827495391095e-05, "loss": 1.8364, "step": 13591 }, { "epoch": 0.35796681590729523, "grad_norm": 4.801141262054443, "learning_rate": 3.211351066631552e-05, "loss": 1.0169, "step": 13592 }, { "epoch": 0.357993152488807, "grad_norm": 2.9791507720947266, "learning_rate": 3.2112193837239927e-05, "loss": 2.1186, "step": 13593 }, { "epoch": 0.35801948907031866, "grad_norm": 1.6529208421707153, "learning_rate": 3.211087700816434e-05, "loss": 1.6764, "step": 13594 }, { "epoch": 0.3580458256518304, "grad_norm": 1.5181199312210083, "learning_rate": 3.210956017908876e-05, "loss": 1.9035, "step": 13595 }, { "epoch": 0.3580721622333421, "grad_norm": 1.9680874347686768, "learning_rate": 3.2108243350013167e-05, "loss": 2.2813, "step": 13596 }, { "epoch": 0.35809849881485384, "grad_norm": 2.8364412784576416, "learning_rate": 3.210692652093759e-05, "loss": 1.4889, "step": 13597 }, { "epoch": 0.3581248353963655, "grad_norm": 2.6851561069488525, "learning_rate": 3.2105609691862e-05, "loss": 1.1029, "step": 13598 }, { "epoch": 0.35815117197787727, "grad_norm": 3.910747766494751, "learning_rate": 3.210429286278641e-05, "loss": 1.1004, "step": 13599 }, { "epoch": 0.358177508559389, "grad_norm": 1.3464211225509644, "learning_rate": 3.210297603371082e-05, "loss": 1.922, "step": 13600 }, { "epoch": 0.3582038451409007, "grad_norm": 3.04995059967041, "learning_rate": 3.210165920463524e-05, "loss": 1.5378, "step": 13601 }, { "epoch": 0.35823018172241244, "grad_norm": 2.3936808109283447, "learning_rate": 3.210034237555965e-05, "loss": 1.7822, "step": 13602 }, { "epoch": 
0.35825651830392413, "grad_norm": 3.0493557453155518, "learning_rate": 3.209902554648407e-05, "loss": 1.9069, "step": 13603 }, { "epoch": 0.3582828548854359, "grad_norm": 3.2228777408599854, "learning_rate": 3.2097708717408485e-05, "loss": 1.3135, "step": 13604 }, { "epoch": 0.35830919146694756, "grad_norm": 2.2920727729797363, "learning_rate": 3.209639188833289e-05, "loss": 1.9463, "step": 13605 }, { "epoch": 0.3583355280484593, "grad_norm": 1.8838752508163452, "learning_rate": 3.2095075059257316e-05, "loss": 1.9288, "step": 13606 }, { "epoch": 0.35836186462997105, "grad_norm": 1.6796777248382568, "learning_rate": 3.2093758230181725e-05, "loss": 1.7367, "step": 13607 }, { "epoch": 0.35838820121148274, "grad_norm": 4.060592174530029, "learning_rate": 3.209244140110614e-05, "loss": 1.1152, "step": 13608 }, { "epoch": 0.3584145377929945, "grad_norm": 1.508970022201538, "learning_rate": 3.209112457203055e-05, "loss": 1.6006, "step": 13609 }, { "epoch": 0.35844087437450617, "grad_norm": 1.9285054206848145, "learning_rate": 3.2089807742954965e-05, "loss": 1.9426, "step": 13610 }, { "epoch": 0.3584672109560179, "grad_norm": 1.9635648727416992, "learning_rate": 3.208849091387938e-05, "loss": 0.401, "step": 13611 }, { "epoch": 0.35849354753752966, "grad_norm": 2.7901957035064697, "learning_rate": 3.2087174084803796e-05, "loss": 1.241, "step": 13612 }, { "epoch": 0.35851988411904134, "grad_norm": 1.3184195756912231, "learning_rate": 3.2085857255728205e-05, "loss": 0.6769, "step": 13613 }, { "epoch": 0.3585462207005531, "grad_norm": 1.9425921440124512, "learning_rate": 3.208454042665262e-05, "loss": 1.1361, "step": 13614 }, { "epoch": 0.3585725572820648, "grad_norm": 2.5613203048706055, "learning_rate": 3.2083223597577036e-05, "loss": 1.5979, "step": 13615 }, { "epoch": 0.3585988938635765, "grad_norm": 4.758947372436523, "learning_rate": 3.208190676850145e-05, "loss": 1.0036, "step": 13616 }, { "epoch": 0.3586252304450882, "grad_norm": 1.8957648277282715, "learning_rate": 
3.208058993942587e-05, "loss": 2.7816, "step": 13617 }, { "epoch": 0.35865156702659995, "grad_norm": 2.223864793777466, "learning_rate": 3.2079273110350276e-05, "loss": 0.4141, "step": 13618 }, { "epoch": 0.3586779036081117, "grad_norm": 1.6040529012680054, "learning_rate": 3.207795628127469e-05, "loss": 1.8158, "step": 13619 }, { "epoch": 0.3587042401896234, "grad_norm": 2.693751335144043, "learning_rate": 3.207663945219911e-05, "loss": 1.2111, "step": 13620 }, { "epoch": 0.3587305767711351, "grad_norm": 2.041635513305664, "learning_rate": 3.207532262312352e-05, "loss": 1.3927, "step": 13621 }, { "epoch": 0.3587569133526468, "grad_norm": 2.149848699569702, "learning_rate": 3.207400579404793e-05, "loss": 1.3231, "step": 13622 }, { "epoch": 0.35878324993415855, "grad_norm": 1.643959641456604, "learning_rate": 3.207268896497235e-05, "loss": 2.0312, "step": 13623 }, { "epoch": 0.35880958651567024, "grad_norm": 2.539884090423584, "learning_rate": 3.207137213589676e-05, "loss": 2.755, "step": 13624 }, { "epoch": 0.358835923097182, "grad_norm": 2.047617197036743, "learning_rate": 3.207005530682118e-05, "loss": 1.6491, "step": 13625 }, { "epoch": 0.35886225967869373, "grad_norm": 2.015315532684326, "learning_rate": 3.2068738477745594e-05, "loss": 1.6363, "step": 13626 }, { "epoch": 0.3588885962602054, "grad_norm": 1.962422490119934, "learning_rate": 3.206742164867e-05, "loss": 2.2409, "step": 13627 }, { "epoch": 0.35891493284171716, "grad_norm": 3.1431314945220947, "learning_rate": 3.206610481959442e-05, "loss": 1.6786, "step": 13628 }, { "epoch": 0.35894126942322885, "grad_norm": 3.141082763671875, "learning_rate": 3.206478799051883e-05, "loss": 1.4124, "step": 13629 }, { "epoch": 0.3589676060047406, "grad_norm": 1.9669657945632935, "learning_rate": 3.206347116144325e-05, "loss": 1.5652, "step": 13630 }, { "epoch": 0.3589939425862523, "grad_norm": 3.1610116958618164, "learning_rate": 3.206215433236766e-05, "loss": 1.2665, "step": 13631 }, { "epoch": 0.359020279167764, 
"grad_norm": 2.7507312297821045, "learning_rate": 3.2060837503292074e-05, "loss": 1.9826, "step": 13632 }, { "epoch": 0.35904661574927577, "grad_norm": 1.7981044054031372, "learning_rate": 3.205952067421649e-05, "loss": 2.1147, "step": 13633 }, { "epoch": 0.35907295233078745, "grad_norm": 1.6805517673492432, "learning_rate": 3.20582038451409e-05, "loss": 1.9729, "step": 13634 }, { "epoch": 0.3590992889122992, "grad_norm": 1.812591791152954, "learning_rate": 3.205688701606532e-05, "loss": 2.0497, "step": 13635 }, { "epoch": 0.3591256254938109, "grad_norm": 2.0936026573181152, "learning_rate": 3.205557018698973e-05, "loss": 1.9917, "step": 13636 }, { "epoch": 0.35915196207532263, "grad_norm": 3.4688844680786133, "learning_rate": 3.2054253357914145e-05, "loss": 1.5249, "step": 13637 }, { "epoch": 0.3591782986568343, "grad_norm": 2.9845259189605713, "learning_rate": 3.2052936528838554e-05, "loss": 1.0429, "step": 13638 }, { "epoch": 0.35920463523834606, "grad_norm": 3.8587541580200195, "learning_rate": 3.2051619699762976e-05, "loss": 2.0806, "step": 13639 }, { "epoch": 0.3592309718198578, "grad_norm": 2.1985764503479004, "learning_rate": 3.2050302870687385e-05, "loss": 1.6956, "step": 13640 }, { "epoch": 0.3592573084013695, "grad_norm": 2.889796495437622, "learning_rate": 3.20489860416118e-05, "loss": 0.9912, "step": 13641 }, { "epoch": 0.35928364498288123, "grad_norm": 2.5522637367248535, "learning_rate": 3.2047669212536216e-05, "loss": 2.0458, "step": 13642 }, { "epoch": 0.3593099815643929, "grad_norm": 1.8815504312515259, "learning_rate": 3.2046352383460625e-05, "loss": 1.8488, "step": 13643 }, { "epoch": 0.35933631814590467, "grad_norm": 2.516996383666992, "learning_rate": 3.204503555438505e-05, "loss": 1.8765, "step": 13644 }, { "epoch": 0.3593626547274164, "grad_norm": 4.308852672576904, "learning_rate": 3.2043718725309456e-05, "loss": 1.0197, "step": 13645 }, { "epoch": 0.3593889913089281, "grad_norm": 3.4652295112609863, "learning_rate": 3.204240189623387e-05, 
"loss": 0.9293, "step": 13646 }, { "epoch": 0.35941532789043984, "grad_norm": 1.6348474025726318, "learning_rate": 3.204108506715828e-05, "loss": 1.4954, "step": 13647 }, { "epoch": 0.3594416644719515, "grad_norm": 2.7688302993774414, "learning_rate": 3.2039768238082696e-05, "loss": 1.2846, "step": 13648 }, { "epoch": 0.35946800105346327, "grad_norm": 2.0089244842529297, "learning_rate": 3.203845140900711e-05, "loss": 1.5039, "step": 13649 }, { "epoch": 0.35949433763497496, "grad_norm": 2.0078160762786865, "learning_rate": 3.203713457993153e-05, "loss": 2.3831, "step": 13650 }, { "epoch": 0.3595206742164867, "grad_norm": 1.9557090997695923, "learning_rate": 3.203581775085594e-05, "loss": 1.8255, "step": 13651 }, { "epoch": 0.35954701079799845, "grad_norm": 2.9324398040771484, "learning_rate": 3.203450092178035e-05, "loss": 2.2562, "step": 13652 }, { "epoch": 0.35957334737951013, "grad_norm": 2.6382675170898438, "learning_rate": 3.2033184092704774e-05, "loss": 2.0875, "step": 13653 }, { "epoch": 0.3595996839610219, "grad_norm": 1.9812148809432983, "learning_rate": 3.203186726362918e-05, "loss": 1.0656, "step": 13654 }, { "epoch": 0.35962602054253356, "grad_norm": 2.2188010215759277, "learning_rate": 3.20305504345536e-05, "loss": 1.8069, "step": 13655 }, { "epoch": 0.3596523571240453, "grad_norm": 3.9850947856903076, "learning_rate": 3.202923360547801e-05, "loss": 1.0791, "step": 13656 }, { "epoch": 0.359678693705557, "grad_norm": 2.224342107772827, "learning_rate": 3.202791677640242e-05, "loss": 1.6742, "step": 13657 }, { "epoch": 0.35970503028706874, "grad_norm": 3.0013134479522705, "learning_rate": 3.202659994732684e-05, "loss": 0.882, "step": 13658 }, { "epoch": 0.3597313668685805, "grad_norm": 2.805025100708008, "learning_rate": 3.2025283118251254e-05, "loss": 0.9205, "step": 13659 }, { "epoch": 0.35975770345009217, "grad_norm": 1.639492154121399, "learning_rate": 3.202396628917566e-05, "loss": 1.8974, "step": 13660 }, { "epoch": 0.3597840400316039, "grad_norm": 
1.6123700141906738, "learning_rate": 3.202264946010008e-05, "loss": 1.8357, "step": 13661 }, { "epoch": 0.3598103766131156, "grad_norm": 3.004641532897949, "learning_rate": 3.2021332631024494e-05, "loss": 0.6059, "step": 13662 }, { "epoch": 0.35983671319462734, "grad_norm": 1.9932104349136353, "learning_rate": 3.202001580194891e-05, "loss": 1.8332, "step": 13663 }, { "epoch": 0.35986304977613903, "grad_norm": 2.7735795974731445, "learning_rate": 3.2018698972873326e-05, "loss": 2.023, "step": 13664 }, { "epoch": 0.3598893863576508, "grad_norm": 1.7488255500793457, "learning_rate": 3.2017382143797734e-05, "loss": 1.8532, "step": 13665 }, { "epoch": 0.3599157229391625, "grad_norm": 2.5629940032958984, "learning_rate": 3.201606531472215e-05, "loss": 2.1842, "step": 13666 }, { "epoch": 0.3599420595206742, "grad_norm": 1.6515700817108154, "learning_rate": 3.2014748485646566e-05, "loss": 1.3936, "step": 13667 }, { "epoch": 0.35996839610218595, "grad_norm": 2.527289390563965, "learning_rate": 3.201343165657098e-05, "loss": 1.6333, "step": 13668 }, { "epoch": 0.35999473268369764, "grad_norm": 1.7834123373031616, "learning_rate": 3.201211482749539e-05, "loss": 1.8042, "step": 13669 }, { "epoch": 0.3600210692652094, "grad_norm": 3.378293752670288, "learning_rate": 3.2010797998419806e-05, "loss": 1.5895, "step": 13670 }, { "epoch": 0.36004740584672107, "grad_norm": 1.6988415718078613, "learning_rate": 3.200948116934422e-05, "loss": 1.6087, "step": 13671 }, { "epoch": 0.3600737424282328, "grad_norm": 2.0479817390441895, "learning_rate": 3.200816434026864e-05, "loss": 1.2008, "step": 13672 }, { "epoch": 0.36010007900974456, "grad_norm": 1.7988804578781128, "learning_rate": 3.200684751119305e-05, "loss": 1.5911, "step": 13673 }, { "epoch": 0.36012641559125624, "grad_norm": 1.8863316774368286, "learning_rate": 3.200553068211746e-05, "loss": 2.048, "step": 13674 }, { "epoch": 0.360152752172768, "grad_norm": 4.116148948669434, "learning_rate": 3.200421385304188e-05, "loss": 1.6954, 
"step": 13675 }, { "epoch": 0.3601790887542797, "grad_norm": 2.3305835723876953, "learning_rate": 3.2002897023966286e-05, "loss": 2.4382, "step": 13676 }, { "epoch": 0.3602054253357914, "grad_norm": 2.539477586746216, "learning_rate": 3.200158019489071e-05, "loss": 1.5695, "step": 13677 }, { "epoch": 0.36023176191730316, "grad_norm": 2.0611891746520996, "learning_rate": 3.200026336581512e-05, "loss": 1.953, "step": 13678 }, { "epoch": 0.36025809849881485, "grad_norm": 1.764420747756958, "learning_rate": 3.199894653673953e-05, "loss": 1.1791, "step": 13679 }, { "epoch": 0.3602844350803266, "grad_norm": 2.050950050354004, "learning_rate": 3.199762970766395e-05, "loss": 1.9048, "step": 13680 }, { "epoch": 0.3603107716618383, "grad_norm": 3.1922240257263184, "learning_rate": 3.199631287858836e-05, "loss": 1.1164, "step": 13681 }, { "epoch": 0.36033710824335, "grad_norm": 2.410733461380005, "learning_rate": 3.199499604951278e-05, "loss": 1.9025, "step": 13682 }, { "epoch": 0.3603634448248617, "grad_norm": 2.043968915939331, "learning_rate": 3.199367922043719e-05, "loss": 2.3087, "step": 13683 }, { "epoch": 0.36038978140637346, "grad_norm": 2.3882358074188232, "learning_rate": 3.1992362391361604e-05, "loss": 1.0314, "step": 13684 }, { "epoch": 0.3604161179878852, "grad_norm": 1.6741105318069458, "learning_rate": 3.199104556228601e-05, "loss": 1.1299, "step": 13685 }, { "epoch": 0.3604424545693969, "grad_norm": 1.5031324625015259, "learning_rate": 3.1989728733210435e-05, "loss": 2.1062, "step": 13686 }, { "epoch": 0.36046879115090863, "grad_norm": 2.597062349319458, "learning_rate": 3.1988411904134844e-05, "loss": 1.4521, "step": 13687 }, { "epoch": 0.3604951277324203, "grad_norm": 3.2964096069335938, "learning_rate": 3.198709507505926e-05, "loss": 1.5763, "step": 13688 }, { "epoch": 0.36052146431393206, "grad_norm": 2.20664644241333, "learning_rate": 3.1985778245983675e-05, "loss": 1.9681, "step": 13689 }, { "epoch": 0.36054780089544375, "grad_norm": 2.8661561012268066, 
"learning_rate": 3.1984461416908084e-05, "loss": 1.8483, "step": 13690 }, { "epoch": 0.3605741374769555, "grad_norm": 2.307616710662842, "learning_rate": 3.1983144587832506e-05, "loss": 1.8336, "step": 13691 }, { "epoch": 0.36060047405846724, "grad_norm": 1.659900426864624, "learning_rate": 3.1981827758756915e-05, "loss": 1.9063, "step": 13692 }, { "epoch": 0.3606268106399789, "grad_norm": 7.053070545196533, "learning_rate": 3.198051092968133e-05, "loss": 1.7106, "step": 13693 }, { "epoch": 0.36065314722149067, "grad_norm": 4.508133411407471, "learning_rate": 3.197919410060574e-05, "loss": 1.4931, "step": 13694 }, { "epoch": 0.36067948380300235, "grad_norm": 2.002432107925415, "learning_rate": 3.1977877271530155e-05, "loss": 1.9358, "step": 13695 }, { "epoch": 0.3607058203845141, "grad_norm": 2.658022880554199, "learning_rate": 3.197656044245457e-05, "loss": 0.9224, "step": 13696 }, { "epoch": 0.3607321569660258, "grad_norm": 2.70473575592041, "learning_rate": 3.1975243613378986e-05, "loss": 1.2302, "step": 13697 }, { "epoch": 0.36075849354753753, "grad_norm": 3.511639356613159, "learning_rate": 3.19739267843034e-05, "loss": 2.2309, "step": 13698 }, { "epoch": 0.36078483012904927, "grad_norm": 2.263486385345459, "learning_rate": 3.197260995522781e-05, "loss": 2.0399, "step": 13699 }, { "epoch": 0.36081116671056096, "grad_norm": 1.5785053968429565, "learning_rate": 3.197129312615223e-05, "loss": 1.9869, "step": 13700 }, { "epoch": 0.3608375032920727, "grad_norm": 1.7243232727050781, "learning_rate": 3.196997629707664e-05, "loss": 1.446, "step": 13701 }, { "epoch": 0.3608638398735844, "grad_norm": 2.390009880065918, "learning_rate": 3.196865946800106e-05, "loss": 1.6143, "step": 13702 }, { "epoch": 0.36089017645509613, "grad_norm": 3.2779555320739746, "learning_rate": 3.1967342638925466e-05, "loss": 0.884, "step": 13703 }, { "epoch": 0.3609165130366078, "grad_norm": 4.38844633102417, "learning_rate": 3.196602580984988e-05, "loss": 0.9997, "step": 13704 }, { "epoch": 
0.36094284961811957, "grad_norm": 1.8490869998931885, "learning_rate": 3.19647089807743e-05, "loss": 1.1045, "step": 13705 }, { "epoch": 0.3609691861996313, "grad_norm": 2.193153142929077, "learning_rate": 3.196339215169871e-05, "loss": 1.3679, "step": 13706 }, { "epoch": 0.360995522781143, "grad_norm": 2.7777693271636963, "learning_rate": 3.196207532262313e-05, "loss": 1.8684, "step": 13707 }, { "epoch": 0.36102185936265474, "grad_norm": 1.6836715936660767, "learning_rate": 3.196075849354754e-05, "loss": 1.6561, "step": 13708 }, { "epoch": 0.36104819594416643, "grad_norm": 1.59881591796875, "learning_rate": 3.195944166447195e-05, "loss": 2.1252, "step": 13709 }, { "epoch": 0.36107453252567817, "grad_norm": 2.737924098968506, "learning_rate": 3.195812483539637e-05, "loss": 2.0593, "step": 13710 }, { "epoch": 0.36110086910718986, "grad_norm": 2.5390844345092773, "learning_rate": 3.1956808006320784e-05, "loss": 1.9723, "step": 13711 }, { "epoch": 0.3611272056887016, "grad_norm": 1.9286346435546875, "learning_rate": 3.195549117724519e-05, "loss": 2.3703, "step": 13712 }, { "epoch": 0.36115354227021335, "grad_norm": 2.622424840927124, "learning_rate": 3.195417434816961e-05, "loss": 0.4147, "step": 13713 }, { "epoch": 0.36117987885172503, "grad_norm": 1.4867925643920898, "learning_rate": 3.195285751909402e-05, "loss": 2.0865, "step": 13714 }, { "epoch": 0.3612062154332368, "grad_norm": 2.314683437347412, "learning_rate": 3.195154069001844e-05, "loss": 1.625, "step": 13715 }, { "epoch": 0.36123255201474846, "grad_norm": 1.9138222932815552, "learning_rate": 3.195022386094285e-05, "loss": 1.8405, "step": 13716 }, { "epoch": 0.3612588885962602, "grad_norm": 1.9646731615066528, "learning_rate": 3.1948907031867264e-05, "loss": 1.9014, "step": 13717 }, { "epoch": 0.36128522517777195, "grad_norm": 2.100433588027954, "learning_rate": 3.194759020279168e-05, "loss": 1.491, "step": 13718 }, { "epoch": 0.36131156175928364, "grad_norm": 1.6655570268630981, "learning_rate": 
3.1946273373716095e-05, "loss": 2.4489, "step": 13719 }, { "epoch": 0.3613378983407954, "grad_norm": 2.423673391342163, "learning_rate": 3.194495654464051e-05, "loss": 1.9112, "step": 13720 }, { "epoch": 0.36136423492230707, "grad_norm": 1.5658369064331055, "learning_rate": 3.194363971556492e-05, "loss": 1.5957, "step": 13721 }, { "epoch": 0.3613905715038188, "grad_norm": 4.259023666381836, "learning_rate": 3.1942322886489335e-05, "loss": 1.4321, "step": 13722 }, { "epoch": 0.3614169080853305, "grad_norm": 1.8690143823623657, "learning_rate": 3.1941006057413744e-05, "loss": 2.0743, "step": 13723 }, { "epoch": 0.36144324466684224, "grad_norm": 3.2913100719451904, "learning_rate": 3.1939689228338167e-05, "loss": 1.6028, "step": 13724 }, { "epoch": 0.361469581248354, "grad_norm": 4.696711540222168, "learning_rate": 3.1938372399262575e-05, "loss": 1.1349, "step": 13725 }, { "epoch": 0.3614959178298657, "grad_norm": 1.7497495412826538, "learning_rate": 3.193705557018699e-05, "loss": 2.0273, "step": 13726 }, { "epoch": 0.3615222544113774, "grad_norm": 1.4944347143173218, "learning_rate": 3.1935738741111407e-05, "loss": 1.1767, "step": 13727 }, { "epoch": 0.3615485909928891, "grad_norm": 2.4703528881073, "learning_rate": 3.1934421912035815e-05, "loss": 1.9198, "step": 13728 }, { "epoch": 0.36157492757440085, "grad_norm": 1.8481627702713013, "learning_rate": 3.193310508296024e-05, "loss": 1.1765, "step": 13729 }, { "epoch": 0.36160126415591254, "grad_norm": 5.01578950881958, "learning_rate": 3.1931788253884647e-05, "loss": 2.1003, "step": 13730 }, { "epoch": 0.3616276007374243, "grad_norm": 2.382566452026367, "learning_rate": 3.193047142480906e-05, "loss": 2.1148, "step": 13731 }, { "epoch": 0.361653937318936, "grad_norm": 2.1275060176849365, "learning_rate": 3.192915459573347e-05, "loss": 2.4598, "step": 13732 }, { "epoch": 0.3616802739004477, "grad_norm": 5.664269924163818, "learning_rate": 3.192783776665789e-05, "loss": 1.2847, "step": 13733 }, { "epoch": 
0.36170661048195946, "grad_norm": 1.9295722246170044, "learning_rate": 3.19265209375823e-05, "loss": 2.077, "step": 13734 }, { "epoch": 0.36173294706347114, "grad_norm": 4.477755546569824, "learning_rate": 3.192520410850672e-05, "loss": 1.0247, "step": 13735 }, { "epoch": 0.3617592836449829, "grad_norm": 4.621309757232666, "learning_rate": 3.192388727943113e-05, "loss": 1.1349, "step": 13736 }, { "epoch": 0.3617856202264946, "grad_norm": 1.9026362895965576, "learning_rate": 3.192257045035554e-05, "loss": 0.9632, "step": 13737 }, { "epoch": 0.3618119568080063, "grad_norm": 2.698049783706665, "learning_rate": 3.1921253621279965e-05, "loss": 1.5551, "step": 13738 }, { "epoch": 0.36183829338951806, "grad_norm": 1.847318410873413, "learning_rate": 3.191993679220437e-05, "loss": 1.8988, "step": 13739 }, { "epoch": 0.36186462997102975, "grad_norm": 3.2921340465545654, "learning_rate": 3.191861996312879e-05, "loss": 1.9521, "step": 13740 }, { "epoch": 0.3618909665525415, "grad_norm": 1.75319242477417, "learning_rate": 3.19173031340532e-05, "loss": 1.7666, "step": 13741 }, { "epoch": 0.3619173031340532, "grad_norm": 2.2795064449310303, "learning_rate": 3.1915986304977613e-05, "loss": 2.241, "step": 13742 }, { "epoch": 0.3619436397155649, "grad_norm": 3.087273120880127, "learning_rate": 3.191466947590203e-05, "loss": 1.6454, "step": 13743 }, { "epoch": 0.3619699762970766, "grad_norm": 2.4293100833892822, "learning_rate": 3.1913352646826445e-05, "loss": 1.8657, "step": 13744 }, { "epoch": 0.36199631287858836, "grad_norm": 2.0871572494506836, "learning_rate": 3.191203581775086e-05, "loss": 0.8201, "step": 13745 }, { "epoch": 0.3620226494601001, "grad_norm": 3.9278769493103027, "learning_rate": 3.191071898867527e-05, "loss": 1.5707, "step": 13746 }, { "epoch": 0.3620489860416118, "grad_norm": 2.2865798473358154, "learning_rate": 3.1909402159599685e-05, "loss": 1.307, "step": 13747 }, { "epoch": 0.36207532262312353, "grad_norm": 2.4687952995300293, "learning_rate": 
3.19080853305241e-05, "loss": 1.5658, "step": 13748 }, { "epoch": 0.3621016592046352, "grad_norm": 1.641245722770691, "learning_rate": 3.1906768501448516e-05, "loss": 1.3165, "step": 13749 }, { "epoch": 0.36212799578614696, "grad_norm": 3.333493947982788, "learning_rate": 3.1905451672372925e-05, "loss": 1.6897, "step": 13750 }, { "epoch": 0.3621543323676587, "grad_norm": 2.264084577560425, "learning_rate": 3.190413484329734e-05, "loss": 2.0074, "step": 13751 }, { "epoch": 0.3621806689491704, "grad_norm": 2.8487136363983154, "learning_rate": 3.1902818014221756e-05, "loss": 1.7925, "step": 13752 }, { "epoch": 0.36220700553068214, "grad_norm": 1.8497778177261353, "learning_rate": 3.190150118514617e-05, "loss": 1.4377, "step": 13753 }, { "epoch": 0.3622333421121938, "grad_norm": 2.043375253677368, "learning_rate": 3.190018435607059e-05, "loss": 1.0672, "step": 13754 }, { "epoch": 0.36225967869370557, "grad_norm": 2.831580877304077, "learning_rate": 3.1898867526994996e-05, "loss": 1.8116, "step": 13755 }, { "epoch": 0.36228601527521725, "grad_norm": 2.812997579574585, "learning_rate": 3.189755069791941e-05, "loss": 2.0191, "step": 13756 }, { "epoch": 0.362312351856729, "grad_norm": 1.557414174079895, "learning_rate": 3.189623386884383e-05, "loss": 1.3359, "step": 13757 }, { "epoch": 0.36233868843824074, "grad_norm": 1.910851001739502, "learning_rate": 3.189491703976824e-05, "loss": 2.1032, "step": 13758 }, { "epoch": 0.36236502501975243, "grad_norm": 2.2622439861297607, "learning_rate": 3.189360021069265e-05, "loss": 1.8948, "step": 13759 }, { "epoch": 0.3623913616012642, "grad_norm": 2.6514997482299805, "learning_rate": 3.189228338161707e-05, "loss": 2.0692, "step": 13760 }, { "epoch": 0.36241769818277586, "grad_norm": 2.974579334259033, "learning_rate": 3.1890966552541476e-05, "loss": 2.5212, "step": 13761 }, { "epoch": 0.3624440347642876, "grad_norm": 2.500119924545288, "learning_rate": 3.18896497234659e-05, "loss": 2.3911, "step": 13762 }, { "epoch": 
0.3624703713457993, "grad_norm": 1.5414624214172363, "learning_rate": 3.188833289439031e-05, "loss": 1.7751, "step": 13763 }, { "epoch": 0.36249670792731103, "grad_norm": 2.443995475769043, "learning_rate": 3.188701606531472e-05, "loss": 1.7468, "step": 13764 }, { "epoch": 0.3625230445088228, "grad_norm": 2.0128374099731445, "learning_rate": 3.188569923623914e-05, "loss": 2.0167, "step": 13765 }, { "epoch": 0.36254938109033447, "grad_norm": 3.2428340911865234, "learning_rate": 3.1884382407163554e-05, "loss": 1.6361, "step": 13766 }, { "epoch": 0.3625757176718462, "grad_norm": 3.4356157779693604, "learning_rate": 3.188306557808797e-05, "loss": 1.7618, "step": 13767 }, { "epoch": 0.3626020542533579, "grad_norm": 1.9007205963134766, "learning_rate": 3.188174874901238e-05, "loss": 1.5445, "step": 13768 }, { "epoch": 0.36262839083486964, "grad_norm": 2.2970147132873535, "learning_rate": 3.1880431919936794e-05, "loss": 1.9541, "step": 13769 }, { "epoch": 0.36265472741638133, "grad_norm": 2.8699581623077393, "learning_rate": 3.18791150908612e-05, "loss": 1.303, "step": 13770 }, { "epoch": 0.36268106399789307, "grad_norm": 2.1957647800445557, "learning_rate": 3.1877798261785625e-05, "loss": 2.1676, "step": 13771 }, { "epoch": 0.3627074005794048, "grad_norm": 3.043992757797241, "learning_rate": 3.1876481432710034e-05, "loss": 0.6439, "step": 13772 }, { "epoch": 0.3627337371609165, "grad_norm": 1.519282579421997, "learning_rate": 3.187516460363445e-05, "loss": 1.997, "step": 13773 }, { "epoch": 0.36276007374242825, "grad_norm": 2.8921141624450684, "learning_rate": 3.1873847774558865e-05, "loss": 1.475, "step": 13774 }, { "epoch": 0.36278641032393993, "grad_norm": 3.218425750732422, "learning_rate": 3.1872530945483274e-05, "loss": 1.6625, "step": 13775 }, { "epoch": 0.3628127469054517, "grad_norm": 1.689947485923767, "learning_rate": 3.1871214116407696e-05, "loss": 1.5918, "step": 13776 }, { "epoch": 0.36283908348696337, "grad_norm": 2.2229950428009033, "learning_rate": 
3.1869897287332105e-05, "loss": 2.3097, "step": 13777 }, { "epoch": 0.3628654200684751, "grad_norm": 2.1253154277801514, "learning_rate": 3.186858045825652e-05, "loss": 1.8892, "step": 13778 }, { "epoch": 0.36289175664998685, "grad_norm": 1.8596086502075195, "learning_rate": 3.186726362918093e-05, "loss": 2.1572, "step": 13779 }, { "epoch": 0.36291809323149854, "grad_norm": 2.1153604984283447, "learning_rate": 3.1865946800105345e-05, "loss": 1.6767, "step": 13780 }, { "epoch": 0.3629444298130103, "grad_norm": 2.6510214805603027, "learning_rate": 3.186462997102976e-05, "loss": 1.7224, "step": 13781 }, { "epoch": 0.36297076639452197, "grad_norm": 4.937135696411133, "learning_rate": 3.1863313141954176e-05, "loss": 2.2747, "step": 13782 }, { "epoch": 0.3629971029760337, "grad_norm": 2.1835336685180664, "learning_rate": 3.186199631287859e-05, "loss": 1.5629, "step": 13783 }, { "epoch": 0.36302343955754546, "grad_norm": 2.072180986404419, "learning_rate": 3.1860679483803e-05, "loss": 0.8332, "step": 13784 }, { "epoch": 0.36304977613905715, "grad_norm": 1.637162685394287, "learning_rate": 3.185936265472742e-05, "loss": 1.308, "step": 13785 }, { "epoch": 0.3630761127205689, "grad_norm": 2.446707248687744, "learning_rate": 3.185804582565183e-05, "loss": 1.0632, "step": 13786 }, { "epoch": 0.3631024493020806, "grad_norm": 2.045194625854492, "learning_rate": 3.185672899657625e-05, "loss": 1.8064, "step": 13787 }, { "epoch": 0.3631287858835923, "grad_norm": 2.5239710807800293, "learning_rate": 3.1855412167500656e-05, "loss": 2.092, "step": 13788 }, { "epoch": 0.363155122465104, "grad_norm": 1.823291540145874, "learning_rate": 3.185409533842507e-05, "loss": 0.7776, "step": 13789 }, { "epoch": 0.36318145904661575, "grad_norm": 1.759790301322937, "learning_rate": 3.185277850934949e-05, "loss": 0.7001, "step": 13790 }, { "epoch": 0.3632077956281275, "grad_norm": 2.4322996139526367, "learning_rate": 3.18514616802739e-05, "loss": 2.4051, "step": 13791 }, { "epoch": 
0.3632341322096392, "grad_norm": 1.4365615844726562, "learning_rate": 3.185014485119832e-05, "loss": 1.5694, "step": 13792 }, { "epoch": 0.3632604687911509, "grad_norm": 1.5395278930664062, "learning_rate": 3.184882802212273e-05, "loss": 1.7923, "step": 13793 }, { "epoch": 0.3632868053726626, "grad_norm": 1.8849338293075562, "learning_rate": 3.184751119304714e-05, "loss": 1.4222, "step": 13794 }, { "epoch": 0.36331314195417436, "grad_norm": 2.9102373123168945, "learning_rate": 3.184619436397156e-05, "loss": 1.2308, "step": 13795 }, { "epoch": 0.36333947853568604, "grad_norm": 2.196582794189453, "learning_rate": 3.1844877534895974e-05, "loss": 2.0528, "step": 13796 }, { "epoch": 0.3633658151171978, "grad_norm": 1.4518133401870728, "learning_rate": 3.184356070582038e-05, "loss": 0.5009, "step": 13797 }, { "epoch": 0.36339215169870953, "grad_norm": 2.4499549865722656, "learning_rate": 3.18422438767448e-05, "loss": 1.2751, "step": 13798 }, { "epoch": 0.3634184882802212, "grad_norm": 4.62742280960083, "learning_rate": 3.1840927047669214e-05, "loss": 1.2049, "step": 13799 }, { "epoch": 0.36344482486173296, "grad_norm": 3.095268487930298, "learning_rate": 3.183961021859363e-05, "loss": 1.7698, "step": 13800 }, { "epoch": 0.36347116144324465, "grad_norm": 2.9536473751068115, "learning_rate": 3.1838293389518046e-05, "loss": 1.6317, "step": 13801 }, { "epoch": 0.3634974980247564, "grad_norm": 1.8991554975509644, "learning_rate": 3.1836976560442454e-05, "loss": 1.1643, "step": 13802 }, { "epoch": 0.3635238346062681, "grad_norm": 2.9806320667266846, "learning_rate": 3.183565973136687e-05, "loss": 1.5999, "step": 13803 }, { "epoch": 0.3635501711877798, "grad_norm": 2.817274570465088, "learning_rate": 3.1834342902291286e-05, "loss": 1.756, "step": 13804 }, { "epoch": 0.36357650776929157, "grad_norm": 2.6712772846221924, "learning_rate": 3.18330260732157e-05, "loss": 1.7977, "step": 13805 }, { "epoch": 0.36360284435080326, "grad_norm": 2.101950168609619, "learning_rate": 
3.183170924414011e-05, "loss": 1.6977, "step": 13806 }, { "epoch": 0.363629180932315, "grad_norm": 3.1772356033325195, "learning_rate": 3.1830392415064526e-05, "loss": 0.657, "step": 13807 }, { "epoch": 0.3636555175138267, "grad_norm": 3.3101799488067627, "learning_rate": 3.1829075585988934e-05, "loss": 0.3071, "step": 13808 }, { "epoch": 0.36368185409533843, "grad_norm": 3.0180869102478027, "learning_rate": 3.182775875691336e-05, "loss": 0.721, "step": 13809 }, { "epoch": 0.3637081906768501, "grad_norm": 3.193493366241455, "learning_rate": 3.182644192783777e-05, "loss": 1.6397, "step": 13810 }, { "epoch": 0.36373452725836186, "grad_norm": 2.2841835021972656, "learning_rate": 3.182512509876218e-05, "loss": 0.2804, "step": 13811 }, { "epoch": 0.3637608638398736, "grad_norm": 2.378143548965454, "learning_rate": 3.18238082696866e-05, "loss": 1.7643, "step": 13812 }, { "epoch": 0.3637872004213853, "grad_norm": 3.1494781970977783, "learning_rate": 3.1822491440611006e-05, "loss": 1.851, "step": 13813 }, { "epoch": 0.36381353700289704, "grad_norm": 2.591458797454834, "learning_rate": 3.182117461153543e-05, "loss": 0.8916, "step": 13814 }, { "epoch": 0.3638398735844087, "grad_norm": 2.2837886810302734, "learning_rate": 3.181985778245984e-05, "loss": 1.909, "step": 13815 }, { "epoch": 0.36386621016592047, "grad_norm": 6.3013529777526855, "learning_rate": 3.181854095338425e-05, "loss": 0.7834, "step": 13816 }, { "epoch": 0.3638925467474322, "grad_norm": 1.827169418334961, "learning_rate": 3.181722412430866e-05, "loss": 1.7862, "step": 13817 }, { "epoch": 0.3639188833289439, "grad_norm": 1.9870465993881226, "learning_rate": 3.1815907295233084e-05, "loss": 1.679, "step": 13818 }, { "epoch": 0.36394521991045564, "grad_norm": 2.318375825881958, "learning_rate": 3.181459046615749e-05, "loss": 2.2049, "step": 13819 }, { "epoch": 0.36397155649196733, "grad_norm": 3.828956127166748, "learning_rate": 3.181327363708191e-05, "loss": 1.2161, "step": 13820 }, { "epoch": 
0.3639978930734791, "grad_norm": 3.1623432636260986, "learning_rate": 3.1811956808006324e-05, "loss": 1.1136, "step": 13821 }, { "epoch": 0.36402422965499076, "grad_norm": 3.3095269203186035, "learning_rate": 3.181063997893073e-05, "loss": 1.496, "step": 13822 }, { "epoch": 0.3640505662365025, "grad_norm": 1.5347200632095337, "learning_rate": 3.1809323149855155e-05, "loss": 1.5613, "step": 13823 }, { "epoch": 0.36407690281801425, "grad_norm": 1.721107840538025, "learning_rate": 3.1808006320779564e-05, "loss": 2.2651, "step": 13824 }, { "epoch": 0.36410323939952594, "grad_norm": 1.8439258337020874, "learning_rate": 3.180668949170398e-05, "loss": 2.05, "step": 13825 }, { "epoch": 0.3641295759810377, "grad_norm": 1.7087953090667725, "learning_rate": 3.180537266262839e-05, "loss": 2.0225, "step": 13826 }, { "epoch": 0.36415591256254937, "grad_norm": 1.8066130876541138, "learning_rate": 3.1804055833552804e-05, "loss": 1.496, "step": 13827 }, { "epoch": 0.3641822491440611, "grad_norm": 1.5990849733352661, "learning_rate": 3.180273900447722e-05, "loss": 1.9284, "step": 13828 }, { "epoch": 0.3642085857255728, "grad_norm": 3.6670408248901367, "learning_rate": 3.1801422175401635e-05, "loss": 1.157, "step": 13829 }, { "epoch": 0.36423492230708454, "grad_norm": 3.810096502304077, "learning_rate": 3.180010534632605e-05, "loss": 1.1495, "step": 13830 }, { "epoch": 0.3642612588885963, "grad_norm": 2.1517443656921387, "learning_rate": 3.179878851725046e-05, "loss": 1.5242, "step": 13831 }, { "epoch": 0.36428759547010797, "grad_norm": 1.8491172790527344, "learning_rate": 3.179747168817488e-05, "loss": 1.7226, "step": 13832 }, { "epoch": 0.3643139320516197, "grad_norm": 2.996687412261963, "learning_rate": 3.179615485909929e-05, "loss": 0.4152, "step": 13833 }, { "epoch": 0.3643402686331314, "grad_norm": 2.764782667160034, "learning_rate": 3.1794838030023706e-05, "loss": 1.2242, "step": 13834 }, { "epoch": 0.36436660521464315, "grad_norm": 1.8812557458877563, "learning_rate": 
3.1793521200948115e-05, "loss": 2.0346, "step": 13835 }, { "epoch": 0.36439294179615483, "grad_norm": 2.4403786659240723, "learning_rate": 3.179220437187253e-05, "loss": 2.2777, "step": 13836 }, { "epoch": 0.3644192783776666, "grad_norm": 2.702369213104248, "learning_rate": 3.1790887542796946e-05, "loss": 0.9321, "step": 13837 }, { "epoch": 0.3644456149591783, "grad_norm": 2.129138946533203, "learning_rate": 3.178957071372136e-05, "loss": 2.5341, "step": 13838 }, { "epoch": 0.36447195154069, "grad_norm": 1.6012060642242432, "learning_rate": 3.178825388464578e-05, "loss": 1.3708, "step": 13839 }, { "epoch": 0.36449828812220175, "grad_norm": 1.7756980657577515, "learning_rate": 3.1786937055570186e-05, "loss": 2.2441, "step": 13840 }, { "epoch": 0.36452462470371344, "grad_norm": 3.4371607303619385, "learning_rate": 3.17856202264946e-05, "loss": 0.7192, "step": 13841 }, { "epoch": 0.3645509612852252, "grad_norm": 1.8897078037261963, "learning_rate": 3.178430339741902e-05, "loss": 1.9421, "step": 13842 }, { "epoch": 0.36457729786673687, "grad_norm": 2.871610164642334, "learning_rate": 3.178298656834343e-05, "loss": 1.1143, "step": 13843 }, { "epoch": 0.3646036344482486, "grad_norm": 2.6649508476257324, "learning_rate": 3.178166973926784e-05, "loss": 1.7913, "step": 13844 }, { "epoch": 0.36462997102976036, "grad_norm": 1.9537975788116455, "learning_rate": 3.178035291019226e-05, "loss": 1.6434, "step": 13845 }, { "epoch": 0.36465630761127205, "grad_norm": 1.7834559679031372, "learning_rate": 3.177903608111667e-05, "loss": 1.6543, "step": 13846 }, { "epoch": 0.3646826441927838, "grad_norm": 2.697648048400879, "learning_rate": 3.177771925204109e-05, "loss": 1.8356, "step": 13847 }, { "epoch": 0.3647089807742955, "grad_norm": 1.8176823854446411, "learning_rate": 3.1776402422965504e-05, "loss": 1.0223, "step": 13848 }, { "epoch": 0.3647353173558072, "grad_norm": 5.114029407501221, "learning_rate": 3.177508559388991e-05, "loss": 1.688, "step": 13849 }, { "epoch": 
0.36476165393731896, "grad_norm": 3.140770196914673, "learning_rate": 3.177376876481433e-05, "loss": 1.5448, "step": 13850 }, { "epoch": 0.36478799051883065, "grad_norm": 3.648015022277832, "learning_rate": 3.1772451935738744e-05, "loss": 1.0719, "step": 13851 }, { "epoch": 0.3648143271003424, "grad_norm": 2.382293939590454, "learning_rate": 3.177113510666316e-05, "loss": 1.9129, "step": 13852 }, { "epoch": 0.3648406636818541, "grad_norm": 1.719102144241333, "learning_rate": 3.176981827758757e-05, "loss": 2.1479, "step": 13853 }, { "epoch": 0.3648670002633658, "grad_norm": 2.2585225105285645, "learning_rate": 3.1768501448511984e-05, "loss": 1.5737, "step": 13854 }, { "epoch": 0.3648933368448775, "grad_norm": 2.392477512359619, "learning_rate": 3.17671846194364e-05, "loss": 0.1898, "step": 13855 }, { "epoch": 0.36491967342638926, "grad_norm": 2.503675699234009, "learning_rate": 3.1765867790360815e-05, "loss": 1.4292, "step": 13856 }, { "epoch": 0.364946010007901, "grad_norm": 2.0193629264831543, "learning_rate": 3.176455096128523e-05, "loss": 1.087, "step": 13857 }, { "epoch": 0.3649723465894127, "grad_norm": 2.2090227603912354, "learning_rate": 3.176323413220964e-05, "loss": 1.7748, "step": 13858 }, { "epoch": 0.36499868317092443, "grad_norm": 2.4805684089660645, "learning_rate": 3.1761917303134055e-05, "loss": 0.3666, "step": 13859 }, { "epoch": 0.3650250197524361, "grad_norm": 3.4558446407318115, "learning_rate": 3.1760600474058464e-05, "loss": 1.5241, "step": 13860 }, { "epoch": 0.36505135633394786, "grad_norm": 1.7901235818862915, "learning_rate": 3.1759283644982887e-05, "loss": 1.3702, "step": 13861 }, { "epoch": 0.36507769291545955, "grad_norm": 2.0470352172851562, "learning_rate": 3.1757966815907295e-05, "loss": 1.6389, "step": 13862 }, { "epoch": 0.3651040294969713, "grad_norm": 2.186436653137207, "learning_rate": 3.175664998683171e-05, "loss": 2.1418, "step": 13863 }, { "epoch": 0.36513036607848304, "grad_norm": 2.1997463703155518, "learning_rate": 
3.175533315775612e-05, "loss": 1.5805, "step": 13864 }, { "epoch": 0.3651567026599947, "grad_norm": 1.9361793994903564, "learning_rate": 3.175401632868054e-05, "loss": 0.3786, "step": 13865 }, { "epoch": 0.36518303924150647, "grad_norm": 2.4012434482574463, "learning_rate": 3.175269949960495e-05, "loss": 2.1721, "step": 13866 }, { "epoch": 0.36520937582301816, "grad_norm": 2.103909969329834, "learning_rate": 3.1751382670529367e-05, "loss": 0.6738, "step": 13867 }, { "epoch": 0.3652357124045299, "grad_norm": 2.1778693199157715, "learning_rate": 3.175006584145378e-05, "loss": 1.7869, "step": 13868 }, { "epoch": 0.3652620489860416, "grad_norm": 2.101128578186035, "learning_rate": 3.174874901237819e-05, "loss": 1.5545, "step": 13869 }, { "epoch": 0.36528838556755333, "grad_norm": 1.7771625518798828, "learning_rate": 3.1747432183302613e-05, "loss": 2.0729, "step": 13870 }, { "epoch": 0.3653147221490651, "grad_norm": 2.4678893089294434, "learning_rate": 3.174611535422702e-05, "loss": 1.8233, "step": 13871 }, { "epoch": 0.36534105873057676, "grad_norm": 3.338731288909912, "learning_rate": 3.174479852515144e-05, "loss": 1.7099, "step": 13872 }, { "epoch": 0.3653673953120885, "grad_norm": 4.617074489593506, "learning_rate": 3.174348169607585e-05, "loss": 1.5664, "step": 13873 }, { "epoch": 0.3653937318936002, "grad_norm": 1.9998185634613037, "learning_rate": 3.174216486700026e-05, "loss": 1.8124, "step": 13874 }, { "epoch": 0.36542006847511194, "grad_norm": 3.106323719024658, "learning_rate": 3.174084803792468e-05, "loss": 2.6014, "step": 13875 }, { "epoch": 0.3654464050566236, "grad_norm": 2.9282407760620117, "learning_rate": 3.1739531208849093e-05, "loss": 1.2614, "step": 13876 }, { "epoch": 0.36547274163813537, "grad_norm": 1.9555573463439941, "learning_rate": 3.173821437977351e-05, "loss": 0.6516, "step": 13877 }, { "epoch": 0.3654990782196471, "grad_norm": 2.4293127059936523, "learning_rate": 3.173689755069792e-05, "loss": 1.6645, "step": 13878 }, { "epoch": 
0.3655254148011588, "grad_norm": 2.7621352672576904, "learning_rate": 3.1735580721622333e-05, "loss": 1.2575, "step": 13879 }, { "epoch": 0.36555175138267054, "grad_norm": 1.6219511032104492, "learning_rate": 3.173426389254675e-05, "loss": 1.6224, "step": 13880 }, { "epoch": 0.36557808796418223, "grad_norm": 3.178426742553711, "learning_rate": 3.1732947063471165e-05, "loss": 1.7361, "step": 13881 }, { "epoch": 0.365604424545694, "grad_norm": 2.1224637031555176, "learning_rate": 3.1731630234395573e-05, "loss": 1.6094, "step": 13882 }, { "epoch": 0.36563076112720566, "grad_norm": 2.019261598587036, "learning_rate": 3.173031340531999e-05, "loss": 1.9523, "step": 13883 }, { "epoch": 0.3656570977087174, "grad_norm": 1.9840797185897827, "learning_rate": 3.1728996576244405e-05, "loss": 1.7547, "step": 13884 }, { "epoch": 0.36568343429022915, "grad_norm": 4.115976810455322, "learning_rate": 3.172767974716882e-05, "loss": 1.8783, "step": 13885 }, { "epoch": 0.36570977087174084, "grad_norm": 7.090014934539795, "learning_rate": 3.1726362918093236e-05, "loss": 1.7113, "step": 13886 }, { "epoch": 0.3657361074532526, "grad_norm": 1.7239238023757935, "learning_rate": 3.1725046089017645e-05, "loss": 1.7662, "step": 13887 }, { "epoch": 0.36576244403476427, "grad_norm": 1.8261460065841675, "learning_rate": 3.172372925994206e-05, "loss": 1.9016, "step": 13888 }, { "epoch": 0.365788780616276, "grad_norm": 1.575415015220642, "learning_rate": 3.1722412430866476e-05, "loss": 1.8926, "step": 13889 }, { "epoch": 0.36581511719778775, "grad_norm": 1.7461912631988525, "learning_rate": 3.172109560179089e-05, "loss": 1.7464, "step": 13890 }, { "epoch": 0.36584145377929944, "grad_norm": 2.8639485836029053, "learning_rate": 3.17197787727153e-05, "loss": 0.9562, "step": 13891 }, { "epoch": 0.3658677903608112, "grad_norm": 2.717808723449707, "learning_rate": 3.1718461943639716e-05, "loss": 1.9312, "step": 13892 }, { "epoch": 0.3658941269423229, "grad_norm": 2.0594468116760254, "learning_rate": 
3.171714511456413e-05, "loss": 1.3841, "step": 13893 }, { "epoch": 0.3659204635238346, "grad_norm": 2.767845869064331, "learning_rate": 3.171582828548855e-05, "loss": 2.4951, "step": 13894 }, { "epoch": 0.3659468001053463, "grad_norm": 3.5502774715423584, "learning_rate": 3.171451145641296e-05, "loss": 1.9461, "step": 13895 }, { "epoch": 0.36597313668685805, "grad_norm": 4.4372172355651855, "learning_rate": 3.171319462733737e-05, "loss": 0.8682, "step": 13896 }, { "epoch": 0.3659994732683698, "grad_norm": 1.8434040546417236, "learning_rate": 3.171187779826179e-05, "loss": 2.3746, "step": 13897 }, { "epoch": 0.3660258098498815, "grad_norm": 2.7245044708251953, "learning_rate": 3.17105609691862e-05, "loss": 1.8081, "step": 13898 }, { "epoch": 0.3660521464313932, "grad_norm": 2.771296739578247, "learning_rate": 3.170924414011062e-05, "loss": 2.1925, "step": 13899 }, { "epoch": 0.3660784830129049, "grad_norm": 6.998247146606445, "learning_rate": 3.170792731103503e-05, "loss": 0.7724, "step": 13900 }, { "epoch": 0.36610481959441665, "grad_norm": 3.2889821529388428, "learning_rate": 3.170661048195944e-05, "loss": 0.7153, "step": 13901 }, { "epoch": 0.36613115617592834, "grad_norm": 4.045352458953857, "learning_rate": 3.170529365288386e-05, "loss": 0.8026, "step": 13902 }, { "epoch": 0.3661574927574401, "grad_norm": 1.4328993558883667, "learning_rate": 3.1703976823808274e-05, "loss": 1.3598, "step": 13903 }, { "epoch": 0.3661838293389518, "grad_norm": 1.7620875835418701, "learning_rate": 3.170265999473269e-05, "loss": 2.3268, "step": 13904 }, { "epoch": 0.3662101659204635, "grad_norm": 1.6862494945526123, "learning_rate": 3.17013431656571e-05, "loss": 2.4352, "step": 13905 }, { "epoch": 0.36623650250197526, "grad_norm": 1.809762716293335, "learning_rate": 3.1700026336581514e-05, "loss": 1.7069, "step": 13906 }, { "epoch": 0.36626283908348695, "grad_norm": 5.644355297088623, "learning_rate": 3.169870950750592e-05, "loss": 1.3505, "step": 13907 }, { "epoch": 
0.3662891756649987, "grad_norm": 2.797412157058716, "learning_rate": 3.1697392678430345e-05, "loss": 1.4805, "step": 13908 }, { "epoch": 0.3663155122465104, "grad_norm": 2.3762357234954834, "learning_rate": 3.1696075849354754e-05, "loss": 1.7948, "step": 13909 }, { "epoch": 0.3663418488280221, "grad_norm": 2.9792027473449707, "learning_rate": 3.169475902027917e-05, "loss": 1.215, "step": 13910 }, { "epoch": 0.36636818540953386, "grad_norm": 1.773647665977478, "learning_rate": 3.169344219120358e-05, "loss": 1.4595, "step": 13911 }, { "epoch": 0.36639452199104555, "grad_norm": 1.992507815361023, "learning_rate": 3.1692125362127994e-05, "loss": 1.9775, "step": 13912 }, { "epoch": 0.3664208585725573, "grad_norm": 1.9619947671890259, "learning_rate": 3.169080853305241e-05, "loss": 1.7646, "step": 13913 }, { "epoch": 0.366447195154069, "grad_norm": 2.018383741378784, "learning_rate": 3.1689491703976825e-05, "loss": 1.9394, "step": 13914 }, { "epoch": 0.3664735317355807, "grad_norm": 1.5302658081054688, "learning_rate": 3.168817487490124e-05, "loss": 2.0235, "step": 13915 }, { "epoch": 0.3664998683170924, "grad_norm": 2.3637197017669678, "learning_rate": 3.168685804582565e-05, "loss": 2.003, "step": 13916 }, { "epoch": 0.36652620489860416, "grad_norm": 1.772335171699524, "learning_rate": 3.168554121675007e-05, "loss": 1.6057, "step": 13917 }, { "epoch": 0.3665525414801159, "grad_norm": 2.13639235496521, "learning_rate": 3.168422438767448e-05, "loss": 1.4412, "step": 13918 }, { "epoch": 0.3665788780616276, "grad_norm": 2.634377956390381, "learning_rate": 3.1682907558598896e-05, "loss": 1.2497, "step": 13919 }, { "epoch": 0.36660521464313933, "grad_norm": 2.1368846893310547, "learning_rate": 3.1681590729523305e-05, "loss": 1.9891, "step": 13920 }, { "epoch": 0.366631551224651, "grad_norm": 3.9307925701141357, "learning_rate": 3.168027390044772e-05, "loss": 1.1267, "step": 13921 }, { "epoch": 0.36665788780616276, "grad_norm": 1.6822090148925781, "learning_rate": 
3.1678957071372136e-05, "loss": 1.7764, "step": 13922 }, { "epoch": 0.3666842243876745, "grad_norm": 1.7323521375656128, "learning_rate": 3.167764024229655e-05, "loss": 1.0156, "step": 13923 }, { "epoch": 0.3667105609691862, "grad_norm": 2.6515190601348877, "learning_rate": 3.167632341322097e-05, "loss": 1.5725, "step": 13924 }, { "epoch": 0.36673689755069794, "grad_norm": 1.7710896730422974, "learning_rate": 3.1675006584145376e-05, "loss": 2.4908, "step": 13925 }, { "epoch": 0.3667632341322096, "grad_norm": 2.7734415531158447, "learning_rate": 3.167368975506979e-05, "loss": 2.0622, "step": 13926 }, { "epoch": 0.36678957071372137, "grad_norm": 1.8482822179794312, "learning_rate": 3.167237292599421e-05, "loss": 1.7745, "step": 13927 }, { "epoch": 0.36681590729523306, "grad_norm": 1.8044371604919434, "learning_rate": 3.167105609691862e-05, "loss": 1.8633, "step": 13928 }, { "epoch": 0.3668422438767448, "grad_norm": 2.230562686920166, "learning_rate": 3.166973926784303e-05, "loss": 1.9962, "step": 13929 }, { "epoch": 0.36686858045825654, "grad_norm": 2.4081809520721436, "learning_rate": 3.166842243876745e-05, "loss": 2.0626, "step": 13930 }, { "epoch": 0.36689491703976823, "grad_norm": 1.549582600593567, "learning_rate": 3.166710560969186e-05, "loss": 1.8033, "step": 13931 }, { "epoch": 0.36692125362128, "grad_norm": 2.1413769721984863, "learning_rate": 3.166578878061628e-05, "loss": 1.5805, "step": 13932 }, { "epoch": 0.36694759020279166, "grad_norm": 1.5038267374038696, "learning_rate": 3.1664471951540694e-05, "loss": 2.4161, "step": 13933 }, { "epoch": 0.3669739267843034, "grad_norm": 1.8666999340057373, "learning_rate": 3.16631551224651e-05, "loss": 2.2109, "step": 13934 }, { "epoch": 0.3670002633658151, "grad_norm": 3.083068370819092, "learning_rate": 3.166183829338952e-05, "loss": 2.0459, "step": 13935 }, { "epoch": 0.36702659994732684, "grad_norm": 2.6817305088043213, "learning_rate": 3.1660521464313934e-05, "loss": 1.7322, "step": 13936 }, { "epoch": 
0.3670529365288386, "grad_norm": 2.3508315086364746, "learning_rate": 3.165920463523835e-05, "loss": 1.6017, "step": 13937 }, { "epoch": 0.36707927311035027, "grad_norm": 3.2340803146362305, "learning_rate": 3.165788780616276e-05, "loss": 1.3548, "step": 13938 }, { "epoch": 0.367105609691862, "grad_norm": 1.6249994039535522, "learning_rate": 3.1656570977087174e-05, "loss": 0.6447, "step": 13939 }, { "epoch": 0.3671319462733737, "grad_norm": 1.6927450895309448, "learning_rate": 3.165525414801159e-05, "loss": 1.7647, "step": 13940 }, { "epoch": 0.36715828285488544, "grad_norm": 1.6416312456130981, "learning_rate": 3.1653937318936006e-05, "loss": 1.3798, "step": 13941 }, { "epoch": 0.36718461943639713, "grad_norm": 4.144277572631836, "learning_rate": 3.165262048986042e-05, "loss": 1.6596, "step": 13942 }, { "epoch": 0.3672109560179089, "grad_norm": 2.075324535369873, "learning_rate": 3.165130366078483e-05, "loss": 1.4787, "step": 13943 }, { "epoch": 0.3672372925994206, "grad_norm": 1.62732994556427, "learning_rate": 3.1649986831709246e-05, "loss": 1.5477, "step": 13944 }, { "epoch": 0.3672636291809323, "grad_norm": 3.860360860824585, "learning_rate": 3.164867000263366e-05, "loss": 1.0535, "step": 13945 }, { "epoch": 0.36728996576244405, "grad_norm": 1.8674042224884033, "learning_rate": 3.164735317355808e-05, "loss": 1.7931, "step": 13946 }, { "epoch": 0.36731630234395574, "grad_norm": 2.8013572692871094, "learning_rate": 3.1646036344482486e-05, "loss": 1.2016, "step": 13947 }, { "epoch": 0.3673426389254675, "grad_norm": 3.1399917602539062, "learning_rate": 3.16447195154069e-05, "loss": 0.5751, "step": 13948 }, { "epoch": 0.36736897550697917, "grad_norm": 2.061265230178833, "learning_rate": 3.164340268633132e-05, "loss": 2.5703, "step": 13949 }, { "epoch": 0.3673953120884909, "grad_norm": 2.405780792236328, "learning_rate": 3.164208585725573e-05, "loss": 1.323, "step": 13950 }, { "epoch": 0.36742164867000265, "grad_norm": 2.8864221572875977, "learning_rate": 
3.164076902818015e-05, "loss": 0.8512, "step": 13951 }, { "epoch": 0.36744798525151434, "grad_norm": 1.5359828472137451, "learning_rate": 3.163945219910456e-05, "loss": 1.984, "step": 13952 }, { "epoch": 0.3674743218330261, "grad_norm": 3.3540399074554443, "learning_rate": 3.163813537002897e-05, "loss": 1.1482, "step": 13953 }, { "epoch": 0.3675006584145378, "grad_norm": 1.594154953956604, "learning_rate": 3.163681854095338e-05, "loss": 2.3009, "step": 13954 }, { "epoch": 0.3675269949960495, "grad_norm": 1.3619701862335205, "learning_rate": 3.1635501711877804e-05, "loss": 0.5792, "step": 13955 }, { "epoch": 0.36755333157756126, "grad_norm": 2.1329445838928223, "learning_rate": 3.163418488280221e-05, "loss": 2.2423, "step": 13956 }, { "epoch": 0.36757966815907295, "grad_norm": 1.7652164697647095, "learning_rate": 3.163286805372663e-05, "loss": 1.7559, "step": 13957 }, { "epoch": 0.3676060047405847, "grad_norm": 2.0448415279388428, "learning_rate": 3.1631551224651044e-05, "loss": 2.2698, "step": 13958 }, { "epoch": 0.3676323413220964, "grad_norm": 2.5216546058654785, "learning_rate": 3.163023439557545e-05, "loss": 1.8614, "step": 13959 }, { "epoch": 0.3676586779036081, "grad_norm": 1.7806518077850342, "learning_rate": 3.1628917566499875e-05, "loss": 1.4737, "step": 13960 }, { "epoch": 0.3676850144851198, "grad_norm": 2.049075126647949, "learning_rate": 3.1627600737424284e-05, "loss": 1.6401, "step": 13961 }, { "epoch": 0.36771135106663155, "grad_norm": 2.9491817951202393, "learning_rate": 3.16262839083487e-05, "loss": 0.9926, "step": 13962 }, { "epoch": 0.3677376876481433, "grad_norm": 1.6878855228424072, "learning_rate": 3.162496707927311e-05, "loss": 1.8869, "step": 13963 }, { "epoch": 0.367764024229655, "grad_norm": 1.6858166456222534, "learning_rate": 3.162365025019753e-05, "loss": 1.8512, "step": 13964 }, { "epoch": 0.3677903608111667, "grad_norm": 5.557626247406006, "learning_rate": 3.162233342112194e-05, "loss": 1.187, "step": 13965 }, { "epoch": 
0.3678166973926784, "grad_norm": 1.8709737062454224, "learning_rate": 3.1621016592046355e-05, "loss": 2.032, "step": 13966 }, { "epoch": 0.36784303397419016, "grad_norm": 2.255427837371826, "learning_rate": 3.1619699762970764e-05, "loss": 1.6211, "step": 13967 }, { "epoch": 0.36786937055570185, "grad_norm": 3.316115140914917, "learning_rate": 3.161838293389518e-05, "loss": 1.7174, "step": 13968 }, { "epoch": 0.3678957071372136, "grad_norm": 3.3950183391571045, "learning_rate": 3.1617066104819595e-05, "loss": 1.3451, "step": 13969 }, { "epoch": 0.36792204371872533, "grad_norm": 1.9111573696136475, "learning_rate": 3.161574927574401e-05, "loss": 1.8871, "step": 13970 }, { "epoch": 0.367948380300237, "grad_norm": 2.1103687286376953, "learning_rate": 3.1614432446668426e-05, "loss": 2.2399, "step": 13971 }, { "epoch": 0.36797471688174876, "grad_norm": 4.592798233032227, "learning_rate": 3.1613115617592835e-05, "loss": 2.2421, "step": 13972 }, { "epoch": 0.36800105346326045, "grad_norm": 1.7692029476165771, "learning_rate": 3.161179878851725e-05, "loss": 1.669, "step": 13973 }, { "epoch": 0.3680273900447722, "grad_norm": 1.8893780708312988, "learning_rate": 3.1610481959441666e-05, "loss": 2.2138, "step": 13974 }, { "epoch": 0.3680537266262839, "grad_norm": 1.958081841468811, "learning_rate": 3.160916513036608e-05, "loss": 1.8246, "step": 13975 }, { "epoch": 0.3680800632077956, "grad_norm": 3.833266019821167, "learning_rate": 3.160784830129049e-05, "loss": 2.0845, "step": 13976 }, { "epoch": 0.36810639978930737, "grad_norm": 2.422652006149292, "learning_rate": 3.1606531472214906e-05, "loss": 0.7655, "step": 13977 }, { "epoch": 0.36813273637081906, "grad_norm": 2.3722052574157715, "learning_rate": 3.160521464313932e-05, "loss": 2.0574, "step": 13978 }, { "epoch": 0.3681590729523308, "grad_norm": 2.0069072246551514, "learning_rate": 3.160389781406374e-05, "loss": 1.3412, "step": 13979 }, { "epoch": 0.3681854095338425, "grad_norm": 2.0078208446502686, "learning_rate": 
3.160258098498815e-05, "loss": 2.1492, "step": 13980 }, { "epoch": 0.36821174611535423, "grad_norm": 2.508742094039917, "learning_rate": 3.160126415591256e-05, "loss": 1.8643, "step": 13981 }, { "epoch": 0.3682380826968659, "grad_norm": 2.8134686946868896, "learning_rate": 3.159994732683698e-05, "loss": 0.5469, "step": 13982 }, { "epoch": 0.36826441927837766, "grad_norm": 3.2314374446868896, "learning_rate": 3.159863049776139e-05, "loss": 0.4694, "step": 13983 }, { "epoch": 0.3682907558598894, "grad_norm": 5.888422966003418, "learning_rate": 3.159731366868581e-05, "loss": 1.07, "step": 13984 }, { "epoch": 0.3683170924414011, "grad_norm": 2.3816802501678467, "learning_rate": 3.159599683961022e-05, "loss": 1.8833, "step": 13985 }, { "epoch": 0.36834342902291284, "grad_norm": 5.047017574310303, "learning_rate": 3.159468001053463e-05, "loss": 0.8706, "step": 13986 }, { "epoch": 0.3683697656044245, "grad_norm": 2.0239899158477783, "learning_rate": 3.159336318145905e-05, "loss": 2.0173, "step": 13987 }, { "epoch": 0.36839610218593627, "grad_norm": 3.18420672416687, "learning_rate": 3.1592046352383464e-05, "loss": 1.8623, "step": 13988 }, { "epoch": 0.368422438767448, "grad_norm": 2.366476535797119, "learning_rate": 3.159072952330788e-05, "loss": 1.5839, "step": 13989 }, { "epoch": 0.3684487753489597, "grad_norm": 2.115718364715576, "learning_rate": 3.158941269423229e-05, "loss": 1.6292, "step": 13990 }, { "epoch": 0.36847511193047144, "grad_norm": 2.5027735233306885, "learning_rate": 3.1588095865156704e-05, "loss": 2.0801, "step": 13991 }, { "epoch": 0.36850144851198313, "grad_norm": 1.529545783996582, "learning_rate": 3.158677903608111e-05, "loss": 1.6444, "step": 13992 }, { "epoch": 0.3685277850934949, "grad_norm": 10.095187187194824, "learning_rate": 3.1585462207005535e-05, "loss": 2.4624, "step": 13993 }, { "epoch": 0.36855412167500656, "grad_norm": 2.531205177307129, "learning_rate": 3.1584145377929944e-05, "loss": 1.7764, "step": 13994 }, { "epoch": 
0.3685804582565183, "grad_norm": 1.7562317848205566, "learning_rate": 3.158282854885436e-05, "loss": 1.4229, "step": 13995 }, { "epoch": 0.36860679483803005, "grad_norm": 2.2997307777404785, "learning_rate": 3.1581511719778775e-05, "loss": 1.3205, "step": 13996 }, { "epoch": 0.36863313141954174, "grad_norm": 2.063079595565796, "learning_rate": 3.158019489070319e-05, "loss": 0.5504, "step": 13997 }, { "epoch": 0.3686594680010535, "grad_norm": 2.4551782608032227, "learning_rate": 3.157887806162761e-05, "loss": 2.3043, "step": 13998 }, { "epoch": 0.36868580458256517, "grad_norm": 3.2041988372802734, "learning_rate": 3.1577561232552015e-05, "loss": 1.3218, "step": 13999 }, { "epoch": 0.3687121411640769, "grad_norm": 2.6684558391571045, "learning_rate": 3.157624440347643e-05, "loss": 2.4471, "step": 14000 }, { "epoch": 0.3687384777455886, "grad_norm": 3.5551936626434326, "learning_rate": 3.157492757440084e-05, "loss": 1.2455, "step": 14001 }, { "epoch": 0.36876481432710034, "grad_norm": 1.9673200845718384, "learning_rate": 3.157361074532526e-05, "loss": 1.1001, "step": 14002 }, { "epoch": 0.3687911509086121, "grad_norm": 1.8047468662261963, "learning_rate": 3.157229391624967e-05, "loss": 2.0843, "step": 14003 }, { "epoch": 0.3688174874901238, "grad_norm": 2.695791006088257, "learning_rate": 3.157097708717409e-05, "loss": 1.5977, "step": 14004 }, { "epoch": 0.3688438240716355, "grad_norm": 3.1595258712768555, "learning_rate": 3.15696602580985e-05, "loss": 1.2335, "step": 14005 }, { "epoch": 0.3688701606531472, "grad_norm": 2.4013636112213135, "learning_rate": 3.156834342902291e-05, "loss": 1.2647, "step": 14006 }, { "epoch": 0.36889649723465895, "grad_norm": 1.7196820974349976, "learning_rate": 3.1567026599947333e-05, "loss": 0.7567, "step": 14007 }, { "epoch": 0.36892283381617064, "grad_norm": 2.1799941062927246, "learning_rate": 3.156570977087174e-05, "loss": 1.4471, "step": 14008 }, { "epoch": 0.3689491703976824, "grad_norm": 3.406611442565918, "learning_rate": 
3.156439294179616e-05, "loss": 1.7441, "step": 14009 }, { "epoch": 0.3689755069791941, "grad_norm": 4.917551517486572, "learning_rate": 3.156307611272057e-05, "loss": 1.8835, "step": 14010 }, { "epoch": 0.3690018435607058, "grad_norm": 2.311849594116211, "learning_rate": 3.156175928364499e-05, "loss": 1.934, "step": 14011 }, { "epoch": 0.36902818014221755, "grad_norm": 1.8395415544509888, "learning_rate": 3.15604424545694e-05, "loss": 0.4708, "step": 14012 }, { "epoch": 0.36905451672372924, "grad_norm": 1.7007797956466675, "learning_rate": 3.1559125625493813e-05, "loss": 0.9208, "step": 14013 }, { "epoch": 0.369080853305241, "grad_norm": 2.0950088500976562, "learning_rate": 3.155780879641822e-05, "loss": 0.9732, "step": 14014 }, { "epoch": 0.3691071898867527, "grad_norm": 2.068774938583374, "learning_rate": 3.155649196734264e-05, "loss": 1.4884, "step": 14015 }, { "epoch": 0.3691335264682644, "grad_norm": 1.7766845226287842, "learning_rate": 3.1555175138267053e-05, "loss": 1.9346, "step": 14016 }, { "epoch": 0.36915986304977616, "grad_norm": 1.7502129077911377, "learning_rate": 3.155385830919147e-05, "loss": 1.8656, "step": 14017 }, { "epoch": 0.36918619963128785, "grad_norm": 1.852126955986023, "learning_rate": 3.1552541480115885e-05, "loss": 1.5744, "step": 14018 }, { "epoch": 0.3692125362127996, "grad_norm": 2.4089267253875732, "learning_rate": 3.1551224651040294e-05, "loss": 1.6667, "step": 14019 }, { "epoch": 0.3692388727943113, "grad_norm": 4.818479061126709, "learning_rate": 3.154990782196471e-05, "loss": 1.986, "step": 14020 }, { "epoch": 0.369265209375823, "grad_norm": 1.6262494325637817, "learning_rate": 3.1548590992889125e-05, "loss": 1.9149, "step": 14021 }, { "epoch": 0.3692915459573347, "grad_norm": 2.3452014923095703, "learning_rate": 3.154727416381354e-05, "loss": 2.1754, "step": 14022 }, { "epoch": 0.36931788253884645, "grad_norm": 2.2143008708953857, "learning_rate": 3.154595733473795e-05, "loss": 0.5868, "step": 14023 }, { "epoch": 
0.3693442191203582, "grad_norm": 2.0937740802764893, "learning_rate": 3.1544640505662365e-05, "loss": 2.1145, "step": 14024 }, { "epoch": 0.3693705557018699, "grad_norm": 1.6205493211746216, "learning_rate": 3.154332367658678e-05, "loss": 1.9693, "step": 14025 }, { "epoch": 0.36939689228338163, "grad_norm": 4.79160213470459, "learning_rate": 3.1542006847511196e-05, "loss": 2.1585, "step": 14026 }, { "epoch": 0.3694232288648933, "grad_norm": 2.5807693004608154, "learning_rate": 3.154069001843561e-05, "loss": 1.9397, "step": 14027 }, { "epoch": 0.36944956544640506, "grad_norm": 1.9860213994979858, "learning_rate": 3.153937318936002e-05, "loss": 1.8714, "step": 14028 }, { "epoch": 0.3694759020279168, "grad_norm": 2.9638314247131348, "learning_rate": 3.1538056360284436e-05, "loss": 2.0282, "step": 14029 }, { "epoch": 0.3695022386094285, "grad_norm": 2.4194157123565674, "learning_rate": 3.153673953120885e-05, "loss": 1.5762, "step": 14030 }, { "epoch": 0.36952857519094023, "grad_norm": 2.640869140625, "learning_rate": 3.153542270213327e-05, "loss": 1.4509, "step": 14031 }, { "epoch": 0.3695549117724519, "grad_norm": 2.0341551303863525, "learning_rate": 3.1534105873057676e-05, "loss": 2.2935, "step": 14032 }, { "epoch": 0.36958124835396367, "grad_norm": 1.718632698059082, "learning_rate": 3.153278904398209e-05, "loss": 1.522, "step": 14033 }, { "epoch": 0.36960758493547535, "grad_norm": 1.8101212978363037, "learning_rate": 3.153147221490651e-05, "loss": 2.5663, "step": 14034 }, { "epoch": 0.3696339215169871, "grad_norm": 2.603178024291992, "learning_rate": 3.153015538583092e-05, "loss": 1.6246, "step": 14035 }, { "epoch": 0.36966025809849884, "grad_norm": 1.9641749858856201, "learning_rate": 3.152883855675534e-05, "loss": 1.8041, "step": 14036 }, { "epoch": 0.3696865946800105, "grad_norm": 1.7965713739395142, "learning_rate": 3.152752172767975e-05, "loss": 1.8808, "step": 14037 }, { "epoch": 0.36971293126152227, "grad_norm": 2.3773248195648193, "learning_rate": 
3.152620489860416e-05, "loss": 1.9717, "step": 14038 }, { "epoch": 0.36973926784303396, "grad_norm": 2.828329086303711, "learning_rate": 3.152488806952857e-05, "loss": 1.5003, "step": 14039 }, { "epoch": 0.3697656044245457, "grad_norm": 2.3443846702575684, "learning_rate": 3.1523571240452994e-05, "loss": 2.7071, "step": 14040 }, { "epoch": 0.3697919410060574, "grad_norm": 1.8259594440460205, "learning_rate": 3.15222544113774e-05, "loss": 2.0529, "step": 14041 }, { "epoch": 0.36981827758756913, "grad_norm": 1.7357100248336792, "learning_rate": 3.152093758230182e-05, "loss": 1.8714, "step": 14042 }, { "epoch": 0.3698446141690809, "grad_norm": 2.4874861240386963, "learning_rate": 3.1519620753226234e-05, "loss": 2.1107, "step": 14043 }, { "epoch": 0.36987095075059256, "grad_norm": 2.044776439666748, "learning_rate": 3.151830392415065e-05, "loss": 1.4348, "step": 14044 }, { "epoch": 0.3698972873321043, "grad_norm": 2.6963422298431396, "learning_rate": 3.1516987095075065e-05, "loss": 1.9687, "step": 14045 }, { "epoch": 0.369923623913616, "grad_norm": 4.4860405921936035, "learning_rate": 3.1515670265999474e-05, "loss": 0.7362, "step": 14046 }, { "epoch": 0.36994996049512774, "grad_norm": 2.948392391204834, "learning_rate": 3.151435343692389e-05, "loss": 1.5328, "step": 14047 }, { "epoch": 0.3699762970766394, "grad_norm": 2.040557622909546, "learning_rate": 3.15130366078483e-05, "loss": 1.6299, "step": 14048 }, { "epoch": 0.37000263365815117, "grad_norm": 2.2876884937286377, "learning_rate": 3.151171977877272e-05, "loss": 1.6243, "step": 14049 }, { "epoch": 0.3700289702396629, "grad_norm": 4.5751471519470215, "learning_rate": 3.151040294969713e-05, "loss": 1.913, "step": 14050 }, { "epoch": 0.3700553068211746, "grad_norm": 1.823683261871338, "learning_rate": 3.1509086120621545e-05, "loss": 1.8026, "step": 14051 }, { "epoch": 0.37008164340268634, "grad_norm": 2.302485466003418, "learning_rate": 3.150776929154596e-05, "loss": 2.0231, "step": 14052 }, { "epoch": 
0.37010797998419803, "grad_norm": 2.1640784740448, "learning_rate": 3.150645246247037e-05, "loss": 1.938, "step": 14053 }, { "epoch": 0.3701343165657098, "grad_norm": 2.714353561401367, "learning_rate": 3.150513563339479e-05, "loss": 0.9813, "step": 14054 }, { "epoch": 0.37016065314722146, "grad_norm": 2.1586687564849854, "learning_rate": 3.15038188043192e-05, "loss": 2.364, "step": 14055 }, { "epoch": 0.3701869897287332, "grad_norm": 2.4510347843170166, "learning_rate": 3.1502501975243616e-05, "loss": 0.5145, "step": 14056 }, { "epoch": 0.37021332631024495, "grad_norm": 1.7453266382217407, "learning_rate": 3.1501185146168025e-05, "loss": 1.3365, "step": 14057 }, { "epoch": 0.37023966289175664, "grad_norm": 2.4171650409698486, "learning_rate": 3.149986831709244e-05, "loss": 1.6939, "step": 14058 }, { "epoch": 0.3702659994732684, "grad_norm": 1.585357666015625, "learning_rate": 3.1498551488016856e-05, "loss": 1.4889, "step": 14059 }, { "epoch": 0.37029233605478007, "grad_norm": 1.7167292833328247, "learning_rate": 3.149723465894127e-05, "loss": 1.9869, "step": 14060 }, { "epoch": 0.3703186726362918, "grad_norm": 1.7951264381408691, "learning_rate": 3.149591782986569e-05, "loss": 1.6581, "step": 14061 }, { "epoch": 0.37034500921780356, "grad_norm": 4.58056640625, "learning_rate": 3.1494601000790096e-05, "loss": 1.4034, "step": 14062 }, { "epoch": 0.37037134579931524, "grad_norm": 2.6444268226623535, "learning_rate": 3.149328417171452e-05, "loss": 1.0226, "step": 14063 }, { "epoch": 0.370397682380827, "grad_norm": 2.370619773864746, "learning_rate": 3.149196734263893e-05, "loss": 1.6543, "step": 14064 }, { "epoch": 0.3704240189623387, "grad_norm": 1.7518035173416138, "learning_rate": 3.149065051356334e-05, "loss": 1.5299, "step": 14065 }, { "epoch": 0.3704503555438504, "grad_norm": 2.046924114227295, "learning_rate": 3.148933368448775e-05, "loss": 2.1612, "step": 14066 }, { "epoch": 0.3704766921253621, "grad_norm": 3.127081871032715, "learning_rate": 
3.148801685541217e-05, "loss": 2.0245, "step": 14067 }, { "epoch": 0.37050302870687385, "grad_norm": 3.3367018699645996, "learning_rate": 3.148670002633658e-05, "loss": 1.4806, "step": 14068 }, { "epoch": 0.3705293652883856, "grad_norm": 3.3937251567840576, "learning_rate": 3.1485383197261e-05, "loss": 1.8675, "step": 14069 }, { "epoch": 0.3705557018698973, "grad_norm": 2.0157434940338135, "learning_rate": 3.148406636818541e-05, "loss": 2.1464, "step": 14070 }, { "epoch": 0.370582038451409, "grad_norm": 1.835428237915039, "learning_rate": 3.148274953910982e-05, "loss": 1.8917, "step": 14071 }, { "epoch": 0.3706083750329207, "grad_norm": 2.2136576175689697, "learning_rate": 3.148143271003424e-05, "loss": 2.4428, "step": 14072 }, { "epoch": 0.37063471161443245, "grad_norm": 2.241074800491333, "learning_rate": 3.1480115880958654e-05, "loss": 0.9379, "step": 14073 }, { "epoch": 0.37066104819594414, "grad_norm": 2.1240148544311523, "learning_rate": 3.147879905188307e-05, "loss": 1.8163, "step": 14074 }, { "epoch": 0.3706873847774559, "grad_norm": 2.615171194076538, "learning_rate": 3.147748222280748e-05, "loss": 1.5766, "step": 14075 }, { "epoch": 0.37071372135896763, "grad_norm": 2.40596866607666, "learning_rate": 3.1476165393731894e-05, "loss": 1.6367, "step": 14076 }, { "epoch": 0.3707400579404793, "grad_norm": 1.585900068283081, "learning_rate": 3.147484856465631e-05, "loss": 1.8951, "step": 14077 }, { "epoch": 0.37076639452199106, "grad_norm": 1.8931550979614258, "learning_rate": 3.1473531735580726e-05, "loss": 2.3826, "step": 14078 }, { "epoch": 0.37079273110350275, "grad_norm": 1.6468175649642944, "learning_rate": 3.1472214906505135e-05, "loss": 2.4275, "step": 14079 }, { "epoch": 0.3708190676850145, "grad_norm": 2.1604626178741455, "learning_rate": 3.147089807742955e-05, "loss": 2.1381, "step": 14080 }, { "epoch": 0.3708454042665262, "grad_norm": 2.925337553024292, "learning_rate": 3.1469581248353966e-05, "loss": 1.1897, "step": 14081 }, { "epoch": 
0.3708717408480379, "grad_norm": 4.752488136291504, "learning_rate": 3.146826441927838e-05, "loss": 1.2945, "step": 14082 }, { "epoch": 0.37089807742954967, "grad_norm": 1.8100332021713257, "learning_rate": 3.14669475902028e-05, "loss": 1.8183, "step": 14083 }, { "epoch": 0.37092441401106135, "grad_norm": 1.6442036628723145, "learning_rate": 3.1465630761127206e-05, "loss": 1.7176, "step": 14084 }, { "epoch": 0.3709507505925731, "grad_norm": 2.836866617202759, "learning_rate": 3.146431393205162e-05, "loss": 0.6367, "step": 14085 }, { "epoch": 0.3709770871740848, "grad_norm": 2.276623249053955, "learning_rate": 3.146299710297603e-05, "loss": 2.0926, "step": 14086 }, { "epoch": 0.37100342375559653, "grad_norm": 4.055276870727539, "learning_rate": 3.146168027390045e-05, "loss": 1.5615, "step": 14087 }, { "epoch": 0.3710297603371082, "grad_norm": 5.498299598693848, "learning_rate": 3.146036344482486e-05, "loss": 0.8085, "step": 14088 }, { "epoch": 0.37105609691861996, "grad_norm": 4.85291862487793, "learning_rate": 3.145904661574928e-05, "loss": 0.8505, "step": 14089 }, { "epoch": 0.3710824335001317, "grad_norm": 4.257732391357422, "learning_rate": 3.145772978667369e-05, "loss": 1.4856, "step": 14090 }, { "epoch": 0.3711087700816434, "grad_norm": 3.3910834789276123, "learning_rate": 3.14564129575981e-05, "loss": 0.9633, "step": 14091 }, { "epoch": 0.37113510666315513, "grad_norm": 3.682894468307495, "learning_rate": 3.1455096128522524e-05, "loss": 1.108, "step": 14092 }, { "epoch": 0.3711614432446668, "grad_norm": 1.7492684125900269, "learning_rate": 3.145377929944693e-05, "loss": 1.7462, "step": 14093 }, { "epoch": 0.37118777982617857, "grad_norm": 4.625382900238037, "learning_rate": 3.145246247037135e-05, "loss": 1.1887, "step": 14094 }, { "epoch": 0.3712141164076903, "grad_norm": 3.399789333343506, "learning_rate": 3.145114564129576e-05, "loss": 1.3714, "step": 14095 }, { "epoch": 0.371240452989202, "grad_norm": 1.87528395652771, "learning_rate": 
3.144982881222018e-05, "loss": 2.0846, "step": 14096 }, { "epoch": 0.37126678957071374, "grad_norm": 2.0241079330444336, "learning_rate": 3.144851198314459e-05, "loss": 1.4942, "step": 14097 }, { "epoch": 0.37129312615222543, "grad_norm": 4.415994167327881, "learning_rate": 3.1447195154069004e-05, "loss": 0.8733, "step": 14098 }, { "epoch": 0.37131946273373717, "grad_norm": 3.184434413909912, "learning_rate": 3.144587832499342e-05, "loss": 1.9379, "step": 14099 }, { "epoch": 0.37134579931524886, "grad_norm": 2.7417070865631104, "learning_rate": 3.144456149591783e-05, "loss": 0.6104, "step": 14100 }, { "epoch": 0.3713721358967606, "grad_norm": 2.1382856369018555, "learning_rate": 3.144324466684225e-05, "loss": 1.6291, "step": 14101 }, { "epoch": 0.37139847247827235, "grad_norm": 2.849663257598877, "learning_rate": 3.144192783776666e-05, "loss": 2.1041, "step": 14102 }, { "epoch": 0.37142480905978403, "grad_norm": 1.925470232963562, "learning_rate": 3.1440611008691075e-05, "loss": 1.441, "step": 14103 }, { "epoch": 0.3714511456412958, "grad_norm": 1.9539369344711304, "learning_rate": 3.1439294179615484e-05, "loss": 1.4344, "step": 14104 }, { "epoch": 0.37147748222280746, "grad_norm": 1.9117008447647095, "learning_rate": 3.14379773505399e-05, "loss": 1.6653, "step": 14105 }, { "epoch": 0.3715038188043192, "grad_norm": 2.064760446548462, "learning_rate": 3.1436660521464315e-05, "loss": 1.9968, "step": 14106 }, { "epoch": 0.3715301553858309, "grad_norm": 2.7670607566833496, "learning_rate": 3.143534369238873e-05, "loss": 1.5052, "step": 14107 }, { "epoch": 0.37155649196734264, "grad_norm": 1.9472836256027222, "learning_rate": 3.1434026863313146e-05, "loss": 0.2269, "step": 14108 }, { "epoch": 0.3715828285488544, "grad_norm": 4.877762794494629, "learning_rate": 3.1432710034237555e-05, "loss": 1.0154, "step": 14109 }, { "epoch": 0.37160916513036607, "grad_norm": 6.371516227722168, "learning_rate": 3.143139320516198e-05, "loss": 0.9444, "step": 14110 }, { "epoch": 
0.3716355017118778, "grad_norm": 1.8459303379058838, "learning_rate": 3.1430076376086386e-05, "loss": 1.5265, "step": 14111 }, { "epoch": 0.3716618382933895, "grad_norm": 2.2648868560791016, "learning_rate": 3.14287595470108e-05, "loss": 1.8607, "step": 14112 }, { "epoch": 0.37168817487490124, "grad_norm": 2.055403232574463, "learning_rate": 3.142744271793521e-05, "loss": 2.0137, "step": 14113 }, { "epoch": 0.37171451145641293, "grad_norm": 1.556557297706604, "learning_rate": 3.1426125888859626e-05, "loss": 1.997, "step": 14114 }, { "epoch": 0.3717408480379247, "grad_norm": 1.945652723312378, "learning_rate": 3.142480905978404e-05, "loss": 1.4773, "step": 14115 }, { "epoch": 0.3717671846194364, "grad_norm": 2.3292694091796875, "learning_rate": 3.142349223070846e-05, "loss": 1.7397, "step": 14116 }, { "epoch": 0.3717935212009481, "grad_norm": 2.0406198501586914, "learning_rate": 3.1422175401632866e-05, "loss": 1.3014, "step": 14117 }, { "epoch": 0.37181985778245985, "grad_norm": 2.925997734069824, "learning_rate": 3.142085857255728e-05, "loss": 1.5552, "step": 14118 }, { "epoch": 0.37184619436397154, "grad_norm": 1.776259422302246, "learning_rate": 3.14195417434817e-05, "loss": 1.7039, "step": 14119 }, { "epoch": 0.3718725309454833, "grad_norm": 1.7593868970870972, "learning_rate": 3.141822491440611e-05, "loss": 1.7532, "step": 14120 }, { "epoch": 0.37189886752699497, "grad_norm": 2.4789812564849854, "learning_rate": 3.141690808533053e-05, "loss": 0.8185, "step": 14121 }, { "epoch": 0.3719252041085067, "grad_norm": 1.977687954902649, "learning_rate": 3.141559125625494e-05, "loss": 1.1145, "step": 14122 }, { "epoch": 0.37195154069001846, "grad_norm": 2.5104732513427734, "learning_rate": 3.141427442717935e-05, "loss": 2.3535, "step": 14123 }, { "epoch": 0.37197787727153014, "grad_norm": 1.8159688711166382, "learning_rate": 3.141295759810376e-05, "loss": 1.5078, "step": 14124 }, { "epoch": 0.3720042138530419, "grad_norm": 7.478771686553955, "learning_rate": 
3.1411640769028184e-05, "loss": 1.4843, "step": 14125 }, { "epoch": 0.3720305504345536, "grad_norm": 2.509643793106079, "learning_rate": 3.141032393995259e-05, "loss": 1.6703, "step": 14126 }, { "epoch": 0.3720568870160653, "grad_norm": 3.9190351963043213, "learning_rate": 3.140900711087701e-05, "loss": 1.6153, "step": 14127 }, { "epoch": 0.37208322359757706, "grad_norm": 2.1300253868103027, "learning_rate": 3.1407690281801424e-05, "loss": 2.2397, "step": 14128 }, { "epoch": 0.37210956017908875, "grad_norm": 1.976865530014038, "learning_rate": 3.140637345272584e-05, "loss": 1.4254, "step": 14129 }, { "epoch": 0.3721358967606005, "grad_norm": 3.5399374961853027, "learning_rate": 3.1405056623650255e-05, "loss": 1.1319, "step": 14130 }, { "epoch": 0.3721622333421122, "grad_norm": 1.8781288862228394, "learning_rate": 3.1403739794574664e-05, "loss": 2.0124, "step": 14131 }, { "epoch": 0.3721885699236239, "grad_norm": 3.8063457012176514, "learning_rate": 3.140242296549908e-05, "loss": 1.9692, "step": 14132 }, { "epoch": 0.3722149065051356, "grad_norm": 1.6347147226333618, "learning_rate": 3.140110613642349e-05, "loss": 2.3463, "step": 14133 }, { "epoch": 0.37224124308664736, "grad_norm": 2.5241732597351074, "learning_rate": 3.139978930734791e-05, "loss": 2.0984, "step": 14134 }, { "epoch": 0.3722675796681591, "grad_norm": 2.893841505050659, "learning_rate": 3.139847247827232e-05, "loss": 2.226, "step": 14135 }, { "epoch": 0.3722939162496708, "grad_norm": 1.7463747262954712, "learning_rate": 3.1397155649196735e-05, "loss": 2.2076, "step": 14136 }, { "epoch": 0.37232025283118253, "grad_norm": 1.6108602285385132, "learning_rate": 3.139583882012115e-05, "loss": 1.8188, "step": 14137 }, { "epoch": 0.3723465894126942, "grad_norm": 2.745497465133667, "learning_rate": 3.139452199104556e-05, "loss": 2.2784, "step": 14138 }, { "epoch": 0.37237292599420596, "grad_norm": 2.3145902156829834, "learning_rate": 3.139320516196998e-05, "loss": 1.0686, "step": 14139 }, { "epoch": 
0.37239926257571765, "grad_norm": 1.4394451379776, "learning_rate": 3.139188833289439e-05, "loss": 0.5054, "step": 14140 }, { "epoch": 0.3724255991572294, "grad_norm": 1.9597985744476318, "learning_rate": 3.139057150381881e-05, "loss": 1.9649, "step": 14141 }, { "epoch": 0.37245193573874114, "grad_norm": 5.424899578094482, "learning_rate": 3.1389254674743216e-05, "loss": 0.8165, "step": 14142 }, { "epoch": 0.3724782723202528, "grad_norm": 1.6741294860839844, "learning_rate": 3.138793784566764e-05, "loss": 2.3714, "step": 14143 }, { "epoch": 0.37250460890176457, "grad_norm": 2.9758431911468506, "learning_rate": 3.138662101659205e-05, "loss": 1.5172, "step": 14144 }, { "epoch": 0.37253094548327625, "grad_norm": 2.6219727993011475, "learning_rate": 3.138530418751646e-05, "loss": 2.1054, "step": 14145 }, { "epoch": 0.372557282064788, "grad_norm": 2.9232587814331055, "learning_rate": 3.138398735844088e-05, "loss": 1.5619, "step": 14146 }, { "epoch": 0.3725836186462997, "grad_norm": 3.4238758087158203, "learning_rate": 3.138267052936529e-05, "loss": 0.7913, "step": 14147 }, { "epoch": 0.37260995522781143, "grad_norm": 2.1738319396972656, "learning_rate": 3.138135370028971e-05, "loss": 1.6217, "step": 14148 }, { "epoch": 0.3726362918093232, "grad_norm": 2.092710494995117, "learning_rate": 3.138003687121412e-05, "loss": 0.7267, "step": 14149 }, { "epoch": 0.37266262839083486, "grad_norm": 2.406601667404175, "learning_rate": 3.1378720042138534e-05, "loss": 1.948, "step": 14150 }, { "epoch": 0.3726889649723466, "grad_norm": 4.631277084350586, "learning_rate": 3.137740321306294e-05, "loss": 2.0091, "step": 14151 }, { "epoch": 0.3727153015538583, "grad_norm": 2.006218910217285, "learning_rate": 3.137608638398736e-05, "loss": 1.6647, "step": 14152 }, { "epoch": 0.37274163813537003, "grad_norm": 3.4320740699768066, "learning_rate": 3.1374769554911774e-05, "loss": 1.5823, "step": 14153 }, { "epoch": 0.3727679747168817, "grad_norm": 1.6155775785446167, "learning_rate": 
3.137345272583619e-05, "loss": 1.6534, "step": 14154 }, { "epoch": 0.37279431129839347, "grad_norm": 3.2763047218322754, "learning_rate": 3.1372135896760605e-05, "loss": 1.8737, "step": 14155 }, { "epoch": 0.3728206478799052, "grad_norm": 1.7518818378448486, "learning_rate": 3.1370819067685014e-05, "loss": 2.08, "step": 14156 }, { "epoch": 0.3728469844614169, "grad_norm": 1.9589077234268188, "learning_rate": 3.136950223860943e-05, "loss": 1.1261, "step": 14157 }, { "epoch": 0.37287332104292864, "grad_norm": 1.6220275163650513, "learning_rate": 3.1368185409533845e-05, "loss": 0.6425, "step": 14158 }, { "epoch": 0.37289965762444033, "grad_norm": 1.9339220523834229, "learning_rate": 3.136686858045826e-05, "loss": 1.9084, "step": 14159 }, { "epoch": 0.37292599420595207, "grad_norm": 5.558414459228516, "learning_rate": 3.136555175138267e-05, "loss": 1.541, "step": 14160 }, { "epoch": 0.3729523307874638, "grad_norm": 1.734666347503662, "learning_rate": 3.1364234922307085e-05, "loss": 1.9991, "step": 14161 }, { "epoch": 0.3729786673689755, "grad_norm": 1.5355716943740845, "learning_rate": 3.13629180932315e-05, "loss": 2.1111, "step": 14162 }, { "epoch": 0.37300500395048725, "grad_norm": 2.3051373958587646, "learning_rate": 3.1361601264155916e-05, "loss": 1.4456, "step": 14163 }, { "epoch": 0.37303134053199893, "grad_norm": 5.503693103790283, "learning_rate": 3.136028443508033e-05, "loss": 1.0033, "step": 14164 }, { "epoch": 0.3730576771135107, "grad_norm": 1.771206259727478, "learning_rate": 3.135896760600474e-05, "loss": 1.7851, "step": 14165 }, { "epoch": 0.37308401369502237, "grad_norm": 1.6473339796066284, "learning_rate": 3.1357650776929156e-05, "loss": 1.6213, "step": 14166 }, { "epoch": 0.3731103502765341, "grad_norm": 2.366232395172119, "learning_rate": 3.135633394785357e-05, "loss": 1.5638, "step": 14167 }, { "epoch": 0.37313668685804585, "grad_norm": 1.6880323886871338, "learning_rate": 3.135501711877799e-05, "loss": 0.6219, "step": 14168 }, { "epoch": 
0.37316302343955754, "grad_norm": 1.7996622323989868, "learning_rate": 3.1353700289702396e-05, "loss": 1.8728, "step": 14169 }, { "epoch": 0.3731893600210693, "grad_norm": 2.6654815673828125, "learning_rate": 3.135238346062681e-05, "loss": 0.6643, "step": 14170 }, { "epoch": 0.37321569660258097, "grad_norm": 2.1555862426757812, "learning_rate": 3.135106663155122e-05, "loss": 1.3774, "step": 14171 }, { "epoch": 0.3732420331840927, "grad_norm": 1.8232494592666626, "learning_rate": 3.134974980247564e-05, "loss": 1.8661, "step": 14172 }, { "epoch": 0.3732683697656044, "grad_norm": 2.0101449489593506, "learning_rate": 3.134843297340005e-05, "loss": 1.9914, "step": 14173 }, { "epoch": 0.37329470634711615, "grad_norm": 1.7984455823898315, "learning_rate": 3.134711614432447e-05, "loss": 2.402, "step": 14174 }, { "epoch": 0.3733210429286279, "grad_norm": 2.6387932300567627, "learning_rate": 3.134579931524888e-05, "loss": 1.3151, "step": 14175 }, { "epoch": 0.3733473795101396, "grad_norm": 2.278677225112915, "learning_rate": 3.13444824861733e-05, "loss": 0.8002, "step": 14176 }, { "epoch": 0.3733737160916513, "grad_norm": 2.45867919921875, "learning_rate": 3.1343165657097714e-05, "loss": 2.1641, "step": 14177 }, { "epoch": 0.373400052673163, "grad_norm": 2.5741069316864014, "learning_rate": 3.134184882802212e-05, "loss": 1.6112, "step": 14178 }, { "epoch": 0.37342638925467475, "grad_norm": 3.022954225540161, "learning_rate": 3.134053199894654e-05, "loss": 1.7018, "step": 14179 }, { "epoch": 0.37345272583618644, "grad_norm": 2.2265570163726807, "learning_rate": 3.133921516987095e-05, "loss": 1.3314, "step": 14180 }, { "epoch": 0.3734790624176982, "grad_norm": 6.127312660217285, "learning_rate": 3.133789834079537e-05, "loss": 1.6663, "step": 14181 }, { "epoch": 0.3735053989992099, "grad_norm": 2.0783584117889404, "learning_rate": 3.133658151171978e-05, "loss": 1.6706, "step": 14182 }, { "epoch": 0.3735317355807216, "grad_norm": 2.471951723098755, "learning_rate": 
3.1335264682644194e-05, "loss": 1.0596, "step": 14183 }, { "epoch": 0.37355807216223336, "grad_norm": 1.9284427165985107, "learning_rate": 3.133394785356861e-05, "loss": 2.7822, "step": 14184 }, { "epoch": 0.37358440874374504, "grad_norm": 4.185712814331055, "learning_rate": 3.133263102449302e-05, "loss": 1.5907, "step": 14185 }, { "epoch": 0.3736107453252568, "grad_norm": 1.985437035560608, "learning_rate": 3.133131419541744e-05, "loss": 1.951, "step": 14186 }, { "epoch": 0.3736370819067685, "grad_norm": 3.8078482151031494, "learning_rate": 3.132999736634185e-05, "loss": 1.2052, "step": 14187 }, { "epoch": 0.3736634184882802, "grad_norm": 7.1121296882629395, "learning_rate": 3.1328680537266265e-05, "loss": 1.4383, "step": 14188 }, { "epoch": 0.37368975506979196, "grad_norm": 2.2999610900878906, "learning_rate": 3.1327363708190674e-05, "loss": 1.0848, "step": 14189 }, { "epoch": 0.37371609165130365, "grad_norm": 1.4526771306991577, "learning_rate": 3.132604687911509e-05, "loss": 1.7914, "step": 14190 }, { "epoch": 0.3737424282328154, "grad_norm": 3.920475721359253, "learning_rate": 3.1324730050039505e-05, "loss": 0.5756, "step": 14191 }, { "epoch": 0.3737687648143271, "grad_norm": 2.5304274559020996, "learning_rate": 3.132341322096392e-05, "loss": 1.6966, "step": 14192 }, { "epoch": 0.3737951013958388, "grad_norm": 2.783161163330078, "learning_rate": 3.1322096391888336e-05, "loss": 1.561, "step": 14193 }, { "epoch": 0.3738214379773505, "grad_norm": 1.5175174474716187, "learning_rate": 3.1320779562812745e-05, "loss": 2.2743, "step": 14194 }, { "epoch": 0.37384777455886226, "grad_norm": 3.7950727939605713, "learning_rate": 3.131946273373717e-05, "loss": 1.6094, "step": 14195 }, { "epoch": 0.373874111140374, "grad_norm": 1.7602031230926514, "learning_rate": 3.1318145904661576e-05, "loss": 1.6025, "step": 14196 }, { "epoch": 0.3739004477218857, "grad_norm": 2.615161657333374, "learning_rate": 3.131682907558599e-05, "loss": 1.7188, "step": 14197 }, { "epoch": 
0.37392678430339743, "grad_norm": 2.873991012573242, "learning_rate": 3.13155122465104e-05, "loss": 1.7009, "step": 14198 }, { "epoch": 0.3739531208849091, "grad_norm": 2.0137271881103516, "learning_rate": 3.1314195417434816e-05, "loss": 1.8465, "step": 14199 }, { "epoch": 0.37397945746642086, "grad_norm": 4.88822603225708, "learning_rate": 3.131287858835923e-05, "loss": 1.0312, "step": 14200 }, { "epoch": 0.3740057940479326, "grad_norm": 2.652528762817383, "learning_rate": 3.131156175928365e-05, "loss": 2.0592, "step": 14201 }, { "epoch": 0.3740321306294443, "grad_norm": 2.9017488956451416, "learning_rate": 3.131024493020806e-05, "loss": 1.9195, "step": 14202 }, { "epoch": 0.37405846721095604, "grad_norm": 3.1204323768615723, "learning_rate": 3.130892810113247e-05, "loss": 1.662, "step": 14203 }, { "epoch": 0.3740848037924677, "grad_norm": 1.5588823556900024, "learning_rate": 3.130761127205689e-05, "loss": 1.8854, "step": 14204 }, { "epoch": 0.37411114037397947, "grad_norm": 2.6608121395111084, "learning_rate": 3.13062944429813e-05, "loss": 1.3078, "step": 14205 }, { "epoch": 0.37413747695549116, "grad_norm": 2.2102999687194824, "learning_rate": 3.130497761390572e-05, "loss": 2.0173, "step": 14206 }, { "epoch": 0.3741638135370029, "grad_norm": 1.4377378225326538, "learning_rate": 3.130366078483013e-05, "loss": 0.3523, "step": 14207 }, { "epoch": 0.37419015011851464, "grad_norm": 1.919411063194275, "learning_rate": 3.130234395575454e-05, "loss": 1.6903, "step": 14208 }, { "epoch": 0.37421648670002633, "grad_norm": 2.787741184234619, "learning_rate": 3.130102712667896e-05, "loss": 1.117, "step": 14209 }, { "epoch": 0.3742428232815381, "grad_norm": 2.6549298763275146, "learning_rate": 3.1299710297603375e-05, "loss": 1.3847, "step": 14210 }, { "epoch": 0.37426915986304976, "grad_norm": 1.8287451267242432, "learning_rate": 3.129839346852779e-05, "loss": 2.3063, "step": 14211 }, { "epoch": 0.3742954964445615, "grad_norm": 1.8640003204345703, "learning_rate": 
3.12970766394522e-05, "loss": 1.1505, "step": 14212 }, { "epoch": 0.3743218330260732, "grad_norm": 4.248157024383545, "learning_rate": 3.1295759810376615e-05, "loss": 2.1364, "step": 14213 }, { "epoch": 0.37434816960758494, "grad_norm": 3.034585475921631, "learning_rate": 3.129444298130103e-05, "loss": 1.7178, "step": 14214 }, { "epoch": 0.3743745061890967, "grad_norm": 2.0356380939483643, "learning_rate": 3.1293126152225446e-05, "loss": 1.6205, "step": 14215 }, { "epoch": 0.37440084277060837, "grad_norm": 2.47143292427063, "learning_rate": 3.1291809323149855e-05, "loss": 1.8065, "step": 14216 }, { "epoch": 0.3744271793521201, "grad_norm": 3.6035900115966797, "learning_rate": 3.129049249407427e-05, "loss": 1.685, "step": 14217 }, { "epoch": 0.3744535159336318, "grad_norm": 1.8034420013427734, "learning_rate": 3.128917566499868e-05, "loss": 1.755, "step": 14218 }, { "epoch": 0.37447985251514354, "grad_norm": 2.6940577030181885, "learning_rate": 3.12878588359231e-05, "loss": 2.3524, "step": 14219 }, { "epoch": 0.37450618909665523, "grad_norm": 1.714198112487793, "learning_rate": 3.128654200684751e-05, "loss": 2.4048, "step": 14220 }, { "epoch": 0.37453252567816697, "grad_norm": 2.2402350902557373, "learning_rate": 3.1285225177771926e-05, "loss": 2.2052, "step": 14221 }, { "epoch": 0.3745588622596787, "grad_norm": 1.5270017385482788, "learning_rate": 3.128390834869634e-05, "loss": 1.6884, "step": 14222 }, { "epoch": 0.3745851988411904, "grad_norm": 1.75799560546875, "learning_rate": 3.128259151962075e-05, "loss": 2.4456, "step": 14223 }, { "epoch": 0.37461153542270215, "grad_norm": 1.701562523841858, "learning_rate": 3.128127469054517e-05, "loss": 1.8755, "step": 14224 }, { "epoch": 0.37463787200421383, "grad_norm": 1.492091417312622, "learning_rate": 3.127995786146958e-05, "loss": 1.8, "step": 14225 }, { "epoch": 0.3746642085857256, "grad_norm": 5.475642681121826, "learning_rate": 3.1278641032394e-05, "loss": 1.9598, "step": 14226 }, { "epoch": 0.37469054516723727, 
"grad_norm": 1.8512247800827026, "learning_rate": 3.1277324203318406e-05, "loss": 0.8004, "step": 14227 }, { "epoch": 0.374716881748749, "grad_norm": 1.6653473377227783, "learning_rate": 3.127600737424283e-05, "loss": 1.8888, "step": 14228 }, { "epoch": 0.37474321833026075, "grad_norm": 2.3999147415161133, "learning_rate": 3.127469054516724e-05, "loss": 0.4719, "step": 14229 }, { "epoch": 0.37476955491177244, "grad_norm": 1.8173974752426147, "learning_rate": 3.127337371609165e-05, "loss": 1.2811, "step": 14230 }, { "epoch": 0.3747958914932842, "grad_norm": 2.0996522903442383, "learning_rate": 3.127205688701607e-05, "loss": 1.2555, "step": 14231 }, { "epoch": 0.37482222807479587, "grad_norm": 3.778411626815796, "learning_rate": 3.127074005794048e-05, "loss": 1.3732, "step": 14232 }, { "epoch": 0.3748485646563076, "grad_norm": 1.517289161682129, "learning_rate": 3.12694232288649e-05, "loss": 1.9646, "step": 14233 }, { "epoch": 0.37487490123781936, "grad_norm": 1.2089574337005615, "learning_rate": 3.126810639978931e-05, "loss": 1.5684, "step": 14234 }, { "epoch": 0.37490123781933105, "grad_norm": 2.1433467864990234, "learning_rate": 3.1266789570713724e-05, "loss": 1.7222, "step": 14235 }, { "epoch": 0.3749275744008428, "grad_norm": 3.5721564292907715, "learning_rate": 3.126547274163813e-05, "loss": 1.775, "step": 14236 }, { "epoch": 0.3749539109823545, "grad_norm": 2.8540072441101074, "learning_rate": 3.126415591256255e-05, "loss": 1.4602, "step": 14237 }, { "epoch": 0.3749802475638662, "grad_norm": 2.129418134689331, "learning_rate": 3.1262839083486964e-05, "loss": 1.9492, "step": 14238 }, { "epoch": 0.3750065841453779, "grad_norm": 2.413728952407837, "learning_rate": 3.126152225441138e-05, "loss": 1.7976, "step": 14239 }, { "epoch": 0.37503292072688965, "grad_norm": 1.9544322490692139, "learning_rate": 3.1260205425335795e-05, "loss": 1.621, "step": 14240 }, { "epoch": 0.3750592573084014, "grad_norm": 2.624230146408081, "learning_rate": 3.1258888596260204e-05, 
"loss": 0.5389, "step": 14241 }, { "epoch": 0.3750855938899131, "grad_norm": 3.2988929748535156, "learning_rate": 3.1257571767184626e-05, "loss": 1.164, "step": 14242 }, { "epoch": 0.3751119304714248, "grad_norm": 4.124659538269043, "learning_rate": 3.1256254938109035e-05, "loss": 1.8363, "step": 14243 }, { "epoch": 0.3751382670529365, "grad_norm": 2.81253719329834, "learning_rate": 3.125493810903345e-05, "loss": 0.4286, "step": 14244 }, { "epoch": 0.37516460363444826, "grad_norm": 1.5272924900054932, "learning_rate": 3.125362127995786e-05, "loss": 1.2971, "step": 14245 }, { "epoch": 0.37519094021595994, "grad_norm": 1.772472858428955, "learning_rate": 3.1252304450882275e-05, "loss": 2.2284, "step": 14246 }, { "epoch": 0.3752172767974717, "grad_norm": 1.7253715991973877, "learning_rate": 3.125098762180669e-05, "loss": 1.6207, "step": 14247 }, { "epoch": 0.37524361337898343, "grad_norm": 3.1210827827453613, "learning_rate": 3.1249670792731106e-05, "loss": 1.4943, "step": 14248 }, { "epoch": 0.3752699499604951, "grad_norm": 1.8335567712783813, "learning_rate": 3.124835396365552e-05, "loss": 1.5876, "step": 14249 }, { "epoch": 0.37529628654200686, "grad_norm": 2.278203010559082, "learning_rate": 3.124703713457993e-05, "loss": 2.2487, "step": 14250 }, { "epoch": 0.37532262312351855, "grad_norm": 2.0389113426208496, "learning_rate": 3.1245720305504346e-05, "loss": 2.1409, "step": 14251 }, { "epoch": 0.3753489597050303, "grad_norm": 2.7280356884002686, "learning_rate": 3.124440347642876e-05, "loss": 1.7056, "step": 14252 }, { "epoch": 0.375375296286542, "grad_norm": 2.6063644886016846, "learning_rate": 3.124308664735318e-05, "loss": 1.0552, "step": 14253 }, { "epoch": 0.3754016328680537, "grad_norm": 2.29797101020813, "learning_rate": 3.1241769818277586e-05, "loss": 2.0279, "step": 14254 }, { "epoch": 0.37542796944956547, "grad_norm": 2.8565499782562256, "learning_rate": 3.1240452989202e-05, "loss": 1.7775, "step": 14255 }, { "epoch": 0.37545430603107716, "grad_norm": 
1.9371744394302368, "learning_rate": 3.123913616012642e-05, "loss": 0.6295, "step": 14256 }, { "epoch": 0.3754806426125889, "grad_norm": 2.2951388359069824, "learning_rate": 3.123781933105083e-05, "loss": 1.7932, "step": 14257 }, { "epoch": 0.3755069791941006, "grad_norm": 1.7291057109832764, "learning_rate": 3.123650250197525e-05, "loss": 1.6975, "step": 14258 }, { "epoch": 0.37553331577561233, "grad_norm": 3.547032594680786, "learning_rate": 3.123518567289966e-05, "loss": 1.7393, "step": 14259 }, { "epoch": 0.375559652357124, "grad_norm": 3.6985456943511963, "learning_rate": 3.123386884382407e-05, "loss": 0.9514, "step": 14260 }, { "epoch": 0.37558598893863576, "grad_norm": 3.034712791442871, "learning_rate": 3.123255201474849e-05, "loss": 1.9821, "step": 14261 }, { "epoch": 0.3756123255201475, "grad_norm": 2.120142698287964, "learning_rate": 3.1231235185672904e-05, "loss": 1.5462, "step": 14262 }, { "epoch": 0.3756386621016592, "grad_norm": 1.5356390476226807, "learning_rate": 3.122991835659731e-05, "loss": 1.8039, "step": 14263 }, { "epoch": 0.37566499868317094, "grad_norm": 2.625803232192993, "learning_rate": 3.122860152752173e-05, "loss": 1.663, "step": 14264 }, { "epoch": 0.3756913352646826, "grad_norm": 2.216063976287842, "learning_rate": 3.122728469844614e-05, "loss": 0.4178, "step": 14265 }, { "epoch": 0.37571767184619437, "grad_norm": 1.937024474143982, "learning_rate": 3.122596786937056e-05, "loss": 1.9679, "step": 14266 }, { "epoch": 0.3757440084277061, "grad_norm": 2.2436394691467285, "learning_rate": 3.1224651040294975e-05, "loss": 1.7544, "step": 14267 }, { "epoch": 0.3757703450092178, "grad_norm": 3.0948965549468994, "learning_rate": 3.1223334211219384e-05, "loss": 0.8748, "step": 14268 }, { "epoch": 0.37579668159072954, "grad_norm": 3.4630696773529053, "learning_rate": 3.12220173821438e-05, "loss": 1.6059, "step": 14269 }, { "epoch": 0.37582301817224123, "grad_norm": 2.200232982635498, "learning_rate": 3.122070055306821e-05, "loss": 1.9663, 
"step": 14270 }, { "epoch": 0.375849354753753, "grad_norm": 1.6948637962341309, "learning_rate": 3.121938372399263e-05, "loss": 1.7678, "step": 14271 }, { "epoch": 0.37587569133526466, "grad_norm": 2.5107574462890625, "learning_rate": 3.121806689491704e-05, "loss": 1.7259, "step": 14272 }, { "epoch": 0.3759020279167764, "grad_norm": 1.7776470184326172, "learning_rate": 3.1216750065841456e-05, "loss": 2.071, "step": 14273 }, { "epoch": 0.37592836449828815, "grad_norm": 3.5129220485687256, "learning_rate": 3.1215433236765864e-05, "loss": 2.2855, "step": 14274 }, { "epoch": 0.37595470107979984, "grad_norm": 2.658547878265381, "learning_rate": 3.121411640769029e-05, "loss": 1.6774, "step": 14275 }, { "epoch": 0.3759810376613116, "grad_norm": 2.8447399139404297, "learning_rate": 3.1212799578614696e-05, "loss": 1.5263, "step": 14276 }, { "epoch": 0.37600737424282327, "grad_norm": 2.125356435775757, "learning_rate": 3.121148274953911e-05, "loss": 1.8265, "step": 14277 }, { "epoch": 0.376033710824335, "grad_norm": 1.9518109560012817, "learning_rate": 3.121016592046353e-05, "loss": 1.8052, "step": 14278 }, { "epoch": 0.3760600474058467, "grad_norm": 3.5467631816864014, "learning_rate": 3.1208849091387936e-05, "loss": 0.8193, "step": 14279 }, { "epoch": 0.37608638398735844, "grad_norm": 1.7447534799575806, "learning_rate": 3.120753226231236e-05, "loss": 1.9299, "step": 14280 }, { "epoch": 0.3761127205688702, "grad_norm": 4.877032279968262, "learning_rate": 3.120621543323677e-05, "loss": 1.4584, "step": 14281 }, { "epoch": 0.3761390571503819, "grad_norm": 1.6478792428970337, "learning_rate": 3.120489860416118e-05, "loss": 1.7871, "step": 14282 }, { "epoch": 0.3761653937318936, "grad_norm": 4.124168872833252, "learning_rate": 3.120358177508559e-05, "loss": 1.8913, "step": 14283 }, { "epoch": 0.3761917303134053, "grad_norm": 1.914576530456543, "learning_rate": 3.120226494601001e-05, "loss": 1.765, "step": 14284 }, { "epoch": 0.37621806689491705, "grad_norm": 2.2469708919525146, 
"learning_rate": 3.120094811693442e-05, "loss": 1.6705, "step": 14285 }, { "epoch": 0.37624440347642873, "grad_norm": 2.927720308303833, "learning_rate": 3.119963128785884e-05, "loss": 0.6566, "step": 14286 }, { "epoch": 0.3762707400579405, "grad_norm": 2.779019355773926, "learning_rate": 3.1198314458783254e-05, "loss": 2.1069, "step": 14287 }, { "epoch": 0.3762970766394522, "grad_norm": 4.50929069519043, "learning_rate": 3.119699762970766e-05, "loss": 1.3856, "step": 14288 }, { "epoch": 0.3763234132209639, "grad_norm": 2.9545562267303467, "learning_rate": 3.1195680800632085e-05, "loss": 1.5467, "step": 14289 }, { "epoch": 0.37634974980247565, "grad_norm": 1.5907179117202759, "learning_rate": 3.1194363971556494e-05, "loss": 1.7183, "step": 14290 }, { "epoch": 0.37637608638398734, "grad_norm": 2.221656560897827, "learning_rate": 3.119304714248091e-05, "loss": 1.9421, "step": 14291 }, { "epoch": 0.3764024229654991, "grad_norm": 1.7442340850830078, "learning_rate": 3.119173031340532e-05, "loss": 1.963, "step": 14292 }, { "epoch": 0.37642875954701077, "grad_norm": 2.589993715286255, "learning_rate": 3.1190413484329734e-05, "loss": 1.234, "step": 14293 }, { "epoch": 0.3764550961285225, "grad_norm": 1.6990188360214233, "learning_rate": 3.118909665525415e-05, "loss": 1.8883, "step": 14294 }, { "epoch": 0.37648143271003426, "grad_norm": 2.229484796524048, "learning_rate": 3.1187779826178565e-05, "loss": 1.3622, "step": 14295 }, { "epoch": 0.37650776929154595, "grad_norm": 2.9781570434570312, "learning_rate": 3.118646299710298e-05, "loss": 1.5697, "step": 14296 }, { "epoch": 0.3765341058730577, "grad_norm": 2.28656268119812, "learning_rate": 3.118514616802739e-05, "loss": 1.4697, "step": 14297 }, { "epoch": 0.3765604424545694, "grad_norm": 2.038545608520508, "learning_rate": 3.1183829338951805e-05, "loss": 2.3618, "step": 14298 }, { "epoch": 0.3765867790360811, "grad_norm": 2.4138267040252686, "learning_rate": 3.118251250987622e-05, "loss": 1.2343, "step": 14299 }, { 
"epoch": 0.37661311561759286, "grad_norm": 2.261301040649414, "learning_rate": 3.1181195680800636e-05, "loss": 1.678, "step": 14300 }, { "epoch": 0.37663945219910455, "grad_norm": 2.581404685974121, "learning_rate": 3.1179878851725045e-05, "loss": 0.6939, "step": 14301 }, { "epoch": 0.3766657887806163, "grad_norm": 2.3161263465881348, "learning_rate": 3.117856202264946e-05, "loss": 1.53, "step": 14302 }, { "epoch": 0.376692125362128, "grad_norm": 4.823411464691162, "learning_rate": 3.1177245193573876e-05, "loss": 1.2247, "step": 14303 }, { "epoch": 0.3767184619436397, "grad_norm": 3.043858528137207, "learning_rate": 3.117592836449829e-05, "loss": 1.5187, "step": 14304 }, { "epoch": 0.3767447985251514, "grad_norm": 3.514554738998413, "learning_rate": 3.117461153542271e-05, "loss": 1.9775, "step": 14305 }, { "epoch": 0.37677113510666316, "grad_norm": 6.378372669219971, "learning_rate": 3.1173294706347116e-05, "loss": 1.3842, "step": 14306 }, { "epoch": 0.3767974716881749, "grad_norm": 1.59242844581604, "learning_rate": 3.117197787727153e-05, "loss": 1.8225, "step": 14307 }, { "epoch": 0.3768238082696866, "grad_norm": 3.1263887882232666, "learning_rate": 3.117066104819595e-05, "loss": 0.9102, "step": 14308 }, { "epoch": 0.37685014485119833, "grad_norm": 2.5310606956481934, "learning_rate": 3.116934421912036e-05, "loss": 1.5848, "step": 14309 }, { "epoch": 0.37687648143271, "grad_norm": 2.244361162185669, "learning_rate": 3.116802739004477e-05, "loss": 2.0851, "step": 14310 }, { "epoch": 0.37690281801422176, "grad_norm": 6.106942653656006, "learning_rate": 3.116671056096919e-05, "loss": 1.9758, "step": 14311 }, { "epoch": 0.37692915459573345, "grad_norm": 3.1216237545013428, "learning_rate": 3.11653937318936e-05, "loss": 1.7616, "step": 14312 }, { "epoch": 0.3769554911772452, "grad_norm": 1.9634357690811157, "learning_rate": 3.116407690281802e-05, "loss": 1.4297, "step": 14313 }, { "epoch": 0.37698182775875694, "grad_norm": 2.4202892780303955, "learning_rate": 
3.1162760073742434e-05, "loss": 1.1619, "step": 14314 }, { "epoch": 0.3770081643402686, "grad_norm": 5.1971821784973145, "learning_rate": 3.116144324466684e-05, "loss": 1.3166, "step": 14315 }, { "epoch": 0.37703450092178037, "grad_norm": 4.500625133514404, "learning_rate": 3.116012641559126e-05, "loss": 1.5447, "step": 14316 }, { "epoch": 0.37706083750329206, "grad_norm": 2.278379440307617, "learning_rate": 3.115880958651567e-05, "loss": 1.3583, "step": 14317 }, { "epoch": 0.3770871740848038, "grad_norm": 1.9895977973937988, "learning_rate": 3.115749275744009e-05, "loss": 2.1477, "step": 14318 }, { "epoch": 0.3771135106663155, "grad_norm": 2.3393542766571045, "learning_rate": 3.11561759283645e-05, "loss": 1.4648, "step": 14319 }, { "epoch": 0.37713984724782723, "grad_norm": 2.84989595413208, "learning_rate": 3.1154859099288914e-05, "loss": 1.08, "step": 14320 }, { "epoch": 0.377166183829339, "grad_norm": 2.0685625076293945, "learning_rate": 3.115354227021332e-05, "loss": 2.2206, "step": 14321 }, { "epoch": 0.37719252041085066, "grad_norm": 2.290341377258301, "learning_rate": 3.1152225441137745e-05, "loss": 1.2715, "step": 14322 }, { "epoch": 0.3772188569923624, "grad_norm": 2.8589868545532227, "learning_rate": 3.1150908612062154e-05, "loss": 1.4608, "step": 14323 }, { "epoch": 0.3772451935738741, "grad_norm": 3.002072334289551, "learning_rate": 3.114959178298657e-05, "loss": 1.0854, "step": 14324 }, { "epoch": 0.37727153015538584, "grad_norm": 1.778875470161438, "learning_rate": 3.1148274953910985e-05, "loss": 1.919, "step": 14325 }, { "epoch": 0.3772978667368975, "grad_norm": 3.146024465560913, "learning_rate": 3.1146958124835394e-05, "loss": 1.6025, "step": 14326 }, { "epoch": 0.37732420331840927, "grad_norm": 2.0379104614257812, "learning_rate": 3.1145641295759816e-05, "loss": 1.8401, "step": 14327 }, { "epoch": 0.377350539899921, "grad_norm": 5.192755222320557, "learning_rate": 3.1144324466684225e-05, "loss": 2.2354, "step": 14328 }, { "epoch": 
0.3773768764814327, "grad_norm": 3.909512758255005, "learning_rate": 3.114300763760864e-05, "loss": 1.8898, "step": 14329 }, { "epoch": 0.37740321306294444, "grad_norm": 3.0050933361053467, "learning_rate": 3.114169080853305e-05, "loss": 1.3986, "step": 14330 }, { "epoch": 0.37742954964445613, "grad_norm": 4.337152004241943, "learning_rate": 3.1140373979457465e-05, "loss": 1.3896, "step": 14331 }, { "epoch": 0.3774558862259679, "grad_norm": 3.046328067779541, "learning_rate": 3.113905715038188e-05, "loss": 1.6536, "step": 14332 }, { "epoch": 0.37748222280747956, "grad_norm": 1.9460607767105103, "learning_rate": 3.1137740321306297e-05, "loss": 0.8839, "step": 14333 }, { "epoch": 0.3775085593889913, "grad_norm": 1.8497570753097534, "learning_rate": 3.113642349223071e-05, "loss": 0.5827, "step": 14334 }, { "epoch": 0.37753489597050305, "grad_norm": 2.7690281867980957, "learning_rate": 3.113510666315512e-05, "loss": 2.3619, "step": 14335 }, { "epoch": 0.37756123255201474, "grad_norm": 3.2679927349090576, "learning_rate": 3.1133789834079537e-05, "loss": 2.3432, "step": 14336 }, { "epoch": 0.3775875691335265, "grad_norm": 2.757723808288574, "learning_rate": 3.113247300500395e-05, "loss": 1.5453, "step": 14337 }, { "epoch": 0.37761390571503817, "grad_norm": 1.9668294191360474, "learning_rate": 3.113115617592837e-05, "loss": 1.3436, "step": 14338 }, { "epoch": 0.3776402422965499, "grad_norm": 3.4913876056671143, "learning_rate": 3.1129839346852777e-05, "loss": 1.083, "step": 14339 }, { "epoch": 0.37766657887806165, "grad_norm": 1.5953645706176758, "learning_rate": 3.112852251777719e-05, "loss": 2.4503, "step": 14340 }, { "epoch": 0.37769291545957334, "grad_norm": 2.0951671600341797, "learning_rate": 3.112720568870161e-05, "loss": 2.0702, "step": 14341 }, { "epoch": 0.3777192520410851, "grad_norm": 2.076826333999634, "learning_rate": 3.112588885962602e-05, "loss": 1.8927, "step": 14342 }, { "epoch": 0.3777455886225968, "grad_norm": 3.5407419204711914, "learning_rate": 
3.112457203055044e-05, "loss": 1.0983, "step": 14343 }, { "epoch": 0.3777719252041085, "grad_norm": 2.486903667449951, "learning_rate": 3.112325520147485e-05, "loss": 1.7075, "step": 14344 }, { "epoch": 0.3777982617856202, "grad_norm": 3.300886392593384, "learning_rate": 3.112193837239926e-05, "loss": 1.1176, "step": 14345 }, { "epoch": 0.37782459836713195, "grad_norm": 3.4425394535064697, "learning_rate": 3.112062154332368e-05, "loss": 2.2989, "step": 14346 }, { "epoch": 0.3778509349486437, "grad_norm": 1.7959916591644287, "learning_rate": 3.1119304714248095e-05, "loss": 1.9963, "step": 14347 }, { "epoch": 0.3778772715301554, "grad_norm": 5.633258819580078, "learning_rate": 3.11179878851725e-05, "loss": 1.5917, "step": 14348 }, { "epoch": 0.3779036081116671, "grad_norm": 2.0160927772521973, "learning_rate": 3.111667105609692e-05, "loss": 1.7902, "step": 14349 }, { "epoch": 0.3779299446931788, "grad_norm": 5.685546398162842, "learning_rate": 3.1115354227021335e-05, "loss": 1.2652, "step": 14350 }, { "epoch": 0.37795628127469055, "grad_norm": 5.127215385437012, "learning_rate": 3.111403739794575e-05, "loss": 1.3402, "step": 14351 }, { "epoch": 0.37798261785620224, "grad_norm": 1.8516260385513306, "learning_rate": 3.1112720568870166e-05, "loss": 1.6599, "step": 14352 }, { "epoch": 0.378008954437714, "grad_norm": 1.600586175918579, "learning_rate": 3.1111403739794575e-05, "loss": 1.6299, "step": 14353 }, { "epoch": 0.3780352910192257, "grad_norm": 3.3044614791870117, "learning_rate": 3.111008691071899e-05, "loss": 0.5645, "step": 14354 }, { "epoch": 0.3780616276007374, "grad_norm": 2.66239333152771, "learning_rate": 3.1108770081643406e-05, "loss": 0.9753, "step": 14355 }, { "epoch": 0.37808796418224916, "grad_norm": 2.1020607948303223, "learning_rate": 3.110745325256782e-05, "loss": 1.621, "step": 14356 }, { "epoch": 0.37811430076376085, "grad_norm": 3.535782814025879, "learning_rate": 3.110613642349223e-05, "loss": 1.623, "step": 14357 }, { "epoch": 
0.3781406373452726, "grad_norm": 1.6616016626358032, "learning_rate": 3.1104819594416646e-05, "loss": 1.938, "step": 14358 }, { "epoch": 0.3781669739267843, "grad_norm": 2.5869176387786865, "learning_rate": 3.110350276534106e-05, "loss": 1.134, "step": 14359 }, { "epoch": 0.378193310508296, "grad_norm": 2.189210891723633, "learning_rate": 3.110218593626548e-05, "loss": 1.5386, "step": 14360 }, { "epoch": 0.37821964708980776, "grad_norm": 1.8407204151153564, "learning_rate": 3.110086910718989e-05, "loss": 1.9766, "step": 14361 }, { "epoch": 0.37824598367131945, "grad_norm": 2.904205799102783, "learning_rate": 3.10995522781143e-05, "loss": 1.7497, "step": 14362 }, { "epoch": 0.3782723202528312, "grad_norm": 2.136925220489502, "learning_rate": 3.109823544903872e-05, "loss": 1.4866, "step": 14363 }, { "epoch": 0.3782986568343429, "grad_norm": 3.7563652992248535, "learning_rate": 3.1096918619963126e-05, "loss": 2.0095, "step": 14364 }, { "epoch": 0.3783249934158546, "grad_norm": 1.619258999824524, "learning_rate": 3.109560179088755e-05, "loss": 1.599, "step": 14365 }, { "epoch": 0.3783513299973663, "grad_norm": 1.876483678817749, "learning_rate": 3.109428496181196e-05, "loss": 2.3886, "step": 14366 }, { "epoch": 0.37837766657887806, "grad_norm": 4.6059956550598145, "learning_rate": 3.109296813273637e-05, "loss": 1.3271, "step": 14367 }, { "epoch": 0.3784040031603898, "grad_norm": 2.8964126110076904, "learning_rate": 3.109165130366078e-05, "loss": 2.1777, "step": 14368 }, { "epoch": 0.3784303397419015, "grad_norm": 1.7897956371307373, "learning_rate": 3.10903344745852e-05, "loss": 1.8077, "step": 14369 }, { "epoch": 0.37845667632341323, "grad_norm": 2.401308298110962, "learning_rate": 3.108901764550961e-05, "loss": 1.1056, "step": 14370 }, { "epoch": 0.3784830129049249, "grad_norm": 3.4855611324310303, "learning_rate": 3.108770081643403e-05, "loss": 1.0769, "step": 14371 }, { "epoch": 0.37850934948643666, "grad_norm": 1.5533421039581299, "learning_rate": 
3.1086383987358444e-05, "loss": 1.8009, "step": 14372 }, { "epoch": 0.3785356860679484, "grad_norm": 2.0275983810424805, "learning_rate": 3.108506715828285e-05, "loss": 1.8764, "step": 14373 }, { "epoch": 0.3785620226494601, "grad_norm": 3.17368483543396, "learning_rate": 3.1083750329207275e-05, "loss": 1.5531, "step": 14374 }, { "epoch": 0.37858835923097184, "grad_norm": 1.7626057863235474, "learning_rate": 3.1082433500131684e-05, "loss": 1.8141, "step": 14375 }, { "epoch": 0.3786146958124835, "grad_norm": 2.963604211807251, "learning_rate": 3.10811166710561e-05, "loss": 2.0978, "step": 14376 }, { "epoch": 0.37864103239399527, "grad_norm": 1.7826277017593384, "learning_rate": 3.107979984198051e-05, "loss": 2.2907, "step": 14377 }, { "epoch": 0.37866736897550696, "grad_norm": 3.0602974891662598, "learning_rate": 3.1078483012904924e-05, "loss": 1.7572, "step": 14378 }, { "epoch": 0.3786937055570187, "grad_norm": 2.30256986618042, "learning_rate": 3.107716618382934e-05, "loss": 1.7281, "step": 14379 }, { "epoch": 0.37872004213853044, "grad_norm": 4.591120719909668, "learning_rate": 3.1075849354753755e-05, "loss": 1.9756, "step": 14380 }, { "epoch": 0.37874637872004213, "grad_norm": 1.236663818359375, "learning_rate": 3.107453252567817e-05, "loss": 0.272, "step": 14381 }, { "epoch": 0.3787727153015539, "grad_norm": 3.8244235515594482, "learning_rate": 3.107321569660258e-05, "loss": 1.4714, "step": 14382 }, { "epoch": 0.37879905188306556, "grad_norm": 1.9342507123947144, "learning_rate": 3.1071898867526995e-05, "loss": 2.2386, "step": 14383 }, { "epoch": 0.3788253884645773, "grad_norm": 2.342454433441162, "learning_rate": 3.107058203845141e-05, "loss": 1.7896, "step": 14384 }, { "epoch": 0.378851725046089, "grad_norm": 2.1913812160491943, "learning_rate": 3.1069265209375826e-05, "loss": 1.7999, "step": 14385 }, { "epoch": 0.37887806162760074, "grad_norm": 2.001167058944702, "learning_rate": 3.1067948380300235e-05, "loss": 0.3985, "step": 14386 }, { "epoch": 
0.3789043982091125, "grad_norm": 2.160733938217163, "learning_rate": 3.106663155122465e-05, "loss": 1.6194, "step": 14387 }, { "epoch": 0.37893073479062417, "grad_norm": 4.565048694610596, "learning_rate": 3.1065314722149066e-05, "loss": 1.1386, "step": 14388 }, { "epoch": 0.3789570713721359, "grad_norm": 2.2453153133392334, "learning_rate": 3.106399789307348e-05, "loss": 1.8115, "step": 14389 }, { "epoch": 0.3789834079536476, "grad_norm": 1.8231923580169678, "learning_rate": 3.10626810639979e-05, "loss": 1.8439, "step": 14390 }, { "epoch": 0.37900974453515934, "grad_norm": 1.9987602233886719, "learning_rate": 3.1061364234922306e-05, "loss": 0.4152, "step": 14391 }, { "epoch": 0.37903608111667103, "grad_norm": 1.613195538520813, "learning_rate": 3.106004740584672e-05, "loss": 1.7044, "step": 14392 }, { "epoch": 0.3790624176981828, "grad_norm": 2.655754327774048, "learning_rate": 3.105873057677114e-05, "loss": 1.3212, "step": 14393 }, { "epoch": 0.3790887542796945, "grad_norm": 1.6318649053573608, "learning_rate": 3.105741374769555e-05, "loss": 1.1494, "step": 14394 }, { "epoch": 0.3791150908612062, "grad_norm": 1.8283952474594116, "learning_rate": 3.105609691861996e-05, "loss": 1.6822, "step": 14395 }, { "epoch": 0.37914142744271795, "grad_norm": 1.5157963037490845, "learning_rate": 3.105478008954438e-05, "loss": 2.0481, "step": 14396 }, { "epoch": 0.37916776402422964, "grad_norm": 3.3179919719696045, "learning_rate": 3.105346326046879e-05, "loss": 0.7524, "step": 14397 }, { "epoch": 0.3791941006057414, "grad_norm": 2.909348964691162, "learning_rate": 3.105214643139321e-05, "loss": 0.9953, "step": 14398 }, { "epoch": 0.37922043718725307, "grad_norm": 2.466460943222046, "learning_rate": 3.1050829602317624e-05, "loss": 1.6126, "step": 14399 }, { "epoch": 0.3792467737687648, "grad_norm": 2.0761566162109375, "learning_rate": 3.104951277324203e-05, "loss": 1.0647, "step": 14400 }, { "epoch": 0.37927311035027655, "grad_norm": 4.008663177490234, "learning_rate": 
3.104819594416645e-05, "loss": 1.4261, "step": 14401 }, { "epoch": 0.37929944693178824, "grad_norm": 2.338958740234375, "learning_rate": 3.104687911509086e-05, "loss": 2.3638, "step": 14402 }, { "epoch": 0.3793257835133, "grad_norm": 2.0169806480407715, "learning_rate": 3.104556228601528e-05, "loss": 2.0514, "step": 14403 }, { "epoch": 0.3793521200948117, "grad_norm": 2.0204029083251953, "learning_rate": 3.104424545693969e-05, "loss": 2.676, "step": 14404 }, { "epoch": 0.3793784566763234, "grad_norm": 3.0137548446655273, "learning_rate": 3.1042928627864104e-05, "loss": 1.4679, "step": 14405 }, { "epoch": 0.37940479325783516, "grad_norm": 1.8132253885269165, "learning_rate": 3.104161179878852e-05, "loss": 1.8828, "step": 14406 }, { "epoch": 0.37943112983934685, "grad_norm": 1.7056853771209717, "learning_rate": 3.1040294969712936e-05, "loss": 1.9261, "step": 14407 }, { "epoch": 0.3794574664208586, "grad_norm": 2.4139211177825928, "learning_rate": 3.103897814063735e-05, "loss": 2.2971, "step": 14408 }, { "epoch": 0.3794838030023703, "grad_norm": 2.2264411449432373, "learning_rate": 3.103766131156176e-05, "loss": 2.0354, "step": 14409 }, { "epoch": 0.379510139583882, "grad_norm": 5.448110580444336, "learning_rate": 3.1036344482486176e-05, "loss": 1.9249, "step": 14410 }, { "epoch": 0.3795364761653937, "grad_norm": 1.6863667964935303, "learning_rate": 3.1035027653410584e-05, "loss": 2.158, "step": 14411 }, { "epoch": 0.37956281274690545, "grad_norm": 2.6604042053222656, "learning_rate": 3.103371082433501e-05, "loss": 1.4999, "step": 14412 }, { "epoch": 0.3795891493284172, "grad_norm": 3.0340991020202637, "learning_rate": 3.1032393995259416e-05, "loss": 1.2997, "step": 14413 }, { "epoch": 0.3796154859099289, "grad_norm": 3.5236711502075195, "learning_rate": 3.103107716618383e-05, "loss": 0.4889, "step": 14414 }, { "epoch": 0.37964182249144063, "grad_norm": 2.264178514480591, "learning_rate": 3.102976033710825e-05, "loss": 1.979, "step": 14415 }, { "epoch": 
0.3796681590729523, "grad_norm": 3.9308578968048096, "learning_rate": 3.1028443508032656e-05, "loss": 1.9012, "step": 14416 }, { "epoch": 0.37969449565446406, "grad_norm": 2.6817424297332764, "learning_rate": 3.102712667895708e-05, "loss": 2.2367, "step": 14417 }, { "epoch": 0.37972083223597575, "grad_norm": 3.8696348667144775, "learning_rate": 3.102580984988149e-05, "loss": 1.5145, "step": 14418 }, { "epoch": 0.3797471688174875, "grad_norm": 1.4997214078903198, "learning_rate": 3.10244930208059e-05, "loss": 2.2239, "step": 14419 }, { "epoch": 0.37977350539899923, "grad_norm": 3.1607394218444824, "learning_rate": 3.102317619173031e-05, "loss": 1.5356, "step": 14420 }, { "epoch": 0.3797998419805109, "grad_norm": 1.7745493650436401, "learning_rate": 3.1021859362654734e-05, "loss": 1.7118, "step": 14421 }, { "epoch": 0.37982617856202266, "grad_norm": 2.050464630126953, "learning_rate": 3.102054253357914e-05, "loss": 1.5729, "step": 14422 }, { "epoch": 0.37985251514353435, "grad_norm": 3.483598470687866, "learning_rate": 3.101922570450356e-05, "loss": 1.9902, "step": 14423 }, { "epoch": 0.3798788517250461, "grad_norm": 3.1781907081604004, "learning_rate": 3.101790887542797e-05, "loss": 2.0966, "step": 14424 }, { "epoch": 0.3799051883065578, "grad_norm": 2.0831942558288574, "learning_rate": 3.101659204635238e-05, "loss": 2.1314, "step": 14425 }, { "epoch": 0.3799315248880695, "grad_norm": 2.6782853603363037, "learning_rate": 3.10152752172768e-05, "loss": 1.7379, "step": 14426 }, { "epoch": 0.37995786146958127, "grad_norm": 4.894845485687256, "learning_rate": 3.1013958388201214e-05, "loss": 1.8873, "step": 14427 }, { "epoch": 0.37998419805109296, "grad_norm": 1.956839919090271, "learning_rate": 3.101264155912563e-05, "loss": 1.4455, "step": 14428 }, { "epoch": 0.3800105346326047, "grad_norm": 4.308040142059326, "learning_rate": 3.101132473005004e-05, "loss": 2.4415, "step": 14429 }, { "epoch": 0.3800368712141164, "grad_norm": 2.0065255165100098, "learning_rate": 
3.1010007900974454e-05, "loss": 1.9858, "step": 14430 }, { "epoch": 0.38006320779562813, "grad_norm": 2.4225807189941406, "learning_rate": 3.100869107189887e-05, "loss": 1.9079, "step": 14431 }, { "epoch": 0.3800895443771398, "grad_norm": 3.417452335357666, "learning_rate": 3.1007374242823285e-05, "loss": 1.0014, "step": 14432 }, { "epoch": 0.38011588095865156, "grad_norm": 2.403873920440674, "learning_rate": 3.1006057413747694e-05, "loss": 0.5602, "step": 14433 }, { "epoch": 0.3801422175401633, "grad_norm": 1.9683886766433716, "learning_rate": 3.100474058467211e-05, "loss": 1.8379, "step": 14434 }, { "epoch": 0.380168554121675, "grad_norm": 1.9898672103881836, "learning_rate": 3.1003423755596525e-05, "loss": 1.2432, "step": 14435 }, { "epoch": 0.38019489070318674, "grad_norm": 2.5677411556243896, "learning_rate": 3.100210692652094e-05, "loss": 1.6397, "step": 14436 }, { "epoch": 0.3802212272846984, "grad_norm": 4.034700393676758, "learning_rate": 3.1000790097445356e-05, "loss": 1.8715, "step": 14437 }, { "epoch": 0.38024756386621017, "grad_norm": 1.8698471784591675, "learning_rate": 3.0999473268369765e-05, "loss": 2.2374, "step": 14438 }, { "epoch": 0.3802739004477219, "grad_norm": 2.496015787124634, "learning_rate": 3.099815643929418e-05, "loss": 1.8703, "step": 14439 }, { "epoch": 0.3803002370292336, "grad_norm": 2.114288568496704, "learning_rate": 3.0996839610218596e-05, "loss": 1.7906, "step": 14440 }, { "epoch": 0.38032657361074534, "grad_norm": 1.9222286939620972, "learning_rate": 3.099552278114301e-05, "loss": 2.1159, "step": 14441 }, { "epoch": 0.38035291019225703, "grad_norm": 2.053709030151367, "learning_rate": 3.099420595206742e-05, "loss": 2.1609, "step": 14442 }, { "epoch": 0.3803792467737688, "grad_norm": 1.8872405290603638, "learning_rate": 3.0992889122991836e-05, "loss": 1.2362, "step": 14443 }, { "epoch": 0.38040558335528046, "grad_norm": 2.4812612533569336, "learning_rate": 3.099157229391625e-05, "loss": 2.1593, "step": 14444 }, { "epoch": 
0.3804319199367922, "grad_norm": 3.93999981880188, "learning_rate": 3.099025546484067e-05, "loss": 1.0586, "step": 14445 }, { "epoch": 0.38045825651830395, "grad_norm": 1.6276928186416626, "learning_rate": 3.098893863576508e-05, "loss": 1.3704, "step": 14446 }, { "epoch": 0.38048459309981564, "grad_norm": 3.938692331314087, "learning_rate": 3.098762180668949e-05, "loss": 1.4437, "step": 14447 }, { "epoch": 0.3805109296813274, "grad_norm": 1.9843577146530151, "learning_rate": 3.098630497761391e-05, "loss": 1.7513, "step": 14448 }, { "epoch": 0.38053726626283907, "grad_norm": 2.6324470043182373, "learning_rate": 3.0984988148538316e-05, "loss": 1.0439, "step": 14449 }, { "epoch": 0.3805636028443508, "grad_norm": 4.131014347076416, "learning_rate": 3.098367131946274e-05, "loss": 1.7271, "step": 14450 }, { "epoch": 0.3805899394258625, "grad_norm": 1.9271693229675293, "learning_rate": 3.098235449038715e-05, "loss": 2.5178, "step": 14451 }, { "epoch": 0.38061627600737424, "grad_norm": 1.9114900827407837, "learning_rate": 3.098103766131156e-05, "loss": 2.3694, "step": 14452 }, { "epoch": 0.380642612588886, "grad_norm": 2.3361005783081055, "learning_rate": 3.097972083223598e-05, "loss": 2.1915, "step": 14453 }, { "epoch": 0.3806689491703977, "grad_norm": 2.273688554763794, "learning_rate": 3.0978404003160394e-05, "loss": 0.9563, "step": 14454 }, { "epoch": 0.3806952857519094, "grad_norm": 1.5529550313949585, "learning_rate": 3.097708717408481e-05, "loss": 1.7367, "step": 14455 }, { "epoch": 0.3807216223334211, "grad_norm": 2.8191428184509277, "learning_rate": 3.097577034500922e-05, "loss": 1.2118, "step": 14456 }, { "epoch": 0.38074795891493285, "grad_norm": 1.668314814567566, "learning_rate": 3.0974453515933634e-05, "loss": 1.6507, "step": 14457 }, { "epoch": 0.38077429549644454, "grad_norm": 2.7240145206451416, "learning_rate": 3.097313668685804e-05, "loss": 1.5051, "step": 14458 }, { "epoch": 0.3808006320779563, "grad_norm": 3.0239665508270264, "learning_rate": 
3.0971819857782465e-05, "loss": 1.5282, "step": 14459 }, { "epoch": 0.380826968659468, "grad_norm": 2.5275745391845703, "learning_rate": 3.0970503028706874e-05, "loss": 1.6858, "step": 14460 }, { "epoch": 0.3808533052409797, "grad_norm": 2.205512762069702, "learning_rate": 3.096918619963129e-05, "loss": 1.484, "step": 14461 }, { "epoch": 0.38087964182249145, "grad_norm": 4.681382179260254, "learning_rate": 3.0967869370555705e-05, "loss": 1.3491, "step": 14462 }, { "epoch": 0.38090597840400314, "grad_norm": 2.1899831295013428, "learning_rate": 3.0966552541480114e-05, "loss": 1.7056, "step": 14463 }, { "epoch": 0.3809323149855149, "grad_norm": 9.964446067810059, "learning_rate": 3.0965235712404537e-05, "loss": 1.6069, "step": 14464 }, { "epoch": 0.3809586515670266, "grad_norm": 1.5020250082015991, "learning_rate": 3.0963918883328945e-05, "loss": 1.6391, "step": 14465 }, { "epoch": 0.3809849881485383, "grad_norm": 1.814849615097046, "learning_rate": 3.096260205425336e-05, "loss": 2.075, "step": 14466 }, { "epoch": 0.38101132473005006, "grad_norm": 2.910660743713379, "learning_rate": 3.096128522517777e-05, "loss": 0.4715, "step": 14467 }, { "epoch": 0.38103766131156175, "grad_norm": 2.3053295612335205, "learning_rate": 3.0959968396102185e-05, "loss": 1.8975, "step": 14468 }, { "epoch": 0.3810639978930735, "grad_norm": 3.6511478424072266, "learning_rate": 3.09586515670266e-05, "loss": 0.8469, "step": 14469 }, { "epoch": 0.3810903344745852, "grad_norm": 5.193423748016357, "learning_rate": 3.0957334737951017e-05, "loss": 2.223, "step": 14470 }, { "epoch": 0.3811166710560969, "grad_norm": 2.920355796813965, "learning_rate": 3.0956017908875425e-05, "loss": 1.4857, "step": 14471 }, { "epoch": 0.3811430076376086, "grad_norm": 2.167603015899658, "learning_rate": 3.095470107979984e-05, "loss": 1.2876, "step": 14472 }, { "epoch": 0.38116934421912035, "grad_norm": 2.236863851547241, "learning_rate": 3.0953384250724257e-05, "loss": 2.157, "step": 14473 }, { "epoch": 
0.3811956808006321, "grad_norm": 1.8141050338745117, "learning_rate": 3.095206742164867e-05, "loss": 2.1034, "step": 14474 }, { "epoch": 0.3812220173821438, "grad_norm": 2.4759411811828613, "learning_rate": 3.095075059257309e-05, "loss": 1.2742, "step": 14475 }, { "epoch": 0.38124835396365553, "grad_norm": 2.4107723236083984, "learning_rate": 3.0949433763497497e-05, "loss": 1.7625, "step": 14476 }, { "epoch": 0.3812746905451672, "grad_norm": 2.3482730388641357, "learning_rate": 3.094811693442191e-05, "loss": 0.7068, "step": 14477 }, { "epoch": 0.38130102712667896, "grad_norm": 2.733046770095825, "learning_rate": 3.094680010534633e-05, "loss": 2.1464, "step": 14478 }, { "epoch": 0.3813273637081907, "grad_norm": 1.857745885848999, "learning_rate": 3.0945483276270743e-05, "loss": 1.406, "step": 14479 }, { "epoch": 0.3813537002897024, "grad_norm": 1.3975458145141602, "learning_rate": 3.094416644719515e-05, "loss": 2.0023, "step": 14480 }, { "epoch": 0.38138003687121413, "grad_norm": 3.0195488929748535, "learning_rate": 3.094284961811957e-05, "loss": 0.9969, "step": 14481 }, { "epoch": 0.3814063734527258, "grad_norm": 2.890155076980591, "learning_rate": 3.0941532789043983e-05, "loss": 0.9167, "step": 14482 }, { "epoch": 0.38143271003423757, "grad_norm": 2.8606345653533936, "learning_rate": 3.09402159599684e-05, "loss": 1.456, "step": 14483 }, { "epoch": 0.38145904661574925, "grad_norm": 1.6089520454406738, "learning_rate": 3.0938899130892815e-05, "loss": 2.4486, "step": 14484 }, { "epoch": 0.381485383197261, "grad_norm": 2.385244846343994, "learning_rate": 3.0937582301817223e-05, "loss": 0.2576, "step": 14485 }, { "epoch": 0.38151171977877274, "grad_norm": 2.513944625854492, "learning_rate": 3.093626547274164e-05, "loss": 1.2196, "step": 14486 }, { "epoch": 0.3815380563602844, "grad_norm": 4.3541364669799805, "learning_rate": 3.0934948643666055e-05, "loss": 2.4994, "step": 14487 }, { "epoch": 0.38156439294179617, "grad_norm": 2.828737258911133, "learning_rate": 
3.093363181459047e-05, "loss": 1.1445, "step": 14488 }, { "epoch": 0.38159072952330786, "grad_norm": 1.66840660572052, "learning_rate": 3.093231498551488e-05, "loss": 1.6256, "step": 14489 }, { "epoch": 0.3816170661048196, "grad_norm": 5.299621105194092, "learning_rate": 3.0930998156439295e-05, "loss": 2.0904, "step": 14490 }, { "epoch": 0.3816434026863313, "grad_norm": 1.7058268785476685, "learning_rate": 3.092968132736371e-05, "loss": 1.6761, "step": 14491 }, { "epoch": 0.38166973926784303, "grad_norm": 2.0077342987060547, "learning_rate": 3.0928364498288126e-05, "loss": 2.0041, "step": 14492 }, { "epoch": 0.3816960758493548, "grad_norm": 1.6694526672363281, "learning_rate": 3.092704766921254e-05, "loss": 2.3546, "step": 14493 }, { "epoch": 0.38172241243086646, "grad_norm": 2.921759605407715, "learning_rate": 3.092573084013695e-05, "loss": 1.4905, "step": 14494 }, { "epoch": 0.3817487490123782, "grad_norm": 1.6434262990951538, "learning_rate": 3.0924414011061366e-05, "loss": 1.5862, "step": 14495 }, { "epoch": 0.3817750855938899, "grad_norm": 4.454122543334961, "learning_rate": 3.0923097181985775e-05, "loss": 1.3625, "step": 14496 }, { "epoch": 0.38180142217540164, "grad_norm": 1.860121726989746, "learning_rate": 3.09217803529102e-05, "loss": 0.3075, "step": 14497 }, { "epoch": 0.3818277587569133, "grad_norm": 1.612991213798523, "learning_rate": 3.0920463523834606e-05, "loss": 1.9426, "step": 14498 }, { "epoch": 0.38185409533842507, "grad_norm": 2.6928908824920654, "learning_rate": 3.091914669475902e-05, "loss": 1.6539, "step": 14499 }, { "epoch": 0.3818804319199368, "grad_norm": 1.951027274131775, "learning_rate": 3.091782986568344e-05, "loss": 1.8509, "step": 14500 }, { "epoch": 0.3819067685014485, "grad_norm": 5.054205894470215, "learning_rate": 3.0916513036607846e-05, "loss": 0.9297, "step": 14501 }, { "epoch": 0.38193310508296024, "grad_norm": 2.7478649616241455, "learning_rate": 3.091519620753227e-05, "loss": 2.3899, "step": 14502 }, { "epoch": 
0.38195944166447193, "grad_norm": 2.4717133045196533, "learning_rate": 3.091387937845668e-05, "loss": 1.8558, "step": 14503 }, { "epoch": 0.3819857782459837, "grad_norm": 2.6459312438964844, "learning_rate": 3.091256254938109e-05, "loss": 1.8517, "step": 14504 }, { "epoch": 0.38201211482749536, "grad_norm": 2.268254041671753, "learning_rate": 3.09112457203055e-05, "loss": 2.0417, "step": 14505 }, { "epoch": 0.3820384514090071, "grad_norm": 2.6595206260681152, "learning_rate": 3.0909928891229924e-05, "loss": 0.627, "step": 14506 }, { "epoch": 0.38206478799051885, "grad_norm": 1.9094496965408325, "learning_rate": 3.090861206215433e-05, "loss": 1.8996, "step": 14507 }, { "epoch": 0.38209112457203054, "grad_norm": 3.0797224044799805, "learning_rate": 3.090729523307875e-05, "loss": 2.0929, "step": 14508 }, { "epoch": 0.3821174611535423, "grad_norm": 2.0068109035491943, "learning_rate": 3.0905978404003164e-05, "loss": 0.379, "step": 14509 }, { "epoch": 0.38214379773505397, "grad_norm": 5.691693305969238, "learning_rate": 3.090466157492757e-05, "loss": 2.0054, "step": 14510 }, { "epoch": 0.3821701343165657, "grad_norm": 2.9892454147338867, "learning_rate": 3.0903344745851995e-05, "loss": 0.8063, "step": 14511 }, { "epoch": 0.38219647089807746, "grad_norm": 2.283273220062256, "learning_rate": 3.0902027916776404e-05, "loss": 0.6373, "step": 14512 }, { "epoch": 0.38222280747958914, "grad_norm": 1.8923076391220093, "learning_rate": 3.090071108770082e-05, "loss": 1.8576, "step": 14513 }, { "epoch": 0.3822491440611009, "grad_norm": 1.8745142221450806, "learning_rate": 3.089939425862523e-05, "loss": 1.583, "step": 14514 }, { "epoch": 0.3822754806426126, "grad_norm": 1.8707177639007568, "learning_rate": 3.0898077429549644e-05, "loss": 2.426, "step": 14515 }, { "epoch": 0.3823018172241243, "grad_norm": 4.815286636352539, "learning_rate": 3.089676060047406e-05, "loss": 1.4576, "step": 14516 }, { "epoch": 0.382328153805636, "grad_norm": 1.6012474298477173, "learning_rate": 
3.0895443771398475e-05, "loss": 2.2897, "step": 14517 }, { "epoch": 0.38235449038714775, "grad_norm": 2.140408515930176, "learning_rate": 3.089412694232289e-05, "loss": 2.4381, "step": 14518 }, { "epoch": 0.3823808269686595, "grad_norm": 2.8224799633026123, "learning_rate": 3.08928101132473e-05, "loss": 1.1844, "step": 14519 }, { "epoch": 0.3824071635501712, "grad_norm": 1.8939311504364014, "learning_rate": 3.089149328417172e-05, "loss": 1.8609, "step": 14520 }, { "epoch": 0.3824335001316829, "grad_norm": 1.5587173700332642, "learning_rate": 3.089017645509613e-05, "loss": 1.1591, "step": 14521 }, { "epoch": 0.3824598367131946, "grad_norm": 1.8770238161087036, "learning_rate": 3.0888859626020546e-05, "loss": 2.3402, "step": 14522 }, { "epoch": 0.38248617329470636, "grad_norm": 2.0927324295043945, "learning_rate": 3.0887542796944955e-05, "loss": 1.9234, "step": 14523 }, { "epoch": 0.38251250987621804, "grad_norm": 3.052306890487671, "learning_rate": 3.088622596786937e-05, "loss": 1.9123, "step": 14524 }, { "epoch": 0.3825388464577298, "grad_norm": 1.836662769317627, "learning_rate": 3.0884909138793786e-05, "loss": 1.8598, "step": 14525 }, { "epoch": 0.38256518303924153, "grad_norm": 1.8895927667617798, "learning_rate": 3.08835923097182e-05, "loss": 1.899, "step": 14526 }, { "epoch": 0.3825915196207532, "grad_norm": 1.8681902885437012, "learning_rate": 3.088227548064261e-05, "loss": 1.6546, "step": 14527 }, { "epoch": 0.38261785620226496, "grad_norm": 5.182409286499023, "learning_rate": 3.0880958651567026e-05, "loss": 1.2162, "step": 14528 }, { "epoch": 0.38264419278377665, "grad_norm": 1.9442241191864014, "learning_rate": 3.087964182249144e-05, "loss": 1.8905, "step": 14529 }, { "epoch": 0.3826705293652884, "grad_norm": 2.077543020248413, "learning_rate": 3.087832499341586e-05, "loss": 2.0442, "step": 14530 }, { "epoch": 0.3826968659468001, "grad_norm": 1.8882946968078613, "learning_rate": 3.087700816434027e-05, "loss": 1.992, "step": 14531 }, { "epoch": 
0.3827232025283118, "grad_norm": 1.9640634059906006, "learning_rate": 3.087569133526468e-05, "loss": 1.8207, "step": 14532 }, { "epoch": 0.38274953910982357, "grad_norm": 1.5785528421401978, "learning_rate": 3.08743745061891e-05, "loss": 2.1099, "step": 14533 }, { "epoch": 0.38277587569133525, "grad_norm": 2.320997714996338, "learning_rate": 3.0873057677113506e-05, "loss": 1.9014, "step": 14534 }, { "epoch": 0.382802212272847, "grad_norm": 4.660298824310303, "learning_rate": 3.087174084803793e-05, "loss": 0.8468, "step": 14535 }, { "epoch": 0.3828285488543587, "grad_norm": 3.4066030979156494, "learning_rate": 3.087042401896234e-05, "loss": 2.2176, "step": 14536 }, { "epoch": 0.38285488543587043, "grad_norm": 2.178804874420166, "learning_rate": 3.086910718988675e-05, "loss": 2.3646, "step": 14537 }, { "epoch": 0.3828812220173821, "grad_norm": 3.9844250679016113, "learning_rate": 3.086779036081117e-05, "loss": 1.2117, "step": 14538 }, { "epoch": 0.38290755859889386, "grad_norm": 1.9921941757202148, "learning_rate": 3.0866473531735584e-05, "loss": 2.7739, "step": 14539 }, { "epoch": 0.3829338951804056, "grad_norm": 1.6976536512374878, "learning_rate": 3.086515670266e-05, "loss": 1.8513, "step": 14540 }, { "epoch": 0.3829602317619173, "grad_norm": 1.7839652299880981, "learning_rate": 3.086383987358441e-05, "loss": 1.8555, "step": 14541 }, { "epoch": 0.38298656834342903, "grad_norm": 1.7729908227920532, "learning_rate": 3.0862523044508824e-05, "loss": 1.3811, "step": 14542 }, { "epoch": 0.3830129049249407, "grad_norm": 1.4872736930847168, "learning_rate": 3.086120621543323e-05, "loss": 1.8093, "step": 14543 }, { "epoch": 0.38303924150645247, "grad_norm": 2.047466278076172, "learning_rate": 3.0859889386357656e-05, "loss": 1.0088, "step": 14544 }, { "epoch": 0.3830655780879642, "grad_norm": 1.874760389328003, "learning_rate": 3.0858572557282064e-05, "loss": 1.8577, "step": 14545 }, { "epoch": 0.3830919146694759, "grad_norm": 2.512969970703125, "learning_rate": 
3.085725572820648e-05, "loss": 1.8125, "step": 14546 }, { "epoch": 0.38311825125098764, "grad_norm": 1.4420921802520752, "learning_rate": 3.0855938899130896e-05, "loss": 1.9235, "step": 14547 }, { "epoch": 0.38314458783249933, "grad_norm": 3.0532658100128174, "learning_rate": 3.0854622070055304e-05, "loss": 0.887, "step": 14548 }, { "epoch": 0.38317092441401107, "grad_norm": 3.250452756881714, "learning_rate": 3.085330524097973e-05, "loss": 0.9619, "step": 14549 }, { "epoch": 0.38319726099552276, "grad_norm": 3.1377077102661133, "learning_rate": 3.0851988411904136e-05, "loss": 1.3583, "step": 14550 }, { "epoch": 0.3832235975770345, "grad_norm": 2.734541416168213, "learning_rate": 3.085067158282855e-05, "loss": 0.7901, "step": 14551 }, { "epoch": 0.38324993415854625, "grad_norm": 2.777575731277466, "learning_rate": 3.084935475375296e-05, "loss": 1.5313, "step": 14552 }, { "epoch": 0.38327627074005793, "grad_norm": 3.8294436931610107, "learning_rate": 3.084803792467738e-05, "loss": 1.7996, "step": 14553 }, { "epoch": 0.3833026073215697, "grad_norm": 2.3994832038879395, "learning_rate": 3.084672109560179e-05, "loss": 2.3727, "step": 14554 }, { "epoch": 0.38332894390308137, "grad_norm": 1.6705495119094849, "learning_rate": 3.084540426652621e-05, "loss": 1.6402, "step": 14555 }, { "epoch": 0.3833552804845931, "grad_norm": 3.5592119693756104, "learning_rate": 3.084408743745062e-05, "loss": 0.967, "step": 14556 }, { "epoch": 0.3833816170661048, "grad_norm": 1.7447296380996704, "learning_rate": 3.084277060837503e-05, "loss": 1.5127, "step": 14557 }, { "epoch": 0.38340795364761654, "grad_norm": 2.0983502864837646, "learning_rate": 3.0841453779299454e-05, "loss": 2.4396, "step": 14558 }, { "epoch": 0.3834342902291283, "grad_norm": 1.859695553779602, "learning_rate": 3.084013695022386e-05, "loss": 2.2114, "step": 14559 }, { "epoch": 0.38346062681063997, "grad_norm": 1.955888032913208, "learning_rate": 3.083882012114828e-05, "loss": 1.519, "step": 14560 }, { "epoch": 
0.3834869633921517, "grad_norm": 4.932207107543945, "learning_rate": 3.083750329207269e-05, "loss": 1.5375, "step": 14561 }, { "epoch": 0.3835132999736634, "grad_norm": 3.0014209747314453, "learning_rate": 3.08361864629971e-05, "loss": 1.5491, "step": 14562 }, { "epoch": 0.38353963655517515, "grad_norm": 1.3994399309158325, "learning_rate": 3.083486963392152e-05, "loss": 1.6835, "step": 14563 }, { "epoch": 0.38356597313668683, "grad_norm": 2.186720371246338, "learning_rate": 3.0833552804845934e-05, "loss": 1.7745, "step": 14564 }, { "epoch": 0.3835923097181986, "grad_norm": 1.9380356073379517, "learning_rate": 3.083223597577035e-05, "loss": 1.8975, "step": 14565 }, { "epoch": 0.3836186462997103, "grad_norm": 3.60066819190979, "learning_rate": 3.083091914669476e-05, "loss": 1.1451, "step": 14566 }, { "epoch": 0.383644982881222, "grad_norm": 1.6131770610809326, "learning_rate": 3.082960231761918e-05, "loss": 1.4917, "step": 14567 }, { "epoch": 0.38367131946273375, "grad_norm": 2.484441041946411, "learning_rate": 3.082828548854359e-05, "loss": 0.8565, "step": 14568 }, { "epoch": 0.38369765604424544, "grad_norm": 1.8261693716049194, "learning_rate": 3.0826968659468005e-05, "loss": 1.2457, "step": 14569 }, { "epoch": 0.3837239926257572, "grad_norm": 2.0360586643218994, "learning_rate": 3.0825651830392414e-05, "loss": 1.4147, "step": 14570 }, { "epoch": 0.38375032920726887, "grad_norm": 4.179394245147705, "learning_rate": 3.082433500131683e-05, "loss": 1.6205, "step": 14571 }, { "epoch": 0.3837766657887806, "grad_norm": 3.0943753719329834, "learning_rate": 3.0823018172241245e-05, "loss": 1.467, "step": 14572 }, { "epoch": 0.38380300237029236, "grad_norm": 2.677536725997925, "learning_rate": 3.082170134316566e-05, "loss": 1.583, "step": 14573 }, { "epoch": 0.38382933895180404, "grad_norm": 2.0822668075561523, "learning_rate": 3.082038451409007e-05, "loss": 1.9005, "step": 14574 }, { "epoch": 0.3838556755333158, "grad_norm": 6.708086967468262, "learning_rate": 
3.0819067685014485e-05, "loss": 1.8824, "step": 14575 }, { "epoch": 0.3838820121148275, "grad_norm": 2.0805656909942627, "learning_rate": 3.08177508559389e-05, "loss": 1.5773, "step": 14576 }, { "epoch": 0.3839083486963392, "grad_norm": 2.1125094890594482, "learning_rate": 3.0816434026863316e-05, "loss": 1.4737, "step": 14577 }, { "epoch": 0.38393468527785096, "grad_norm": 1.682759404182434, "learning_rate": 3.081511719778773e-05, "loss": 1.7282, "step": 14578 }, { "epoch": 0.38396102185936265, "grad_norm": 2.025913715362549, "learning_rate": 3.081380036871214e-05, "loss": 1.4737, "step": 14579 }, { "epoch": 0.3839873584408744, "grad_norm": 4.8360795974731445, "learning_rate": 3.0812483539636556e-05, "loss": 1.2927, "step": 14580 }, { "epoch": 0.3840136950223861, "grad_norm": 1.5795741081237793, "learning_rate": 3.0811166710560965e-05, "loss": 1.8777, "step": 14581 }, { "epoch": 0.3840400316038978, "grad_norm": 3.2506656646728516, "learning_rate": 3.080984988148539e-05, "loss": 0.5757, "step": 14582 }, { "epoch": 0.3840663681854095, "grad_norm": 1.514147162437439, "learning_rate": 3.0808533052409796e-05, "loss": 2.1856, "step": 14583 }, { "epoch": 0.38409270476692126, "grad_norm": 2.3269248008728027, "learning_rate": 3.080721622333421e-05, "loss": 1.1516, "step": 14584 }, { "epoch": 0.384119041348433, "grad_norm": 3.3791117668151855, "learning_rate": 3.080589939425863e-05, "loss": 1.2936, "step": 14585 }, { "epoch": 0.3841453779299447, "grad_norm": 2.661522626876831, "learning_rate": 3.080458256518304e-05, "loss": 2.1369, "step": 14586 }, { "epoch": 0.38417171451145643, "grad_norm": 2.0250821113586426, "learning_rate": 3.080326573610746e-05, "loss": 1.6623, "step": 14587 }, { "epoch": 0.3841980510929681, "grad_norm": 3.6315219402313232, "learning_rate": 3.080194890703187e-05, "loss": 0.7954, "step": 14588 }, { "epoch": 0.38422438767447986, "grad_norm": 3.226318597793579, "learning_rate": 3.080063207795628e-05, "loss": 1.4729, "step": 14589 }, { "epoch": 
0.38425072425599155, "grad_norm": 2.603069543838501, "learning_rate": 3.079931524888069e-05, "loss": 1.193, "step": 14590 }, { "epoch": 0.3842770608375033, "grad_norm": 1.8471455574035645, "learning_rate": 3.0797998419805114e-05, "loss": 1.6258, "step": 14591 }, { "epoch": 0.38430339741901504, "grad_norm": 3.5210866928100586, "learning_rate": 3.079668159072952e-05, "loss": 2.4627, "step": 14592 }, { "epoch": 0.3843297340005267, "grad_norm": 3.6657567024230957, "learning_rate": 3.079536476165394e-05, "loss": 1.4888, "step": 14593 }, { "epoch": 0.38435607058203847, "grad_norm": 1.986794352531433, "learning_rate": 3.0794047932578354e-05, "loss": 1.442, "step": 14594 }, { "epoch": 0.38438240716355015, "grad_norm": 1.9501502513885498, "learning_rate": 3.079273110350276e-05, "loss": 2.0969, "step": 14595 }, { "epoch": 0.3844087437450619, "grad_norm": 2.129408121109009, "learning_rate": 3.0791414274427185e-05, "loss": 0.4437, "step": 14596 }, { "epoch": 0.3844350803265736, "grad_norm": 1.6300894021987915, "learning_rate": 3.0790097445351594e-05, "loss": 0.6411, "step": 14597 }, { "epoch": 0.38446141690808533, "grad_norm": 1.8381468057632446, "learning_rate": 3.078878061627601e-05, "loss": 1.7314, "step": 14598 }, { "epoch": 0.3844877534895971, "grad_norm": 1.5415160655975342, "learning_rate": 3.078746378720042e-05, "loss": 1.8967, "step": 14599 }, { "epoch": 0.38451409007110876, "grad_norm": 3.338806629180908, "learning_rate": 3.078614695812484e-05, "loss": 1.8835, "step": 14600 }, { "epoch": 0.3845404266526205, "grad_norm": 3.8954193592071533, "learning_rate": 3.078483012904925e-05, "loss": 1.7467, "step": 14601 }, { "epoch": 0.3845667632341322, "grad_norm": 4.459294319152832, "learning_rate": 3.0783513299973665e-05, "loss": 0.528, "step": 14602 }, { "epoch": 0.38459309981564394, "grad_norm": 1.9710445404052734, "learning_rate": 3.078219647089808e-05, "loss": 2.3077, "step": 14603 }, { "epoch": 0.3846194363971556, "grad_norm": 3.190796375274658, "learning_rate": 
3.078087964182249e-05, "loss": 1.3843, "step": 14604 }, { "epoch": 0.38464577297866737, "grad_norm": 1.8104718923568726, "learning_rate": 3.077956281274691e-05, "loss": 1.4553, "step": 14605 }, { "epoch": 0.3846721095601791, "grad_norm": 4.741323947906494, "learning_rate": 3.077824598367132e-05, "loss": 1.6188, "step": 14606 }, { "epoch": 0.3846984461416908, "grad_norm": 1.5914028882980347, "learning_rate": 3.077692915459574e-05, "loss": 1.8913, "step": 14607 }, { "epoch": 0.38472478272320254, "grad_norm": 3.255990982055664, "learning_rate": 3.0775612325520145e-05, "loss": 0.7662, "step": 14608 }, { "epoch": 0.38475111930471423, "grad_norm": 2.700408697128296, "learning_rate": 3.077429549644456e-05, "loss": 2.073, "step": 14609 }, { "epoch": 0.38477745588622597, "grad_norm": 2.3465452194213867, "learning_rate": 3.077297866736898e-05, "loss": 1.8965, "step": 14610 }, { "epoch": 0.3848037924677377, "grad_norm": 2.544355869293213, "learning_rate": 3.077166183829339e-05, "loss": 1.035, "step": 14611 }, { "epoch": 0.3848301290492494, "grad_norm": 3.2025418281555176, "learning_rate": 3.077034500921781e-05, "loss": 1.8649, "step": 14612 }, { "epoch": 0.38485646563076115, "grad_norm": 5.358971118927002, "learning_rate": 3.076902818014222e-05, "loss": 1.8094, "step": 14613 }, { "epoch": 0.38488280221227283, "grad_norm": 1.4875799417495728, "learning_rate": 3.076771135106663e-05, "loss": 1.9678, "step": 14614 }, { "epoch": 0.3849091387937846, "grad_norm": 2.4168639183044434, "learning_rate": 3.076639452199105e-05, "loss": 2.0317, "step": 14615 }, { "epoch": 0.38493547537529627, "grad_norm": 1.462188482284546, "learning_rate": 3.0765077692915463e-05, "loss": 2.0147, "step": 14616 }, { "epoch": 0.384961811956808, "grad_norm": 9.157552719116211, "learning_rate": 3.076376086383987e-05, "loss": 2.6409, "step": 14617 }, { "epoch": 0.38498814853831975, "grad_norm": 2.000145196914673, "learning_rate": 3.076244403476429e-05, "loss": 2.059, "step": 14618 }, { "epoch": 
0.38501448511983144, "grad_norm": 4.788470268249512, "learning_rate": 3.0761127205688703e-05, "loss": 1.9909, "step": 14619 }, { "epoch": 0.3850408217013432, "grad_norm": 3.421393871307373, "learning_rate": 3.075981037661312e-05, "loss": 1.3119, "step": 14620 }, { "epoch": 0.38506715828285487, "grad_norm": 2.9175899028778076, "learning_rate": 3.0758493547537535e-05, "loss": 0.2579, "step": 14621 }, { "epoch": 0.3850934948643666, "grad_norm": 2.8873727321624756, "learning_rate": 3.0757176718461943e-05, "loss": 1.5352, "step": 14622 }, { "epoch": 0.3851198314458783, "grad_norm": 3.437680959701538, "learning_rate": 3.075585988938636e-05, "loss": 1.3921, "step": 14623 }, { "epoch": 0.38514616802739005, "grad_norm": 2.003678321838379, "learning_rate": 3.0754543060310775e-05, "loss": 1.202, "step": 14624 }, { "epoch": 0.3851725046089018, "grad_norm": 1.838108777999878, "learning_rate": 3.075322623123519e-05, "loss": 1.346, "step": 14625 }, { "epoch": 0.3851988411904135, "grad_norm": 2.86262845993042, "learning_rate": 3.07519094021596e-05, "loss": 2.4319, "step": 14626 }, { "epoch": 0.3852251777719252, "grad_norm": 3.324017286300659, "learning_rate": 3.0750592573084015e-05, "loss": 1.9097, "step": 14627 }, { "epoch": 0.3852515143534369, "grad_norm": 1.6928248405456543, "learning_rate": 3.0749275744008424e-05, "loss": 1.9146, "step": 14628 }, { "epoch": 0.38527785093494865, "grad_norm": 2.3610126972198486, "learning_rate": 3.0747958914932846e-05, "loss": 1.0951, "step": 14629 }, { "epoch": 0.38530418751646034, "grad_norm": 1.782149076461792, "learning_rate": 3.0746642085857255e-05, "loss": 2.0689, "step": 14630 }, { "epoch": 0.3853305240979721, "grad_norm": 2.182767152786255, "learning_rate": 3.074532525678167e-05, "loss": 0.5372, "step": 14631 }, { "epoch": 0.3853568606794838, "grad_norm": 3.282320976257324, "learning_rate": 3.0744008427706086e-05, "loss": 1.6094, "step": 14632 }, { "epoch": 0.3853831972609955, "grad_norm": 1.773176908493042, "learning_rate": 
3.07426915986305e-05, "loss": 1.3609, "step": 14633 }, { "epoch": 0.38540953384250726, "grad_norm": 1.8026783466339111, "learning_rate": 3.074137476955492e-05, "loss": 1.7881, "step": 14634 }, { "epoch": 0.38543587042401894, "grad_norm": 2.116224765777588, "learning_rate": 3.0740057940479326e-05, "loss": 1.9769, "step": 14635 }, { "epoch": 0.3854622070055307, "grad_norm": 1.8278933763504028, "learning_rate": 3.073874111140374e-05, "loss": 1.6444, "step": 14636 }, { "epoch": 0.3854885435870424, "grad_norm": 8.668768882751465, "learning_rate": 3.073742428232815e-05, "loss": 1.471, "step": 14637 }, { "epoch": 0.3855148801685541, "grad_norm": 2.8513600826263428, "learning_rate": 3.073610745325257e-05, "loss": 1.4664, "step": 14638 }, { "epoch": 0.38554121675006586, "grad_norm": 1.5551142692565918, "learning_rate": 3.073479062417698e-05, "loss": 1.7675, "step": 14639 }, { "epoch": 0.38556755333157755, "grad_norm": 1.9914963245391846, "learning_rate": 3.07334737951014e-05, "loss": 0.3447, "step": 14640 }, { "epoch": 0.3855938899130893, "grad_norm": 4.118869304656982, "learning_rate": 3.073215696602581e-05, "loss": 1.2941, "step": 14641 }, { "epoch": 0.385620226494601, "grad_norm": 2.391664981842041, "learning_rate": 3.073084013695022e-05, "loss": 0.751, "step": 14642 }, { "epoch": 0.3856465630761127, "grad_norm": 1.451113224029541, "learning_rate": 3.0729523307874644e-05, "loss": 1.949, "step": 14643 }, { "epoch": 0.3856728996576244, "grad_norm": 15.994695663452148, "learning_rate": 3.072820647879905e-05, "loss": 2.4012, "step": 14644 }, { "epoch": 0.38569923623913616, "grad_norm": 1.9516600370407104, "learning_rate": 3.072688964972347e-05, "loss": 1.9063, "step": 14645 }, { "epoch": 0.3857255728206479, "grad_norm": 2.363690137863159, "learning_rate": 3.072557282064788e-05, "loss": 1.4092, "step": 14646 }, { "epoch": 0.3857519094021596, "grad_norm": 4.139781475067139, "learning_rate": 3.072425599157229e-05, "loss": 2.207, "step": 14647 }, { "epoch": 0.38577824598367133, 
"grad_norm": 2.0568952560424805, "learning_rate": 3.072293916249671e-05, "loss": 1.8473, "step": 14648 }, { "epoch": 0.385804582565183, "grad_norm": 2.984442710876465, "learning_rate": 3.0721622333421124e-05, "loss": 1.2596, "step": 14649 }, { "epoch": 0.38583091914669476, "grad_norm": 1.5389206409454346, "learning_rate": 3.072030550434554e-05, "loss": 1.9694, "step": 14650 }, { "epoch": 0.3858572557282065, "grad_norm": 1.6544585227966309, "learning_rate": 3.071898867526995e-05, "loss": 1.6766, "step": 14651 }, { "epoch": 0.3858835923097182, "grad_norm": 1.9473893642425537, "learning_rate": 3.071767184619437e-05, "loss": 1.7157, "step": 14652 }, { "epoch": 0.38590992889122994, "grad_norm": 2.6641438007354736, "learning_rate": 3.071635501711878e-05, "loss": 0.8589, "step": 14653 }, { "epoch": 0.3859362654727416, "grad_norm": 3.6428356170654297, "learning_rate": 3.0715038188043195e-05, "loss": 1.7024, "step": 14654 }, { "epoch": 0.38596260205425337, "grad_norm": 2.059915781021118, "learning_rate": 3.0713721358967604e-05, "loss": 1.828, "step": 14655 }, { "epoch": 0.38598893863576506, "grad_norm": 5.337553977966309, "learning_rate": 3.071240452989202e-05, "loss": 0.5662, "step": 14656 }, { "epoch": 0.3860152752172768, "grad_norm": 1.7763347625732422, "learning_rate": 3.0711087700816435e-05, "loss": 2.4173, "step": 14657 }, { "epoch": 0.38604161179878854, "grad_norm": 2.045865535736084, "learning_rate": 3.070977087174085e-05, "loss": 1.8914, "step": 14658 }, { "epoch": 0.38606794838030023, "grad_norm": 5.311978816986084, "learning_rate": 3.0708454042665266e-05, "loss": 1.5053, "step": 14659 }, { "epoch": 0.386094284961812, "grad_norm": 1.73963463306427, "learning_rate": 3.0707137213589675e-05, "loss": 1.9052, "step": 14660 }, { "epoch": 0.38612062154332366, "grad_norm": 1.6706151962280273, "learning_rate": 3.070582038451409e-05, "loss": 1.9272, "step": 14661 }, { "epoch": 0.3861469581248354, "grad_norm": 2.0047805309295654, "learning_rate": 3.0704503555438506e-05, 
"loss": 1.7219, "step": 14662 }, { "epoch": 0.3861732947063471, "grad_norm": 5.656368732452393, "learning_rate": 3.070318672636292e-05, "loss": 2.0545, "step": 14663 }, { "epoch": 0.38619963128785884, "grad_norm": 1.8844788074493408, "learning_rate": 3.070186989728733e-05, "loss": 1.5804, "step": 14664 }, { "epoch": 0.3862259678693706, "grad_norm": 2.44380259513855, "learning_rate": 3.0700553068211746e-05, "loss": 1.6355, "step": 14665 }, { "epoch": 0.38625230445088227, "grad_norm": 1.6666203737258911, "learning_rate": 3.069923623913616e-05, "loss": 1.7636, "step": 14666 }, { "epoch": 0.386278641032394, "grad_norm": 1.6123305559158325, "learning_rate": 3.069791941006058e-05, "loss": 1.9662, "step": 14667 }, { "epoch": 0.3863049776139057, "grad_norm": 1.7656770944595337, "learning_rate": 3.069660258098499e-05, "loss": 1.5156, "step": 14668 }, { "epoch": 0.38633131419541744, "grad_norm": 2.0412185192108154, "learning_rate": 3.06952857519094e-05, "loss": 1.8337, "step": 14669 }, { "epoch": 0.38635765077692913, "grad_norm": 3.4634346961975098, "learning_rate": 3.069396892283382e-05, "loss": 1.1345, "step": 14670 }, { "epoch": 0.3863839873584409, "grad_norm": 1.418215036392212, "learning_rate": 3.069265209375823e-05, "loss": 1.6277, "step": 14671 }, { "epoch": 0.3864103239399526, "grad_norm": 2.4495434761047363, "learning_rate": 3.069133526468265e-05, "loss": 1.3871, "step": 14672 }, { "epoch": 0.3864366605214643, "grad_norm": 2.523446798324585, "learning_rate": 3.069001843560706e-05, "loss": 1.9659, "step": 14673 }, { "epoch": 0.38646299710297605, "grad_norm": 1.8715912103652954, "learning_rate": 3.068870160653147e-05, "loss": 1.4866, "step": 14674 }, { "epoch": 0.38648933368448773, "grad_norm": 4.594829559326172, "learning_rate": 3.068738477745588e-05, "loss": 0.8213, "step": 14675 }, { "epoch": 0.3865156702659995, "grad_norm": 1.8268094062805176, "learning_rate": 3.0686067948380304e-05, "loss": 2.1443, "step": 14676 }, { "epoch": 0.38654200684751117, "grad_norm": 
2.2711901664733887, "learning_rate": 3.068475111930471e-05, "loss": 2.1928, "step": 14677 }, { "epoch": 0.3865683434290229, "grad_norm": 1.5995267629623413, "learning_rate": 3.068343429022913e-05, "loss": 2.1466, "step": 14678 }, { "epoch": 0.38659468001053465, "grad_norm": 2.0233495235443115, "learning_rate": 3.0682117461153544e-05, "loss": 1.6078, "step": 14679 }, { "epoch": 0.38662101659204634, "grad_norm": 4.105635166168213, "learning_rate": 3.068080063207795e-05, "loss": 1.9882, "step": 14680 }, { "epoch": 0.3866473531735581, "grad_norm": 2.1287147998809814, "learning_rate": 3.0679483803002376e-05, "loss": 0.8567, "step": 14681 }, { "epoch": 0.38667368975506977, "grad_norm": 2.2934086322784424, "learning_rate": 3.0678166973926784e-05, "loss": 2.3544, "step": 14682 }, { "epoch": 0.3867000263365815, "grad_norm": 2.210073471069336, "learning_rate": 3.06768501448512e-05, "loss": 2.3108, "step": 14683 }, { "epoch": 0.38672636291809326, "grad_norm": 2.6420516967773438, "learning_rate": 3.067553331577561e-05, "loss": 1.2743, "step": 14684 }, { "epoch": 0.38675269949960495, "grad_norm": 6.093904972076416, "learning_rate": 3.067421648670003e-05, "loss": 0.8936, "step": 14685 }, { "epoch": 0.3867790360811167, "grad_norm": 1.5850657224655151, "learning_rate": 3.067289965762444e-05, "loss": 1.8752, "step": 14686 }, { "epoch": 0.3868053726626284, "grad_norm": 1.843186616897583, "learning_rate": 3.0671582828548856e-05, "loss": 2.3141, "step": 14687 }, { "epoch": 0.3868317092441401, "grad_norm": 2.4175901412963867, "learning_rate": 3.067026599947327e-05, "loss": 1.6592, "step": 14688 }, { "epoch": 0.3868580458256518, "grad_norm": 2.634650468826294, "learning_rate": 3.066894917039768e-05, "loss": 1.691, "step": 14689 }, { "epoch": 0.38688438240716355, "grad_norm": 2.40167236328125, "learning_rate": 3.06676323413221e-05, "loss": 1.4214, "step": 14690 }, { "epoch": 0.3869107189886753, "grad_norm": 2.1508870124816895, "learning_rate": 3.066631551224651e-05, "loss": 1.5924, 
"step": 14691 }, { "epoch": 0.386937055570187, "grad_norm": 2.137068033218384, "learning_rate": 3.066499868317093e-05, "loss": 1.3394, "step": 14692 }, { "epoch": 0.3869633921516987, "grad_norm": 4.683769226074219, "learning_rate": 3.0663681854095336e-05, "loss": 0.8157, "step": 14693 }, { "epoch": 0.3869897287332104, "grad_norm": 2.0051162242889404, "learning_rate": 3.066236502501975e-05, "loss": 0.3501, "step": 14694 }, { "epoch": 0.38701606531472216, "grad_norm": 1.565938115119934, "learning_rate": 3.066104819594417e-05, "loss": 1.746, "step": 14695 }, { "epoch": 0.38704240189623385, "grad_norm": 2.1133971214294434, "learning_rate": 3.065973136686858e-05, "loss": 2.0262, "step": 14696 }, { "epoch": 0.3870687384777456, "grad_norm": 1.8275271654129028, "learning_rate": 3.0658414537793e-05, "loss": 2.2772, "step": 14697 }, { "epoch": 0.38709507505925733, "grad_norm": 2.5442302227020264, "learning_rate": 3.065709770871741e-05, "loss": 0.4184, "step": 14698 }, { "epoch": 0.387121411640769, "grad_norm": 2.144209146499634, "learning_rate": 3.065578087964183e-05, "loss": 1.6374, "step": 14699 }, { "epoch": 0.38714774822228076, "grad_norm": 2.5194780826568604, "learning_rate": 3.065446405056624e-05, "loss": 1.5709, "step": 14700 }, { "epoch": 0.38717408480379245, "grad_norm": 1.77311110496521, "learning_rate": 3.0653147221490654e-05, "loss": 2.0474, "step": 14701 }, { "epoch": 0.3872004213853042, "grad_norm": 1.478083848953247, "learning_rate": 3.065183039241506e-05, "loss": 1.4352, "step": 14702 }, { "epoch": 0.3872267579668159, "grad_norm": 2.535492420196533, "learning_rate": 3.065051356333948e-05, "loss": 2.0801, "step": 14703 }, { "epoch": 0.3872530945483276, "grad_norm": 2.155217170715332, "learning_rate": 3.0649196734263894e-05, "loss": 1.8583, "step": 14704 }, { "epoch": 0.38727943112983937, "grad_norm": 2.1672637462615967, "learning_rate": 3.064787990518831e-05, "loss": 2.0878, "step": 14705 }, { "epoch": 0.38730576771135106, "grad_norm": 3.0483036041259766, 
"learning_rate": 3.0646563076112725e-05, "loss": 2.4231, "step": 14706 }, { "epoch": 0.3873321042928628, "grad_norm": 3.048377275466919, "learning_rate": 3.0645246247037134e-05, "loss": 2.7498, "step": 14707 }, { "epoch": 0.3873584408743745, "grad_norm": 2.419497489929199, "learning_rate": 3.064392941796155e-05, "loss": 1.8527, "step": 14708 }, { "epoch": 0.38738477745588623, "grad_norm": 2.4085893630981445, "learning_rate": 3.0642612588885965e-05, "loss": 1.7436, "step": 14709 }, { "epoch": 0.3874111140373979, "grad_norm": 2.110053777694702, "learning_rate": 3.064129575981038e-05, "loss": 1.0266, "step": 14710 }, { "epoch": 0.38743745061890966, "grad_norm": 4.174890995025635, "learning_rate": 3.063997893073479e-05, "loss": 2.5177, "step": 14711 }, { "epoch": 0.3874637872004214, "grad_norm": 2.3036508560180664, "learning_rate": 3.0638662101659205e-05, "loss": 2.0052, "step": 14712 }, { "epoch": 0.3874901237819331, "grad_norm": 3.0439138412475586, "learning_rate": 3.063734527258362e-05, "loss": 0.8506, "step": 14713 }, { "epoch": 0.38751646036344484, "grad_norm": 3.9835851192474365, "learning_rate": 3.0636028443508036e-05, "loss": 1.057, "step": 14714 }, { "epoch": 0.3875427969449565, "grad_norm": 2.618096351623535, "learning_rate": 3.063471161443245e-05, "loss": 2.0236, "step": 14715 }, { "epoch": 0.38756913352646827, "grad_norm": 1.820068120956421, "learning_rate": 3.063339478535686e-05, "loss": 1.9378, "step": 14716 }, { "epoch": 0.38759547010798, "grad_norm": 2.012728214263916, "learning_rate": 3.0632077956281276e-05, "loss": 1.9251, "step": 14717 }, { "epoch": 0.3876218066894917, "grad_norm": 5.958434104919434, "learning_rate": 3.063076112720569e-05, "loss": 2.9743, "step": 14718 }, { "epoch": 0.38764814327100344, "grad_norm": 2.6244795322418213, "learning_rate": 3.062944429813011e-05, "loss": 2.0101, "step": 14719 }, { "epoch": 0.38767447985251513, "grad_norm": 2.020388126373291, "learning_rate": 3.0628127469054516e-05, "loss": 0.8601, "step": 14720 }, { 
"epoch": 0.3877008164340269, "grad_norm": 1.7658718824386597, "learning_rate": 3.062681063997893e-05, "loss": 1.5219, "step": 14721 }, { "epoch": 0.38772715301553856, "grad_norm": 1.4189436435699463, "learning_rate": 3.062549381090335e-05, "loss": 1.7126, "step": 14722 }, { "epoch": 0.3877534895970503, "grad_norm": 3.4577560424804688, "learning_rate": 3.062417698182776e-05, "loss": 1.8513, "step": 14723 }, { "epoch": 0.38777982617856205, "grad_norm": 1.5386041402816772, "learning_rate": 3.062286015275218e-05, "loss": 2.1029, "step": 14724 }, { "epoch": 0.38780616276007374, "grad_norm": 3.1291961669921875, "learning_rate": 3.062154332367659e-05, "loss": 1.3294, "step": 14725 }, { "epoch": 0.3878324993415855, "grad_norm": 3.991101026535034, "learning_rate": 3.0620226494601e-05, "loss": 1.2592, "step": 14726 }, { "epoch": 0.38785883592309717, "grad_norm": 2.179367780685425, "learning_rate": 3.061890966552541e-05, "loss": 1.814, "step": 14727 }, { "epoch": 0.3878851725046089, "grad_norm": 3.0196750164031982, "learning_rate": 3.0617592836449834e-05, "loss": 2.2316, "step": 14728 }, { "epoch": 0.3879115090861206, "grad_norm": 1.6353254318237305, "learning_rate": 3.061627600737424e-05, "loss": 0.4928, "step": 14729 }, { "epoch": 0.38793784566763234, "grad_norm": 4.2615156173706055, "learning_rate": 3.061495917829866e-05, "loss": 1.2137, "step": 14730 }, { "epoch": 0.3879641822491441, "grad_norm": 1.837337613105774, "learning_rate": 3.061364234922307e-05, "loss": 2.0321, "step": 14731 }, { "epoch": 0.3879905188306558, "grad_norm": 4.390244483947754, "learning_rate": 3.061232552014749e-05, "loss": 0.8569, "step": 14732 }, { "epoch": 0.3880168554121675, "grad_norm": 1.7401536703109741, "learning_rate": 3.06110086910719e-05, "loss": 1.1982, "step": 14733 }, { "epoch": 0.3880431919936792, "grad_norm": 2.0399770736694336, "learning_rate": 3.0609691861996314e-05, "loss": 1.6896, "step": 14734 }, { "epoch": 0.38806952857519095, "grad_norm": 2.6990976333618164, "learning_rate": 
3.060837503292073e-05, "loss": 1.8925, "step": 14735 }, { "epoch": 0.38809586515670264, "grad_norm": 3.387763261795044, "learning_rate": 3.060705820384514e-05, "loss": 0.8911, "step": 14736 }, { "epoch": 0.3881222017382144, "grad_norm": 2.366509199142456, "learning_rate": 3.060574137476956e-05, "loss": 2.0934, "step": 14737 }, { "epoch": 0.3881485383197261, "grad_norm": 4.434013843536377, "learning_rate": 3.060442454569397e-05, "loss": 1.4463, "step": 14738 }, { "epoch": 0.3881748749012378, "grad_norm": 3.942121982574463, "learning_rate": 3.0603107716618385e-05, "loss": 1.9234, "step": 14739 }, { "epoch": 0.38820121148274955, "grad_norm": 3.9252727031707764, "learning_rate": 3.0601790887542794e-05, "loss": 1.659, "step": 14740 }, { "epoch": 0.38822754806426124, "grad_norm": 2.316638231277466, "learning_rate": 3.060047405846721e-05, "loss": 2.1054, "step": 14741 }, { "epoch": 0.388253884645773, "grad_norm": 1.949794054031372, "learning_rate": 3.0599157229391625e-05, "loss": 0.58, "step": 14742 }, { "epoch": 0.38828022122728467, "grad_norm": 2.1228957176208496, "learning_rate": 3.059784040031604e-05, "loss": 1.2641, "step": 14743 }, { "epoch": 0.3883065578087964, "grad_norm": 1.845453143119812, "learning_rate": 3.059652357124046e-05, "loss": 0.651, "step": 14744 }, { "epoch": 0.38833289439030816, "grad_norm": 1.449431300163269, "learning_rate": 3.0595206742164865e-05, "loss": 1.8533, "step": 14745 }, { "epoch": 0.38835923097181985, "grad_norm": 2.3409101963043213, "learning_rate": 3.059388991308928e-05, "loss": 1.4373, "step": 14746 }, { "epoch": 0.3883855675533316, "grad_norm": 4.517127513885498, "learning_rate": 3.05925730840137e-05, "loss": 0.7689, "step": 14747 }, { "epoch": 0.3884119041348433, "grad_norm": 1.7706965208053589, "learning_rate": 3.059125625493811e-05, "loss": 1.3555, "step": 14748 }, { "epoch": 0.388438240716355, "grad_norm": 1.6963951587677002, "learning_rate": 3.058993942586252e-05, "loss": 2.0346, "step": 14749 }, { "epoch": 0.38846457729786676, 
"grad_norm": 3.086503028869629, "learning_rate": 3.058862259678694e-05, "loss": 1.6292, "step": 14750 }, { "epoch": 0.38849091387937845, "grad_norm": 1.985912799835205, "learning_rate": 3.058730576771135e-05, "loss": 1.9139, "step": 14751 }, { "epoch": 0.3885172504608902, "grad_norm": 1.7167713642120361, "learning_rate": 3.058598893863577e-05, "loss": 1.7078, "step": 14752 }, { "epoch": 0.3885435870424019, "grad_norm": 2.294015645980835, "learning_rate": 3.0584672109560183e-05, "loss": 1.6863, "step": 14753 }, { "epoch": 0.3885699236239136, "grad_norm": 3.438015937805176, "learning_rate": 3.058335528048459e-05, "loss": 0.9469, "step": 14754 }, { "epoch": 0.3885962602054253, "grad_norm": 2.79647159576416, "learning_rate": 3.058203845140901e-05, "loss": 0.4692, "step": 14755 }, { "epoch": 0.38862259678693706, "grad_norm": 1.8933472633361816, "learning_rate": 3.0580721622333424e-05, "loss": 2.0658, "step": 14756 }, { "epoch": 0.3886489333684488, "grad_norm": 1.668656349182129, "learning_rate": 3.057940479325784e-05, "loss": 2.131, "step": 14757 }, { "epoch": 0.3886752699499605, "grad_norm": 1.7183003425598145, "learning_rate": 3.057808796418225e-05, "loss": 1.7702, "step": 14758 }, { "epoch": 0.38870160653147223, "grad_norm": 3.017056465148926, "learning_rate": 3.0576771135106664e-05, "loss": 0.9699, "step": 14759 }, { "epoch": 0.3887279431129839, "grad_norm": 1.6128453016281128, "learning_rate": 3.057545430603108e-05, "loss": 2.1362, "step": 14760 }, { "epoch": 0.38875427969449566, "grad_norm": 3.795217514038086, "learning_rate": 3.0574137476955495e-05, "loss": 0.8622, "step": 14761 }, { "epoch": 0.38878061627600735, "grad_norm": 1.7457852363586426, "learning_rate": 3.057282064787991e-05, "loss": 2.3109, "step": 14762 }, { "epoch": 0.3888069528575191, "grad_norm": 2.2657079696655273, "learning_rate": 3.057150381880432e-05, "loss": 1.2325, "step": 14763 }, { "epoch": 0.38883328943903084, "grad_norm": 1.9281193017959595, "learning_rate": 3.0570186989728735e-05, "loss": 
0.4385, "step": 14764 }, { "epoch": 0.3888596260205425, "grad_norm": 1.8128288984298706, "learning_rate": 3.056887016065315e-05, "loss": 1.7317, "step": 14765 }, { "epoch": 0.38888596260205427, "grad_norm": 1.6248143911361694, "learning_rate": 3.0567553331577566e-05, "loss": 1.7334, "step": 14766 }, { "epoch": 0.38891229918356596, "grad_norm": 2.2053070068359375, "learning_rate": 3.0566236502501975e-05, "loss": 1.6227, "step": 14767 }, { "epoch": 0.3889386357650777, "grad_norm": 2.0217785835266113, "learning_rate": 3.056491967342639e-05, "loss": 1.4445, "step": 14768 }, { "epoch": 0.3889649723465894, "grad_norm": 1.6188602447509766, "learning_rate": 3.0563602844350806e-05, "loss": 1.5988, "step": 14769 }, { "epoch": 0.38899130892810113, "grad_norm": 3.8315913677215576, "learning_rate": 3.056228601527522e-05, "loss": 1.4201, "step": 14770 }, { "epoch": 0.3890176455096129, "grad_norm": 3.125159502029419, "learning_rate": 3.056096918619964e-05, "loss": 1.4993, "step": 14771 }, { "epoch": 0.38904398209112456, "grad_norm": 1.9284790754318237, "learning_rate": 3.0559652357124046e-05, "loss": 0.6335, "step": 14772 }, { "epoch": 0.3890703186726363, "grad_norm": 2.823977470397949, "learning_rate": 3.055833552804846e-05, "loss": 1.5023, "step": 14773 }, { "epoch": 0.389096655254148, "grad_norm": 2.3714962005615234, "learning_rate": 3.055701869897287e-05, "loss": 2.1538, "step": 14774 }, { "epoch": 0.38912299183565974, "grad_norm": 2.0893714427948, "learning_rate": 3.055570186989729e-05, "loss": 2.2786, "step": 14775 }, { "epoch": 0.3891493284171714, "grad_norm": 7.809403896331787, "learning_rate": 3.05543850408217e-05, "loss": 1.475, "step": 14776 }, { "epoch": 0.38917566499868317, "grad_norm": 1.7811617851257324, "learning_rate": 3.055306821174612e-05, "loss": 2.0439, "step": 14777 }, { "epoch": 0.3892020015801949, "grad_norm": 1.877899408340454, "learning_rate": 3.0551751382670526e-05, "loss": 1.6747, "step": 14778 }, { "epoch": 0.3892283381617066, "grad_norm": 
1.5328963994979858, "learning_rate": 3.055043455359494e-05, "loss": 1.8227, "step": 14779 }, { "epoch": 0.38925467474321834, "grad_norm": 2.7405691146850586, "learning_rate": 3.054911772451936e-05, "loss": 1.8776, "step": 14780 }, { "epoch": 0.38928101132473003, "grad_norm": 2.2172694206237793, "learning_rate": 3.054780089544377e-05, "loss": 2.0139, "step": 14781 }, { "epoch": 0.3893073479062418, "grad_norm": 1.8459181785583496, "learning_rate": 3.054648406636819e-05, "loss": 1.8849, "step": 14782 }, { "epoch": 0.38933368448775346, "grad_norm": 1.8791639804840088, "learning_rate": 3.05451672372926e-05, "loss": 1.8156, "step": 14783 }, { "epoch": 0.3893600210692652, "grad_norm": 1.9491006135940552, "learning_rate": 3.054385040821702e-05, "loss": 1.8969, "step": 14784 }, { "epoch": 0.38938635765077695, "grad_norm": 3.5040998458862305, "learning_rate": 3.054253357914143e-05, "loss": 2.5006, "step": 14785 }, { "epoch": 0.38941269423228864, "grad_norm": 1.7740840911865234, "learning_rate": 3.0541216750065844e-05, "loss": 1.8072, "step": 14786 }, { "epoch": 0.3894390308138004, "grad_norm": 3.1526856422424316, "learning_rate": 3.053989992099025e-05, "loss": 1.7881, "step": 14787 }, { "epoch": 0.38946536739531207, "grad_norm": 1.7720575332641602, "learning_rate": 3.053858309191467e-05, "loss": 2.1246, "step": 14788 }, { "epoch": 0.3894917039768238, "grad_norm": 1.658571720123291, "learning_rate": 3.0537266262839084e-05, "loss": 2.1139, "step": 14789 }, { "epoch": 0.38951804055833555, "grad_norm": 3.5377626419067383, "learning_rate": 3.05359494337635e-05, "loss": 1.5869, "step": 14790 }, { "epoch": 0.38954437713984724, "grad_norm": 2.2462644577026367, "learning_rate": 3.0534632604687915e-05, "loss": 1.8108, "step": 14791 }, { "epoch": 0.389570713721359, "grad_norm": 1.6197788715362549, "learning_rate": 3.0533315775612324e-05, "loss": 0.6414, "step": 14792 }, { "epoch": 0.3895970503028707, "grad_norm": 2.375152111053467, "learning_rate": 3.053199894653674e-05, "loss": 1.95, 
"step": 14793 }, { "epoch": 0.3896233868843824, "grad_norm": 2.354613780975342, "learning_rate": 3.0530682117461155e-05, "loss": 0.9079, "step": 14794 }, { "epoch": 0.3896497234658941, "grad_norm": 2.530278444290161, "learning_rate": 3.052936528838557e-05, "loss": 1.6069, "step": 14795 }, { "epoch": 0.38967606004740585, "grad_norm": 3.9799392223358154, "learning_rate": 3.052804845930998e-05, "loss": 1.6279, "step": 14796 }, { "epoch": 0.3897023966289176, "grad_norm": 3.936377763748169, "learning_rate": 3.0526731630234395e-05, "loss": 2.1348, "step": 14797 }, { "epoch": 0.3897287332104293, "grad_norm": 2.7203400135040283, "learning_rate": 3.052541480115881e-05, "loss": 0.5619, "step": 14798 }, { "epoch": 0.389755069791941, "grad_norm": 1.6430703401565552, "learning_rate": 3.0524097972083226e-05, "loss": 1.5575, "step": 14799 }, { "epoch": 0.3897814063734527, "grad_norm": 3.247898817062378, "learning_rate": 3.052278114300764e-05, "loss": 1.8223, "step": 14800 }, { "epoch": 0.38980774295496445, "grad_norm": 1.4750473499298096, "learning_rate": 3.052146431393205e-05, "loss": 1.5932, "step": 14801 }, { "epoch": 0.38983407953647614, "grad_norm": 3.719090223312378, "learning_rate": 3.0520147484856466e-05, "loss": 2.2278, "step": 14802 }, { "epoch": 0.3898604161179879, "grad_norm": 3.1308326721191406, "learning_rate": 3.051883065578088e-05, "loss": 1.1906, "step": 14803 }, { "epoch": 0.38988675269949963, "grad_norm": 3.238124132156372, "learning_rate": 3.0517513826705298e-05, "loss": 2.3122, "step": 14804 }, { "epoch": 0.3899130892810113, "grad_norm": 2.135883331298828, "learning_rate": 3.051619699762971e-05, "loss": 1.5543, "step": 14805 }, { "epoch": 0.38993942586252306, "grad_norm": 2.100149154663086, "learning_rate": 3.0514880168554122e-05, "loss": 2.0551, "step": 14806 }, { "epoch": 0.38996576244403475, "grad_norm": 2.451986789703369, "learning_rate": 3.0513563339478534e-05, "loss": 0.6453, "step": 14807 }, { "epoch": 0.3899920990255465, "grad_norm": 
1.7605277299880981, "learning_rate": 3.0512246510402953e-05, "loss": 1.8757, "step": 14808 }, { "epoch": 0.3900184356070582, "grad_norm": 1.8690348863601685, "learning_rate": 3.0510929681327365e-05, "loss": 1.336, "step": 14809 }, { "epoch": 0.3900447721885699, "grad_norm": 1.969211459159851, "learning_rate": 3.0509612852251778e-05, "loss": 1.9297, "step": 14810 }, { "epoch": 0.39007110877008166, "grad_norm": 1.8377444744110107, "learning_rate": 3.0508296023176193e-05, "loss": 2.1067, "step": 14811 }, { "epoch": 0.39009744535159335, "grad_norm": 2.1249094009399414, "learning_rate": 3.0506979194100606e-05, "loss": 1.1892, "step": 14812 }, { "epoch": 0.3901237819331051, "grad_norm": 5.820160388946533, "learning_rate": 3.0505662365025024e-05, "loss": 0.9323, "step": 14813 }, { "epoch": 0.3901501185146168, "grad_norm": 1.8319264650344849, "learning_rate": 3.0504345535949437e-05, "loss": 1.7942, "step": 14814 }, { "epoch": 0.3901764550961285, "grad_norm": 2.809297800064087, "learning_rate": 3.050302870687385e-05, "loss": 1.0859, "step": 14815 }, { "epoch": 0.3902027916776402, "grad_norm": 2.9805855751037598, "learning_rate": 3.050171187779826e-05, "loss": 1.1923, "step": 14816 }, { "epoch": 0.39022912825915196, "grad_norm": 3.4213922023773193, "learning_rate": 3.050039504872268e-05, "loss": 2.4029, "step": 14817 }, { "epoch": 0.3902554648406637, "grad_norm": 1.3115689754486084, "learning_rate": 3.0499078219647092e-05, "loss": 1.4285, "step": 14818 }, { "epoch": 0.3902818014221754, "grad_norm": 1.8398888111114502, "learning_rate": 3.0497761390571505e-05, "loss": 1.9925, "step": 14819 }, { "epoch": 0.39030813800368713, "grad_norm": 1.6225249767303467, "learning_rate": 3.0496444561495917e-05, "loss": 1.5331, "step": 14820 }, { "epoch": 0.3903344745851988, "grad_norm": 3.7366654872894287, "learning_rate": 3.0495127732420332e-05, "loss": 1.7678, "step": 14821 }, { "epoch": 0.39036081116671056, "grad_norm": 1.8978986740112305, "learning_rate": 3.0493810903344748e-05, "loss": 
0.4165, "step": 14822 }, { "epoch": 0.3903871477482223, "grad_norm": 1.9186005592346191, "learning_rate": 3.0492494074269164e-05, "loss": 1.4529, "step": 14823 }, { "epoch": 0.390413484329734, "grad_norm": 2.3490982055664062, "learning_rate": 3.0491177245193576e-05, "loss": 1.8218, "step": 14824 }, { "epoch": 0.39043982091124574, "grad_norm": 1.5955188274383545, "learning_rate": 3.0489860416117988e-05, "loss": 1.6101, "step": 14825 }, { "epoch": 0.3904661574927574, "grad_norm": 1.7607296705245972, "learning_rate": 3.04885435870424e-05, "loss": 1.7532, "step": 14826 }, { "epoch": 0.39049249407426917, "grad_norm": 2.687554359436035, "learning_rate": 3.048722675796682e-05, "loss": 1.0286, "step": 14827 }, { "epoch": 0.39051883065578086, "grad_norm": 2.9266109466552734, "learning_rate": 3.048590992889123e-05, "loss": 1.7798, "step": 14828 }, { "epoch": 0.3905451672372926, "grad_norm": 3.858023166656494, "learning_rate": 3.0484593099815644e-05, "loss": 1.9827, "step": 14829 }, { "epoch": 0.39057150381880434, "grad_norm": 10.170050621032715, "learning_rate": 3.048327627074006e-05, "loss": 1.5157, "step": 14830 }, { "epoch": 0.39059784040031603, "grad_norm": 1.9377825260162354, "learning_rate": 3.0481959441664475e-05, "loss": 2.1592, "step": 14831 }, { "epoch": 0.3906241769818278, "grad_norm": 1.8588459491729736, "learning_rate": 3.048064261258889e-05, "loss": 1.4449, "step": 14832 }, { "epoch": 0.39065051356333946, "grad_norm": 2.999317169189453, "learning_rate": 3.0479325783513303e-05, "loss": 0.9911, "step": 14833 }, { "epoch": 0.3906768501448512, "grad_norm": 2.076369047164917, "learning_rate": 3.0478008954437715e-05, "loss": 2.4455, "step": 14834 }, { "epoch": 0.3907031867263629, "grad_norm": 1.6744062900543213, "learning_rate": 3.0476692125362127e-05, "loss": 0.3062, "step": 14835 }, { "epoch": 0.39072952330787464, "grad_norm": 1.8977676630020142, "learning_rate": 3.0475375296286546e-05, "loss": 1.889, "step": 14836 }, { "epoch": 0.3907558598893864, "grad_norm": 
1.4191335439682007, "learning_rate": 3.0474058467210958e-05, "loss": 1.7728, "step": 14837 }, { "epoch": 0.39078219647089807, "grad_norm": 2.384071111679077, "learning_rate": 3.047274163813537e-05, "loss": 1.6112, "step": 14838 }, { "epoch": 0.3908085330524098, "grad_norm": 1.572990894317627, "learning_rate": 3.0471424809059783e-05, "loss": 1.8101, "step": 14839 }, { "epoch": 0.3908348696339215, "grad_norm": 2.649071455001831, "learning_rate": 3.0470107979984198e-05, "loss": 1.7474, "step": 14840 }, { "epoch": 0.39086120621543324, "grad_norm": 1.543198823928833, "learning_rate": 3.0468791150908614e-05, "loss": 2.2882, "step": 14841 }, { "epoch": 0.39088754279694493, "grad_norm": 1.572638988494873, "learning_rate": 3.046747432183303e-05, "loss": 1.6301, "step": 14842 }, { "epoch": 0.3909138793784567, "grad_norm": 3.6765494346618652, "learning_rate": 3.046615749275744e-05, "loss": 1.4851, "step": 14843 }, { "epoch": 0.3909402159599684, "grad_norm": 1.8132128715515137, "learning_rate": 3.0464840663681854e-05, "loss": 1.281, "step": 14844 }, { "epoch": 0.3909665525414801, "grad_norm": 2.5002686977386475, "learning_rate": 3.0463523834606266e-05, "loss": 1.3173, "step": 14845 }, { "epoch": 0.39099288912299185, "grad_norm": 2.05584716796875, "learning_rate": 3.0462207005530685e-05, "loss": 0.381, "step": 14846 }, { "epoch": 0.39101922570450354, "grad_norm": 2.1467530727386475, "learning_rate": 3.0460890176455097e-05, "loss": 1.4935, "step": 14847 }, { "epoch": 0.3910455622860153, "grad_norm": 1.642984390258789, "learning_rate": 3.045957334737951e-05, "loss": 1.7472, "step": 14848 }, { "epoch": 0.39107189886752697, "grad_norm": 4.430726528167725, "learning_rate": 3.0458256518303925e-05, "loss": 1.0849, "step": 14849 }, { "epoch": 0.3910982354490387, "grad_norm": 2.195782423019409, "learning_rate": 3.045693968922834e-05, "loss": 2.0548, "step": 14850 }, { "epoch": 0.39112457203055045, "grad_norm": 2.064549207687378, "learning_rate": 3.0455622860152756e-05, "loss": 1.1847, 
"step": 14851 }, { "epoch": 0.39115090861206214, "grad_norm": 1.8034393787384033, "learning_rate": 3.045430603107717e-05, "loss": 1.5991, "step": 14852 }, { "epoch": 0.3911772451935739, "grad_norm": 1.5642762184143066, "learning_rate": 3.045298920200158e-05, "loss": 1.4413, "step": 14853 }, { "epoch": 0.3912035817750856, "grad_norm": 2.640472412109375, "learning_rate": 3.0451672372925993e-05, "loss": 0.8813, "step": 14854 }, { "epoch": 0.3912299183565973, "grad_norm": 2.2595884799957275, "learning_rate": 3.0450355543850412e-05, "loss": 2.0476, "step": 14855 }, { "epoch": 0.39125625493810906, "grad_norm": 2.333843469619751, "learning_rate": 3.0449038714774824e-05, "loss": 1.2486, "step": 14856 }, { "epoch": 0.39128259151962075, "grad_norm": 2.0612878799438477, "learning_rate": 3.0447721885699236e-05, "loss": 1.7296, "step": 14857 }, { "epoch": 0.3913089281011325, "grad_norm": 1.7651320695877075, "learning_rate": 3.0446405056623652e-05, "loss": 2.4206, "step": 14858 }, { "epoch": 0.3913352646826442, "grad_norm": 2.996703624725342, "learning_rate": 3.0445088227548064e-05, "loss": 0.7184, "step": 14859 }, { "epoch": 0.3913616012641559, "grad_norm": 3.8319647312164307, "learning_rate": 3.0443771398472483e-05, "loss": 1.1606, "step": 14860 }, { "epoch": 0.3913879378456676, "grad_norm": 2.659003257751465, "learning_rate": 3.0442454569396895e-05, "loss": 1.4834, "step": 14861 }, { "epoch": 0.39141427442717935, "grad_norm": 2.2486412525177, "learning_rate": 3.0441137740321307e-05, "loss": 2.1412, "step": 14862 }, { "epoch": 0.3914406110086911, "grad_norm": 1.9141794443130493, "learning_rate": 3.043982091124572e-05, "loss": 1.6169, "step": 14863 }, { "epoch": 0.3914669475902028, "grad_norm": 2.2230496406555176, "learning_rate": 3.043850408217014e-05, "loss": 2.0887, "step": 14864 }, { "epoch": 0.39149328417171453, "grad_norm": 2.0029618740081787, "learning_rate": 3.043718725309455e-05, "loss": 2.2636, "step": 14865 }, { "epoch": 0.3915196207532262, "grad_norm": 
1.8449727296829224, "learning_rate": 3.0435870424018963e-05, "loss": 1.9363, "step": 14866 }, { "epoch": 0.39154595733473796, "grad_norm": 1.8683706521987915, "learning_rate": 3.0434553594943375e-05, "loss": 1.2098, "step": 14867 }, { "epoch": 0.39157229391624965, "grad_norm": 1.562099814414978, "learning_rate": 3.043323676586779e-05, "loss": 0.3352, "step": 14868 }, { "epoch": 0.3915986304977614, "grad_norm": 1.8673365116119385, "learning_rate": 3.0431919936792206e-05, "loss": 1.7897, "step": 14869 }, { "epoch": 0.39162496707927313, "grad_norm": 2.4109718799591064, "learning_rate": 3.0430603107716622e-05, "loss": 1.804, "step": 14870 }, { "epoch": 0.3916513036607848, "grad_norm": 1.8080004453659058, "learning_rate": 3.0429286278641034e-05, "loss": 2.0486, "step": 14871 }, { "epoch": 0.39167764024229657, "grad_norm": 1.4065428972244263, "learning_rate": 3.0427969449565446e-05, "loss": 1.2804, "step": 14872 }, { "epoch": 0.39170397682380825, "grad_norm": 1.6550333499908447, "learning_rate": 3.042665262048986e-05, "loss": 1.8038, "step": 14873 }, { "epoch": 0.39173031340532, "grad_norm": 2.656360149383545, "learning_rate": 3.0425335791414278e-05, "loss": 1.1885, "step": 14874 }, { "epoch": 0.3917566499868317, "grad_norm": 3.0260536670684814, "learning_rate": 3.042401896233869e-05, "loss": 0.8293, "step": 14875 }, { "epoch": 0.3917829865683434, "grad_norm": 2.475206136703491, "learning_rate": 3.0422702133263102e-05, "loss": 1.7176, "step": 14876 }, { "epoch": 0.39180932314985517, "grad_norm": 2.055607318878174, "learning_rate": 3.0421385304187518e-05, "loss": 1.9977, "step": 14877 }, { "epoch": 0.39183565973136686, "grad_norm": 3.346900463104248, "learning_rate": 3.0420068475111933e-05, "loss": 2.0267, "step": 14878 }, { "epoch": 0.3918619963128786, "grad_norm": 4.251241207122803, "learning_rate": 3.041875164603635e-05, "loss": 0.9735, "step": 14879 }, { "epoch": 0.3918883328943903, "grad_norm": 1.5219513177871704, "learning_rate": 3.041743481696076e-05, "loss": 
1.7822, "step": 14880 }, { "epoch": 0.39191466947590203, "grad_norm": 1.882404088973999, "learning_rate": 3.0416117987885173e-05, "loss": 1.2649, "step": 14881 }, { "epoch": 0.3919410060574137, "grad_norm": 1.8339011669158936, "learning_rate": 3.0414801158809586e-05, "loss": 1.3799, "step": 14882 }, { "epoch": 0.39196734263892546, "grad_norm": 2.841479539871216, "learning_rate": 3.0413484329734005e-05, "loss": 1.3098, "step": 14883 }, { "epoch": 0.3919936792204372, "grad_norm": 4.686696529388428, "learning_rate": 3.0412167500658417e-05, "loss": 2.2694, "step": 14884 }, { "epoch": 0.3920200158019489, "grad_norm": 1.785043716430664, "learning_rate": 3.041085067158283e-05, "loss": 1.6564, "step": 14885 }, { "epoch": 0.39204635238346064, "grad_norm": 2.304102897644043, "learning_rate": 3.0409533842507245e-05, "loss": 0.3795, "step": 14886 }, { "epoch": 0.3920726889649723, "grad_norm": 2.1050198078155518, "learning_rate": 3.0408217013431657e-05, "loss": 1.7311, "step": 14887 }, { "epoch": 0.39209902554648407, "grad_norm": 3.097066879272461, "learning_rate": 3.0406900184356076e-05, "loss": 2.1014, "step": 14888 }, { "epoch": 0.3921253621279958, "grad_norm": 2.9771621227264404, "learning_rate": 3.0405583355280488e-05, "loss": 1.3121, "step": 14889 }, { "epoch": 0.3921516987095075, "grad_norm": 4.402443885803223, "learning_rate": 3.04042665262049e-05, "loss": 0.8534, "step": 14890 }, { "epoch": 0.39217803529101924, "grad_norm": 2.157930374145508, "learning_rate": 3.0402949697129312e-05, "loss": 1.8071, "step": 14891 }, { "epoch": 0.39220437187253093, "grad_norm": 2.9447126388549805, "learning_rate": 3.0401632868053725e-05, "loss": 1.3017, "step": 14892 }, { "epoch": 0.3922307084540427, "grad_norm": 2.579279899597168, "learning_rate": 3.0400316038978144e-05, "loss": 1.7322, "step": 14893 }, { "epoch": 0.39225704503555436, "grad_norm": 2.387075185775757, "learning_rate": 3.0398999209902556e-05, "loss": 1.056, "step": 14894 }, { "epoch": 0.3922833816170661, "grad_norm": 
1.9575387239456177, "learning_rate": 3.0397682380826968e-05, "loss": 1.7838, "step": 14895 }, { "epoch": 0.39230971819857785, "grad_norm": 2.322054147720337, "learning_rate": 3.0396365551751384e-05, "loss": 1.4233, "step": 14896 }, { "epoch": 0.39233605478008954, "grad_norm": 1.781912922859192, "learning_rate": 3.03950487226758e-05, "loss": 1.4996, "step": 14897 }, { "epoch": 0.3923623913616013, "grad_norm": 2.9463624954223633, "learning_rate": 3.0393731893600215e-05, "loss": 1.6873, "step": 14898 }, { "epoch": 0.39238872794311297, "grad_norm": 1.6454142332077026, "learning_rate": 3.0392415064524627e-05, "loss": 1.287, "step": 14899 }, { "epoch": 0.3924150645246247, "grad_norm": 1.539923906326294, "learning_rate": 3.039109823544904e-05, "loss": 1.605, "step": 14900 }, { "epoch": 0.3924414011061364, "grad_norm": 2.622748613357544, "learning_rate": 3.038978140637345e-05, "loss": 1.644, "step": 14901 }, { "epoch": 0.39246773768764814, "grad_norm": 2.081028699874878, "learning_rate": 3.038846457729787e-05, "loss": 2.0464, "step": 14902 }, { "epoch": 0.3924940742691599, "grad_norm": 1.8871220350265503, "learning_rate": 3.0387147748222283e-05, "loss": 1.8112, "step": 14903 }, { "epoch": 0.3925204108506716, "grad_norm": 1.806481957435608, "learning_rate": 3.0385830919146695e-05, "loss": 0.3939, "step": 14904 }, { "epoch": 0.3925467474321833, "grad_norm": 1.8234916925430298, "learning_rate": 3.038451409007111e-05, "loss": 1.6051, "step": 14905 }, { "epoch": 0.392573084013695, "grad_norm": 5.302242279052734, "learning_rate": 3.0383197260995523e-05, "loss": 1.1637, "step": 14906 }, { "epoch": 0.39259942059520675, "grad_norm": 2.1670749187469482, "learning_rate": 3.038188043191994e-05, "loss": 1.913, "step": 14907 }, { "epoch": 0.39262575717671844, "grad_norm": 2.4837429523468018, "learning_rate": 3.0380563602844354e-05, "loss": 1.5496, "step": 14908 }, { "epoch": 0.3926520937582302, "grad_norm": 5.978394508361816, "learning_rate": 3.0379246773768766e-05, "loss": 1.972, 
"step": 14909 }, { "epoch": 0.3926784303397419, "grad_norm": 2.4605159759521484, "learning_rate": 3.0377929944693178e-05, "loss": 2.3206, "step": 14910 }, { "epoch": 0.3927047669212536, "grad_norm": 2.2506847381591797, "learning_rate": 3.0376613115617597e-05, "loss": 1.839, "step": 14911 }, { "epoch": 0.39273110350276536, "grad_norm": 1.8196157217025757, "learning_rate": 3.037529628654201e-05, "loss": 2.1418, "step": 14912 }, { "epoch": 0.39275744008427704, "grad_norm": 2.0755486488342285, "learning_rate": 3.037397945746642e-05, "loss": 1.9227, "step": 14913 }, { "epoch": 0.3927837766657888, "grad_norm": 2.607734441757202, "learning_rate": 3.0372662628390837e-05, "loss": 2.2429, "step": 14914 }, { "epoch": 0.3928101132473005, "grad_norm": 4.638509750366211, "learning_rate": 3.037134579931525e-05, "loss": 0.7759, "step": 14915 }, { "epoch": 0.3928364498288122, "grad_norm": 2.1411778926849365, "learning_rate": 3.037002897023967e-05, "loss": 1.6683, "step": 14916 }, { "epoch": 0.39286278641032396, "grad_norm": 2.4409782886505127, "learning_rate": 3.036871214116408e-05, "loss": 1.956, "step": 14917 }, { "epoch": 0.39288912299183565, "grad_norm": 1.7738896608352661, "learning_rate": 3.0367395312088493e-05, "loss": 2.1934, "step": 14918 }, { "epoch": 0.3929154595733474, "grad_norm": 2.4149856567382812, "learning_rate": 3.0366078483012905e-05, "loss": 1.902, "step": 14919 }, { "epoch": 0.3929417961548591, "grad_norm": 3.9030544757843018, "learning_rate": 3.0364761653937317e-05, "loss": 1.3667, "step": 14920 }, { "epoch": 0.3929681327363708, "grad_norm": 2.000345230102539, "learning_rate": 3.0363444824861736e-05, "loss": 2.022, "step": 14921 }, { "epoch": 0.39299446931788257, "grad_norm": 3.048539400100708, "learning_rate": 3.036212799578615e-05, "loss": 2.7481, "step": 14922 }, { "epoch": 0.39302080589939425, "grad_norm": 2.1777024269104004, "learning_rate": 3.036081116671056e-05, "loss": 1.5604, "step": 14923 }, { "epoch": 0.393047142480906, "grad_norm": 
1.943151831626892, "learning_rate": 3.0359494337634976e-05, "loss": 2.3059, "step": 14924 }, { "epoch": 0.3930734790624177, "grad_norm": 2.3496744632720947, "learning_rate": 3.035817750855939e-05, "loss": 2.3355, "step": 14925 }, { "epoch": 0.39309981564392943, "grad_norm": 1.7827156782150269, "learning_rate": 3.0356860679483807e-05, "loss": 1.8794, "step": 14926 }, { "epoch": 0.3931261522254411, "grad_norm": 1.7456151247024536, "learning_rate": 3.035554385040822e-05, "loss": 0.5393, "step": 14927 }, { "epoch": 0.39315248880695286, "grad_norm": 1.9638248682022095, "learning_rate": 3.0354227021332632e-05, "loss": 0.2196, "step": 14928 }, { "epoch": 0.3931788253884646, "grad_norm": 2.987661123275757, "learning_rate": 3.0352910192257044e-05, "loss": 1.4612, "step": 14929 }, { "epoch": 0.3932051619699763, "grad_norm": 1.900553584098816, "learning_rate": 3.0351593363181463e-05, "loss": 1.0964, "step": 14930 }, { "epoch": 0.39323149855148803, "grad_norm": 2.4086766242980957, "learning_rate": 3.0350276534105875e-05, "loss": 1.6261, "step": 14931 }, { "epoch": 0.3932578351329997, "grad_norm": 3.8307127952575684, "learning_rate": 3.0348959705030287e-05, "loss": 0.9342, "step": 14932 }, { "epoch": 0.39328417171451147, "grad_norm": 2.4297211170196533, "learning_rate": 3.0347642875954703e-05, "loss": 1.6449, "step": 14933 }, { "epoch": 0.39331050829602315, "grad_norm": 2.250925302505493, "learning_rate": 3.0346326046879115e-05, "loss": 1.6995, "step": 14934 }, { "epoch": 0.3933368448775349, "grad_norm": 1.9429324865341187, "learning_rate": 3.0345009217803534e-05, "loss": 1.7846, "step": 14935 }, { "epoch": 0.39336318145904664, "grad_norm": 2.4213273525238037, "learning_rate": 3.0343692388727946e-05, "loss": 0.9225, "step": 14936 }, { "epoch": 0.39338951804055833, "grad_norm": 1.8437020778656006, "learning_rate": 3.034237555965236e-05, "loss": 2.2285, "step": 14937 }, { "epoch": 0.39341585462207007, "grad_norm": 2.1062188148498535, "learning_rate": 3.034105873057677e-05, 
"loss": 1.8727, "step": 14938 }, { "epoch": 0.39344219120358176, "grad_norm": 2.6376893520355225, "learning_rate": 3.0339741901501183e-05, "loss": 2.5793, "step": 14939 }, { "epoch": 0.3934685277850935, "grad_norm": 2.2039520740509033, "learning_rate": 3.0338425072425602e-05, "loss": 1.2864, "step": 14940 }, { "epoch": 0.3934948643666052, "grad_norm": 1.7054497003555298, "learning_rate": 3.0337108243350014e-05, "loss": 2.1886, "step": 14941 }, { "epoch": 0.39352120094811693, "grad_norm": 1.5571074485778809, "learning_rate": 3.0335791414274427e-05, "loss": 1.6261, "step": 14942 }, { "epoch": 0.3935475375296287, "grad_norm": 2.9445605278015137, "learning_rate": 3.0334474585198842e-05, "loss": 1.5774, "step": 14943 }, { "epoch": 0.39357387411114036, "grad_norm": 2.143604040145874, "learning_rate": 3.0333157756123258e-05, "loss": 1.8837, "step": 14944 }, { "epoch": 0.3936002106926521, "grad_norm": 3.8407857418060303, "learning_rate": 3.0331840927047673e-05, "loss": 1.7972, "step": 14945 }, { "epoch": 0.3936265472741638, "grad_norm": 1.8203668594360352, "learning_rate": 3.0330524097972086e-05, "loss": 2.0145, "step": 14946 }, { "epoch": 0.39365288385567554, "grad_norm": 4.857835292816162, "learning_rate": 3.0329207268896498e-05, "loss": 0.9698, "step": 14947 }, { "epoch": 0.3936792204371872, "grad_norm": 1.586806058883667, "learning_rate": 3.032789043982091e-05, "loss": 2.0293, "step": 14948 }, { "epoch": 0.39370555701869897, "grad_norm": 2.2219021320343018, "learning_rate": 3.032657361074533e-05, "loss": 1.4939, "step": 14949 }, { "epoch": 0.3937318936002107, "grad_norm": 2.316866636276245, "learning_rate": 3.032525678166974e-05, "loss": 0.7091, "step": 14950 }, { "epoch": 0.3937582301817224, "grad_norm": 1.5804301500320435, "learning_rate": 3.0323939952594153e-05, "loss": 1.2736, "step": 14951 }, { "epoch": 0.39378456676323415, "grad_norm": 1.731168508529663, "learning_rate": 3.032262312351857e-05, "loss": 1.6398, "step": 14952 }, { "epoch": 0.39381090334474583, 
"grad_norm": 1.931602954864502, "learning_rate": 3.032130629444298e-05, "loss": 1.4843, "step": 14953 }, { "epoch": 0.3938372399262576, "grad_norm": 3.5113279819488525, "learning_rate": 3.03199894653674e-05, "loss": 1.0426, "step": 14954 }, { "epoch": 0.39386357650776926, "grad_norm": 2.936785936355591, "learning_rate": 3.0318672636291812e-05, "loss": 0.9301, "step": 14955 }, { "epoch": 0.393889913089281, "grad_norm": 2.2373158931732178, "learning_rate": 3.0317355807216225e-05, "loss": 2.2058, "step": 14956 }, { "epoch": 0.39391624967079275, "grad_norm": 4.877676010131836, "learning_rate": 3.0316038978140637e-05, "loss": 1.7309, "step": 14957 }, { "epoch": 0.39394258625230444, "grad_norm": 2.009469985961914, "learning_rate": 3.031472214906505e-05, "loss": 0.236, "step": 14958 }, { "epoch": 0.3939689228338162, "grad_norm": 2.7458291053771973, "learning_rate": 3.0313405319989468e-05, "loss": 1.7673, "step": 14959 }, { "epoch": 0.39399525941532787, "grad_norm": 2.7475838661193848, "learning_rate": 3.031208849091388e-05, "loss": 1.792, "step": 14960 }, { "epoch": 0.3940215959968396, "grad_norm": 3.2309162616729736, "learning_rate": 3.0310771661838296e-05, "loss": 1.621, "step": 14961 }, { "epoch": 0.39404793257835136, "grad_norm": 1.9680989980697632, "learning_rate": 3.0309454832762708e-05, "loss": 1.4533, "step": 14962 }, { "epoch": 0.39407426915986304, "grad_norm": 2.2345597743988037, "learning_rate": 3.0308138003687127e-05, "loss": 1.6085, "step": 14963 }, { "epoch": 0.3941006057413748, "grad_norm": 1.9043540954589844, "learning_rate": 3.030682117461154e-05, "loss": 2.0718, "step": 14964 }, { "epoch": 0.3941269423228865, "grad_norm": 3.382739305496216, "learning_rate": 3.030550434553595e-05, "loss": 1.3709, "step": 14965 }, { "epoch": 0.3941532789043982, "grad_norm": 2.239262580871582, "learning_rate": 3.0304187516460364e-05, "loss": 0.3446, "step": 14966 }, { "epoch": 0.3941796154859099, "grad_norm": 2.3586490154266357, "learning_rate": 3.0302870687384776e-05, 
"loss": 1.7833, "step": 14967 }, { "epoch": 0.39420595206742165, "grad_norm": 1.9577274322509766, "learning_rate": 3.0301553858309195e-05, "loss": 1.5152, "step": 14968 }, { "epoch": 0.3942322886489334, "grad_norm": 4.146500587463379, "learning_rate": 3.0300237029233607e-05, "loss": 0.8889, "step": 14969 }, { "epoch": 0.3942586252304451, "grad_norm": 1.9502582550048828, "learning_rate": 3.029892020015802e-05, "loss": 1.7861, "step": 14970 }, { "epoch": 0.3942849618119568, "grad_norm": 2.187791585922241, "learning_rate": 3.0297603371082435e-05, "loss": 1.2702, "step": 14971 }, { "epoch": 0.3943112983934685, "grad_norm": 1.8412171602249146, "learning_rate": 3.0296286542006847e-05, "loss": 1.6453, "step": 14972 }, { "epoch": 0.39433763497498026, "grad_norm": 3.715301990509033, "learning_rate": 3.0294969712931266e-05, "loss": 2.5333, "step": 14973 }, { "epoch": 0.39436397155649194, "grad_norm": 2.007598638534546, "learning_rate": 3.0293652883855678e-05, "loss": 1.884, "step": 14974 }, { "epoch": 0.3943903081380037, "grad_norm": 2.7604873180389404, "learning_rate": 3.029233605478009e-05, "loss": 1.9422, "step": 14975 }, { "epoch": 0.39441664471951543, "grad_norm": 1.9046781063079834, "learning_rate": 3.0291019225704503e-05, "loss": 1.399, "step": 14976 }, { "epoch": 0.3944429813010271, "grad_norm": 4.05242395401001, "learning_rate": 3.028970239662892e-05, "loss": 1.2327, "step": 14977 }, { "epoch": 0.39446931788253886, "grad_norm": 2.39886212348938, "learning_rate": 3.0288385567553334e-05, "loss": 1.4573, "step": 14978 }, { "epoch": 0.39449565446405055, "grad_norm": 1.5059698820114136, "learning_rate": 3.0287068738477746e-05, "loss": 1.9572, "step": 14979 }, { "epoch": 0.3945219910455623, "grad_norm": 3.4010074138641357, "learning_rate": 3.028575190940216e-05, "loss": 0.7581, "step": 14980 }, { "epoch": 0.394548327627074, "grad_norm": 2.100843667984009, "learning_rate": 3.0284435080326574e-05, "loss": 1.8349, "step": 14981 }, { "epoch": 0.3945746642085857, "grad_norm": 
3.752262592315674, "learning_rate": 3.0283118251250993e-05, "loss": 1.5736, "step": 14982 }, { "epoch": 0.39460100079009747, "grad_norm": 1.9618964195251465, "learning_rate": 3.0281801422175405e-05, "loss": 1.0923, "step": 14983 }, { "epoch": 0.39462733737160915, "grad_norm": 3.5801913738250732, "learning_rate": 3.0280484593099817e-05, "loss": 1.3707, "step": 14984 }, { "epoch": 0.3946536739531209, "grad_norm": 1.8059853315353394, "learning_rate": 3.027916776402423e-05, "loss": 2.5138, "step": 14985 }, { "epoch": 0.3946800105346326, "grad_norm": 1.8930373191833496, "learning_rate": 3.027785093494864e-05, "loss": 2.2534, "step": 14986 }, { "epoch": 0.39470634711614433, "grad_norm": 2.008183240890503, "learning_rate": 3.027653410587306e-05, "loss": 2.3735, "step": 14987 }, { "epoch": 0.394732683697656, "grad_norm": 1.706168293952942, "learning_rate": 3.0275217276797473e-05, "loss": 1.5547, "step": 14988 }, { "epoch": 0.39475902027916776, "grad_norm": 1.617963433265686, "learning_rate": 3.027390044772189e-05, "loss": 2.0334, "step": 14989 }, { "epoch": 0.3947853568606795, "grad_norm": 3.651312828063965, "learning_rate": 3.02725836186463e-05, "loss": 1.6696, "step": 14990 }, { "epoch": 0.3948116934421912, "grad_norm": 1.7130506038665771, "learning_rate": 3.0271266789570713e-05, "loss": 1.9091, "step": 14991 }, { "epoch": 0.39483803002370294, "grad_norm": 2.075068712234497, "learning_rate": 3.0269949960495132e-05, "loss": 1.3314, "step": 14992 }, { "epoch": 0.3948643666052146, "grad_norm": 3.142453193664551, "learning_rate": 3.0268633131419544e-05, "loss": 1.3264, "step": 14993 }, { "epoch": 0.39489070318672637, "grad_norm": 5.37628173828125, "learning_rate": 3.0267316302343956e-05, "loss": 1.1177, "step": 14994 }, { "epoch": 0.3949170397682381, "grad_norm": 2.0898795127868652, "learning_rate": 3.026599947326837e-05, "loss": 2.521, "step": 14995 }, { "epoch": 0.3949433763497498, "grad_norm": 2.007396936416626, "learning_rate": 3.0264682644192787e-05, "loss": 1.9554, 
"step": 14996 }, { "epoch": 0.39496971293126154, "grad_norm": 2.0360639095306396, "learning_rate": 3.02633658151172e-05, "loss": 1.5276, "step": 14997 }, { "epoch": 0.39499604951277323, "grad_norm": 2.260202646255493, "learning_rate": 3.0262048986041612e-05, "loss": 0.6219, "step": 14998 }, { "epoch": 0.39502238609428497, "grad_norm": 1.609933853149414, "learning_rate": 3.0260732156966028e-05, "loss": 2.3078, "step": 14999 }, { "epoch": 0.39504872267579666, "grad_norm": 2.0231568813323975, "learning_rate": 3.025941532789044e-05, "loss": 1.7696, "step": 15000 }, { "epoch": 0.3950750592573084, "grad_norm": 1.7712076902389526, "learning_rate": 3.025809849881486e-05, "loss": 2.223, "step": 15001 }, { "epoch": 0.39510139583882015, "grad_norm": 1.5881341695785522, "learning_rate": 3.025678166973927e-05, "loss": 1.6882, "step": 15002 }, { "epoch": 0.39512773242033183, "grad_norm": 4.1025238037109375, "learning_rate": 3.0255464840663683e-05, "loss": 1.678, "step": 15003 }, { "epoch": 0.3951540690018436, "grad_norm": 3.62084698677063, "learning_rate": 3.0254148011588095e-05, "loss": 1.2356, "step": 15004 }, { "epoch": 0.39518040558335527, "grad_norm": 2.124835968017578, "learning_rate": 3.0252831182512508e-05, "loss": 0.7734, "step": 15005 }, { "epoch": 0.395206742164867, "grad_norm": 2.9498844146728516, "learning_rate": 3.0251514353436927e-05, "loss": 2.7692, "step": 15006 }, { "epoch": 0.3952330787463787, "grad_norm": 1.5863209962844849, "learning_rate": 3.025019752436134e-05, "loss": 2.1596, "step": 15007 }, { "epoch": 0.39525941532789044, "grad_norm": 1.664631962776184, "learning_rate": 3.0248880695285754e-05, "loss": 1.7198, "step": 15008 }, { "epoch": 0.3952857519094022, "grad_norm": 6.359514236450195, "learning_rate": 3.0247563866210167e-05, "loss": 1.261, "step": 15009 }, { "epoch": 0.39531208849091387, "grad_norm": 2.363149404525757, "learning_rate": 3.0246247037134586e-05, "loss": 1.6975, "step": 15010 }, { "epoch": 0.3953384250724256, "grad_norm": 
2.035966634750366, "learning_rate": 3.0244930208058998e-05, "loss": 2.056, "step": 15011 }, { "epoch": 0.3953647616539373, "grad_norm": 1.7164771556854248, "learning_rate": 3.024361337898341e-05, "loss": 2.0896, "step": 15012 }, { "epoch": 0.39539109823544905, "grad_norm": 1.7573095560073853, "learning_rate": 3.0242296549907822e-05, "loss": 1.5428, "step": 15013 }, { "epoch": 0.39541743481696073, "grad_norm": 1.6729694604873657, "learning_rate": 3.0240979720832234e-05, "loss": 1.9735, "step": 15014 }, { "epoch": 0.3954437713984725, "grad_norm": 3.3652641773223877, "learning_rate": 3.0239662891756653e-05, "loss": 1.4089, "step": 15015 }, { "epoch": 0.3954701079799842, "grad_norm": 2.047421455383301, "learning_rate": 3.0238346062681066e-05, "loss": 2.4291, "step": 15016 }, { "epoch": 0.3954964445614959, "grad_norm": 3.5864269733428955, "learning_rate": 3.0237029233605478e-05, "loss": 1.7278, "step": 15017 }, { "epoch": 0.39552278114300765, "grad_norm": 2.2813830375671387, "learning_rate": 3.0235712404529893e-05, "loss": 1.7869, "step": 15018 }, { "epoch": 0.39554911772451934, "grad_norm": 2.0391955375671387, "learning_rate": 3.0234395575454306e-05, "loss": 1.834, "step": 15019 }, { "epoch": 0.3955754543060311, "grad_norm": 3.667698860168457, "learning_rate": 3.0233078746378725e-05, "loss": 1.115, "step": 15020 }, { "epoch": 0.39560179088754277, "grad_norm": 2.070315361022949, "learning_rate": 3.0231761917303137e-05, "loss": 1.9976, "step": 15021 }, { "epoch": 0.3956281274690545, "grad_norm": 2.282550573348999, "learning_rate": 3.023044508822755e-05, "loss": 2.3074, "step": 15022 }, { "epoch": 0.39565446405056626, "grad_norm": 4.889825820922852, "learning_rate": 3.022912825915196e-05, "loss": 1.904, "step": 15023 }, { "epoch": 0.39568080063207794, "grad_norm": 2.5398616790771484, "learning_rate": 3.0227811430076373e-05, "loss": 2.1748, "step": 15024 }, { "epoch": 0.3957071372135897, "grad_norm": 1.7243691682815552, "learning_rate": 3.0226494601000792e-05, "loss": 
1.7195, "step": 15025 }, { "epoch": 0.3957334737951014, "grad_norm": 2.2395265102386475, "learning_rate": 3.0225177771925205e-05, "loss": 1.3513, "step": 15026 }, { "epoch": 0.3957598103766131, "grad_norm": 2.8299758434295654, "learning_rate": 3.022386094284962e-05, "loss": 2.7042, "step": 15027 }, { "epoch": 0.39578614695812486, "grad_norm": 2.2655575275421143, "learning_rate": 3.0222544113774032e-05, "loss": 1.5202, "step": 15028 }, { "epoch": 0.39581248353963655, "grad_norm": 6.257394790649414, "learning_rate": 3.022122728469845e-05, "loss": 1.5074, "step": 15029 }, { "epoch": 0.3958388201211483, "grad_norm": 1.8992619514465332, "learning_rate": 3.0219910455622864e-05, "loss": 1.6583, "step": 15030 }, { "epoch": 0.39586515670266, "grad_norm": 1.9327623844146729, "learning_rate": 3.0218593626547276e-05, "loss": 1.5987, "step": 15031 }, { "epoch": 0.3958914932841717, "grad_norm": 2.0884854793548584, "learning_rate": 3.0217276797471688e-05, "loss": 0.4415, "step": 15032 }, { "epoch": 0.3959178298656834, "grad_norm": 1.670845627784729, "learning_rate": 3.02159599683961e-05, "loss": 1.833, "step": 15033 }, { "epoch": 0.39594416644719516, "grad_norm": 1.6577280759811401, "learning_rate": 3.021464313932052e-05, "loss": 1.8602, "step": 15034 }, { "epoch": 0.3959705030287069, "grad_norm": 2.7833259105682373, "learning_rate": 3.021332631024493e-05, "loss": 1.9647, "step": 15035 }, { "epoch": 0.3959968396102186, "grad_norm": 4.874198913574219, "learning_rate": 3.0212009481169347e-05, "loss": 1.8049, "step": 15036 }, { "epoch": 0.39602317619173033, "grad_norm": 3.5404245853424072, "learning_rate": 3.021069265209376e-05, "loss": 1.9333, "step": 15037 }, { "epoch": 0.396049512773242, "grad_norm": 1.8737950325012207, "learning_rate": 3.020937582301817e-05, "loss": 2.0376, "step": 15038 }, { "epoch": 0.39607584935475376, "grad_norm": 5.297119140625, "learning_rate": 3.020805899394259e-05, "loss": 2.4164, "step": 15039 }, { "epoch": 0.39610218593626545, "grad_norm": 
4.721755027770996, "learning_rate": 3.0206742164867003e-05, "loss": 1.1681, "step": 15040 }, { "epoch": 0.3961285225177772, "grad_norm": 1.5547116994857788, "learning_rate": 3.0205425335791415e-05, "loss": 1.5787, "step": 15041 }, { "epoch": 0.39615485909928894, "grad_norm": 2.7580041885375977, "learning_rate": 3.0204108506715827e-05, "loss": 0.4349, "step": 15042 }, { "epoch": 0.3961811956808006, "grad_norm": 2.6049814224243164, "learning_rate": 3.0202791677640246e-05, "loss": 1.7179, "step": 15043 }, { "epoch": 0.39620753226231237, "grad_norm": 1.8934935331344604, "learning_rate": 3.0201474848564658e-05, "loss": 2.4449, "step": 15044 }, { "epoch": 0.39623386884382406, "grad_norm": 2.2490322589874268, "learning_rate": 3.020015801948907e-05, "loss": 1.9849, "step": 15045 }, { "epoch": 0.3962602054253358, "grad_norm": 1.8754563331604004, "learning_rate": 3.0198841190413486e-05, "loss": 1.9317, "step": 15046 }, { "epoch": 0.3962865420068475, "grad_norm": 1.4597707986831665, "learning_rate": 3.0197524361337898e-05, "loss": 1.562, "step": 15047 }, { "epoch": 0.39631287858835923, "grad_norm": 2.1651532649993896, "learning_rate": 3.0196207532262317e-05, "loss": 0.6542, "step": 15048 }, { "epoch": 0.396339215169871, "grad_norm": 2.0441808700561523, "learning_rate": 3.019489070318673e-05, "loss": 1.7482, "step": 15049 }, { "epoch": 0.39636555175138266, "grad_norm": 6.155063629150391, "learning_rate": 3.019357387411114e-05, "loss": 2.167, "step": 15050 }, { "epoch": 0.3963918883328944, "grad_norm": 2.0099036693573, "learning_rate": 3.0192257045035554e-05, "loss": 0.6813, "step": 15051 }, { "epoch": 0.3964182249144061, "grad_norm": 4.4810895919799805, "learning_rate": 3.0190940215959966e-05, "loss": 2.5688, "step": 15052 }, { "epoch": 0.39644456149591784, "grad_norm": 2.7615575790405273, "learning_rate": 3.0189623386884385e-05, "loss": 2.0952, "step": 15053 }, { "epoch": 0.3964708980774295, "grad_norm": 1.6449549198150635, "learning_rate": 3.0188306557808797e-05, "loss": 
1.878, "step": 15054 }, { "epoch": 0.39649723465894127, "grad_norm": 2.340190887451172, "learning_rate": 3.0186989728733213e-05, "loss": 1.6676, "step": 15055 }, { "epoch": 0.396523571240453, "grad_norm": 3.026587724685669, "learning_rate": 3.0185672899657625e-05, "loss": 2.116, "step": 15056 }, { "epoch": 0.3965499078219647, "grad_norm": 1.80243718624115, "learning_rate": 3.0184356070582037e-05, "loss": 2.058, "step": 15057 }, { "epoch": 0.39657624440347644, "grad_norm": 4.1269707679748535, "learning_rate": 3.0183039241506456e-05, "loss": 2.1053, "step": 15058 }, { "epoch": 0.39660258098498813, "grad_norm": 1.9416321516036987, "learning_rate": 3.018172241243087e-05, "loss": 1.3026, "step": 15059 }, { "epoch": 0.3966289175664999, "grad_norm": 1.2719355821609497, "learning_rate": 3.018040558335528e-05, "loss": 1.7994, "step": 15060 }, { "epoch": 0.3966552541480116, "grad_norm": 2.6693902015686035, "learning_rate": 3.0179088754279693e-05, "loss": 2.3408, "step": 15061 }, { "epoch": 0.3966815907295233, "grad_norm": 2.63718843460083, "learning_rate": 3.0177771925204112e-05, "loss": 1.6426, "step": 15062 }, { "epoch": 0.39670792731103505, "grad_norm": 1.9943994283676147, "learning_rate": 3.0176455096128524e-05, "loss": 1.999, "step": 15063 }, { "epoch": 0.39673426389254673, "grad_norm": 1.724923014640808, "learning_rate": 3.017513826705294e-05, "loss": 1.8849, "step": 15064 }, { "epoch": 0.3967606004740585, "grad_norm": 1.839485764503479, "learning_rate": 3.0173821437977352e-05, "loss": 1.6697, "step": 15065 }, { "epoch": 0.39678693705557017, "grad_norm": 2.3077034950256348, "learning_rate": 3.0172504608901764e-05, "loss": 1.6201, "step": 15066 }, { "epoch": 0.3968132736370819, "grad_norm": 2.369795083999634, "learning_rate": 3.0171187779826183e-05, "loss": 2.334, "step": 15067 }, { "epoch": 0.39683961021859365, "grad_norm": 2.1524362564086914, "learning_rate": 3.0169870950750595e-05, "loss": 1.6187, "step": 15068 }, { "epoch": 0.39686594680010534, "grad_norm": 
1.5614216327667236, "learning_rate": 3.0168554121675008e-05, "loss": 2.3111, "step": 15069 }, { "epoch": 0.3968922833816171, "grad_norm": 1.5796303749084473, "learning_rate": 3.016723729259942e-05, "loss": 2.2566, "step": 15070 }, { "epoch": 0.39691861996312877, "grad_norm": 2.016678810119629, "learning_rate": 3.0165920463523832e-05, "loss": 2.0844, "step": 15071 }, { "epoch": 0.3969449565446405, "grad_norm": 2.135490894317627, "learning_rate": 3.016460363444825e-05, "loss": 1.9233, "step": 15072 }, { "epoch": 0.3969712931261522, "grad_norm": 2.128314971923828, "learning_rate": 3.0163286805372663e-05, "loss": 1.8996, "step": 15073 }, { "epoch": 0.39699762970766395, "grad_norm": 4.088359832763672, "learning_rate": 3.016196997629708e-05, "loss": 2.0225, "step": 15074 }, { "epoch": 0.3970239662891757, "grad_norm": 1.775320291519165, "learning_rate": 3.016065314722149e-05, "loss": 1.7487, "step": 15075 }, { "epoch": 0.3970503028706874, "grad_norm": 1.635765552520752, "learning_rate": 3.015933631814591e-05, "loss": 1.8751, "step": 15076 }, { "epoch": 0.3970766394521991, "grad_norm": 3.182035446166992, "learning_rate": 3.0158019489070322e-05, "loss": 2.136, "step": 15077 }, { "epoch": 0.3971029760337108, "grad_norm": 1.9329135417938232, "learning_rate": 3.0156702659994734e-05, "loss": 1.6874, "step": 15078 }, { "epoch": 0.39712931261522255, "grad_norm": 3.233552932739258, "learning_rate": 3.0155385830919147e-05, "loss": 1.498, "step": 15079 }, { "epoch": 0.39715564919673424, "grad_norm": 3.0069446563720703, "learning_rate": 3.015406900184356e-05, "loss": 0.4941, "step": 15080 }, { "epoch": 0.397181985778246, "grad_norm": 2.362168550491333, "learning_rate": 3.0152752172767978e-05, "loss": 0.9619, "step": 15081 }, { "epoch": 0.3972083223597577, "grad_norm": 2.007009983062744, "learning_rate": 3.015143534369239e-05, "loss": 2.4339, "step": 15082 }, { "epoch": 0.3972346589412694, "grad_norm": 1.9347221851348877, "learning_rate": 3.0150118514616806e-05, "loss": 2.0082, 
"step": 15083 }, { "epoch": 0.39726099552278116, "grad_norm": 2.677497625350952, "learning_rate": 3.0148801685541218e-05, "loss": 1.3221, "step": 15084 }, { "epoch": 0.39728733210429285, "grad_norm": 2.126732349395752, "learning_rate": 3.014748485646563e-05, "loss": 2.0514, "step": 15085 }, { "epoch": 0.3973136686858046, "grad_norm": 2.031367063522339, "learning_rate": 3.014616802739005e-05, "loss": 1.9752, "step": 15086 }, { "epoch": 0.3973400052673163, "grad_norm": 1.534645676612854, "learning_rate": 3.014485119831446e-05, "loss": 1.7627, "step": 15087 }, { "epoch": 0.397366341848828, "grad_norm": 1.8645728826522827, "learning_rate": 3.0143534369238873e-05, "loss": 0.52, "step": 15088 }, { "epoch": 0.39739267843033976, "grad_norm": 2.380117416381836, "learning_rate": 3.0142217540163286e-05, "loss": 2.1512, "step": 15089 }, { "epoch": 0.39741901501185145, "grad_norm": 3.748344898223877, "learning_rate": 3.01409007110877e-05, "loss": 0.59, "step": 15090 }, { "epoch": 0.3974453515933632, "grad_norm": 1.996188759803772, "learning_rate": 3.0139583882012117e-05, "loss": 1.8976, "step": 15091 }, { "epoch": 0.3974716881748749, "grad_norm": 2.647844076156616, "learning_rate": 3.0138267052936532e-05, "loss": 2.2353, "step": 15092 }, { "epoch": 0.3974980247563866, "grad_norm": 2.239309310913086, "learning_rate": 3.0136950223860945e-05, "loss": 2.1724, "step": 15093 }, { "epoch": 0.3975243613378983, "grad_norm": 1.5903981924057007, "learning_rate": 3.0135633394785357e-05, "loss": 1.6906, "step": 15094 }, { "epoch": 0.39755069791941006, "grad_norm": 1.4769396781921387, "learning_rate": 3.0134316565709776e-05, "loss": 2.3228, "step": 15095 }, { "epoch": 0.3975770345009218, "grad_norm": 1.7581170797348022, "learning_rate": 3.0132999736634188e-05, "loss": 0.5144, "step": 15096 }, { "epoch": 0.3976033710824335, "grad_norm": 2.6350605487823486, "learning_rate": 3.01316829075586e-05, "loss": 1.6155, "step": 15097 }, { "epoch": 0.39762970766394523, "grad_norm": 1.9088119268417358, 
"learning_rate": 3.0130366078483012e-05, "loss": 1.7295, "step": 15098 }, { "epoch": 0.3976560442454569, "grad_norm": 1.4263521432876587, "learning_rate": 3.0129049249407425e-05, "loss": 2.1658, "step": 15099 }, { "epoch": 0.39768238082696866, "grad_norm": 1.822358250617981, "learning_rate": 3.0127732420331844e-05, "loss": 1.9254, "step": 15100 }, { "epoch": 0.3977087174084804, "grad_norm": 1.7165281772613525, "learning_rate": 3.0126415591256256e-05, "loss": 1.6772, "step": 15101 }, { "epoch": 0.3977350539899921, "grad_norm": 2.9805707931518555, "learning_rate": 3.012509876218067e-05, "loss": 1.379, "step": 15102 }, { "epoch": 0.39776139057150384, "grad_norm": 2.0595970153808594, "learning_rate": 3.0123781933105084e-05, "loss": 1.0463, "step": 15103 }, { "epoch": 0.3977877271530155, "grad_norm": 2.1131811141967773, "learning_rate": 3.0122465104029496e-05, "loss": 2.3488, "step": 15104 }, { "epoch": 0.39781406373452727, "grad_norm": 1.6708382368087769, "learning_rate": 3.0121148274953915e-05, "loss": 1.5547, "step": 15105 }, { "epoch": 0.39784040031603896, "grad_norm": 2.2232890129089355, "learning_rate": 3.0119831445878327e-05, "loss": 1.5653, "step": 15106 }, { "epoch": 0.3978667368975507, "grad_norm": 2.054849624633789, "learning_rate": 3.011851461680274e-05, "loss": 1.8846, "step": 15107 }, { "epoch": 0.39789307347906244, "grad_norm": 1.518528699874878, "learning_rate": 3.011719778772715e-05, "loss": 1.6737, "step": 15108 }, { "epoch": 0.39791941006057413, "grad_norm": 2.1234230995178223, "learning_rate": 3.011588095865157e-05, "loss": 1.5898, "step": 15109 }, { "epoch": 0.3979457466420859, "grad_norm": 2.1404612064361572, "learning_rate": 3.0114564129575983e-05, "loss": 2.6745, "step": 15110 }, { "epoch": 0.39797208322359756, "grad_norm": 1.6770737171173096, "learning_rate": 3.0113247300500398e-05, "loss": 2.4682, "step": 15111 }, { "epoch": 0.3979984198051093, "grad_norm": 2.5243773460388184, "learning_rate": 3.011193047142481e-05, "loss": 1.7645, "step": 
15112 }, { "epoch": 0.398024756386621, "grad_norm": 2.369124174118042, "learning_rate": 3.0110613642349223e-05, "loss": 1.8384, "step": 15113 }, { "epoch": 0.39805109296813274, "grad_norm": 5.346293926239014, "learning_rate": 3.010929681327364e-05, "loss": 1.7805, "step": 15114 }, { "epoch": 0.3980774295496445, "grad_norm": 2.931126356124878, "learning_rate": 3.0107979984198054e-05, "loss": 1.0728, "step": 15115 }, { "epoch": 0.39810376613115617, "grad_norm": 1.9691942930221558, "learning_rate": 3.0106663155122466e-05, "loss": 1.8304, "step": 15116 }, { "epoch": 0.3981301027126679, "grad_norm": 3.2896289825439453, "learning_rate": 3.0105346326046878e-05, "loss": 1.493, "step": 15117 }, { "epoch": 0.3981564392941796, "grad_norm": 1.655034065246582, "learning_rate": 3.010402949697129e-05, "loss": 1.7281, "step": 15118 }, { "epoch": 0.39818277587569134, "grad_norm": 2.4506442546844482, "learning_rate": 3.010271266789571e-05, "loss": 1.8252, "step": 15119 }, { "epoch": 0.39820911245720303, "grad_norm": 1.6349669694900513, "learning_rate": 3.0101395838820122e-05, "loss": 1.5295, "step": 15120 }, { "epoch": 0.3982354490387148, "grad_norm": 3.4391379356384277, "learning_rate": 3.0100079009744537e-05, "loss": 2.0839, "step": 15121 }, { "epoch": 0.3982617856202265, "grad_norm": 2.0003373622894287, "learning_rate": 3.009876218066895e-05, "loss": 1.4843, "step": 15122 }, { "epoch": 0.3982881222017382, "grad_norm": 2.7001326084136963, "learning_rate": 3.0097445351593362e-05, "loss": 1.944, "step": 15123 }, { "epoch": 0.39831445878324995, "grad_norm": 1.8603417873382568, "learning_rate": 3.009612852251778e-05, "loss": 1.5126, "step": 15124 }, { "epoch": 0.39834079536476164, "grad_norm": 1.9072836637496948, "learning_rate": 3.0094811693442193e-05, "loss": 2.3098, "step": 15125 }, { "epoch": 0.3983671319462734, "grad_norm": 2.9438912868499756, "learning_rate": 3.0093494864366605e-05, "loss": 1.7904, "step": 15126 }, { "epoch": 0.39839346852778507, "grad_norm": 2.894235134124756, 
"learning_rate": 3.0092178035291017e-05, "loss": 1.5091, "step": 15127 }, { "epoch": 0.3984198051092968, "grad_norm": 1.6695466041564941, "learning_rate": 3.0090861206215436e-05, "loss": 1.8956, "step": 15128 }, { "epoch": 0.39844614169080855, "grad_norm": 2.9755256175994873, "learning_rate": 3.008954437713985e-05, "loss": 1.5015, "step": 15129 }, { "epoch": 0.39847247827232024, "grad_norm": 1.456181526184082, "learning_rate": 3.0088227548064264e-05, "loss": 1.9403, "step": 15130 }, { "epoch": 0.398498814853832, "grad_norm": 1.6136459112167358, "learning_rate": 3.0086910718988676e-05, "loss": 2.0042, "step": 15131 }, { "epoch": 0.39852515143534367, "grad_norm": 2.339348077774048, "learning_rate": 3.008559388991309e-05, "loss": 1.5742, "step": 15132 }, { "epoch": 0.3985514880168554, "grad_norm": 4.0757365226745605, "learning_rate": 3.0084277060837508e-05, "loss": 1.9648, "step": 15133 }, { "epoch": 0.39857782459836716, "grad_norm": 2.3704676628112793, "learning_rate": 3.008296023176192e-05, "loss": 0.6897, "step": 15134 }, { "epoch": 0.39860416117987885, "grad_norm": 4.269392967224121, "learning_rate": 3.0081643402686332e-05, "loss": 2.3461, "step": 15135 }, { "epoch": 0.3986304977613906, "grad_norm": 3.692760944366455, "learning_rate": 3.0080326573610744e-05, "loss": 0.4298, "step": 15136 }, { "epoch": 0.3986568343429023, "grad_norm": 3.4072606563568115, "learning_rate": 3.007900974453516e-05, "loss": 0.8908, "step": 15137 }, { "epoch": 0.398683170924414, "grad_norm": 1.6600042581558228, "learning_rate": 3.0077692915459575e-05, "loss": 0.7987, "step": 15138 }, { "epoch": 0.3987095075059257, "grad_norm": 2.570112466812134, "learning_rate": 3.007637608638399e-05, "loss": 1.4689, "step": 15139 }, { "epoch": 0.39873584408743745, "grad_norm": 2.197693109512329, "learning_rate": 3.0075059257308403e-05, "loss": 1.5503, "step": 15140 }, { "epoch": 0.3987621806689492, "grad_norm": 1.4908580780029297, "learning_rate": 3.0073742428232815e-05, "loss": 1.8281, "step": 15141 }, 
{ "epoch": 0.3987885172504609, "grad_norm": 4.311302185058594, "learning_rate": 3.0072425599157234e-05, "loss": 2.1156, "step": 15142 }, { "epoch": 0.3988148538319726, "grad_norm": 1.8114593029022217, "learning_rate": 3.0071108770081647e-05, "loss": 2.2262, "step": 15143 }, { "epoch": 0.3988411904134843, "grad_norm": 2.8582775592803955, "learning_rate": 3.006979194100606e-05, "loss": 1.0121, "step": 15144 }, { "epoch": 0.39886752699499606, "grad_norm": 1.7252779006958008, "learning_rate": 3.006847511193047e-05, "loss": 1.7096, "step": 15145 }, { "epoch": 0.39889386357650775, "grad_norm": 2.47562575340271, "learning_rate": 3.0067158282854883e-05, "loss": 1.1531, "step": 15146 }, { "epoch": 0.3989202001580195, "grad_norm": 2.3413383960723877, "learning_rate": 3.0065841453779302e-05, "loss": 2.1053, "step": 15147 }, { "epoch": 0.39894653673953123, "grad_norm": 2.820797920227051, "learning_rate": 3.0064524624703714e-05, "loss": 2.2294, "step": 15148 }, { "epoch": 0.3989728733210429, "grad_norm": 2.303964138031006, "learning_rate": 3.006320779562813e-05, "loss": 1.807, "step": 15149 }, { "epoch": 0.39899920990255466, "grad_norm": 2.3117685317993164, "learning_rate": 3.0061890966552542e-05, "loss": 2.1772, "step": 15150 }, { "epoch": 0.39902554648406635, "grad_norm": 2.523735284805298, "learning_rate": 3.0060574137476954e-05, "loss": 1.8462, "step": 15151 }, { "epoch": 0.3990518830655781, "grad_norm": 1.6964685916900635, "learning_rate": 3.0059257308401373e-05, "loss": 1.3635, "step": 15152 }, { "epoch": 0.3990782196470898, "grad_norm": 3.270782709121704, "learning_rate": 3.0057940479325786e-05, "loss": 1.6391, "step": 15153 }, { "epoch": 0.3991045562286015, "grad_norm": 1.6087555885314941, "learning_rate": 3.0056623650250198e-05, "loss": 1.6876, "step": 15154 }, { "epoch": 0.39913089281011327, "grad_norm": 2.850135326385498, "learning_rate": 3.005530682117461e-05, "loss": 1.0185, "step": 15155 }, { "epoch": 0.39915722939162496, "grad_norm": 2.8274800777435303, 
"learning_rate": 3.0053989992099026e-05, "loss": 0.7069, "step": 15156 }, { "epoch": 0.3991835659731367, "grad_norm": 2.156588315963745, "learning_rate": 3.005267316302344e-05, "loss": 0.6161, "step": 15157 }, { "epoch": 0.3992099025546484, "grad_norm": 3.547437906265259, "learning_rate": 3.0051356333947857e-05, "loss": 1.4996, "step": 15158 }, { "epoch": 0.39923623913616013, "grad_norm": 2.4358303546905518, "learning_rate": 3.005003950487227e-05, "loss": 1.5792, "step": 15159 }, { "epoch": 0.3992625757176718, "grad_norm": 1.5521892309188843, "learning_rate": 3.004872267579668e-05, "loss": 1.9927, "step": 15160 }, { "epoch": 0.39928891229918356, "grad_norm": 1.8601088523864746, "learning_rate": 3.00474058467211e-05, "loss": 1.7021, "step": 15161 }, { "epoch": 0.3993152488806953, "grad_norm": 1.4800333976745605, "learning_rate": 3.0046089017645512e-05, "loss": 1.7235, "step": 15162 }, { "epoch": 0.399341585462207, "grad_norm": 3.543912649154663, "learning_rate": 3.0044772188569925e-05, "loss": 1.613, "step": 15163 }, { "epoch": 0.39936792204371874, "grad_norm": 1.5966562032699585, "learning_rate": 3.0043455359494337e-05, "loss": 2.065, "step": 15164 }, { "epoch": 0.3993942586252304, "grad_norm": 1.6676326990127563, "learning_rate": 3.0042138530418752e-05, "loss": 1.5868, "step": 15165 }, { "epoch": 0.39942059520674217, "grad_norm": 2.480081558227539, "learning_rate": 3.0040821701343168e-05, "loss": 2.6109, "step": 15166 }, { "epoch": 0.3994469317882539, "grad_norm": 1.7020961046218872, "learning_rate": 3.0039504872267584e-05, "loss": 1.5241, "step": 15167 }, { "epoch": 0.3994732683697656, "grad_norm": 1.8415707349777222, "learning_rate": 3.0038188043191996e-05, "loss": 1.4608, "step": 15168 }, { "epoch": 0.39949960495127734, "grad_norm": 1.535257339477539, "learning_rate": 3.0036871214116408e-05, "loss": 1.8111, "step": 15169 }, { "epoch": 0.39952594153278903, "grad_norm": 4.163630485534668, "learning_rate": 3.003555438504082e-05, "loss": 1.5191, "step": 15170 }, { 
"epoch": 0.3995522781143008, "grad_norm": 1.6537508964538574, "learning_rate": 3.003423755596524e-05, "loss": 1.6542, "step": 15171 }, { "epoch": 0.39957861469581246, "grad_norm": 1.6200356483459473, "learning_rate": 3.003292072688965e-05, "loss": 1.5739, "step": 15172 }, { "epoch": 0.3996049512773242, "grad_norm": 1.6518728733062744, "learning_rate": 3.0031603897814064e-05, "loss": 1.218, "step": 15173 }, { "epoch": 0.39963128785883595, "grad_norm": 2.3867006301879883, "learning_rate": 3.0030287068738476e-05, "loss": 1.5534, "step": 15174 }, { "epoch": 0.39965762444034764, "grad_norm": 3.7688419818878174, "learning_rate": 3.0028970239662895e-05, "loss": 1.92, "step": 15175 }, { "epoch": 0.3996839610218594, "grad_norm": 4.719405651092529, "learning_rate": 3.0027653410587307e-05, "loss": 1.874, "step": 15176 }, { "epoch": 0.39971029760337107, "grad_norm": 3.285053014755249, "learning_rate": 3.0026336581511723e-05, "loss": 1.3879, "step": 15177 }, { "epoch": 0.3997366341848828, "grad_norm": 2.066946029663086, "learning_rate": 3.0025019752436135e-05, "loss": 1.4685, "step": 15178 }, { "epoch": 0.3997629707663945, "grad_norm": 1.6314629316329956, "learning_rate": 3.0023702923360547e-05, "loss": 1.3335, "step": 15179 }, { "epoch": 0.39978930734790624, "grad_norm": 2.5768818855285645, "learning_rate": 3.0022386094284966e-05, "loss": 2.0223, "step": 15180 }, { "epoch": 0.399815643929418, "grad_norm": 1.4182546138763428, "learning_rate": 3.0021069265209378e-05, "loss": 2.1824, "step": 15181 }, { "epoch": 0.3998419805109297, "grad_norm": 1.720131754875183, "learning_rate": 3.001975243613379e-05, "loss": 1.7485, "step": 15182 }, { "epoch": 0.3998683170924414, "grad_norm": 2.2177376747131348, "learning_rate": 3.0018435607058203e-05, "loss": 1.6336, "step": 15183 }, { "epoch": 0.3998946536739531, "grad_norm": 1.6714279651641846, "learning_rate": 3.001711877798262e-05, "loss": 2.0179, "step": 15184 }, { "epoch": 0.39992099025546485, "grad_norm": 2.0650711059570312, 
"learning_rate": 3.0015801948907034e-05, "loss": 1.8289, "step": 15185 }, { "epoch": 0.39994732683697654, "grad_norm": 1.6439447402954102, "learning_rate": 3.001448511983145e-05, "loss": 1.9966, "step": 15186 }, { "epoch": 0.3999736634184883, "grad_norm": 2.4915354251861572, "learning_rate": 3.0013168290755862e-05, "loss": 1.5785, "step": 15187 }, { "epoch": 0.4, "grad_norm": 1.703201413154602, "learning_rate": 3.0011851461680274e-05, "loss": 0.9085, "step": 15188 }, { "epoch": 0.4000263365815117, "grad_norm": 2.088538646697998, "learning_rate": 3.0010534632604693e-05, "loss": 1.737, "step": 15189 }, { "epoch": 0.40005267316302345, "grad_norm": 1.7217639684677124, "learning_rate": 3.0009217803529105e-05, "loss": 1.2694, "step": 15190 }, { "epoch": 0.40007900974453514, "grad_norm": 3.157841205596924, "learning_rate": 3.0007900974453517e-05, "loss": 0.91, "step": 15191 }, { "epoch": 0.4001053463260469, "grad_norm": 2.1383683681488037, "learning_rate": 3.000658414537793e-05, "loss": 2.0331, "step": 15192 }, { "epoch": 0.4001316829075586, "grad_norm": 2.416210174560547, "learning_rate": 3.0005267316302342e-05, "loss": 1.6528, "step": 15193 }, { "epoch": 0.4001580194890703, "grad_norm": 2.38584041595459, "learning_rate": 3.000395048722676e-05, "loss": 1.9925, "step": 15194 }, { "epoch": 0.40018435607058206, "grad_norm": 5.951260566711426, "learning_rate": 3.0002633658151176e-05, "loss": 2.1103, "step": 15195 }, { "epoch": 0.40021069265209375, "grad_norm": 2.055037021636963, "learning_rate": 3.000131682907559e-05, "loss": 1.4111, "step": 15196 }, { "epoch": 0.4002370292336055, "grad_norm": 1.6284364461898804, "learning_rate": 3e-05, "loss": 2.157, "step": 15197 }, { "epoch": 0.4002633658151172, "grad_norm": 1.3630836009979248, "learning_rate": 2.9998683170924413e-05, "loss": 1.8156, "step": 15198 }, { "epoch": 0.4002897023966289, "grad_norm": 1.895262360572815, "learning_rate": 2.9997366341848832e-05, "loss": 1.7106, "step": 15199 }, { "epoch": 0.40031603897814066, 
"grad_norm": 1.7643390893936157, "learning_rate": 2.9996049512773244e-05, "loss": 2.2111, "step": 15200 }, { "epoch": 0.40034237555965235, "grad_norm": 2.426508665084839, "learning_rate": 2.9994732683697656e-05, "loss": 0.8327, "step": 15201 }, { "epoch": 0.4003687121411641, "grad_norm": 2.0043630599975586, "learning_rate": 2.999341585462207e-05, "loss": 1.8773, "step": 15202 }, { "epoch": 0.4003950487226758, "grad_norm": 3.117605447769165, "learning_rate": 2.9992099025546484e-05, "loss": 0.627, "step": 15203 }, { "epoch": 0.4004213853041875, "grad_norm": 2.4989943504333496, "learning_rate": 2.99907821964709e-05, "loss": 1.8619, "step": 15204 }, { "epoch": 0.4004477218856992, "grad_norm": 3.1154608726501465, "learning_rate": 2.9989465367395315e-05, "loss": 0.9862, "step": 15205 }, { "epoch": 0.40047405846721096, "grad_norm": 1.5244344472885132, "learning_rate": 2.9988148538319728e-05, "loss": 2.279, "step": 15206 }, { "epoch": 0.4005003950487227, "grad_norm": 1.8436567783355713, "learning_rate": 2.998683170924414e-05, "loss": 1.3582, "step": 15207 }, { "epoch": 0.4005267316302344, "grad_norm": 2.4069817066192627, "learning_rate": 2.998551488016856e-05, "loss": 1.9272, "step": 15208 }, { "epoch": 0.40055306821174613, "grad_norm": 3.6580328941345215, "learning_rate": 2.998419805109297e-05, "loss": 1.7492, "step": 15209 }, { "epoch": 0.4005794047932578, "grad_norm": 2.241776943206787, "learning_rate": 2.9982881222017383e-05, "loss": 1.7254, "step": 15210 }, { "epoch": 0.40060574137476956, "grad_norm": 2.118487596511841, "learning_rate": 2.9981564392941795e-05, "loss": 1.2622, "step": 15211 }, { "epoch": 0.40063207795628125, "grad_norm": 1.6129636764526367, "learning_rate": 2.998024756386621e-05, "loss": 1.9641, "step": 15212 }, { "epoch": 0.400658414537793, "grad_norm": 2.9925856590270996, "learning_rate": 2.9978930734790627e-05, "loss": 1.3286, "step": 15213 }, { "epoch": 0.40068475111930474, "grad_norm": 2.6571152210235596, "learning_rate": 2.9977613905715042e-05, 
"loss": 2.051, "step": 15214 }, { "epoch": 0.4007110877008164, "grad_norm": 2.9185569286346436, "learning_rate": 2.9976297076639454e-05, "loss": 0.8716, "step": 15215 }, { "epoch": 0.40073742428232817, "grad_norm": 2.085442304611206, "learning_rate": 2.9974980247563867e-05, "loss": 1.7261, "step": 15216 }, { "epoch": 0.40076376086383986, "grad_norm": 1.9198839664459229, "learning_rate": 2.997366341848828e-05, "loss": 2.0768, "step": 15217 }, { "epoch": 0.4007900974453516, "grad_norm": 3.3180854320526123, "learning_rate": 2.9972346589412698e-05, "loss": 1.3378, "step": 15218 }, { "epoch": 0.4008164340268633, "grad_norm": 1.9435937404632568, "learning_rate": 2.997102976033711e-05, "loss": 2.0091, "step": 15219 }, { "epoch": 0.40084277060837503, "grad_norm": 1.6458818912506104, "learning_rate": 2.9969712931261522e-05, "loss": 1.8368, "step": 15220 }, { "epoch": 0.4008691071898868, "grad_norm": 3.5103559494018555, "learning_rate": 2.9968396102185934e-05, "loss": 0.669, "step": 15221 }, { "epoch": 0.40089544377139846, "grad_norm": 2.4205689430236816, "learning_rate": 2.9967079273110353e-05, "loss": 1.717, "step": 15222 }, { "epoch": 0.4009217803529102, "grad_norm": 4.01300573348999, "learning_rate": 2.9965762444034766e-05, "loss": 1.4283, "step": 15223 }, { "epoch": 0.4009481169344219, "grad_norm": 5.249772548675537, "learning_rate": 2.996444561495918e-05, "loss": 0.4765, "step": 15224 }, { "epoch": 0.40097445351593364, "grad_norm": 2.009230375289917, "learning_rate": 2.9963128785883593e-05, "loss": 2.1892, "step": 15225 }, { "epoch": 0.4010007900974453, "grad_norm": 1.9879140853881836, "learning_rate": 2.9961811956808006e-05, "loss": 1.4495, "step": 15226 }, { "epoch": 0.40102712667895707, "grad_norm": 1.5879912376403809, "learning_rate": 2.9960495127732425e-05, "loss": 1.3655, "step": 15227 }, { "epoch": 0.4010534632604688, "grad_norm": 3.1911466121673584, "learning_rate": 2.9959178298656837e-05, "loss": 1.2413, "step": 15228 }, { "epoch": 0.4010797998419805, 
"grad_norm": 1.970922827720642, "learning_rate": 2.995786146958125e-05, "loss": 0.3081, "step": 15229 }, { "epoch": 0.40110613642349224, "grad_norm": 2.2098400592803955, "learning_rate": 2.995654464050566e-05, "loss": 1.5208, "step": 15230 }, { "epoch": 0.40113247300500393, "grad_norm": 5.250458717346191, "learning_rate": 2.9955227811430077e-05, "loss": 2.4769, "step": 15231 }, { "epoch": 0.4011588095865157, "grad_norm": 2.2800164222717285, "learning_rate": 2.9953910982354492e-05, "loss": 1.538, "step": 15232 }, { "epoch": 0.4011851461680274, "grad_norm": 3.3152554035186768, "learning_rate": 2.9952594153278908e-05, "loss": 1.314, "step": 15233 }, { "epoch": 0.4012114827495391, "grad_norm": 2.258779764175415, "learning_rate": 2.995127732420332e-05, "loss": 1.8959, "step": 15234 }, { "epoch": 0.40123781933105085, "grad_norm": 2.482424259185791, "learning_rate": 2.9949960495127732e-05, "loss": 1.1497, "step": 15235 }, { "epoch": 0.40126415591256254, "grad_norm": 8.307866096496582, "learning_rate": 2.9948643666052145e-05, "loss": 1.9556, "step": 15236 }, { "epoch": 0.4012904924940743, "grad_norm": 2.1465039253234863, "learning_rate": 2.9947326836976564e-05, "loss": 1.9607, "step": 15237 }, { "epoch": 0.40131682907558597, "grad_norm": 2.1289169788360596, "learning_rate": 2.9946010007900976e-05, "loss": 1.322, "step": 15238 }, { "epoch": 0.4013431656570977, "grad_norm": 2.1380739212036133, "learning_rate": 2.9944693178825388e-05, "loss": 2.5748, "step": 15239 }, { "epoch": 0.40136950223860945, "grad_norm": 2.718348264694214, "learning_rate": 2.9943376349749804e-05, "loss": 1.8643, "step": 15240 }, { "epoch": 0.40139583882012114, "grad_norm": 1.6842036247253418, "learning_rate": 2.994205952067422e-05, "loss": 1.9707, "step": 15241 }, { "epoch": 0.4014221754016329, "grad_norm": 1.3637287616729736, "learning_rate": 2.9940742691598635e-05, "loss": 1.8458, "step": 15242 }, { "epoch": 0.4014485119831446, "grad_norm": 1.6709080934524536, "learning_rate": 2.9939425862523047e-05, 
"loss": 1.5773, "step": 15243 }, { "epoch": 0.4014748485646563, "grad_norm": 1.8406020402908325, "learning_rate": 2.993810903344746e-05, "loss": 1.7483, "step": 15244 }, { "epoch": 0.401501185146168, "grad_norm": 2.2115960121154785, "learning_rate": 2.993679220437187e-05, "loss": 1.9933, "step": 15245 }, { "epoch": 0.40152752172767975, "grad_norm": 2.0160882472991943, "learning_rate": 2.993547537529629e-05, "loss": 2.0755, "step": 15246 }, { "epoch": 0.4015538583091915, "grad_norm": 1.9304190874099731, "learning_rate": 2.9934158546220703e-05, "loss": 2.1465, "step": 15247 }, { "epoch": 0.4015801948907032, "grad_norm": 1.6468080282211304, "learning_rate": 2.9932841717145115e-05, "loss": 2.1391, "step": 15248 }, { "epoch": 0.4016065314722149, "grad_norm": 2.487684488296509, "learning_rate": 2.9931524888069527e-05, "loss": 1.9947, "step": 15249 }, { "epoch": 0.4016328680537266, "grad_norm": 2.044767141342163, "learning_rate": 2.9930208058993943e-05, "loss": 2.3249, "step": 15250 }, { "epoch": 0.40165920463523835, "grad_norm": 2.7380313873291016, "learning_rate": 2.992889122991836e-05, "loss": 1.1538, "step": 15251 }, { "epoch": 0.40168554121675004, "grad_norm": 2.301825523376465, "learning_rate": 2.9927574400842774e-05, "loss": 2.036, "step": 15252 }, { "epoch": 0.4017118777982618, "grad_norm": 1.6190699338912964, "learning_rate": 2.9926257571767186e-05, "loss": 0.4114, "step": 15253 }, { "epoch": 0.40173821437977353, "grad_norm": 2.370206594467163, "learning_rate": 2.99249407426916e-05, "loss": 1.871, "step": 15254 }, { "epoch": 0.4017645509612852, "grad_norm": 2.315791130065918, "learning_rate": 2.9923623913616017e-05, "loss": 2.0115, "step": 15255 }, { "epoch": 0.40179088754279696, "grad_norm": 2.0392849445343018, "learning_rate": 2.992230708454043e-05, "loss": 1.7318, "step": 15256 }, { "epoch": 0.40181722412430865, "grad_norm": 1.7041723728179932, "learning_rate": 2.9920990255464842e-05, "loss": 1.7035, "step": 15257 }, { "epoch": 0.4018435607058204, "grad_norm": 
2.8666014671325684, "learning_rate": 2.9919673426389254e-05, "loss": 1.525, "step": 15258 }, { "epoch": 0.4018698972873321, "grad_norm": 1.810825228691101, "learning_rate": 2.991835659731367e-05, "loss": 1.3962, "step": 15259 }, { "epoch": 0.4018962338688438, "grad_norm": 1.681111216545105, "learning_rate": 2.9917039768238085e-05, "loss": 1.0429, "step": 15260 }, { "epoch": 0.40192257045035557, "grad_norm": 1.6370668411254883, "learning_rate": 2.99157229391625e-05, "loss": 0.6586, "step": 15261 }, { "epoch": 0.40194890703186725, "grad_norm": 1.371125340461731, "learning_rate": 2.9914406110086913e-05, "loss": 1.4529, "step": 15262 }, { "epoch": 0.401975243613379, "grad_norm": 1.7869479656219482, "learning_rate": 2.9913089281011325e-05, "loss": 1.4347, "step": 15263 }, { "epoch": 0.4020015801948907, "grad_norm": 4.511172771453857, "learning_rate": 2.9911772451935737e-05, "loss": 1.6509, "step": 15264 }, { "epoch": 0.4020279167764024, "grad_norm": 1.900025725364685, "learning_rate": 2.9910455622860156e-05, "loss": 1.9286, "step": 15265 }, { "epoch": 0.4020542533579141, "grad_norm": 1.5888254642486572, "learning_rate": 2.990913879378457e-05, "loss": 1.3138, "step": 15266 }, { "epoch": 0.40208058993942586, "grad_norm": 1.9465323686599731, "learning_rate": 2.990782196470898e-05, "loss": 0.7556, "step": 15267 }, { "epoch": 0.4021069265209376, "grad_norm": 6.241235733032227, "learning_rate": 2.9906505135633396e-05, "loss": 1.1109, "step": 15268 }, { "epoch": 0.4021332631024493, "grad_norm": 1.9666777849197388, "learning_rate": 2.990518830655781e-05, "loss": 2.4749, "step": 15269 }, { "epoch": 0.40215959968396103, "grad_norm": 2.5248916149139404, "learning_rate": 2.9903871477482228e-05, "loss": 2.0952, "step": 15270 }, { "epoch": 0.4021859362654727, "grad_norm": 1.5898040533065796, "learning_rate": 2.990255464840664e-05, "loss": 2.0229, "step": 15271 }, { "epoch": 0.40221227284698446, "grad_norm": 1.7938041687011719, "learning_rate": 2.9901237819331052e-05, "loss": 1.8544, 
"step": 15272 }, { "epoch": 0.4022386094284962, "grad_norm": 1.9576348066329956, "learning_rate": 2.9899920990255464e-05, "loss": 2.3689, "step": 15273 }, { "epoch": 0.4022649460100079, "grad_norm": 2.5532422065734863, "learning_rate": 2.9898604161179883e-05, "loss": 1.3425, "step": 15274 }, { "epoch": 0.40229128259151964, "grad_norm": 2.293853282928467, "learning_rate": 2.9897287332104295e-05, "loss": 0.9007, "step": 15275 }, { "epoch": 0.4023176191730313, "grad_norm": 3.9450156688690186, "learning_rate": 2.9895970503028708e-05, "loss": 1.0792, "step": 15276 }, { "epoch": 0.40234395575454307, "grad_norm": 5.532327175140381, "learning_rate": 2.989465367395312e-05, "loss": 1.6955, "step": 15277 }, { "epoch": 0.40237029233605476, "grad_norm": 1.4562660455703735, "learning_rate": 2.9893336844877535e-05, "loss": 2.1516, "step": 15278 }, { "epoch": 0.4023966289175665, "grad_norm": 2.0622482299804688, "learning_rate": 2.989202001580195e-05, "loss": 1.4577, "step": 15279 }, { "epoch": 0.40242296549907824, "grad_norm": 1.4985934495925903, "learning_rate": 2.9890703186726367e-05, "loss": 0.6312, "step": 15280 }, { "epoch": 0.40244930208058993, "grad_norm": 2.648171901702881, "learning_rate": 2.988938635765078e-05, "loss": 2.422, "step": 15281 }, { "epoch": 0.4024756386621017, "grad_norm": 1.9110594987869263, "learning_rate": 2.988806952857519e-05, "loss": 1.6282, "step": 15282 }, { "epoch": 0.40250197524361336, "grad_norm": 1.717250943183899, "learning_rate": 2.9886752699499603e-05, "loss": 1.9295, "step": 15283 }, { "epoch": 0.4025283118251251, "grad_norm": 2.044684648513794, "learning_rate": 2.9885435870424022e-05, "loss": 1.5773, "step": 15284 }, { "epoch": 0.4025546484066368, "grad_norm": 2.2607014179229736, "learning_rate": 2.9884119041348434e-05, "loss": 2.1883, "step": 15285 }, { "epoch": 0.40258098498814854, "grad_norm": 7.618353843688965, "learning_rate": 2.9882802212272847e-05, "loss": 1.602, "step": 15286 }, { "epoch": 0.4026073215696603, "grad_norm": 
1.863555908203125, "learning_rate": 2.9881485383197262e-05, "loss": 2.0446, "step": 15287 }, { "epoch": 0.40263365815117197, "grad_norm": 3.020829200744629, "learning_rate": 2.9880168554121678e-05, "loss": 1.3902, "step": 15288 }, { "epoch": 0.4026599947326837, "grad_norm": 1.7568062543869019, "learning_rate": 2.9878851725046093e-05, "loss": 2.0562, "step": 15289 }, { "epoch": 0.4026863313141954, "grad_norm": 2.7996366024017334, "learning_rate": 2.9877534895970506e-05, "loss": 0.5481, "step": 15290 }, { "epoch": 0.40271266789570714, "grad_norm": 3.4967260360717773, "learning_rate": 2.9876218066894918e-05, "loss": 1.2692, "step": 15291 }, { "epoch": 0.40273900447721883, "grad_norm": 1.4886181354522705, "learning_rate": 2.987490123781933e-05, "loss": 1.8797, "step": 15292 }, { "epoch": 0.4027653410587306, "grad_norm": 3.722468137741089, "learning_rate": 2.987358440874375e-05, "loss": 0.983, "step": 15293 }, { "epoch": 0.4027916776402423, "grad_norm": 2.472588539123535, "learning_rate": 2.987226757966816e-05, "loss": 1.3117, "step": 15294 }, { "epoch": 0.402818014221754, "grad_norm": 5.674534320831299, "learning_rate": 2.9870950750592573e-05, "loss": 1.8303, "step": 15295 }, { "epoch": 0.40284435080326575, "grad_norm": 2.336442708969116, "learning_rate": 2.9869633921516986e-05, "loss": 2.1713, "step": 15296 }, { "epoch": 0.40287068738477744, "grad_norm": 3.656493663787842, "learning_rate": 2.98683170924414e-05, "loss": 1.4337, "step": 15297 }, { "epoch": 0.4028970239662892, "grad_norm": 1.4978317022323608, "learning_rate": 2.9867000263365817e-05, "loss": 1.9913, "step": 15298 }, { "epoch": 0.40292336054780087, "grad_norm": 2.005683183670044, "learning_rate": 2.9865683434290232e-05, "loss": 1.8837, "step": 15299 }, { "epoch": 0.4029496971293126, "grad_norm": 2.178419351577759, "learning_rate": 2.9864366605214645e-05, "loss": 0.7118, "step": 15300 }, { "epoch": 0.40297603371082436, "grad_norm": 2.4660890102386475, "learning_rate": 2.9863049776139057e-05, "loss": 1.3228, 
"step": 15301 }, { "epoch": 0.40300237029233604, "grad_norm": 2.8309640884399414, "learning_rate": 2.986173294706347e-05, "loss": 1.9071, "step": 15302 }, { "epoch": 0.4030287068738478, "grad_norm": 2.135223388671875, "learning_rate": 2.9860416117987888e-05, "loss": 1.7802, "step": 15303 }, { "epoch": 0.4030550434553595, "grad_norm": 2.9628942012786865, "learning_rate": 2.98590992889123e-05, "loss": 1.1917, "step": 15304 }, { "epoch": 0.4030813800368712, "grad_norm": 2.924274206161499, "learning_rate": 2.9857782459836713e-05, "loss": 0.712, "step": 15305 }, { "epoch": 0.40310771661838296, "grad_norm": 1.6790590286254883, "learning_rate": 2.9856465630761128e-05, "loss": 1.4436, "step": 15306 }, { "epoch": 0.40313405319989465, "grad_norm": 1.6263816356658936, "learning_rate": 2.9855148801685544e-05, "loss": 1.7542, "step": 15307 }, { "epoch": 0.4031603897814064, "grad_norm": 1.7824804782867432, "learning_rate": 2.985383197260996e-05, "loss": 2.2186, "step": 15308 }, { "epoch": 0.4031867263629181, "grad_norm": 1.734452724456787, "learning_rate": 2.985251514353437e-05, "loss": 0.43, "step": 15309 }, { "epoch": 0.4032130629444298, "grad_norm": 2.0390615463256836, "learning_rate": 2.9851198314458784e-05, "loss": 1.274, "step": 15310 }, { "epoch": 0.4032393995259415, "grad_norm": 2.4935436248779297, "learning_rate": 2.9849881485383196e-05, "loss": 0.32, "step": 15311 }, { "epoch": 0.40326573610745325, "grad_norm": 2.8859407901763916, "learning_rate": 2.9848564656307615e-05, "loss": 1.7137, "step": 15312 }, { "epoch": 0.403292072688965, "grad_norm": 2.611424446105957, "learning_rate": 2.9847247827232027e-05, "loss": 1.5125, "step": 15313 }, { "epoch": 0.4033184092704767, "grad_norm": 1.930966854095459, "learning_rate": 2.984593099815644e-05, "loss": 1.6543, "step": 15314 }, { "epoch": 0.40334474585198843, "grad_norm": 2.146080255508423, "learning_rate": 2.9844614169080855e-05, "loss": 1.751, "step": 15315 }, { "epoch": 0.4033710824335001, "grad_norm": 2.155388355255127, 
"learning_rate": 2.9843297340005267e-05, "loss": 1.1706, "step": 15316 }, { "epoch": 0.40339741901501186, "grad_norm": 3.0469207763671875, "learning_rate": 2.9841980510929686e-05, "loss": 2.4527, "step": 15317 }, { "epoch": 0.40342375559652355, "grad_norm": 2.508324146270752, "learning_rate": 2.98406636818541e-05, "loss": 2.2702, "step": 15318 }, { "epoch": 0.4034500921780353, "grad_norm": 2.140326976776123, "learning_rate": 2.983934685277851e-05, "loss": 1.7395, "step": 15319 }, { "epoch": 0.40347642875954703, "grad_norm": 2.279519557952881, "learning_rate": 2.9838030023702923e-05, "loss": 1.9951, "step": 15320 }, { "epoch": 0.4035027653410587, "grad_norm": 2.0406579971313477, "learning_rate": 2.9836713194627342e-05, "loss": 2.4102, "step": 15321 }, { "epoch": 0.40352910192257047, "grad_norm": 1.907092571258545, "learning_rate": 2.9835396365551754e-05, "loss": 1.8873, "step": 15322 }, { "epoch": 0.40355543850408215, "grad_norm": 2.1900689601898193, "learning_rate": 2.9834079536476166e-05, "loss": 1.9562, "step": 15323 }, { "epoch": 0.4035817750855939, "grad_norm": 1.8613706827163696, "learning_rate": 2.983276270740058e-05, "loss": 1.9495, "step": 15324 }, { "epoch": 0.4036081116671056, "grad_norm": 1.395447015762329, "learning_rate": 2.9831445878324994e-05, "loss": 1.6595, "step": 15325 }, { "epoch": 0.40363444824861733, "grad_norm": 1.4495803117752075, "learning_rate": 2.983012904924941e-05, "loss": 1.687, "step": 15326 }, { "epoch": 0.40366078483012907, "grad_norm": 3.6437270641326904, "learning_rate": 2.9828812220173825e-05, "loss": 1.4631, "step": 15327 }, { "epoch": 0.40368712141164076, "grad_norm": 1.7573702335357666, "learning_rate": 2.9827495391098237e-05, "loss": 1.9854, "step": 15328 }, { "epoch": 0.4037134579931525, "grad_norm": null, "learning_rate": 2.9827495391098237e-05, "loss": 2.0548, "step": 15329 }, { "epoch": 0.4037397945746642, "grad_norm": 2.279595136642456, "learning_rate": 2.982617856202265e-05, "loss": 1.5082, "step": 15330 }, { "epoch": 
0.40376613115617593, "grad_norm": 1.6415857076644897, "learning_rate": 2.9824861732947062e-05, "loss": 1.8043, "step": 15331 }, { "epoch": 0.4037924677376876, "grad_norm": 3.3703339099884033, "learning_rate": 2.982354490387148e-05, "loss": 1.8585, "step": 15332 }, { "epoch": 0.40381880431919936, "grad_norm": 2.0637922286987305, "learning_rate": 2.9822228074795893e-05, "loss": 1.8432, "step": 15333 }, { "epoch": 0.4038451409007111, "grad_norm": 3.3325417041778564, "learning_rate": 2.9820911245720305e-05, "loss": 1.4587, "step": 15334 }, { "epoch": 0.4038714774822228, "grad_norm": 1.8212943077087402, "learning_rate": 2.981959441664472e-05, "loss": 2.1724, "step": 15335 }, { "epoch": 0.40389781406373454, "grad_norm": 2.6107821464538574, "learning_rate": 2.9818277587569133e-05, "loss": 2.114, "step": 15336 }, { "epoch": 0.4039241506452462, "grad_norm": 2.176142454147339, "learning_rate": 2.9816960758493552e-05, "loss": 1.7516, "step": 15337 }, { "epoch": 0.40395048722675797, "grad_norm": 3.1308257579803467, "learning_rate": 2.9815643929417964e-05, "loss": 1.9061, "step": 15338 }, { "epoch": 0.4039768238082697, "grad_norm": 1.6352956295013428, "learning_rate": 2.9814327100342376e-05, "loss": 1.4167, "step": 15339 }, { "epoch": 0.4040031603897814, "grad_norm": 2.701352834701538, "learning_rate": 2.981301027126679e-05, "loss": 1.2969, "step": 15340 }, { "epoch": 0.40402949697129314, "grad_norm": 1.825621485710144, "learning_rate": 2.9811693442191208e-05, "loss": 2.1002, "step": 15341 }, { "epoch": 0.40405583355280483, "grad_norm": 1.8138121366500854, "learning_rate": 2.981037661311562e-05, "loss": 0.303, "step": 15342 }, { "epoch": 0.4040821701343166, "grad_norm": 5.9109272956848145, "learning_rate": 2.9809059784040032e-05, "loss": 1.5597, "step": 15343 }, { "epoch": 0.40410850671582826, "grad_norm": 3.0749809741973877, "learning_rate": 2.9807742954964448e-05, "loss": 1.9889, "step": 15344 }, { "epoch": 0.40413484329734, "grad_norm": 3.8184762001037598, "learning_rate": 
2.980642612588886e-05, "loss": 1.1444, "step": 15345 }, { "epoch": 0.40416117987885175, "grad_norm": 1.7192103862762451, "learning_rate": 2.980510929681328e-05, "loss": 2.154, "step": 15346 }, { "epoch": 0.40418751646036344, "grad_norm": 3.9199676513671875, "learning_rate": 2.980379246773769e-05, "loss": 1.6701, "step": 15347 }, { "epoch": 0.4042138530418752, "grad_norm": 3.170886516571045, "learning_rate": 2.9802475638662103e-05, "loss": 2.2398, "step": 15348 }, { "epoch": 0.40424018962338687, "grad_norm": 2.3458900451660156, "learning_rate": 2.9801158809586515e-05, "loss": 1.8996, "step": 15349 }, { "epoch": 0.4042665262048986, "grad_norm": 3.924548387527466, "learning_rate": 2.9799841980510928e-05, "loss": 0.5476, "step": 15350 }, { "epoch": 0.4042928627864103, "grad_norm": 1.8035708665847778, "learning_rate": 2.9798525151435347e-05, "loss": 2.1935, "step": 15351 }, { "epoch": 0.40431919936792204, "grad_norm": 1.8139768838882446, "learning_rate": 2.979720832235976e-05, "loss": 1.7111, "step": 15352 }, { "epoch": 0.4043455359494338, "grad_norm": 1.618435025215149, "learning_rate": 2.979589149328417e-05, "loss": 1.5709, "step": 15353 }, { "epoch": 0.4043718725309455, "grad_norm": 3.874896764755249, "learning_rate": 2.9794574664208587e-05, "loss": 2.0537, "step": 15354 }, { "epoch": 0.4043982091124572, "grad_norm": 1.975533366203308, "learning_rate": 2.9793257835133002e-05, "loss": 2.1022, "step": 15355 }, { "epoch": 0.4044245456939689, "grad_norm": 3.2438900470733643, "learning_rate": 2.9791941006057418e-05, "loss": 0.9999, "step": 15356 }, { "epoch": 0.40445088227548065, "grad_norm": 2.1629087924957275, "learning_rate": 2.979062417698183e-05, "loss": 1.9705, "step": 15357 }, { "epoch": 0.40447721885699234, "grad_norm": 2.615725040435791, "learning_rate": 2.9789307347906242e-05, "loss": 1.5595, "step": 15358 }, { "epoch": 0.4045035554385041, "grad_norm": 1.3952604532241821, "learning_rate": 2.9787990518830654e-05, "loss": 1.7454, "step": 15359 }, { "epoch": 
0.4045298920200158, "grad_norm": 1.5876686573028564, "learning_rate": 2.9786673689755073e-05, "loss": 1.5213, "step": 15360 }, { "epoch": 0.4045562286015275, "grad_norm": 2.065218448638916, "learning_rate": 2.9785356860679486e-05, "loss": 1.4468, "step": 15361 }, { "epoch": 0.40458256518303926, "grad_norm": 2.1356821060180664, "learning_rate": 2.9784040031603898e-05, "loss": 2.152, "step": 15362 }, { "epoch": 0.40460890176455094, "grad_norm": 3.850576162338257, "learning_rate": 2.9782723202528313e-05, "loss": 1.0064, "step": 15363 }, { "epoch": 0.4046352383460627, "grad_norm": 2.0404551029205322, "learning_rate": 2.9781406373452726e-05, "loss": 2.4963, "step": 15364 }, { "epoch": 0.4046615749275744, "grad_norm": 2.5017809867858887, "learning_rate": 2.9780089544377145e-05, "loss": 1.1651, "step": 15365 }, { "epoch": 0.4046879115090861, "grad_norm": 2.846797466278076, "learning_rate": 2.9778772715301557e-05, "loss": 1.5911, "step": 15366 }, { "epoch": 0.40471424809059786, "grad_norm": 3.607956647872925, "learning_rate": 2.977745588622597e-05, "loss": 1.0438, "step": 15367 }, { "epoch": 0.40474058467210955, "grad_norm": 3.209007501602173, "learning_rate": 2.977613905715038e-05, "loss": 1.7777, "step": 15368 }, { "epoch": 0.4047669212536213, "grad_norm": 3.2647480964660645, "learning_rate": 2.9774822228074794e-05, "loss": 0.52, "step": 15369 }, { "epoch": 0.404793257835133, "grad_norm": 1.669777274131775, "learning_rate": 2.9773505398999213e-05, "loss": 2.3838, "step": 15370 }, { "epoch": 0.4048195944166447, "grad_norm": 2.823716878890991, "learning_rate": 2.9772188569923625e-05, "loss": 1.998, "step": 15371 }, { "epoch": 0.40484593099815647, "grad_norm": 2.0487589836120605, "learning_rate": 2.977087174084804e-05, "loss": 1.6007, "step": 15372 }, { "epoch": 0.40487226757966815, "grad_norm": 6.423557281494141, "learning_rate": 2.9769554911772453e-05, "loss": 1.6826, "step": 15373 }, { "epoch": 0.4048986041611799, "grad_norm": 2.6650333404541016, "learning_rate": 
2.976823808269687e-05, "loss": 1.6367, "step": 15374 }, { "epoch": 0.4049249407426916, "grad_norm": 2.885716438293457, "learning_rate": 2.9766921253621284e-05, "loss": 1.8185, "step": 15375 }, { "epoch": 0.40495127732420333, "grad_norm": 3.1580519676208496, "learning_rate": 2.9765604424545696e-05, "loss": 1.6559, "step": 15376 }, { "epoch": 0.404977613905715, "grad_norm": 1.8138922452926636, "learning_rate": 2.9764287595470108e-05, "loss": 1.6343, "step": 15377 }, { "epoch": 0.40500395048722676, "grad_norm": 1.733119010925293, "learning_rate": 2.976297076639452e-05, "loss": 1.851, "step": 15378 }, { "epoch": 0.4050302870687385, "grad_norm": 5.5334367752075195, "learning_rate": 2.976165393731894e-05, "loss": 2.0338, "step": 15379 }, { "epoch": 0.4050566236502502, "grad_norm": 2.062154531478882, "learning_rate": 2.976033710824335e-05, "loss": 1.5095, "step": 15380 }, { "epoch": 0.40508296023176193, "grad_norm": 2.687317132949829, "learning_rate": 2.9759020279167764e-05, "loss": 0.8981, "step": 15381 }, { "epoch": 0.4051092968132736, "grad_norm": 2.1819286346435547, "learning_rate": 2.975770345009218e-05, "loss": 1.4309, "step": 15382 }, { "epoch": 0.40513563339478537, "grad_norm": 1.8535792827606201, "learning_rate": 2.975638662101659e-05, "loss": 1.2911, "step": 15383 }, { "epoch": 0.40516196997629705, "grad_norm": 3.983138084411621, "learning_rate": 2.975506979194101e-05, "loss": 1.5396, "step": 15384 }, { "epoch": 0.4051883065578088, "grad_norm": 3.4726104736328125, "learning_rate": 2.9753752962865423e-05, "loss": 1.1441, "step": 15385 }, { "epoch": 0.40521464313932054, "grad_norm": 1.7042008638381958, "learning_rate": 2.9752436133789835e-05, "loss": 1.8604, "step": 15386 }, { "epoch": 0.40524097972083223, "grad_norm": 3.164184331893921, "learning_rate": 2.9751119304714247e-05, "loss": 1.6527, "step": 15387 }, { "epoch": 0.40526731630234397, "grad_norm": 1.9780757427215576, "learning_rate": 2.9749802475638666e-05, "loss": 1.3688, "step": 15388 }, { "epoch": 
0.40529365288385566, "grad_norm": 2.3368895053863525, "learning_rate": 2.974848564656308e-05, "loss": 0.4079, "step": 15389 }, { "epoch": 0.4053199894653674, "grad_norm": 1.8770625591278076, "learning_rate": 2.974716881748749e-05, "loss": 0.9074, "step": 15390 }, { "epoch": 0.4053463260468791, "grad_norm": 2.0360267162323, "learning_rate": 2.9745851988411906e-05, "loss": 1.4624, "step": 15391 }, { "epoch": 0.40537266262839083, "grad_norm": 4.387003421783447, "learning_rate": 2.974453515933632e-05, "loss": 1.8201, "step": 15392 }, { "epoch": 0.4053989992099026, "grad_norm": 3.6480557918548584, "learning_rate": 2.9743218330260737e-05, "loss": 2.5745, "step": 15393 }, { "epoch": 0.40542533579141427, "grad_norm": 3.384028673171997, "learning_rate": 2.974190150118515e-05, "loss": 1.6655, "step": 15394 }, { "epoch": 0.405451672372926, "grad_norm": 2.7470521926879883, "learning_rate": 2.9740584672109562e-05, "loss": 1.9622, "step": 15395 }, { "epoch": 0.4054780089544377, "grad_norm": 2.243062734603882, "learning_rate": 2.9739267843033974e-05, "loss": 1.7063, "step": 15396 }, { "epoch": 0.40550434553594944, "grad_norm": 4.0135602951049805, "learning_rate": 2.9737951013958386e-05, "loss": 1.247, "step": 15397 }, { "epoch": 0.4055306821174611, "grad_norm": 1.9372515678405762, "learning_rate": 2.9736634184882805e-05, "loss": 1.7156, "step": 15398 }, { "epoch": 0.40555701869897287, "grad_norm": 1.9279767274856567, "learning_rate": 2.9735317355807217e-05, "loss": 1.7418, "step": 15399 }, { "epoch": 0.4055833552804846, "grad_norm": 2.5324831008911133, "learning_rate": 2.973400052673163e-05, "loss": 1.3552, "step": 15400 }, { "epoch": 0.4056096918619963, "grad_norm": 3.1103415489196777, "learning_rate": 2.9732683697656045e-05, "loss": 1.5672, "step": 15401 }, { "epoch": 0.40563602844350805, "grad_norm": 1.7157319784164429, "learning_rate": 2.9731366868580457e-05, "loss": 2.0582, "step": 15402 }, { "epoch": 0.40566236502501973, "grad_norm": 1.8707804679870605, "learning_rate": 
2.9730050039504876e-05, "loss": 2.1754, "step": 15403 }, { "epoch": 0.4056887016065315, "grad_norm": 3.8068370819091797, "learning_rate": 2.972873321042929e-05, "loss": 1.8343, "step": 15404 }, { "epoch": 0.40571503818804316, "grad_norm": 5.422422409057617, "learning_rate": 2.97274163813537e-05, "loss": 1.6686, "step": 15405 }, { "epoch": 0.4057413747695549, "grad_norm": 5.505545616149902, "learning_rate": 2.9726099552278113e-05, "loss": 2.467, "step": 15406 }, { "epoch": 0.40576771135106665, "grad_norm": 1.986251711845398, "learning_rate": 2.9724782723202532e-05, "loss": 2.0649, "step": 15407 }, { "epoch": 0.40579404793257834, "grad_norm": 2.211836338043213, "learning_rate": 2.9723465894126944e-05, "loss": 0.5509, "step": 15408 }, { "epoch": 0.4058203845140901, "grad_norm": 1.5220776796340942, "learning_rate": 2.9722149065051356e-05, "loss": 1.2357, "step": 15409 }, { "epoch": 0.40584672109560177, "grad_norm": 3.980742931365967, "learning_rate": 2.9720832235975772e-05, "loss": 1.2308, "step": 15410 }, { "epoch": 0.4058730576771135, "grad_norm": 2.5117199420928955, "learning_rate": 2.9719515406900184e-05, "loss": 2.5167, "step": 15411 }, { "epoch": 0.40589939425862526, "grad_norm": 3.5928966999053955, "learning_rate": 2.9718198577824603e-05, "loss": 1.5631, "step": 15412 }, { "epoch": 0.40592573084013694, "grad_norm": 3.316068649291992, "learning_rate": 2.9716881748749015e-05, "loss": 2.32, "step": 15413 }, { "epoch": 0.4059520674216487, "grad_norm": 3.653067111968994, "learning_rate": 2.9715564919673428e-05, "loss": 1.0957, "step": 15414 }, { "epoch": 0.4059784040031604, "grad_norm": 1.9494534730911255, "learning_rate": 2.971424809059784e-05, "loss": 1.9805, "step": 15415 }, { "epoch": 0.4060047405846721, "grad_norm": 2.3916592597961426, "learning_rate": 2.9712931261522252e-05, "loss": 0.4726, "step": 15416 }, { "epoch": 0.4060310771661838, "grad_norm": 1.8614389896392822, "learning_rate": 2.971161443244667e-05, "loss": 2.1196, "step": 15417 }, { "epoch": 
0.40605741374769555, "grad_norm": 2.3933017253875732, "learning_rate": 2.9710297603371083e-05, "loss": 1.8667, "step": 15418 }, { "epoch": 0.4060837503292073, "grad_norm": 2.385232448577881, "learning_rate": 2.97089807742955e-05, "loss": 1.5811, "step": 15419 }, { "epoch": 0.406110086910719, "grad_norm": 3.0019991397857666, "learning_rate": 2.970766394521991e-05, "loss": 1.588, "step": 15420 }, { "epoch": 0.4061364234922307, "grad_norm": 2.4925224781036377, "learning_rate": 2.970634711614433e-05, "loss": 0.9186, "step": 15421 }, { "epoch": 0.4061627600737424, "grad_norm": 3.316859483718872, "learning_rate": 2.9705030287068742e-05, "loss": 1.296, "step": 15422 }, { "epoch": 0.40618909665525416, "grad_norm": 2.2114908695220947, "learning_rate": 2.9703713457993154e-05, "loss": 2.0354, "step": 15423 }, { "epoch": 0.40621543323676584, "grad_norm": 4.095826148986816, "learning_rate": 2.9702396628917567e-05, "loss": 1.5299, "step": 15424 }, { "epoch": 0.4062417698182776, "grad_norm": 1.9702955484390259, "learning_rate": 2.970107979984198e-05, "loss": 1.6186, "step": 15425 }, { "epoch": 0.40626810639978933, "grad_norm": 2.061579704284668, "learning_rate": 2.9699762970766398e-05, "loss": 1.4586, "step": 15426 }, { "epoch": 0.406294442981301, "grad_norm": 1.6376806497573853, "learning_rate": 2.969844614169081e-05, "loss": 1.5134, "step": 15427 }, { "epoch": 0.40632077956281276, "grad_norm": 2.303356885910034, "learning_rate": 2.9697129312615222e-05, "loss": 0.8613, "step": 15428 }, { "epoch": 0.40634711614432445, "grad_norm": 1.7938517332077026, "learning_rate": 2.9695812483539638e-05, "loss": 1.7145, "step": 15429 }, { "epoch": 0.4063734527258362, "grad_norm": 1.947532057762146, "learning_rate": 2.969449565446405e-05, "loss": 1.7949, "step": 15430 }, { "epoch": 0.4063997893073479, "grad_norm": 1.6662968397140503, "learning_rate": 2.969317882538847e-05, "loss": 1.8401, "step": 15431 }, { "epoch": 0.4064261258888596, "grad_norm": 4.729133129119873, "learning_rate": 
2.969186199631288e-05, "loss": 0.8145, "step": 15432 }, { "epoch": 0.40645246247037137, "grad_norm": 5.384486198425293, "learning_rate": 2.9690545167237294e-05, "loss": 2.2228, "step": 15433 }, { "epoch": 0.40647879905188306, "grad_norm": 1.9077547788619995, "learning_rate": 2.9689228338161706e-05, "loss": 1.8364, "step": 15434 }, { "epoch": 0.4065051356333948, "grad_norm": 2.904686689376831, "learning_rate": 2.9687911509086118e-05, "loss": 1.8479, "step": 15435 }, { "epoch": 0.4065314722149065, "grad_norm": 2.0099730491638184, "learning_rate": 2.9686594680010537e-05, "loss": 1.8528, "step": 15436 }, { "epoch": 0.40655780879641823, "grad_norm": 5.712948322296143, "learning_rate": 2.968527785093495e-05, "loss": 0.7206, "step": 15437 }, { "epoch": 0.4065841453779299, "grad_norm": 3.6986382007598877, "learning_rate": 2.9683961021859365e-05, "loss": 2.4564, "step": 15438 }, { "epoch": 0.40661048195944166, "grad_norm": 3.9545769691467285, "learning_rate": 2.9682644192783777e-05, "loss": 1.4015, "step": 15439 }, { "epoch": 0.4066368185409534, "grad_norm": 3.0367674827575684, "learning_rate": 2.9681327363708196e-05, "loss": 1.123, "step": 15440 }, { "epoch": 0.4066631551224651, "grad_norm": 2.2434582710266113, "learning_rate": 2.9680010534632608e-05, "loss": 1.2418, "step": 15441 }, { "epoch": 0.40668949170397684, "grad_norm": 2.2279560565948486, "learning_rate": 2.967869370555702e-05, "loss": 1.505, "step": 15442 }, { "epoch": 0.4067158282854885, "grad_norm": 1.5036253929138184, "learning_rate": 2.9677376876481433e-05, "loss": 1.5769, "step": 15443 }, { "epoch": 0.40674216486700027, "grad_norm": 2.491455316543579, "learning_rate": 2.9676060047405845e-05, "loss": 1.724, "step": 15444 }, { "epoch": 0.406768501448512, "grad_norm": 3.2721242904663086, "learning_rate": 2.9674743218330264e-05, "loss": 1.6572, "step": 15445 }, { "epoch": 0.4067948380300237, "grad_norm": 2.012260913848877, "learning_rate": 2.9673426389254676e-05, "loss": 2.1404, "step": 15446 }, { "epoch": 
0.40682117461153544, "grad_norm": 1.5121233463287354, "learning_rate": 2.967210956017909e-05, "loss": 1.7112, "step": 15447 }, { "epoch": 0.40684751119304713, "grad_norm": 1.834301233291626, "learning_rate": 2.9670792731103504e-05, "loss": 1.4894, "step": 15448 }, { "epoch": 0.40687384777455887, "grad_norm": 2.0125436782836914, "learning_rate": 2.9669475902027916e-05, "loss": 1.7273, "step": 15449 }, { "epoch": 0.40690018435607056, "grad_norm": 6.842875003814697, "learning_rate": 2.9668159072952335e-05, "loss": 1.8526, "step": 15450 }, { "epoch": 0.4069265209375823, "grad_norm": 2.128511428833008, "learning_rate": 2.9666842243876747e-05, "loss": 1.5089, "step": 15451 }, { "epoch": 0.40695285751909405, "grad_norm": 1.5033659934997559, "learning_rate": 2.966552541480116e-05, "loss": 2.066, "step": 15452 }, { "epoch": 0.40697919410060573, "grad_norm": 3.0011868476867676, "learning_rate": 2.966420858572557e-05, "loss": 0.6518, "step": 15453 }, { "epoch": 0.4070055306821175, "grad_norm": 2.5068671703338623, "learning_rate": 2.966289175664999e-05, "loss": 1.5624, "step": 15454 }, { "epoch": 0.40703186726362917, "grad_norm": 3.2575442790985107, "learning_rate": 2.9661574927574403e-05, "loss": 0.2517, "step": 15455 }, { "epoch": 0.4070582038451409, "grad_norm": 1.6077076196670532, "learning_rate": 2.9660258098498815e-05, "loss": 1.7122, "step": 15456 }, { "epoch": 0.4070845404266526, "grad_norm": 1.9932819604873657, "learning_rate": 2.965894126942323e-05, "loss": 1.2649, "step": 15457 }, { "epoch": 0.40711087700816434, "grad_norm": 3.8360519409179688, "learning_rate": 2.9657624440347643e-05, "loss": 2.2119, "step": 15458 }, { "epoch": 0.4071372135896761, "grad_norm": 1.9107203483581543, "learning_rate": 2.9656307611272062e-05, "loss": 1.8703, "step": 15459 }, { "epoch": 0.40716355017118777, "grad_norm": 1.6759370565414429, "learning_rate": 2.9654990782196474e-05, "loss": 1.6466, "step": 15460 }, { "epoch": 0.4071898867526995, "grad_norm": 3.092898368835449, 
"learning_rate": 2.9653673953120886e-05, "loss": 1.3679, "step": 15461 }, { "epoch": 0.4072162233342112, "grad_norm": 4.308140277862549, "learning_rate": 2.96523571240453e-05, "loss": 1.6723, "step": 15462 }, { "epoch": 0.40724255991572295, "grad_norm": 1.9461251497268677, "learning_rate": 2.965104029496971e-05, "loss": 1.3311, "step": 15463 }, { "epoch": 0.40726889649723463, "grad_norm": 3.026602029800415, "learning_rate": 2.964972346589413e-05, "loss": 1.4665, "step": 15464 }, { "epoch": 0.4072952330787464, "grad_norm": 1.8129791021347046, "learning_rate": 2.9648406636818542e-05, "loss": 1.6168, "step": 15465 }, { "epoch": 0.4073215696602581, "grad_norm": 1.740459680557251, "learning_rate": 2.9647089807742957e-05, "loss": 2.5732, "step": 15466 }, { "epoch": 0.4073479062417698, "grad_norm": 3.010624647140503, "learning_rate": 2.964577297866737e-05, "loss": 2.3867, "step": 15467 }, { "epoch": 0.40737424282328155, "grad_norm": 3.168281316757202, "learning_rate": 2.9644456149591782e-05, "loss": 1.5576, "step": 15468 }, { "epoch": 0.40740057940479324, "grad_norm": 3.2274458408355713, "learning_rate": 2.96431393205162e-05, "loss": 1.5266, "step": 15469 }, { "epoch": 0.407426915986305, "grad_norm": 3.892594337463379, "learning_rate": 2.9641822491440613e-05, "loss": 2.3756, "step": 15470 }, { "epoch": 0.40745325256781667, "grad_norm": 2.5363903045654297, "learning_rate": 2.9640505662365025e-05, "loss": 1.7264, "step": 15471 }, { "epoch": 0.4074795891493284, "grad_norm": 3.7932369709014893, "learning_rate": 2.9639188833289437e-05, "loss": 0.8553, "step": 15472 }, { "epoch": 0.40750592573084016, "grad_norm": 2.0251471996307373, "learning_rate": 2.9637872004213856e-05, "loss": 2.0359, "step": 15473 }, { "epoch": 0.40753226231235185, "grad_norm": 2.834667444229126, "learning_rate": 2.963655517513827e-05, "loss": 1.2173, "step": 15474 }, { "epoch": 0.4075585988938636, "grad_norm": 1.8057821989059448, "learning_rate": 2.9635238346062684e-05, "loss": 1.39, "step": 15475 }, { 
"epoch": 0.4075849354753753, "grad_norm": 3.6677236557006836, "learning_rate": 2.9633921516987096e-05, "loss": 1.0169, "step": 15476 }, { "epoch": 0.407611272056887, "grad_norm": 2.107739210128784, "learning_rate": 2.963260468791151e-05, "loss": 2.0358, "step": 15477 }, { "epoch": 0.40763760863839876, "grad_norm": 1.481797695159912, "learning_rate": 2.9631287858835928e-05, "loss": 2.0033, "step": 15478 }, { "epoch": 0.40766394521991045, "grad_norm": 4.4800801277160645, "learning_rate": 2.962997102976034e-05, "loss": 1.8054, "step": 15479 }, { "epoch": 0.4076902818014222, "grad_norm": 2.457227945327759, "learning_rate": 2.9628654200684752e-05, "loss": 1.0475, "step": 15480 }, { "epoch": 0.4077166183829339, "grad_norm": 2.21356463432312, "learning_rate": 2.9627337371609164e-05, "loss": 0.8472, "step": 15481 }, { "epoch": 0.4077429549644456, "grad_norm": 1.7571008205413818, "learning_rate": 2.9626020542533577e-05, "loss": 1.7823, "step": 15482 }, { "epoch": 0.4077692915459573, "grad_norm": 1.961984634399414, "learning_rate": 2.9624703713457995e-05, "loss": 1.7197, "step": 15483 }, { "epoch": 0.40779562812746906, "grad_norm": 2.202336311340332, "learning_rate": 2.9623386884382408e-05, "loss": 2.2676, "step": 15484 }, { "epoch": 0.4078219647089808, "grad_norm": 1.9808951616287231, "learning_rate": 2.9622070055306823e-05, "loss": 1.9398, "step": 15485 }, { "epoch": 0.4078483012904925, "grad_norm": 2.01914119720459, "learning_rate": 2.9620753226231236e-05, "loss": 1.3534, "step": 15486 }, { "epoch": 0.40787463787200423, "grad_norm": 2.289134979248047, "learning_rate": 2.9619436397155654e-05, "loss": 1.6258, "step": 15487 }, { "epoch": 0.4079009744535159, "grad_norm": 3.446135997772217, "learning_rate": 2.9618119568080067e-05, "loss": 1.4653, "step": 15488 }, { "epoch": 0.40792731103502766, "grad_norm": 2.072317123413086, "learning_rate": 2.961680273900448e-05, "loss": 0.4528, "step": 15489 }, { "epoch": 0.40795364761653935, "grad_norm": 1.6260031461715698, 
"learning_rate": 2.961548590992889e-05, "loss": 1.7824, "step": 15490 }, { "epoch": 0.4079799841980511, "grad_norm": 2.3289427757263184, "learning_rate": 2.9614169080853303e-05, "loss": 0.9303, "step": 15491 }, { "epoch": 0.40800632077956284, "grad_norm": 2.5439000129699707, "learning_rate": 2.9612852251777722e-05, "loss": 1.4794, "step": 15492 }, { "epoch": 0.4080326573610745, "grad_norm": 2.2047746181488037, "learning_rate": 2.9611535422702135e-05, "loss": 2.3047, "step": 15493 }, { "epoch": 0.40805899394258627, "grad_norm": 2.7033281326293945, "learning_rate": 2.961021859362655e-05, "loss": 1.8353, "step": 15494 }, { "epoch": 0.40808533052409796, "grad_norm": 1.8041285276412964, "learning_rate": 2.9608901764550962e-05, "loss": 1.716, "step": 15495 }, { "epoch": 0.4081116671056097, "grad_norm": 2.187530040740967, "learning_rate": 2.9607584935475375e-05, "loss": 0.8654, "step": 15496 }, { "epoch": 0.4081380036871214, "grad_norm": 2.350358009338379, "learning_rate": 2.9606268106399794e-05, "loss": 1.6203, "step": 15497 }, { "epoch": 0.40816434026863313, "grad_norm": 3.5552783012390137, "learning_rate": 2.9604951277324206e-05, "loss": 0.6763, "step": 15498 }, { "epoch": 0.4081906768501449, "grad_norm": 1.7227190732955933, "learning_rate": 2.9603634448248618e-05, "loss": 1.3624, "step": 15499 }, { "epoch": 0.40821701343165656, "grad_norm": 2.537081241607666, "learning_rate": 2.960231761917303e-05, "loss": 1.8798, "step": 15500 }, { "epoch": 0.4082433500131683, "grad_norm": 3.4115798473358154, "learning_rate": 2.960100079009745e-05, "loss": 1.4744, "step": 15501 }, { "epoch": 0.40826968659468, "grad_norm": 1.8979332447052002, "learning_rate": 2.959968396102186e-05, "loss": 2.2764, "step": 15502 }, { "epoch": 0.40829602317619174, "grad_norm": 4.006608963012695, "learning_rate": 2.9598367131946274e-05, "loss": 0.831, "step": 15503 }, { "epoch": 0.4083223597577034, "grad_norm": 3.032528877258301, "learning_rate": 2.959705030287069e-05, "loss": 0.4143, "step": 15504 }, { 
"epoch": 0.40834869633921517, "grad_norm": 3.0231807231903076, "learning_rate": 2.95957334737951e-05, "loss": 2.2178, "step": 15505 }, { "epoch": 0.4083750329207269, "grad_norm": 1.7320244312286377, "learning_rate": 2.959441664471952e-05, "loss": 2.0389, "step": 15506 }, { "epoch": 0.4084013695022386, "grad_norm": 1.7165251970291138, "learning_rate": 2.9593099815643933e-05, "loss": 1.8923, "step": 15507 }, { "epoch": 0.40842770608375034, "grad_norm": 2.2482924461364746, "learning_rate": 2.9591782986568345e-05, "loss": 1.9368, "step": 15508 }, { "epoch": 0.40845404266526203, "grad_norm": 3.1780142784118652, "learning_rate": 2.9590466157492757e-05, "loss": 1.6005, "step": 15509 }, { "epoch": 0.4084803792467738, "grad_norm": 1.8063474893569946, "learning_rate": 2.958914932841717e-05, "loss": 1.7215, "step": 15510 }, { "epoch": 0.4085067158282855, "grad_norm": 1.6821879148483276, "learning_rate": 2.9587832499341588e-05, "loss": 2.305, "step": 15511 }, { "epoch": 0.4085330524097972, "grad_norm": 1.7118793725967407, "learning_rate": 2.9586515670266e-05, "loss": 2.0801, "step": 15512 }, { "epoch": 0.40855938899130895, "grad_norm": 1.8230926990509033, "learning_rate": 2.9585198841190416e-05, "loss": 1.7009, "step": 15513 }, { "epoch": 0.40858572557282063, "grad_norm": 2.3839879035949707, "learning_rate": 2.9583882012114828e-05, "loss": 1.3916, "step": 15514 }, { "epoch": 0.4086120621543324, "grad_norm": 2.482487440109253, "learning_rate": 2.958256518303924e-05, "loss": 0.8833, "step": 15515 }, { "epoch": 0.40863839873584407, "grad_norm": 3.2751338481903076, "learning_rate": 2.958124835396366e-05, "loss": 0.968, "step": 15516 }, { "epoch": 0.4086647353173558, "grad_norm": 4.714774131774902, "learning_rate": 2.957993152488807e-05, "loss": 1.4814, "step": 15517 }, { "epoch": 0.40869107189886755, "grad_norm": 1.7100814580917358, "learning_rate": 2.9578614695812484e-05, "loss": 2.2582, "step": 15518 }, { "epoch": 0.40871740848037924, "grad_norm": 2.099839448928833, 
"learning_rate": 2.9577297866736896e-05, "loss": 1.3471, "step": 15519 }, { "epoch": 0.408743745061891, "grad_norm": 1.9254851341247559, "learning_rate": 2.9575981037661315e-05, "loss": 1.886, "step": 15520 }, { "epoch": 0.40877008164340267, "grad_norm": 2.3890559673309326, "learning_rate": 2.9574664208585727e-05, "loss": 1.6631, "step": 15521 }, { "epoch": 0.4087964182249144, "grad_norm": 1.8621312379837036, "learning_rate": 2.9573347379510143e-05, "loss": 2.6855, "step": 15522 }, { "epoch": 0.4088227548064261, "grad_norm": 2.55586314201355, "learning_rate": 2.9572030550434555e-05, "loss": 2.027, "step": 15523 }, { "epoch": 0.40884909138793785, "grad_norm": 1.8009172677993774, "learning_rate": 2.9570713721358967e-05, "loss": 1.7208, "step": 15524 }, { "epoch": 0.4088754279694496, "grad_norm": 1.7379064559936523, "learning_rate": 2.9569396892283386e-05, "loss": 1.5554, "step": 15525 }, { "epoch": 0.4089017645509613, "grad_norm": 3.0741071701049805, "learning_rate": 2.95680800632078e-05, "loss": 0.4173, "step": 15526 }, { "epoch": 0.408928101132473, "grad_norm": 3.096268892288208, "learning_rate": 2.956676323413221e-05, "loss": 1.8136, "step": 15527 }, { "epoch": 0.4089544377139847, "grad_norm": 2.805260419845581, "learning_rate": 2.9565446405056623e-05, "loss": 1.7647, "step": 15528 }, { "epoch": 0.40898077429549645, "grad_norm": 4.796494960784912, "learning_rate": 2.9564129575981035e-05, "loss": 1.2489, "step": 15529 }, { "epoch": 0.40900711087700814, "grad_norm": 1.892318606376648, "learning_rate": 2.9562812746905454e-05, "loss": 1.9592, "step": 15530 }, { "epoch": 0.4090334474585199, "grad_norm": 3.3813490867614746, "learning_rate": 2.9561495917829866e-05, "loss": 1.3326, "step": 15531 }, { "epoch": 0.4090597840400316, "grad_norm": 1.989195466041565, "learning_rate": 2.9560179088754282e-05, "loss": 2.2213, "step": 15532 }, { "epoch": 0.4090861206215433, "grad_norm": 2.7464845180511475, "learning_rate": 2.9558862259678694e-05, "loss": 1.5868, "step": 15533 }, { 
"epoch": 0.40911245720305506, "grad_norm": 2.1392910480499268, "learning_rate": 2.9557545430603113e-05, "loss": 1.3225, "step": 15534 }, { "epoch": 0.40913879378456675, "grad_norm": 3.0204877853393555, "learning_rate": 2.9556228601527525e-05, "loss": 1.3558, "step": 15535 }, { "epoch": 0.4091651303660785, "grad_norm": 2.2563998699188232, "learning_rate": 2.9554911772451937e-05, "loss": 0.7977, "step": 15536 }, { "epoch": 0.4091914669475902, "grad_norm": 2.4553513526916504, "learning_rate": 2.955359494337635e-05, "loss": 1.7936, "step": 15537 }, { "epoch": 0.4092178035291019, "grad_norm": 3.082031011581421, "learning_rate": 2.9552278114300762e-05, "loss": 1.9524, "step": 15538 }, { "epoch": 0.40924414011061366, "grad_norm": 3.5475902557373047, "learning_rate": 2.955096128522518e-05, "loss": 1.4063, "step": 15539 }, { "epoch": 0.40927047669212535, "grad_norm": 2.4492595195770264, "learning_rate": 2.9549644456149593e-05, "loss": 1.8252, "step": 15540 }, { "epoch": 0.4092968132736371, "grad_norm": 2.0702593326568604, "learning_rate": 2.954832762707401e-05, "loss": 1.8981, "step": 15541 }, { "epoch": 0.4093231498551488, "grad_norm": 1.7347869873046875, "learning_rate": 2.954701079799842e-05, "loss": 1.5274, "step": 15542 }, { "epoch": 0.4093494864366605, "grad_norm": 2.504793643951416, "learning_rate": 2.9545693968922833e-05, "loss": 1.814, "step": 15543 }, { "epoch": 0.4093758230181722, "grad_norm": 1.5416631698608398, "learning_rate": 2.9544377139847252e-05, "loss": 0.965, "step": 15544 }, { "epoch": 0.40940215959968396, "grad_norm": 2.92168927192688, "learning_rate": 2.9543060310771664e-05, "loss": 1.941, "step": 15545 }, { "epoch": 0.4094284961811957, "grad_norm": 1.651750087738037, "learning_rate": 2.9541743481696077e-05, "loss": 2.0861, "step": 15546 }, { "epoch": 0.4094548327627074, "grad_norm": 3.253331184387207, "learning_rate": 2.954042665262049e-05, "loss": 1.1731, "step": 15547 }, { "epoch": 0.40948116934421913, "grad_norm": 3.992542028427124, 
"learning_rate": 2.9539109823544904e-05, "loss": 1.5168, "step": 15548 }, { "epoch": 0.4095075059257308, "grad_norm": 3.426276683807373, "learning_rate": 2.953779299446932e-05, "loss": 1.5996, "step": 15549 }, { "epoch": 0.40953384250724256, "grad_norm": 1.898941993713379, "learning_rate": 2.9536476165393736e-05, "loss": 2.5171, "step": 15550 }, { "epoch": 0.4095601790887543, "grad_norm": 1.886821985244751, "learning_rate": 2.9535159336318148e-05, "loss": 1.3618, "step": 15551 }, { "epoch": 0.409586515670266, "grad_norm": 1.95891273021698, "learning_rate": 2.953384250724256e-05, "loss": 2.0994, "step": 15552 }, { "epoch": 0.40961285225177774, "grad_norm": 4.050502777099609, "learning_rate": 2.953252567816698e-05, "loss": 1.8797, "step": 15553 }, { "epoch": 0.4096391888332894, "grad_norm": 2.9661693572998047, "learning_rate": 2.953120884909139e-05, "loss": 1.9262, "step": 15554 }, { "epoch": 0.40966552541480117, "grad_norm": 1.9914149045944214, "learning_rate": 2.9529892020015803e-05, "loss": 2.1554, "step": 15555 }, { "epoch": 0.40969186199631286, "grad_norm": 2.4739737510681152, "learning_rate": 2.9528575190940216e-05, "loss": 1.2371, "step": 15556 }, { "epoch": 0.4097181985778246, "grad_norm": 1.9299970865249634, "learning_rate": 2.9527258361864628e-05, "loss": 0.926, "step": 15557 }, { "epoch": 0.40974453515933634, "grad_norm": 3.3257062435150146, "learning_rate": 2.9525941532789047e-05, "loss": 1.6199, "step": 15558 }, { "epoch": 0.40977087174084803, "grad_norm": 1.7143402099609375, "learning_rate": 2.952462470371346e-05, "loss": 2.1645, "step": 15559 }, { "epoch": 0.4097972083223598, "grad_norm": 1.9123141765594482, "learning_rate": 2.9523307874637875e-05, "loss": 1.7718, "step": 15560 }, { "epoch": 0.40982354490387146, "grad_norm": 1.8397691249847412, "learning_rate": 2.9521991045562287e-05, "loss": 1.7463, "step": 15561 }, { "epoch": 0.4098498814853832, "grad_norm": 3.843637466430664, "learning_rate": 2.95206742164867e-05, "loss": 1.6918, "step": 15562 }, { 
"epoch": 0.4098762180668949, "grad_norm": 2.0925049781799316, "learning_rate": 2.9519357387411118e-05, "loss": 1.312, "step": 15563 }, { "epoch": 0.40990255464840664, "grad_norm": 2.4778192043304443, "learning_rate": 2.951804055833553e-05, "loss": 1.1761, "step": 15564 }, { "epoch": 0.4099288912299184, "grad_norm": 1.7483197450637817, "learning_rate": 2.9516723729259942e-05, "loss": 2.0501, "step": 15565 }, { "epoch": 0.40995522781143007, "grad_norm": 2.340784788131714, "learning_rate": 2.9515406900184355e-05, "loss": 0.9956, "step": 15566 }, { "epoch": 0.4099815643929418, "grad_norm": 2.448150396347046, "learning_rate": 2.9514090071108774e-05, "loss": 2.1957, "step": 15567 }, { "epoch": 0.4100079009744535, "grad_norm": 2.7702066898345947, "learning_rate": 2.9512773242033186e-05, "loss": 1.0067, "step": 15568 }, { "epoch": 0.41003423755596524, "grad_norm": 1.5721662044525146, "learning_rate": 2.95114564129576e-05, "loss": 1.856, "step": 15569 }, { "epoch": 0.41006057413747693, "grad_norm": 2.173760175704956, "learning_rate": 2.9510139583882014e-05, "loss": 0.4897, "step": 15570 }, { "epoch": 0.4100869107189887, "grad_norm": 2.960336923599243, "learning_rate": 2.9508822754806426e-05, "loss": 1.3055, "step": 15571 }, { "epoch": 0.4101132473005004, "grad_norm": 1.64013671875, "learning_rate": 2.9507505925730845e-05, "loss": 1.875, "step": 15572 }, { "epoch": 0.4101395838820121, "grad_norm": 2.414738893508911, "learning_rate": 2.9506189096655257e-05, "loss": 2.0077, "step": 15573 }, { "epoch": 0.41016592046352385, "grad_norm": 3.0318429470062256, "learning_rate": 2.950487226757967e-05, "loss": 1.9638, "step": 15574 }, { "epoch": 0.41019225704503554, "grad_norm": 3.7558023929595947, "learning_rate": 2.950355543850408e-05, "loss": 1.4986, "step": 15575 }, { "epoch": 0.4102185936265473, "grad_norm": 2.0034916400909424, "learning_rate": 2.9502238609428494e-05, "loss": 1.9444, "step": 15576 }, { "epoch": 0.41024493020805897, "grad_norm": 1.6488306522369385, "learning_rate": 
2.9500921780352913e-05, "loss": 2.0599, "step": 15577 }, { "epoch": 0.4102712667895707, "grad_norm": 1.8521102666854858, "learning_rate": 2.9499604951277325e-05, "loss": 1.2668, "step": 15578 }, { "epoch": 0.41029760337108245, "grad_norm": 1.8435574769973755, "learning_rate": 2.949828812220174e-05, "loss": 1.0013, "step": 15579 }, { "epoch": 0.41032393995259414, "grad_norm": 5.426388740539551, "learning_rate": 2.9496971293126153e-05, "loss": 1.9879, "step": 15580 }, { "epoch": 0.4103502765341059, "grad_norm": 2.0824577808380127, "learning_rate": 2.9495654464050565e-05, "loss": 1.8558, "step": 15581 }, { "epoch": 0.4103766131156176, "grad_norm": 1.792357325553894, "learning_rate": 2.9494337634974984e-05, "loss": 1.7638, "step": 15582 }, { "epoch": 0.4104029496971293, "grad_norm": 3.8681271076202393, "learning_rate": 2.9493020805899396e-05, "loss": 1.761, "step": 15583 }, { "epoch": 0.41042928627864106, "grad_norm": 1.743836760520935, "learning_rate": 2.9491703976823808e-05, "loss": 1.5564, "step": 15584 }, { "epoch": 0.41045562286015275, "grad_norm": 1.7832916975021362, "learning_rate": 2.949038714774822e-05, "loss": 1.8188, "step": 15585 }, { "epoch": 0.4104819594416645, "grad_norm": 2.4880542755126953, "learning_rate": 2.948907031867264e-05, "loss": 1.5534, "step": 15586 }, { "epoch": 0.4105082960231762, "grad_norm": 4.8710618019104, "learning_rate": 2.948775348959705e-05, "loss": 1.84, "step": 15587 }, { "epoch": 0.4105346326046879, "grad_norm": 1.872733235359192, "learning_rate": 2.9486436660521467e-05, "loss": 1.9138, "step": 15588 }, { "epoch": 0.4105609691861996, "grad_norm": 1.8661011457443237, "learning_rate": 2.948511983144588e-05, "loss": 1.635, "step": 15589 }, { "epoch": 0.41058730576771135, "grad_norm": 1.962339997291565, "learning_rate": 2.948380300237029e-05, "loss": 1.3617, "step": 15590 }, { "epoch": 0.4106136423492231, "grad_norm": 3.3088715076446533, "learning_rate": 2.948248617329471e-05, "loss": 1.6787, "step": 15591 }, { "epoch": 
0.4106399789307348, "grad_norm": 2.4555041790008545, "learning_rate": 2.9481169344219123e-05, "loss": 1.7043, "step": 15592 }, { "epoch": 0.4106663155122465, "grad_norm": 1.9453097581863403, "learning_rate": 2.9479852515143535e-05, "loss": 1.475, "step": 15593 }, { "epoch": 0.4106926520937582, "grad_norm": 2.8604366779327393, "learning_rate": 2.9478535686067947e-05, "loss": 1.1677, "step": 15594 }, { "epoch": 0.41071898867526996, "grad_norm": 3.6796457767486572, "learning_rate": 2.9477218856992363e-05, "loss": 1.6573, "step": 15595 }, { "epoch": 0.41074532525678165, "grad_norm": 2.751086473464966, "learning_rate": 2.947590202791678e-05, "loss": 2.5069, "step": 15596 }, { "epoch": 0.4107716618382934, "grad_norm": 2.846808433532715, "learning_rate": 2.9474585198841194e-05, "loss": 0.495, "step": 15597 }, { "epoch": 0.41079799841980513, "grad_norm": 2.100917339324951, "learning_rate": 2.9473268369765606e-05, "loss": 1.4297, "step": 15598 }, { "epoch": 0.4108243350013168, "grad_norm": 2.561904191970825, "learning_rate": 2.947195154069002e-05, "loss": 1.4738, "step": 15599 }, { "epoch": 0.41085067158282856, "grad_norm": 3.0994951725006104, "learning_rate": 2.9470634711614437e-05, "loss": 0.3795, "step": 15600 }, { "epoch": 0.41087700816434025, "grad_norm": 2.137200117111206, "learning_rate": 2.946931788253885e-05, "loss": 2.4627, "step": 15601 }, { "epoch": 0.410903344745852, "grad_norm": 1.5876655578613281, "learning_rate": 2.9468001053463262e-05, "loss": 1.7906, "step": 15602 }, { "epoch": 0.4109296813273637, "grad_norm": 2.9217071533203125, "learning_rate": 2.9466684224387674e-05, "loss": 2.1725, "step": 15603 }, { "epoch": 0.4109560179088754, "grad_norm": 1.8206844329833984, "learning_rate": 2.9465367395312086e-05, "loss": 1.6498, "step": 15604 }, { "epoch": 0.41098235449038717, "grad_norm": 2.162393093109131, "learning_rate": 2.9464050566236505e-05, "loss": 2.1612, "step": 15605 }, { "epoch": 0.41100869107189886, "grad_norm": 2.7006442546844482, "learning_rate": 
2.9462733737160917e-05, "loss": 1.3873, "step": 15606 }, { "epoch": 0.4110350276534106, "grad_norm": 1.8123433589935303, "learning_rate": 2.9461416908085333e-05, "loss": 2.2301, "step": 15607 }, { "epoch": 0.4110613642349223, "grad_norm": 1.8404465913772583, "learning_rate": 2.9460100079009745e-05, "loss": 1.8474, "step": 15608 }, { "epoch": 0.41108770081643403, "grad_norm": 4.121599197387695, "learning_rate": 2.9458783249934158e-05, "loss": 2.6606, "step": 15609 }, { "epoch": 0.4111140373979457, "grad_norm": 1.860534429550171, "learning_rate": 2.9457466420858576e-05, "loss": 2.0835, "step": 15610 }, { "epoch": 0.41114037397945746, "grad_norm": 1.944881796836853, "learning_rate": 2.945614959178299e-05, "loss": 1.8828, "step": 15611 }, { "epoch": 0.4111667105609692, "grad_norm": 1.9096068143844604, "learning_rate": 2.94548327627074e-05, "loss": 0.4872, "step": 15612 }, { "epoch": 0.4111930471424809, "grad_norm": 1.6398636102676392, "learning_rate": 2.9453515933631813e-05, "loss": 1.9815, "step": 15613 }, { "epoch": 0.41121938372399264, "grad_norm": 1.7676432132720947, "learning_rate": 2.945219910455623e-05, "loss": 1.3109, "step": 15614 }, { "epoch": 0.4112457203055043, "grad_norm": 5.615110874176025, "learning_rate": 2.9450882275480644e-05, "loss": 1.7162, "step": 15615 }, { "epoch": 0.41127205688701607, "grad_norm": 5.8755316734313965, "learning_rate": 2.944956544640506e-05, "loss": 2.2305, "step": 15616 }, { "epoch": 0.4112983934685278, "grad_norm": 3.4218411445617676, "learning_rate": 2.9448248617329472e-05, "loss": 1.588, "step": 15617 }, { "epoch": 0.4113247300500395, "grad_norm": 2.338719129562378, "learning_rate": 2.9446931788253884e-05, "loss": 2.0573, "step": 15618 }, { "epoch": 0.41135106663155124, "grad_norm": 3.0660626888275146, "learning_rate": 2.9445614959178303e-05, "loss": 2.1837, "step": 15619 }, { "epoch": 0.41137740321306293, "grad_norm": 2.0762314796447754, "learning_rate": 2.9444298130102716e-05, "loss": 1.0922, "step": 15620 }, { "epoch": 
0.4114037397945747, "grad_norm": 1.964686393737793, "learning_rate": 2.9442981301027128e-05, "loss": 1.9216, "step": 15621 }, { "epoch": 0.41143007637608636, "grad_norm": 2.870673179626465, "learning_rate": 2.944166447195154e-05, "loss": 1.2022, "step": 15622 }, { "epoch": 0.4114564129575981, "grad_norm": 2.181758165359497, "learning_rate": 2.9440347642875956e-05, "loss": 1.9927, "step": 15623 }, { "epoch": 0.41148274953910985, "grad_norm": 1.7397756576538086, "learning_rate": 2.943903081380037e-05, "loss": 1.7732, "step": 15624 }, { "epoch": 0.41150908612062154, "grad_norm": 4.026196002960205, "learning_rate": 2.9437713984724787e-05, "loss": 1.7795, "step": 15625 }, { "epoch": 0.4115354227021333, "grad_norm": 2.311596393585205, "learning_rate": 2.94363971556492e-05, "loss": 1.8337, "step": 15626 }, { "epoch": 0.41156175928364497, "grad_norm": 2.982847213745117, "learning_rate": 2.943508032657361e-05, "loss": 0.5568, "step": 15627 }, { "epoch": 0.4115880958651567, "grad_norm": 4.708019256591797, "learning_rate": 2.9433763497498023e-05, "loss": 1.2951, "step": 15628 }, { "epoch": 0.4116144324466684, "grad_norm": 2.341606616973877, "learning_rate": 2.9432446668422442e-05, "loss": 1.5093, "step": 15629 }, { "epoch": 0.41164076902818014, "grad_norm": 2.5291550159454346, "learning_rate": 2.9431129839346855e-05, "loss": 1.6805, "step": 15630 }, { "epoch": 0.4116671056096919, "grad_norm": 2.195976734161377, "learning_rate": 2.9429813010271267e-05, "loss": 1.7182, "step": 15631 }, { "epoch": 0.4116934421912036, "grad_norm": 2.999455451965332, "learning_rate": 2.942849618119568e-05, "loss": 1.9971, "step": 15632 }, { "epoch": 0.4117197787727153, "grad_norm": 1.6503539085388184, "learning_rate": 2.9427179352120098e-05, "loss": 1.8061, "step": 15633 }, { "epoch": 0.411746115354227, "grad_norm": 1.6644328832626343, "learning_rate": 2.942586252304451e-05, "loss": 2.4462, "step": 15634 }, { "epoch": 0.41177245193573875, "grad_norm": 1.9927966594696045, "learning_rate": 
2.9424545693968926e-05, "loss": 1.8709, "step": 15635 }, { "epoch": 0.41179878851725044, "grad_norm": 1.9618141651153564, "learning_rate": 2.9423228864893338e-05, "loss": 1.4128, "step": 15636 }, { "epoch": 0.4118251250987622, "grad_norm": 1.914323329925537, "learning_rate": 2.942191203581775e-05, "loss": 2.2699, "step": 15637 }, { "epoch": 0.4118514616802739, "grad_norm": 1.8678549528121948, "learning_rate": 2.942059520674217e-05, "loss": 1.647, "step": 15638 }, { "epoch": 0.4118777982617856, "grad_norm": 2.154546022415161, "learning_rate": 2.941927837766658e-05, "loss": 1.4129, "step": 15639 }, { "epoch": 0.41190413484329735, "grad_norm": 1.3817179203033447, "learning_rate": 2.9417961548590994e-05, "loss": 1.8555, "step": 15640 }, { "epoch": 0.41193047142480904, "grad_norm": 2.392235517501831, "learning_rate": 2.9416644719515406e-05, "loss": 1.6919, "step": 15641 }, { "epoch": 0.4119568080063208, "grad_norm": 2.5953352451324463, "learning_rate": 2.941532789043982e-05, "loss": 1.9096, "step": 15642 }, { "epoch": 0.4119831445878325, "grad_norm": 1.8519266843795776, "learning_rate": 2.9414011061364237e-05, "loss": 0.5479, "step": 15643 }, { "epoch": 0.4120094811693442, "grad_norm": 2.8138234615325928, "learning_rate": 2.9412694232288653e-05, "loss": 0.9956, "step": 15644 }, { "epoch": 0.41203581775085596, "grad_norm": 2.1778721809387207, "learning_rate": 2.9411377403213065e-05, "loss": 0.7772, "step": 15645 }, { "epoch": 0.41206215433236765, "grad_norm": 2.140077829360962, "learning_rate": 2.9410060574137477e-05, "loss": 1.8864, "step": 15646 }, { "epoch": 0.4120884909138794, "grad_norm": 6.711975574493408, "learning_rate": 2.940874374506189e-05, "loss": 0.829, "step": 15647 }, { "epoch": 0.4121148274953911, "grad_norm": 1.9497487545013428, "learning_rate": 2.9407426915986308e-05, "loss": 1.9392, "step": 15648 }, { "epoch": 0.4121411640769028, "grad_norm": 3.6461007595062256, "learning_rate": 2.940611008691072e-05, "loss": 1.3397, "step": 15649 }, { "epoch": 
0.41216750065841457, "grad_norm": 2.270423412322998, "learning_rate": 2.9404793257835133e-05, "loss": 1.6914, "step": 15650 }, { "epoch": 0.41219383723992625, "grad_norm": 2.1901330947875977, "learning_rate": 2.9403476428759548e-05, "loss": 0.9298, "step": 15651 }, { "epoch": 0.412220173821438, "grad_norm": 1.6930476427078247, "learning_rate": 2.9402159599683964e-05, "loss": 1.15, "step": 15652 }, { "epoch": 0.4122465104029497, "grad_norm": 11.654403686523438, "learning_rate": 2.940084277060838e-05, "loss": 1.1725, "step": 15653 }, { "epoch": 0.4122728469844614, "grad_norm": 1.8939121961593628, "learning_rate": 2.939952594153279e-05, "loss": 2.4951, "step": 15654 }, { "epoch": 0.4122991835659731, "grad_norm": 1.7732517719268799, "learning_rate": 2.9398209112457204e-05, "loss": 1.036, "step": 15655 }, { "epoch": 0.41232552014748486, "grad_norm": 2.204071521759033, "learning_rate": 2.9396892283381616e-05, "loss": 2.1034, "step": 15656 }, { "epoch": 0.4123518567289966, "grad_norm": 2.244224786758423, "learning_rate": 2.9395575454306035e-05, "loss": 2.2298, "step": 15657 }, { "epoch": 0.4123781933105083, "grad_norm": 1.5890324115753174, "learning_rate": 2.9394258625230447e-05, "loss": 1.909, "step": 15658 }, { "epoch": 0.41240452989202003, "grad_norm": 2.022317409515381, "learning_rate": 2.939294179615486e-05, "loss": 1.4609, "step": 15659 }, { "epoch": 0.4124308664735317, "grad_norm": 1.8920706510543823, "learning_rate": 2.939162496707927e-05, "loss": 1.3667, "step": 15660 }, { "epoch": 0.41245720305504346, "grad_norm": 2.809635639190674, "learning_rate": 2.9390308138003687e-05, "loss": 1.5838, "step": 15661 }, { "epoch": 0.41248353963655515, "grad_norm": 2.007632255554199, "learning_rate": 2.9388991308928103e-05, "loss": 0.4959, "step": 15662 }, { "epoch": 0.4125098762180669, "grad_norm": 3.5080292224884033, "learning_rate": 2.938767447985252e-05, "loss": 0.9457, "step": 15663 }, { "epoch": 0.41253621279957864, "grad_norm": 2.179311513900757, "learning_rate": 
2.938635765077693e-05, "loss": 2.4743, "step": 15664 }, { "epoch": 0.4125625493810903, "grad_norm": 1.788968563079834, "learning_rate": 2.9385040821701343e-05, "loss": 1.6155, "step": 15665 }, { "epoch": 0.41258888596260207, "grad_norm": 2.640054702758789, "learning_rate": 2.9383723992625762e-05, "loss": 1.6905, "step": 15666 }, { "epoch": 0.41261522254411376, "grad_norm": 3.193744421005249, "learning_rate": 2.9382407163550174e-05, "loss": 1.9119, "step": 15667 }, { "epoch": 0.4126415591256255, "grad_norm": 2.9051668643951416, "learning_rate": 2.9381090334474586e-05, "loss": 1.6336, "step": 15668 }, { "epoch": 0.4126678957071372, "grad_norm": 3.2675421237945557, "learning_rate": 2.9379773505399e-05, "loss": 0.9673, "step": 15669 }, { "epoch": 0.41269423228864893, "grad_norm": 1.8139410018920898, "learning_rate": 2.9378456676323414e-05, "loss": 1.6944, "step": 15670 }, { "epoch": 0.4127205688701607, "grad_norm": 2.856947898864746, "learning_rate": 2.937713984724783e-05, "loss": 0.563, "step": 15671 }, { "epoch": 0.41274690545167236, "grad_norm": 2.128417491912842, "learning_rate": 2.9375823018172245e-05, "loss": 1.8091, "step": 15672 }, { "epoch": 0.4127732420331841, "grad_norm": 1.8805828094482422, "learning_rate": 2.9374506189096658e-05, "loss": 1.9038, "step": 15673 }, { "epoch": 0.4127995786146958, "grad_norm": 1.591185212135315, "learning_rate": 2.937318936002107e-05, "loss": 1.6888, "step": 15674 }, { "epoch": 0.41282591519620754, "grad_norm": 3.423896312713623, "learning_rate": 2.9371872530945482e-05, "loss": 1.0556, "step": 15675 }, { "epoch": 0.4128522517777192, "grad_norm": 1.8265759944915771, "learning_rate": 2.93705557018699e-05, "loss": 1.3261, "step": 15676 }, { "epoch": 0.41287858835923097, "grad_norm": 3.0720016956329346, "learning_rate": 2.9369238872794313e-05, "loss": 0.9389, "step": 15677 }, { "epoch": 0.4129049249407427, "grad_norm": 1.8436627388000488, "learning_rate": 2.9367922043718725e-05, "loss": 1.9235, "step": 15678 }, { "epoch": 
0.4129312615222544, "grad_norm": 2.1809499263763428, "learning_rate": 2.9366605214643138e-05, "loss": 2.3201, "step": 15679 }, { "epoch": 0.41295759810376614, "grad_norm": 5.844761848449707, "learning_rate": 2.9365288385567553e-05, "loss": 1.2053, "step": 15680 }, { "epoch": 0.41298393468527783, "grad_norm": 1.7083134651184082, "learning_rate": 2.936397155649197e-05, "loss": 1.8222, "step": 15681 }, { "epoch": 0.4130102712667896, "grad_norm": 4.175252914428711, "learning_rate": 2.9362654727416384e-05, "loss": 1.3668, "step": 15682 }, { "epoch": 0.4130366078483013, "grad_norm": 1.6955143213272095, "learning_rate": 2.9361337898340797e-05, "loss": 1.9421, "step": 15683 }, { "epoch": 0.413062944429813, "grad_norm": 3.372965097427368, "learning_rate": 2.936002106926521e-05, "loss": 1.31, "step": 15684 }, { "epoch": 0.41308928101132475, "grad_norm": 1.7077680826187134, "learning_rate": 2.9358704240189628e-05, "loss": 0.4251, "step": 15685 }, { "epoch": 0.41311561759283644, "grad_norm": 1.6981302499771118, "learning_rate": 2.935738741111404e-05, "loss": 1.9722, "step": 15686 }, { "epoch": 0.4131419541743482, "grad_norm": 1.8662424087524414, "learning_rate": 2.9356070582038452e-05, "loss": 1.7829, "step": 15687 }, { "epoch": 0.41316829075585987, "grad_norm": 2.0773918628692627, "learning_rate": 2.9354753752962864e-05, "loss": 1.0619, "step": 15688 }, { "epoch": 0.4131946273373716, "grad_norm": 2.776118755340576, "learning_rate": 2.935343692388728e-05, "loss": 0.2652, "step": 15689 }, { "epoch": 0.41322096391888335, "grad_norm": 3.3183960914611816, "learning_rate": 2.9352120094811696e-05, "loss": 1.7356, "step": 15690 }, { "epoch": 0.41324730050039504, "grad_norm": 1.9203873872756958, "learning_rate": 2.935080326573611e-05, "loss": 1.8409, "step": 15691 }, { "epoch": 0.4132736370819068, "grad_norm": 4.229273796081543, "learning_rate": 2.9349486436660523e-05, "loss": 1.3657, "step": 15692 }, { "epoch": 0.4132999736634185, "grad_norm": 2.3400189876556396, "learning_rate": 
2.9348169607584936e-05, "loss": 2.1804, "step": 15693 }, { "epoch": 0.4133263102449302, "grad_norm": 2.9474289417266846, "learning_rate": 2.9346852778509348e-05, "loss": 1.4853, "step": 15694 }, { "epoch": 0.4133526468264419, "grad_norm": 1.7531671524047852, "learning_rate": 2.9345535949433767e-05, "loss": 2.0666, "step": 15695 }, { "epoch": 0.41337898340795365, "grad_norm": 1.5099817514419556, "learning_rate": 2.934421912035818e-05, "loss": 2.0596, "step": 15696 }, { "epoch": 0.4134053199894654, "grad_norm": 1.5720902681350708, "learning_rate": 2.934290229128259e-05, "loss": 0.6829, "step": 15697 }, { "epoch": 0.4134316565709771, "grad_norm": 4.438876152038574, "learning_rate": 2.9341585462207007e-05, "loss": 1.9974, "step": 15698 }, { "epoch": 0.4134579931524888, "grad_norm": 1.7951011657714844, "learning_rate": 2.9340268633131422e-05, "loss": 1.7568, "step": 15699 }, { "epoch": 0.4134843297340005, "grad_norm": 4.371419906616211, "learning_rate": 2.9338951804055838e-05, "loss": 1.6832, "step": 15700 }, { "epoch": 0.41351066631551225, "grad_norm": 1.3973851203918457, "learning_rate": 2.933763497498025e-05, "loss": 2.046, "step": 15701 }, { "epoch": 0.41353700289702394, "grad_norm": 2.2307183742523193, "learning_rate": 2.9336318145904662e-05, "loss": 1.6017, "step": 15702 }, { "epoch": 0.4135633394785357, "grad_norm": 1.8933690786361694, "learning_rate": 2.9335001316829075e-05, "loss": 1.7058, "step": 15703 }, { "epoch": 0.41358967606004743, "grad_norm": 1.9415560960769653, "learning_rate": 2.9333684487753494e-05, "loss": 1.6413, "step": 15704 }, { "epoch": 0.4136160126415591, "grad_norm": 1.902345895767212, "learning_rate": 2.9332367658677906e-05, "loss": 1.4989, "step": 15705 }, { "epoch": 0.41364234922307086, "grad_norm": 4.692122936248779, "learning_rate": 2.9331050829602318e-05, "loss": 0.7606, "step": 15706 }, { "epoch": 0.41366868580458255, "grad_norm": 2.022602081298828, "learning_rate": 2.932973400052673e-05, "loss": 1.405, "step": 15707 }, { "epoch": 
0.4136950223860943, "grad_norm": 2.992035388946533, "learning_rate": 2.9328417171451146e-05, "loss": 0.4388, "step": 15708 }, { "epoch": 0.413721358967606, "grad_norm": 2.544341802597046, "learning_rate": 2.932710034237556e-05, "loss": 1.5207, "step": 15709 }, { "epoch": 0.4137476955491177, "grad_norm": 1.8995567560195923, "learning_rate": 2.9325783513299977e-05, "loss": 0.8497, "step": 15710 }, { "epoch": 0.41377403213062947, "grad_norm": 1.6597929000854492, "learning_rate": 2.932446668422439e-05, "loss": 1.4809, "step": 15711 }, { "epoch": 0.41380036871214115, "grad_norm": 3.636852502822876, "learning_rate": 2.93231498551488e-05, "loss": 1.3056, "step": 15712 }, { "epoch": 0.4138267052936529, "grad_norm": 1.5252666473388672, "learning_rate": 2.9321833026073214e-05, "loss": 1.4693, "step": 15713 }, { "epoch": 0.4138530418751646, "grad_norm": 1.7964345216751099, "learning_rate": 2.9320516196997633e-05, "loss": 1.7382, "step": 15714 }, { "epoch": 0.41387937845667633, "grad_norm": 1.7553869485855103, "learning_rate": 2.9319199367922045e-05, "loss": 1.8619, "step": 15715 }, { "epoch": 0.413905715038188, "grad_norm": 2.1346728801727295, "learning_rate": 2.9317882538846457e-05, "loss": 1.9323, "step": 15716 }, { "epoch": 0.41393205161969976, "grad_norm": 1.9322916269302368, "learning_rate": 2.9316565709770873e-05, "loss": 2.2971, "step": 15717 }, { "epoch": 0.4139583882012115, "grad_norm": 3.9815216064453125, "learning_rate": 2.9315248880695288e-05, "loss": 1.924, "step": 15718 }, { "epoch": 0.4139847247827232, "grad_norm": 4.49163818359375, "learning_rate": 2.9313932051619704e-05, "loss": 1.0998, "step": 15719 }, { "epoch": 0.41401106136423493, "grad_norm": 3.1728100776672363, "learning_rate": 2.9312615222544116e-05, "loss": 0.986, "step": 15720 }, { "epoch": 0.4140373979457466, "grad_norm": 1.8438752889633179, "learning_rate": 2.9311298393468528e-05, "loss": 2.7322, "step": 15721 }, { "epoch": 0.41406373452725836, "grad_norm": 2.5315842628479004, "learning_rate": 
2.930998156439294e-05, "loss": 1.8991, "step": 15722 }, { "epoch": 0.4140900711087701, "grad_norm": 1.876872181892395, "learning_rate": 2.930866473531736e-05, "loss": 0.6388, "step": 15723 }, { "epoch": 0.4141164076902818, "grad_norm": 2.0644257068634033, "learning_rate": 2.930734790624177e-05, "loss": 1.7264, "step": 15724 }, { "epoch": 0.41414274427179354, "grad_norm": 2.4856345653533936, "learning_rate": 2.9306031077166184e-05, "loss": 1.5433, "step": 15725 }, { "epoch": 0.4141690808533052, "grad_norm": 1.8194998502731323, "learning_rate": 2.93047142480906e-05, "loss": 1.6386, "step": 15726 }, { "epoch": 0.41419541743481697, "grad_norm": 1.9625462293624878, "learning_rate": 2.9303397419015012e-05, "loss": 1.8224, "step": 15727 }, { "epoch": 0.41422175401632866, "grad_norm": 1.771073818206787, "learning_rate": 2.930208058993943e-05, "loss": 1.9254, "step": 15728 }, { "epoch": 0.4142480905978404, "grad_norm": 1.9150766134262085, "learning_rate": 2.9300763760863843e-05, "loss": 1.7029, "step": 15729 }, { "epoch": 0.41427442717935214, "grad_norm": 3.26265025138855, "learning_rate": 2.9299446931788255e-05, "loss": 1.9199, "step": 15730 }, { "epoch": 0.41430076376086383, "grad_norm": 2.0969038009643555, "learning_rate": 2.9298130102712667e-05, "loss": 2.1066, "step": 15731 }, { "epoch": 0.4143271003423756, "grad_norm": 1.9390043020248413, "learning_rate": 2.9296813273637086e-05, "loss": 1.9867, "step": 15732 }, { "epoch": 0.41435343692388726, "grad_norm": 2.376207113265991, "learning_rate": 2.92954964445615e-05, "loss": 1.8355, "step": 15733 }, { "epoch": 0.414379773505399, "grad_norm": 1.623523235321045, "learning_rate": 2.929417961548591e-05, "loss": 1.8488, "step": 15734 }, { "epoch": 0.4144061100869107, "grad_norm": 3.2486753463745117, "learning_rate": 2.9292862786410323e-05, "loss": 1.2581, "step": 15735 }, { "epoch": 0.41443244666842244, "grad_norm": 3.6002862453460693, "learning_rate": 2.929154595733474e-05, "loss": 0.5613, "step": 15736 }, { "epoch": 
0.4144587832499342, "grad_norm": 2.0421414375305176, "learning_rate": 2.9290229128259154e-05, "loss": 0.2003, "step": 15737 }, { "epoch": 0.41448511983144587, "grad_norm": 4.687199115753174, "learning_rate": 2.928891229918357e-05, "loss": 1.2453, "step": 15738 }, { "epoch": 0.4145114564129576, "grad_norm": 3.138654947280884, "learning_rate": 2.9287595470107982e-05, "loss": 2.3656, "step": 15739 }, { "epoch": 0.4145377929944693, "grad_norm": 1.958830714225769, "learning_rate": 2.9286278641032394e-05, "loss": 1.2919, "step": 15740 }, { "epoch": 0.41456412957598104, "grad_norm": 1.6532645225524902, "learning_rate": 2.9284961811956806e-05, "loss": 1.6098, "step": 15741 }, { "epoch": 0.41459046615749273, "grad_norm": 2.0887038707733154, "learning_rate": 2.9283644982881225e-05, "loss": 1.334, "step": 15742 }, { "epoch": 0.4146168027390045, "grad_norm": 2.085057020187378, "learning_rate": 2.9282328153805638e-05, "loss": 1.8236, "step": 15743 }, { "epoch": 0.4146431393205162, "grad_norm": 2.1905174255371094, "learning_rate": 2.928101132473005e-05, "loss": 1.5245, "step": 15744 }, { "epoch": 0.4146694759020279, "grad_norm": 3.5042781829833984, "learning_rate": 2.9279694495654465e-05, "loss": 1.4656, "step": 15745 }, { "epoch": 0.41469581248353965, "grad_norm": 3.223958730697632, "learning_rate": 2.9278377666578878e-05, "loss": 0.6077, "step": 15746 }, { "epoch": 0.41472214906505134, "grad_norm": 1.6542402505874634, "learning_rate": 2.9277060837503297e-05, "loss": 1.5184, "step": 15747 }, { "epoch": 0.4147484856465631, "grad_norm": 3.485166072845459, "learning_rate": 2.927574400842771e-05, "loss": 1.7476, "step": 15748 }, { "epoch": 0.41477482222807477, "grad_norm": 1.248049259185791, "learning_rate": 2.927442717935212e-05, "loss": 1.5584, "step": 15749 }, { "epoch": 0.4148011588095865, "grad_norm": 1.49611496925354, "learning_rate": 2.9273110350276533e-05, "loss": 2.0072, "step": 15750 }, { "epoch": 0.41482749539109826, "grad_norm": 1.6200965642929077, "learning_rate": 
2.9271793521200952e-05, "loss": 1.9917, "step": 15751 }, { "epoch": 0.41485383197260994, "grad_norm": 2.377087116241455, "learning_rate": 2.9270476692125364e-05, "loss": 1.5305, "step": 15752 }, { "epoch": 0.4148801685541217, "grad_norm": 1.693340539932251, "learning_rate": 2.9269159863049777e-05, "loss": 1.9082, "step": 15753 }, { "epoch": 0.4149065051356334, "grad_norm": 1.8340027332305908, "learning_rate": 2.926784303397419e-05, "loss": 1.5705, "step": 15754 }, { "epoch": 0.4149328417171451, "grad_norm": 2.9371960163116455, "learning_rate": 2.9266526204898604e-05, "loss": 1.625, "step": 15755 }, { "epoch": 0.41495917829865686, "grad_norm": 3.9172139167785645, "learning_rate": 2.9265209375823023e-05, "loss": 0.7591, "step": 15756 }, { "epoch": 0.41498551488016855, "grad_norm": 6.552670478820801, "learning_rate": 2.9263892546747436e-05, "loss": 1.898, "step": 15757 }, { "epoch": 0.4150118514616803, "grad_norm": 3.311701774597168, "learning_rate": 2.9262575717671848e-05, "loss": 2.1974, "step": 15758 }, { "epoch": 0.415038188043192, "grad_norm": 1.7526640892028809, "learning_rate": 2.926125888859626e-05, "loss": 2.0597, "step": 15759 }, { "epoch": 0.4150645246247037, "grad_norm": 4.0637359619140625, "learning_rate": 2.9259942059520672e-05, "loss": 1.9377, "step": 15760 }, { "epoch": 0.4150908612062154, "grad_norm": 3.394953966140747, "learning_rate": 2.925862523044509e-05, "loss": 2.076, "step": 15761 }, { "epoch": 0.41511719778772715, "grad_norm": 1.8149725198745728, "learning_rate": 2.9257308401369503e-05, "loss": 1.8685, "step": 15762 }, { "epoch": 0.4151435343692389, "grad_norm": 2.1194307804107666, "learning_rate": 2.9255991572293916e-05, "loss": 0.6746, "step": 15763 }, { "epoch": 0.4151698709507506, "grad_norm": 1.7504782676696777, "learning_rate": 2.925467474321833e-05, "loss": 1.8717, "step": 15764 }, { "epoch": 0.41519620753226233, "grad_norm": 5.995121002197266, "learning_rate": 2.9253357914142747e-05, "loss": 1.748, "step": 15765 }, { "epoch": 
0.415222544113774, "grad_norm": 1.5490652322769165, "learning_rate": 2.9252041085067162e-05, "loss": 0.5824, "step": 15766 }, { "epoch": 0.41524888069528576, "grad_norm": 1.5723910331726074, "learning_rate": 2.9250724255991575e-05, "loss": 1.5109, "step": 15767 }, { "epoch": 0.41527521727679745, "grad_norm": 2.016627550125122, "learning_rate": 2.9249407426915987e-05, "loss": 1.8315, "step": 15768 }, { "epoch": 0.4153015538583092, "grad_norm": 1.6356393098831177, "learning_rate": 2.92480905978404e-05, "loss": 1.7194, "step": 15769 }, { "epoch": 0.41532789043982093, "grad_norm": 3.4606828689575195, "learning_rate": 2.9246773768764818e-05, "loss": 0.6875, "step": 15770 }, { "epoch": 0.4153542270213326, "grad_norm": 2.5112810134887695, "learning_rate": 2.924545693968923e-05, "loss": 1.7626, "step": 15771 }, { "epoch": 0.41538056360284437, "grad_norm": 1.9759786128997803, "learning_rate": 2.9244140110613642e-05, "loss": 1.3572, "step": 15772 }, { "epoch": 0.41540690018435605, "grad_norm": 5.558615207672119, "learning_rate": 2.9242823281538058e-05, "loss": 0.9321, "step": 15773 }, { "epoch": 0.4154332367658678, "grad_norm": 2.4439620971679688, "learning_rate": 2.924150645246247e-05, "loss": 0.9004, "step": 15774 }, { "epoch": 0.4154595733473795, "grad_norm": 2.812452793121338, "learning_rate": 2.924018962338689e-05, "loss": 1.4295, "step": 15775 }, { "epoch": 0.41548590992889123, "grad_norm": 1.741997241973877, "learning_rate": 2.92388727943113e-05, "loss": 2.7799, "step": 15776 }, { "epoch": 0.41551224651040297, "grad_norm": 1.874658226966858, "learning_rate": 2.9237555965235714e-05, "loss": 2.321, "step": 15777 }, { "epoch": 0.41553858309191466, "grad_norm": 2.0538578033447266, "learning_rate": 2.9236239136160126e-05, "loss": 1.8635, "step": 15778 }, { "epoch": 0.4155649196734264, "grad_norm": 1.9786607027053833, "learning_rate": 2.9234922307084538e-05, "loss": 2.4563, "step": 15779 }, { "epoch": 0.4155912562549381, "grad_norm": 2.311415433883667, "learning_rate": 
2.9233605478008957e-05, "loss": 1.7576, "step": 15780 }, { "epoch": 0.41561759283644983, "grad_norm": 3.0113344192504883, "learning_rate": 2.923228864893337e-05, "loss": 0.5366, "step": 15781 }, { "epoch": 0.4156439294179615, "grad_norm": 2.2133822441101074, "learning_rate": 2.923097181985778e-05, "loss": 1.0131, "step": 15782 }, { "epoch": 0.41567026599947327, "grad_norm": 1.736491322517395, "learning_rate": 2.9229654990782197e-05, "loss": 2.6036, "step": 15783 }, { "epoch": 0.415696602580985, "grad_norm": 1.9607834815979004, "learning_rate": 2.9228338161706613e-05, "loss": 1.8037, "step": 15784 }, { "epoch": 0.4157229391624967, "grad_norm": 2.7674288749694824, "learning_rate": 2.9227021332631028e-05, "loss": 2.2169, "step": 15785 }, { "epoch": 0.41574927574400844, "grad_norm": 1.9476398229599, "learning_rate": 2.922570450355544e-05, "loss": 2.0498, "step": 15786 }, { "epoch": 0.4157756123255201, "grad_norm": 7.260297775268555, "learning_rate": 2.9224387674479853e-05, "loss": 0.6484, "step": 15787 }, { "epoch": 0.41580194890703187, "grad_norm": 1.9710294008255005, "learning_rate": 2.9223070845404265e-05, "loss": 1.9031, "step": 15788 }, { "epoch": 0.4158282854885436, "grad_norm": 1.526004433631897, "learning_rate": 2.9221754016328684e-05, "loss": 2.0321, "step": 15789 }, { "epoch": 0.4158546220700553, "grad_norm": 2.4040145874023438, "learning_rate": 2.9220437187253096e-05, "loss": 1.5207, "step": 15790 }, { "epoch": 0.41588095865156705, "grad_norm": 2.398714780807495, "learning_rate": 2.921912035817751e-05, "loss": 1.8171, "step": 15791 }, { "epoch": 0.41590729523307873, "grad_norm": 3.9659485816955566, "learning_rate": 2.9217803529101924e-05, "loss": 1.5815, "step": 15792 }, { "epoch": 0.4159336318145905, "grad_norm": 2.435270071029663, "learning_rate": 2.9216486700026336e-05, "loss": 2.3045, "step": 15793 }, { "epoch": 0.41595996839610216, "grad_norm": 2.325011968612671, "learning_rate": 2.9215169870950755e-05, "loss": 1.3099, "step": 15794 }, { "epoch": 
0.4159863049776139, "grad_norm": 3.9564249515533447, "learning_rate": 2.9213853041875167e-05, "loss": 1.6155, "step": 15795 }, { "epoch": 0.41601264155912565, "grad_norm": 3.003162384033203, "learning_rate": 2.921253621279958e-05, "loss": 2.0027, "step": 15796 }, { "epoch": 0.41603897814063734, "grad_norm": 1.8829834461212158, "learning_rate": 2.9211219383723992e-05, "loss": 1.4139, "step": 15797 }, { "epoch": 0.4160653147221491, "grad_norm": 1.7265219688415527, "learning_rate": 2.920990255464841e-05, "loss": 1.6493, "step": 15798 }, { "epoch": 0.41609165130366077, "grad_norm": 1.6604843139648438, "learning_rate": 2.9208585725572823e-05, "loss": 2.0548, "step": 15799 }, { "epoch": 0.4161179878851725, "grad_norm": 2.3974146842956543, "learning_rate": 2.9207268896497235e-05, "loss": 1.7182, "step": 15800 }, { "epoch": 0.4161443244666842, "grad_norm": 1.6551018953323364, "learning_rate": 2.920595206742165e-05, "loss": 1.833, "step": 15801 }, { "epoch": 0.41617066104819594, "grad_norm": 2.1091370582580566, "learning_rate": 2.9204635238346063e-05, "loss": 1.6866, "step": 15802 }, { "epoch": 0.4161969976297077, "grad_norm": 1.6421785354614258, "learning_rate": 2.9203318409270482e-05, "loss": 1.813, "step": 15803 }, { "epoch": 0.4162233342112194, "grad_norm": 4.921753406524658, "learning_rate": 2.9202001580194894e-05, "loss": 1.7102, "step": 15804 }, { "epoch": 0.4162496707927311, "grad_norm": 2.5916521549224854, "learning_rate": 2.9200684751119306e-05, "loss": 1.1856, "step": 15805 }, { "epoch": 0.4162760073742428, "grad_norm": 2.8312504291534424, "learning_rate": 2.919936792204372e-05, "loss": 2.4651, "step": 15806 }, { "epoch": 0.41630234395575455, "grad_norm": 3.3246586322784424, "learning_rate": 2.919805109296813e-05, "loss": 1.7152, "step": 15807 }, { "epoch": 0.41632868053726624, "grad_norm": 1.5243314504623413, "learning_rate": 2.919673426389255e-05, "loss": 1.6843, "step": 15808 }, { "epoch": 0.416355017118778, "grad_norm": 4.375945091247559, "learning_rate": 
2.9195417434816962e-05, "loss": 1.1469, "step": 15809 }, { "epoch": 0.4163813537002897, "grad_norm": 2.180367946624756, "learning_rate": 2.9194100605741374e-05, "loss": 2.4722, "step": 15810 }, { "epoch": 0.4164076902818014, "grad_norm": 1.7941190004348755, "learning_rate": 2.919278377666579e-05, "loss": 1.7475, "step": 15811 }, { "epoch": 0.41643402686331316, "grad_norm": 2.0264933109283447, "learning_rate": 2.9191466947590205e-05, "loss": 1.8101, "step": 15812 }, { "epoch": 0.41646036344482484, "grad_norm": 1.70437753200531, "learning_rate": 2.919015011851462e-05, "loss": 0.8151, "step": 15813 }, { "epoch": 0.4164867000263366, "grad_norm": 2.530991554260254, "learning_rate": 2.9188833289439033e-05, "loss": 1.4086, "step": 15814 }, { "epoch": 0.4165130366078483, "grad_norm": 1.8729264736175537, "learning_rate": 2.9187516460363445e-05, "loss": 1.7716, "step": 15815 }, { "epoch": 0.41653937318936, "grad_norm": 4.4749979972839355, "learning_rate": 2.9186199631287858e-05, "loss": 1.7745, "step": 15816 }, { "epoch": 0.41656570977087176, "grad_norm": 4.986862659454346, "learning_rate": 2.9184882802212277e-05, "loss": 2.198, "step": 15817 }, { "epoch": 0.41659204635238345, "grad_norm": 1.4708622694015503, "learning_rate": 2.918356597313669e-05, "loss": 1.823, "step": 15818 }, { "epoch": 0.4166183829338952, "grad_norm": 3.1022653579711914, "learning_rate": 2.91822491440611e-05, "loss": 0.8135, "step": 15819 }, { "epoch": 0.4166447195154069, "grad_norm": 1.5363943576812744, "learning_rate": 2.9180932314985517e-05, "loss": 2.1016, "step": 15820 }, { "epoch": 0.4166710560969186, "grad_norm": 1.6736706495285034, "learning_rate": 2.917961548590993e-05, "loss": 1.2078, "step": 15821 }, { "epoch": 0.41669739267843037, "grad_norm": 1.6720129251480103, "learning_rate": 2.9178298656834348e-05, "loss": 1.6384, "step": 15822 }, { "epoch": 0.41672372925994206, "grad_norm": 1.7238825559616089, "learning_rate": 2.917698182775876e-05, "loss": 1.902, "step": 15823 }, { "epoch": 
0.4167500658414538, "grad_norm": 1.7199745178222656, "learning_rate": 2.9175664998683172e-05, "loss": 1.4587, "step": 15824 }, { "epoch": 0.4167764024229655, "grad_norm": 2.393589496612549, "learning_rate": 2.9174348169607584e-05, "loss": 1.1613, "step": 15825 }, { "epoch": 0.41680273900447723, "grad_norm": 3.2851781845092773, "learning_rate": 2.9173031340531997e-05, "loss": 1.8116, "step": 15826 }, { "epoch": 0.4168290755859889, "grad_norm": 3.0794565677642822, "learning_rate": 2.9171714511456416e-05, "loss": 2.1656, "step": 15827 }, { "epoch": 0.41685541216750066, "grad_norm": 3.5296437740325928, "learning_rate": 2.9170397682380828e-05, "loss": 1.5944, "step": 15828 }, { "epoch": 0.4168817487490124, "grad_norm": 1.4852328300476074, "learning_rate": 2.9169080853305243e-05, "loss": 1.9289, "step": 15829 }, { "epoch": 0.4169080853305241, "grad_norm": 3.157398223876953, "learning_rate": 2.9167764024229656e-05, "loss": 1.7469, "step": 15830 }, { "epoch": 0.41693442191203584, "grad_norm": 2.660172462463379, "learning_rate": 2.9166447195154075e-05, "loss": 0.5563, "step": 15831 }, { "epoch": 0.4169607584935475, "grad_norm": 1.7150225639343262, "learning_rate": 2.9165130366078487e-05, "loss": 2.2012, "step": 15832 }, { "epoch": 0.41698709507505927, "grad_norm": 2.8092753887176514, "learning_rate": 2.91638135370029e-05, "loss": 0.4589, "step": 15833 }, { "epoch": 0.41701343165657095, "grad_norm": 2.415296792984009, "learning_rate": 2.916249670792731e-05, "loss": 1.1623, "step": 15834 }, { "epoch": 0.4170397682380827, "grad_norm": 1.9134904146194458, "learning_rate": 2.9161179878851723e-05, "loss": 1.1477, "step": 15835 }, { "epoch": 0.41706610481959444, "grad_norm": 1.5199739933013916, "learning_rate": 2.9159863049776142e-05, "loss": 2.1967, "step": 15836 }, { "epoch": 0.41709244140110613, "grad_norm": 4.269620895385742, "learning_rate": 2.9158546220700555e-05, "loss": 0.9638, "step": 15837 }, { "epoch": 0.41711877798261787, "grad_norm": 5.702027320861816, 
"learning_rate": 2.9157229391624967e-05, "loss": 1.925, "step": 15838 }, { "epoch": 0.41714511456412956, "grad_norm": 2.487232208251953, "learning_rate": 2.9155912562549382e-05, "loss": 2.1036, "step": 15839 }, { "epoch": 0.4171714511456413, "grad_norm": 3.8568320274353027, "learning_rate": 2.9154595733473795e-05, "loss": 1.7937, "step": 15840 }, { "epoch": 0.417197787727153, "grad_norm": 1.774970531463623, "learning_rate": 2.9153278904398214e-05, "loss": 1.6955, "step": 15841 }, { "epoch": 0.41722412430866473, "grad_norm": 2.2013163566589355, "learning_rate": 2.9151962075322626e-05, "loss": 1.6738, "step": 15842 }, { "epoch": 0.4172504608901765, "grad_norm": 3.0009524822235107, "learning_rate": 2.9150645246247038e-05, "loss": 1.8865, "step": 15843 }, { "epoch": 0.41727679747168817, "grad_norm": 2.259378671646118, "learning_rate": 2.914932841717145e-05, "loss": 1.9571, "step": 15844 }, { "epoch": 0.4173031340531999, "grad_norm": 1.8683756589889526, "learning_rate": 2.914801158809587e-05, "loss": 1.7056, "step": 15845 }, { "epoch": 0.4173294706347116, "grad_norm": 2.313812017440796, "learning_rate": 2.914669475902028e-05, "loss": 2.0922, "step": 15846 }, { "epoch": 0.41735580721622334, "grad_norm": 6.496887683868408, "learning_rate": 2.9145377929944694e-05, "loss": 2.0727, "step": 15847 }, { "epoch": 0.41738214379773503, "grad_norm": 2.928194284439087, "learning_rate": 2.914406110086911e-05, "loss": 1.9944, "step": 15848 }, { "epoch": 0.41740848037924677, "grad_norm": 3.5459048748016357, "learning_rate": 2.914274427179352e-05, "loss": 1.1235, "step": 15849 }, { "epoch": 0.4174348169607585, "grad_norm": 3.09625506401062, "learning_rate": 2.914142744271794e-05, "loss": 1.624, "step": 15850 }, { "epoch": 0.4174611535422702, "grad_norm": 1.8869256973266602, "learning_rate": 2.9140110613642353e-05, "loss": 1.0171, "step": 15851 }, { "epoch": 0.41748749012378195, "grad_norm": 2.104810953140259, "learning_rate": 2.9138793784566765e-05, "loss": 2.0962, "step": 15852 }, { 
"epoch": 0.41751382670529363, "grad_norm": 2.244263172149658, "learning_rate": 2.9137476955491177e-05, "loss": 1.9973, "step": 15853 }, { "epoch": 0.4175401632868054, "grad_norm": 4.747417449951172, "learning_rate": 2.913616012641559e-05, "loss": 0.9527, "step": 15854 }, { "epoch": 0.41756649986831706, "grad_norm": 2.2802634239196777, "learning_rate": 2.9134843297340008e-05, "loss": 0.8045, "step": 15855 }, { "epoch": 0.4175928364498288, "grad_norm": 2.1140217781066895, "learning_rate": 2.913352646826442e-05, "loss": 2.0167, "step": 15856 }, { "epoch": 0.41761917303134055, "grad_norm": 1.5266413688659668, "learning_rate": 2.9132209639188833e-05, "loss": 1.2593, "step": 15857 }, { "epoch": 0.41764550961285224, "grad_norm": 4.241495609283447, "learning_rate": 2.913089281011325e-05, "loss": 2.0269, "step": 15858 }, { "epoch": 0.417671846194364, "grad_norm": 1.7115955352783203, "learning_rate": 2.912957598103766e-05, "loss": 0.4918, "step": 15859 }, { "epoch": 0.41769818277587567, "grad_norm": 1.7254637479782104, "learning_rate": 2.912825915196208e-05, "loss": 2.0225, "step": 15860 }, { "epoch": 0.4177245193573874, "grad_norm": 3.146047592163086, "learning_rate": 2.9126942322886492e-05, "loss": 1.7603, "step": 15861 }, { "epoch": 0.41775085593889916, "grad_norm": 1.4908065795898438, "learning_rate": 2.9125625493810904e-05, "loss": 1.612, "step": 15862 }, { "epoch": 0.41777719252041084, "grad_norm": 2.681586503982544, "learning_rate": 2.9124308664735316e-05, "loss": 1.4214, "step": 15863 }, { "epoch": 0.4178035291019226, "grad_norm": 16.2877197265625, "learning_rate": 2.9122991835659735e-05, "loss": 1.8115, "step": 15864 }, { "epoch": 0.4178298656834343, "grad_norm": 1.5722683668136597, "learning_rate": 2.9121675006584147e-05, "loss": 2.499, "step": 15865 }, { "epoch": 0.417856202264946, "grad_norm": 1.9022072553634644, "learning_rate": 2.912035817750856e-05, "loss": 1.5993, "step": 15866 }, { "epoch": 0.4178825388464577, "grad_norm": 1.8968604803085327, 
"learning_rate": 2.9119041348432975e-05, "loss": 2.1083, "step": 15867 }, { "epoch": 0.41790887542796945, "grad_norm": 1.8811109066009521, "learning_rate": 2.9117724519357387e-05, "loss": 2.2591, "step": 15868 }, { "epoch": 0.4179352120094812, "grad_norm": 2.29788875579834, "learning_rate": 2.9116407690281806e-05, "loss": 1.5296, "step": 15869 }, { "epoch": 0.4179615485909929, "grad_norm": 3.653179168701172, "learning_rate": 2.911509086120622e-05, "loss": 0.9831, "step": 15870 }, { "epoch": 0.4179878851725046, "grad_norm": 1.916192650794983, "learning_rate": 2.911377403213063e-05, "loss": 2.2055, "step": 15871 }, { "epoch": 0.4180142217540163, "grad_norm": 3.860635757446289, "learning_rate": 2.9112457203055043e-05, "loss": 0.6394, "step": 15872 }, { "epoch": 0.41804055833552806, "grad_norm": 1.938744306564331, "learning_rate": 2.9111140373979455e-05, "loss": 1.2032, "step": 15873 }, { "epoch": 0.41806689491703974, "grad_norm": 1.6979886293411255, "learning_rate": 2.9109823544903874e-05, "loss": 0.2738, "step": 15874 }, { "epoch": 0.4180932314985515, "grad_norm": 2.175947666168213, "learning_rate": 2.9108506715828286e-05, "loss": 1.4278, "step": 15875 }, { "epoch": 0.41811956808006323, "grad_norm": 5.7649054527282715, "learning_rate": 2.9107189886752702e-05, "loss": 0.9207, "step": 15876 }, { "epoch": 0.4181459046615749, "grad_norm": 3.200817823410034, "learning_rate": 2.9105873057677114e-05, "loss": 1.3369, "step": 15877 }, { "epoch": 0.41817224124308666, "grad_norm": 2.9281327724456787, "learning_rate": 2.9104556228601533e-05, "loss": 1.7935, "step": 15878 }, { "epoch": 0.41819857782459835, "grad_norm": 1.5523358583450317, "learning_rate": 2.9103239399525945e-05, "loss": 1.7339, "step": 15879 }, { "epoch": 0.4182249144061101, "grad_norm": 3.345917224884033, "learning_rate": 2.9101922570450358e-05, "loss": 1.3455, "step": 15880 }, { "epoch": 0.4182512509876218, "grad_norm": 1.7881011962890625, "learning_rate": 2.910060574137477e-05, "loss": 1.7529, "step": 15881 }, 
{ "epoch": 0.4182775875691335, "grad_norm": 1.555479884147644, "learning_rate": 2.9099288912299182e-05, "loss": 1.7742, "step": 15882 }, { "epoch": 0.41830392415064527, "grad_norm": 2.2716665267944336, "learning_rate": 2.90979720832236e-05, "loss": 1.3526, "step": 15883 }, { "epoch": 0.41833026073215696, "grad_norm": 1.9988768100738525, "learning_rate": 2.9096655254148013e-05, "loss": 2.3314, "step": 15884 }, { "epoch": 0.4183565973136687, "grad_norm": 4.702402114868164, "learning_rate": 2.9095338425072425e-05, "loss": 1.5109, "step": 15885 }, { "epoch": 0.4183829338951804, "grad_norm": 1.7483413219451904, "learning_rate": 2.909402159599684e-05, "loss": 1.9926, "step": 15886 }, { "epoch": 0.41840927047669213, "grad_norm": 3.561209201812744, "learning_rate": 2.9092704766921253e-05, "loss": 1.2674, "step": 15887 }, { "epoch": 0.4184356070582038, "grad_norm": 2.1175496578216553, "learning_rate": 2.9091387937845672e-05, "loss": 1.7887, "step": 15888 }, { "epoch": 0.41846194363971556, "grad_norm": 1.6008661985397339, "learning_rate": 2.9090071108770084e-05, "loss": 1.8637, "step": 15889 }, { "epoch": 0.4184882802212273, "grad_norm": 1.8572592735290527, "learning_rate": 2.9088754279694497e-05, "loss": 1.3288, "step": 15890 }, { "epoch": 0.418514616802739, "grad_norm": 2.8444910049438477, "learning_rate": 2.908743745061891e-05, "loss": 0.7816, "step": 15891 }, { "epoch": 0.41854095338425074, "grad_norm": 2.1428418159484863, "learning_rate": 2.908612062154332e-05, "loss": 0.7519, "step": 15892 }, { "epoch": 0.4185672899657624, "grad_norm": 1.5301878452301025, "learning_rate": 2.908480379246774e-05, "loss": 2.3541, "step": 15893 }, { "epoch": 0.41859362654727417, "grad_norm": 2.8264079093933105, "learning_rate": 2.9083486963392152e-05, "loss": 2.1651, "step": 15894 }, { "epoch": 0.4186199631287859, "grad_norm": 1.492467999458313, "learning_rate": 2.9082170134316568e-05, "loss": 1.3263, "step": 15895 }, { "epoch": 0.4186462997102976, "grad_norm": 2.664212226867676, 
"learning_rate": 2.908085330524098e-05, "loss": 1.4047, "step": 15896 }, { "epoch": 0.41867263629180934, "grad_norm": 2.399510145187378, "learning_rate": 2.90795364761654e-05, "loss": 2.0924, "step": 15897 }, { "epoch": 0.41869897287332103, "grad_norm": 1.7267956733703613, "learning_rate": 2.907821964708981e-05, "loss": 1.6726, "step": 15898 }, { "epoch": 0.4187253094548328, "grad_norm": 1.7243093252182007, "learning_rate": 2.9076902818014223e-05, "loss": 1.0425, "step": 15899 }, { "epoch": 0.41875164603634446, "grad_norm": 1.4584072828292847, "learning_rate": 2.9075585988938636e-05, "loss": 1.5449, "step": 15900 }, { "epoch": 0.4187779826178562, "grad_norm": 3.6513116359710693, "learning_rate": 2.9074269159863048e-05, "loss": 1.6765, "step": 15901 }, { "epoch": 0.41880431919936795, "grad_norm": 5.887742042541504, "learning_rate": 2.9072952330787467e-05, "loss": 1.8525, "step": 15902 }, { "epoch": 0.41883065578087963, "grad_norm": 1.7512121200561523, "learning_rate": 2.907163550171188e-05, "loss": 1.2852, "step": 15903 }, { "epoch": 0.4188569923623914, "grad_norm": 4.285510540008545, "learning_rate": 2.9070318672636295e-05, "loss": 1.7264, "step": 15904 }, { "epoch": 0.41888332894390307, "grad_norm": 1.7619471549987793, "learning_rate": 2.9069001843560707e-05, "loss": 1.3431, "step": 15905 }, { "epoch": 0.4189096655254148, "grad_norm": 2.8740603923797607, "learning_rate": 2.906768501448512e-05, "loss": 1.9017, "step": 15906 }, { "epoch": 0.4189360021069265, "grad_norm": 3.6611809730529785, "learning_rate": 2.9066368185409538e-05, "loss": 5.5193, "step": 15907 }, { "epoch": 0.41896233868843824, "grad_norm": 2.0516936779022217, "learning_rate": 2.906505135633395e-05, "loss": 1.9473, "step": 15908 }, { "epoch": 0.41898867526995, "grad_norm": 5.081444263458252, "learning_rate": 2.9063734527258362e-05, "loss": 2.0338, "step": 15909 }, { "epoch": 0.41901501185146167, "grad_norm": 2.4108309745788574, "learning_rate": 2.9062417698182775e-05, "loss": 1.1406, "step": 15910 
}, { "epoch": 0.4190413484329734, "grad_norm": 1.5972779989242554, "learning_rate": 2.9061100869107194e-05, "loss": 1.1847, "step": 15911 }, { "epoch": 0.4190676850144851, "grad_norm": 1.9686408042907715, "learning_rate": 2.9059784040031606e-05, "loss": 2.0174, "step": 15912 }, { "epoch": 0.41909402159599685, "grad_norm": 1.8970401287078857, "learning_rate": 2.9058467210956018e-05, "loss": 2.7467, "step": 15913 }, { "epoch": 0.41912035817750853, "grad_norm": 3.6564159393310547, "learning_rate": 2.9057150381880434e-05, "loss": 1.0873, "step": 15914 }, { "epoch": 0.4191466947590203, "grad_norm": 2.1340556144714355, "learning_rate": 2.9055833552804846e-05, "loss": 1.7619, "step": 15915 }, { "epoch": 0.419173031340532, "grad_norm": 1.7491053342819214, "learning_rate": 2.9054516723729265e-05, "loss": 1.8354, "step": 15916 }, { "epoch": 0.4191993679220437, "grad_norm": 2.103529453277588, "learning_rate": 2.9053199894653677e-05, "loss": 2.0311, "step": 15917 }, { "epoch": 0.41922570450355545, "grad_norm": 1.4501547813415527, "learning_rate": 2.905188306557809e-05, "loss": 0.7853, "step": 15918 }, { "epoch": 0.41925204108506714, "grad_norm": 1.9094527959823608, "learning_rate": 2.90505662365025e-05, "loss": 2.3446, "step": 15919 }, { "epoch": 0.4192783776665789, "grad_norm": 2.823381185531616, "learning_rate": 2.9049249407426914e-05, "loss": 1.0354, "step": 15920 }, { "epoch": 0.41930471424809057, "grad_norm": 1.5064691305160522, "learning_rate": 2.9047932578351333e-05, "loss": 1.7708, "step": 15921 }, { "epoch": 0.4193310508296023, "grad_norm": 2.477412700653076, "learning_rate": 2.9046615749275745e-05, "loss": 2.3426, "step": 15922 }, { "epoch": 0.41935738741111406, "grad_norm": 2.4033079147338867, "learning_rate": 2.904529892020016e-05, "loss": 1.602, "step": 15923 }, { "epoch": 0.41938372399262575, "grad_norm": 2.9287612438201904, "learning_rate": 2.9043982091124573e-05, "loss": 1.1466, "step": 15924 }, { "epoch": 0.4194100605741375, "grad_norm": 2.5729892253875732, 
"learning_rate": 2.9042665262048985e-05, "loss": 0.8843, "step": 15925 }, { "epoch": 0.4194363971556492, "grad_norm": 2.1236963272094727, "learning_rate": 2.9041348432973404e-05, "loss": 2.0105, "step": 15926 }, { "epoch": 0.4194627337371609, "grad_norm": 1.672542691230774, "learning_rate": 2.9040031603897816e-05, "loss": 1.7941, "step": 15927 }, { "epoch": 0.41948907031867266, "grad_norm": 2.8274424076080322, "learning_rate": 2.903871477482223e-05, "loss": 0.7584, "step": 15928 }, { "epoch": 0.41951540690018435, "grad_norm": 1.7160110473632812, "learning_rate": 2.903739794574664e-05, "loss": 1.8366, "step": 15929 }, { "epoch": 0.4195417434816961, "grad_norm": 1.9629780054092407, "learning_rate": 2.903608111667106e-05, "loss": 1.5624, "step": 15930 }, { "epoch": 0.4195680800632078, "grad_norm": 1.6561269760131836, "learning_rate": 2.9034764287595472e-05, "loss": 0.4414, "step": 15931 }, { "epoch": 0.4195944166447195, "grad_norm": 2.1763134002685547, "learning_rate": 2.9033447458519887e-05, "loss": 1.8046, "step": 15932 }, { "epoch": 0.4196207532262312, "grad_norm": 2.108588218688965, "learning_rate": 2.90321306294443e-05, "loss": 1.073, "step": 15933 }, { "epoch": 0.41964708980774296, "grad_norm": 1.8754918575286865, "learning_rate": 2.9030813800368712e-05, "loss": 1.8888, "step": 15934 }, { "epoch": 0.4196734263892547, "grad_norm": 3.2108142375946045, "learning_rate": 2.902949697129313e-05, "loss": 0.8351, "step": 15935 }, { "epoch": 0.4196997629707664, "grad_norm": 2.4658565521240234, "learning_rate": 2.9028180142217543e-05, "loss": 1.3003, "step": 15936 }, { "epoch": 0.41972609955227813, "grad_norm": 2.130405902862549, "learning_rate": 2.9026863313141955e-05, "loss": 1.7646, "step": 15937 }, { "epoch": 0.4197524361337898, "grad_norm": 1.5278515815734863, "learning_rate": 2.9025546484066367e-05, "loss": 1.7577, "step": 15938 }, { "epoch": 0.41977877271530156, "grad_norm": 1.652101755142212, "learning_rate": 2.902422965499078e-05, "loss": 2.0732, "step": 15939 }, 
{ "epoch": 0.41980510929681325, "grad_norm": 3.1307928562164307, "learning_rate": 2.90229128259152e-05, "loss": 1.7772, "step": 15940 }, { "epoch": 0.419831445878325, "grad_norm": 2.3606410026550293, "learning_rate": 2.902159599683961e-05, "loss": 1.221, "step": 15941 }, { "epoch": 0.41985778245983674, "grad_norm": 1.6596097946166992, "learning_rate": 2.9020279167764026e-05, "loss": 2.1541, "step": 15942 }, { "epoch": 0.4198841190413484, "grad_norm": 2.633572816848755, "learning_rate": 2.901896233868844e-05, "loss": 1.5241, "step": 15943 }, { "epoch": 0.41991045562286017, "grad_norm": 2.3885717391967773, "learning_rate": 2.9017645509612858e-05, "loss": 0.4491, "step": 15944 }, { "epoch": 0.41993679220437186, "grad_norm": 2.233095645904541, "learning_rate": 2.901632868053727e-05, "loss": 0.9087, "step": 15945 }, { "epoch": 0.4199631287858836, "grad_norm": 1.7320566177368164, "learning_rate": 2.9015011851461682e-05, "loss": 1.9692, "step": 15946 }, { "epoch": 0.4199894653673953, "grad_norm": 2.7305448055267334, "learning_rate": 2.9013695022386094e-05, "loss": 1.0588, "step": 15947 }, { "epoch": 0.42001580194890703, "grad_norm": 1.9532499313354492, "learning_rate": 2.9012378193310506e-05, "loss": 1.2022, "step": 15948 }, { "epoch": 0.4200421385304188, "grad_norm": 2.070375919342041, "learning_rate": 2.9011061364234925e-05, "loss": 2.1529, "step": 15949 }, { "epoch": 0.42006847511193046, "grad_norm": 1.5699832439422607, "learning_rate": 2.9009744535159338e-05, "loss": 1.9472, "step": 15950 }, { "epoch": 0.4200948116934422, "grad_norm": 2.310711622238159, "learning_rate": 2.9008427706083753e-05, "loss": 1.9255, "step": 15951 }, { "epoch": 0.4201211482749539, "grad_norm": 2.3300163745880127, "learning_rate": 2.9007110877008165e-05, "loss": 1.9764, "step": 15952 }, { "epoch": 0.42014748485646564, "grad_norm": 1.8245055675506592, "learning_rate": 2.9005794047932578e-05, "loss": 1.8168, "step": 15953 }, { "epoch": 0.4201738214379773, "grad_norm": 3.797804117202759, 
"learning_rate": 2.9004477218856997e-05, "loss": 2.3182, "step": 15954 }, { "epoch": 0.42020015801948907, "grad_norm": 2.3537936210632324, "learning_rate": 2.900316038978141e-05, "loss": 2.1785, "step": 15955 }, { "epoch": 0.4202264946010008, "grad_norm": 2.155189275741577, "learning_rate": 2.900184356070582e-05, "loss": 2.2098, "step": 15956 }, { "epoch": 0.4202528311825125, "grad_norm": 1.8503748178482056, "learning_rate": 2.9000526731630233e-05, "loss": 1.9656, "step": 15957 }, { "epoch": 0.42027916776402424, "grad_norm": 2.7719359397888184, "learning_rate": 2.8999209902554645e-05, "loss": 2.2013, "step": 15958 }, { "epoch": 0.42030550434553593, "grad_norm": 2.182180404663086, "learning_rate": 2.8997893073479064e-05, "loss": 1.5532, "step": 15959 }, { "epoch": 0.4203318409270477, "grad_norm": 2.1619744300842285, "learning_rate": 2.8996576244403477e-05, "loss": 1.9065, "step": 15960 }, { "epoch": 0.4203581775085594, "grad_norm": 1.6861029863357544, "learning_rate": 2.8995259415327892e-05, "loss": 1.3705, "step": 15961 }, { "epoch": 0.4203845140900711, "grad_norm": 2.342731475830078, "learning_rate": 2.8993942586252304e-05, "loss": 2.328, "step": 15962 }, { "epoch": 0.42041085067158285, "grad_norm": 3.1964547634124756, "learning_rate": 2.8992625757176723e-05, "loss": 1.206, "step": 15963 }, { "epoch": 0.42043718725309454, "grad_norm": 1.734544277191162, "learning_rate": 2.8991308928101136e-05, "loss": 1.6301, "step": 15964 }, { "epoch": 0.4204635238346063, "grad_norm": 1.902092695236206, "learning_rate": 2.8989992099025548e-05, "loss": 1.8637, "step": 15965 }, { "epoch": 0.42048986041611797, "grad_norm": 2.298835515975952, "learning_rate": 2.898867526994996e-05, "loss": 1.769, "step": 15966 }, { "epoch": 0.4205161969976297, "grad_norm": 2.136662483215332, "learning_rate": 2.8987358440874372e-05, "loss": 1.8443, "step": 15967 }, { "epoch": 0.42054253357914145, "grad_norm": 3.5213842391967773, "learning_rate": 2.898604161179879e-05, "loss": 2.3779, "step": 15968 }, 
{ "epoch": 0.42056887016065314, "grad_norm": 1.706467628479004, "learning_rate": 2.8984724782723203e-05, "loss": 1.424, "step": 15969 }, { "epoch": 0.4205952067421649, "grad_norm": 1.7673289775848389, "learning_rate": 2.898340795364762e-05, "loss": 1.9519, "step": 15970 }, { "epoch": 0.42062154332367657, "grad_norm": 4.114656448364258, "learning_rate": 2.898209112457203e-05, "loss": 0.9185, "step": 15971 }, { "epoch": 0.4206478799051883, "grad_norm": 1.8710373640060425, "learning_rate": 2.8980774295496444e-05, "loss": 1.8882, "step": 15972 }, { "epoch": 0.4206742164867, "grad_norm": 1.4987995624542236, "learning_rate": 2.8979457466420862e-05, "loss": 1.5713, "step": 15973 }, { "epoch": 0.42070055306821175, "grad_norm": 1.9544272422790527, "learning_rate": 2.8978140637345275e-05, "loss": 2.1226, "step": 15974 }, { "epoch": 0.4207268896497235, "grad_norm": 1.9978083372116089, "learning_rate": 2.8976823808269687e-05, "loss": 1.9512, "step": 15975 }, { "epoch": 0.4207532262312352, "grad_norm": 3.6790876388549805, "learning_rate": 2.89755069791941e-05, "loss": 1.3376, "step": 15976 }, { "epoch": 0.4207795628127469, "grad_norm": 1.8221933841705322, "learning_rate": 2.8974190150118518e-05, "loss": 1.1866, "step": 15977 }, { "epoch": 0.4208058993942586, "grad_norm": 1.7291162014007568, "learning_rate": 2.897287332104293e-05, "loss": 2.202, "step": 15978 }, { "epoch": 0.42083223597577035, "grad_norm": 2.848832845687866, "learning_rate": 2.8971556491967346e-05, "loss": 1.5461, "step": 15979 }, { "epoch": 0.42085857255728204, "grad_norm": 2.547539472579956, "learning_rate": 2.8970239662891758e-05, "loss": 1.8744, "step": 15980 }, { "epoch": 0.4208849091387938, "grad_norm": 2.420252799987793, "learning_rate": 2.896892283381617e-05, "loss": 2.2973, "step": 15981 }, { "epoch": 0.4209112457203055, "grad_norm": 2.7987349033355713, "learning_rate": 2.896760600474059e-05, "loss": 1.9818, "step": 15982 }, { "epoch": 0.4209375823018172, "grad_norm": 4.058008670806885, "learning_rate": 
2.8966289175665e-05, "loss": 1.3098, "step": 15983 }, { "epoch": 0.42096391888332896, "grad_norm": 1.5876502990722656, "learning_rate": 2.8964972346589414e-05, "loss": 1.5784, "step": 15984 }, { "epoch": 0.42099025546484065, "grad_norm": 2.0043179988861084, "learning_rate": 2.8963655517513826e-05, "loss": 1.8581, "step": 15985 }, { "epoch": 0.4210165920463524, "grad_norm": 3.14562726020813, "learning_rate": 2.8962338688438238e-05, "loss": 1.9681, "step": 15986 }, { "epoch": 0.4210429286278641, "grad_norm": 1.9450507164001465, "learning_rate": 2.8961021859362657e-05, "loss": 1.1009, "step": 15987 }, { "epoch": 0.4210692652093758, "grad_norm": 2.42622709274292, "learning_rate": 2.895970503028707e-05, "loss": 1.4945, "step": 15988 }, { "epoch": 0.42109560179088756, "grad_norm": 1.9436426162719727, "learning_rate": 2.8958388201211485e-05, "loss": 1.044, "step": 15989 }, { "epoch": 0.42112193837239925, "grad_norm": 1.7001558542251587, "learning_rate": 2.8957071372135897e-05, "loss": 1.9212, "step": 15990 }, { "epoch": 0.421148274953911, "grad_norm": 3.6431925296783447, "learning_rate": 2.895575454306031e-05, "loss": 1.3559, "step": 15991 }, { "epoch": 0.4211746115354227, "grad_norm": 1.6292749643325806, "learning_rate": 2.895443771398473e-05, "loss": 1.3776, "step": 15992 }, { "epoch": 0.4212009481169344, "grad_norm": 4.4837965965271, "learning_rate": 2.895312088490914e-05, "loss": 2.5701, "step": 15993 }, { "epoch": 0.42122728469844617, "grad_norm": 1.6011847257614136, "learning_rate": 2.8951804055833553e-05, "loss": 1.9054, "step": 15994 }, { "epoch": 0.42125362127995786, "grad_norm": 2.1386990547180176, "learning_rate": 2.8950487226757965e-05, "loss": 1.4471, "step": 15995 }, { "epoch": 0.4212799578614696, "grad_norm": 2.749276638031006, "learning_rate": 2.8949170397682384e-05, "loss": 1.9002, "step": 15996 }, { "epoch": 0.4213062944429813, "grad_norm": 3.0917043685913086, "learning_rate": 2.8947853568606796e-05, "loss": 1.9788, "step": 15997 }, { "epoch": 
0.42133263102449303, "grad_norm": 2.294778347015381, "learning_rate": 2.8946536739531212e-05, "loss": 0.8067, "step": 15998 }, { "epoch": 0.4213589676060047, "grad_norm": 1.9548379182815552, "learning_rate": 2.8945219910455624e-05, "loss": 1.6664, "step": 15999 }, { "epoch": 0.42138530418751646, "grad_norm": 1.8013393878936768, "learning_rate": 2.8943903081380036e-05, "loss": 2.5083, "step": 16000 }, { "epoch": 0.4214116407690282, "grad_norm": 4.092260360717773, "learning_rate": 2.8942586252304455e-05, "loss": 0.9086, "step": 16001 }, { "epoch": 0.4214379773505399, "grad_norm": 1.7577232122421265, "learning_rate": 2.8941269423228867e-05, "loss": 1.8162, "step": 16002 }, { "epoch": 0.42146431393205164, "grad_norm": 1.989029884338379, "learning_rate": 2.893995259415328e-05, "loss": 1.9404, "step": 16003 }, { "epoch": 0.4214906505135633, "grad_norm": 3.9387192726135254, "learning_rate": 2.8938635765077692e-05, "loss": 1.6626, "step": 16004 }, { "epoch": 0.42151698709507507, "grad_norm": 1.6529161930084229, "learning_rate": 2.8937318936002107e-05, "loss": 1.1396, "step": 16005 }, { "epoch": 0.42154332367658676, "grad_norm": 3.0110344886779785, "learning_rate": 2.8936002106926523e-05, "loss": 1.5878, "step": 16006 }, { "epoch": 0.4215696602580985, "grad_norm": 4.271462440490723, "learning_rate": 2.893468527785094e-05, "loss": 0.7398, "step": 16007 }, { "epoch": 0.42159599683961024, "grad_norm": 3.1086785793304443, "learning_rate": 2.893336844877535e-05, "loss": 1.9452, "step": 16008 }, { "epoch": 0.42162233342112193, "grad_norm": 1.503722071647644, "learning_rate": 2.8932051619699763e-05, "loss": 1.4847, "step": 16009 }, { "epoch": 0.4216486700026337, "grad_norm": 1.560887098312378, "learning_rate": 2.8930734790624182e-05, "loss": 1.6213, "step": 16010 }, { "epoch": 0.42167500658414536, "grad_norm": 1.760580062866211, "learning_rate": 2.8929417961548594e-05, "loss": 1.5621, "step": 16011 }, { "epoch": 0.4217013431656571, "grad_norm": 1.9348901510238647, "learning_rate": 
2.8928101132473006e-05, "loss": 2.2423, "step": 16012 }, { "epoch": 0.4217276797471688, "grad_norm": 1.8959280252456665, "learning_rate": 2.892678430339742e-05, "loss": 2.3895, "step": 16013 }, { "epoch": 0.42175401632868054, "grad_norm": 2.430936813354492, "learning_rate": 2.892546747432183e-05, "loss": 1.2636, "step": 16014 }, { "epoch": 0.4217803529101923, "grad_norm": 1.6049647331237793, "learning_rate": 2.892415064524625e-05, "loss": 2.0687, "step": 16015 }, { "epoch": 0.42180668949170397, "grad_norm": 2.852264165878296, "learning_rate": 2.8922833816170662e-05, "loss": 0.3198, "step": 16016 }, { "epoch": 0.4218330260732157, "grad_norm": 1.8353729248046875, "learning_rate": 2.8921516987095078e-05, "loss": 1.6445, "step": 16017 }, { "epoch": 0.4218593626547274, "grad_norm": 2.0958211421966553, "learning_rate": 2.892020015801949e-05, "loss": 1.9426, "step": 16018 }, { "epoch": 0.42188569923623914, "grad_norm": 3.931833028793335, "learning_rate": 2.8918883328943902e-05, "loss": 2.7907, "step": 16019 }, { "epoch": 0.42191203581775083, "grad_norm": 3.747877836227417, "learning_rate": 2.891756649986832e-05, "loss": 1.6726, "step": 16020 }, { "epoch": 0.4219383723992626, "grad_norm": 3.309068202972412, "learning_rate": 2.8916249670792733e-05, "loss": 1.791, "step": 16021 }, { "epoch": 0.4219647089807743, "grad_norm": 3.228571653366089, "learning_rate": 2.8914932841717145e-05, "loss": 1.802, "step": 16022 }, { "epoch": 0.421991045562286, "grad_norm": 1.9346814155578613, "learning_rate": 2.8913616012641558e-05, "loss": 1.787, "step": 16023 }, { "epoch": 0.42201738214379775, "grad_norm": 1.7779711484909058, "learning_rate": 2.8912299183565973e-05, "loss": 1.111, "step": 16024 }, { "epoch": 0.42204371872530944, "grad_norm": 1.7615411281585693, "learning_rate": 2.891098235449039e-05, "loss": 1.4725, "step": 16025 }, { "epoch": 0.4220700553068212, "grad_norm": 2.5310657024383545, "learning_rate": 2.8909665525414804e-05, "loss": 1.7039, "step": 16026 }, { "epoch": 
0.42209639188833287, "grad_norm": 1.7241129875183105, "learning_rate": 2.8908348696339217e-05, "loss": 1.5877, "step": 16027 }, { "epoch": 0.4221227284698446, "grad_norm": 1.4583377838134766, "learning_rate": 2.890703186726363e-05, "loss": 1.7874, "step": 16028 }, { "epoch": 0.42214906505135635, "grad_norm": 3.046745777130127, "learning_rate": 2.8905715038188048e-05, "loss": 0.8007, "step": 16029 }, { "epoch": 0.42217540163286804, "grad_norm": 1.6729148626327515, "learning_rate": 2.890439820911246e-05, "loss": 1.5125, "step": 16030 }, { "epoch": 0.4222017382143798, "grad_norm": 1.5012695789337158, "learning_rate": 2.8903081380036872e-05, "loss": 1.5603, "step": 16031 }, { "epoch": 0.4222280747958915, "grad_norm": 3.8798723220825195, "learning_rate": 2.8901764550961284e-05, "loss": 1.0792, "step": 16032 }, { "epoch": 0.4222544113774032, "grad_norm": 2.6700475215911865, "learning_rate": 2.8900447721885697e-05, "loss": 1.1422, "step": 16033 }, { "epoch": 0.42228074795891496, "grad_norm": 2.944796562194824, "learning_rate": 2.8899130892810116e-05, "loss": 1.8004, "step": 16034 }, { "epoch": 0.42230708454042665, "grad_norm": 1.6480005979537964, "learning_rate": 2.8897814063734528e-05, "loss": 1.8199, "step": 16035 }, { "epoch": 0.4223334211219384, "grad_norm": 2.051287889480591, "learning_rate": 2.8896497234658944e-05, "loss": 1.6361, "step": 16036 }, { "epoch": 0.4223597577034501, "grad_norm": 2.1824262142181396, "learning_rate": 2.8895180405583356e-05, "loss": 2.0993, "step": 16037 }, { "epoch": 0.4223860942849618, "grad_norm": 2.374279499053955, "learning_rate": 2.8893863576507768e-05, "loss": 2.4941, "step": 16038 }, { "epoch": 0.4224124308664735, "grad_norm": 1.8703571557998657, "learning_rate": 2.8892546747432187e-05, "loss": 1.6973, "step": 16039 }, { "epoch": 0.42243876744798525, "grad_norm": 3.0656161308288574, "learning_rate": 2.88912299183566e-05, "loss": 0.9282, "step": 16040 }, { "epoch": 0.422465104029497, "grad_norm": 2.0200493335723877, "learning_rate": 
2.888991308928101e-05, "loss": 1.5263, "step": 16041 }, { "epoch": 0.4224914406110087, "grad_norm": 2.0137667655944824, "learning_rate": 2.8888596260205424e-05, "loss": 1.8077, "step": 16042 }, { "epoch": 0.4225177771925204, "grad_norm": 1.68495774269104, "learning_rate": 2.8887279431129843e-05, "loss": 1.6741, "step": 16043 }, { "epoch": 0.4225441137740321, "grad_norm": 2.9733004570007324, "learning_rate": 2.8885962602054255e-05, "loss": 1.4297, "step": 16044 }, { "epoch": 0.42257045035554386, "grad_norm": 1.8878774642944336, "learning_rate": 2.888464577297867e-05, "loss": 0.8646, "step": 16045 }, { "epoch": 0.42259678693705555, "grad_norm": 1.8320461511611938, "learning_rate": 2.8883328943903083e-05, "loss": 2.0844, "step": 16046 }, { "epoch": 0.4226231235185673, "grad_norm": 2.896723985671997, "learning_rate": 2.8882012114827495e-05, "loss": 1.6323, "step": 16047 }, { "epoch": 0.42264946010007903, "grad_norm": 2.671372413635254, "learning_rate": 2.8880695285751914e-05, "loss": 2.2406, "step": 16048 }, { "epoch": 0.4226757966815907, "grad_norm": 2.8025848865509033, "learning_rate": 2.8879378456676326e-05, "loss": 1.0264, "step": 16049 }, { "epoch": 0.42270213326310246, "grad_norm": 1.8795288801193237, "learning_rate": 2.8878061627600738e-05, "loss": 1.9231, "step": 16050 }, { "epoch": 0.42272846984461415, "grad_norm": 2.6844730377197266, "learning_rate": 2.887674479852515e-05, "loss": 1.764, "step": 16051 }, { "epoch": 0.4227548064261259, "grad_norm": 2.2637476921081543, "learning_rate": 2.8875427969449566e-05, "loss": 2.0575, "step": 16052 }, { "epoch": 0.4227811430076376, "grad_norm": 1.7152243852615356, "learning_rate": 2.887411114037398e-05, "loss": 1.7721, "step": 16053 }, { "epoch": 0.4228074795891493, "grad_norm": 1.7404292821884155, "learning_rate": 2.8872794311298397e-05, "loss": 1.5301, "step": 16054 }, { "epoch": 0.42283381617066107, "grad_norm": 2.5333354473114014, "learning_rate": 2.887147748222281e-05, "loss": 2.0883, "step": 16055 }, { "epoch": 
0.42286015275217276, "grad_norm": 1.995476484298706, "learning_rate": 2.887016065314722e-05, "loss": 0.4145, "step": 16056 }, { "epoch": 0.4228864893336845, "grad_norm": 2.1039559841156006, "learning_rate": 2.8868843824071634e-05, "loss": 1.8087, "step": 16057 }, { "epoch": 0.4229128259151962, "grad_norm": 2.312376022338867, "learning_rate": 2.8867526994996053e-05, "loss": 1.724, "step": 16058 }, { "epoch": 0.42293916249670793, "grad_norm": 3.3007993698120117, "learning_rate": 2.8866210165920465e-05, "loss": 1.0639, "step": 16059 }, { "epoch": 0.4229654990782196, "grad_norm": 2.103898763656616, "learning_rate": 2.8864893336844877e-05, "loss": 2.3941, "step": 16060 }, { "epoch": 0.42299183565973136, "grad_norm": 2.1146938800811768, "learning_rate": 2.886357650776929e-05, "loss": 2.715, "step": 16061 }, { "epoch": 0.4230181722412431, "grad_norm": 2.1182732582092285, "learning_rate": 2.886225967869371e-05, "loss": 2.145, "step": 16062 }, { "epoch": 0.4230445088227548, "grad_norm": 3.7761735916137695, "learning_rate": 2.886094284961812e-05, "loss": 2.6465, "step": 16063 }, { "epoch": 0.42307084540426654, "grad_norm": 8.249588012695312, "learning_rate": 2.8859626020542536e-05, "loss": 1.8051, "step": 16064 }, { "epoch": 0.4230971819857782, "grad_norm": 1.4332759380340576, "learning_rate": 2.885830919146695e-05, "loss": 1.9378, "step": 16065 }, { "epoch": 0.42312351856728997, "grad_norm": 2.812303304672241, "learning_rate": 2.885699236239136e-05, "loss": 1.3506, "step": 16066 }, { "epoch": 0.4231498551488017, "grad_norm": 2.758058547973633, "learning_rate": 2.885567553331578e-05, "loss": 1.8187, "step": 16067 }, { "epoch": 0.4231761917303134, "grad_norm": 1.809330701828003, "learning_rate": 2.8854358704240192e-05, "loss": 1.8237, "step": 16068 }, { "epoch": 0.42320252831182514, "grad_norm": 3.5669260025024414, "learning_rate": 2.8853041875164604e-05, "loss": 1.7585, "step": 16069 }, { "epoch": 0.42322886489333683, "grad_norm": 1.5892715454101562, "learning_rate": 
2.8851725046089016e-05, "loss": 1.8642, "step": 16070 }, { "epoch": 0.4232552014748486, "grad_norm": 2.5576107501983643, "learning_rate": 2.8850408217013432e-05, "loss": 1.5692, "step": 16071 }, { "epoch": 0.42328153805636026, "grad_norm": 2.669217824935913, "learning_rate": 2.8849091387937847e-05, "loss": 1.5435, "step": 16072 }, { "epoch": 0.423307874637872, "grad_norm": 1.5828750133514404, "learning_rate": 2.8847774558862263e-05, "loss": 2.1899, "step": 16073 }, { "epoch": 0.42333421121938375, "grad_norm": 2.548888921737671, "learning_rate": 2.8846457729786675e-05, "loss": 1.6104, "step": 16074 }, { "epoch": 0.42336054780089544, "grad_norm": 4.546311855316162, "learning_rate": 2.8845140900711087e-05, "loss": 1.282, "step": 16075 }, { "epoch": 0.4233868843824072, "grad_norm": 1.860329270362854, "learning_rate": 2.8843824071635506e-05, "loss": 1.7186, "step": 16076 }, { "epoch": 0.42341322096391887, "grad_norm": 1.7091431617736816, "learning_rate": 2.884250724255992e-05, "loss": 1.9196, "step": 16077 }, { "epoch": 0.4234395575454306, "grad_norm": 2.13338565826416, "learning_rate": 2.884119041348433e-05, "loss": 1.9921, "step": 16078 }, { "epoch": 0.4234658941269423, "grad_norm": 1.7698370218276978, "learning_rate": 2.8839873584408743e-05, "loss": 1.771, "step": 16079 }, { "epoch": 0.42349223070845404, "grad_norm": 2.5107030868530273, "learning_rate": 2.883855675533316e-05, "loss": 0.5269, "step": 16080 }, { "epoch": 0.4235185672899658, "grad_norm": 1.9949605464935303, "learning_rate": 2.8837239926257574e-05, "loss": 1.9259, "step": 16081 }, { "epoch": 0.4235449038714775, "grad_norm": 4.140947341918945, "learning_rate": 2.883592309718199e-05, "loss": 0.6839, "step": 16082 }, { "epoch": 0.4235712404529892, "grad_norm": 3.2024660110473633, "learning_rate": 2.8834606268106402e-05, "loss": 2.2879, "step": 16083 }, { "epoch": 0.4235975770345009, "grad_norm": 1.5304476022720337, "learning_rate": 2.8833289439030814e-05, "loss": 1.6538, "step": 16084 }, { "epoch": 
0.42362391361601265, "grad_norm": 1.7870818376541138, "learning_rate": 2.8831972609955226e-05, "loss": 1.9474, "step": 16085 }, { "epoch": 0.42365025019752434, "grad_norm": 2.1189584732055664, "learning_rate": 2.8830655780879645e-05, "loss": 1.1247, "step": 16086 }, { "epoch": 0.4236765867790361, "grad_norm": 1.9312340021133423, "learning_rate": 2.8829338951804058e-05, "loss": 1.953, "step": 16087 }, { "epoch": 0.4237029233605478, "grad_norm": 2.249635934829712, "learning_rate": 2.882802212272847e-05, "loss": 1.5643, "step": 16088 }, { "epoch": 0.4237292599420595, "grad_norm": 1.6765087842941284, "learning_rate": 2.8826705293652882e-05, "loss": 2.3425, "step": 16089 }, { "epoch": 0.42375559652357125, "grad_norm": 2.5360443592071533, "learning_rate": 2.88253884645773e-05, "loss": 1.6056, "step": 16090 }, { "epoch": 0.42378193310508294, "grad_norm": 3.798560619354248, "learning_rate": 2.8824071635501713e-05, "loss": 0.9149, "step": 16091 }, { "epoch": 0.4238082696865947, "grad_norm": 6.564652442932129, "learning_rate": 2.882275480642613e-05, "loss": 1.5192, "step": 16092 }, { "epoch": 0.4238346062681064, "grad_norm": 2.571826696395874, "learning_rate": 2.882143797735054e-05, "loss": 1.8611, "step": 16093 }, { "epoch": 0.4238609428496181, "grad_norm": 3.632727861404419, "learning_rate": 2.8820121148274953e-05, "loss": 0.7822, "step": 16094 }, { "epoch": 0.42388727943112986, "grad_norm": 2.8242435455322266, "learning_rate": 2.8818804319199372e-05, "loss": 1.3055, "step": 16095 }, { "epoch": 0.42391361601264155, "grad_norm": 2.136639356613159, "learning_rate": 2.8817487490123784e-05, "loss": 1.6273, "step": 16096 }, { "epoch": 0.4239399525941533, "grad_norm": 2.631714105606079, "learning_rate": 2.8816170661048197e-05, "loss": 2.1361, "step": 16097 }, { "epoch": 0.423966289175665, "grad_norm": 2.5165181159973145, "learning_rate": 2.881485383197261e-05, "loss": 1.7164, "step": 16098 }, { "epoch": 0.4239926257571767, "grad_norm": 3.3852922916412354, "learning_rate": 
2.8813537002897025e-05, "loss": 0.5745, "step": 16099 }, { "epoch": 0.42401896233868847, "grad_norm": 2.246051073074341, "learning_rate": 2.881222017382144e-05, "loss": 1.9521, "step": 16100 }, { "epoch": 0.42404529892020015, "grad_norm": 2.8720548152923584, "learning_rate": 2.8810903344745856e-05, "loss": 1.7878, "step": 16101 }, { "epoch": 0.4240716355017119, "grad_norm": 2.173828363418579, "learning_rate": 2.8809586515670268e-05, "loss": 1.4198, "step": 16102 }, { "epoch": 0.4240979720832236, "grad_norm": 3.1550397872924805, "learning_rate": 2.880826968659468e-05, "loss": 2.065, "step": 16103 }, { "epoch": 0.42412430866473533, "grad_norm": 3.2687010765075684, "learning_rate": 2.8806952857519092e-05, "loss": 1.5574, "step": 16104 }, { "epoch": 0.424150645246247, "grad_norm": 2.112921953201294, "learning_rate": 2.880563602844351e-05, "loss": 1.754, "step": 16105 }, { "epoch": 0.42417698182775876, "grad_norm": 1.4761923551559448, "learning_rate": 2.8804319199367924e-05, "loss": 1.6966, "step": 16106 }, { "epoch": 0.4242033184092705, "grad_norm": 2.3852739334106445, "learning_rate": 2.8803002370292336e-05, "loss": 1.8944, "step": 16107 }, { "epoch": 0.4242296549907822, "grad_norm": 3.7875638008117676, "learning_rate": 2.880168554121675e-05, "loss": 1.1141, "step": 16108 }, { "epoch": 0.42425599157229393, "grad_norm": 3.1541240215301514, "learning_rate": 2.8800368712141167e-05, "loss": 0.2556, "step": 16109 }, { "epoch": 0.4242823281538056, "grad_norm": 2.062598705291748, "learning_rate": 2.8799051883065583e-05, "loss": 2.0313, "step": 16110 }, { "epoch": 0.42430866473531736, "grad_norm": 1.8286455869674683, "learning_rate": 2.8797735053989995e-05, "loss": 2.028, "step": 16111 }, { "epoch": 0.42433500131682905, "grad_norm": 2.5597963333129883, "learning_rate": 2.8796418224914407e-05, "loss": 1.7595, "step": 16112 }, { "epoch": 0.4243613378983408, "grad_norm": 6.226208209991455, "learning_rate": 2.879510139583882e-05, "loss": 1.8319, "step": 16113 }, { "epoch": 
0.42438767447985254, "grad_norm": 1.7025924921035767, "learning_rate": 2.8793784566763238e-05, "loss": 1.6761, "step": 16114 }, { "epoch": 0.4244140110613642, "grad_norm": 2.502087354660034, "learning_rate": 2.879246773768765e-05, "loss": 1.4152, "step": 16115 }, { "epoch": 0.42444034764287597, "grad_norm": 2.1164026260375977, "learning_rate": 2.8791150908612063e-05, "loss": 2.0156, "step": 16116 }, { "epoch": 0.42446668422438766, "grad_norm": 4.253668308258057, "learning_rate": 2.8789834079536475e-05, "loss": 1.4962, "step": 16117 }, { "epoch": 0.4244930208058994, "grad_norm": 1.780917763710022, "learning_rate": 2.878851725046089e-05, "loss": 1.6653, "step": 16118 }, { "epoch": 0.4245193573874111, "grad_norm": 1.9676209688186646, "learning_rate": 2.8787200421385306e-05, "loss": 1.3094, "step": 16119 }, { "epoch": 0.42454569396892283, "grad_norm": 1.6165271997451782, "learning_rate": 2.878588359230972e-05, "loss": 1.7085, "step": 16120 }, { "epoch": 0.4245720305504346, "grad_norm": 1.3759636878967285, "learning_rate": 2.8784566763234134e-05, "loss": 1.5248, "step": 16121 }, { "epoch": 0.42459836713194626, "grad_norm": 2.0079309940338135, "learning_rate": 2.8783249934158546e-05, "loss": 1.3013, "step": 16122 }, { "epoch": 0.424624703713458, "grad_norm": 2.1813623905181885, "learning_rate": 2.8781933105082965e-05, "loss": 1.5161, "step": 16123 }, { "epoch": 0.4246510402949697, "grad_norm": 2.741903066635132, "learning_rate": 2.8780616276007377e-05, "loss": 1.6275, "step": 16124 }, { "epoch": 0.42467737687648144, "grad_norm": 2.2224152088165283, "learning_rate": 2.877929944693179e-05, "loss": 0.9563, "step": 16125 }, { "epoch": 0.4247037134579931, "grad_norm": 1.5816574096679688, "learning_rate": 2.87779826178562e-05, "loss": 1.986, "step": 16126 }, { "epoch": 0.42473005003950487, "grad_norm": 2.1497292518615723, "learning_rate": 2.8776665788780617e-05, "loss": 2.0907, "step": 16127 }, { "epoch": 0.4247563866210166, "grad_norm": 3.007984161376953, "learning_rate": 
2.8775348959705033e-05, "loss": 0.763, "step": 16128 }, { "epoch": 0.4247827232025283, "grad_norm": 1.6721152067184448, "learning_rate": 2.877403213062945e-05, "loss": 1.8409, "step": 16129 }, { "epoch": 0.42480905978404004, "grad_norm": 2.0486481189727783, "learning_rate": 2.877271530155386e-05, "loss": 1.5911, "step": 16130 }, { "epoch": 0.42483539636555173, "grad_norm": 2.5473532676696777, "learning_rate": 2.8771398472478273e-05, "loss": 0.8705, "step": 16131 }, { "epoch": 0.4248617329470635, "grad_norm": 1.962107539176941, "learning_rate": 2.8770081643402685e-05, "loss": 1.304, "step": 16132 }, { "epoch": 0.4248880695285752, "grad_norm": 5.7192230224609375, "learning_rate": 2.8768764814327104e-05, "loss": 1.2819, "step": 16133 }, { "epoch": 0.4249144061100869, "grad_norm": 1.6997631788253784, "learning_rate": 2.8767447985251516e-05, "loss": 2.1681, "step": 16134 }, { "epoch": 0.42494074269159865, "grad_norm": 2.3568761348724365, "learning_rate": 2.876613115617593e-05, "loss": 1.8898, "step": 16135 }, { "epoch": 0.42496707927311034, "grad_norm": 2.2935540676116943, "learning_rate": 2.876481432710034e-05, "loss": 1.224, "step": 16136 }, { "epoch": 0.4249934158546221, "grad_norm": 2.655606746673584, "learning_rate": 2.8763497498024756e-05, "loss": 1.0649, "step": 16137 }, { "epoch": 0.42501975243613377, "grad_norm": 2.6828486919403076, "learning_rate": 2.8762180668949172e-05, "loss": 1.9586, "step": 16138 }, { "epoch": 0.4250460890176455, "grad_norm": 3.726508617401123, "learning_rate": 2.8760863839873587e-05, "loss": 1.5692, "step": 16139 }, { "epoch": 0.42507242559915726, "grad_norm": 7.708461284637451, "learning_rate": 2.8759547010798e-05, "loss": 1.3258, "step": 16140 }, { "epoch": 0.42509876218066894, "grad_norm": 4.94729471206665, "learning_rate": 2.8758230181722412e-05, "loss": 2.9704, "step": 16141 }, { "epoch": 0.4251250987621807, "grad_norm": 2.347440004348755, "learning_rate": 2.875691335264683e-05, "loss": 2.0847, "step": 16142 }, { "epoch": 
0.4251514353436924, "grad_norm": 1.7017192840576172, "learning_rate": 2.8755596523571243e-05, "loss": 1.9728, "step": 16143 }, { "epoch": 0.4251777719252041, "grad_norm": 2.722093105316162, "learning_rate": 2.8754279694495655e-05, "loss": 0.7793, "step": 16144 }, { "epoch": 0.4252041085067158, "grad_norm": 1.8978090286254883, "learning_rate": 2.8752962865420067e-05, "loss": 1.8681, "step": 16145 }, { "epoch": 0.42523044508822755, "grad_norm": 2.12430477142334, "learning_rate": 2.8751646036344483e-05, "loss": 2.6236, "step": 16146 }, { "epoch": 0.4252567816697393, "grad_norm": 4.18602180480957, "learning_rate": 2.87503292072689e-05, "loss": 1.6183, "step": 16147 }, { "epoch": 0.425283118251251, "grad_norm": 1.5522241592407227, "learning_rate": 2.8749012378193314e-05, "loss": 1.9192, "step": 16148 }, { "epoch": 0.4253094548327627, "grad_norm": 2.20501971244812, "learning_rate": 2.8747695549117726e-05, "loss": 1.7048, "step": 16149 }, { "epoch": 0.4253357914142744, "grad_norm": 1.509581208229065, "learning_rate": 2.874637872004214e-05, "loss": 1.4333, "step": 16150 }, { "epoch": 0.42536212799578615, "grad_norm": 1.7848845720291138, "learning_rate": 2.874506189096655e-05, "loss": 1.7186, "step": 16151 }, { "epoch": 0.42538846457729784, "grad_norm": 2.1628212928771973, "learning_rate": 2.874374506189097e-05, "loss": 1.5007, "step": 16152 }, { "epoch": 0.4254148011588096, "grad_norm": 2.2218024730682373, "learning_rate": 2.8742428232815382e-05, "loss": 1.3011, "step": 16153 }, { "epoch": 0.42544113774032133, "grad_norm": 2.8692786693573, "learning_rate": 2.8741111403739794e-05, "loss": 1.2333, "step": 16154 }, { "epoch": 0.425467474321833, "grad_norm": 2.673438310623169, "learning_rate": 2.873979457466421e-05, "loss": 2.1072, "step": 16155 }, { "epoch": 0.42549381090334476, "grad_norm": 2.060781240463257, "learning_rate": 2.8738477745588625e-05, "loss": 1.981, "step": 16156 }, { "epoch": 0.42552014748485645, "grad_norm": 1.7278990745544434, "learning_rate": 
2.873716091651304e-05, "loss": 1.6209, "step": 16157 }, { "epoch": 0.4255464840663682, "grad_norm": 2.8339948654174805, "learning_rate": 2.8735844087437453e-05, "loss": 1.2938, "step": 16158 }, { "epoch": 0.4255728206478799, "grad_norm": 2.2948098182678223, "learning_rate": 2.8734527258361866e-05, "loss": 1.3713, "step": 16159 }, { "epoch": 0.4255991572293916, "grad_norm": 1.9484632015228271, "learning_rate": 2.8733210429286278e-05, "loss": 1.2785, "step": 16160 }, { "epoch": 0.42562549381090337, "grad_norm": 3.2406296730041504, "learning_rate": 2.8731893600210697e-05, "loss": 0.6005, "step": 16161 }, { "epoch": 0.42565183039241505, "grad_norm": 2.459770917892456, "learning_rate": 2.873057677113511e-05, "loss": 0.7126, "step": 16162 }, { "epoch": 0.4256781669739268, "grad_norm": 2.5703790187835693, "learning_rate": 2.872925994205952e-05, "loss": 1.6406, "step": 16163 }, { "epoch": 0.4257045035554385, "grad_norm": 2.0524210929870605, "learning_rate": 2.8727943112983933e-05, "loss": 1.4072, "step": 16164 }, { "epoch": 0.42573084013695023, "grad_norm": 2.111962080001831, "learning_rate": 2.872662628390835e-05, "loss": 1.9734, "step": 16165 }, { "epoch": 0.4257571767184619, "grad_norm": 1.6880728006362915, "learning_rate": 2.8725309454832765e-05, "loss": 1.4628, "step": 16166 }, { "epoch": 0.42578351329997366, "grad_norm": 2.5431389808654785, "learning_rate": 2.872399262575718e-05, "loss": 1.1938, "step": 16167 }, { "epoch": 0.4258098498814854, "grad_norm": 2.4238691329956055, "learning_rate": 2.8722675796681592e-05, "loss": 2.3023, "step": 16168 }, { "epoch": 0.4258361864629971, "grad_norm": 6.673384666442871, "learning_rate": 2.8721358967606005e-05, "loss": 1.9657, "step": 16169 }, { "epoch": 0.42586252304450883, "grad_norm": 2.0809438228607178, "learning_rate": 2.8720042138530417e-05, "loss": 1.079, "step": 16170 }, { "epoch": 0.4258888596260205, "grad_norm": 1.7094929218292236, "learning_rate": 2.8718725309454836e-05, "loss": 1.9744, "step": 16171 }, { "epoch": 
0.42591519620753227, "grad_norm": 2.3218657970428467, "learning_rate": 2.8717408480379248e-05, "loss": 1.4451, "step": 16172 }, { "epoch": 0.425941532789044, "grad_norm": 1.6189780235290527, "learning_rate": 2.871609165130366e-05, "loss": 1.8154, "step": 16173 }, { "epoch": 0.4259678693705557, "grad_norm": 3.1208300590515137, "learning_rate": 2.8714774822228076e-05, "loss": 1.5869, "step": 16174 }, { "epoch": 0.42599420595206744, "grad_norm": 1.9011083841323853, "learning_rate": 2.871345799315249e-05, "loss": 1.6565, "step": 16175 }, { "epoch": 0.4260205425335791, "grad_norm": 2.1710498332977295, "learning_rate": 2.8712141164076907e-05, "loss": 1.8116, "step": 16176 }, { "epoch": 0.42604687911509087, "grad_norm": 1.9681960344314575, "learning_rate": 2.871082433500132e-05, "loss": 0.6007, "step": 16177 }, { "epoch": 0.42607321569660256, "grad_norm": 2.6918060779571533, "learning_rate": 2.870950750592573e-05, "loss": 1.838, "step": 16178 }, { "epoch": 0.4260995522781143, "grad_norm": 2.6637935638427734, "learning_rate": 2.8708190676850144e-05, "loss": 2.1202, "step": 16179 }, { "epoch": 0.42612588885962605, "grad_norm": 2.059605836868286, "learning_rate": 2.8706873847774563e-05, "loss": 2.0905, "step": 16180 }, { "epoch": 0.42615222544113773, "grad_norm": 5.3449859619140625, "learning_rate": 2.8705557018698975e-05, "loss": 0.9404, "step": 16181 }, { "epoch": 0.4261785620226495, "grad_norm": 1.579784631729126, "learning_rate": 2.8704240189623387e-05, "loss": 1.3057, "step": 16182 }, { "epoch": 0.42620489860416116, "grad_norm": 1.8917019367218018, "learning_rate": 2.8702923360547803e-05, "loss": 1.8748, "step": 16183 }, { "epoch": 0.4262312351856729, "grad_norm": 2.118757486343384, "learning_rate": 2.8701606531472215e-05, "loss": 1.6654, "step": 16184 }, { "epoch": 0.4262575717671846, "grad_norm": 4.138052940368652, "learning_rate": 2.8700289702396634e-05, "loss": 1.891, "step": 16185 }, { "epoch": 0.42628390834869634, "grad_norm": 1.875789999961853, "learning_rate": 
2.8698972873321046e-05, "loss": 1.3612, "step": 16186 }, { "epoch": 0.4263102449302081, "grad_norm": 2.8380467891693115, "learning_rate": 2.8697656044245458e-05, "loss": 1.7744, "step": 16187 }, { "epoch": 0.42633658151171977, "grad_norm": 2.5055181980133057, "learning_rate": 2.869633921516987e-05, "loss": 1.8268, "step": 16188 }, { "epoch": 0.4263629180932315, "grad_norm": 3.3114044666290283, "learning_rate": 2.869502238609429e-05, "loss": 2.2882, "step": 16189 }, { "epoch": 0.4263892546747432, "grad_norm": 4.40805196762085, "learning_rate": 2.86937055570187e-05, "loss": 1.8025, "step": 16190 }, { "epoch": 0.42641559125625494, "grad_norm": 1.9764550924301147, "learning_rate": 2.8692388727943114e-05, "loss": 1.0319, "step": 16191 }, { "epoch": 0.42644192783776663, "grad_norm": 1.6714836359024048, "learning_rate": 2.8691071898867526e-05, "loss": 2.482, "step": 16192 }, { "epoch": 0.4264682644192784, "grad_norm": 1.6736153364181519, "learning_rate": 2.868975506979194e-05, "loss": 1.6923, "step": 16193 }, { "epoch": 0.4264946010007901, "grad_norm": 1.7568933963775635, "learning_rate": 2.8688438240716357e-05, "loss": 2.3367, "step": 16194 }, { "epoch": 0.4265209375823018, "grad_norm": 2.0135488510131836, "learning_rate": 2.8687121411640773e-05, "loss": 0.7544, "step": 16195 }, { "epoch": 0.42654727416381355, "grad_norm": 1.9826570749282837, "learning_rate": 2.8685804582565185e-05, "loss": 1.8779, "step": 16196 }, { "epoch": 0.42657361074532524, "grad_norm": 2.129650354385376, "learning_rate": 2.8684487753489597e-05, "loss": 2.0082, "step": 16197 }, { "epoch": 0.426599947326837, "grad_norm": 2.671468496322632, "learning_rate": 2.868317092441401e-05, "loss": 0.7993, "step": 16198 }, { "epoch": 0.42662628390834867, "grad_norm": 3.3096554279327393, "learning_rate": 2.868185409533843e-05, "loss": 1.3271, "step": 16199 }, { "epoch": 0.4266526204898604, "grad_norm": 2.8333301544189453, "learning_rate": 2.868053726626284e-05, "loss": 1.8865, "step": 16200 }, { "epoch": 
0.42667895707137216, "grad_norm": 4.446846008300781, "learning_rate": 2.8679220437187253e-05, "loss": 1.9822, "step": 16201 }, { "epoch": 0.42670529365288384, "grad_norm": 3.2590596675872803, "learning_rate": 2.867790360811167e-05, "loss": 1.6059, "step": 16202 }, { "epoch": 0.4267316302343956, "grad_norm": 2.5601813793182373, "learning_rate": 2.867658677903608e-05, "loss": 1.9725, "step": 16203 }, { "epoch": 0.4267579668159073, "grad_norm": 3.2188899517059326, "learning_rate": 2.86752699499605e-05, "loss": 2.2611, "step": 16204 }, { "epoch": 0.426784303397419, "grad_norm": 2.8070755004882812, "learning_rate": 2.8673953120884912e-05, "loss": 1.3266, "step": 16205 }, { "epoch": 0.42681063997893076, "grad_norm": 1.637706995010376, "learning_rate": 2.8672636291809324e-05, "loss": 2.1, "step": 16206 }, { "epoch": 0.42683697656044245, "grad_norm": 1.4981625080108643, "learning_rate": 2.8671319462733736e-05, "loss": 1.7029, "step": 16207 }, { "epoch": 0.4268633131419542, "grad_norm": 2.7051784992218018, "learning_rate": 2.8670002633658155e-05, "loss": 0.7684, "step": 16208 }, { "epoch": 0.4268896497234659, "grad_norm": 1.3664602041244507, "learning_rate": 2.8668685804582567e-05, "loss": 1.6947, "step": 16209 }, { "epoch": 0.4269159863049776, "grad_norm": 1.5344047546386719, "learning_rate": 2.866736897550698e-05, "loss": 1.6018, "step": 16210 }, { "epoch": 0.4269423228864893, "grad_norm": 2.9111382961273193, "learning_rate": 2.8666052146431395e-05, "loss": 0.7497, "step": 16211 }, { "epoch": 0.42696865946800105, "grad_norm": 2.486198663711548, "learning_rate": 2.8664735317355807e-05, "loss": 1.1718, "step": 16212 }, { "epoch": 0.4269949960495128, "grad_norm": 1.9173650741577148, "learning_rate": 2.8663418488280226e-05, "loss": 3.0041, "step": 16213 }, { "epoch": 0.4270213326310245, "grad_norm": 1.6175276041030884, "learning_rate": 2.866210165920464e-05, "loss": 1.1924, "step": 16214 }, { "epoch": 0.42704766921253623, "grad_norm": 1.9312342405319214, "learning_rate": 
2.866078483012905e-05, "loss": 2.1435, "step": 16215 }, { "epoch": 0.4270740057940479, "grad_norm": 3.4961044788360596, "learning_rate": 2.8659468001053463e-05, "loss": 1.2657, "step": 16216 }, { "epoch": 0.42710034237555966, "grad_norm": 1.938255786895752, "learning_rate": 2.8658151171977875e-05, "loss": 2.1474, "step": 16217 }, { "epoch": 0.42712667895707135, "grad_norm": 2.5104610919952393, "learning_rate": 2.8656834342902294e-05, "loss": 0.7439, "step": 16218 }, { "epoch": 0.4271530155385831, "grad_norm": 2.8039329051971436, "learning_rate": 2.8655517513826707e-05, "loss": 1.8992, "step": 16219 }, { "epoch": 0.42717935212009484, "grad_norm": 1.9874725341796875, "learning_rate": 2.865420068475112e-05, "loss": 1.7834, "step": 16220 }, { "epoch": 0.4272056887016065, "grad_norm": 2.7436165809631348, "learning_rate": 2.8652883855675534e-05, "loss": 1.8501, "step": 16221 }, { "epoch": 0.42723202528311827, "grad_norm": 1.694075584411621, "learning_rate": 2.865156702659995e-05, "loss": 1.9433, "step": 16222 }, { "epoch": 0.42725836186462995, "grad_norm": 2.357699155807495, "learning_rate": 2.8650250197524366e-05, "loss": 0.4571, "step": 16223 }, { "epoch": 0.4272846984461417, "grad_norm": 1.7154619693756104, "learning_rate": 2.8648933368448778e-05, "loss": 2.0802, "step": 16224 }, { "epoch": 0.4273110350276534, "grad_norm": 1.6425429582595825, "learning_rate": 2.864761653937319e-05, "loss": 2.1626, "step": 16225 }, { "epoch": 0.42733737160916513, "grad_norm": 1.6343300342559814, "learning_rate": 2.8646299710297602e-05, "loss": 1.759, "step": 16226 }, { "epoch": 0.42736370819067687, "grad_norm": 1.6940662860870361, "learning_rate": 2.864498288122202e-05, "loss": 1.5748, "step": 16227 }, { "epoch": 0.42739004477218856, "grad_norm": 5.884566783905029, "learning_rate": 2.8643666052146433e-05, "loss": 1.0705, "step": 16228 }, { "epoch": 0.4274163813537003, "grad_norm": 1.9199550151824951, "learning_rate": 2.8642349223070846e-05, "loss": 2.1897, "step": 16229 }, { "epoch": 
0.427442717935212, "grad_norm": 1.7519567012786865, "learning_rate": 2.864103239399526e-05, "loss": 2.28, "step": 16230 }, { "epoch": 0.42746905451672373, "grad_norm": 1.876975178718567, "learning_rate": 2.8639715564919673e-05, "loss": 1.6683, "step": 16231 }, { "epoch": 0.4274953910982354, "grad_norm": 3.0923686027526855, "learning_rate": 2.8638398735844092e-05, "loss": 0.9194, "step": 16232 }, { "epoch": 0.42752172767974717, "grad_norm": 2.1883909702301025, "learning_rate": 2.8637081906768505e-05, "loss": 0.9229, "step": 16233 }, { "epoch": 0.4275480642612589, "grad_norm": 1.7090818881988525, "learning_rate": 2.8635765077692917e-05, "loss": 1.8723, "step": 16234 }, { "epoch": 0.4275744008427706, "grad_norm": 3.919332981109619, "learning_rate": 2.863444824861733e-05, "loss": 1.8673, "step": 16235 }, { "epoch": 0.42760073742428234, "grad_norm": 3.2115514278411865, "learning_rate": 2.863313141954174e-05, "loss": 1.9871, "step": 16236 }, { "epoch": 0.42762707400579403, "grad_norm": 2.33235764503479, "learning_rate": 2.863181459046616e-05, "loss": 1.8323, "step": 16237 }, { "epoch": 0.42765341058730577, "grad_norm": 2.1285839080810547, "learning_rate": 2.8630497761390572e-05, "loss": 2.2213, "step": 16238 }, { "epoch": 0.4276797471688175, "grad_norm": 2.269050359725952, "learning_rate": 2.8629180932314985e-05, "loss": 1.4518, "step": 16239 }, { "epoch": 0.4277060837503292, "grad_norm": 2.024420738220215, "learning_rate": 2.86278641032394e-05, "loss": 2.097, "step": 16240 }, { "epoch": 0.42773242033184095, "grad_norm": 1.906901478767395, "learning_rate": 2.8626547274163816e-05, "loss": 2.1857, "step": 16241 }, { "epoch": 0.42775875691335263, "grad_norm": 2.3087074756622314, "learning_rate": 2.862523044508823e-05, "loss": 2.4593, "step": 16242 }, { "epoch": 0.4277850934948644, "grad_norm": 1.434691309928894, "learning_rate": 2.8623913616012644e-05, "loss": 1.7301, "step": 16243 }, { "epoch": 0.42781143007637606, "grad_norm": 1.7201263904571533, "learning_rate": 
2.8622596786937056e-05, "loss": 1.1348, "step": 16244 }, { "epoch": 0.4278377666578878, "grad_norm": 1.6074718236923218, "learning_rate": 2.8621279957861468e-05, "loss": 1.6182, "step": 16245 }, { "epoch": 0.42786410323939955, "grad_norm": 2.414060115814209, "learning_rate": 2.8619963128785887e-05, "loss": 1.4731, "step": 16246 }, { "epoch": 0.42789043982091124, "grad_norm": 3.246783971786499, "learning_rate": 2.86186462997103e-05, "loss": 2.5249, "step": 16247 }, { "epoch": 0.427916776402423, "grad_norm": 1.9376178979873657, "learning_rate": 2.861732947063471e-05, "loss": 1.985, "step": 16248 }, { "epoch": 0.42794311298393467, "grad_norm": 1.9497365951538086, "learning_rate": 2.8616012641559127e-05, "loss": 1.4396, "step": 16249 }, { "epoch": 0.4279694495654464, "grad_norm": 3.112078905105591, "learning_rate": 2.861469581248354e-05, "loss": 1.7332, "step": 16250 }, { "epoch": 0.4279957861469581, "grad_norm": 2.140838861465454, "learning_rate": 2.8613378983407958e-05, "loss": 0.6323, "step": 16251 }, { "epoch": 0.42802212272846984, "grad_norm": 2.209380626678467, "learning_rate": 2.861206215433237e-05, "loss": 1.6335, "step": 16252 }, { "epoch": 0.4280484593099816, "grad_norm": 1.6760517358779907, "learning_rate": 2.8610745325256783e-05, "loss": 1.8567, "step": 16253 }, { "epoch": 0.4280747958914933, "grad_norm": 1.9600154161453247, "learning_rate": 2.8609428496181195e-05, "loss": 2.5125, "step": 16254 }, { "epoch": 0.428101132473005, "grad_norm": 1.9491287469863892, "learning_rate": 2.8608111667105614e-05, "loss": 2.3887, "step": 16255 }, { "epoch": 0.4281274690545167, "grad_norm": 1.8916538953781128, "learning_rate": 2.8606794838030026e-05, "loss": 0.2084, "step": 16256 }, { "epoch": 0.42815380563602845, "grad_norm": 2.9186549186706543, "learning_rate": 2.8605478008954438e-05, "loss": 1.6804, "step": 16257 }, { "epoch": 0.42818014221754014, "grad_norm": 1.9597272872924805, "learning_rate": 2.8604161179878854e-05, "loss": 0.7496, "step": 16258 }, { "epoch": 
0.4282064787990519, "grad_norm": 3.079464912414551, "learning_rate": 2.8602844350803266e-05, "loss": 2.2019, "step": 16259 }, { "epoch": 0.4282328153805636, "grad_norm": 1.9543516635894775, "learning_rate": 2.8601527521727685e-05, "loss": 0.6934, "step": 16260 }, { "epoch": 0.4282591519620753, "grad_norm": 4.954905986785889, "learning_rate": 2.8600210692652097e-05, "loss": 1.1708, "step": 16261 }, { "epoch": 0.42828548854358706, "grad_norm": 3.6004514694213867, "learning_rate": 2.859889386357651e-05, "loss": 1.4056, "step": 16262 }, { "epoch": 0.42831182512509874, "grad_norm": 1.6310805082321167, "learning_rate": 2.859757703450092e-05, "loss": 1.6486, "step": 16263 }, { "epoch": 0.4283381617066105, "grad_norm": 2.038309097290039, "learning_rate": 2.8596260205425334e-05, "loss": 1.5376, "step": 16264 }, { "epoch": 0.4283644982881222, "grad_norm": 2.912075996398926, "learning_rate": 2.8594943376349753e-05, "loss": 2.2676, "step": 16265 }, { "epoch": 0.4283908348696339, "grad_norm": 2.8357462882995605, "learning_rate": 2.8593626547274165e-05, "loss": 1.5262, "step": 16266 }, { "epoch": 0.42841717145114566, "grad_norm": 2.675537586212158, "learning_rate": 2.8592309718198577e-05, "loss": 1.7726, "step": 16267 }, { "epoch": 0.42844350803265735, "grad_norm": 3.3788669109344482, "learning_rate": 2.8590992889122993e-05, "loss": 1.441, "step": 16268 }, { "epoch": 0.4284698446141691, "grad_norm": 2.9190473556518555, "learning_rate": 2.8589676060047405e-05, "loss": 0.4958, "step": 16269 }, { "epoch": 0.4284961811956808, "grad_norm": 5.56238317489624, "learning_rate": 2.8588359230971824e-05, "loss": 1.7465, "step": 16270 }, { "epoch": 0.4285225177771925, "grad_norm": 1.975429654121399, "learning_rate": 2.8587042401896236e-05, "loss": 1.1192, "step": 16271 }, { "epoch": 0.42854885435870427, "grad_norm": 1.9163085222244263, "learning_rate": 2.858572557282065e-05, "loss": 1.9938, "step": 16272 }, { "epoch": 0.42857519094021596, "grad_norm": 2.503218173980713, "learning_rate": 
2.858440874374506e-05, "loss": 1.2884, "step": 16273 }, { "epoch": 0.4286015275217277, "grad_norm": 3.577244520187378, "learning_rate": 2.858309191466948e-05, "loss": 1.0316, "step": 16274 }, { "epoch": 0.4286278641032394, "grad_norm": 3.658782720565796, "learning_rate": 2.8581775085593892e-05, "loss": 1.1198, "step": 16275 }, { "epoch": 0.42865420068475113, "grad_norm": 2.451082468032837, "learning_rate": 2.8580458256518304e-05, "loss": 1.4398, "step": 16276 }, { "epoch": 0.4286805372662628, "grad_norm": 1.8286141157150269, "learning_rate": 2.857914142744272e-05, "loss": 2.6038, "step": 16277 }, { "epoch": 0.42870687384777456, "grad_norm": 2.7148830890655518, "learning_rate": 2.8577824598367132e-05, "loss": 1.314, "step": 16278 }, { "epoch": 0.4287332104292863, "grad_norm": 1.568585753440857, "learning_rate": 2.857650776929155e-05, "loss": 2.0191, "step": 16279 }, { "epoch": 0.428759547010798, "grad_norm": 2.098501443862915, "learning_rate": 2.8575190940215963e-05, "loss": 1.6439, "step": 16280 }, { "epoch": 0.42878588359230974, "grad_norm": 1.7789452075958252, "learning_rate": 2.8573874111140375e-05, "loss": 2.2273, "step": 16281 }, { "epoch": 0.4288122201738214, "grad_norm": 2.408709764480591, "learning_rate": 2.8572557282064788e-05, "loss": 1.5854, "step": 16282 }, { "epoch": 0.42883855675533317, "grad_norm": 1.5255910158157349, "learning_rate": 2.85712404529892e-05, "loss": 1.9672, "step": 16283 }, { "epoch": 0.42886489333684485, "grad_norm": 3.492717742919922, "learning_rate": 2.856992362391362e-05, "loss": 1.2016, "step": 16284 }, { "epoch": 0.4288912299183566, "grad_norm": 3.4919309616088867, "learning_rate": 2.856860679483803e-05, "loss": 1.942, "step": 16285 }, { "epoch": 0.42891756649986834, "grad_norm": 2.008453369140625, "learning_rate": 2.8567289965762447e-05, "loss": 1.498, "step": 16286 }, { "epoch": 0.42894390308138003, "grad_norm": 1.629468321800232, "learning_rate": 2.856597313668686e-05, "loss": 2.1533, "step": 16287 }, { "epoch": 
0.4289702396628918, "grad_norm": 2.5563361644744873, "learning_rate": 2.8564656307611278e-05, "loss": 1.2936, "step": 16288 }, { "epoch": 0.42899657624440346, "grad_norm": 1.9235506057739258, "learning_rate": 2.856333947853569e-05, "loss": 1.0663, "step": 16289 }, { "epoch": 0.4290229128259152, "grad_norm": 1.5925219058990479, "learning_rate": 2.8562022649460102e-05, "loss": 1.638, "step": 16290 }, { "epoch": 0.4290492494074269, "grad_norm": 3.5170552730560303, "learning_rate": 2.8560705820384514e-05, "loss": 1.3771, "step": 16291 }, { "epoch": 0.42907558598893863, "grad_norm": 1.7583720684051514, "learning_rate": 2.8559388991308927e-05, "loss": 1.9183, "step": 16292 }, { "epoch": 0.4291019225704504, "grad_norm": 1.329180359840393, "learning_rate": 2.8558072162233346e-05, "loss": 0.4357, "step": 16293 }, { "epoch": 0.42912825915196207, "grad_norm": 4.115122318267822, "learning_rate": 2.8556755333157758e-05, "loss": 1.6922, "step": 16294 }, { "epoch": 0.4291545957334738, "grad_norm": 1.391040325164795, "learning_rate": 2.855543850408217e-05, "loss": 1.5295, "step": 16295 }, { "epoch": 0.4291809323149855, "grad_norm": 3.9990217685699463, "learning_rate": 2.8554121675006586e-05, "loss": 1.4757, "step": 16296 }, { "epoch": 0.42920726889649724, "grad_norm": 1.9993547201156616, "learning_rate": 2.8552804845930998e-05, "loss": 2.0338, "step": 16297 }, { "epoch": 0.42923360547800893, "grad_norm": 2.128110885620117, "learning_rate": 2.8551488016855417e-05, "loss": 2.2539, "step": 16298 }, { "epoch": 0.42925994205952067, "grad_norm": 3.921907424926758, "learning_rate": 2.855017118777983e-05, "loss": 1.1863, "step": 16299 }, { "epoch": 0.4292862786410324, "grad_norm": 1.785980224609375, "learning_rate": 2.854885435870424e-05, "loss": 1.802, "step": 16300 }, { "epoch": 0.4293126152225441, "grad_norm": 3.599180221557617, "learning_rate": 2.8547537529628653e-05, "loss": 2.0091, "step": 16301 }, { "epoch": 0.42933895180405585, "grad_norm": 2.1746342182159424, "learning_rate": 
2.8546220700553066e-05, "loss": 1.7942, "step": 16302 }, { "epoch": 0.42936528838556753, "grad_norm": 2.8276827335357666, "learning_rate": 2.8544903871477485e-05, "loss": 0.3247, "step": 16303 }, { "epoch": 0.4293916249670793, "grad_norm": 4.218794345855713, "learning_rate": 2.8543587042401897e-05, "loss": 1.0507, "step": 16304 }, { "epoch": 0.429417961548591, "grad_norm": 1.9216302633285522, "learning_rate": 2.8542270213326312e-05, "loss": 2.2101, "step": 16305 }, { "epoch": 0.4294442981301027, "grad_norm": 2.8261210918426514, "learning_rate": 2.8540953384250725e-05, "loss": 1.2542, "step": 16306 }, { "epoch": 0.42947063471161445, "grad_norm": 2.011619806289673, "learning_rate": 2.8539636555175144e-05, "loss": 1.7692, "step": 16307 }, { "epoch": 0.42949697129312614, "grad_norm": 3.0594727993011475, "learning_rate": 2.8538319726099556e-05, "loss": 0.8944, "step": 16308 }, { "epoch": 0.4295233078746379, "grad_norm": 1.8649332523345947, "learning_rate": 2.8537002897023968e-05, "loss": 1.2833, "step": 16309 }, { "epoch": 0.42954964445614957, "grad_norm": 1.578713297843933, "learning_rate": 2.853568606794838e-05, "loss": 1.7837, "step": 16310 }, { "epoch": 0.4295759810376613, "grad_norm": 3.6511008739471436, "learning_rate": 2.8534369238872792e-05, "loss": 2.4896, "step": 16311 }, { "epoch": 0.42960231761917306, "grad_norm": 1.9747178554534912, "learning_rate": 2.853305240979721e-05, "loss": 1.8094, "step": 16312 }, { "epoch": 0.42962865420068475, "grad_norm": 2.046444892883301, "learning_rate": 2.8531735580721624e-05, "loss": 1.6579, "step": 16313 }, { "epoch": 0.4296549907821965, "grad_norm": 4.64616060256958, "learning_rate": 2.8530418751646036e-05, "loss": 1.9714, "step": 16314 }, { "epoch": 0.4296813273637082, "grad_norm": 4.983663558959961, "learning_rate": 2.852910192257045e-05, "loss": 1.3401, "step": 16315 }, { "epoch": 0.4297076639452199, "grad_norm": 2.8350348472595215, "learning_rate": 2.8527785093494864e-05, "loss": 2.3301, "step": 16316 }, { "epoch": 
0.4297340005267316, "grad_norm": 1.832379937171936, "learning_rate": 2.8526468264419283e-05, "loss": 1.6732, "step": 16317 }, { "epoch": 0.42976033710824335, "grad_norm": 1.5315078496932983, "learning_rate": 2.8525151435343695e-05, "loss": 2.3453, "step": 16318 }, { "epoch": 0.4297866736897551, "grad_norm": 2.1844592094421387, "learning_rate": 2.8523834606268107e-05, "loss": 1.7826, "step": 16319 }, { "epoch": 0.4298130102712668, "grad_norm": 2.5775365829467773, "learning_rate": 2.852251777719252e-05, "loss": 1.9782, "step": 16320 }, { "epoch": 0.4298393468527785, "grad_norm": 1.9965500831604004, "learning_rate": 2.8521200948116938e-05, "loss": 1.702, "step": 16321 }, { "epoch": 0.4298656834342902, "grad_norm": 1.993498682975769, "learning_rate": 2.851988411904135e-05, "loss": 1.7707, "step": 16322 }, { "epoch": 0.42989202001580196, "grad_norm": 2.2596898078918457, "learning_rate": 2.8518567289965763e-05, "loss": 0.5715, "step": 16323 }, { "epoch": 0.42991835659731364, "grad_norm": 3.460712432861328, "learning_rate": 2.8517250460890178e-05, "loss": 1.5325, "step": 16324 }, { "epoch": 0.4299446931788254, "grad_norm": 3.9280924797058105, "learning_rate": 2.851593363181459e-05, "loss": 1.4971, "step": 16325 }, { "epoch": 0.42997102976033713, "grad_norm": 1.9667539596557617, "learning_rate": 2.851461680273901e-05, "loss": 1.1646, "step": 16326 }, { "epoch": 0.4299973663418488, "grad_norm": 3.4098823070526123, "learning_rate": 2.851329997366342e-05, "loss": 2.3861, "step": 16327 }, { "epoch": 0.43002370292336056, "grad_norm": 2.2132582664489746, "learning_rate": 2.8511983144587834e-05, "loss": 1.357, "step": 16328 }, { "epoch": 0.43005003950487225, "grad_norm": 2.117368459701538, "learning_rate": 2.8510666315512246e-05, "loss": 0.4702, "step": 16329 }, { "epoch": 0.430076376086384, "grad_norm": 1.898853063583374, "learning_rate": 2.8509349486436658e-05, "loss": 1.3479, "step": 16330 }, { "epoch": 0.4301027126678957, "grad_norm": 2.343618869781494, "learning_rate": 
2.8508032657361077e-05, "loss": 1.9506, "step": 16331 }, { "epoch": 0.4301290492494074, "grad_norm": 1.6748192310333252, "learning_rate": 2.850671582828549e-05, "loss": 1.7221, "step": 16332 }, { "epoch": 0.43015538583091917, "grad_norm": 2.4770283699035645, "learning_rate": 2.8505398999209905e-05, "loss": 0.3854, "step": 16333 }, { "epoch": 0.43018172241243086, "grad_norm": 2.526390790939331, "learning_rate": 2.8504082170134317e-05, "loss": 2.1985, "step": 16334 }, { "epoch": 0.4302080589939426, "grad_norm": 1.5955396890640259, "learning_rate": 2.850276534105873e-05, "loss": 2.0176, "step": 16335 }, { "epoch": 0.4302343955754543, "grad_norm": 2.1602351665496826, "learning_rate": 2.850144851198315e-05, "loss": 2.0718, "step": 16336 }, { "epoch": 0.43026073215696603, "grad_norm": 2.4446284770965576, "learning_rate": 2.850013168290756e-05, "loss": 2.1879, "step": 16337 }, { "epoch": 0.4302870687384777, "grad_norm": 1.8843352794647217, "learning_rate": 2.8498814853831973e-05, "loss": 1.5665, "step": 16338 }, { "epoch": 0.43031340531998946, "grad_norm": 2.402031660079956, "learning_rate": 2.8497498024756385e-05, "loss": 1.3319, "step": 16339 }, { "epoch": 0.4303397419015012, "grad_norm": 1.7032818794250488, "learning_rate": 2.8496181195680804e-05, "loss": 1.4115, "step": 16340 }, { "epoch": 0.4303660784830129, "grad_norm": 3.1331326961517334, "learning_rate": 2.8494864366605216e-05, "loss": 1.5468, "step": 16341 }, { "epoch": 0.43039241506452464, "grad_norm": 1.6129095554351807, "learning_rate": 2.849354753752963e-05, "loss": 1.495, "step": 16342 }, { "epoch": 0.4304187516460363, "grad_norm": 2.6606736183166504, "learning_rate": 2.8492230708454044e-05, "loss": 1.5746, "step": 16343 }, { "epoch": 0.43044508822754807, "grad_norm": 4.29287576675415, "learning_rate": 2.8490913879378456e-05, "loss": 0.8516, "step": 16344 }, { "epoch": 0.4304714248090598, "grad_norm": 1.9414594173431396, "learning_rate": 2.8489597050302875e-05, "loss": 2.2639, "step": 16345 }, { "epoch": 
0.4304977613905715, "grad_norm": 2.6296045780181885, "learning_rate": 2.8488280221227288e-05, "loss": 1.6935, "step": 16346 }, { "epoch": 0.43052409797208324, "grad_norm": 4.665785312652588, "learning_rate": 2.84869633921517e-05, "loss": 1.1521, "step": 16347 }, { "epoch": 0.43055043455359493, "grad_norm": 5.824530124664307, "learning_rate": 2.8485646563076112e-05, "loss": 1.3369, "step": 16348 }, { "epoch": 0.4305767711351067, "grad_norm": 3.34356689453125, "learning_rate": 2.8484329734000524e-05, "loss": 1.7588, "step": 16349 }, { "epoch": 0.43060310771661836, "grad_norm": 1.9068779945373535, "learning_rate": 2.8483012904924943e-05, "loss": 1.8227, "step": 16350 }, { "epoch": 0.4306294442981301, "grad_norm": 3.3791561126708984, "learning_rate": 2.8481696075849355e-05, "loss": 1.8706, "step": 16351 }, { "epoch": 0.43065578087964185, "grad_norm": 2.809380531311035, "learning_rate": 2.848037924677377e-05, "loss": 1.5818, "step": 16352 }, { "epoch": 0.43068211746115354, "grad_norm": 2.157747983932495, "learning_rate": 2.8479062417698183e-05, "loss": 1.8067, "step": 16353 }, { "epoch": 0.4307084540426653, "grad_norm": 1.7366297245025635, "learning_rate": 2.8477745588622602e-05, "loss": 1.6633, "step": 16354 }, { "epoch": 0.43073479062417697, "grad_norm": 2.163754463195801, "learning_rate": 2.8476428759547014e-05, "loss": 1.707, "step": 16355 }, { "epoch": 0.4307611272056887, "grad_norm": 2.410123109817505, "learning_rate": 2.8475111930471427e-05, "loss": 1.8286, "step": 16356 }, { "epoch": 0.4307874637872004, "grad_norm": 1.6513198614120483, "learning_rate": 2.847379510139584e-05, "loss": 1.8406, "step": 16357 }, { "epoch": 0.43081380036871214, "grad_norm": 3.8365464210510254, "learning_rate": 2.847247827232025e-05, "loss": 1.2741, "step": 16358 }, { "epoch": 0.4308401369502239, "grad_norm": 4.509221076965332, "learning_rate": 2.847116144324467e-05, "loss": 1.6658, "step": 16359 }, { "epoch": 0.43086647353173557, "grad_norm": 1.5654850006103516, "learning_rate": 
2.8469844614169082e-05, "loss": 2.6995, "step": 16360 }, { "epoch": 0.4308928101132473, "grad_norm": 2.0247111320495605, "learning_rate": 2.8468527785093498e-05, "loss": 2.5882, "step": 16361 }, { "epoch": 0.430919146694759, "grad_norm": 3.857208251953125, "learning_rate": 2.846721095601791e-05, "loss": 1.3429, "step": 16362 }, { "epoch": 0.43094548327627075, "grad_norm": 2.45232892036438, "learning_rate": 2.8465894126942322e-05, "loss": 1.9087, "step": 16363 }, { "epoch": 0.43097181985778243, "grad_norm": 1.613037347793579, "learning_rate": 2.846457729786674e-05, "loss": 1.7584, "step": 16364 }, { "epoch": 0.4309981564392942, "grad_norm": 2.3972525596618652, "learning_rate": 2.8463260468791153e-05, "loss": 1.4356, "step": 16365 }, { "epoch": 0.4310244930208059, "grad_norm": 1.428754448890686, "learning_rate": 2.8461943639715566e-05, "loss": 0.2875, "step": 16366 }, { "epoch": 0.4310508296023176, "grad_norm": 2.2029333114624023, "learning_rate": 2.8460626810639978e-05, "loss": 1.7077, "step": 16367 }, { "epoch": 0.43107716618382935, "grad_norm": 2.456862449645996, "learning_rate": 2.845930998156439e-05, "loss": 1.5504, "step": 16368 }, { "epoch": 0.43110350276534104, "grad_norm": 2.1207661628723145, "learning_rate": 2.845799315248881e-05, "loss": 2.138, "step": 16369 }, { "epoch": 0.4311298393468528, "grad_norm": 2.8177073001861572, "learning_rate": 2.845667632341322e-05, "loss": 2.0563, "step": 16370 }, { "epoch": 0.43115617592836447, "grad_norm": 1.7018980979919434, "learning_rate": 2.8455359494337637e-05, "loss": 1.4509, "step": 16371 }, { "epoch": 0.4311825125098762, "grad_norm": 5.539059162139893, "learning_rate": 2.845404266526205e-05, "loss": 1.1372, "step": 16372 }, { "epoch": 0.43120884909138796, "grad_norm": 3.2257654666900635, "learning_rate": 2.8452725836186468e-05, "loss": 2.1277, "step": 16373 }, { "epoch": 0.43123518567289965, "grad_norm": 3.392864942550659, "learning_rate": 2.845140900711088e-05, "loss": 1.1504, "step": 16374 }, { "epoch": 
0.4312615222544114, "grad_norm": 1.6590944528579712, "learning_rate": 2.8450092178035292e-05, "loss": 1.9739, "step": 16375 }, { "epoch": 0.4312878588359231, "grad_norm": 2.2991340160369873, "learning_rate": 2.8448775348959705e-05, "loss": 1.8108, "step": 16376 }, { "epoch": 0.4313141954174348, "grad_norm": 1.9851282835006714, "learning_rate": 2.8447458519884117e-05, "loss": 1.8127, "step": 16377 }, { "epoch": 0.43134053199894656, "grad_norm": 1.7383449077606201, "learning_rate": 2.8446141690808536e-05, "loss": 2.2359, "step": 16378 }, { "epoch": 0.43136686858045825, "grad_norm": 2.4224584102630615, "learning_rate": 2.8444824861732948e-05, "loss": 0.7735, "step": 16379 }, { "epoch": 0.43139320516197, "grad_norm": 3.781935930252075, "learning_rate": 2.8443508032657364e-05, "loss": 1.8866, "step": 16380 }, { "epoch": 0.4314195417434817, "grad_norm": 1.9859447479248047, "learning_rate": 2.8442191203581776e-05, "loss": 1.9327, "step": 16381 }, { "epoch": 0.4314458783249934, "grad_norm": 2.18391752243042, "learning_rate": 2.8440874374506188e-05, "loss": 1.5739, "step": 16382 }, { "epoch": 0.4314722149065051, "grad_norm": 1.9801342487335205, "learning_rate": 2.8439557545430607e-05, "loss": 1.3973, "step": 16383 }, { "epoch": 0.43149855148801686, "grad_norm": 2.1786768436431885, "learning_rate": 2.843824071635502e-05, "loss": 1.7656, "step": 16384 }, { "epoch": 0.4315248880695286, "grad_norm": 3.137179374694824, "learning_rate": 2.843692388727943e-05, "loss": 2.1335, "step": 16385 }, { "epoch": 0.4315512246510403, "grad_norm": 1.8687926530838013, "learning_rate": 2.8435607058203844e-05, "loss": 2.5486, "step": 16386 }, { "epoch": 0.43157756123255203, "grad_norm": 3.301006317138672, "learning_rate": 2.8434290229128263e-05, "loss": 0.3844, "step": 16387 }, { "epoch": 0.4316038978140637, "grad_norm": 4.079667568206787, "learning_rate": 2.8432973400052675e-05, "loss": 0.9807, "step": 16388 }, { "epoch": 0.43163023439557546, "grad_norm": 3.8437492847442627, "learning_rate": 
2.843165657097709e-05, "loss": 1.8664, "step": 16389 }, { "epoch": 0.43165657097708715, "grad_norm": 1.8211383819580078, "learning_rate": 2.8430339741901503e-05, "loss": 2.09, "step": 16390 }, { "epoch": 0.4316829075585989, "grad_norm": 2.4085426330566406, "learning_rate": 2.8429022912825915e-05, "loss": 1.765, "step": 16391 }, { "epoch": 0.43170924414011064, "grad_norm": 1.6934255361557007, "learning_rate": 2.8427706083750334e-05, "loss": 2.0942, "step": 16392 }, { "epoch": 0.4317355807216223, "grad_norm": 1.5776584148406982, "learning_rate": 2.8426389254674746e-05, "loss": 1.9435, "step": 16393 }, { "epoch": 0.43176191730313407, "grad_norm": 2.0898146629333496, "learning_rate": 2.8425072425599158e-05, "loss": 1.8544, "step": 16394 }, { "epoch": 0.43178825388464576, "grad_norm": 2.8114964962005615, "learning_rate": 2.842375559652357e-05, "loss": 1.2003, "step": 16395 }, { "epoch": 0.4318145904661575, "grad_norm": 2.9184961318969727, "learning_rate": 2.8422438767447983e-05, "loss": 2.6132, "step": 16396 }, { "epoch": 0.4318409270476692, "grad_norm": 2.2989752292633057, "learning_rate": 2.84211219383724e-05, "loss": 2.1767, "step": 16397 }, { "epoch": 0.43186726362918093, "grad_norm": 2.08762788772583, "learning_rate": 2.8419805109296814e-05, "loss": 1.0648, "step": 16398 }, { "epoch": 0.4318936002106927, "grad_norm": 2.394007682800293, "learning_rate": 2.841848828022123e-05, "loss": 1.9494, "step": 16399 }, { "epoch": 0.43191993679220436, "grad_norm": 2.7329792976379395, "learning_rate": 2.8417171451145642e-05, "loss": 1.4813, "step": 16400 }, { "epoch": 0.4319462733737161, "grad_norm": 1.8522226810455322, "learning_rate": 2.841585462207006e-05, "loss": 1.7778, "step": 16401 }, { "epoch": 0.4319726099552278, "grad_norm": 2.5576720237731934, "learning_rate": 2.8414537792994473e-05, "loss": 1.631, "step": 16402 }, { "epoch": 0.43199894653673954, "grad_norm": 3.952995777130127, "learning_rate": 2.8413220963918885e-05, "loss": 0.7376, "step": 16403 }, { "epoch": 
0.4320252831182512, "grad_norm": 1.7281336784362793, "learning_rate": 2.8411904134843297e-05, "loss": 1.6091, "step": 16404 }, { "epoch": 0.43205161969976297, "grad_norm": 2.3904290199279785, "learning_rate": 2.841058730576771e-05, "loss": 1.9631, "step": 16405 }, { "epoch": 0.4320779562812747, "grad_norm": 3.5860753059387207, "learning_rate": 2.840927047669213e-05, "loss": 2.4206, "step": 16406 }, { "epoch": 0.4321042928627864, "grad_norm": 4.733084201812744, "learning_rate": 2.840795364761654e-05, "loss": 1.1107, "step": 16407 }, { "epoch": 0.43213062944429814, "grad_norm": 1.4516595602035522, "learning_rate": 2.8406636818540956e-05, "loss": 1.8459, "step": 16408 }, { "epoch": 0.43215696602580983, "grad_norm": 4.789068698883057, "learning_rate": 2.840531998946537e-05, "loss": 1.3535, "step": 16409 }, { "epoch": 0.4321833026073216, "grad_norm": 1.5293134450912476, "learning_rate": 2.840400316038978e-05, "loss": 1.8594, "step": 16410 }, { "epoch": 0.4322096391888333, "grad_norm": 2.584707736968994, "learning_rate": 2.84026863313142e-05, "loss": 1.5222, "step": 16411 }, { "epoch": 0.432235975770345, "grad_norm": 1.5266469717025757, "learning_rate": 2.8401369502238612e-05, "loss": 2.3191, "step": 16412 }, { "epoch": 0.43226231235185675, "grad_norm": 1.7189316749572754, "learning_rate": 2.8400052673163024e-05, "loss": 1.7969, "step": 16413 }, { "epoch": 0.43228864893336844, "grad_norm": 1.8869389295578003, "learning_rate": 2.8398735844087436e-05, "loss": 2.1453, "step": 16414 }, { "epoch": 0.4323149855148802, "grad_norm": 1.6695712804794312, "learning_rate": 2.839741901501185e-05, "loss": 1.4628, "step": 16415 }, { "epoch": 0.43234132209639187, "grad_norm": 4.673247814178467, "learning_rate": 2.8396102185936268e-05, "loss": 0.482, "step": 16416 }, { "epoch": 0.4323676586779036, "grad_norm": 1.7116349935531616, "learning_rate": 2.839478535686068e-05, "loss": 2.653, "step": 16417 }, { "epoch": 0.43239399525941535, "grad_norm": 1.8586455583572388, "learning_rate": 
2.8393468527785095e-05, "loss": 1.7581, "step": 16418 }, { "epoch": 0.43242033184092704, "grad_norm": 6.648214817047119, "learning_rate": 2.8392151698709508e-05, "loss": 1.9338, "step": 16419 }, { "epoch": 0.4324466684224388, "grad_norm": 2.884911298751831, "learning_rate": 2.8390834869633927e-05, "loss": 1.7442, "step": 16420 }, { "epoch": 0.4324730050039505, "grad_norm": 3.1319708824157715, "learning_rate": 2.838951804055834e-05, "loss": 1.619, "step": 16421 }, { "epoch": 0.4324993415854622, "grad_norm": 1.5436334609985352, "learning_rate": 2.838820121148275e-05, "loss": 1.5905, "step": 16422 }, { "epoch": 0.4325256781669739, "grad_norm": 1.6361109018325806, "learning_rate": 2.8386884382407163e-05, "loss": 1.7514, "step": 16423 }, { "epoch": 0.43255201474848565, "grad_norm": 2.9640231132507324, "learning_rate": 2.8385567553331575e-05, "loss": 1.2714, "step": 16424 }, { "epoch": 0.4325783513299974, "grad_norm": 3.1678459644317627, "learning_rate": 2.8384250724255994e-05, "loss": 0.7944, "step": 16425 }, { "epoch": 0.4326046879115091, "grad_norm": 1.590768814086914, "learning_rate": 2.8382933895180407e-05, "loss": 1.4295, "step": 16426 }, { "epoch": 0.4326310244930208, "grad_norm": 1.7419683933258057, "learning_rate": 2.8381617066104822e-05, "loss": 1.554, "step": 16427 }, { "epoch": 0.4326573610745325, "grad_norm": 2.3588669300079346, "learning_rate": 2.8380300237029234e-05, "loss": 1.8619, "step": 16428 }, { "epoch": 0.43268369765604425, "grad_norm": 1.926154613494873, "learning_rate": 2.8378983407953647e-05, "loss": 1.2702, "step": 16429 }, { "epoch": 0.43271003423755594, "grad_norm": 4.687825679779053, "learning_rate": 2.8377666578878066e-05, "loss": 1.198, "step": 16430 }, { "epoch": 0.4327363708190677, "grad_norm": 1.9334768056869507, "learning_rate": 2.8376349749802478e-05, "loss": 2.1527, "step": 16431 }, { "epoch": 0.4327627074005794, "grad_norm": 1.7973761558532715, "learning_rate": 2.837503292072689e-05, "loss": 2.0785, "step": 16432 }, { "epoch": 
0.4327890439820911, "grad_norm": 1.5084431171417236, "learning_rate": 2.8373716091651302e-05, "loss": 1.7387, "step": 16433 }, { "epoch": 0.43281538056360286, "grad_norm": 2.261404037475586, "learning_rate": 2.837239926257572e-05, "loss": 0.9094, "step": 16434 }, { "epoch": 0.43284171714511455, "grad_norm": 3.752955436706543, "learning_rate": 2.8371082433500133e-05, "loss": 2.2964, "step": 16435 }, { "epoch": 0.4328680537266263, "grad_norm": 1.6899412870407104, "learning_rate": 2.836976560442455e-05, "loss": 1.3676, "step": 16436 }, { "epoch": 0.432894390308138, "grad_norm": 3.6785402297973633, "learning_rate": 2.836844877534896e-05, "loss": 1.9773, "step": 16437 }, { "epoch": 0.4329207268896497, "grad_norm": 1.5583354234695435, "learning_rate": 2.8367131946273373e-05, "loss": 1.2187, "step": 16438 }, { "epoch": 0.43294706347116146, "grad_norm": 1.9530384540557861, "learning_rate": 2.8365815117197792e-05, "loss": 1.8882, "step": 16439 }, { "epoch": 0.43297340005267315, "grad_norm": 4.859649181365967, "learning_rate": 2.8364498288122205e-05, "loss": 0.9564, "step": 16440 }, { "epoch": 0.4329997366341849, "grad_norm": 1.9613500833511353, "learning_rate": 2.8363181459046617e-05, "loss": 2.6533, "step": 16441 }, { "epoch": 0.4330260732156966, "grad_norm": 1.6729531288146973, "learning_rate": 2.836186462997103e-05, "loss": 1.797, "step": 16442 }, { "epoch": 0.4330524097972083, "grad_norm": 1.5913046598434448, "learning_rate": 2.836054780089544e-05, "loss": 1.7418, "step": 16443 }, { "epoch": 0.43307874637872007, "grad_norm": 3.0342214107513428, "learning_rate": 2.835923097181986e-05, "loss": 1.3378, "step": 16444 }, { "epoch": 0.43310508296023176, "grad_norm": 4.726546764373779, "learning_rate": 2.8357914142744272e-05, "loss": 1.392, "step": 16445 }, { "epoch": 0.4331314195417435, "grad_norm": 2.0671494007110596, "learning_rate": 2.8356597313668688e-05, "loss": 1.8711, "step": 16446 }, { "epoch": 0.4331577561232552, "grad_norm": 2.8993146419525146, "learning_rate": 
2.83552804845931e-05, "loss": 2.1863, "step": 16447 }, { "epoch": 0.43318409270476693, "grad_norm": 1.5807468891143799, "learning_rate": 2.8353963655517512e-05, "loss": 2.6426, "step": 16448 }, { "epoch": 0.4332104292862786, "grad_norm": 1.556812047958374, "learning_rate": 2.835264682644193e-05, "loss": 2.0733, "step": 16449 }, { "epoch": 0.43323676586779036, "grad_norm": 2.182522773742676, "learning_rate": 2.8351329997366344e-05, "loss": 2.089, "step": 16450 }, { "epoch": 0.4332631024493021, "grad_norm": 2.5087220668792725, "learning_rate": 2.8350013168290756e-05, "loss": 1.3287, "step": 16451 }, { "epoch": 0.4332894390308138, "grad_norm": 1.84579336643219, "learning_rate": 2.8348696339215168e-05, "loss": 1.7719, "step": 16452 }, { "epoch": 0.43331577561232554, "grad_norm": 1.531093716621399, "learning_rate": 2.8347379510139587e-05, "loss": 2.1779, "step": 16453 }, { "epoch": 0.4333421121938372, "grad_norm": 2.491909980773926, "learning_rate": 2.8346062681064e-05, "loss": 1.1051, "step": 16454 }, { "epoch": 0.43336844877534897, "grad_norm": 2.3045425415039062, "learning_rate": 2.8344745851988415e-05, "loss": 1.3491, "step": 16455 }, { "epoch": 0.43339478535686066, "grad_norm": 1.6769473552703857, "learning_rate": 2.8343429022912827e-05, "loss": 1.0622, "step": 16456 }, { "epoch": 0.4334211219383724, "grad_norm": 5.449374675750732, "learning_rate": 2.834211219383724e-05, "loss": 0.9839, "step": 16457 }, { "epoch": 0.43344745851988414, "grad_norm": 2.3553080558776855, "learning_rate": 2.8340795364761658e-05, "loss": 1.5497, "step": 16458 }, { "epoch": 0.43347379510139583, "grad_norm": 2.1973652839660645, "learning_rate": 2.833947853568607e-05, "loss": 1.4848, "step": 16459 }, { "epoch": 0.4335001316829076, "grad_norm": 1.6975960731506348, "learning_rate": 2.8338161706610483e-05, "loss": 0.3975, "step": 16460 }, { "epoch": 0.43352646826441926, "grad_norm": 2.5517799854278564, "learning_rate": 2.8336844877534895e-05, "loss": 1.1243, "step": 16461 }, { "epoch": 
0.433552804845931, "grad_norm": 2.489300012588501, "learning_rate": 2.833552804845931e-05, "loss": 1.3312, "step": 16462 }, { "epoch": 0.4335791414274427, "grad_norm": 3.0198323726654053, "learning_rate": 2.8334211219383726e-05, "loss": 1.0234, "step": 16463 }, { "epoch": 0.43360547800895444, "grad_norm": 3.0601694583892822, "learning_rate": 2.8332894390308142e-05, "loss": 1.3195, "step": 16464 }, { "epoch": 0.4336318145904662, "grad_norm": 1.7922035455703735, "learning_rate": 2.8331577561232554e-05, "loss": 1.2245, "step": 16465 }, { "epoch": 0.43365815117197787, "grad_norm": 3.8335766792297363, "learning_rate": 2.8330260732156966e-05, "loss": 1.001, "step": 16466 }, { "epoch": 0.4336844877534896, "grad_norm": 2.2135560512542725, "learning_rate": 2.8328943903081385e-05, "loss": 0.287, "step": 16467 }, { "epoch": 0.4337108243350013, "grad_norm": 1.8694093227386475, "learning_rate": 2.8327627074005797e-05, "loss": 1.7988, "step": 16468 }, { "epoch": 0.43373716091651304, "grad_norm": 1.8954025506973267, "learning_rate": 2.832631024493021e-05, "loss": 1.6016, "step": 16469 }, { "epoch": 0.43376349749802473, "grad_norm": 3.6818227767944336, "learning_rate": 2.8324993415854622e-05, "loss": 1.608, "step": 16470 }, { "epoch": 0.4337898340795365, "grad_norm": 3.071455240249634, "learning_rate": 2.8323676586779034e-05, "loss": 1.9356, "step": 16471 }, { "epoch": 0.4338161706610482, "grad_norm": 2.14438533782959, "learning_rate": 2.8322359757703453e-05, "loss": 2.3003, "step": 16472 }, { "epoch": 0.4338425072425599, "grad_norm": 6.42063045501709, "learning_rate": 2.8321042928627865e-05, "loss": 1.4109, "step": 16473 }, { "epoch": 0.43386884382407165, "grad_norm": 2.8539631366729736, "learning_rate": 2.831972609955228e-05, "loss": 1.4741, "step": 16474 }, { "epoch": 0.43389518040558334, "grad_norm": 2.316908359527588, "learning_rate": 2.8318409270476693e-05, "loss": 1.9613, "step": 16475 }, { "epoch": 0.4339215169870951, "grad_norm": 3.846804618835449, "learning_rate": 
2.8317092441401105e-05, "loss": 0.8859, "step": 16476 }, { "epoch": 0.43394785356860677, "grad_norm": 3.548475503921509, "learning_rate": 2.8315775612325524e-05, "loss": 1.6674, "step": 16477 }, { "epoch": 0.4339741901501185, "grad_norm": 3.4965856075286865, "learning_rate": 2.8314458783249936e-05, "loss": 0.9479, "step": 16478 }, { "epoch": 0.43400052673163025, "grad_norm": 3.1194825172424316, "learning_rate": 2.831314195417435e-05, "loss": 1.5284, "step": 16479 }, { "epoch": 0.43402686331314194, "grad_norm": 1.6658433675765991, "learning_rate": 2.831182512509876e-05, "loss": 1.9135, "step": 16480 }, { "epoch": 0.4340531998946537, "grad_norm": 2.425234317779541, "learning_rate": 2.8310508296023176e-05, "loss": 1.8135, "step": 16481 }, { "epoch": 0.4340795364761654, "grad_norm": 1.751568078994751, "learning_rate": 2.8309191466947592e-05, "loss": 1.6672, "step": 16482 }, { "epoch": 0.4341058730576771, "grad_norm": 1.8724992275238037, "learning_rate": 2.8307874637872008e-05, "loss": 1.8876, "step": 16483 }, { "epoch": 0.43413220963918886, "grad_norm": 2.105757236480713, "learning_rate": 2.830655780879642e-05, "loss": 1.536, "step": 16484 }, { "epoch": 0.43415854622070055, "grad_norm": 2.004755735397339, "learning_rate": 2.8305240979720832e-05, "loss": 1.9267, "step": 16485 }, { "epoch": 0.4341848828022123, "grad_norm": 1.4966754913330078, "learning_rate": 2.830392415064525e-05, "loss": 1.4707, "step": 16486 }, { "epoch": 0.434211219383724, "grad_norm": 2.4996042251586914, "learning_rate": 2.8302607321569663e-05, "loss": 1.3735, "step": 16487 }, { "epoch": 0.4342375559652357, "grad_norm": 2.1456947326660156, "learning_rate": 2.8301290492494075e-05, "loss": 1.7783, "step": 16488 }, { "epoch": 0.4342638925467474, "grad_norm": 1.7817798852920532, "learning_rate": 2.8299973663418488e-05, "loss": 1.7208, "step": 16489 }, { "epoch": 0.43429022912825915, "grad_norm": 1.697743535041809, "learning_rate": 2.82986568343429e-05, "loss": 1.6933, "step": 16490 }, { "epoch": 
0.4343165657097709, "grad_norm": 4.356972694396973, "learning_rate": 2.829734000526732e-05, "loss": 1.722, "step": 16491 }, { "epoch": 0.4343429022912826, "grad_norm": 2.211265802383423, "learning_rate": 2.8296023176191734e-05, "loss": 1.6289, "step": 16492 }, { "epoch": 0.4343692388727943, "grad_norm": 2.077662706375122, "learning_rate": 2.8294706347116147e-05, "loss": 1.8589, "step": 16493 }, { "epoch": 0.434395575454306, "grad_norm": 1.6653528213500977, "learning_rate": 2.829338951804056e-05, "loss": 1.6447, "step": 16494 }, { "epoch": 0.43442191203581776, "grad_norm": 2.913416862487793, "learning_rate": 2.829207268896497e-05, "loss": 1.32, "step": 16495 }, { "epoch": 0.43444824861732945, "grad_norm": 2.471480369567871, "learning_rate": 2.829075585988939e-05, "loss": 2.0304, "step": 16496 }, { "epoch": 0.4344745851988412, "grad_norm": 1.7987360954284668, "learning_rate": 2.8289439030813802e-05, "loss": 2.0854, "step": 16497 }, { "epoch": 0.43450092178035293, "grad_norm": 1.8381974697113037, "learning_rate": 2.8288122201738214e-05, "loss": 1.6529, "step": 16498 }, { "epoch": 0.4345272583618646, "grad_norm": 1.5506705045700073, "learning_rate": 2.8286805372662627e-05, "loss": 1.3963, "step": 16499 }, { "epoch": 0.43455359494337636, "grad_norm": 2.0843544006347656, "learning_rate": 2.8285488543587046e-05, "loss": 1.1241, "step": 16500 }, { "epoch": 0.43457993152488805, "grad_norm": 2.3446781635284424, "learning_rate": 2.8284171714511458e-05, "loss": 1.8986, "step": 16501 }, { "epoch": 0.4346062681063998, "grad_norm": 4.117410659790039, "learning_rate": 2.8282854885435873e-05, "loss": 1.0084, "step": 16502 }, { "epoch": 0.4346326046879115, "grad_norm": 4.413883686065674, "learning_rate": 2.8281538056360286e-05, "loss": 0.7021, "step": 16503 }, { "epoch": 0.4346589412694232, "grad_norm": 1.922536015510559, "learning_rate": 2.8280221227284698e-05, "loss": 2.0783, "step": 16504 }, { "epoch": 0.43468527785093497, "grad_norm": 1.968959927558899, "learning_rate": 
2.8278904398209117e-05, "loss": 1.6969, "step": 16505 }, { "epoch": 0.43471161443244666, "grad_norm": 2.5623135566711426, "learning_rate": 2.827758756913353e-05, "loss": 1.3383, "step": 16506 }, { "epoch": 0.4347379510139584, "grad_norm": 2.795820951461792, "learning_rate": 2.827627074005794e-05, "loss": 1.6475, "step": 16507 }, { "epoch": 0.4347642875954701, "grad_norm": 2.042825698852539, "learning_rate": 2.8274953910982353e-05, "loss": 2.4634, "step": 16508 }, { "epoch": 0.43479062417698183, "grad_norm": 2.9703381061553955, "learning_rate": 2.827363708190677e-05, "loss": 1.4563, "step": 16509 }, { "epoch": 0.4348169607584935, "grad_norm": 1.5627120733261108, "learning_rate": 2.8272320252831185e-05, "loss": 0.5622, "step": 16510 }, { "epoch": 0.43484329734000526, "grad_norm": 2.4037153720855713, "learning_rate": 2.82710034237556e-05, "loss": 1.7923, "step": 16511 }, { "epoch": 0.434869633921517, "grad_norm": 3.218648672103882, "learning_rate": 2.8269686594680012e-05, "loss": 0.837, "step": 16512 }, { "epoch": 0.4348959705030287, "grad_norm": 2.02287220954895, "learning_rate": 2.8268369765604425e-05, "loss": 1.9907, "step": 16513 }, { "epoch": 0.43492230708454044, "grad_norm": 1.8718514442443848, "learning_rate": 2.8267052936528837e-05, "loss": 1.9464, "step": 16514 }, { "epoch": 0.4349486436660521, "grad_norm": 1.9429402351379395, "learning_rate": 2.8265736107453256e-05, "loss": 1.8846, "step": 16515 }, { "epoch": 0.43497498024756387, "grad_norm": 1.9777008295059204, "learning_rate": 2.8264419278377668e-05, "loss": 1.9679, "step": 16516 }, { "epoch": 0.4350013168290756, "grad_norm": 2.007272958755493, "learning_rate": 2.826310244930208e-05, "loss": 2.0907, "step": 16517 }, { "epoch": 0.4350276534105873, "grad_norm": 2.2078537940979004, "learning_rate": 2.8261785620226492e-05, "loss": 2.3834, "step": 16518 }, { "epoch": 0.43505398999209904, "grad_norm": 4.378069877624512, "learning_rate": 2.826046879115091e-05, "loss": 1.8814, "step": 16519 }, { "epoch": 
0.43508032657361073, "grad_norm": 2.3533949851989746, "learning_rate": 2.8259151962075324e-05, "loss": 1.5736, "step": 16520 }, { "epoch": 0.4351066631551225, "grad_norm": 2.167644739151001, "learning_rate": 2.825783513299974e-05, "loss": 0.3618, "step": 16521 }, { "epoch": 0.43513299973663416, "grad_norm": 2.1816017627716064, "learning_rate": 2.825651830392415e-05, "loss": 1.7769, "step": 16522 }, { "epoch": 0.4351593363181459, "grad_norm": 1.5967211723327637, "learning_rate": 2.8255201474848564e-05, "loss": 0.2387, "step": 16523 }, { "epoch": 0.43518567289965765, "grad_norm": 1.6066944599151611, "learning_rate": 2.8253884645772983e-05, "loss": 2.1092, "step": 16524 }, { "epoch": 0.43521200948116934, "grad_norm": 2.3991541862487793, "learning_rate": 2.8252567816697395e-05, "loss": 0.4484, "step": 16525 }, { "epoch": 0.4352383460626811, "grad_norm": 1.8292269706726074, "learning_rate": 2.8251250987621807e-05, "loss": 1.7973, "step": 16526 }, { "epoch": 0.43526468264419277, "grad_norm": 1.5651960372924805, "learning_rate": 2.824993415854622e-05, "loss": 1.4939, "step": 16527 }, { "epoch": 0.4352910192257045, "grad_norm": 4.2878098487854, "learning_rate": 2.8248617329470635e-05, "loss": 0.4767, "step": 16528 }, { "epoch": 0.4353173558072162, "grad_norm": 2.1620655059814453, "learning_rate": 2.824730050039505e-05, "loss": 1.6215, "step": 16529 }, { "epoch": 0.43534369238872794, "grad_norm": 3.5705878734588623, "learning_rate": 2.8245983671319466e-05, "loss": 1.4668, "step": 16530 }, { "epoch": 0.4353700289702397, "grad_norm": 4.136373519897461, "learning_rate": 2.824466684224388e-05, "loss": 1.3312, "step": 16531 }, { "epoch": 0.4353963655517514, "grad_norm": 4.730039119720459, "learning_rate": 2.824335001316829e-05, "loss": 0.9081, "step": 16532 }, { "epoch": 0.4354227021332631, "grad_norm": 2.257317304611206, "learning_rate": 2.824203318409271e-05, "loss": 1.7244, "step": 16533 }, { "epoch": 0.4354490387147748, "grad_norm": 3.972649097442627, "learning_rate": 
2.8240716355017122e-05, "loss": 1.5251, "step": 16534 }, { "epoch": 0.43547537529628655, "grad_norm": 2.4832606315612793, "learning_rate": 2.8239399525941534e-05, "loss": 2.2719, "step": 16535 }, { "epoch": 0.43550171187779824, "grad_norm": 1.695595622062683, "learning_rate": 2.8238082696865946e-05, "loss": 1.4979, "step": 16536 }, { "epoch": 0.43552804845931, "grad_norm": 2.467546224594116, "learning_rate": 2.8236765867790362e-05, "loss": 1.3863, "step": 16537 }, { "epoch": 0.4355543850408217, "grad_norm": 1.8082926273345947, "learning_rate": 2.8235449038714777e-05, "loss": 0.4872, "step": 16538 }, { "epoch": 0.4355807216223334, "grad_norm": 2.463932991027832, "learning_rate": 2.8234132209639193e-05, "loss": 2.1371, "step": 16539 }, { "epoch": 0.43560705820384515, "grad_norm": 2.318068027496338, "learning_rate": 2.8232815380563605e-05, "loss": 0.4452, "step": 16540 }, { "epoch": 0.43563339478535684, "grad_norm": 1.7529547214508057, "learning_rate": 2.8231498551488017e-05, "loss": 1.4515, "step": 16541 }, { "epoch": 0.4356597313668686, "grad_norm": 3.1177496910095215, "learning_rate": 2.823018172241243e-05, "loss": 2.0095, "step": 16542 }, { "epoch": 0.4356860679483803, "grad_norm": 2.89117169380188, "learning_rate": 2.822886489333685e-05, "loss": 1.6061, "step": 16543 }, { "epoch": 0.435712404529892, "grad_norm": 3.2339375019073486, "learning_rate": 2.822754806426126e-05, "loss": 1.7907, "step": 16544 }, { "epoch": 0.43573874111140376, "grad_norm": 2.899411916732788, "learning_rate": 2.8226231235185673e-05, "loss": 1.3294, "step": 16545 }, { "epoch": 0.43576507769291545, "grad_norm": 1.6597256660461426, "learning_rate": 2.8224914406110085e-05, "loss": 1.0791, "step": 16546 }, { "epoch": 0.4357914142744272, "grad_norm": 5.410963535308838, "learning_rate": 2.82235975770345e-05, "loss": 0.7657, "step": 16547 }, { "epoch": 0.4358177508559389, "grad_norm": 1.9413758516311646, "learning_rate": 2.8222280747958916e-05, "loss": 2.031, "step": 16548 }, { "epoch": 
0.4358440874374506, "grad_norm": 1.704094648361206, "learning_rate": 2.8220963918883332e-05, "loss": 1.775, "step": 16549 }, { "epoch": 0.43587042401896237, "grad_norm": 1.5150890350341797, "learning_rate": 2.8219647089807744e-05, "loss": 1.7933, "step": 16550 }, { "epoch": 0.43589676060047405, "grad_norm": 2.559528350830078, "learning_rate": 2.8218330260732156e-05, "loss": 1.7279, "step": 16551 }, { "epoch": 0.4359230971819858, "grad_norm": 2.9103341102600098, "learning_rate": 2.8217013431656575e-05, "loss": 2.1717, "step": 16552 }, { "epoch": 0.4359494337634975, "grad_norm": 3.6804163455963135, "learning_rate": 2.8215696602580988e-05, "loss": 1.9862, "step": 16553 }, { "epoch": 0.43597577034500923, "grad_norm": 1.056111216545105, "learning_rate": 2.82143797735054e-05, "loss": 0.2084, "step": 16554 }, { "epoch": 0.4360021069265209, "grad_norm": 5.031483173370361, "learning_rate": 2.8213062944429812e-05, "loss": 1.4148, "step": 16555 }, { "epoch": 0.43602844350803266, "grad_norm": 1.9605884552001953, "learning_rate": 2.8211746115354228e-05, "loss": 1.6704, "step": 16556 }, { "epoch": 0.4360547800895444, "grad_norm": 1.915305733680725, "learning_rate": 2.8210429286278643e-05, "loss": 1.7103, "step": 16557 }, { "epoch": 0.4360811166710561, "grad_norm": 1.579426646232605, "learning_rate": 2.820911245720306e-05, "loss": 1.5001, "step": 16558 }, { "epoch": 0.43610745325256783, "grad_norm": 1.9474209547042847, "learning_rate": 2.820779562812747e-05, "loss": 2.1016, "step": 16559 }, { "epoch": 0.4361337898340795, "grad_norm": 1.8780663013458252, "learning_rate": 2.8206478799051883e-05, "loss": 1.8185, "step": 16560 }, { "epoch": 0.43616012641559126, "grad_norm": 2.063614845275879, "learning_rate": 2.8205161969976295e-05, "loss": 1.955, "step": 16561 }, { "epoch": 0.43618646299710295, "grad_norm": 2.269077777862549, "learning_rate": 2.8203845140900714e-05, "loss": 2.2423, "step": 16562 }, { "epoch": 0.4362127995786147, "grad_norm": 1.7708632946014404, "learning_rate": 
2.8202528311825127e-05, "loss": 1.8232, "step": 16563 }, { "epoch": 0.43623913616012644, "grad_norm": 1.8033145666122437, "learning_rate": 2.820121148274954e-05, "loss": 1.59, "step": 16564 }, { "epoch": 0.4362654727416381, "grad_norm": 4.104933738708496, "learning_rate": 2.8199894653673954e-05, "loss": 0.6668, "step": 16565 }, { "epoch": 0.43629180932314987, "grad_norm": 3.9449446201324463, "learning_rate": 2.819857782459837e-05, "loss": 1.6086, "step": 16566 }, { "epoch": 0.43631814590466156, "grad_norm": 2.564462184906006, "learning_rate": 2.8197260995522786e-05, "loss": 1.0605, "step": 16567 }, { "epoch": 0.4363444824861733, "grad_norm": 1.6224631071090698, "learning_rate": 2.8195944166447198e-05, "loss": 1.6911, "step": 16568 }, { "epoch": 0.436370819067685, "grad_norm": 3.5223100185394287, "learning_rate": 2.819462733737161e-05, "loss": 1.7488, "step": 16569 }, { "epoch": 0.43639715564919673, "grad_norm": 3.0731112957000732, "learning_rate": 2.8193310508296022e-05, "loss": 1.5548, "step": 16570 }, { "epoch": 0.4364234922307085, "grad_norm": 3.0118627548217773, "learning_rate": 2.819199367922044e-05, "loss": 1.6599, "step": 16571 }, { "epoch": 0.43644982881222016, "grad_norm": 2.1013436317443848, "learning_rate": 2.8190676850144853e-05, "loss": 2.0399, "step": 16572 }, { "epoch": 0.4364761653937319, "grad_norm": 1.8144354820251465, "learning_rate": 2.8189360021069266e-05, "loss": 1.8986, "step": 16573 }, { "epoch": 0.4365025019752436, "grad_norm": 2.1434435844421387, "learning_rate": 2.8188043191993678e-05, "loss": 1.9027, "step": 16574 }, { "epoch": 0.43652883855675534, "grad_norm": 3.2731213569641113, "learning_rate": 2.8186726362918093e-05, "loss": 0.3589, "step": 16575 }, { "epoch": 0.436555175138267, "grad_norm": 1.8018608093261719, "learning_rate": 2.818540953384251e-05, "loss": 2.1207, "step": 16576 }, { "epoch": 0.43658151171977877, "grad_norm": 2.136406183242798, "learning_rate": 2.8184092704766925e-05, "loss": 1.8915, "step": 16577 }, { "epoch": 
0.4366078483012905, "grad_norm": 1.9887404441833496, "learning_rate": 2.8182775875691337e-05, "loss": 2.1259, "step": 16578 }, { "epoch": 0.4366341848828022, "grad_norm": 3.9231603145599365, "learning_rate": 2.818145904661575e-05, "loss": 1.4021, "step": 16579 }, { "epoch": 0.43666052146431394, "grad_norm": 1.7362685203552246, "learning_rate": 2.818014221754016e-05, "loss": 0.2168, "step": 16580 }, { "epoch": 0.43668685804582563, "grad_norm": 1.8164308071136475, "learning_rate": 2.817882538846458e-05, "loss": 1.7385, "step": 16581 }, { "epoch": 0.4367131946273374, "grad_norm": 2.889979362487793, "learning_rate": 2.8177508559388992e-05, "loss": 2.1276, "step": 16582 }, { "epoch": 0.4367395312088491, "grad_norm": 1.8994073867797852, "learning_rate": 2.8176191730313405e-05, "loss": 1.5463, "step": 16583 }, { "epoch": 0.4367658677903608, "grad_norm": 2.7641348838806152, "learning_rate": 2.817487490123782e-05, "loss": 0.8215, "step": 16584 }, { "epoch": 0.43679220437187255, "grad_norm": 3.9808731079101562, "learning_rate": 2.8173558072162236e-05, "loss": 1.3807, "step": 16585 }, { "epoch": 0.43681854095338424, "grad_norm": 3.0710442066192627, "learning_rate": 2.817224124308665e-05, "loss": 0.8945, "step": 16586 }, { "epoch": 0.436844877534896, "grad_norm": 2.3296892642974854, "learning_rate": 2.8170924414011064e-05, "loss": 0.7532, "step": 16587 }, { "epoch": 0.43687121411640767, "grad_norm": 2.7271907329559326, "learning_rate": 2.8169607584935476e-05, "loss": 1.5595, "step": 16588 }, { "epoch": 0.4368975506979194, "grad_norm": 3.0335512161254883, "learning_rate": 2.8168290755859888e-05, "loss": 0.4242, "step": 16589 }, { "epoch": 0.43692388727943116, "grad_norm": 2.5800588130950928, "learning_rate": 2.8166973926784307e-05, "loss": 1.4019, "step": 16590 }, { "epoch": 0.43695022386094284, "grad_norm": 2.7184219360351562, "learning_rate": 2.816565709770872e-05, "loss": 1.6803, "step": 16591 }, { "epoch": 0.4369765604424546, "grad_norm": 2.4640495777130127, 
"learning_rate": 2.816434026863313e-05, "loss": 1.7125, "step": 16592 }, { "epoch": 0.4370028970239663, "grad_norm": 2.405949592590332, "learning_rate": 2.8163023439557544e-05, "loss": 1.9497, "step": 16593 }, { "epoch": 0.437029233605478, "grad_norm": 2.864454984664917, "learning_rate": 2.816170661048196e-05, "loss": 2.098, "step": 16594 }, { "epoch": 0.4370555701869897, "grad_norm": 3.776113271713257, "learning_rate": 2.8160389781406375e-05, "loss": 0.7326, "step": 16595 }, { "epoch": 0.43708190676850145, "grad_norm": 2.1270062923431396, "learning_rate": 2.815907295233079e-05, "loss": 1.8915, "step": 16596 }, { "epoch": 0.4371082433500132, "grad_norm": 2.74699330329895, "learning_rate": 2.8157756123255203e-05, "loss": 1.8578, "step": 16597 }, { "epoch": 0.4371345799315249, "grad_norm": 1.6508920192718506, "learning_rate": 2.8156439294179615e-05, "loss": 1.7382, "step": 16598 }, { "epoch": 0.4371609165130366, "grad_norm": 1.529170274734497, "learning_rate": 2.8155122465104034e-05, "loss": 1.9671, "step": 16599 }, { "epoch": 0.4371872530945483, "grad_norm": 1.6650422811508179, "learning_rate": 2.8153805636028446e-05, "loss": 1.6926, "step": 16600 }, { "epoch": 0.43721358967606005, "grad_norm": 2.509626626968384, "learning_rate": 2.815248880695286e-05, "loss": 1.7362, "step": 16601 }, { "epoch": 0.43723992625757174, "grad_norm": 1.6635472774505615, "learning_rate": 2.815117197787727e-05, "loss": 1.7993, "step": 16602 }, { "epoch": 0.4372662628390835, "grad_norm": 2.379213571548462, "learning_rate": 2.8149855148801686e-05, "loss": 1.6131, "step": 16603 }, { "epoch": 0.43729259942059523, "grad_norm": 3.236212968826294, "learning_rate": 2.8148538319726102e-05, "loss": 1.7876, "step": 16604 }, { "epoch": 0.4373189360021069, "grad_norm": 1.8496614694595337, "learning_rate": 2.8147221490650517e-05, "loss": 1.8803, "step": 16605 }, { "epoch": 0.43734527258361866, "grad_norm": 2.900906562805176, "learning_rate": 2.814590466157493e-05, "loss": 1.5396, "step": 16606 }, { 
"epoch": 0.43737160916513035, "grad_norm": 3.22273325920105, "learning_rate": 2.8144587832499342e-05, "loss": 0.3887, "step": 16607 }, { "epoch": 0.4373979457466421, "grad_norm": 1.628406286239624, "learning_rate": 2.8143271003423754e-05, "loss": 1.5892, "step": 16608 }, { "epoch": 0.4374242823281538, "grad_norm": 3.8181023597717285, "learning_rate": 2.8141954174348173e-05, "loss": 1.5064, "step": 16609 }, { "epoch": 0.4374506189096655, "grad_norm": 2.1755239963531494, "learning_rate": 2.8140637345272585e-05, "loss": 1.5169, "step": 16610 }, { "epoch": 0.43747695549117727, "grad_norm": 2.7795848846435547, "learning_rate": 2.8139320516196997e-05, "loss": 1.6898, "step": 16611 }, { "epoch": 0.43750329207268895, "grad_norm": 1.8404885530471802, "learning_rate": 2.8138003687121413e-05, "loss": 1.7695, "step": 16612 }, { "epoch": 0.4375296286542007, "grad_norm": 2.442927360534668, "learning_rate": 2.8136686858045825e-05, "loss": 1.8158, "step": 16613 }, { "epoch": 0.4375559652357124, "grad_norm": 2.6817586421966553, "learning_rate": 2.8135370028970244e-05, "loss": 1.2455, "step": 16614 }, { "epoch": 0.43758230181722413, "grad_norm": 2.126110553741455, "learning_rate": 2.8134053199894656e-05, "loss": 1.7892, "step": 16615 }, { "epoch": 0.4376086383987358, "grad_norm": 1.8690754175186157, "learning_rate": 2.813273637081907e-05, "loss": 1.7856, "step": 16616 }, { "epoch": 0.43763497498024756, "grad_norm": 1.87594735622406, "learning_rate": 2.813141954174348e-05, "loss": 1.724, "step": 16617 }, { "epoch": 0.4376613115617593, "grad_norm": 3.224095106124878, "learning_rate": 2.81301027126679e-05, "loss": 1.1844, "step": 16618 }, { "epoch": 0.437687648143271, "grad_norm": 2.0419390201568604, "learning_rate": 2.8128785883592312e-05, "loss": 1.6161, "step": 16619 }, { "epoch": 0.43771398472478273, "grad_norm": 1.8997228145599365, "learning_rate": 2.8127469054516724e-05, "loss": 1.7896, "step": 16620 }, { "epoch": 0.4377403213062944, "grad_norm": 2.3550984859466553, 
"learning_rate": 2.8126152225441136e-05, "loss": 2.0752, "step": 16621 }, { "epoch": 0.43776665788780617, "grad_norm": 1.678864598274231, "learning_rate": 2.8124835396365552e-05, "loss": 2.3086, "step": 16622 }, { "epoch": 0.4377929944693179, "grad_norm": 1.7242493629455566, "learning_rate": 2.8123518567289968e-05, "loss": 1.5141, "step": 16623 }, { "epoch": 0.4378193310508296, "grad_norm": 1.6488767862319946, "learning_rate": 2.8122201738214383e-05, "loss": 2.1585, "step": 16624 }, { "epoch": 0.43784566763234134, "grad_norm": 1.7621309757232666, "learning_rate": 2.8120884909138795e-05, "loss": 1.5271, "step": 16625 }, { "epoch": 0.437872004213853, "grad_norm": 1.4883825778961182, "learning_rate": 2.8119568080063208e-05, "loss": 1.9442, "step": 16626 }, { "epoch": 0.43789834079536477, "grad_norm": 3.7244820594787598, "learning_rate": 2.811825125098762e-05, "loss": 1.4252, "step": 16627 }, { "epoch": 0.43792467737687646, "grad_norm": 3.383676767349243, "learning_rate": 2.811693442191204e-05, "loss": 1.572, "step": 16628 }, { "epoch": 0.4379510139583882, "grad_norm": 2.4104089736938477, "learning_rate": 2.811561759283645e-05, "loss": 1.9732, "step": 16629 }, { "epoch": 0.43797735053989995, "grad_norm": 1.6271495819091797, "learning_rate": 2.8114300763760863e-05, "loss": 1.367, "step": 16630 }, { "epoch": 0.43800368712141163, "grad_norm": 1.797842025756836, "learning_rate": 2.811298393468528e-05, "loss": 2.0346, "step": 16631 }, { "epoch": 0.4380300237029234, "grad_norm": 3.5027220249176025, "learning_rate": 2.8111667105609694e-05, "loss": 1.8181, "step": 16632 }, { "epoch": 0.43805636028443506, "grad_norm": 2.5124311447143555, "learning_rate": 2.811035027653411e-05, "loss": 2.122, "step": 16633 }, { "epoch": 0.4380826968659468, "grad_norm": 4.779937744140625, "learning_rate": 2.8109033447458522e-05, "loss": 0.5993, "step": 16634 }, { "epoch": 0.4381090334474585, "grad_norm": 1.7733968496322632, "learning_rate": 2.8107716618382934e-05, "loss": 0.4342, "step": 16635 }, 
{ "epoch": 0.43813537002897024, "grad_norm": 3.3720638751983643, "learning_rate": 2.8106399789307347e-05, "loss": 2.2293, "step": 16636 }, { "epoch": 0.438161706610482, "grad_norm": 1.5262728929519653, "learning_rate": 2.8105082960231766e-05, "loss": 1.5734, "step": 16637 }, { "epoch": 0.43818804319199367, "grad_norm": 1.7679376602172852, "learning_rate": 2.8103766131156178e-05, "loss": 1.1915, "step": 16638 }, { "epoch": 0.4382143797735054, "grad_norm": 3.353997230529785, "learning_rate": 2.810244930208059e-05, "loss": 1.4492, "step": 16639 }, { "epoch": 0.4382407163550171, "grad_norm": 1.382959008216858, "learning_rate": 2.8101132473005006e-05, "loss": 0.346, "step": 16640 }, { "epoch": 0.43826705293652884, "grad_norm": 3.6660995483398438, "learning_rate": 2.8099815643929418e-05, "loss": 1.2867, "step": 16641 }, { "epoch": 0.43829338951804053, "grad_norm": 1.4647947549819946, "learning_rate": 2.8098498814853837e-05, "loss": 1.975, "step": 16642 }, { "epoch": 0.4383197260995523, "grad_norm": 1.6251085996627808, "learning_rate": 2.809718198577825e-05, "loss": 2.3856, "step": 16643 }, { "epoch": 0.438346062681064, "grad_norm": 1.5047577619552612, "learning_rate": 2.809586515670266e-05, "loss": 1.6583, "step": 16644 }, { "epoch": 0.4383723992625757, "grad_norm": 4.208713054656982, "learning_rate": 2.8094548327627074e-05, "loss": 2.1076, "step": 16645 }, { "epoch": 0.43839873584408745, "grad_norm": 2.881943702697754, "learning_rate": 2.8093231498551486e-05, "loss": 1.4483, "step": 16646 }, { "epoch": 0.43842507242559914, "grad_norm": 2.7489466667175293, "learning_rate": 2.8091914669475905e-05, "loss": 1.7601, "step": 16647 }, { "epoch": 0.4384514090071109, "grad_norm": 3.5693933963775635, "learning_rate": 2.8090597840400317e-05, "loss": 0.9019, "step": 16648 }, { "epoch": 0.43847774558862257, "grad_norm": 1.6769102811813354, "learning_rate": 2.808928101132473e-05, "loss": 1.7101, "step": 16649 }, { "epoch": 0.4385040821701343, "grad_norm": 5.1734161376953125, 
"learning_rate": 2.8087964182249145e-05, "loss": 1.3493, "step": 16650 }, { "epoch": 0.43853041875164606, "grad_norm": 4.826345920562744, "learning_rate": 2.808664735317356e-05, "loss": 1.1355, "step": 16651 }, { "epoch": 0.43855675533315774, "grad_norm": 2.712402582168579, "learning_rate": 2.8085330524097976e-05, "loss": 2.102, "step": 16652 }, { "epoch": 0.4385830919146695, "grad_norm": 6.370478630065918, "learning_rate": 2.8084013695022388e-05, "loss": 2.3326, "step": 16653 }, { "epoch": 0.4386094284961812, "grad_norm": 2.029810905456543, "learning_rate": 2.80826968659468e-05, "loss": 1.8469, "step": 16654 }, { "epoch": 0.4386357650776929, "grad_norm": 2.1381540298461914, "learning_rate": 2.8081380036871213e-05, "loss": 1.8223, "step": 16655 }, { "epoch": 0.43866210165920466, "grad_norm": 2.176945924758911, "learning_rate": 2.808006320779563e-05, "loss": 1.6776, "step": 16656 }, { "epoch": 0.43868843824071635, "grad_norm": 3.5617611408233643, "learning_rate": 2.8078746378720044e-05, "loss": 0.9677, "step": 16657 }, { "epoch": 0.4387147748222281, "grad_norm": 3.248983383178711, "learning_rate": 2.8077429549644456e-05, "loss": 1.4099, "step": 16658 }, { "epoch": 0.4387411114037398, "grad_norm": 1.565604329109192, "learning_rate": 2.807611272056887e-05, "loss": 1.5412, "step": 16659 }, { "epoch": 0.4387674479852515, "grad_norm": 1.7660753726959229, "learning_rate": 2.8074795891493284e-05, "loss": 1.741, "step": 16660 }, { "epoch": 0.4387937845667632, "grad_norm": 2.713979721069336, "learning_rate": 2.8073479062417703e-05, "loss": 0.8884, "step": 16661 }, { "epoch": 0.43882012114827496, "grad_norm": 3.372013807296753, "learning_rate": 2.8072162233342115e-05, "loss": 1.1915, "step": 16662 }, { "epoch": 0.4388464577297867, "grad_norm": 4.073408126831055, "learning_rate": 2.8070845404266527e-05, "loss": 1.0933, "step": 16663 }, { "epoch": 0.4388727943112984, "grad_norm": 2.7456178665161133, "learning_rate": 2.806952857519094e-05, "loss": 1.3222, "step": 16664 }, { 
"epoch": 0.43889913089281013, "grad_norm": 2.710808277130127, "learning_rate": 2.806821174611536e-05, "loss": 1.5726, "step": 16665 }, { "epoch": 0.4389254674743218, "grad_norm": 4.102010250091553, "learning_rate": 2.806689491703977e-05, "loss": 0.1941, "step": 16666 }, { "epoch": 0.43895180405583356, "grad_norm": 2.143749713897705, "learning_rate": 2.8065578087964183e-05, "loss": 1.9777, "step": 16667 }, { "epoch": 0.43897814063734525, "grad_norm": 3.6210286617279053, "learning_rate": 2.80642612588886e-05, "loss": 1.636, "step": 16668 }, { "epoch": 0.439004477218857, "grad_norm": 1.8862544298171997, "learning_rate": 2.806294442981301e-05, "loss": 1.5848, "step": 16669 }, { "epoch": 0.43903081380036874, "grad_norm": 2.8148388862609863, "learning_rate": 2.806162760073743e-05, "loss": 1.8438, "step": 16670 }, { "epoch": 0.4390571503818804, "grad_norm": 1.6165825128555298, "learning_rate": 2.8060310771661842e-05, "loss": 1.2156, "step": 16671 }, { "epoch": 0.43908348696339217, "grad_norm": 6.481845855712891, "learning_rate": 2.8058993942586254e-05, "loss": 2.7948, "step": 16672 }, { "epoch": 0.43910982354490385, "grad_norm": 2.490011215209961, "learning_rate": 2.8057677113510666e-05, "loss": 2.065, "step": 16673 }, { "epoch": 0.4391361601264156, "grad_norm": 3.7045841217041016, "learning_rate": 2.805636028443508e-05, "loss": 2.1497, "step": 16674 }, { "epoch": 0.4391624967079273, "grad_norm": 1.612959384918213, "learning_rate": 2.8055043455359497e-05, "loss": 1.2847, "step": 16675 }, { "epoch": 0.43918883328943903, "grad_norm": 2.541501045227051, "learning_rate": 2.805372662628391e-05, "loss": 2.3188, "step": 16676 }, { "epoch": 0.4392151698709508, "grad_norm": 2.371471643447876, "learning_rate": 2.8052409797208322e-05, "loss": 1.6169, "step": 16677 }, { "epoch": 0.43924150645246246, "grad_norm": 1.4706392288208008, "learning_rate": 2.8051092968132737e-05, "loss": 0.4849, "step": 16678 }, { "epoch": 0.4392678430339742, "grad_norm": 1.6664483547210693, "learning_rate": 
2.804977613905715e-05, "loss": 2.3979, "step": 16679 }, { "epoch": 0.4392941796154859, "grad_norm": 2.094409942626953, "learning_rate": 2.804845930998157e-05, "loss": 1.6646, "step": 16680 }, { "epoch": 0.43932051619699763, "grad_norm": 1.8919168710708618, "learning_rate": 2.804714248090598e-05, "loss": 1.6576, "step": 16681 }, { "epoch": 0.4393468527785093, "grad_norm": 1.5826287269592285, "learning_rate": 2.8045825651830393e-05, "loss": 1.6483, "step": 16682 }, { "epoch": 0.43937318936002107, "grad_norm": 4.1608710289001465, "learning_rate": 2.8044508822754805e-05, "loss": 1.9458, "step": 16683 }, { "epoch": 0.4393995259415328, "grad_norm": 2.2314000129699707, "learning_rate": 2.8043191993679224e-05, "loss": 1.8213, "step": 16684 }, { "epoch": 0.4394258625230445, "grad_norm": 1.8401408195495605, "learning_rate": 2.8041875164603636e-05, "loss": 1.965, "step": 16685 }, { "epoch": 0.43945219910455624, "grad_norm": 2.4395196437835693, "learning_rate": 2.804055833552805e-05, "loss": 1.1922, "step": 16686 }, { "epoch": 0.43947853568606793, "grad_norm": 2.1906843185424805, "learning_rate": 2.8039241506452464e-05, "loss": 1.9512, "step": 16687 }, { "epoch": 0.43950487226757967, "grad_norm": 2.4255566596984863, "learning_rate": 2.8037924677376876e-05, "loss": 2.1641, "step": 16688 }, { "epoch": 0.4395312088490914, "grad_norm": 2.333284854888916, "learning_rate": 2.8036607848301295e-05, "loss": 1.564, "step": 16689 }, { "epoch": 0.4395575454306031, "grad_norm": 1.7572884559631348, "learning_rate": 2.8035291019225708e-05, "loss": 1.8175, "step": 16690 }, { "epoch": 0.43958388201211485, "grad_norm": 3.4218058586120605, "learning_rate": 2.803397419015012e-05, "loss": 1.6107, "step": 16691 }, { "epoch": 0.43961021859362653, "grad_norm": 3.5521857738494873, "learning_rate": 2.8032657361074532e-05, "loss": 1.0413, "step": 16692 }, { "epoch": 0.4396365551751383, "grad_norm": 2.0270421504974365, "learning_rate": 2.8031340531998944e-05, "loss": 1.9875, "step": 16693 }, { "epoch": 
0.43966289175664997, "grad_norm": 2.2825734615325928, "learning_rate": 2.8030023702923363e-05, "loss": 2.2645, "step": 16694 }, { "epoch": 0.4396892283381617, "grad_norm": 1.893298625946045, "learning_rate": 2.8028706873847775e-05, "loss": 1.9476, "step": 16695 }, { "epoch": 0.43971556491967345, "grad_norm": 2.5775043964385986, "learning_rate": 2.8027390044772188e-05, "loss": 1.6936, "step": 16696 }, { "epoch": 0.43974190150118514, "grad_norm": 1.5541459321975708, "learning_rate": 2.8026073215696603e-05, "loss": 1.87, "step": 16697 }, { "epoch": 0.4397682380826969, "grad_norm": 2.2654428482055664, "learning_rate": 2.802475638662102e-05, "loss": 1.2083, "step": 16698 }, { "epoch": 0.43979457466420857, "grad_norm": 1.7209984064102173, "learning_rate": 2.8023439557545434e-05, "loss": 1.4076, "step": 16699 }, { "epoch": 0.4398209112457203, "grad_norm": 1.937251329421997, "learning_rate": 2.8022122728469847e-05, "loss": 2.0598, "step": 16700 }, { "epoch": 0.439847247827232, "grad_norm": 2.355435371398926, "learning_rate": 2.802080589939426e-05, "loss": 1.7337, "step": 16701 }, { "epoch": 0.43987358440874375, "grad_norm": 1.8486247062683105, "learning_rate": 2.801948907031867e-05, "loss": 1.635, "step": 16702 }, { "epoch": 0.4398999209902555, "grad_norm": 2.5294671058654785, "learning_rate": 2.801817224124309e-05, "loss": 1.8208, "step": 16703 }, { "epoch": 0.4399262575717672, "grad_norm": 1.5869202613830566, "learning_rate": 2.8016855412167502e-05, "loss": 1.8462, "step": 16704 }, { "epoch": 0.4399525941532789, "grad_norm": 2.247387647628784, "learning_rate": 2.8015538583091915e-05, "loss": 2.0876, "step": 16705 }, { "epoch": 0.4399789307347906, "grad_norm": 1.6417685747146606, "learning_rate": 2.801422175401633e-05, "loss": 2.2676, "step": 16706 }, { "epoch": 0.44000526731630235, "grad_norm": 2.436558961868286, "learning_rate": 2.8012904924940742e-05, "loss": 2.796, "step": 16707 }, { "epoch": 0.44003160389781404, "grad_norm": 2.547675132751465, "learning_rate": 
2.801158809586516e-05, "loss": 2.246, "step": 16708 }, { "epoch": 0.4400579404793258, "grad_norm": 3.9826619625091553, "learning_rate": 2.8010271266789574e-05, "loss": 1.3622, "step": 16709 }, { "epoch": 0.4400842770608375, "grad_norm": 2.6392602920532227, "learning_rate": 2.8008954437713986e-05, "loss": 1.5256, "step": 16710 }, { "epoch": 0.4401106136423492, "grad_norm": 3.0870163440704346, "learning_rate": 2.8007637608638398e-05, "loss": 1.3004, "step": 16711 }, { "epoch": 0.44013695022386096, "grad_norm": 3.5953996181488037, "learning_rate": 2.8006320779562817e-05, "loss": 2.0061, "step": 16712 }, { "epoch": 0.44016328680537264, "grad_norm": 1.9704854488372803, "learning_rate": 2.800500395048723e-05, "loss": 1.189, "step": 16713 }, { "epoch": 0.4401896233868844, "grad_norm": 1.8509185314178467, "learning_rate": 2.800368712141164e-05, "loss": 1.483, "step": 16714 }, { "epoch": 0.4402159599683961, "grad_norm": 1.6682113409042358, "learning_rate": 2.8002370292336057e-05, "loss": 1.6067, "step": 16715 }, { "epoch": 0.4402422965499078, "grad_norm": 5.056559085845947, "learning_rate": 2.800105346326047e-05, "loss": 1.0704, "step": 16716 }, { "epoch": 0.44026863313141956, "grad_norm": 1.7665220499038696, "learning_rate": 2.7999736634184888e-05, "loss": 1.7946, "step": 16717 }, { "epoch": 0.44029496971293125, "grad_norm": 2.2904481887817383, "learning_rate": 2.79984198051093e-05, "loss": 2.0682, "step": 16718 }, { "epoch": 0.440321306294443, "grad_norm": 2.2061214447021484, "learning_rate": 2.7997102976033713e-05, "loss": 2.6347, "step": 16719 }, { "epoch": 0.4403476428759547, "grad_norm": 1.9872568845748901, "learning_rate": 2.7995786146958125e-05, "loss": 1.5115, "step": 16720 }, { "epoch": 0.4403739794574664, "grad_norm": 3.753920078277588, "learning_rate": 2.7994469317882537e-05, "loss": 2.1913, "step": 16721 }, { "epoch": 0.44040031603897817, "grad_norm": 1.6522375345230103, "learning_rate": 2.7993152488806956e-05, "loss": 1.9022, "step": 16722 }, { "epoch": 
0.44042665262048986, "grad_norm": 2.209785223007202, "learning_rate": 2.7991835659731368e-05, "loss": 2.0994, "step": 16723 }, { "epoch": 0.4404529892020016, "grad_norm": 1.7904874086380005, "learning_rate": 2.799051883065578e-05, "loss": 1.589, "step": 16724 }, { "epoch": 0.4404793257835133, "grad_norm": 2.5237224102020264, "learning_rate": 2.7989202001580196e-05, "loss": 1.3336, "step": 16725 }, { "epoch": 0.44050566236502503, "grad_norm": 2.4693028926849365, "learning_rate": 2.7987885172504608e-05, "loss": 2.1436, "step": 16726 }, { "epoch": 0.4405319989465367, "grad_norm": 1.6800041198730469, "learning_rate": 2.7986568343429027e-05, "loss": 2.0454, "step": 16727 }, { "epoch": 0.44055833552804846, "grad_norm": 2.56982421875, "learning_rate": 2.798525151435344e-05, "loss": 0.9311, "step": 16728 }, { "epoch": 0.4405846721095602, "grad_norm": 2.5149993896484375, "learning_rate": 2.798393468527785e-05, "loss": 1.6821, "step": 16729 }, { "epoch": 0.4406110086910719, "grad_norm": 1.8810791969299316, "learning_rate": 2.7982617856202264e-05, "loss": 2.4493, "step": 16730 }, { "epoch": 0.44063734527258364, "grad_norm": 1.9849098920822144, "learning_rate": 2.7981301027126683e-05, "loss": 1.7455, "step": 16731 }, { "epoch": 0.4406636818540953, "grad_norm": 5.133772850036621, "learning_rate": 2.7979984198051095e-05, "loss": 1.224, "step": 16732 }, { "epoch": 0.44069001843560707, "grad_norm": 1.9071556329727173, "learning_rate": 2.7978667368975507e-05, "loss": 1.7529, "step": 16733 }, { "epoch": 0.44071635501711875, "grad_norm": 1.4554475545883179, "learning_rate": 2.7977350539899923e-05, "loss": 1.5876, "step": 16734 }, { "epoch": 0.4407426915986305, "grad_norm": 3.959616184234619, "learning_rate": 2.7976033710824335e-05, "loss": 0.7249, "step": 16735 }, { "epoch": 0.44076902818014224, "grad_norm": 4.518592357635498, "learning_rate": 2.7974716881748754e-05, "loss": 2.0631, "step": 16736 }, { "epoch": 0.44079536476165393, "grad_norm": 1.9767451286315918, "learning_rate": 
2.7973400052673166e-05, "loss": 1.2939, "step": 16737 }, { "epoch": 0.4408217013431657, "grad_norm": 1.6754765510559082, "learning_rate": 2.797208322359758e-05, "loss": 0.5993, "step": 16738 }, { "epoch": 0.44084803792467736, "grad_norm": 5.184513092041016, "learning_rate": 2.797076639452199e-05, "loss": 1.2088, "step": 16739 }, { "epoch": 0.4408743745061891, "grad_norm": 3.1669740676879883, "learning_rate": 2.7969449565446403e-05, "loss": 1.1096, "step": 16740 }, { "epoch": 0.4409007110877008, "grad_norm": 1.533186674118042, "learning_rate": 2.7968132736370822e-05, "loss": 0.6209, "step": 16741 }, { "epoch": 0.44092704766921254, "grad_norm": 1.5440374612808228, "learning_rate": 2.7966815907295234e-05, "loss": 1.3923, "step": 16742 }, { "epoch": 0.4409533842507243, "grad_norm": 3.21589994430542, "learning_rate": 2.796549907821965e-05, "loss": 1.1905, "step": 16743 }, { "epoch": 0.44097972083223597, "grad_norm": 1.814430594444275, "learning_rate": 2.7964182249144062e-05, "loss": 1.9907, "step": 16744 }, { "epoch": 0.4410060574137477, "grad_norm": 1.5405874252319336, "learning_rate": 2.796286542006848e-05, "loss": 1.8106, "step": 16745 }, { "epoch": 0.4410323939952594, "grad_norm": 1.4020309448242188, "learning_rate": 2.7961548590992893e-05, "loss": 1.9008, "step": 16746 }, { "epoch": 0.44105873057677114, "grad_norm": 3.371359348297119, "learning_rate": 2.7960231761917305e-05, "loss": 1.745, "step": 16747 }, { "epoch": 0.44108506715828283, "grad_norm": 4.1858930587768555, "learning_rate": 2.7958914932841717e-05, "loss": 1.5352, "step": 16748 }, { "epoch": 0.44111140373979457, "grad_norm": 2.824721336364746, "learning_rate": 2.795759810376613e-05, "loss": 1.6957, "step": 16749 }, { "epoch": 0.4411377403213063, "grad_norm": 1.7364839315414429, "learning_rate": 2.795628127469055e-05, "loss": 1.9256, "step": 16750 }, { "epoch": 0.441164076902818, "grad_norm": 2.467883825302124, "learning_rate": 2.795496444561496e-05, "loss": 1.9208, "step": 16751 }, { "epoch": 
0.44119041348432975, "grad_norm": 1.568540096282959, "learning_rate": 2.7953647616539373e-05, "loss": 1.3809, "step": 16752 }, { "epoch": 0.44121675006584143, "grad_norm": 2.7559430599212646, "learning_rate": 2.795233078746379e-05, "loss": 1.5351, "step": 16753 }, { "epoch": 0.4412430866473532, "grad_norm": 3.288858413696289, "learning_rate": 2.79510139583882e-05, "loss": 1.8216, "step": 16754 }, { "epoch": 0.4412694232288649, "grad_norm": 1.822719931602478, "learning_rate": 2.794969712931262e-05, "loss": 1.8733, "step": 16755 }, { "epoch": 0.4412957598103766, "grad_norm": 2.376509666442871, "learning_rate": 2.7948380300237032e-05, "loss": 1.6281, "step": 16756 }, { "epoch": 0.44132209639188835, "grad_norm": 1.8745875358581543, "learning_rate": 2.7947063471161444e-05, "loss": 1.7885, "step": 16757 }, { "epoch": 0.44134843297340004, "grad_norm": 3.068058729171753, "learning_rate": 2.7945746642085856e-05, "loss": 1.3098, "step": 16758 }, { "epoch": 0.4413747695549118, "grad_norm": 2.4959139823913574, "learning_rate": 2.794442981301027e-05, "loss": 1.2738, "step": 16759 }, { "epoch": 0.44140110613642347, "grad_norm": 1.7837127447128296, "learning_rate": 2.7943112983934688e-05, "loss": 1.5153, "step": 16760 }, { "epoch": 0.4414274427179352, "grad_norm": 2.578883171081543, "learning_rate": 2.79417961548591e-05, "loss": 1.7842, "step": 16761 }, { "epoch": 0.44145377929944696, "grad_norm": 2.0582690238952637, "learning_rate": 2.7940479325783515e-05, "loss": 0.691, "step": 16762 }, { "epoch": 0.44148011588095865, "grad_norm": 1.7992372512817383, "learning_rate": 2.7939162496707928e-05, "loss": 2.1008, "step": 16763 }, { "epoch": 0.4415064524624704, "grad_norm": 3.998180866241455, "learning_rate": 2.7937845667632347e-05, "loss": 1.9433, "step": 16764 }, { "epoch": 0.4415327890439821, "grad_norm": 2.38696551322937, "learning_rate": 2.793652883855676e-05, "loss": 1.677, "step": 16765 }, { "epoch": 0.4415591256254938, "grad_norm": 1.9280604124069214, "learning_rate": 
2.793521200948117e-05, "loss": 1.7596, "step": 16766 }, { "epoch": 0.4415854622070055, "grad_norm": 3.6320953369140625, "learning_rate": 2.7933895180405583e-05, "loss": 1.9152, "step": 16767 }, { "epoch": 0.44161179878851725, "grad_norm": 2.4863052368164062, "learning_rate": 2.7932578351329996e-05, "loss": 1.2012, "step": 16768 }, { "epoch": 0.441638135370029, "grad_norm": 1.4355405569076538, "learning_rate": 2.7931261522254414e-05, "loss": 1.1698, "step": 16769 }, { "epoch": 0.4416644719515407, "grad_norm": 1.891467809677124, "learning_rate": 2.7929944693178827e-05, "loss": 2.1571, "step": 16770 }, { "epoch": 0.4416908085330524, "grad_norm": 3.448801279067993, "learning_rate": 2.7928627864103242e-05, "loss": 1.5949, "step": 16771 }, { "epoch": 0.4417171451145641, "grad_norm": 4.097621917724609, "learning_rate": 2.7927311035027655e-05, "loss": 2.553, "step": 16772 }, { "epoch": 0.44174348169607586, "grad_norm": 1.3943984508514404, "learning_rate": 2.7925994205952067e-05, "loss": 0.5025, "step": 16773 }, { "epoch": 0.44176981827758754, "grad_norm": 1.7040599584579468, "learning_rate": 2.7924677376876486e-05, "loss": 1.3488, "step": 16774 }, { "epoch": 0.4417961548590993, "grad_norm": 4.253444194793701, "learning_rate": 2.7923360547800898e-05, "loss": 1.8751, "step": 16775 }, { "epoch": 0.44182249144061103, "grad_norm": 4.625461101531982, "learning_rate": 2.792204371872531e-05, "loss": 0.7516, "step": 16776 }, { "epoch": 0.4418488280221227, "grad_norm": 1.7392487525939941, "learning_rate": 2.7920726889649722e-05, "loss": 1.8613, "step": 16777 }, { "epoch": 0.44187516460363446, "grad_norm": 1.668346881866455, "learning_rate": 2.791941006057414e-05, "loss": 2.122, "step": 16778 }, { "epoch": 0.44190150118514615, "grad_norm": 4.862741947174072, "learning_rate": 2.7918093231498554e-05, "loss": 1.3039, "step": 16779 }, { "epoch": 0.4419278377666579, "grad_norm": 2.018497943878174, "learning_rate": 2.7916776402422966e-05, "loss": 1.6052, "step": 16780 }, { "epoch": 
0.4419541743481696, "grad_norm": 1.9620503187179565, "learning_rate": 2.791545957334738e-05, "loss": 1.1701, "step": 16781 }, { "epoch": 0.4419805109296813, "grad_norm": 2.497812509536743, "learning_rate": 2.7914142744271794e-05, "loss": 1.8402, "step": 16782 }, { "epoch": 0.44200684751119307, "grad_norm": 1.4044374227523804, "learning_rate": 2.7912825915196213e-05, "loss": 1.7316, "step": 16783 }, { "epoch": 0.44203318409270476, "grad_norm": 2.0211944580078125, "learning_rate": 2.7911509086120625e-05, "loss": 1.4478, "step": 16784 }, { "epoch": 0.4420595206742165, "grad_norm": 1.9349212646484375, "learning_rate": 2.7910192257045037e-05, "loss": 2.0301, "step": 16785 }, { "epoch": 0.4420858572557282, "grad_norm": 2.043325185775757, "learning_rate": 2.790887542796945e-05, "loss": 1.3926, "step": 16786 }, { "epoch": 0.44211219383723993, "grad_norm": 2.8606808185577393, "learning_rate": 2.790755859889386e-05, "loss": 1.8603, "step": 16787 }, { "epoch": 0.4421385304187516, "grad_norm": 5.086670398712158, "learning_rate": 2.790624176981828e-05, "loss": 0.8439, "step": 16788 }, { "epoch": 0.44216486700026336, "grad_norm": 3.1945712566375732, "learning_rate": 2.7904924940742693e-05, "loss": 1.6982, "step": 16789 }, { "epoch": 0.4421912035817751, "grad_norm": 3.6901233196258545, "learning_rate": 2.7903608111667108e-05, "loss": 1.8002, "step": 16790 }, { "epoch": 0.4422175401632868, "grad_norm": 3.3323347568511963, "learning_rate": 2.790229128259152e-05, "loss": 2.2336, "step": 16791 }, { "epoch": 0.44224387674479854, "grad_norm": 5.013552188873291, "learning_rate": 2.7900974453515933e-05, "loss": 2.144, "step": 16792 }, { "epoch": 0.4422702133263102, "grad_norm": 2.5579257011413574, "learning_rate": 2.789965762444035e-05, "loss": 1.5682, "step": 16793 }, { "epoch": 0.44229654990782197, "grad_norm": 1.526326060295105, "learning_rate": 2.7898340795364764e-05, "loss": 1.4033, "step": 16794 }, { "epoch": 0.4423228864893337, "grad_norm": 1.6585986614227295, "learning_rate": 
2.7897023966289176e-05, "loss": 2.1911, "step": 16795 }, { "epoch": 0.4423492230708454, "grad_norm": 2.5212934017181396, "learning_rate": 2.7895707137213588e-05, "loss": 0.2163, "step": 16796 }, { "epoch": 0.44237555965235714, "grad_norm": 1.8194775581359863, "learning_rate": 2.7894390308138007e-05, "loss": 2.0501, "step": 16797 }, { "epoch": 0.44240189623386883, "grad_norm": 1.8742762804031372, "learning_rate": 2.789307347906242e-05, "loss": 2.1743, "step": 16798 }, { "epoch": 0.4424282328153806, "grad_norm": 1.4154751300811768, "learning_rate": 2.789175664998683e-05, "loss": 2.158, "step": 16799 }, { "epoch": 0.44245456939689226, "grad_norm": 2.6107194423675537, "learning_rate": 2.7890439820911247e-05, "loss": 2.042, "step": 16800 }, { "epoch": 0.442480905978404, "grad_norm": 2.6754653453826904, "learning_rate": 2.788912299183566e-05, "loss": 2.268, "step": 16801 }, { "epoch": 0.44250724255991575, "grad_norm": 1.4304776191711426, "learning_rate": 2.788780616276008e-05, "loss": 1.39, "step": 16802 }, { "epoch": 0.44253357914142744, "grad_norm": 3.638984203338623, "learning_rate": 2.788648933368449e-05, "loss": 1.4697, "step": 16803 }, { "epoch": 0.4425599157229392, "grad_norm": 1.9505577087402344, "learning_rate": 2.7885172504608903e-05, "loss": 1.3988, "step": 16804 }, { "epoch": 0.44258625230445087, "grad_norm": 6.658775329589844, "learning_rate": 2.7883855675533315e-05, "loss": 1.7174, "step": 16805 }, { "epoch": 0.4426125888859626, "grad_norm": 1.8260091543197632, "learning_rate": 2.7882538846457727e-05, "loss": 1.6299, "step": 16806 }, { "epoch": 0.4426389254674743, "grad_norm": 2.0658438205718994, "learning_rate": 2.7881222017382146e-05, "loss": 1.519, "step": 16807 }, { "epoch": 0.44266526204898604, "grad_norm": 1.3855957984924316, "learning_rate": 2.787990518830656e-05, "loss": 1.2043, "step": 16808 }, { "epoch": 0.4426915986304978, "grad_norm": 3.339667797088623, "learning_rate": 2.7878588359230974e-05, "loss": 1.3137, "step": 16809 }, { "epoch": 
0.4427179352120095, "grad_norm": 2.0247812271118164, "learning_rate": 2.7877271530155386e-05, "loss": 1.8398, "step": 16810 }, { "epoch": 0.4427442717935212, "grad_norm": 1.5882487297058105, "learning_rate": 2.7875954701079805e-05, "loss": 1.8964, "step": 16811 }, { "epoch": 0.4427706083750329, "grad_norm": 2.187187910079956, "learning_rate": 2.7874637872004217e-05, "loss": 1.5122, "step": 16812 }, { "epoch": 0.44279694495654465, "grad_norm": 2.3871068954467773, "learning_rate": 2.787332104292863e-05, "loss": 1.4994, "step": 16813 }, { "epoch": 0.44282328153805633, "grad_norm": 2.455298662185669, "learning_rate": 2.7872004213853042e-05, "loss": 1.6469, "step": 16814 }, { "epoch": 0.4428496181195681, "grad_norm": 2.073849678039551, "learning_rate": 2.7870687384777454e-05, "loss": 2.3492, "step": 16815 }, { "epoch": 0.4428759547010798, "grad_norm": 2.083263635635376, "learning_rate": 2.7869370555701873e-05, "loss": 0.7781, "step": 16816 }, { "epoch": 0.4429022912825915, "grad_norm": 3.7390551567077637, "learning_rate": 2.7868053726626285e-05, "loss": 0.8679, "step": 16817 }, { "epoch": 0.44292862786410325, "grad_norm": 2.083997964859009, "learning_rate": 2.78667368975507e-05, "loss": 1.4691, "step": 16818 }, { "epoch": 0.44295496444561494, "grad_norm": 1.7258881330490112, "learning_rate": 2.7865420068475113e-05, "loss": 2.1395, "step": 16819 }, { "epoch": 0.4429813010271267, "grad_norm": 2.7336554527282715, "learning_rate": 2.7864103239399525e-05, "loss": 2.7409, "step": 16820 }, { "epoch": 0.44300763760863837, "grad_norm": 2.1014702320098877, "learning_rate": 2.7862786410323944e-05, "loss": 1.6637, "step": 16821 }, { "epoch": 0.4430339741901501, "grad_norm": 3.676614761352539, "learning_rate": 2.7861469581248356e-05, "loss": 2.2768, "step": 16822 }, { "epoch": 0.44306031077166186, "grad_norm": 2.7421998977661133, "learning_rate": 2.786015275217277e-05, "loss": 1.7537, "step": 16823 }, { "epoch": 0.44308664735317355, "grad_norm": 1.6489410400390625, "learning_rate": 
2.785883592309718e-05, "loss": 2.6386, "step": 16824 }, { "epoch": 0.4431129839346853, "grad_norm": 1.9031383991241455, "learning_rate": 2.7857519094021593e-05, "loss": 1.5666, "step": 16825 }, { "epoch": 0.443139320516197, "grad_norm": 2.2424068450927734, "learning_rate": 2.7856202264946012e-05, "loss": 1.6432, "step": 16826 }, { "epoch": 0.4431656570977087, "grad_norm": 3.7087645530700684, "learning_rate": 2.7854885435870424e-05, "loss": 1.3907, "step": 16827 }, { "epoch": 0.44319199367922046, "grad_norm": 2.816002130508423, "learning_rate": 2.785356860679484e-05, "loss": 2.2441, "step": 16828 }, { "epoch": 0.44321833026073215, "grad_norm": 2.118114709854126, "learning_rate": 2.7852251777719252e-05, "loss": 2.7322, "step": 16829 }, { "epoch": 0.4432446668422439, "grad_norm": 2.9789841175079346, "learning_rate": 2.785093494864367e-05, "loss": 1.5145, "step": 16830 }, { "epoch": 0.4432710034237556, "grad_norm": 2.5113940238952637, "learning_rate": 2.7849618119568083e-05, "loss": 0.5272, "step": 16831 }, { "epoch": 0.4432973400052673, "grad_norm": 2.500525712966919, "learning_rate": 2.7848301290492496e-05, "loss": 1.838, "step": 16832 }, { "epoch": 0.443323676586779, "grad_norm": 2.9666333198547363, "learning_rate": 2.7846984461416908e-05, "loss": 0.8301, "step": 16833 }, { "epoch": 0.44335001316829076, "grad_norm": 1.8912339210510254, "learning_rate": 2.784566763234132e-05, "loss": 1.0398, "step": 16834 }, { "epoch": 0.4433763497498025, "grad_norm": 2.5475335121154785, "learning_rate": 2.784435080326574e-05, "loss": 1.86, "step": 16835 }, { "epoch": 0.4434026863313142, "grad_norm": 2.1660566329956055, "learning_rate": 2.784303397419015e-05, "loss": 1.6221, "step": 16836 }, { "epoch": 0.44342902291282593, "grad_norm": 2.804410457611084, "learning_rate": 2.7841717145114567e-05, "loss": 2.5628, "step": 16837 }, { "epoch": 0.4434553594943376, "grad_norm": 3.7074782848358154, "learning_rate": 2.784040031603898e-05, "loss": 1.7598, "step": 16838 }, { "epoch": 
0.44348169607584936, "grad_norm": 2.791229724884033, "learning_rate": 2.783908348696339e-05, "loss": 1.3794, "step": 16839 }, { "epoch": 0.44350803265736105, "grad_norm": 1.6307436227798462, "learning_rate": 2.783776665788781e-05, "loss": 1.5339, "step": 16840 }, { "epoch": 0.4435343692388728, "grad_norm": 4.305070877075195, "learning_rate": 2.7836449828812222e-05, "loss": 1.7818, "step": 16841 }, { "epoch": 0.44356070582038454, "grad_norm": 2.9363365173339844, "learning_rate": 2.7835132999736635e-05, "loss": 1.6574, "step": 16842 }, { "epoch": 0.4435870424018962, "grad_norm": 2.335444450378418, "learning_rate": 2.7833816170661047e-05, "loss": 1.6074, "step": 16843 }, { "epoch": 0.44361337898340797, "grad_norm": 2.6473681926727295, "learning_rate": 2.7832499341585466e-05, "loss": 1.4399, "step": 16844 }, { "epoch": 0.44363971556491966, "grad_norm": 1.644146203994751, "learning_rate": 2.7831182512509878e-05, "loss": 0.8183, "step": 16845 }, { "epoch": 0.4436660521464314, "grad_norm": 1.9546242952346802, "learning_rate": 2.7829865683434294e-05, "loss": 1.7601, "step": 16846 }, { "epoch": 0.4436923887279431, "grad_norm": 4.175062656402588, "learning_rate": 2.7828548854358706e-05, "loss": 2.1668, "step": 16847 }, { "epoch": 0.44371872530945483, "grad_norm": 1.99675714969635, "learning_rate": 2.7827232025283118e-05, "loss": 2.4218, "step": 16848 }, { "epoch": 0.4437450618909666, "grad_norm": 1.891690969467163, "learning_rate": 2.7825915196207537e-05, "loss": 1.7463, "step": 16849 }, { "epoch": 0.44377139847247826, "grad_norm": 5.1198320388793945, "learning_rate": 2.782459836713195e-05, "loss": 1.5876, "step": 16850 }, { "epoch": 0.44379773505399, "grad_norm": 2.1891446113586426, "learning_rate": 2.782328153805636e-05, "loss": 1.8012, "step": 16851 }, { "epoch": 0.4438240716355017, "grad_norm": 2.445836067199707, "learning_rate": 2.7821964708980774e-05, "loss": 1.7342, "step": 16852 }, { "epoch": 0.44385040821701344, "grad_norm": 1.9329490661621094, "learning_rate": 
2.7820647879905186e-05, "loss": 0.5168, "step": 16853 }, { "epoch": 0.4438767447985251, "grad_norm": 1.816304087638855, "learning_rate": 2.7819331050829605e-05, "loss": 1.6229, "step": 16854 }, { "epoch": 0.44390308138003687, "grad_norm": 1.3820242881774902, "learning_rate": 2.7818014221754017e-05, "loss": 1.4886, "step": 16855 }, { "epoch": 0.4439294179615486, "grad_norm": 1.9624178409576416, "learning_rate": 2.7816697392678433e-05, "loss": 1.6843, "step": 16856 }, { "epoch": 0.4439557545430603, "grad_norm": 2.3891029357910156, "learning_rate": 2.7815380563602845e-05, "loss": 1.1776, "step": 16857 }, { "epoch": 0.44398209112457204, "grad_norm": 3.7418570518493652, "learning_rate": 2.7814063734527257e-05, "loss": 1.0569, "step": 16858 }, { "epoch": 0.44400842770608373, "grad_norm": 2.0556480884552, "learning_rate": 2.7812746905451676e-05, "loss": 0.5321, "step": 16859 }, { "epoch": 0.4440347642875955, "grad_norm": 7.790976047515869, "learning_rate": 2.7811430076376088e-05, "loss": 1.1386, "step": 16860 }, { "epoch": 0.4440611008691072, "grad_norm": 2.29221510887146, "learning_rate": 2.78101132473005e-05, "loss": 1.7673, "step": 16861 }, { "epoch": 0.4440874374506189, "grad_norm": 1.8455910682678223, "learning_rate": 2.7808796418224913e-05, "loss": 1.4893, "step": 16862 }, { "epoch": 0.44411377403213065, "grad_norm": 1.9344048500061035, "learning_rate": 2.780747958914933e-05, "loss": 1.937, "step": 16863 }, { "epoch": 0.44414011061364234, "grad_norm": 1.6465835571289062, "learning_rate": 2.7806162760073744e-05, "loss": 1.6376, "step": 16864 }, { "epoch": 0.4441664471951541, "grad_norm": 1.7242937088012695, "learning_rate": 2.780484593099816e-05, "loss": 1.8259, "step": 16865 }, { "epoch": 0.44419278377666577, "grad_norm": 2.9298622608184814, "learning_rate": 2.780352910192257e-05, "loss": 2.0969, "step": 16866 }, { "epoch": 0.4442191203581775, "grad_norm": 2.841571569442749, "learning_rate": 2.7802212272846984e-05, "loss": 1.343, "step": 16867 }, { "epoch": 
0.44424545693968925, "grad_norm": 3.596860885620117, "learning_rate": 2.7800895443771403e-05, "loss": 1.51, "step": 16868 }, { "epoch": 0.44427179352120094, "grad_norm": 1.8692944049835205, "learning_rate": 2.7799578614695815e-05, "loss": 1.8425, "step": 16869 }, { "epoch": 0.4442981301027127, "grad_norm": 1.5739389657974243, "learning_rate": 2.7798261785620227e-05, "loss": 1.5801, "step": 16870 }, { "epoch": 0.4443244666842244, "grad_norm": 2.111631393432617, "learning_rate": 2.779694495654464e-05, "loss": 1.9658, "step": 16871 }, { "epoch": 0.4443508032657361, "grad_norm": 1.912125587463379, "learning_rate": 2.779562812746905e-05, "loss": 2.4808, "step": 16872 }, { "epoch": 0.4443771398472478, "grad_norm": 3.359158992767334, "learning_rate": 2.779431129839347e-05, "loss": 0.8423, "step": 16873 }, { "epoch": 0.44440347642875955, "grad_norm": 2.570143461227417, "learning_rate": 2.7792994469317883e-05, "loss": 1.1699, "step": 16874 }, { "epoch": 0.4444298130102713, "grad_norm": 2.8838579654693604, "learning_rate": 2.77916776402423e-05, "loss": 0.8558, "step": 16875 }, { "epoch": 0.444456149591783, "grad_norm": 3.3629441261291504, "learning_rate": 2.779036081116671e-05, "loss": 1.8851, "step": 16876 }, { "epoch": 0.4444824861732947, "grad_norm": 1.9049854278564453, "learning_rate": 2.778904398209113e-05, "loss": 2.054, "step": 16877 }, { "epoch": 0.4445088227548064, "grad_norm": 3.3422224521636963, "learning_rate": 2.7787727153015542e-05, "loss": 1.9309, "step": 16878 }, { "epoch": 0.44453515933631815, "grad_norm": 1.9425464868545532, "learning_rate": 2.7786410323939954e-05, "loss": 1.5455, "step": 16879 }, { "epoch": 0.44456149591782984, "grad_norm": 3.086204767227173, "learning_rate": 2.7785093494864366e-05, "loss": 1.569, "step": 16880 }, { "epoch": 0.4445878324993416, "grad_norm": 1.6296151876449585, "learning_rate": 2.778377666578878e-05, "loss": 1.8838, "step": 16881 }, { "epoch": 0.4446141690808533, "grad_norm": 1.8207261562347412, "learning_rate": 
2.7782459836713197e-05, "loss": 2.1768, "step": 16882 }, { "epoch": 0.444640505662365, "grad_norm": 3.021439790725708, "learning_rate": 2.778114300763761e-05, "loss": 1.2047, "step": 16883 }, { "epoch": 0.44466684224387676, "grad_norm": 2.3001673221588135, "learning_rate": 2.7779826178562025e-05, "loss": 1.7757, "step": 16884 }, { "epoch": 0.44469317882538845, "grad_norm": 1.5953797101974487, "learning_rate": 2.7778509349486437e-05, "loss": 2.1521, "step": 16885 }, { "epoch": 0.4447195154069002, "grad_norm": 2.8892202377319336, "learning_rate": 2.777719252041085e-05, "loss": 1.5184, "step": 16886 }, { "epoch": 0.4447458519884119, "grad_norm": 2.0880119800567627, "learning_rate": 2.777587569133527e-05, "loss": 1.5588, "step": 16887 }, { "epoch": 0.4447721885699236, "grad_norm": 1.7109434604644775, "learning_rate": 2.777455886225968e-05, "loss": 1.5284, "step": 16888 }, { "epoch": 0.44479852515143536, "grad_norm": 1.627532720565796, "learning_rate": 2.7773242033184093e-05, "loss": 2.217, "step": 16889 }, { "epoch": 0.44482486173294705, "grad_norm": 2.318840265274048, "learning_rate": 2.7771925204108505e-05, "loss": 1.7881, "step": 16890 }, { "epoch": 0.4448511983144588, "grad_norm": 2.502875566482544, "learning_rate": 2.777060837503292e-05, "loss": 1.7811, "step": 16891 }, { "epoch": 0.4448775348959705, "grad_norm": 1.4858766794204712, "learning_rate": 2.7769291545957337e-05, "loss": 1.5904, "step": 16892 }, { "epoch": 0.4449038714774822, "grad_norm": 1.8423118591308594, "learning_rate": 2.7767974716881752e-05, "loss": 2.4283, "step": 16893 }, { "epoch": 0.44493020805899397, "grad_norm": 3.259521722793579, "learning_rate": 2.7766657887806164e-05, "loss": 1.1515, "step": 16894 }, { "epoch": 0.44495654464050566, "grad_norm": 3.624154806137085, "learning_rate": 2.7765341058730577e-05, "loss": 1.757, "step": 16895 }, { "epoch": 0.4449828812220174, "grad_norm": 2.348252058029175, "learning_rate": 2.7764024229654996e-05, "loss": 1.184, "step": 16896 }, { "epoch": 
0.4450092178035291, "grad_norm": 2.777730941772461, "learning_rate": 2.7762707400579408e-05, "loss": 1.1515, "step": 16897 }, { "epoch": 0.44503555438504083, "grad_norm": 2.3152735233306885, "learning_rate": 2.776139057150382e-05, "loss": 2.2853, "step": 16898 }, { "epoch": 0.4450618909665525, "grad_norm": 1.5822404623031616, "learning_rate": 2.7760073742428232e-05, "loss": 1.7404, "step": 16899 }, { "epoch": 0.44508822754806426, "grad_norm": 1.573306918144226, "learning_rate": 2.7758756913352644e-05, "loss": 2.1833, "step": 16900 }, { "epoch": 0.445114564129576, "grad_norm": 1.7900347709655762, "learning_rate": 2.7757440084277063e-05, "loss": 1.7033, "step": 16901 }, { "epoch": 0.4451409007110877, "grad_norm": 2.898188591003418, "learning_rate": 2.7756123255201476e-05, "loss": 0.6554, "step": 16902 }, { "epoch": 0.44516723729259944, "grad_norm": 3.6206510066986084, "learning_rate": 2.775480642612589e-05, "loss": 2.0676, "step": 16903 }, { "epoch": 0.4451935738741111, "grad_norm": 1.631095290184021, "learning_rate": 2.7753489597050303e-05, "loss": 1.7796, "step": 16904 }, { "epoch": 0.44521991045562287, "grad_norm": 1.7430720329284668, "learning_rate": 2.7752172767974716e-05, "loss": 0.9468, "step": 16905 }, { "epoch": 0.44524624703713456, "grad_norm": 2.8095223903656006, "learning_rate": 2.7750855938899135e-05, "loss": 1.6888, "step": 16906 }, { "epoch": 0.4452725836186463, "grad_norm": 1.7991313934326172, "learning_rate": 2.7749539109823547e-05, "loss": 1.6587, "step": 16907 }, { "epoch": 0.44529892020015804, "grad_norm": 1.771796703338623, "learning_rate": 2.774822228074796e-05, "loss": 1.3306, "step": 16908 }, { "epoch": 0.44532525678166973, "grad_norm": 1.8517221212387085, "learning_rate": 2.774690545167237e-05, "loss": 2.137, "step": 16909 }, { "epoch": 0.4453515933631815, "grad_norm": 3.1675400733947754, "learning_rate": 2.774558862259679e-05, "loss": 1.8773, "step": 16910 }, { "epoch": 0.44537792994469316, "grad_norm": 2.7926506996154785, "learning_rate": 
2.7744271793521202e-05, "loss": 1.8622, "step": 16911 }, { "epoch": 0.4454042665262049, "grad_norm": 1.9208461046218872, "learning_rate": 2.7742954964445618e-05, "loss": 2.5781, "step": 16912 }, { "epoch": 0.4454306031077166, "grad_norm": 2.115795373916626, "learning_rate": 2.774163813537003e-05, "loss": 1.7638, "step": 16913 }, { "epoch": 0.44545693968922834, "grad_norm": 1.5698108673095703, "learning_rate": 2.7740321306294442e-05, "loss": 1.7687, "step": 16914 }, { "epoch": 0.4454832762707401, "grad_norm": 3.2958595752716064, "learning_rate": 2.773900447721886e-05, "loss": 1.2381, "step": 16915 }, { "epoch": 0.44550961285225177, "grad_norm": 1.9297618865966797, "learning_rate": 2.7737687648143274e-05, "loss": 1.583, "step": 16916 }, { "epoch": 0.4455359494337635, "grad_norm": 1.9617490768432617, "learning_rate": 2.7736370819067686e-05, "loss": 1.5403, "step": 16917 }, { "epoch": 0.4455622860152752, "grad_norm": 2.4254512786865234, "learning_rate": 2.7735053989992098e-05, "loss": 2.1346, "step": 16918 }, { "epoch": 0.44558862259678694, "grad_norm": 1.8264394998550415, "learning_rate": 2.7733737160916514e-05, "loss": 1.9176, "step": 16919 }, { "epoch": 0.44561495917829863, "grad_norm": 3.0311176776885986, "learning_rate": 2.773242033184093e-05, "loss": 1.1973, "step": 16920 }, { "epoch": 0.4456412957598104, "grad_norm": 5.307843208312988, "learning_rate": 2.7731103502765345e-05, "loss": 1.7294, "step": 16921 }, { "epoch": 0.4456676323413221, "grad_norm": 2.270634889602661, "learning_rate": 2.7729786673689757e-05, "loss": 1.4195, "step": 16922 }, { "epoch": 0.4456939689228338, "grad_norm": 1.8978369235992432, "learning_rate": 2.772846984461417e-05, "loss": 1.6679, "step": 16923 }, { "epoch": 0.44572030550434555, "grad_norm": 3.815642833709717, "learning_rate": 2.772715301553858e-05, "loss": 1.0659, "step": 16924 }, { "epoch": 0.44574664208585724, "grad_norm": 1.8492826223373413, "learning_rate": 2.7725836186463e-05, "loss": 1.4072, "step": 16925 }, { "epoch": 
0.445772978667369, "grad_norm": 1.870880126953125, "learning_rate": 2.7724519357387413e-05, "loss": 1.9449, "step": 16926 }, { "epoch": 0.44579931524888067, "grad_norm": 2.6569020748138428, "learning_rate": 2.7723202528311825e-05, "loss": 2.0963, "step": 16927 }, { "epoch": 0.4458256518303924, "grad_norm": 2.126800060272217, "learning_rate": 2.7721885699236237e-05, "loss": 1.9566, "step": 16928 }, { "epoch": 0.44585198841190415, "grad_norm": 2.0076680183410645, "learning_rate": 2.7720568870160656e-05, "loss": 2.3031, "step": 16929 }, { "epoch": 0.44587832499341584, "grad_norm": 2.5603325366973877, "learning_rate": 2.7719252041085068e-05, "loss": 0.8739, "step": 16930 }, { "epoch": 0.4459046615749276, "grad_norm": 3.1730544567108154, "learning_rate": 2.7717935212009484e-05, "loss": 1.7421, "step": 16931 }, { "epoch": 0.4459309981564393, "grad_norm": 1.8664087057113647, "learning_rate": 2.7716618382933896e-05, "loss": 1.6497, "step": 16932 }, { "epoch": 0.445957334737951, "grad_norm": 1.5322927236557007, "learning_rate": 2.7715301553858308e-05, "loss": 1.4631, "step": 16933 }, { "epoch": 0.44598367131946276, "grad_norm": 1.8551011085510254, "learning_rate": 2.7713984724782727e-05, "loss": 0.9272, "step": 16934 }, { "epoch": 0.44601000790097445, "grad_norm": 2.6447277069091797, "learning_rate": 2.771266789570714e-05, "loss": 0.9992, "step": 16935 }, { "epoch": 0.4460363444824862, "grad_norm": 3.992147922515869, "learning_rate": 2.771135106663155e-05, "loss": 1.4683, "step": 16936 }, { "epoch": 0.4460626810639979, "grad_norm": 1.8378742933273315, "learning_rate": 2.7710034237555964e-05, "loss": 1.9912, "step": 16937 }, { "epoch": 0.4460890176455096, "grad_norm": 1.8693233728408813, "learning_rate": 2.770871740848038e-05, "loss": 1.2862, "step": 16938 }, { "epoch": 0.4461153542270213, "grad_norm": 1.7912756204605103, "learning_rate": 2.7707400579404795e-05, "loss": 2.0776, "step": 16939 }, { "epoch": 0.44614169080853305, "grad_norm": 1.797825813293457, "learning_rate": 
2.770608375032921e-05, "loss": 2.1249, "step": 16940 }, { "epoch": 0.4461680273900448, "grad_norm": 2.193272352218628, "learning_rate": 2.7704766921253623e-05, "loss": 0.9412, "step": 16941 }, { "epoch": 0.4461943639715565, "grad_norm": 2.6819419860839844, "learning_rate": 2.7703450092178035e-05, "loss": 1.4018, "step": 16942 }, { "epoch": 0.44622070055306823, "grad_norm": 1.5943903923034668, "learning_rate": 2.7702133263102454e-05, "loss": 0.2727, "step": 16943 }, { "epoch": 0.4462470371345799, "grad_norm": 3.126850128173828, "learning_rate": 2.7700816434026866e-05, "loss": 1.4446, "step": 16944 }, { "epoch": 0.44627337371609166, "grad_norm": 2.14018177986145, "learning_rate": 2.769949960495128e-05, "loss": 1.9665, "step": 16945 }, { "epoch": 0.44629971029760335, "grad_norm": 2.214160442352295, "learning_rate": 2.769818277587569e-05, "loss": 2.0631, "step": 16946 }, { "epoch": 0.4463260468791151, "grad_norm": 2.8749234676361084, "learning_rate": 2.7696865946800106e-05, "loss": 2.1724, "step": 16947 }, { "epoch": 0.44635238346062683, "grad_norm": 2.9379866123199463, "learning_rate": 2.7695549117724522e-05, "loss": 1.7791, "step": 16948 }, { "epoch": 0.4463787200421385, "grad_norm": 2.057192087173462, "learning_rate": 2.7694232288648937e-05, "loss": 1.4764, "step": 16949 }, { "epoch": 0.44640505662365026, "grad_norm": 2.3252058029174805, "learning_rate": 2.769291545957335e-05, "loss": 2.189, "step": 16950 }, { "epoch": 0.44643139320516195, "grad_norm": 1.5831563472747803, "learning_rate": 2.7691598630497762e-05, "loss": 1.5904, "step": 16951 }, { "epoch": 0.4464577297866737, "grad_norm": 1.7704025506973267, "learning_rate": 2.7690281801422174e-05, "loss": 1.7409, "step": 16952 }, { "epoch": 0.4464840663681854, "grad_norm": 1.6311105489730835, "learning_rate": 2.7688964972346593e-05, "loss": 1.8551, "step": 16953 }, { "epoch": 0.4465104029496971, "grad_norm": 1.8063361644744873, "learning_rate": 2.7687648143271005e-05, "loss": 1.7091, "step": 16954 }, { "epoch": 
0.44653673953120887, "grad_norm": 1.9200565814971924, "learning_rate": 2.7686331314195418e-05, "loss": 1.5417, "step": 16955 }, { "epoch": 0.44656307611272056, "grad_norm": 1.5721930265426636, "learning_rate": 2.768501448511983e-05, "loss": 1.8376, "step": 16956 }, { "epoch": 0.4465894126942323, "grad_norm": 1.7521731853485107, "learning_rate": 2.7683697656044245e-05, "loss": 1.8468, "step": 16957 }, { "epoch": 0.446615749275744, "grad_norm": 2.1449780464172363, "learning_rate": 2.768238082696866e-05, "loss": 2.3527, "step": 16958 }, { "epoch": 0.44664208585725573, "grad_norm": 1.9583295583724976, "learning_rate": 2.7681063997893077e-05, "loss": 2.2316, "step": 16959 }, { "epoch": 0.4466684224387674, "grad_norm": 2.2941904067993164, "learning_rate": 2.767974716881749e-05, "loss": 1.235, "step": 16960 }, { "epoch": 0.44669475902027916, "grad_norm": 2.3666372299194336, "learning_rate": 2.76784303397419e-05, "loss": 1.3692, "step": 16961 }, { "epoch": 0.4467210956017909, "grad_norm": 1.5040719509124756, "learning_rate": 2.767711351066632e-05, "loss": 1.6571, "step": 16962 }, { "epoch": 0.4467474321833026, "grad_norm": 1.6939072608947754, "learning_rate": 2.7675796681590732e-05, "loss": 1.983, "step": 16963 }, { "epoch": 0.44677376876481434, "grad_norm": 1.7759827375411987, "learning_rate": 2.7674479852515144e-05, "loss": 1.5155, "step": 16964 }, { "epoch": 0.446800105346326, "grad_norm": 1.6097828149795532, "learning_rate": 2.7673163023439557e-05, "loss": 1.9206, "step": 16965 }, { "epoch": 0.44682644192783777, "grad_norm": 3.1648993492126465, "learning_rate": 2.7671846194363972e-05, "loss": 1.3184, "step": 16966 }, { "epoch": 0.4468527785093495, "grad_norm": 3.994750499725342, "learning_rate": 2.7670529365288388e-05, "loss": 1.7561, "step": 16967 }, { "epoch": 0.4468791150908612, "grad_norm": 3.123556137084961, "learning_rate": 2.7669212536212803e-05, "loss": 0.9361, "step": 16968 }, { "epoch": 0.44690545167237294, "grad_norm": 3.8806233406066895, "learning_rate": 
2.7667895707137216e-05, "loss": 1.4404, "step": 16969 }, { "epoch": 0.44693178825388463, "grad_norm": 3.531498432159424, "learning_rate": 2.7666578878061628e-05, "loss": 1.6582, "step": 16970 }, { "epoch": 0.4469581248353964, "grad_norm": 1.766258955001831, "learning_rate": 2.766526204898604e-05, "loss": 1.7279, "step": 16971 }, { "epoch": 0.44698446141690806, "grad_norm": 2.598158121109009, "learning_rate": 2.766394521991046e-05, "loss": 1.9407, "step": 16972 }, { "epoch": 0.4470107979984198, "grad_norm": 1.6280368566513062, "learning_rate": 2.766262839083487e-05, "loss": 1.9219, "step": 16973 }, { "epoch": 0.44703713457993155, "grad_norm": 2.0324461460113525, "learning_rate": 2.7661311561759283e-05, "loss": 1.1257, "step": 16974 }, { "epoch": 0.44706347116144324, "grad_norm": 4.4627790451049805, "learning_rate": 2.7659994732683696e-05, "loss": 1.02, "step": 16975 }, { "epoch": 0.447089807742955, "grad_norm": 1.9029957056045532, "learning_rate": 2.7658677903608115e-05, "loss": 1.5703, "step": 16976 }, { "epoch": 0.44711614432446667, "grad_norm": 3.1215009689331055, "learning_rate": 2.7657361074532527e-05, "loss": 1.8366, "step": 16977 }, { "epoch": 0.4471424809059784, "grad_norm": 4.092123508453369, "learning_rate": 2.7656044245456942e-05, "loss": 2.1572, "step": 16978 }, { "epoch": 0.4471688174874901, "grad_norm": 2.174711227416992, "learning_rate": 2.7654727416381355e-05, "loss": 2.4771, "step": 16979 }, { "epoch": 0.44719515406900184, "grad_norm": 2.23246693611145, "learning_rate": 2.7653410587305767e-05, "loss": 1.8769, "step": 16980 }, { "epoch": 0.4472214906505136, "grad_norm": 2.5172669887542725, "learning_rate": 2.7652093758230186e-05, "loss": 1.5685, "step": 16981 }, { "epoch": 0.4472478272320253, "grad_norm": 2.2094478607177734, "learning_rate": 2.7650776929154598e-05, "loss": 2.0639, "step": 16982 }, { "epoch": 0.447274163813537, "grad_norm": 2.4007728099823, "learning_rate": 2.764946010007901e-05, "loss": 1.3648, "step": 16983 }, { "epoch": 
0.4473005003950487, "grad_norm": 2.9891598224639893, "learning_rate": 2.7648143271003422e-05, "loss": 1.4025, "step": 16984 }, { "epoch": 0.44732683697656045, "grad_norm": 2.407750368118286, "learning_rate": 2.7646826441927838e-05, "loss": 0.4806, "step": 16985 }, { "epoch": 0.44735317355807214, "grad_norm": 2.238537549972534, "learning_rate": 2.7645509612852254e-05, "loss": 2.125, "step": 16986 }, { "epoch": 0.4473795101395839, "grad_norm": 1.7120082378387451, "learning_rate": 2.764419278377667e-05, "loss": 2.0576, "step": 16987 }, { "epoch": 0.4474058467210956, "grad_norm": 3.8782999515533447, "learning_rate": 2.764287595470108e-05, "loss": 0.9971, "step": 16988 }, { "epoch": 0.4474321833026073, "grad_norm": 2.8225274085998535, "learning_rate": 2.7641559125625494e-05, "loss": 0.3103, "step": 16989 }, { "epoch": 0.44745851988411905, "grad_norm": 1.5613248348236084, "learning_rate": 2.7640242296549906e-05, "loss": 0.4772, "step": 16990 }, { "epoch": 0.44748485646563074, "grad_norm": 1.8757102489471436, "learning_rate": 2.7638925467474325e-05, "loss": 1.378, "step": 16991 }, { "epoch": 0.4475111930471425, "grad_norm": 1.6405189037322998, "learning_rate": 2.7637608638398737e-05, "loss": 1.7303, "step": 16992 }, { "epoch": 0.4475375296286542, "grad_norm": 5.172881126403809, "learning_rate": 2.763629180932315e-05, "loss": 1.3605, "step": 16993 }, { "epoch": 0.4475638662101659, "grad_norm": 1.9535326957702637, "learning_rate": 2.7634974980247565e-05, "loss": 1.4646, "step": 16994 }, { "epoch": 0.44759020279167766, "grad_norm": 1.6215181350708008, "learning_rate": 2.763365815117198e-05, "loss": 1.4769, "step": 16995 }, { "epoch": 0.44761653937318935, "grad_norm": 2.584655284881592, "learning_rate": 2.7632341322096396e-05, "loss": 2.0825, "step": 16996 }, { "epoch": 0.4476428759547011, "grad_norm": 3.165177822113037, "learning_rate": 2.7631024493020808e-05, "loss": 1.8981, "step": 16997 }, { "epoch": 0.4476692125362128, "grad_norm": 2.9137821197509766, "learning_rate": 
2.762970766394522e-05, "loss": 2.0165, "step": 16998 }, { "epoch": 0.4476955491177245, "grad_norm": 2.2985646724700928, "learning_rate": 2.7628390834869633e-05, "loss": 1.7323, "step": 16999 }, { "epoch": 0.44772188569923627, "grad_norm": 2.334723472595215, "learning_rate": 2.762707400579405e-05, "loss": 2.57, "step": 17000 }, { "epoch": 0.44774822228074795, "grad_norm": 2.095701217651367, "learning_rate": 2.7625757176718464e-05, "loss": 1.963, "step": 17001 }, { "epoch": 0.4477745588622597, "grad_norm": 4.5567731857299805, "learning_rate": 2.7624440347642876e-05, "loss": 2.1609, "step": 17002 }, { "epoch": 0.4478008954437714, "grad_norm": 3.805265426635742, "learning_rate": 2.7623123518567288e-05, "loss": 1.3903, "step": 17003 }, { "epoch": 0.44782723202528313, "grad_norm": 1.9035910367965698, "learning_rate": 2.7621806689491704e-05, "loss": 2.1309, "step": 17004 }, { "epoch": 0.4478535686067948, "grad_norm": 3.8158488273620605, "learning_rate": 2.762048986041612e-05, "loss": 1.3963, "step": 17005 }, { "epoch": 0.44787990518830656, "grad_norm": 1.8790903091430664, "learning_rate": 2.7619173031340535e-05, "loss": 2.5806, "step": 17006 }, { "epoch": 0.4479062417698183, "grad_norm": 2.065577745437622, "learning_rate": 2.7617856202264947e-05, "loss": 1.6602, "step": 17007 }, { "epoch": 0.44793257835133, "grad_norm": 2.0469491481781006, "learning_rate": 2.761653937318936e-05, "loss": 1.9029, "step": 17008 }, { "epoch": 0.44795891493284173, "grad_norm": 1.4174691438674927, "learning_rate": 2.761522254411378e-05, "loss": 1.5197, "step": 17009 }, { "epoch": 0.4479852515143534, "grad_norm": 1.9062668085098267, "learning_rate": 2.761390571503819e-05, "loss": 1.5491, "step": 17010 }, { "epoch": 0.44801158809586517, "grad_norm": 1.7393251657485962, "learning_rate": 2.7612588885962603e-05, "loss": 1.7952, "step": 17011 }, { "epoch": 0.44803792467737685, "grad_norm": 1.9266389608383179, "learning_rate": 2.7611272056887015e-05, "loss": 1.7722, "step": 17012 }, { "epoch": 
0.4480642612588886, "grad_norm": 2.281890630722046, "learning_rate": 2.760995522781143e-05, "loss": 1.9477, "step": 17013 }, { "epoch": 0.44809059784040034, "grad_norm": 3.4346511363983154, "learning_rate": 2.7608638398735846e-05, "loss": 1.1425, "step": 17014 }, { "epoch": 0.448116934421912, "grad_norm": 1.7877534627914429, "learning_rate": 2.7607321569660262e-05, "loss": 2.0013, "step": 17015 }, { "epoch": 0.44814327100342377, "grad_norm": 2.627500534057617, "learning_rate": 2.7606004740584674e-05, "loss": 1.4983, "step": 17016 }, { "epoch": 0.44816960758493546, "grad_norm": 2.3048906326293945, "learning_rate": 2.7604687911509086e-05, "loss": 1.6608, "step": 17017 }, { "epoch": 0.4481959441664472, "grad_norm": 1.485011339187622, "learning_rate": 2.76033710824335e-05, "loss": 1.573, "step": 17018 }, { "epoch": 0.4482222807479589, "grad_norm": 1.5875369310379028, "learning_rate": 2.7602054253357918e-05, "loss": 1.5857, "step": 17019 }, { "epoch": 0.44824861732947063, "grad_norm": 4.307639122009277, "learning_rate": 2.760073742428233e-05, "loss": 1.2746, "step": 17020 }, { "epoch": 0.4482749539109824, "grad_norm": 2.0398101806640625, "learning_rate": 2.7599420595206742e-05, "loss": 1.9623, "step": 17021 }, { "epoch": 0.44830129049249406, "grad_norm": 3.6416165828704834, "learning_rate": 2.7598103766131158e-05, "loss": 1.0117, "step": 17022 }, { "epoch": 0.4483276270740058, "grad_norm": 1.6380103826522827, "learning_rate": 2.7596786937055573e-05, "loss": 1.8844, "step": 17023 }, { "epoch": 0.4483539636555175, "grad_norm": 3.5386505126953125, "learning_rate": 2.759547010797999e-05, "loss": 1.7687, "step": 17024 }, { "epoch": 0.44838030023702924, "grad_norm": 2.216176748275757, "learning_rate": 2.75941532789044e-05, "loss": 0.6895, "step": 17025 }, { "epoch": 0.4484066368185409, "grad_norm": 1.5757381916046143, "learning_rate": 2.7592836449828813e-05, "loss": 1.4709, "step": 17026 }, { "epoch": 0.44843297340005267, "grad_norm": 2.5789008140563965, "learning_rate": 
2.7591519620753225e-05, "loss": 1.5665, "step": 17027 }, { "epoch": 0.4484593099815644, "grad_norm": 3.134542465209961, "learning_rate": 2.7590202791677644e-05, "loss": 1.4515, "step": 17028 }, { "epoch": 0.4484856465630761, "grad_norm": 2.4615089893341064, "learning_rate": 2.7588885962602057e-05, "loss": 1.675, "step": 17029 }, { "epoch": 0.44851198314458784, "grad_norm": 4.316629886627197, "learning_rate": 2.758756913352647e-05, "loss": 1.9403, "step": 17030 }, { "epoch": 0.44853831972609953, "grad_norm": 1.775210976600647, "learning_rate": 2.758625230445088e-05, "loss": 2.1424, "step": 17031 }, { "epoch": 0.4485646563076113, "grad_norm": 3.5381767749786377, "learning_rate": 2.7584935475375297e-05, "loss": 0.805, "step": 17032 }, { "epoch": 0.448590992889123, "grad_norm": 1.6372946500778198, "learning_rate": 2.7583618646299712e-05, "loss": 1.1744, "step": 17033 }, { "epoch": 0.4486173294706347, "grad_norm": 2.1658003330230713, "learning_rate": 2.7582301817224128e-05, "loss": 1.9981, "step": 17034 }, { "epoch": 0.44864366605214645, "grad_norm": 2.8261516094207764, "learning_rate": 2.758098498814854e-05, "loss": 0.9075, "step": 17035 }, { "epoch": 0.44867000263365814, "grad_norm": 2.115002393722534, "learning_rate": 2.7579668159072952e-05, "loss": 2.0283, "step": 17036 }, { "epoch": 0.4486963392151699, "grad_norm": 5.122161865234375, "learning_rate": 2.7578351329997364e-05, "loss": 2.2937, "step": 17037 }, { "epoch": 0.44872267579668157, "grad_norm": 1.5687413215637207, "learning_rate": 2.7577034500921783e-05, "loss": 1.5673, "step": 17038 }, { "epoch": 0.4487490123781933, "grad_norm": 2.282477378845215, "learning_rate": 2.7575717671846196e-05, "loss": 1.4989, "step": 17039 }, { "epoch": 0.44877534895970506, "grad_norm": 1.9963371753692627, "learning_rate": 2.7574400842770608e-05, "loss": 2.1559, "step": 17040 }, { "epoch": 0.44880168554121674, "grad_norm": 3.23002028465271, "learning_rate": 2.7573084013695023e-05, "loss": 1.5559, "step": 17041 }, { "epoch": 
0.4488280221227285, "grad_norm": 1.7360063791275024, "learning_rate": 2.757176718461944e-05, "loss": 2.0031, "step": 17042 }, { "epoch": 0.4488543587042402, "grad_norm": 3.6184260845184326, "learning_rate": 2.7570450355543855e-05, "loss": 1.2703, "step": 17043 }, { "epoch": 0.4488806952857519, "grad_norm": 1.872036337852478, "learning_rate": 2.7569133526468267e-05, "loss": 2.6207, "step": 17044 }, { "epoch": 0.4489070318672636, "grad_norm": 1.8810017108917236, "learning_rate": 2.756781669739268e-05, "loss": 1.5997, "step": 17045 }, { "epoch": 0.44893336844877535, "grad_norm": 2.8621761798858643, "learning_rate": 2.756649986831709e-05, "loss": 2.0346, "step": 17046 }, { "epoch": 0.4489597050302871, "grad_norm": 1.5929418802261353, "learning_rate": 2.756518303924151e-05, "loss": 2.2243, "step": 17047 }, { "epoch": 0.4489860416117988, "grad_norm": 2.0779407024383545, "learning_rate": 2.7563866210165922e-05, "loss": 1.8046, "step": 17048 }, { "epoch": 0.4490123781933105, "grad_norm": 1.6420633792877197, "learning_rate": 2.7562549381090335e-05, "loss": 1.7348, "step": 17049 }, { "epoch": 0.4490387147748222, "grad_norm": 2.041776418685913, "learning_rate": 2.7561232552014747e-05, "loss": 1.4654, "step": 17050 }, { "epoch": 0.44906505135633396, "grad_norm": 1.5560107231140137, "learning_rate": 2.7559915722939162e-05, "loss": 1.5134, "step": 17051 }, { "epoch": 0.44909138793784564, "grad_norm": 1.781523585319519, "learning_rate": 2.755859889386358e-05, "loss": 1.7816, "step": 17052 }, { "epoch": 0.4491177245193574, "grad_norm": 1.7064391374588013, "learning_rate": 2.7557282064787994e-05, "loss": 1.6485, "step": 17053 }, { "epoch": 0.44914406110086913, "grad_norm": 2.2161433696746826, "learning_rate": 2.7555965235712406e-05, "loss": 1.9475, "step": 17054 }, { "epoch": 0.4491703976823808, "grad_norm": 4.413087844848633, "learning_rate": 2.7554648406636818e-05, "loss": 2.0317, "step": 17055 }, { "epoch": 0.44919673426389256, "grad_norm": 5.025756359100342, "learning_rate": 
2.7553331577561237e-05, "loss": 1.2113, "step": 17056 }, { "epoch": 0.44922307084540425, "grad_norm": 1.4062490463256836, "learning_rate": 2.755201474848565e-05, "loss": 1.5441, "step": 17057 }, { "epoch": 0.449249407426916, "grad_norm": 2.8166022300720215, "learning_rate": 2.755069791941006e-05, "loss": 0.5119, "step": 17058 }, { "epoch": 0.4492757440084277, "grad_norm": 4.330456256866455, "learning_rate": 2.7549381090334474e-05, "loss": 0.634, "step": 17059 }, { "epoch": 0.4493020805899394, "grad_norm": 1.5672374963760376, "learning_rate": 2.754806426125889e-05, "loss": 0.8187, "step": 17060 }, { "epoch": 0.44932841717145117, "grad_norm": 2.9968671798706055, "learning_rate": 2.7546747432183305e-05, "loss": 1.5142, "step": 17061 }, { "epoch": 0.44935475375296285, "grad_norm": 3.419109344482422, "learning_rate": 2.754543060310772e-05, "loss": 2.0302, "step": 17062 }, { "epoch": 0.4493810903344746, "grad_norm": 1.6215554475784302, "learning_rate": 2.7544113774032133e-05, "loss": 1.7236, "step": 17063 }, { "epoch": 0.4494074269159863, "grad_norm": 2.2841813564300537, "learning_rate": 2.7542796944956545e-05, "loss": 1.6661, "step": 17064 }, { "epoch": 0.44943376349749803, "grad_norm": 2.1528613567352295, "learning_rate": 2.7541480115880957e-05, "loss": 1.6542, "step": 17065 }, { "epoch": 0.44946010007900977, "grad_norm": 3.6613404750823975, "learning_rate": 2.7540163286805376e-05, "loss": 1.2189, "step": 17066 }, { "epoch": 0.44948643666052146, "grad_norm": 2.3041701316833496, "learning_rate": 2.7538846457729788e-05, "loss": 2.2027, "step": 17067 }, { "epoch": 0.4495127732420332, "grad_norm": 1.9654216766357422, "learning_rate": 2.75375296286542e-05, "loss": 1.6361, "step": 17068 }, { "epoch": 0.4495391098235449, "grad_norm": 2.672168016433716, "learning_rate": 2.7536212799578616e-05, "loss": 2.4648, "step": 17069 }, { "epoch": 0.44956544640505663, "grad_norm": 1.6782050132751465, "learning_rate": 2.7534895970503028e-05, "loss": 1.8492, "step": 17070 }, { "epoch": 
0.4495917829865683, "grad_norm": 2.02051043510437, "learning_rate": 2.7533579141427447e-05, "loss": 0.6029, "step": 17071 }, { "epoch": 0.44961811956808007, "grad_norm": 4.1116862297058105, "learning_rate": 2.753226231235186e-05, "loss": 2.2187, "step": 17072 }, { "epoch": 0.4496444561495918, "grad_norm": 8.349372863769531, "learning_rate": 2.7530945483276272e-05, "loss": 1.546, "step": 17073 }, { "epoch": 0.4496707927311035, "grad_norm": 1.8985421657562256, "learning_rate": 2.7529628654200684e-05, "loss": 1.5, "step": 17074 }, { "epoch": 0.44969712931261524, "grad_norm": 4.587907791137695, "learning_rate": 2.7528311825125103e-05, "loss": 0.8677, "step": 17075 }, { "epoch": 0.44972346589412693, "grad_norm": 2.4465866088867188, "learning_rate": 2.7526994996049515e-05, "loss": 2.7048, "step": 17076 }, { "epoch": 0.44974980247563867, "grad_norm": 2.727853298187256, "learning_rate": 2.7525678166973927e-05, "loss": 0.6312, "step": 17077 }, { "epoch": 0.44977613905715036, "grad_norm": 3.176668405532837, "learning_rate": 2.752436133789834e-05, "loss": 1.3678, "step": 17078 }, { "epoch": 0.4498024756386621, "grad_norm": 2.50823974609375, "learning_rate": 2.7523044508822755e-05, "loss": 0.9341, "step": 17079 }, { "epoch": 0.44982881222017385, "grad_norm": 1.958359956741333, "learning_rate": 2.752172767974717e-05, "loss": 0.9493, "step": 17080 }, { "epoch": 0.44985514880168553, "grad_norm": 1.9944642782211304, "learning_rate": 2.7520410850671586e-05, "loss": 0.6526, "step": 17081 }, { "epoch": 0.4498814853831973, "grad_norm": 2.1743321418762207, "learning_rate": 2.7519094021596e-05, "loss": 2.2238, "step": 17082 }, { "epoch": 0.44990782196470896, "grad_norm": 3.004852294921875, "learning_rate": 2.751777719252041e-05, "loss": 1.8927, "step": 17083 }, { "epoch": 0.4499341585462207, "grad_norm": 1.7980796098709106, "learning_rate": 2.7516460363444823e-05, "loss": 0.5407, "step": 17084 }, { "epoch": 0.4499604951277324, "grad_norm": 1.8922024965286255, "learning_rate": 
2.7515143534369242e-05, "loss": 1.9667, "step": 17085 }, { "epoch": 0.44998683170924414, "grad_norm": 2.068293809890747, "learning_rate": 2.7513826705293654e-05, "loss": 0.4991, "step": 17086 }, { "epoch": 0.4500131682907559, "grad_norm": 2.132648468017578, "learning_rate": 2.7512509876218066e-05, "loss": 1.8514, "step": 17087 }, { "epoch": 0.45003950487226757, "grad_norm": 1.6560941934585571, "learning_rate": 2.7511193047142482e-05, "loss": 1.515, "step": 17088 }, { "epoch": 0.4500658414537793, "grad_norm": 1.729984164237976, "learning_rate": 2.7509876218066898e-05, "loss": 1.789, "step": 17089 }, { "epoch": 0.450092178035291, "grad_norm": 3.2434918880462646, "learning_rate": 2.7508559388991313e-05, "loss": 1.7042, "step": 17090 }, { "epoch": 0.45011851461680275, "grad_norm": 1.3891850709915161, "learning_rate": 2.7507242559915725e-05, "loss": 1.9058, "step": 17091 }, { "epoch": 0.45014485119831443, "grad_norm": 1.4968249797821045, "learning_rate": 2.7505925730840138e-05, "loss": 2.0176, "step": 17092 }, { "epoch": 0.4501711877798262, "grad_norm": 2.123619794845581, "learning_rate": 2.750460890176455e-05, "loss": 1.4141, "step": 17093 }, { "epoch": 0.4501975243613379, "grad_norm": 1.6175100803375244, "learning_rate": 2.750329207268897e-05, "loss": 1.7863, "step": 17094 }, { "epoch": 0.4502238609428496, "grad_norm": 2.0571160316467285, "learning_rate": 2.750197524361338e-05, "loss": 1.7893, "step": 17095 }, { "epoch": 0.45025019752436135, "grad_norm": 3.049384593963623, "learning_rate": 2.7500658414537793e-05, "loss": 1.7014, "step": 17096 }, { "epoch": 0.45027653410587304, "grad_norm": 4.013835430145264, "learning_rate": 2.749934158546221e-05, "loss": 0.8901, "step": 17097 }, { "epoch": 0.4503028706873848, "grad_norm": 2.2871170043945312, "learning_rate": 2.749802475638662e-05, "loss": 1.2354, "step": 17098 }, { "epoch": 0.45032920726889647, "grad_norm": 1.9408328533172607, "learning_rate": 2.749670792731104e-05, "loss": 1.8038, "step": 17099 }, { "epoch": 
0.4503555438504082, "grad_norm": 1.685389518737793, "learning_rate": 2.7495391098235452e-05, "loss": 2.1319, "step": 17100 }, { "epoch": 0.45038188043191996, "grad_norm": 2.0820672512054443, "learning_rate": 2.7494074269159864e-05, "loss": 1.7891, "step": 17101 }, { "epoch": 0.45040821701343164, "grad_norm": 1.918860673904419, "learning_rate": 2.7492757440084277e-05, "loss": 2.04, "step": 17102 }, { "epoch": 0.4504345535949434, "grad_norm": 3.4740636348724365, "learning_rate": 2.749144061100869e-05, "loss": 0.958, "step": 17103 }, { "epoch": 0.4504608901764551, "grad_norm": 3.9740121364593506, "learning_rate": 2.7490123781933108e-05, "loss": 1.8255, "step": 17104 }, { "epoch": 0.4504872267579668, "grad_norm": 2.113982915878296, "learning_rate": 2.748880695285752e-05, "loss": 1.3238, "step": 17105 }, { "epoch": 0.45051356333947856, "grad_norm": 4.4182047843933105, "learning_rate": 2.7487490123781932e-05, "loss": 0.8614, "step": 17106 }, { "epoch": 0.45053989992099025, "grad_norm": 1.969622254371643, "learning_rate": 2.7486173294706348e-05, "loss": 1.3336, "step": 17107 }, { "epoch": 0.450566236502502, "grad_norm": 2.5526673793792725, "learning_rate": 2.7484856465630763e-05, "loss": 1.6763, "step": 17108 }, { "epoch": 0.4505925730840137, "grad_norm": 5.080016136169434, "learning_rate": 2.748353963655518e-05, "loss": 1.8847, "step": 17109 }, { "epoch": 0.4506189096655254, "grad_norm": 3.4436593055725098, "learning_rate": 2.748222280747959e-05, "loss": 1.2964, "step": 17110 }, { "epoch": 0.4506452462470371, "grad_norm": 2.404585123062134, "learning_rate": 2.7480905978404003e-05, "loss": 2.4136, "step": 17111 }, { "epoch": 0.45067158282854886, "grad_norm": 1.9660687446594238, "learning_rate": 2.7479589149328416e-05, "loss": 2.5152, "step": 17112 }, { "epoch": 0.4506979194100606, "grad_norm": 1.7157256603240967, "learning_rate": 2.7478272320252835e-05, "loss": 1.9111, "step": 17113 }, { "epoch": 0.4507242559915723, "grad_norm": 1.777350902557373, "learning_rate": 
2.7476955491177247e-05, "loss": 1.5846, "step": 17114 }, { "epoch": 0.45075059257308403, "grad_norm": 1.5402629375457764, "learning_rate": 2.747563866210166e-05, "loss": 1.2306, "step": 17115 }, { "epoch": 0.4507769291545957, "grad_norm": 1.6992673873901367, "learning_rate": 2.7474321833026075e-05, "loss": 1.742, "step": 17116 }, { "epoch": 0.45080326573610746, "grad_norm": 2.3362605571746826, "learning_rate": 2.7473005003950487e-05, "loss": 2.2791, "step": 17117 }, { "epoch": 0.45082960231761915, "grad_norm": 1.3017711639404297, "learning_rate": 2.7471688174874906e-05, "loss": 0.3301, "step": 17118 }, { "epoch": 0.4508559388991309, "grad_norm": 2.409926414489746, "learning_rate": 2.7470371345799318e-05, "loss": 2.1464, "step": 17119 }, { "epoch": 0.45088227548064264, "grad_norm": 2.541557788848877, "learning_rate": 2.746905451672373e-05, "loss": 0.843, "step": 17120 }, { "epoch": 0.4509086120621543, "grad_norm": 1.480195164680481, "learning_rate": 2.7467737687648142e-05, "loss": 2.0305, "step": 17121 }, { "epoch": 0.45093494864366607, "grad_norm": 4.073674201965332, "learning_rate": 2.746642085857256e-05, "loss": 1.6269, "step": 17122 }, { "epoch": 0.45096128522517775, "grad_norm": 1.9352281093597412, "learning_rate": 2.7465104029496974e-05, "loss": 1.1902, "step": 17123 }, { "epoch": 0.4509876218066895, "grad_norm": 1.8534413576126099, "learning_rate": 2.7463787200421386e-05, "loss": 1.9605, "step": 17124 }, { "epoch": 0.4510139583882012, "grad_norm": 3.0895206928253174, "learning_rate": 2.74624703713458e-05, "loss": 0.9396, "step": 17125 }, { "epoch": 0.45104029496971293, "grad_norm": 3.2080650329589844, "learning_rate": 2.7461153542270214e-05, "loss": 1.5976, "step": 17126 }, { "epoch": 0.4510666315512247, "grad_norm": 1.801740050315857, "learning_rate": 2.7459836713194633e-05, "loss": 2.035, "step": 17127 }, { "epoch": 0.45109296813273636, "grad_norm": 2.3313302993774414, "learning_rate": 2.7458519884119045e-05, "loss": 1.7296, "step": 17128 }, { "epoch": 
0.4511193047142481, "grad_norm": 2.4068751335144043, "learning_rate": 2.7457203055043457e-05, "loss": 2.4214, "step": 17129 }, { "epoch": 0.4511456412957598, "grad_norm": 1.6548713445663452, "learning_rate": 2.745588622596787e-05, "loss": 1.3809, "step": 17130 }, { "epoch": 0.45117197787727154, "grad_norm": 1.804884433746338, "learning_rate": 2.745456939689228e-05, "loss": 1.3989, "step": 17131 }, { "epoch": 0.4511983144587832, "grad_norm": 2.324575424194336, "learning_rate": 2.74532525678167e-05, "loss": 1.8062, "step": 17132 }, { "epoch": 0.45122465104029497, "grad_norm": 2.1455283164978027, "learning_rate": 2.7451935738741113e-05, "loss": 0.7217, "step": 17133 }, { "epoch": 0.4512509876218067, "grad_norm": 2.6046602725982666, "learning_rate": 2.7450618909665525e-05, "loss": 1.4877, "step": 17134 }, { "epoch": 0.4512773242033184, "grad_norm": 2.62692928314209, "learning_rate": 2.744930208058994e-05, "loss": 1.9191, "step": 17135 }, { "epoch": 0.45130366078483014, "grad_norm": 1.8383461236953735, "learning_rate": 2.7447985251514353e-05, "loss": 1.6445, "step": 17136 }, { "epoch": 0.45132999736634183, "grad_norm": 2.3609025478363037, "learning_rate": 2.7446668422438772e-05, "loss": 2.0814, "step": 17137 }, { "epoch": 0.45135633394785357, "grad_norm": 4.005642890930176, "learning_rate": 2.7445351593363184e-05, "loss": 1.7183, "step": 17138 }, { "epoch": 0.4513826705293653, "grad_norm": 1.4061229228973389, "learning_rate": 2.7444034764287596e-05, "loss": 1.2178, "step": 17139 }, { "epoch": 0.451409007110877, "grad_norm": 2.076510190963745, "learning_rate": 2.744271793521201e-05, "loss": 2.2251, "step": 17140 }, { "epoch": 0.45143534369238875, "grad_norm": 2.146402597427368, "learning_rate": 2.7441401106136427e-05, "loss": 2.0432, "step": 17141 }, { "epoch": 0.45146168027390043, "grad_norm": 2.8456602096557617, "learning_rate": 2.744008427706084e-05, "loss": 0.889, "step": 17142 }, { "epoch": 0.4514880168554122, "grad_norm": 2.419118881225586, "learning_rate": 
2.7438767447985252e-05, "loss": 1.2221, "step": 17143 }, { "epoch": 0.45151435343692387, "grad_norm": 1.3562119007110596, "learning_rate": 2.7437450618909667e-05, "loss": 0.3291, "step": 17144 }, { "epoch": 0.4515406900184356, "grad_norm": 2.6415436267852783, "learning_rate": 2.743613378983408e-05, "loss": 1.3642, "step": 17145 }, { "epoch": 0.45156702659994735, "grad_norm": 1.6824222803115845, "learning_rate": 2.74348169607585e-05, "loss": 1.5869, "step": 17146 }, { "epoch": 0.45159336318145904, "grad_norm": 1.6827759742736816, "learning_rate": 2.743350013168291e-05, "loss": 1.4758, "step": 17147 }, { "epoch": 0.4516196997629708, "grad_norm": 3.5246641635894775, "learning_rate": 2.7432183302607323e-05, "loss": 2.3005, "step": 17148 }, { "epoch": 0.45164603634448247, "grad_norm": 2.109797954559326, "learning_rate": 2.7430866473531735e-05, "loss": 1.4442, "step": 17149 }, { "epoch": 0.4516723729259942, "grad_norm": 2.881183385848999, "learning_rate": 2.7429549644456147e-05, "loss": 1.8403, "step": 17150 }, { "epoch": 0.4516987095075059, "grad_norm": 3.3041598796844482, "learning_rate": 2.7428232815380566e-05, "loss": 1.2424, "step": 17151 }, { "epoch": 0.45172504608901765, "grad_norm": 3.8640763759613037, "learning_rate": 2.742691598630498e-05, "loss": 0.5971, "step": 17152 }, { "epoch": 0.4517513826705294, "grad_norm": 3.526885986328125, "learning_rate": 2.742559915722939e-05, "loss": 1.8192, "step": 17153 }, { "epoch": 0.4517777192520411, "grad_norm": 9.659627914428711, "learning_rate": 2.7424282328153806e-05, "loss": 1.1899, "step": 17154 }, { "epoch": 0.4518040558335528, "grad_norm": 2.479926347732544, "learning_rate": 2.7422965499078222e-05, "loss": 1.5591, "step": 17155 }, { "epoch": 0.4518303924150645, "grad_norm": 3.6006155014038086, "learning_rate": 2.7421648670002638e-05, "loss": 1.4015, "step": 17156 }, { "epoch": 0.45185672899657625, "grad_norm": 1.6525331735610962, "learning_rate": 2.742033184092705e-05, "loss": 1.9961, "step": 17157 }, { "epoch": 
0.45188306557808794, "grad_norm": 1.9206260442733765, "learning_rate": 2.7419015011851462e-05, "loss": 1.7597, "step": 17158 }, { "epoch": 0.4519094021595997, "grad_norm": 3.4888410568237305, "learning_rate": 2.7417698182775874e-05, "loss": 1.6017, "step": 17159 }, { "epoch": 0.4519357387411114, "grad_norm": 3.253868341445923, "learning_rate": 2.7416381353700293e-05, "loss": 1.4429, "step": 17160 }, { "epoch": 0.4519620753226231, "grad_norm": 1.461851716041565, "learning_rate": 2.7415064524624705e-05, "loss": 2.1199, "step": 17161 }, { "epoch": 0.45198841190413486, "grad_norm": 3.6245734691619873, "learning_rate": 2.7413747695549118e-05, "loss": 1.3559, "step": 17162 }, { "epoch": 0.45201474848564654, "grad_norm": 2.16041898727417, "learning_rate": 2.7412430866473533e-05, "loss": 1.7796, "step": 17163 }, { "epoch": 0.4520410850671583, "grad_norm": 2.043455123901367, "learning_rate": 2.7411114037397945e-05, "loss": 1.9891, "step": 17164 }, { "epoch": 0.45206742164867, "grad_norm": 1.6806464195251465, "learning_rate": 2.7409797208322364e-05, "loss": 1.8386, "step": 17165 }, { "epoch": 0.4520937582301817, "grad_norm": 1.6694210767745972, "learning_rate": 2.7408480379246777e-05, "loss": 1.8001, "step": 17166 }, { "epoch": 0.45212009481169346, "grad_norm": 1.7431142330169678, "learning_rate": 2.740716355017119e-05, "loss": 1.2596, "step": 17167 }, { "epoch": 0.45214643139320515, "grad_norm": 1.8233349323272705, "learning_rate": 2.74058467210956e-05, "loss": 1.8809, "step": 17168 }, { "epoch": 0.4521727679747169, "grad_norm": 1.9612683057785034, "learning_rate": 2.7404529892020013e-05, "loss": 2.1106, "step": 17169 }, { "epoch": 0.4521991045562286, "grad_norm": 5.293704509735107, "learning_rate": 2.7403213062944432e-05, "loss": 1.6231, "step": 17170 }, { "epoch": 0.4522254411377403, "grad_norm": 2.054372549057007, "learning_rate": 2.7401896233868844e-05, "loss": 1.7198, "step": 17171 }, { "epoch": 0.45225177771925207, "grad_norm": 2.097214937210083, "learning_rate": 
2.740057940479326e-05, "loss": 1.6886, "step": 17172 }, { "epoch": 0.45227811430076376, "grad_norm": 1.6854910850524902, "learning_rate": 2.7399262575717672e-05, "loss": 1.6825, "step": 17173 }, { "epoch": 0.4523044508822755, "grad_norm": 1.6675678491592407, "learning_rate": 2.739794574664209e-05, "loss": 1.4672, "step": 17174 }, { "epoch": 0.4523307874637872, "grad_norm": 2.632181167602539, "learning_rate": 2.7396628917566503e-05, "loss": 1.7371, "step": 17175 }, { "epoch": 0.45235712404529893, "grad_norm": 2.109461784362793, "learning_rate": 2.7395312088490916e-05, "loss": 1.8966, "step": 17176 }, { "epoch": 0.4523834606268106, "grad_norm": 2.290513277053833, "learning_rate": 2.7393995259415328e-05, "loss": 0.8599, "step": 17177 }, { "epoch": 0.45240979720832236, "grad_norm": 1.9919025897979736, "learning_rate": 2.739267843033974e-05, "loss": 0.7075, "step": 17178 }, { "epoch": 0.4524361337898341, "grad_norm": 2.0513458251953125, "learning_rate": 2.739136160126416e-05, "loss": 2.245, "step": 17179 }, { "epoch": 0.4524624703713458, "grad_norm": 2.7437376976013184, "learning_rate": 2.739004477218857e-05, "loss": 1.2284, "step": 17180 }, { "epoch": 0.45248880695285754, "grad_norm": 2.015800952911377, "learning_rate": 2.7388727943112983e-05, "loss": 1.4751, "step": 17181 }, { "epoch": 0.4525151435343692, "grad_norm": 1.7488065958023071, "learning_rate": 2.73874111140374e-05, "loss": 2.5183, "step": 17182 }, { "epoch": 0.45254148011588097, "grad_norm": 3.3676867485046387, "learning_rate": 2.738609428496181e-05, "loss": 1.5646, "step": 17183 }, { "epoch": 0.45256781669739266, "grad_norm": 1.915235996246338, "learning_rate": 2.738477745588623e-05, "loss": 1.5624, "step": 17184 }, { "epoch": 0.4525941532789044, "grad_norm": 1.3848034143447876, "learning_rate": 2.7383460626810642e-05, "loss": 1.2423, "step": 17185 }, { "epoch": 0.45262048986041614, "grad_norm": 3.018489122390747, "learning_rate": 2.7382143797735055e-05, "loss": 2.4648, "step": 17186 }, { "epoch": 
0.45264682644192783, "grad_norm": 1.512101650238037, "learning_rate": 2.7380826968659467e-05, "loss": 1.5661, "step": 17187 }, { "epoch": 0.4526731630234396, "grad_norm": 3.8546786308288574, "learning_rate": 2.7379510139583886e-05, "loss": 0.9695, "step": 17188 }, { "epoch": 0.45269949960495126, "grad_norm": 2.109097719192505, "learning_rate": 2.7378193310508298e-05, "loss": 1.5202, "step": 17189 }, { "epoch": 0.452725836186463, "grad_norm": 3.920694589614868, "learning_rate": 2.737687648143271e-05, "loss": 2.5446, "step": 17190 }, { "epoch": 0.4527521727679747, "grad_norm": 1.7120757102966309, "learning_rate": 2.7375559652357126e-05, "loss": 1.5632, "step": 17191 }, { "epoch": 0.45277850934948644, "grad_norm": 1.8904166221618652, "learning_rate": 2.7374242823281538e-05, "loss": 1.6217, "step": 17192 }, { "epoch": 0.4528048459309982, "grad_norm": 5.083609104156494, "learning_rate": 2.7372925994205957e-05, "loss": 2.3104, "step": 17193 }, { "epoch": 0.45283118251250987, "grad_norm": 1.9159715175628662, "learning_rate": 2.737160916513037e-05, "loss": 1.9583, "step": 17194 }, { "epoch": 0.4528575190940216, "grad_norm": 1.686370849609375, "learning_rate": 2.737029233605478e-05, "loss": 1.5409, "step": 17195 }, { "epoch": 0.4528838556755333, "grad_norm": 2.9615120887756348, "learning_rate": 2.7368975506979194e-05, "loss": 0.836, "step": 17196 }, { "epoch": 0.45291019225704504, "grad_norm": 2.1956305503845215, "learning_rate": 2.7367658677903606e-05, "loss": 1.726, "step": 17197 }, { "epoch": 0.45293652883855673, "grad_norm": 2.52850079536438, "learning_rate": 2.7366341848828025e-05, "loss": 0.328, "step": 17198 }, { "epoch": 0.4529628654200685, "grad_norm": 1.5329182147979736, "learning_rate": 2.7365025019752437e-05, "loss": 2.0223, "step": 17199 }, { "epoch": 0.4529892020015802, "grad_norm": 2.016688346862793, "learning_rate": 2.7363708190676853e-05, "loss": 1.279, "step": 17200 }, { "epoch": 0.4530155385830919, "grad_norm": 1.8355118036270142, "learning_rate": 
2.7362391361601265e-05, "loss": 1.5419, "step": 17201 }, { "epoch": 0.45304187516460365, "grad_norm": 1.8816572427749634, "learning_rate": 2.7361074532525677e-05, "loss": 1.8123, "step": 17202 }, { "epoch": 0.45306821174611533, "grad_norm": 1.9784739017486572, "learning_rate": 2.7359757703450096e-05, "loss": 1.827, "step": 17203 }, { "epoch": 0.4530945483276271, "grad_norm": 3.907982349395752, "learning_rate": 2.735844087437451e-05, "loss": 1.9803, "step": 17204 }, { "epoch": 0.4531208849091388, "grad_norm": 2.0782470703125, "learning_rate": 2.735712404529892e-05, "loss": 1.6839, "step": 17205 }, { "epoch": 0.4531472214906505, "grad_norm": 1.8123726844787598, "learning_rate": 2.7355807216223333e-05, "loss": 1.8258, "step": 17206 }, { "epoch": 0.45317355807216225, "grad_norm": 1.8910233974456787, "learning_rate": 2.7354490387147752e-05, "loss": 2.1613, "step": 17207 }, { "epoch": 0.45319989465367394, "grad_norm": 1.8334705829620361, "learning_rate": 2.7353173558072164e-05, "loss": 1.9071, "step": 17208 }, { "epoch": 0.4532262312351857, "grad_norm": 2.4042649269104004, "learning_rate": 2.7351856728996576e-05, "loss": 1.6075, "step": 17209 }, { "epoch": 0.45325256781669737, "grad_norm": 4.0542473793029785, "learning_rate": 2.7350539899920992e-05, "loss": 2.3992, "step": 17210 }, { "epoch": 0.4532789043982091, "grad_norm": 2.590764045715332, "learning_rate": 2.7349223070845404e-05, "loss": 1.7578, "step": 17211 }, { "epoch": 0.45330524097972086, "grad_norm": 2.123406171798706, "learning_rate": 2.7347906241769823e-05, "loss": 1.0294, "step": 17212 }, { "epoch": 0.45333157756123255, "grad_norm": 1.5296351909637451, "learning_rate": 2.7346589412694235e-05, "loss": 1.3012, "step": 17213 }, { "epoch": 0.4533579141427443, "grad_norm": 2.7377827167510986, "learning_rate": 2.7345272583618647e-05, "loss": 1.2522, "step": 17214 }, { "epoch": 0.453384250724256, "grad_norm": 2.5996670722961426, "learning_rate": 2.734395575454306e-05, "loss": 2.3996, "step": 17215 }, { "epoch": 
0.4534105873057677, "grad_norm": 2.616281509399414, "learning_rate": 2.7342638925467472e-05, "loss": 1.3469, "step": 17216 }, { "epoch": 0.4534369238872794, "grad_norm": 2.360745429992676, "learning_rate": 2.734132209639189e-05, "loss": 0.8809, "step": 17217 }, { "epoch": 0.45346326046879115, "grad_norm": 1.5613644123077393, "learning_rate": 2.7340005267316303e-05, "loss": 1.4582, "step": 17218 }, { "epoch": 0.4534895970503029, "grad_norm": 2.0324182510375977, "learning_rate": 2.733868843824072e-05, "loss": 1.5494, "step": 17219 }, { "epoch": 0.4535159336318146, "grad_norm": 3.080385208129883, "learning_rate": 2.733737160916513e-05, "loss": 1.1272, "step": 17220 }, { "epoch": 0.4535422702133263, "grad_norm": 4.435571670532227, "learning_rate": 2.733605478008955e-05, "loss": 1.8973, "step": 17221 }, { "epoch": 0.453568606794838, "grad_norm": 1.5928090810775757, "learning_rate": 2.7334737951013962e-05, "loss": 1.6691, "step": 17222 }, { "epoch": 0.45359494337634976, "grad_norm": 1.9368505477905273, "learning_rate": 2.7333421121938374e-05, "loss": 1.7477, "step": 17223 }, { "epoch": 0.45362127995786145, "grad_norm": 2.5404584407806396, "learning_rate": 2.7332104292862786e-05, "loss": 2.0429, "step": 17224 }, { "epoch": 0.4536476165393732, "grad_norm": 1.8371343612670898, "learning_rate": 2.73307874637872e-05, "loss": 1.9979, "step": 17225 }, { "epoch": 0.45367395312088493, "grad_norm": 2.2765278816223145, "learning_rate": 2.7329470634711618e-05, "loss": 1.1807, "step": 17226 }, { "epoch": 0.4537002897023966, "grad_norm": 2.139545202255249, "learning_rate": 2.732815380563603e-05, "loss": 1.8717, "step": 17227 }, { "epoch": 0.45372662628390836, "grad_norm": 1.8141471147537231, "learning_rate": 2.7326836976560445e-05, "loss": 2.3454, "step": 17228 }, { "epoch": 0.45375296286542005, "grad_norm": 6.5997161865234375, "learning_rate": 2.7325520147484858e-05, "loss": 1.2423, "step": 17229 }, { "epoch": 0.4537792994469318, "grad_norm": 2.7536022663116455, "learning_rate": 
2.732420331840927e-05, "loss": 1.9389, "step": 17230 }, { "epoch": 0.4538056360284435, "grad_norm": 1.8371065855026245, "learning_rate": 2.732288648933369e-05, "loss": 1.7975, "step": 17231 }, { "epoch": 0.4538319726099552, "grad_norm": 2.3389861583709717, "learning_rate": 2.73215696602581e-05, "loss": 1.3338, "step": 17232 }, { "epoch": 0.45385830919146697, "grad_norm": 2.5396571159362793, "learning_rate": 2.7320252831182513e-05, "loss": 2.6465, "step": 17233 }, { "epoch": 0.45388464577297866, "grad_norm": 2.511237382888794, "learning_rate": 2.7318936002106925e-05, "loss": 0.6535, "step": 17234 }, { "epoch": 0.4539109823544904, "grad_norm": 1.5506068468093872, "learning_rate": 2.7317619173031338e-05, "loss": 2.3247, "step": 17235 }, { "epoch": 0.4539373189360021, "grad_norm": 2.2335588932037354, "learning_rate": 2.7316302343955757e-05, "loss": 2.3492, "step": 17236 }, { "epoch": 0.45396365551751383, "grad_norm": 4.997518539428711, "learning_rate": 2.731498551488017e-05, "loss": 1.336, "step": 17237 }, { "epoch": 0.4539899920990255, "grad_norm": 3.5385055541992188, "learning_rate": 2.7313668685804584e-05, "loss": 1.5828, "step": 17238 }, { "epoch": 0.45401632868053726, "grad_norm": 2.7677905559539795, "learning_rate": 2.7312351856728997e-05, "loss": 1.4542, "step": 17239 }, { "epoch": 0.454042665262049, "grad_norm": 1.9444832801818848, "learning_rate": 2.7311035027653416e-05, "loss": 1.2108, "step": 17240 }, { "epoch": 0.4540690018435607, "grad_norm": 5.952491760253906, "learning_rate": 2.7309718198577828e-05, "loss": 1.1722, "step": 17241 }, { "epoch": 0.45409533842507244, "grad_norm": 1.6658071279525757, "learning_rate": 2.730840136950224e-05, "loss": 0.3918, "step": 17242 }, { "epoch": 0.4541216750065841, "grad_norm": 2.8430464267730713, "learning_rate": 2.7307084540426652e-05, "loss": 1.4477, "step": 17243 }, { "epoch": 0.45414801158809587, "grad_norm": 4.561838626861572, "learning_rate": 2.7305767711351064e-05, "loss": 0.9918, "step": 17244 }, { "epoch": 
0.4541743481696076, "grad_norm": 4.652225494384766, "learning_rate": 2.7304450882275483e-05, "loss": 1.3644, "step": 17245 }, { "epoch": 0.4542006847511193, "grad_norm": 2.326185941696167, "learning_rate": 2.7303134053199896e-05, "loss": 1.5605, "step": 17246 }, { "epoch": 0.45422702133263104, "grad_norm": 2.047363758087158, "learning_rate": 2.730181722412431e-05, "loss": 1.9757, "step": 17247 }, { "epoch": 0.45425335791414273, "grad_norm": 1.630068302154541, "learning_rate": 2.7300500395048723e-05, "loss": 1.8384, "step": 17248 }, { "epoch": 0.4542796944956545, "grad_norm": 1.7588789463043213, "learning_rate": 2.7299183565973136e-05, "loss": 0.6989, "step": 17249 }, { "epoch": 0.45430603107716616, "grad_norm": 4.718081951141357, "learning_rate": 2.7297866736897555e-05, "loss": 1.2469, "step": 17250 }, { "epoch": 0.4543323676586779, "grad_norm": 3.3542864322662354, "learning_rate": 2.7296549907821967e-05, "loss": 1.7339, "step": 17251 }, { "epoch": 0.45435870424018965, "grad_norm": 2.1271185874938965, "learning_rate": 2.729523307874638e-05, "loss": 1.877, "step": 17252 }, { "epoch": 0.45438504082170134, "grad_norm": 2.3422200679779053, "learning_rate": 2.729391624967079e-05, "loss": 1.3132, "step": 17253 }, { "epoch": 0.4544113774032131, "grad_norm": 1.6371201276779175, "learning_rate": 2.729259942059521e-05, "loss": 1.5208, "step": 17254 }, { "epoch": 0.45443771398472477, "grad_norm": 2.037470817565918, "learning_rate": 2.7291282591519622e-05, "loss": 1.7234, "step": 17255 }, { "epoch": 0.4544640505662365, "grad_norm": 4.676828861236572, "learning_rate": 2.7289965762444035e-05, "loss": 2.13, "step": 17256 }, { "epoch": 0.4544903871477482, "grad_norm": 2.1511476039886475, "learning_rate": 2.728864893336845e-05, "loss": 1.208, "step": 17257 }, { "epoch": 0.45451672372925994, "grad_norm": 1.7112656831741333, "learning_rate": 2.7287332104292863e-05, "loss": 1.1046, "step": 17258 }, { "epoch": 0.4545430603107717, "grad_norm": 1.607563853263855, "learning_rate": 
2.728601527521728e-05, "loss": 1.2543, "step": 17259 }, { "epoch": 0.4545693968922834, "grad_norm": 2.384552478790283, "learning_rate": 2.7284698446141694e-05, "loss": 1.6309, "step": 17260 }, { "epoch": 0.4545957334737951, "grad_norm": 4.566499710083008, "learning_rate": 2.7283381617066106e-05, "loss": 1.6098, "step": 17261 }, { "epoch": 0.4546220700553068, "grad_norm": 2.1666014194488525, "learning_rate": 2.7282064787990518e-05, "loss": 1.3774, "step": 17262 }, { "epoch": 0.45464840663681855, "grad_norm": 2.6022233963012695, "learning_rate": 2.728074795891493e-05, "loss": 1.7932, "step": 17263 }, { "epoch": 0.45467474321833024, "grad_norm": 2.702962875366211, "learning_rate": 2.727943112983935e-05, "loss": 1.159, "step": 17264 }, { "epoch": 0.454701079799842, "grad_norm": 3.4606845378875732, "learning_rate": 2.727811430076376e-05, "loss": 1.7029, "step": 17265 }, { "epoch": 0.4547274163813537, "grad_norm": 3.5899014472961426, "learning_rate": 2.7276797471688177e-05, "loss": 1.2461, "step": 17266 }, { "epoch": 0.4547537529628654, "grad_norm": 2.5236380100250244, "learning_rate": 2.727548064261259e-05, "loss": 2.1335, "step": 17267 }, { "epoch": 0.45478008954437715, "grad_norm": 2.2835521697998047, "learning_rate": 2.7274163813537e-05, "loss": 1.9793, "step": 17268 }, { "epoch": 0.45480642612588884, "grad_norm": 6.296321392059326, "learning_rate": 2.727284698446142e-05, "loss": 2.0144, "step": 17269 }, { "epoch": 0.4548327627074006, "grad_norm": 2.8443493843078613, "learning_rate": 2.7271530155385833e-05, "loss": 2.0607, "step": 17270 }, { "epoch": 0.45485909928891227, "grad_norm": 2.301936388015747, "learning_rate": 2.7270213326310245e-05, "loss": 2.331, "step": 17271 }, { "epoch": 0.454885435870424, "grad_norm": 2.5129923820495605, "learning_rate": 2.7268896497234657e-05, "loss": 0.8484, "step": 17272 }, { "epoch": 0.45491177245193576, "grad_norm": 1.8395601511001587, "learning_rate": 2.7267579668159076e-05, "loss": 0.8047, "step": 17273 }, { "epoch": 
0.45493810903344745, "grad_norm": 1.784755825996399, "learning_rate": 2.726626283908349e-05, "loss": 1.9117, "step": 17274 }, { "epoch": 0.4549644456149592, "grad_norm": 2.107417583465576, "learning_rate": 2.7264946010007904e-05, "loss": 0.4097, "step": 17275 }, { "epoch": 0.4549907821964709, "grad_norm": 1.615294098854065, "learning_rate": 2.7263629180932316e-05, "loss": 1.5856, "step": 17276 }, { "epoch": 0.4550171187779826, "grad_norm": 3.5520570278167725, "learning_rate": 2.726231235185673e-05, "loss": 1.2245, "step": 17277 }, { "epoch": 0.45504345535949436, "grad_norm": 1.9660377502441406, "learning_rate": 2.7260995522781147e-05, "loss": 2.0264, "step": 17278 }, { "epoch": 0.45506979194100605, "grad_norm": 2.0342516899108887, "learning_rate": 2.725967869370556e-05, "loss": 2.1092, "step": 17279 }, { "epoch": 0.4550961285225178, "grad_norm": 2.382925033569336, "learning_rate": 2.7258361864629972e-05, "loss": 1.9994, "step": 17280 }, { "epoch": 0.4551224651040295, "grad_norm": 2.6056907176971436, "learning_rate": 2.7257045035554384e-05, "loss": 1.5397, "step": 17281 }, { "epoch": 0.4551488016855412, "grad_norm": 2.2213563919067383, "learning_rate": 2.7255728206478796e-05, "loss": 1.6928, "step": 17282 }, { "epoch": 0.4551751382670529, "grad_norm": 6.961357116699219, "learning_rate": 2.7254411377403215e-05, "loss": 2.4899, "step": 17283 }, { "epoch": 0.45520147484856466, "grad_norm": 4.042150974273682, "learning_rate": 2.7253094548327627e-05, "loss": 1.0525, "step": 17284 }, { "epoch": 0.4552278114300764, "grad_norm": 2.232088088989258, "learning_rate": 2.7251777719252043e-05, "loss": 1.8718, "step": 17285 }, { "epoch": 0.4552541480115881, "grad_norm": 1.5000238418579102, "learning_rate": 2.7250460890176455e-05, "loss": 1.59, "step": 17286 }, { "epoch": 0.45528048459309983, "grad_norm": 3.8233962059020996, "learning_rate": 2.7249144061100874e-05, "loss": 1.535, "step": 17287 }, { "epoch": 0.4553068211746115, "grad_norm": 1.9231579303741455, "learning_rate": 
2.7247827232025286e-05, "loss": 1.8897, "step": 17288 }, { "epoch": 0.45533315775612326, "grad_norm": 1.9409904479980469, "learning_rate": 2.72465104029497e-05, "loss": 0.4959, "step": 17289 }, { "epoch": 0.45535949433763495, "grad_norm": 1.9427393674850464, "learning_rate": 2.724519357387411e-05, "loss": 1.8005, "step": 17290 }, { "epoch": 0.4553858309191467, "grad_norm": 3.4019124507904053, "learning_rate": 2.7243876744798523e-05, "loss": 1.533, "step": 17291 }, { "epoch": 0.45541216750065844, "grad_norm": 1.7695775032043457, "learning_rate": 2.7242559915722942e-05, "loss": 1.3177, "step": 17292 }, { "epoch": 0.4554385040821701, "grad_norm": 2.4496066570281982, "learning_rate": 2.7241243086647354e-05, "loss": 2.011, "step": 17293 }, { "epoch": 0.45546484066368187, "grad_norm": 4.076964855194092, "learning_rate": 2.723992625757177e-05, "loss": 1.7417, "step": 17294 }, { "epoch": 0.45549117724519356, "grad_norm": 1.7601439952850342, "learning_rate": 2.7238609428496182e-05, "loss": 1.1369, "step": 17295 }, { "epoch": 0.4555175138267053, "grad_norm": 4.183624744415283, "learning_rate": 2.7237292599420594e-05, "loss": 1.1913, "step": 17296 }, { "epoch": 0.455543850408217, "grad_norm": 2.683528423309326, "learning_rate": 2.7235975770345013e-05, "loss": 1.7364, "step": 17297 }, { "epoch": 0.45557018698972873, "grad_norm": 2.3656113147735596, "learning_rate": 2.7234658941269425e-05, "loss": 2.0034, "step": 17298 }, { "epoch": 0.4555965235712405, "grad_norm": 2.0474541187286377, "learning_rate": 2.7233342112193838e-05, "loss": 0.9772, "step": 17299 }, { "epoch": 0.45562286015275216, "grad_norm": 2.562882900238037, "learning_rate": 2.723202528311825e-05, "loss": 2.0706, "step": 17300 }, { "epoch": 0.4556491967342639, "grad_norm": 2.677858829498291, "learning_rate": 2.7230708454042665e-05, "loss": 0.5187, "step": 17301 }, { "epoch": 0.4556755333157756, "grad_norm": 4.931310653686523, "learning_rate": 2.722939162496708e-05, "loss": 1.8102, "step": 17302 }, { "epoch": 
0.45570186989728734, "grad_norm": 1.7191283702850342, "learning_rate": 2.7228074795891497e-05, "loss": 1.812, "step": 17303 }, { "epoch": 0.455728206478799, "grad_norm": 6.109428405761719, "learning_rate": 2.722675796681591e-05, "loss": 1.0289, "step": 17304 }, { "epoch": 0.45575454306031077, "grad_norm": 1.7250807285308838, "learning_rate": 2.722544113774032e-05, "loss": 1.7173, "step": 17305 }, { "epoch": 0.4557808796418225, "grad_norm": 1.4788899421691895, "learning_rate": 2.722412430866474e-05, "loss": 1.8731, "step": 17306 }, { "epoch": 0.4558072162233342, "grad_norm": 2.513375997543335, "learning_rate": 2.7222807479589152e-05, "loss": 1.8095, "step": 17307 }, { "epoch": 0.45583355280484594, "grad_norm": 2.1410164833068848, "learning_rate": 2.7221490650513564e-05, "loss": 1.6, "step": 17308 }, { "epoch": 0.45585988938635763, "grad_norm": 1.7630873918533325, "learning_rate": 2.7220173821437977e-05, "loss": 0.6304, "step": 17309 }, { "epoch": 0.4558862259678694, "grad_norm": 1.8701564073562622, "learning_rate": 2.721885699236239e-05, "loss": 2.7404, "step": 17310 }, { "epoch": 0.4559125625493811, "grad_norm": 1.9941834211349487, "learning_rate": 2.7217540163286808e-05, "loss": 1.8618, "step": 17311 }, { "epoch": 0.4559388991308928, "grad_norm": 1.7726746797561646, "learning_rate": 2.721622333421122e-05, "loss": 0.3673, "step": 17312 }, { "epoch": 0.45596523571240455, "grad_norm": 1.9800612926483154, "learning_rate": 2.7214906505135636e-05, "loss": 2.0318, "step": 17313 }, { "epoch": 0.45599157229391624, "grad_norm": 1.7566994428634644, "learning_rate": 2.7213589676060048e-05, "loss": 0.436, "step": 17314 }, { "epoch": 0.456017908875428, "grad_norm": 2.314666748046875, "learning_rate": 2.721227284698446e-05, "loss": 1.8503, "step": 17315 }, { "epoch": 0.45604424545693967, "grad_norm": 3.5411014556884766, "learning_rate": 2.721095601790888e-05, "loss": 1.3966, "step": 17316 }, { "epoch": 0.4560705820384514, "grad_norm": 2.390590190887451, "learning_rate": 
2.720963918883329e-05, "loss": 1.0384, "step": 17317 }, { "epoch": 0.45609691861996315, "grad_norm": 2.4419076442718506, "learning_rate": 2.7208322359757704e-05, "loss": 1.5649, "step": 17318 }, { "epoch": 0.45612325520147484, "grad_norm": 1.7367279529571533, "learning_rate": 2.7207005530682116e-05, "loss": 1.7707, "step": 17319 }, { "epoch": 0.4561495917829866, "grad_norm": 1.8437200784683228, "learning_rate": 2.7205688701606535e-05, "loss": 1.5748, "step": 17320 }, { "epoch": 0.4561759283644983, "grad_norm": 1.2369630336761475, "learning_rate": 2.7204371872530947e-05, "loss": 1.1754, "step": 17321 }, { "epoch": 0.45620226494601, "grad_norm": 1.8583474159240723, "learning_rate": 2.7203055043455363e-05, "loss": 1.6233, "step": 17322 }, { "epoch": 0.4562286015275217, "grad_norm": 2.628200054168701, "learning_rate": 2.7201738214379775e-05, "loss": 1.5336, "step": 17323 }, { "epoch": 0.45625493810903345, "grad_norm": 2.8460469245910645, "learning_rate": 2.7200421385304187e-05, "loss": 2.0944, "step": 17324 }, { "epoch": 0.4562812746905452, "grad_norm": 2.3785526752471924, "learning_rate": 2.7199104556228606e-05, "loss": 1.9822, "step": 17325 }, { "epoch": 0.4563076112720569, "grad_norm": 2.2259302139282227, "learning_rate": 2.7197787727153018e-05, "loss": 1.3858, "step": 17326 }, { "epoch": 0.4563339478535686, "grad_norm": 1.6828503608703613, "learning_rate": 2.719647089807743e-05, "loss": 1.0838, "step": 17327 }, { "epoch": 0.4563602844350803, "grad_norm": 1.918329119682312, "learning_rate": 2.7195154069001843e-05, "loss": 2.2113, "step": 17328 }, { "epoch": 0.45638662101659205, "grad_norm": 1.6512056589126587, "learning_rate": 2.7193837239926255e-05, "loss": 0.4725, "step": 17329 }, { "epoch": 0.45641295759810374, "grad_norm": 1.6636810302734375, "learning_rate": 2.7192520410850674e-05, "loss": 2.1088, "step": 17330 }, { "epoch": 0.4564392941796155, "grad_norm": 4.567899703979492, "learning_rate": 2.7191203581775086e-05, "loss": 1.3406, "step": 17331 }, { "epoch": 
0.45646563076112723, "grad_norm": 4.268378734588623, "learning_rate": 2.71898867526995e-05, "loss": 1.2136, "step": 17332 }, { "epoch": 0.4564919673426389, "grad_norm": 4.233364582061768, "learning_rate": 2.7188569923623914e-05, "loss": 1.8267, "step": 17333 }, { "epoch": 0.45651830392415066, "grad_norm": 1.507789134979248, "learning_rate": 2.7187253094548333e-05, "loss": 2.28, "step": 17334 }, { "epoch": 0.45654464050566235, "grad_norm": 4.422889232635498, "learning_rate": 2.7185936265472745e-05, "loss": 1.8436, "step": 17335 }, { "epoch": 0.4565709770871741, "grad_norm": 1.651748538017273, "learning_rate": 2.7184619436397157e-05, "loss": 1.9707, "step": 17336 }, { "epoch": 0.4565973136686858, "grad_norm": 5.289831161499023, "learning_rate": 2.718330260732157e-05, "loss": 2.3291, "step": 17337 }, { "epoch": 0.4566236502501975, "grad_norm": 4.125948905944824, "learning_rate": 2.718198577824598e-05, "loss": 0.7519, "step": 17338 }, { "epoch": 0.45664998683170926, "grad_norm": 1.4943476915359497, "learning_rate": 2.71806689491704e-05, "loss": 1.7798, "step": 17339 }, { "epoch": 0.45667632341322095, "grad_norm": 2.3546130657196045, "learning_rate": 2.7179352120094813e-05, "loss": 1.9694, "step": 17340 }, { "epoch": 0.4567026599947327, "grad_norm": 4.659780025482178, "learning_rate": 2.717803529101923e-05, "loss": 0.3928, "step": 17341 }, { "epoch": 0.4567289965762444, "grad_norm": 2.033904552459717, "learning_rate": 2.717671846194364e-05, "loss": 1.0258, "step": 17342 }, { "epoch": 0.4567553331577561, "grad_norm": 1.9254486560821533, "learning_rate": 2.7175401632868053e-05, "loss": 0.7842, "step": 17343 }, { "epoch": 0.45678166973926787, "grad_norm": 3.2039225101470947, "learning_rate": 2.7174084803792472e-05, "loss": 1.5024, "step": 17344 }, { "epoch": 0.45680800632077956, "grad_norm": 4.272639751434326, "learning_rate": 2.7172767974716884e-05, "loss": 1.0394, "step": 17345 }, { "epoch": 0.4568343429022913, "grad_norm": 3.3715627193450928, "learning_rate": 
2.7171451145641296e-05, "loss": 1.906, "step": 17346 }, { "epoch": 0.456860679483803, "grad_norm": 3.528592586517334, "learning_rate": 2.717013431656571e-05, "loss": 1.0343, "step": 17347 }, { "epoch": 0.45688701606531473, "grad_norm": 6.667778491973877, "learning_rate": 2.7168817487490124e-05, "loss": 2.2159, "step": 17348 }, { "epoch": 0.4569133526468264, "grad_norm": 2.012467622756958, "learning_rate": 2.716750065841454e-05, "loss": 2.0511, "step": 17349 }, { "epoch": 0.45693968922833816, "grad_norm": 5.494542598724365, "learning_rate": 2.7166183829338955e-05, "loss": 1.0639, "step": 17350 }, { "epoch": 0.4569660258098499, "grad_norm": 2.0097978115081787, "learning_rate": 2.7164867000263367e-05, "loss": 0.7831, "step": 17351 }, { "epoch": 0.4569923623913616, "grad_norm": 1.4346563816070557, "learning_rate": 2.716355017118778e-05, "loss": 0.416, "step": 17352 }, { "epoch": 0.45701869897287334, "grad_norm": 3.5429351329803467, "learning_rate": 2.71622333421122e-05, "loss": 1.9427, "step": 17353 }, { "epoch": 0.457045035554385, "grad_norm": 1.6483004093170166, "learning_rate": 2.716091651303661e-05, "loss": 1.8775, "step": 17354 }, { "epoch": 0.45707137213589677, "grad_norm": 3.4772427082061768, "learning_rate": 2.7159599683961023e-05, "loss": 1.2938, "step": 17355 }, { "epoch": 0.45709770871740846, "grad_norm": 2.2466113567352295, "learning_rate": 2.7158282854885435e-05, "loss": 1.8039, "step": 17356 }, { "epoch": 0.4571240452989202, "grad_norm": 3.016786575317383, "learning_rate": 2.7156966025809847e-05, "loss": 1.81, "step": 17357 }, { "epoch": 0.45715038188043194, "grad_norm": 1.6325918436050415, "learning_rate": 2.7155649196734266e-05, "loss": 1.2971, "step": 17358 }, { "epoch": 0.45717671846194363, "grad_norm": 4.765496253967285, "learning_rate": 2.715433236765868e-05, "loss": 1.2005, "step": 17359 }, { "epoch": 0.4572030550434554, "grad_norm": 3.836104393005371, "learning_rate": 2.7153015538583094e-05, "loss": 1.8405, "step": 17360 }, { "epoch": 
0.45722939162496706, "grad_norm": 1.4749449491500854, "learning_rate": 2.7151698709507506e-05, "loss": 2.0393, "step": 17361 }, { "epoch": 0.4572557282064788, "grad_norm": 3.033538818359375, "learning_rate": 2.715038188043192e-05, "loss": 0.9233, "step": 17362 }, { "epoch": 0.4572820647879905, "grad_norm": 1.9844825267791748, "learning_rate": 2.7149065051356338e-05, "loss": 0.5922, "step": 17363 }, { "epoch": 0.45730840136950224, "grad_norm": 3.0275564193725586, "learning_rate": 2.714774822228075e-05, "loss": 1.7589, "step": 17364 }, { "epoch": 0.457334737951014, "grad_norm": 2.288918972015381, "learning_rate": 2.7146431393205162e-05, "loss": 1.1355, "step": 17365 }, { "epoch": 0.45736107453252567, "grad_norm": 1.8926944732666016, "learning_rate": 2.7145114564129574e-05, "loss": 1.7833, "step": 17366 }, { "epoch": 0.4573874111140374, "grad_norm": 1.709686517715454, "learning_rate": 2.7143797735053993e-05, "loss": 2.2145, "step": 17367 }, { "epoch": 0.4574137476955491, "grad_norm": 1.7107502222061157, "learning_rate": 2.7142480905978405e-05, "loss": 2.0242, "step": 17368 }, { "epoch": 0.45744008427706084, "grad_norm": 1.6546893119812012, "learning_rate": 2.714116407690282e-05, "loss": 1.8925, "step": 17369 }, { "epoch": 0.45746642085857253, "grad_norm": 3.4109745025634766, "learning_rate": 2.7139847247827233e-05, "loss": 1.1984, "step": 17370 }, { "epoch": 0.4574927574400843, "grad_norm": 1.6220732927322388, "learning_rate": 2.7138530418751645e-05, "loss": 2.0254, "step": 17371 }, { "epoch": 0.457519094021596, "grad_norm": 3.6341445446014404, "learning_rate": 2.7137213589676064e-05, "loss": 1.8416, "step": 17372 }, { "epoch": 0.4575454306031077, "grad_norm": 2.507774591445923, "learning_rate": 2.7135896760600477e-05, "loss": 2.9992, "step": 17373 }, { "epoch": 0.45757176718461945, "grad_norm": 2.6956207752227783, "learning_rate": 2.713457993152489e-05, "loss": 2.029, "step": 17374 }, { "epoch": 0.45759810376613114, "grad_norm": 2.8928754329681396, "learning_rate": 
2.71332631024493e-05, "loss": 1.6337, "step": 17375 }, { "epoch": 0.4576244403476429, "grad_norm": 2.1158711910247803, "learning_rate": 2.7131946273373717e-05, "loss": 2.5666, "step": 17376 }, { "epoch": 0.4576507769291546, "grad_norm": 2.768433094024658, "learning_rate": 2.7130629444298132e-05, "loss": 1.2013, "step": 17377 }, { "epoch": 0.4576771135106663, "grad_norm": 1.7517542839050293, "learning_rate": 2.7129312615222548e-05, "loss": 1.5358, "step": 17378 }, { "epoch": 0.45770345009217805, "grad_norm": 2.7721211910247803, "learning_rate": 2.712799578614696e-05, "loss": 1.983, "step": 17379 }, { "epoch": 0.45772978667368974, "grad_norm": 2.01784348487854, "learning_rate": 2.7126678957071372e-05, "loss": 0.702, "step": 17380 }, { "epoch": 0.4577561232552015, "grad_norm": 3.2808613777160645, "learning_rate": 2.7125362127995785e-05, "loss": 1.3785, "step": 17381 }, { "epoch": 0.4577824598367132, "grad_norm": 4.0102057456970215, "learning_rate": 2.7124045298920204e-05, "loss": 0.9703, "step": 17382 }, { "epoch": 0.4578087964182249, "grad_norm": 1.7842707633972168, "learning_rate": 2.7122728469844616e-05, "loss": 1.9043, "step": 17383 }, { "epoch": 0.45783513299973666, "grad_norm": 3.1047117710113525, "learning_rate": 2.7121411640769028e-05, "loss": 1.136, "step": 17384 }, { "epoch": 0.45786146958124835, "grad_norm": 1.8260467052459717, "learning_rate": 2.712009481169344e-05, "loss": 1.5915, "step": 17385 }, { "epoch": 0.4578878061627601, "grad_norm": 1.6174358129501343, "learning_rate": 2.711877798261786e-05, "loss": 2.2055, "step": 17386 }, { "epoch": 0.4579141427442718, "grad_norm": 1.6015636920928955, "learning_rate": 2.711746115354227e-05, "loss": 2.1223, "step": 17387 }, { "epoch": 0.4579404793257835, "grad_norm": 2.2054030895233154, "learning_rate": 2.7116144324466687e-05, "loss": 2.1959, "step": 17388 }, { "epoch": 0.4579668159072952, "grad_norm": 3.391774892807007, "learning_rate": 2.71148274953911e-05, "loss": 1.6875, "step": 17389 }, { "epoch": 
0.45799315248880695, "grad_norm": 1.5481411218643188, "learning_rate": 2.711351066631551e-05, "loss": 2.0179, "step": 17390 }, { "epoch": 0.4580194890703187, "grad_norm": 2.3364076614379883, "learning_rate": 2.711219383723993e-05, "loss": 2.3304, "step": 17391 }, { "epoch": 0.4580458256518304, "grad_norm": 1.7283635139465332, "learning_rate": 2.7110877008164343e-05, "loss": 1.9451, "step": 17392 }, { "epoch": 0.45807216223334213, "grad_norm": 2.9483442306518555, "learning_rate": 2.7109560179088755e-05, "loss": 1.1101, "step": 17393 }, { "epoch": 0.4580984988148538, "grad_norm": 1.640470027923584, "learning_rate": 2.7108243350013167e-05, "loss": 1.5166, "step": 17394 }, { "epoch": 0.45812483539636556, "grad_norm": 1.80405592918396, "learning_rate": 2.7106926520937583e-05, "loss": 1.8124, "step": 17395 }, { "epoch": 0.45815117197787725, "grad_norm": 3.0634729862213135, "learning_rate": 2.7105609691861998e-05, "loss": 1.051, "step": 17396 }, { "epoch": 0.458177508559389, "grad_norm": 1.7485488653182983, "learning_rate": 2.7104292862786414e-05, "loss": 1.4824, "step": 17397 }, { "epoch": 0.45820384514090073, "grad_norm": 2.075005292892456, "learning_rate": 2.7102976033710826e-05, "loss": 1.8233, "step": 17398 }, { "epoch": 0.4582301817224124, "grad_norm": 3.7069613933563232, "learning_rate": 2.7101659204635238e-05, "loss": 2.0435, "step": 17399 }, { "epoch": 0.45825651830392417, "grad_norm": 2.6145107746124268, "learning_rate": 2.7100342375559657e-05, "loss": 1.1847, "step": 17400 }, { "epoch": 0.45828285488543585, "grad_norm": 2.9730470180511475, "learning_rate": 2.709902554648407e-05, "loss": 1.0361, "step": 17401 }, { "epoch": 0.4583091914669476, "grad_norm": 1.979385495185852, "learning_rate": 2.709770871740848e-05, "loss": 1.4995, "step": 17402 }, { "epoch": 0.4583355280484593, "grad_norm": 4.62583065032959, "learning_rate": 2.7096391888332894e-05, "loss": 1.3785, "step": 17403 }, { "epoch": 0.458361864629971, "grad_norm": 1.8206110000610352, "learning_rate": 
2.709507505925731e-05, "loss": 1.7719, "step": 17404 }, { "epoch": 0.45838820121148277, "grad_norm": 3.4109480381011963, "learning_rate": 2.7093758230181725e-05, "loss": 1.2555, "step": 17405 }, { "epoch": 0.45841453779299446, "grad_norm": 5.451360702514648, "learning_rate": 2.709244140110614e-05, "loss": 2.0528, "step": 17406 }, { "epoch": 0.4584408743745062, "grad_norm": 2.0876517295837402, "learning_rate": 2.7091124572030553e-05, "loss": 1.3576, "step": 17407 }, { "epoch": 0.4584672109560179, "grad_norm": 2.019777536392212, "learning_rate": 2.7089807742954965e-05, "loss": 1.7, "step": 17408 }, { "epoch": 0.45849354753752963, "grad_norm": 1.627122402191162, "learning_rate": 2.7088490913879377e-05, "loss": 2.1738, "step": 17409 }, { "epoch": 0.4585198841190413, "grad_norm": 2.301504611968994, "learning_rate": 2.7087174084803796e-05, "loss": 1.9525, "step": 17410 }, { "epoch": 0.45854622070055306, "grad_norm": 2.6174991130828857, "learning_rate": 2.708585725572821e-05, "loss": 1.3099, "step": 17411 }, { "epoch": 0.4585725572820648, "grad_norm": 2.4257068634033203, "learning_rate": 2.708454042665262e-05, "loss": 1.1037, "step": 17412 }, { "epoch": 0.4585988938635765, "grad_norm": 2.256848096847534, "learning_rate": 2.7083223597577033e-05, "loss": 2.1147, "step": 17413 }, { "epoch": 0.45862523044508824, "grad_norm": 1.6314328908920288, "learning_rate": 2.708190676850145e-05, "loss": 1.7321, "step": 17414 }, { "epoch": 0.4586515670265999, "grad_norm": 1.7508454322814941, "learning_rate": 2.7080589939425864e-05, "loss": 0.6495, "step": 17415 }, { "epoch": 0.45867790360811167, "grad_norm": 2.508268356323242, "learning_rate": 2.707927311035028e-05, "loss": 1.9909, "step": 17416 }, { "epoch": 0.4587042401896234, "grad_norm": 2.0246963500976562, "learning_rate": 2.7077956281274692e-05, "loss": 1.5806, "step": 17417 }, { "epoch": 0.4587305767711351, "grad_norm": 1.9890328645706177, "learning_rate": 2.7076639452199104e-05, "loss": 1.4487, "step": 17418 }, { "epoch": 
0.45875691335264684, "grad_norm": 2.3599395751953125, "learning_rate": 2.7075322623123523e-05, "loss": 0.6264, "step": 17419 }, { "epoch": 0.45878324993415853, "grad_norm": 2.685049533843994, "learning_rate": 2.7074005794047935e-05, "loss": 0.3283, "step": 17420 }, { "epoch": 0.4588095865156703, "grad_norm": 2.487574815750122, "learning_rate": 2.7072688964972347e-05, "loss": 2.1188, "step": 17421 }, { "epoch": 0.45883592309718196, "grad_norm": 2.0520687103271484, "learning_rate": 2.707137213589676e-05, "loss": 1.9028, "step": 17422 }, { "epoch": 0.4588622596786937, "grad_norm": 1.7135155200958252, "learning_rate": 2.7070055306821175e-05, "loss": 2.0268, "step": 17423 }, { "epoch": 0.45888859626020545, "grad_norm": 2.03914213180542, "learning_rate": 2.706873847774559e-05, "loss": 1.9026, "step": 17424 }, { "epoch": 0.45891493284171714, "grad_norm": 3.514350175857544, "learning_rate": 2.7067421648670006e-05, "loss": 1.1091, "step": 17425 }, { "epoch": 0.4589412694232289, "grad_norm": 3.3137736320495605, "learning_rate": 2.706610481959442e-05, "loss": 1.9172, "step": 17426 }, { "epoch": 0.45896760600474057, "grad_norm": 2.468156099319458, "learning_rate": 2.706478799051883e-05, "loss": 2.3359, "step": 17427 }, { "epoch": 0.4589939425862523, "grad_norm": 2.0728931427001953, "learning_rate": 2.7063471161443243e-05, "loss": 2.3017, "step": 17428 }, { "epoch": 0.459020279167764, "grad_norm": 2.697718620300293, "learning_rate": 2.7062154332367662e-05, "loss": 1.3245, "step": 17429 }, { "epoch": 0.45904661574927574, "grad_norm": 3.767880439758301, "learning_rate": 2.7060837503292074e-05, "loss": 1.7804, "step": 17430 }, { "epoch": 0.4590729523307875, "grad_norm": 1.5344229936599731, "learning_rate": 2.7059520674216486e-05, "loss": 1.4511, "step": 17431 }, { "epoch": 0.4590992889122992, "grad_norm": 2.57387113571167, "learning_rate": 2.70582038451409e-05, "loss": 1.6768, "step": 17432 }, { "epoch": 0.4591256254938109, "grad_norm": 2.7254576683044434, "learning_rate": 
2.7056887016065318e-05, "loss": 1.6601, "step": 17433 }, { "epoch": 0.4591519620753226, "grad_norm": 1.8453673124313354, "learning_rate": 2.705557018698973e-05, "loss": 1.3177, "step": 17434 }, { "epoch": 0.45917829865683435, "grad_norm": 2.5454792976379395, "learning_rate": 2.7054253357914145e-05, "loss": 2.777, "step": 17435 }, { "epoch": 0.45920463523834604, "grad_norm": 1.5997931957244873, "learning_rate": 2.7052936528838558e-05, "loss": 0.6221, "step": 17436 }, { "epoch": 0.4592309718198578, "grad_norm": 2.054805040359497, "learning_rate": 2.705161969976297e-05, "loss": 1.8622, "step": 17437 }, { "epoch": 0.4592573084013695, "grad_norm": 1.8314502239227295, "learning_rate": 2.705030287068739e-05, "loss": 1.6375, "step": 17438 }, { "epoch": 0.4592836449828812, "grad_norm": 1.7336448431015015, "learning_rate": 2.70489860416118e-05, "loss": 1.9737, "step": 17439 }, { "epoch": 0.45930998156439296, "grad_norm": 2.5273306369781494, "learning_rate": 2.7047669212536213e-05, "loss": 0.7548, "step": 17440 }, { "epoch": 0.45933631814590464, "grad_norm": 3.0589494705200195, "learning_rate": 2.7046352383460626e-05, "loss": 1.5871, "step": 17441 }, { "epoch": 0.4593626547274164, "grad_norm": 1.9810404777526855, "learning_rate": 2.704503555438504e-05, "loss": 1.9559, "step": 17442 }, { "epoch": 0.4593889913089281, "grad_norm": 2.150716543197632, "learning_rate": 2.7043718725309457e-05, "loss": 1.8882, "step": 17443 }, { "epoch": 0.4594153278904398, "grad_norm": 2.692133903503418, "learning_rate": 2.7042401896233872e-05, "loss": 1.4891, "step": 17444 }, { "epoch": 0.45944166447195156, "grad_norm": 1.7243131399154663, "learning_rate": 2.7041085067158285e-05, "loss": 1.8789, "step": 17445 }, { "epoch": 0.45946800105346325, "grad_norm": 1.5809563398361206, "learning_rate": 2.7039768238082697e-05, "loss": 1.55, "step": 17446 }, { "epoch": 0.459494337634975, "grad_norm": 1.7917300462722778, "learning_rate": 2.703845140900711e-05, "loss": 2.1017, "step": 17447 }, { "epoch": 
0.4595206742164867, "grad_norm": 3.135422468185425, "learning_rate": 2.7037134579931528e-05, "loss": 1.8175, "step": 17448 }, { "epoch": 0.4595470107979984, "grad_norm": 2.8379335403442383, "learning_rate": 2.703581775085594e-05, "loss": 0.8343, "step": 17449 }, { "epoch": 0.45957334737951017, "grad_norm": 3.434492349624634, "learning_rate": 2.7034500921780352e-05, "loss": 2.0049, "step": 17450 }, { "epoch": 0.45959968396102185, "grad_norm": 3.260800361633301, "learning_rate": 2.7033184092704768e-05, "loss": 1.8594, "step": 17451 }, { "epoch": 0.4596260205425336, "grad_norm": 2.1037139892578125, "learning_rate": 2.7031867263629184e-05, "loss": 2.1083, "step": 17452 }, { "epoch": 0.4596523571240453, "grad_norm": 1.7813032865524292, "learning_rate": 2.70305504345536e-05, "loss": 2.3705, "step": 17453 }, { "epoch": 0.45967869370555703, "grad_norm": 3.1875293254852295, "learning_rate": 2.702923360547801e-05, "loss": 0.6947, "step": 17454 }, { "epoch": 0.4597050302870687, "grad_norm": 3.766479730606079, "learning_rate": 2.7027916776402424e-05, "loss": 1.2198, "step": 17455 }, { "epoch": 0.45973136686858046, "grad_norm": 3.2631001472473145, "learning_rate": 2.7026599947326836e-05, "loss": 2.3352, "step": 17456 }, { "epoch": 0.4597577034500922, "grad_norm": 4.20930814743042, "learning_rate": 2.7025283118251255e-05, "loss": 1.7662, "step": 17457 }, { "epoch": 0.4597840400316039, "grad_norm": 1.9887974262237549, "learning_rate": 2.7023966289175667e-05, "loss": 1.4584, "step": 17458 }, { "epoch": 0.45981037661311563, "grad_norm": 2.7577054500579834, "learning_rate": 2.702264946010008e-05, "loss": 2.2135, "step": 17459 }, { "epoch": 0.4598367131946273, "grad_norm": 1.8073235750198364, "learning_rate": 2.702133263102449e-05, "loss": 1.9439, "step": 17460 }, { "epoch": 0.45986304977613907, "grad_norm": 1.8886864185333252, "learning_rate": 2.7020015801948907e-05, "loss": 2.2911, "step": 17461 }, { "epoch": 0.45988938635765075, "grad_norm": 1.6016392707824707, "learning_rate": 
2.7018698972873323e-05, "loss": 1.6465, "step": 17462 }, { "epoch": 0.4599157229391625, "grad_norm": 1.7485017776489258, "learning_rate": 2.7017382143797738e-05, "loss": 1.6884, "step": 17463 }, { "epoch": 0.45994205952067424, "grad_norm": 1.9669878482818604, "learning_rate": 2.701606531472215e-05, "loss": 1.4297, "step": 17464 }, { "epoch": 0.45996839610218593, "grad_norm": 1.8073192834854126, "learning_rate": 2.7014748485646563e-05, "loss": 2.2036, "step": 17465 }, { "epoch": 0.45999473268369767, "grad_norm": 2.008207082748413, "learning_rate": 2.701343165657098e-05, "loss": 1.7573, "step": 17466 }, { "epoch": 0.46002106926520936, "grad_norm": 2.4656214714050293, "learning_rate": 2.7012114827495394e-05, "loss": 1.8519, "step": 17467 }, { "epoch": 0.4600474058467211, "grad_norm": 2.4187605381011963, "learning_rate": 2.7010797998419806e-05, "loss": 2.0717, "step": 17468 }, { "epoch": 0.4600737424282328, "grad_norm": 1.9050865173339844, "learning_rate": 2.7009481169344218e-05, "loss": 1.7168, "step": 17469 }, { "epoch": 0.46010007900974453, "grad_norm": 2.115663528442383, "learning_rate": 2.7008164340268634e-05, "loss": 2.1858, "step": 17470 }, { "epoch": 0.4601264155912563, "grad_norm": 3.364642381668091, "learning_rate": 2.700684751119305e-05, "loss": 1.3902, "step": 17471 }, { "epoch": 0.46015275217276796, "grad_norm": 1.4723210334777832, "learning_rate": 2.7005530682117465e-05, "loss": 1.9311, "step": 17472 }, { "epoch": 0.4601790887542797, "grad_norm": 4.462168216705322, "learning_rate": 2.7004213853041877e-05, "loss": 0.6964, "step": 17473 }, { "epoch": 0.4602054253357914, "grad_norm": 2.6608057022094727, "learning_rate": 2.700289702396629e-05, "loss": 2.1537, "step": 17474 }, { "epoch": 0.46023176191730314, "grad_norm": 2.631505012512207, "learning_rate": 2.70015801948907e-05, "loss": 0.7878, "step": 17475 }, { "epoch": 0.4602580984988148, "grad_norm": 1.8283488750457764, "learning_rate": 2.700026336581512e-05, "loss": 2.6337, "step": 17476 }, { "epoch": 
0.46028443508032657, "grad_norm": 6.193769454956055, "learning_rate": 2.6998946536739533e-05, "loss": 1.2673, "step": 17477 }, { "epoch": 0.4603107716618383, "grad_norm": 1.632066011428833, "learning_rate": 2.6997629707663945e-05, "loss": 1.7221, "step": 17478 }, { "epoch": 0.46033710824335, "grad_norm": 2.579348087310791, "learning_rate": 2.699631287858836e-05, "loss": 1.722, "step": 17479 }, { "epoch": 0.46036344482486175, "grad_norm": 3.4655957221984863, "learning_rate": 2.6994996049512773e-05, "loss": 1.2046, "step": 17480 }, { "epoch": 0.46038978140637343, "grad_norm": 5.123465061187744, "learning_rate": 2.6993679220437192e-05, "loss": 1.6401, "step": 17481 }, { "epoch": 0.4604161179878852, "grad_norm": 3.545768976211548, "learning_rate": 2.6992362391361604e-05, "loss": 1.5401, "step": 17482 }, { "epoch": 0.4604424545693969, "grad_norm": 1.8814715147018433, "learning_rate": 2.6991045562286016e-05, "loss": 1.4878, "step": 17483 }, { "epoch": 0.4604687911509086, "grad_norm": 1.5582077503204346, "learning_rate": 2.698972873321043e-05, "loss": 1.7208, "step": 17484 }, { "epoch": 0.46049512773242035, "grad_norm": 2.813190221786499, "learning_rate": 2.6988411904134847e-05, "loss": 1.38, "step": 17485 }, { "epoch": 0.46052146431393204, "grad_norm": 1.8609251976013184, "learning_rate": 2.698709507505926e-05, "loss": 2.4392, "step": 17486 }, { "epoch": 0.4605478008954438, "grad_norm": 3.437260627746582, "learning_rate": 2.6985778245983672e-05, "loss": 2.458, "step": 17487 }, { "epoch": 0.46057413747695547, "grad_norm": 2.206209421157837, "learning_rate": 2.6984461416908084e-05, "loss": 1.6908, "step": 17488 }, { "epoch": 0.4606004740584672, "grad_norm": 2.317737579345703, "learning_rate": 2.69831445878325e-05, "loss": 1.9039, "step": 17489 }, { "epoch": 0.46062681063997896, "grad_norm": 2.2961573600769043, "learning_rate": 2.6981827758756915e-05, "loss": 2.0017, "step": 17490 }, { "epoch": 0.46065314722149064, "grad_norm": 1.447906255722046, "learning_rate": 
2.698051092968133e-05, "loss": 0.3744, "step": 17491 }, { "epoch": 0.4606794838030024, "grad_norm": 2.1364970207214355, "learning_rate": 2.6979194100605743e-05, "loss": 1.3722, "step": 17492 }, { "epoch": 0.4607058203845141, "grad_norm": 1.826667308807373, "learning_rate": 2.6977877271530155e-05, "loss": 1.3515, "step": 17493 }, { "epoch": 0.4607321569660258, "grad_norm": 1.814029335975647, "learning_rate": 2.6976560442454567e-05, "loss": 1.809, "step": 17494 }, { "epoch": 0.4607584935475375, "grad_norm": 3.5878870487213135, "learning_rate": 2.6975243613378986e-05, "loss": 1.332, "step": 17495 }, { "epoch": 0.46078483012904925, "grad_norm": 2.3971469402313232, "learning_rate": 2.69739267843034e-05, "loss": 1.5314, "step": 17496 }, { "epoch": 0.460811166710561, "grad_norm": 3.9298312664031982, "learning_rate": 2.697260995522781e-05, "loss": 1.8334, "step": 17497 }, { "epoch": 0.4608375032920727, "grad_norm": 2.1526389122009277, "learning_rate": 2.6971293126152226e-05, "loss": 1.3654, "step": 17498 }, { "epoch": 0.4608638398735844, "grad_norm": 1.6501330137252808, "learning_rate": 2.6969976297076642e-05, "loss": 1.9574, "step": 17499 }, { "epoch": 0.4608901764550961, "grad_norm": 1.8999844789505005, "learning_rate": 2.6968659468001058e-05, "loss": 1.9529, "step": 17500 }, { "epoch": 0.46091651303660786, "grad_norm": 2.108538866043091, "learning_rate": 2.696734263892547e-05, "loss": 1.6076, "step": 17501 }, { "epoch": 0.46094284961811954, "grad_norm": 3.4208884239196777, "learning_rate": 2.6966025809849882e-05, "loss": 1.666, "step": 17502 }, { "epoch": 0.4609691861996313, "grad_norm": 2.0282604694366455, "learning_rate": 2.6964708980774294e-05, "loss": 2.0153, "step": 17503 }, { "epoch": 0.46099552278114303, "grad_norm": 1.6982851028442383, "learning_rate": 2.6963392151698713e-05, "loss": 1.7184, "step": 17504 }, { "epoch": 0.4610218593626547, "grad_norm": 1.7129539251327515, "learning_rate": 2.6962075322623126e-05, "loss": 1.1235, "step": 17505 }, { "epoch": 
0.46104819594416646, "grad_norm": 1.9388326406478882, "learning_rate": 2.6960758493547538e-05, "loss": 1.7993, "step": 17506 }, { "epoch": 0.46107453252567815, "grad_norm": 2.903170585632324, "learning_rate": 2.6959441664471953e-05, "loss": 1.7934, "step": 17507 }, { "epoch": 0.4611008691071899, "grad_norm": 2.008204460144043, "learning_rate": 2.6958124835396366e-05, "loss": 1.7812, "step": 17508 }, { "epoch": 0.4611272056887016, "grad_norm": 1.6647323369979858, "learning_rate": 2.6956808006320785e-05, "loss": 1.8555, "step": 17509 }, { "epoch": 0.4611535422702133, "grad_norm": 2.406745195388794, "learning_rate": 2.6955491177245197e-05, "loss": 0.7043, "step": 17510 }, { "epoch": 0.46117987885172507, "grad_norm": 1.7166666984558105, "learning_rate": 2.695417434816961e-05, "loss": 2.0748, "step": 17511 }, { "epoch": 0.46120621543323675, "grad_norm": 2.892624616622925, "learning_rate": 2.695285751909402e-05, "loss": 1.3902, "step": 17512 }, { "epoch": 0.4612325520147485, "grad_norm": 2.0963261127471924, "learning_rate": 2.6951540690018433e-05, "loss": 1.8431, "step": 17513 }, { "epoch": 0.4612588885962602, "grad_norm": 2.036587953567505, "learning_rate": 2.6950223860942852e-05, "loss": 1.5784, "step": 17514 }, { "epoch": 0.46128522517777193, "grad_norm": 1.982434630393982, "learning_rate": 2.6948907031867265e-05, "loss": 1.3778, "step": 17515 }, { "epoch": 0.4613115617592837, "grad_norm": 2.5579171180725098, "learning_rate": 2.6947590202791677e-05, "loss": 1.7708, "step": 17516 }, { "epoch": 0.46133789834079536, "grad_norm": 1.8268247842788696, "learning_rate": 2.6946273373716092e-05, "loss": 1.9356, "step": 17517 }, { "epoch": 0.4613642349223071, "grad_norm": 1.5559669733047485, "learning_rate": 2.6944956544640508e-05, "loss": 1.7244, "step": 17518 }, { "epoch": 0.4613905715038188, "grad_norm": 1.5587821006774902, "learning_rate": 2.6943639715564924e-05, "loss": 2.2493, "step": 17519 }, { "epoch": 0.46141690808533053, "grad_norm": 3.959388494491577, "learning_rate": 
2.6942322886489336e-05, "loss": 2.49, "step": 17520 }, { "epoch": 0.4614432446668422, "grad_norm": 4.955875873565674, "learning_rate": 2.6941006057413748e-05, "loss": 0.7397, "step": 17521 }, { "epoch": 0.46146958124835397, "grad_norm": 1.5485541820526123, "learning_rate": 2.693968922833816e-05, "loss": 1.82, "step": 17522 }, { "epoch": 0.4614959178298657, "grad_norm": 2.207554578781128, "learning_rate": 2.693837239926258e-05, "loss": 1.9246, "step": 17523 }, { "epoch": 0.4615222544113774, "grad_norm": 2.386310338973999, "learning_rate": 2.693705557018699e-05, "loss": 1.6201, "step": 17524 }, { "epoch": 0.46154859099288914, "grad_norm": 1.8456358909606934, "learning_rate": 2.6935738741111404e-05, "loss": 1.4431, "step": 17525 }, { "epoch": 0.46157492757440083, "grad_norm": 2.117990016937256, "learning_rate": 2.693442191203582e-05, "loss": 2.1145, "step": 17526 }, { "epoch": 0.46160126415591257, "grad_norm": 1.796316385269165, "learning_rate": 2.693310508296023e-05, "loss": 1.8411, "step": 17527 }, { "epoch": 0.46162760073742426, "grad_norm": 2.610105037689209, "learning_rate": 2.693178825388465e-05, "loss": 1.1028, "step": 17528 }, { "epoch": 0.461653937318936, "grad_norm": 1.6163783073425293, "learning_rate": 2.6930471424809063e-05, "loss": 1.8427, "step": 17529 }, { "epoch": 0.46168027390044775, "grad_norm": 2.9518046379089355, "learning_rate": 2.6929154595733475e-05, "loss": 1.1236, "step": 17530 }, { "epoch": 0.46170661048195943, "grad_norm": 3.709421396255493, "learning_rate": 2.6927837766657887e-05, "loss": 1.2948, "step": 17531 }, { "epoch": 0.4617329470634712, "grad_norm": 2.0739338397979736, "learning_rate": 2.6926520937582306e-05, "loss": 2.27, "step": 17532 }, { "epoch": 0.46175928364498287, "grad_norm": 1.6710008382797241, "learning_rate": 2.6925204108506718e-05, "loss": 1.7208, "step": 17533 }, { "epoch": 0.4617856202264946, "grad_norm": 2.491178512573242, "learning_rate": 2.692388727943113e-05, "loss": 0.2786, "step": 17534 }, { "epoch": 
0.4618119568080063, "grad_norm": 3.1380090713500977, "learning_rate": 2.6922570450355543e-05, "loss": 1.739, "step": 17535 }, { "epoch": 0.46183829338951804, "grad_norm": 1.7793279886245728, "learning_rate": 2.6921253621279958e-05, "loss": 1.8548, "step": 17536 }, { "epoch": 0.4618646299710298, "grad_norm": 2.807365655899048, "learning_rate": 2.6919936792204374e-05, "loss": 1.5687, "step": 17537 }, { "epoch": 0.46189096655254147, "grad_norm": 2.846344470977783, "learning_rate": 2.691861996312879e-05, "loss": 2.0694, "step": 17538 }, { "epoch": 0.4619173031340532, "grad_norm": 1.955567717552185, "learning_rate": 2.69173031340532e-05, "loss": 2.7026, "step": 17539 }, { "epoch": 0.4619436397155649, "grad_norm": 1.646132230758667, "learning_rate": 2.6915986304977614e-05, "loss": 1.7977, "step": 17540 }, { "epoch": 0.46196997629707665, "grad_norm": 3.220524549484253, "learning_rate": 2.6914669475902026e-05, "loss": 1.4106, "step": 17541 }, { "epoch": 0.46199631287858833, "grad_norm": 1.5704196691513062, "learning_rate": 2.6913352646826445e-05, "loss": 1.7109, "step": 17542 }, { "epoch": 0.4620226494601001, "grad_norm": 2.640713691711426, "learning_rate": 2.6912035817750857e-05, "loss": 1.1525, "step": 17543 }, { "epoch": 0.4620489860416118, "grad_norm": 3.208357810974121, "learning_rate": 2.691071898867527e-05, "loss": 1.7592, "step": 17544 }, { "epoch": 0.4620753226231235, "grad_norm": 1.7211967706680298, "learning_rate": 2.6909402159599685e-05, "loss": 0.9374, "step": 17545 }, { "epoch": 0.46210165920463525, "grad_norm": 1.4923902750015259, "learning_rate": 2.6908085330524097e-05, "loss": 1.2124, "step": 17546 }, { "epoch": 0.46212799578614694, "grad_norm": 3.00479793548584, "learning_rate": 2.6906768501448516e-05, "loss": 2.6912, "step": 17547 }, { "epoch": 0.4621543323676587, "grad_norm": 1.9975330829620361, "learning_rate": 2.690545167237293e-05, "loss": 1.3521, "step": 17548 }, { "epoch": 0.46218066894917037, "grad_norm": 3.134265184402466, "learning_rate": 
2.690413484329734e-05, "loss": 1.334, "step": 17549 }, { "epoch": 0.4622070055306821, "grad_norm": 2.316401720046997, "learning_rate": 2.6902818014221753e-05, "loss": 1.8305, "step": 17550 }, { "epoch": 0.46223334211219386, "grad_norm": 1.648706316947937, "learning_rate": 2.6901501185146172e-05, "loss": 0.2379, "step": 17551 }, { "epoch": 0.46225967869370554, "grad_norm": 4.71798038482666, "learning_rate": 2.6900184356070584e-05, "loss": 1.7815, "step": 17552 }, { "epoch": 0.4622860152752173, "grad_norm": 3.6050045490264893, "learning_rate": 2.6898867526994996e-05, "loss": 1.2606, "step": 17553 }, { "epoch": 0.462312351856729, "grad_norm": 2.0452992916107178, "learning_rate": 2.6897550697919412e-05, "loss": 1.833, "step": 17554 }, { "epoch": 0.4623386884382407, "grad_norm": 4.002376556396484, "learning_rate": 2.6896233868843824e-05, "loss": 1.8259, "step": 17555 }, { "epoch": 0.46236502501975246, "grad_norm": 3.458533525466919, "learning_rate": 2.6894917039768243e-05, "loss": 1.0992, "step": 17556 }, { "epoch": 0.46239136160126415, "grad_norm": 1.8810521364212036, "learning_rate": 2.6893600210692655e-05, "loss": 1.8099, "step": 17557 }, { "epoch": 0.4624176981827759, "grad_norm": 2.9881768226623535, "learning_rate": 2.6892283381617067e-05, "loss": 1.0115, "step": 17558 }, { "epoch": 0.4624440347642876, "grad_norm": 4.67957067489624, "learning_rate": 2.689096655254148e-05, "loss": 1.2525, "step": 17559 }, { "epoch": 0.4624703713457993, "grad_norm": 2.159923791885376, "learning_rate": 2.6889649723465892e-05, "loss": 1.3093, "step": 17560 }, { "epoch": 0.462496707927311, "grad_norm": 1.7475998401641846, "learning_rate": 2.688833289439031e-05, "loss": 0.5041, "step": 17561 }, { "epoch": 0.46252304450882276, "grad_norm": 1.7486635446548462, "learning_rate": 2.6887016065314723e-05, "loss": 2.6904, "step": 17562 }, { "epoch": 0.4625493810903345, "grad_norm": 3.956477642059326, "learning_rate": 2.6885699236239135e-05, "loss": 1.6588, "step": 17563 }, { "epoch": 
0.4625757176718462, "grad_norm": 1.904270887374878, "learning_rate": 2.688438240716355e-05, "loss": 1.8687, "step": 17564 }, { "epoch": 0.46260205425335793, "grad_norm": 1.4893171787261963, "learning_rate": 2.6883065578087967e-05, "loss": 2.3775, "step": 17565 }, { "epoch": 0.4626283908348696, "grad_norm": 1.8342007398605347, "learning_rate": 2.6881748749012382e-05, "loss": 1.9912, "step": 17566 }, { "epoch": 0.46265472741638136, "grad_norm": 2.115168809890747, "learning_rate": 2.6880431919936794e-05, "loss": 1.0927, "step": 17567 }, { "epoch": 0.46268106399789305, "grad_norm": 2.3055005073547363, "learning_rate": 2.6879115090861207e-05, "loss": 1.7059, "step": 17568 }, { "epoch": 0.4627074005794048, "grad_norm": 1.8641294240951538, "learning_rate": 2.687779826178562e-05, "loss": 1.3708, "step": 17569 }, { "epoch": 0.46273373716091654, "grad_norm": 2.036135196685791, "learning_rate": 2.6876481432710038e-05, "loss": 1.9027, "step": 17570 }, { "epoch": 0.4627600737424282, "grad_norm": 1.6262738704681396, "learning_rate": 2.687516460363445e-05, "loss": 1.4936, "step": 17571 }, { "epoch": 0.46278641032393997, "grad_norm": 1.7485545873641968, "learning_rate": 2.6873847774558862e-05, "loss": 2.2133, "step": 17572 }, { "epoch": 0.46281274690545166, "grad_norm": 1.6541633605957031, "learning_rate": 2.6872530945483278e-05, "loss": 1.5818, "step": 17573 }, { "epoch": 0.4628390834869634, "grad_norm": 2.2026326656341553, "learning_rate": 2.687121411640769e-05, "loss": 1.7335, "step": 17574 }, { "epoch": 0.4628654200684751, "grad_norm": 4.983541488647461, "learning_rate": 2.686989728733211e-05, "loss": 1.23, "step": 17575 }, { "epoch": 0.46289175664998683, "grad_norm": 2.145479679107666, "learning_rate": 2.686858045825652e-05, "loss": 2.0333, "step": 17576 }, { "epoch": 0.4629180932314986, "grad_norm": 3.012795925140381, "learning_rate": 2.6867263629180933e-05, "loss": 1.6883, "step": 17577 }, { "epoch": 0.46294442981301026, "grad_norm": 2.079068660736084, "learning_rate": 
2.6865946800105346e-05, "loss": 1.524, "step": 17578 }, { "epoch": 0.462970766394522, "grad_norm": 2.492637872695923, "learning_rate": 2.6864629971029758e-05, "loss": 1.9061, "step": 17579 }, { "epoch": 0.4629971029760337, "grad_norm": 5.192150115966797, "learning_rate": 2.6863313141954177e-05, "loss": 1.5762, "step": 17580 }, { "epoch": 0.46302343955754544, "grad_norm": 3.4451215267181396, "learning_rate": 2.686199631287859e-05, "loss": 1.5547, "step": 17581 }, { "epoch": 0.4630497761390571, "grad_norm": 1.6375210285186768, "learning_rate": 2.6860679483803005e-05, "loss": 1.8389, "step": 17582 }, { "epoch": 0.46307611272056887, "grad_norm": 6.425418853759766, "learning_rate": 2.6859362654727417e-05, "loss": 1.8512, "step": 17583 }, { "epoch": 0.4631024493020806, "grad_norm": 2.2132797241210938, "learning_rate": 2.6858045825651836e-05, "loss": 1.9915, "step": 17584 }, { "epoch": 0.4631287858835923, "grad_norm": 3.884831666946411, "learning_rate": 2.6856728996576248e-05, "loss": 1.3174, "step": 17585 }, { "epoch": 0.46315512246510404, "grad_norm": 2.4993481636047363, "learning_rate": 2.685541216750066e-05, "loss": 1.9246, "step": 17586 }, { "epoch": 0.46318145904661573, "grad_norm": 2.1750473976135254, "learning_rate": 2.6854095338425072e-05, "loss": 1.3254, "step": 17587 }, { "epoch": 0.46320779562812747, "grad_norm": 1.5546739101409912, "learning_rate": 2.6852778509349485e-05, "loss": 1.5789, "step": 17588 }, { "epoch": 0.4632341322096392, "grad_norm": 2.34140944480896, "learning_rate": 2.6851461680273904e-05, "loss": 0.9459, "step": 17589 }, { "epoch": 0.4632604687911509, "grad_norm": 2.98567533493042, "learning_rate": 2.6850144851198316e-05, "loss": 1.0636, "step": 17590 }, { "epoch": 0.46328680537266265, "grad_norm": 2.0883536338806152, "learning_rate": 2.6848828022122728e-05, "loss": 1.5716, "step": 17591 }, { "epoch": 0.46331314195417433, "grad_norm": 1.701937198638916, "learning_rate": 2.6847511193047144e-05, "loss": 1.8402, "step": 17592 }, { "epoch": 
0.4633394785356861, "grad_norm": 1.5772747993469238, "learning_rate": 2.6846194363971556e-05, "loss": 2.0436, "step": 17593 }, { "epoch": 0.46336581511719777, "grad_norm": 2.340604782104492, "learning_rate": 2.6844877534895975e-05, "loss": 1.521, "step": 17594 }, { "epoch": 0.4633921516987095, "grad_norm": 3.2818398475646973, "learning_rate": 2.6843560705820387e-05, "loss": 1.6473, "step": 17595 }, { "epoch": 0.46341848828022125, "grad_norm": 1.6537885665893555, "learning_rate": 2.68422438767448e-05, "loss": 1.1602, "step": 17596 }, { "epoch": 0.46344482486173294, "grad_norm": 2.0647153854370117, "learning_rate": 2.684092704766921e-05, "loss": 2.0969, "step": 17597 }, { "epoch": 0.4634711614432447, "grad_norm": 2.1821203231811523, "learning_rate": 2.683961021859363e-05, "loss": 1.9732, "step": 17598 }, { "epoch": 0.46349749802475637, "grad_norm": 2.088315486907959, "learning_rate": 2.6838293389518043e-05, "loss": 2.0616, "step": 17599 }, { "epoch": 0.4635238346062681, "grad_norm": 2.128448009490967, "learning_rate": 2.6836976560442455e-05, "loss": 2.3946, "step": 17600 }, { "epoch": 0.4635501711877798, "grad_norm": 2.9358444213867188, "learning_rate": 2.683565973136687e-05, "loss": 1.9484, "step": 17601 }, { "epoch": 0.46357650776929155, "grad_norm": 1.4702099561691284, "learning_rate": 2.6834342902291283e-05, "loss": 1.7624, "step": 17602 }, { "epoch": 0.4636028443508033, "grad_norm": 2.299086093902588, "learning_rate": 2.68330260732157e-05, "loss": 0.5126, "step": 17603 }, { "epoch": 0.463629180932315, "grad_norm": 5.270533084869385, "learning_rate": 2.6831709244140114e-05, "loss": 1.5283, "step": 17604 }, { "epoch": 0.4636555175138267, "grad_norm": 1.7086987495422363, "learning_rate": 2.6830392415064526e-05, "loss": 1.2519, "step": 17605 }, { "epoch": 0.4636818540953384, "grad_norm": 2.6047730445861816, "learning_rate": 2.6829075585988938e-05, "loss": 1.3696, "step": 17606 }, { "epoch": 0.46370819067685015, "grad_norm": 2.140594720840454, "learning_rate": 
2.682775875691335e-05, "loss": 1.711, "step": 17607 }, { "epoch": 0.46373452725836184, "grad_norm": 4.106740951538086, "learning_rate": 2.682644192783777e-05, "loss": 0.3154, "step": 17608 }, { "epoch": 0.4637608638398736, "grad_norm": 2.4060792922973633, "learning_rate": 2.682512509876218e-05, "loss": 1.1164, "step": 17609 }, { "epoch": 0.4637872004213853, "grad_norm": 5.173916339874268, "learning_rate": 2.6823808269686594e-05, "loss": 1.0893, "step": 17610 }, { "epoch": 0.463813537002897, "grad_norm": 2.4310548305511475, "learning_rate": 2.682249144061101e-05, "loss": 1.3136, "step": 17611 }, { "epoch": 0.46383987358440876, "grad_norm": 1.8239504098892212, "learning_rate": 2.682117461153542e-05, "loss": 1.1987, "step": 17612 }, { "epoch": 0.46386621016592045, "grad_norm": 3.905519485473633, "learning_rate": 2.681985778245984e-05, "loss": 1.2127, "step": 17613 }, { "epoch": 0.4638925467474322, "grad_norm": 2.3367154598236084, "learning_rate": 2.6818540953384253e-05, "loss": 1.2624, "step": 17614 }, { "epoch": 0.4639188833289439, "grad_norm": 4.140835762023926, "learning_rate": 2.6817224124308665e-05, "loss": 1.4969, "step": 17615 }, { "epoch": 0.4639452199104556, "grad_norm": 3.3306164741516113, "learning_rate": 2.6815907295233077e-05, "loss": 1.5663, "step": 17616 }, { "epoch": 0.46397155649196736, "grad_norm": 2.9801230430603027, "learning_rate": 2.6814590466157496e-05, "loss": 1.3286, "step": 17617 }, { "epoch": 0.46399789307347905, "grad_norm": 2.134817600250244, "learning_rate": 2.681327363708191e-05, "loss": 1.7088, "step": 17618 }, { "epoch": 0.4640242296549908, "grad_norm": 2.7753093242645264, "learning_rate": 2.681195680800632e-05, "loss": 1.7023, "step": 17619 }, { "epoch": 0.4640505662365025, "grad_norm": 2.452871561050415, "learning_rate": 2.6810639978930736e-05, "loss": 1.6421, "step": 17620 }, { "epoch": 0.4640769028180142, "grad_norm": 1.9739270210266113, "learning_rate": 2.680932314985515e-05, "loss": 1.2964, "step": 17621 }, { "epoch": 
0.46410323939952597, "grad_norm": 2.002150535583496, "learning_rate": 2.6808006320779567e-05, "loss": 2.2155, "step": 17622 }, { "epoch": 0.46412957598103766, "grad_norm": 4.563379764556885, "learning_rate": 2.680668949170398e-05, "loss": 1.5765, "step": 17623 }, { "epoch": 0.4641559125625494, "grad_norm": 1.8958802223205566, "learning_rate": 2.6805372662628392e-05, "loss": 1.0206, "step": 17624 }, { "epoch": 0.4641822491440611, "grad_norm": 3.142764091491699, "learning_rate": 2.6804055833552804e-05, "loss": 1.3983, "step": 17625 }, { "epoch": 0.46420858572557283, "grad_norm": 1.6672254800796509, "learning_rate": 2.6802739004477216e-05, "loss": 1.6891, "step": 17626 }, { "epoch": 0.4642349223070845, "grad_norm": 1.7124546766281128, "learning_rate": 2.6801422175401635e-05, "loss": 1.7931, "step": 17627 }, { "epoch": 0.46426125888859626, "grad_norm": 2.0575461387634277, "learning_rate": 2.6800105346326048e-05, "loss": 1.6328, "step": 17628 }, { "epoch": 0.464287595470108, "grad_norm": 2.658080577850342, "learning_rate": 2.6798788517250463e-05, "loss": 1.9511, "step": 17629 }, { "epoch": 0.4643139320516197, "grad_norm": 2.051379919052124, "learning_rate": 2.6797471688174875e-05, "loss": 0.7385, "step": 17630 }, { "epoch": 0.46434026863313144, "grad_norm": 1.6465489864349365, "learning_rate": 2.6796154859099294e-05, "loss": 1.8276, "step": 17631 }, { "epoch": 0.4643666052146431, "grad_norm": 1.5366212129592896, "learning_rate": 2.6794838030023707e-05, "loss": 1.8191, "step": 17632 }, { "epoch": 0.46439294179615487, "grad_norm": 1.9839245080947876, "learning_rate": 2.679352120094812e-05, "loss": 2.2081, "step": 17633 }, { "epoch": 0.46441927837766656, "grad_norm": 2.529362440109253, "learning_rate": 2.679220437187253e-05, "loss": 2.2052, "step": 17634 }, { "epoch": 0.4644456149591783, "grad_norm": 2.7027270793914795, "learning_rate": 2.6790887542796943e-05, "loss": 0.8896, "step": 17635 }, { "epoch": 0.46447195154069004, "grad_norm": 2.4789376258850098, "learning_rate": 
2.6789570713721362e-05, "loss": 2.0316, "step": 17636 }, { "epoch": 0.46449828812220173, "grad_norm": 1.7484478950500488, "learning_rate": 2.6788253884645774e-05, "loss": 1.8745, "step": 17637 }, { "epoch": 0.4645246247037135, "grad_norm": 2.5566487312316895, "learning_rate": 2.6786937055570187e-05, "loss": 1.5511, "step": 17638 }, { "epoch": 0.46455096128522516, "grad_norm": 2.914454936981201, "learning_rate": 2.6785620226494602e-05, "loss": 1.4368, "step": 17639 }, { "epoch": 0.4645772978667369, "grad_norm": 1.9978471994400024, "learning_rate": 2.6784303397419014e-05, "loss": 1.3768, "step": 17640 }, { "epoch": 0.4646036344482486, "grad_norm": 2.215916156768799, "learning_rate": 2.6782986568343433e-05, "loss": 1.1244, "step": 17641 }, { "epoch": 0.46462997102976034, "grad_norm": 1.8007752895355225, "learning_rate": 2.6781669739267846e-05, "loss": 1.6294, "step": 17642 }, { "epoch": 0.4646563076112721, "grad_norm": 3.295771598815918, "learning_rate": 2.6780352910192258e-05, "loss": 0.6972, "step": 17643 }, { "epoch": 0.46468264419278377, "grad_norm": 2.87211275100708, "learning_rate": 2.677903608111667e-05, "loss": 1.4349, "step": 17644 }, { "epoch": 0.4647089807742955, "grad_norm": 1.689457893371582, "learning_rate": 2.677771925204109e-05, "loss": 2.3205, "step": 17645 }, { "epoch": 0.4647353173558072, "grad_norm": 2.2953498363494873, "learning_rate": 2.67764024229655e-05, "loss": 1.4227, "step": 17646 }, { "epoch": 0.46476165393731894, "grad_norm": 2.3698196411132812, "learning_rate": 2.6775085593889913e-05, "loss": 1.5988, "step": 17647 }, { "epoch": 0.46478799051883063, "grad_norm": 1.7133572101593018, "learning_rate": 2.677376876481433e-05, "loss": 2.2402, "step": 17648 }, { "epoch": 0.4648143271003424, "grad_norm": 2.118105411529541, "learning_rate": 2.677245193573874e-05, "loss": 1.3295, "step": 17649 }, { "epoch": 0.4648406636818541, "grad_norm": 4.4868998527526855, "learning_rate": 2.677113510666316e-05, "loss": 1.3086, "step": 17650 }, { "epoch": 
0.4648670002633658, "grad_norm": 2.33290433883667, "learning_rate": 2.6769818277587572e-05, "loss": 0.7342, "step": 17651 }, { "epoch": 0.46489333684487755, "grad_norm": 2.0869383811950684, "learning_rate": 2.6768501448511985e-05, "loss": 1.6012, "step": 17652 }, { "epoch": 0.46491967342638923, "grad_norm": 3.0033810138702393, "learning_rate": 2.6767184619436397e-05, "loss": 1.715, "step": 17653 }, { "epoch": 0.464946010007901, "grad_norm": 2.684756278991699, "learning_rate": 2.676586779036081e-05, "loss": 2.2651, "step": 17654 }, { "epoch": 0.4649723465894127, "grad_norm": 1.6370489597320557, "learning_rate": 2.6764550961285228e-05, "loss": 1.7333, "step": 17655 }, { "epoch": 0.4649986831709244, "grad_norm": 1.6218193769454956, "learning_rate": 2.676323413220964e-05, "loss": 1.6966, "step": 17656 }, { "epoch": 0.46502501975243615, "grad_norm": 2.917945146560669, "learning_rate": 2.6761917303134056e-05, "loss": 1.7306, "step": 17657 }, { "epoch": 0.46505135633394784, "grad_norm": 1.9902558326721191, "learning_rate": 2.6760600474058468e-05, "loss": 1.3865, "step": 17658 }, { "epoch": 0.4650776929154596, "grad_norm": 1.5651133060455322, "learning_rate": 2.675928364498288e-05, "loss": 1.7864, "step": 17659 }, { "epoch": 0.46510402949697127, "grad_norm": 2.4701952934265137, "learning_rate": 2.67579668159073e-05, "loss": 2.3128, "step": 17660 }, { "epoch": 0.465130366078483, "grad_norm": 2.6418309211730957, "learning_rate": 2.675664998683171e-05, "loss": 1.1298, "step": 17661 }, { "epoch": 0.46515670265999476, "grad_norm": 1.8996413946151733, "learning_rate": 2.6755333157756124e-05, "loss": 1.6912, "step": 17662 }, { "epoch": 0.46518303924150645, "grad_norm": 1.6829640865325928, "learning_rate": 2.6754016328680536e-05, "loss": 0.8139, "step": 17663 }, { "epoch": 0.4652093758230182, "grad_norm": 1.9768600463867188, "learning_rate": 2.6752699499604955e-05, "loss": 1.5581, "step": 17664 }, { "epoch": 0.4652357124045299, "grad_norm": 2.5917341709136963, "learning_rate": 
2.6751382670529367e-05, "loss": 1.7122, "step": 17665 }, { "epoch": 0.4652620489860416, "grad_norm": 4.4162726402282715, "learning_rate": 2.675006584145378e-05, "loss": 1.7994, "step": 17666 }, { "epoch": 0.4652883855675533, "grad_norm": 3.4718453884124756, "learning_rate": 2.6748749012378195e-05, "loss": 1.2841, "step": 17667 }, { "epoch": 0.46531472214906505, "grad_norm": 2.15373158454895, "learning_rate": 2.6747432183302607e-05, "loss": 2.0418, "step": 17668 }, { "epoch": 0.4653410587305768, "grad_norm": 1.4824116230010986, "learning_rate": 2.6746115354227026e-05, "loss": 1.5394, "step": 17669 }, { "epoch": 0.4653673953120885, "grad_norm": 5.731241703033447, "learning_rate": 2.6744798525151438e-05, "loss": 1.1126, "step": 17670 }, { "epoch": 0.4653937318936002, "grad_norm": 2.0641798973083496, "learning_rate": 2.674348169607585e-05, "loss": 1.853, "step": 17671 }, { "epoch": 0.4654200684751119, "grad_norm": 6.711053371429443, "learning_rate": 2.6742164867000263e-05, "loss": 1.9473, "step": 17672 }, { "epoch": 0.46544640505662366, "grad_norm": 2.259662389755249, "learning_rate": 2.6740848037924675e-05, "loss": 2.1195, "step": 17673 }, { "epoch": 0.46547274163813535, "grad_norm": 1.7201156616210938, "learning_rate": 2.6739531208849094e-05, "loss": 1.3219, "step": 17674 }, { "epoch": 0.4654990782196471, "grad_norm": 2.102354049682617, "learning_rate": 2.6738214379773506e-05, "loss": 0.5926, "step": 17675 }, { "epoch": 0.46552541480115883, "grad_norm": 2.0202291011810303, "learning_rate": 2.673689755069792e-05, "loss": 0.6954, "step": 17676 }, { "epoch": 0.4655517513826705, "grad_norm": 3.3457882404327393, "learning_rate": 2.6735580721622334e-05, "loss": 1.0044, "step": 17677 }, { "epoch": 0.46557808796418226, "grad_norm": 3.845686197280884, "learning_rate": 2.6734263892546753e-05, "loss": 1.5838, "step": 17678 }, { "epoch": 0.46560442454569395, "grad_norm": 1.4386787414550781, "learning_rate": 2.6732947063471165e-05, "loss": 1.6776, "step": 17679 }, { "epoch": 
0.4656307611272057, "grad_norm": 2.5082128047943115, "learning_rate": 2.6731630234395577e-05, "loss": 1.9394, "step": 17680 }, { "epoch": 0.4656570977087174, "grad_norm": 2.361893653869629, "learning_rate": 2.673031340531999e-05, "loss": 2.3201, "step": 17681 }, { "epoch": 0.4656834342902291, "grad_norm": 2.3069963455200195, "learning_rate": 2.6728996576244402e-05, "loss": 2.1801, "step": 17682 }, { "epoch": 0.46570977087174087, "grad_norm": 1.9952476024627686, "learning_rate": 2.672767974716882e-05, "loss": 0.6037, "step": 17683 }, { "epoch": 0.46573610745325256, "grad_norm": 1.860139012336731, "learning_rate": 2.6726362918093233e-05, "loss": 1.4967, "step": 17684 }, { "epoch": 0.4657624440347643, "grad_norm": 3.3513987064361572, "learning_rate": 2.672504608901765e-05, "loss": 1.5793, "step": 17685 }, { "epoch": 0.465788780616276, "grad_norm": 2.2791037559509277, "learning_rate": 2.672372925994206e-05, "loss": 1.7081, "step": 17686 }, { "epoch": 0.46581511719778773, "grad_norm": 1.890899419784546, "learning_rate": 2.6722412430866473e-05, "loss": 1.5473, "step": 17687 }, { "epoch": 0.4658414537792994, "grad_norm": 3.320352792739868, "learning_rate": 2.6721095601790892e-05, "loss": 0.8191, "step": 17688 }, { "epoch": 0.46586779036081116, "grad_norm": 1.7549089193344116, "learning_rate": 2.6719778772715304e-05, "loss": 1.816, "step": 17689 }, { "epoch": 0.4658941269423229, "grad_norm": 1.6220976114273071, "learning_rate": 2.6718461943639716e-05, "loss": 1.558, "step": 17690 }, { "epoch": 0.4659204635238346, "grad_norm": 2.1625964641571045, "learning_rate": 2.671714511456413e-05, "loss": 1.6821, "step": 17691 }, { "epoch": 0.46594680010534634, "grad_norm": 1.5719045400619507, "learning_rate": 2.671582828548854e-05, "loss": 1.9854, "step": 17692 }, { "epoch": 0.465973136686858, "grad_norm": 2.02211856842041, "learning_rate": 2.671451145641296e-05, "loss": 1.8916, "step": 17693 }, { "epoch": 0.46599947326836977, "grad_norm": 2.3192813396453857, "learning_rate": 
2.6713194627337372e-05, "loss": 1.6054, "step": 17694 }, { "epoch": 0.4660258098498815, "grad_norm": 1.6684606075286865, "learning_rate": 2.6711877798261788e-05, "loss": 1.1422, "step": 17695 }, { "epoch": 0.4660521464313932, "grad_norm": 2.1002683639526367, "learning_rate": 2.67105609691862e-05, "loss": 2.1899, "step": 17696 }, { "epoch": 0.46607848301290494, "grad_norm": 1.7114143371582031, "learning_rate": 2.670924414011062e-05, "loss": 0.2672, "step": 17697 }, { "epoch": 0.46610481959441663, "grad_norm": 4.462927341461182, "learning_rate": 2.670792731103503e-05, "loss": 1.9146, "step": 17698 }, { "epoch": 0.4661311561759284, "grad_norm": 1.6798112392425537, "learning_rate": 2.6706610481959443e-05, "loss": 1.3381, "step": 17699 }, { "epoch": 0.46615749275744006, "grad_norm": 1.7059693336486816, "learning_rate": 2.6705293652883855e-05, "loss": 1.9451, "step": 17700 }, { "epoch": 0.4661838293389518, "grad_norm": 1.4935001134872437, "learning_rate": 2.6703976823808268e-05, "loss": 1.7626, "step": 17701 }, { "epoch": 0.46621016592046355, "grad_norm": 1.8951994180679321, "learning_rate": 2.6702659994732687e-05, "loss": 2.3542, "step": 17702 }, { "epoch": 0.46623650250197524, "grad_norm": 2.825207233428955, "learning_rate": 2.67013431656571e-05, "loss": 0.9514, "step": 17703 }, { "epoch": 0.466262839083487, "grad_norm": 4.246274948120117, "learning_rate": 2.6700026336581514e-05, "loss": 1.3566, "step": 17704 }, { "epoch": 0.46628917566499867, "grad_norm": 2.1827847957611084, "learning_rate": 2.6698709507505927e-05, "loss": 2.1029, "step": 17705 }, { "epoch": 0.4663155122465104, "grad_norm": 2.0452218055725098, "learning_rate": 2.669739267843034e-05, "loss": 1.7554, "step": 17706 }, { "epoch": 0.4663418488280221, "grad_norm": 1.929308295249939, "learning_rate": 2.6696075849354758e-05, "loss": 1.3093, "step": 17707 }, { "epoch": 0.46636818540953384, "grad_norm": 1.682302713394165, "learning_rate": 2.669475902027917e-05, "loss": 1.5466, "step": 17708 }, { "epoch": 
0.4663945219910456, "grad_norm": 1.8157784938812256, "learning_rate": 2.6693442191203582e-05, "loss": 0.9867, "step": 17709 }, { "epoch": 0.4664208585725573, "grad_norm": 6.393119812011719, "learning_rate": 2.6692125362127994e-05, "loss": 1.3851, "step": 17710 }, { "epoch": 0.466447195154069, "grad_norm": 1.5145739316940308, "learning_rate": 2.6690808533052413e-05, "loss": 1.6247, "step": 17711 }, { "epoch": 0.4664735317355807, "grad_norm": 1.7255464792251587, "learning_rate": 2.6689491703976826e-05, "loss": 1.9207, "step": 17712 }, { "epoch": 0.46649986831709245, "grad_norm": 13.894375801086426, "learning_rate": 2.6688174874901238e-05, "loss": 1.4912, "step": 17713 }, { "epoch": 0.46652620489860414, "grad_norm": 2.2871508598327637, "learning_rate": 2.6686858045825653e-05, "loss": 1.4475, "step": 17714 }, { "epoch": 0.4665525414801159, "grad_norm": 1.7496458292007446, "learning_rate": 2.6685541216750066e-05, "loss": 2.0623, "step": 17715 }, { "epoch": 0.4665788780616276, "grad_norm": 2.191917896270752, "learning_rate": 2.6684224387674485e-05, "loss": 1.9308, "step": 17716 }, { "epoch": 0.4666052146431393, "grad_norm": 1.4847074747085571, "learning_rate": 2.6682907558598897e-05, "loss": 1.2611, "step": 17717 }, { "epoch": 0.46663155122465105, "grad_norm": 1.7192156314849854, "learning_rate": 2.668159072952331e-05, "loss": 1.5391, "step": 17718 }, { "epoch": 0.46665788780616274, "grad_norm": 2.1765732765197754, "learning_rate": 2.668027390044772e-05, "loss": 0.4912, "step": 17719 }, { "epoch": 0.4666842243876745, "grad_norm": 2.5378360748291016, "learning_rate": 2.6678957071372133e-05, "loss": 1.4397, "step": 17720 }, { "epoch": 0.4667105609691862, "grad_norm": 1.8803786039352417, "learning_rate": 2.6677640242296552e-05, "loss": 1.7115, "step": 17721 }, { "epoch": 0.4667368975506979, "grad_norm": 1.6991279125213623, "learning_rate": 2.6676323413220965e-05, "loss": 1.83, "step": 17722 }, { "epoch": 0.46676323413220966, "grad_norm": 4.650506496429443, "learning_rate": 
2.667500658414538e-05, "loss": 1.0469, "step": 17723 }, { "epoch": 0.46678957071372135, "grad_norm": 1.472511887550354, "learning_rate": 2.6673689755069792e-05, "loss": 1.763, "step": 17724 }, { "epoch": 0.4668159072952331, "grad_norm": 5.536637306213379, "learning_rate": 2.6672372925994205e-05, "loss": 0.6791, "step": 17725 }, { "epoch": 0.4668422438767448, "grad_norm": 1.6572816371917725, "learning_rate": 2.6671056096918624e-05, "loss": 1.9978, "step": 17726 }, { "epoch": 0.4668685804582565, "grad_norm": 1.8187482357025146, "learning_rate": 2.6669739267843036e-05, "loss": 1.5962, "step": 17727 }, { "epoch": 0.46689491703976826, "grad_norm": 2.826907157897949, "learning_rate": 2.6668422438767448e-05, "loss": 1.0366, "step": 17728 }, { "epoch": 0.46692125362127995, "grad_norm": 2.902482509613037, "learning_rate": 2.666710560969186e-05, "loss": 2.1595, "step": 17729 }, { "epoch": 0.4669475902027917, "grad_norm": 1.9059252738952637, "learning_rate": 2.666578878061628e-05, "loss": 2.1371, "step": 17730 }, { "epoch": 0.4669739267843034, "grad_norm": 3.6826422214508057, "learning_rate": 2.666447195154069e-05, "loss": 1.5158, "step": 17731 }, { "epoch": 0.4670002633658151, "grad_norm": 1.3771990537643433, "learning_rate": 2.6663155122465107e-05, "loss": 0.6562, "step": 17732 }, { "epoch": 0.4670265999473268, "grad_norm": 2.0774521827697754, "learning_rate": 2.666183829338952e-05, "loss": 0.9267, "step": 17733 }, { "epoch": 0.46705293652883856, "grad_norm": 1.9947038888931274, "learning_rate": 2.666052146431393e-05, "loss": 0.8888, "step": 17734 }, { "epoch": 0.4670792731103503, "grad_norm": 1.6749708652496338, "learning_rate": 2.665920463523835e-05, "loss": 1.7476, "step": 17735 }, { "epoch": 0.467105609691862, "grad_norm": 3.1274302005767822, "learning_rate": 2.6657887806162763e-05, "loss": 1.3334, "step": 17736 }, { "epoch": 0.46713194627337373, "grad_norm": 1.623986840248108, "learning_rate": 2.6656570977087175e-05, "loss": 1.9714, "step": 17737 }, { "epoch": 
0.4671582828548854, "grad_norm": 1.9798483848571777, "learning_rate": 2.6655254148011587e-05, "loss": 2.3071, "step": 17738 }, { "epoch": 0.46718461943639716, "grad_norm": 2.107879877090454, "learning_rate": 2.6653937318936e-05, "loss": 2.0739, "step": 17739 }, { "epoch": 0.46721095601790885, "grad_norm": 1.8291912078857422, "learning_rate": 2.6652620489860418e-05, "loss": 0.6253, "step": 17740 }, { "epoch": 0.4672372925994206, "grad_norm": 1.672823429107666, "learning_rate": 2.665130366078483e-05, "loss": 1.4551, "step": 17741 }, { "epoch": 0.46726362918093234, "grad_norm": 3.9399280548095703, "learning_rate": 2.6649986831709246e-05, "loss": 0.9126, "step": 17742 }, { "epoch": 0.467289965762444, "grad_norm": 1.855733036994934, "learning_rate": 2.6648670002633658e-05, "loss": 0.7685, "step": 17743 }, { "epoch": 0.46731630234395577, "grad_norm": 1.9839106798171997, "learning_rate": 2.6647353173558077e-05, "loss": 0.8939, "step": 17744 }, { "epoch": 0.46734263892546746, "grad_norm": 1.5897464752197266, "learning_rate": 2.664603634448249e-05, "loss": 2.4577, "step": 17745 }, { "epoch": 0.4673689755069792, "grad_norm": 1.4755569696426392, "learning_rate": 2.6644719515406902e-05, "loss": 1.9235, "step": 17746 }, { "epoch": 0.4673953120884909, "grad_norm": 4.208126068115234, "learning_rate": 2.6643402686331314e-05, "loss": 0.7671, "step": 17747 }, { "epoch": 0.46742164867000263, "grad_norm": 2.061633348464966, "learning_rate": 2.6642085857255726e-05, "loss": 1.8405, "step": 17748 }, { "epoch": 0.4674479852515144, "grad_norm": 4.189487457275391, "learning_rate": 2.6640769028180145e-05, "loss": 0.8534, "step": 17749 }, { "epoch": 0.46747432183302606, "grad_norm": 3.2876577377319336, "learning_rate": 2.6639452199104557e-05, "loss": 1.3976, "step": 17750 }, { "epoch": 0.4675006584145378, "grad_norm": 2.196082830429077, "learning_rate": 2.6638135370028973e-05, "loss": 2.6455, "step": 17751 }, { "epoch": 0.4675269949960495, "grad_norm": 2.737238883972168, "learning_rate": 
2.6636818540953385e-05, "loss": 1.1856, "step": 17752 }, { "epoch": 0.46755333157756124, "grad_norm": 4.558173656463623, "learning_rate": 2.6635501711877797e-05, "loss": 1.3998, "step": 17753 }, { "epoch": 0.4675796681590729, "grad_norm": 2.6295478343963623, "learning_rate": 2.6634184882802216e-05, "loss": 0.7673, "step": 17754 }, { "epoch": 0.46760600474058467, "grad_norm": 2.6075387001037598, "learning_rate": 2.663286805372663e-05, "loss": 1.9034, "step": 17755 }, { "epoch": 0.4676323413220964, "grad_norm": 2.605494260787964, "learning_rate": 2.663155122465104e-05, "loss": 2.1248, "step": 17756 }, { "epoch": 0.4676586779036081, "grad_norm": 2.8191702365875244, "learning_rate": 2.6630234395575453e-05, "loss": 1.8037, "step": 17757 }, { "epoch": 0.46768501448511984, "grad_norm": 1.878612756729126, "learning_rate": 2.662891756649987e-05, "loss": 1.8943, "step": 17758 }, { "epoch": 0.46771135106663153, "grad_norm": 1.842537522315979, "learning_rate": 2.6627600737424284e-05, "loss": 1.8705, "step": 17759 }, { "epoch": 0.4677376876481433, "grad_norm": 1.7836134433746338, "learning_rate": 2.66262839083487e-05, "loss": 0.5769, "step": 17760 }, { "epoch": 0.467764024229655, "grad_norm": 1.5259770154953003, "learning_rate": 2.6624967079273112e-05, "loss": 0.2331, "step": 17761 }, { "epoch": 0.4677903608111667, "grad_norm": 2.6390223503112793, "learning_rate": 2.6623650250197524e-05, "loss": 2.0226, "step": 17762 }, { "epoch": 0.46781669739267845, "grad_norm": 2.742978572845459, "learning_rate": 2.6622333421121943e-05, "loss": 2.0972, "step": 17763 }, { "epoch": 0.46784303397419014, "grad_norm": 2.2002484798431396, "learning_rate": 2.6621016592046355e-05, "loss": 1.7524, "step": 17764 }, { "epoch": 0.4678693705557019, "grad_norm": 1.9573888778686523, "learning_rate": 2.6619699762970768e-05, "loss": 1.6812, "step": 17765 }, { "epoch": 0.46789570713721357, "grad_norm": 1.5877116918563843, "learning_rate": 2.661838293389518e-05, "loss": 1.8839, "step": 17766 }, { "epoch": 
0.4679220437187253, "grad_norm": 2.439161777496338, "learning_rate": 2.6617066104819592e-05, "loss": 1.6529, "step": 17767 }, { "epoch": 0.46794838030023705, "grad_norm": 1.6073988676071167, "learning_rate": 2.661574927574401e-05, "loss": 1.499, "step": 17768 }, { "epoch": 0.46797471688174874, "grad_norm": 1.8914847373962402, "learning_rate": 2.6614432446668423e-05, "loss": 1.3774, "step": 17769 }, { "epoch": 0.4680010534632605, "grad_norm": 4.3826189041137695, "learning_rate": 2.661311561759284e-05, "loss": 0.4888, "step": 17770 }, { "epoch": 0.4680273900447722, "grad_norm": 1.7288110256195068, "learning_rate": 2.661179878851725e-05, "loss": 1.8116, "step": 17771 }, { "epoch": 0.4680537266262839, "grad_norm": 3.684248924255371, "learning_rate": 2.6610481959441663e-05, "loss": 1.4851, "step": 17772 }, { "epoch": 0.4680800632077956, "grad_norm": 1.9046207666397095, "learning_rate": 2.6609165130366082e-05, "loss": 1.6437, "step": 17773 }, { "epoch": 0.46810639978930735, "grad_norm": 3.1491525173187256, "learning_rate": 2.6607848301290494e-05, "loss": 1.2911, "step": 17774 }, { "epoch": 0.4681327363708191, "grad_norm": 3.6731884479522705, "learning_rate": 2.6606531472214907e-05, "loss": 2.0269, "step": 17775 }, { "epoch": 0.4681590729523308, "grad_norm": 2.171692132949829, "learning_rate": 2.660521464313932e-05, "loss": 2.4977, "step": 17776 }, { "epoch": 0.4681854095338425, "grad_norm": 2.751981019973755, "learning_rate": 2.6603897814063738e-05, "loss": 1.9733, "step": 17777 }, { "epoch": 0.4682117461153542, "grad_norm": 1.8141319751739502, "learning_rate": 2.660258098498815e-05, "loss": 1.6459, "step": 17778 }, { "epoch": 0.46823808269686595, "grad_norm": 2.73344087600708, "learning_rate": 2.6601264155912566e-05, "loss": 1.9443, "step": 17779 }, { "epoch": 0.46826441927837764, "grad_norm": 2.477538585662842, "learning_rate": 2.6599947326836978e-05, "loss": 1.2884, "step": 17780 }, { "epoch": 0.4682907558598894, "grad_norm": 1.8230829238891602, "learning_rate": 
2.659863049776139e-05, "loss": 2.0744, "step": 17781 }, { "epoch": 0.46831709244140113, "grad_norm": 2.1775026321411133, "learning_rate": 2.659731366868581e-05, "loss": 1.5417, "step": 17782 }, { "epoch": 0.4683434290229128, "grad_norm": 1.6220475435256958, "learning_rate": 2.659599683961022e-05, "loss": 1.8476, "step": 17783 }, { "epoch": 0.46836976560442456, "grad_norm": 1.5239176750183105, "learning_rate": 2.6594680010534633e-05, "loss": 1.5773, "step": 17784 }, { "epoch": 0.46839610218593625, "grad_norm": 1.610787034034729, "learning_rate": 2.6593363181459046e-05, "loss": 1.6851, "step": 17785 }, { "epoch": 0.468422438767448, "grad_norm": 4.847218990325928, "learning_rate": 2.6592046352383458e-05, "loss": 1.3044, "step": 17786 }, { "epoch": 0.4684487753489597, "grad_norm": 2.2366230487823486, "learning_rate": 2.6590729523307877e-05, "loss": 1.9313, "step": 17787 }, { "epoch": 0.4684751119304714, "grad_norm": 4.066692352294922, "learning_rate": 2.6589412694232292e-05, "loss": 1.9231, "step": 17788 }, { "epoch": 0.46850144851198317, "grad_norm": 2.2789418697357178, "learning_rate": 2.6588095865156705e-05, "loss": 0.8924, "step": 17789 }, { "epoch": 0.46852778509349485, "grad_norm": 1.9849071502685547, "learning_rate": 2.6586779036081117e-05, "loss": 1.7841, "step": 17790 }, { "epoch": 0.4685541216750066, "grad_norm": 1.7239867448806763, "learning_rate": 2.658546220700553e-05, "loss": 0.6548, "step": 17791 }, { "epoch": 0.4685804582565183, "grad_norm": 5.8985700607299805, "learning_rate": 2.6584145377929948e-05, "loss": 1.1502, "step": 17792 }, { "epoch": 0.46860679483803, "grad_norm": 1.953762173652649, "learning_rate": 2.658282854885436e-05, "loss": 0.5572, "step": 17793 }, { "epoch": 0.46863313141954177, "grad_norm": 1.599268913269043, "learning_rate": 2.6581511719778772e-05, "loss": 1.6468, "step": 17794 }, { "epoch": 0.46865946800105346, "grad_norm": 1.4985997676849365, "learning_rate": 2.6580194890703185e-05, "loss": 1.8841, "step": 17795 }, { "epoch": 
0.4686858045825652, "grad_norm": 1.9278223514556885, "learning_rate": 2.6578878061627604e-05, "loss": 1.6121, "step": 17796 }, { "epoch": 0.4687121411640769, "grad_norm": 2.5737249851226807, "learning_rate": 2.6577561232552016e-05, "loss": 2.1832, "step": 17797 }, { "epoch": 0.46873847774558863, "grad_norm": 2.342465400695801, "learning_rate": 2.657624440347643e-05, "loss": 2.401, "step": 17798 }, { "epoch": 0.4687648143271003, "grad_norm": 3.326110601425171, "learning_rate": 2.6574927574400844e-05, "loss": 0.7558, "step": 17799 }, { "epoch": 0.46879115090861206, "grad_norm": 1.8555642366409302, "learning_rate": 2.6573610745325256e-05, "loss": 1.6713, "step": 17800 }, { "epoch": 0.4688174874901238, "grad_norm": 1.4029220342636108, "learning_rate": 2.6572293916249675e-05, "loss": 1.1246, "step": 17801 }, { "epoch": 0.4688438240716355, "grad_norm": 7.696796417236328, "learning_rate": 2.6570977087174087e-05, "loss": 2.863, "step": 17802 }, { "epoch": 0.46887016065314724, "grad_norm": 1.6174547672271729, "learning_rate": 2.65696602580985e-05, "loss": 2.256, "step": 17803 }, { "epoch": 0.4688964972346589, "grad_norm": 4.064257621765137, "learning_rate": 2.656834342902291e-05, "loss": 1.4144, "step": 17804 }, { "epoch": 0.46892283381617067, "grad_norm": 1.9717520475387573, "learning_rate": 2.6567026599947327e-05, "loss": 1.9244, "step": 17805 }, { "epoch": 0.46894917039768236, "grad_norm": 2.2301363945007324, "learning_rate": 2.6565709770871743e-05, "loss": 1.712, "step": 17806 }, { "epoch": 0.4689755069791941, "grad_norm": 1.8241791725158691, "learning_rate": 2.6564392941796158e-05, "loss": 1.6233, "step": 17807 }, { "epoch": 0.46900184356070584, "grad_norm": 2.6408090591430664, "learning_rate": 2.656307611272057e-05, "loss": 2.1652, "step": 17808 }, { "epoch": 0.46902818014221753, "grad_norm": 2.1685352325439453, "learning_rate": 2.6561759283644983e-05, "loss": 1.9775, "step": 17809 }, { "epoch": 0.4690545167237293, "grad_norm": 3.425508499145508, "learning_rate": 
2.6560442454569402e-05, "loss": 2.2296, "step": 17810 }, { "epoch": 0.46908085330524096, "grad_norm": 1.4673404693603516, "learning_rate": 2.6559125625493814e-05, "loss": 1.8413, "step": 17811 }, { "epoch": 0.4691071898867527, "grad_norm": 1.9336313009262085, "learning_rate": 2.6557808796418226e-05, "loss": 2.6583, "step": 17812 }, { "epoch": 0.4691335264682644, "grad_norm": 1.85164213180542, "learning_rate": 2.655649196734264e-05, "loss": 2.2075, "step": 17813 }, { "epoch": 0.46915986304977614, "grad_norm": 1.7610957622528076, "learning_rate": 2.655517513826705e-05, "loss": 1.7066, "step": 17814 }, { "epoch": 0.4691861996312879, "grad_norm": 1.6094934940338135, "learning_rate": 2.655385830919147e-05, "loss": 1.6337, "step": 17815 }, { "epoch": 0.46921253621279957, "grad_norm": 1.860531210899353, "learning_rate": 2.6552541480115882e-05, "loss": 1.6072, "step": 17816 }, { "epoch": 0.4692388727943113, "grad_norm": 2.4694485664367676, "learning_rate": 2.6551224651040297e-05, "loss": 1.3012, "step": 17817 }, { "epoch": 0.469265209375823, "grad_norm": 1.6511313915252686, "learning_rate": 2.654990782196471e-05, "loss": 1.4186, "step": 17818 }, { "epoch": 0.46929154595733474, "grad_norm": 1.8920927047729492, "learning_rate": 2.6548590992889122e-05, "loss": 2.0614, "step": 17819 }, { "epoch": 0.46931788253884643, "grad_norm": 4.017232894897461, "learning_rate": 2.654727416381354e-05, "loss": 1.0195, "step": 17820 }, { "epoch": 0.4693442191203582, "grad_norm": 1.7876805067062378, "learning_rate": 2.6545957334737953e-05, "loss": 2.0411, "step": 17821 }, { "epoch": 0.4693705557018699, "grad_norm": 3.1118571758270264, "learning_rate": 2.6544640505662365e-05, "loss": 1.5229, "step": 17822 }, { "epoch": 0.4693968922833816, "grad_norm": 1.5109143257141113, "learning_rate": 2.6543323676586777e-05, "loss": 1.8546, "step": 17823 }, { "epoch": 0.46942322886489335, "grad_norm": 1.5621002912521362, "learning_rate": 2.6542006847511193e-05, "loss": 1.9555, "step": 17824 }, { "epoch": 
0.46944956544640504, "grad_norm": 1.9511710405349731, "learning_rate": 2.654069001843561e-05, "loss": 1.85, "step": 17825 }, { "epoch": 0.4694759020279168, "grad_norm": 2.2528293132781982, "learning_rate": 2.6539373189360024e-05, "loss": 1.5248, "step": 17826 }, { "epoch": 0.4695022386094285, "grad_norm": 1.5066628456115723, "learning_rate": 2.6538056360284436e-05, "loss": 1.6815, "step": 17827 }, { "epoch": 0.4695285751909402, "grad_norm": 1.8154276609420776, "learning_rate": 2.653673953120885e-05, "loss": 0.7288, "step": 17828 }, { "epoch": 0.46955491177245195, "grad_norm": 2.893597364425659, "learning_rate": 2.6535422702133268e-05, "loss": 1.793, "step": 17829 }, { "epoch": 0.46958124835396364, "grad_norm": 1.9779239892959595, "learning_rate": 2.653410587305768e-05, "loss": 1.9619, "step": 17830 }, { "epoch": 0.4696075849354754, "grad_norm": 4.148197174072266, "learning_rate": 2.6532789043982092e-05, "loss": 1.1131, "step": 17831 }, { "epoch": 0.4696339215169871, "grad_norm": 1.5699442625045776, "learning_rate": 2.6531472214906504e-05, "loss": 1.6655, "step": 17832 }, { "epoch": 0.4696602580984988, "grad_norm": 1.997094988822937, "learning_rate": 2.653015538583092e-05, "loss": 0.9221, "step": 17833 }, { "epoch": 0.46968659468001056, "grad_norm": 1.4420344829559326, "learning_rate": 2.6528838556755335e-05, "loss": 1.3992, "step": 17834 }, { "epoch": 0.46971293126152225, "grad_norm": 2.0375351905822754, "learning_rate": 2.652752172767975e-05, "loss": 1.6928, "step": 17835 }, { "epoch": 0.469739267843034, "grad_norm": 1.5903856754302979, "learning_rate": 2.6526204898604163e-05, "loss": 0.8805, "step": 17836 }, { "epoch": 0.4697656044245457, "grad_norm": 2.220182418823242, "learning_rate": 2.6524888069528575e-05, "loss": 1.5697, "step": 17837 }, { "epoch": 0.4697919410060574, "grad_norm": 4.313830375671387, "learning_rate": 2.6523571240452988e-05, "loss": 1.0573, "step": 17838 }, { "epoch": 0.4698182775875691, "grad_norm": 4.469368934631348, "learning_rate": 
2.6522254411377407e-05, "loss": 1.5517, "step": 17839 }, { "epoch": 0.46984461416908085, "grad_norm": 2.0591492652893066, "learning_rate": 2.652093758230182e-05, "loss": 2.056, "step": 17840 }, { "epoch": 0.4698709507505926, "grad_norm": 2.1079163551330566, "learning_rate": 2.651962075322623e-05, "loss": 1.9779, "step": 17841 }, { "epoch": 0.4698972873321043, "grad_norm": 2.601976156234741, "learning_rate": 2.6518303924150643e-05, "loss": 1.542, "step": 17842 }, { "epoch": 0.46992362391361603, "grad_norm": 3.2868494987487793, "learning_rate": 2.6516987095075062e-05, "loss": 0.8687, "step": 17843 }, { "epoch": 0.4699499604951277, "grad_norm": 2.5924289226531982, "learning_rate": 2.6515670265999474e-05, "loss": 0.5282, "step": 17844 }, { "epoch": 0.46997629707663946, "grad_norm": 1.372226595878601, "learning_rate": 2.651435343692389e-05, "loss": 1.534, "step": 17845 }, { "epoch": 0.47000263365815115, "grad_norm": 2.584397077560425, "learning_rate": 2.6513036607848302e-05, "loss": 1.6455, "step": 17846 }, { "epoch": 0.4700289702396629, "grad_norm": 1.9899561405181885, "learning_rate": 2.6511719778772714e-05, "loss": 2.0372, "step": 17847 }, { "epoch": 0.47005530682117463, "grad_norm": 1.6505098342895508, "learning_rate": 2.6510402949697133e-05, "loss": 0.5741, "step": 17848 }, { "epoch": 0.4700816434026863, "grad_norm": 2.215165376663208, "learning_rate": 2.6509086120621546e-05, "loss": 1.6832, "step": 17849 }, { "epoch": 0.47010797998419807, "grad_norm": 1.9001377820968628, "learning_rate": 2.6507769291545958e-05, "loss": 1.8595, "step": 17850 }, { "epoch": 0.47013431656570975, "grad_norm": 1.6046241521835327, "learning_rate": 2.650645246247037e-05, "loss": 0.3189, "step": 17851 }, { "epoch": 0.4701606531472215, "grad_norm": 1.8992936611175537, "learning_rate": 2.6505135633394786e-05, "loss": 1.89, "step": 17852 }, { "epoch": 0.4701869897287332, "grad_norm": 2.1740028858184814, "learning_rate": 2.65038188043192e-05, "loss": 0.4599, "step": 17853 }, { "epoch": 
0.47021332631024493, "grad_norm": 1.810161828994751, "learning_rate": 2.6502501975243617e-05, "loss": 1.7993, "step": 17854 }, { "epoch": 0.47023966289175667, "grad_norm": 3.468501091003418, "learning_rate": 2.650118514616803e-05, "loss": 0.9058, "step": 17855 }, { "epoch": 0.47026599947326836, "grad_norm": 1.8690470457077026, "learning_rate": 2.649986831709244e-05, "loss": 1.6319, "step": 17856 }, { "epoch": 0.4702923360547801, "grad_norm": 1.8879598379135132, "learning_rate": 2.6498551488016853e-05, "loss": 1.692, "step": 17857 }, { "epoch": 0.4703186726362918, "grad_norm": 2.298372983932495, "learning_rate": 2.6497234658941272e-05, "loss": 1.6616, "step": 17858 }, { "epoch": 0.47034500921780353, "grad_norm": 2.492218017578125, "learning_rate": 2.6495917829865685e-05, "loss": 1.9995, "step": 17859 }, { "epoch": 0.4703713457993152, "grad_norm": 1.637429118156433, "learning_rate": 2.6494601000790097e-05, "loss": 2.4575, "step": 17860 }, { "epoch": 0.47039768238082696, "grad_norm": 3.52032208442688, "learning_rate": 2.6493284171714512e-05, "loss": 1.7719, "step": 17861 }, { "epoch": 0.4704240189623387, "grad_norm": 2.648847818374634, "learning_rate": 2.6491967342638928e-05, "loss": 1.8128, "step": 17862 }, { "epoch": 0.4704503555438504, "grad_norm": 1.9688808917999268, "learning_rate": 2.6490650513563344e-05, "loss": 1.785, "step": 17863 }, { "epoch": 0.47047669212536214, "grad_norm": 2.3816568851470947, "learning_rate": 2.6489333684487756e-05, "loss": 1.5061, "step": 17864 }, { "epoch": 0.4705030287068738, "grad_norm": 1.7334983348846436, "learning_rate": 2.6488016855412168e-05, "loss": 1.4209, "step": 17865 }, { "epoch": 0.47052936528838557, "grad_norm": 1.994781732559204, "learning_rate": 2.648670002633658e-05, "loss": 1.7904, "step": 17866 }, { "epoch": 0.4705557018698973, "grad_norm": 1.6072520017623901, "learning_rate": 2.6485383197261e-05, "loss": 1.8495, "step": 17867 }, { "epoch": 0.470582038451409, "grad_norm": 1.9076699018478394, "learning_rate": 
2.648406636818541e-05, "loss": 1.8467, "step": 17868 }, { "epoch": 0.47060837503292074, "grad_norm": 2.63183331489563, "learning_rate": 2.6482749539109824e-05, "loss": 0.3947, "step": 17869 }, { "epoch": 0.47063471161443243, "grad_norm": 2.840358257293701, "learning_rate": 2.6481432710034236e-05, "loss": 1.7305, "step": 17870 }, { "epoch": 0.4706610481959442, "grad_norm": 1.6165119409561157, "learning_rate": 2.648011588095865e-05, "loss": 1.5707, "step": 17871 }, { "epoch": 0.47068738477745586, "grad_norm": 1.985432744026184, "learning_rate": 2.6478799051883067e-05, "loss": 2.2516, "step": 17872 }, { "epoch": 0.4707137213589676, "grad_norm": 2.791285991668701, "learning_rate": 2.6477482222807483e-05, "loss": 2.2673, "step": 17873 }, { "epoch": 0.47074005794047935, "grad_norm": 2.594158172607422, "learning_rate": 2.6476165393731895e-05, "loss": 1.5482, "step": 17874 }, { "epoch": 0.47076639452199104, "grad_norm": 1.4221909046173096, "learning_rate": 2.6474848564656307e-05, "loss": 1.4854, "step": 17875 }, { "epoch": 0.4707927311035028, "grad_norm": 1.8340985774993896, "learning_rate": 2.6473531735580726e-05, "loss": 2.3747, "step": 17876 }, { "epoch": 0.47081906768501447, "grad_norm": 1.8144675493240356, "learning_rate": 2.647221490650514e-05, "loss": 2.0887, "step": 17877 }, { "epoch": 0.4708454042665262, "grad_norm": 1.6621029376983643, "learning_rate": 2.647089807742955e-05, "loss": 1.6556, "step": 17878 }, { "epoch": 0.4708717408480379, "grad_norm": 4.519250392913818, "learning_rate": 2.6469581248353963e-05, "loss": 1.6906, "step": 17879 }, { "epoch": 0.47089807742954964, "grad_norm": 2.018160820007324, "learning_rate": 2.646826441927838e-05, "loss": 1.36, "step": 17880 }, { "epoch": 0.4709244140110614, "grad_norm": 5.2710185050964355, "learning_rate": 2.6466947590202794e-05, "loss": 1.416, "step": 17881 }, { "epoch": 0.4709507505925731, "grad_norm": 3.0456268787384033, "learning_rate": 2.646563076112721e-05, "loss": 1.1083, "step": 17882 }, { "epoch": 
0.4709770871740848, "grad_norm": 1.931447982788086, "learning_rate": 2.6464313932051622e-05, "loss": 2.1125, "step": 17883 }, { "epoch": 0.4710034237555965, "grad_norm": 1.7303372621536255, "learning_rate": 2.6462997102976034e-05, "loss": 0.5131, "step": 17884 }, { "epoch": 0.47102976033710825, "grad_norm": 1.66446852684021, "learning_rate": 2.6461680273900446e-05, "loss": 1.5977, "step": 17885 }, { "epoch": 0.47105609691861994, "grad_norm": 3.5797624588012695, "learning_rate": 2.6460363444824865e-05, "loss": 1.2369, "step": 17886 }, { "epoch": 0.4710824335001317, "grad_norm": 1.716167688369751, "learning_rate": 2.6459046615749277e-05, "loss": 0.5763, "step": 17887 }, { "epoch": 0.4711087700816434, "grad_norm": 1.7607096433639526, "learning_rate": 2.645772978667369e-05, "loss": 1.8462, "step": 17888 }, { "epoch": 0.4711351066631551, "grad_norm": 3.1781349182128906, "learning_rate": 2.6456412957598102e-05, "loss": 1.4719, "step": 17889 }, { "epoch": 0.47116144324466686, "grad_norm": 4.016254425048828, "learning_rate": 2.6455096128522517e-05, "loss": 0.9295, "step": 17890 }, { "epoch": 0.47118777982617854, "grad_norm": 1.9094973802566528, "learning_rate": 2.6453779299446933e-05, "loss": 1.9185, "step": 17891 }, { "epoch": 0.4712141164076903, "grad_norm": 2.1751456260681152, "learning_rate": 2.645246247037135e-05, "loss": 1.6324, "step": 17892 }, { "epoch": 0.471240452989202, "grad_norm": 1.5610445737838745, "learning_rate": 2.645114564129576e-05, "loss": 1.6727, "step": 17893 }, { "epoch": 0.4712667895707137, "grad_norm": 2.7875514030456543, "learning_rate": 2.6449828812220173e-05, "loss": 1.2816, "step": 17894 }, { "epoch": 0.47129312615222546, "grad_norm": 1.7084176540374756, "learning_rate": 2.6448511983144592e-05, "loss": 1.4615, "step": 17895 }, { "epoch": 0.47131946273373715, "grad_norm": 2.6210553646087646, "learning_rate": 2.6447195154069004e-05, "loss": 0.7052, "step": 17896 }, { "epoch": 0.4713457993152489, "grad_norm": 4.016355514526367, "learning_rate": 
2.6445878324993416e-05, "loss": 1.0527, "step": 17897 }, { "epoch": 0.4713721358967606, "grad_norm": 1.9485079050064087, "learning_rate": 2.644456149591783e-05, "loss": 1.3472, "step": 17898 }, { "epoch": 0.4713984724782723, "grad_norm": 1.6094517707824707, "learning_rate": 2.6443244666842244e-05, "loss": 1.1668, "step": 17899 }, { "epoch": 0.47142480905978407, "grad_norm": 1.8517144918441772, "learning_rate": 2.644192783776666e-05, "loss": 1.6535, "step": 17900 }, { "epoch": 0.47145114564129575, "grad_norm": 2.763026714324951, "learning_rate": 2.6440611008691075e-05, "loss": 1.4338, "step": 17901 }, { "epoch": 0.4714774822228075, "grad_norm": 1.8545913696289062, "learning_rate": 2.6439294179615488e-05, "loss": 2.0696, "step": 17902 }, { "epoch": 0.4715038188043192, "grad_norm": 4.549075603485107, "learning_rate": 2.64379773505399e-05, "loss": 2.1094, "step": 17903 }, { "epoch": 0.47153015538583093, "grad_norm": 1.748066782951355, "learning_rate": 2.6436660521464312e-05, "loss": 1.3943, "step": 17904 }, { "epoch": 0.4715564919673426, "grad_norm": 5.617864608764648, "learning_rate": 2.643534369238873e-05, "loss": 2.3394, "step": 17905 }, { "epoch": 0.47158282854885436, "grad_norm": 2.0360803604125977, "learning_rate": 2.6434026863313143e-05, "loss": 1.8966, "step": 17906 }, { "epoch": 0.4716091651303661, "grad_norm": 2.6220288276672363, "learning_rate": 2.6432710034237555e-05, "loss": 2.1283, "step": 17907 }, { "epoch": 0.4716355017118778, "grad_norm": 2.480895519256592, "learning_rate": 2.643139320516197e-05, "loss": 2.0587, "step": 17908 }, { "epoch": 0.47166183829338953, "grad_norm": 2.9045658111572266, "learning_rate": 2.6430076376086387e-05, "loss": 1.8739, "step": 17909 }, { "epoch": 0.4716881748749012, "grad_norm": 1.576075553894043, "learning_rate": 2.6428759547010802e-05, "loss": 1.7105, "step": 17910 }, { "epoch": 0.47171451145641297, "grad_norm": 1.9509483575820923, "learning_rate": 2.6427442717935214e-05, "loss": 1.4462, "step": 17911 }, { "epoch": 
0.47174084803792465, "grad_norm": 2.1344716548919678, "learning_rate": 2.6426125888859627e-05, "loss": 2.1516, "step": 17912 }, { "epoch": 0.4717671846194364, "grad_norm": 2.135622978210449, "learning_rate": 2.642480905978404e-05, "loss": 1.8478, "step": 17913 }, { "epoch": 0.47179352120094814, "grad_norm": 3.6220579147338867, "learning_rate": 2.6423492230708458e-05, "loss": 2.0593, "step": 17914 }, { "epoch": 0.47181985778245983, "grad_norm": 2.3813655376434326, "learning_rate": 2.642217540163287e-05, "loss": 1.8815, "step": 17915 }, { "epoch": 0.47184619436397157, "grad_norm": 1.762906551361084, "learning_rate": 2.6420858572557282e-05, "loss": 1.7766, "step": 17916 }, { "epoch": 0.47187253094548326, "grad_norm": 2.101269483566284, "learning_rate": 2.6419541743481694e-05, "loss": 1.9748, "step": 17917 }, { "epoch": 0.471898867526995, "grad_norm": 4.175267219543457, "learning_rate": 2.641822491440611e-05, "loss": 1.6209, "step": 17918 }, { "epoch": 0.4719252041085067, "grad_norm": 1.826284408569336, "learning_rate": 2.6416908085330526e-05, "loss": 1.7092, "step": 17919 }, { "epoch": 0.47195154069001843, "grad_norm": 2.0861289501190186, "learning_rate": 2.641559125625494e-05, "loss": 1.2414, "step": 17920 }, { "epoch": 0.4719778772715302, "grad_norm": 2.045217275619507, "learning_rate": 2.6414274427179353e-05, "loss": 1.5074, "step": 17921 }, { "epoch": 0.47200421385304187, "grad_norm": 4.767573356628418, "learning_rate": 2.6412957598103766e-05, "loss": 1.2399, "step": 17922 }, { "epoch": 0.4720305504345536, "grad_norm": 1.6747349500656128, "learning_rate": 2.6411640769028178e-05, "loss": 1.9176, "step": 17923 }, { "epoch": 0.4720568870160653, "grad_norm": 2.0052506923675537, "learning_rate": 2.6410323939952597e-05, "loss": 2.047, "step": 17924 }, { "epoch": 0.47208322359757704, "grad_norm": 1.6829941272735596, "learning_rate": 2.640900711087701e-05, "loss": 0.3368, "step": 17925 }, { "epoch": 0.4721095601790887, "grad_norm": 2.3700201511383057, "learning_rate": 
2.640769028180142e-05, "loss": 0.8107, "step": 17926 }, { "epoch": 0.47213589676060047, "grad_norm": 1.6656286716461182, "learning_rate": 2.6406373452725837e-05, "loss": 0.4055, "step": 17927 }, { "epoch": 0.4721622333421122, "grad_norm": 1.7245970964431763, "learning_rate": 2.6405056623650252e-05, "loss": 1.5221, "step": 17928 }, { "epoch": 0.4721885699236239, "grad_norm": 2.6271071434020996, "learning_rate": 2.6403739794574668e-05, "loss": 1.541, "step": 17929 }, { "epoch": 0.47221490650513565, "grad_norm": 3.7257206439971924, "learning_rate": 2.640242296549908e-05, "loss": 0.2719, "step": 17930 }, { "epoch": 0.47224124308664733, "grad_norm": 1.7708683013916016, "learning_rate": 2.6401106136423493e-05, "loss": 1.7959, "step": 17931 }, { "epoch": 0.4722675796681591, "grad_norm": 1.5768908262252808, "learning_rate": 2.6399789307347905e-05, "loss": 0.3727, "step": 17932 }, { "epoch": 0.4722939162496708, "grad_norm": 1.5334714651107788, "learning_rate": 2.6398472478272324e-05, "loss": 2.0055, "step": 17933 }, { "epoch": 0.4723202528311825, "grad_norm": 1.5198733806610107, "learning_rate": 2.6397155649196736e-05, "loss": 1.9119, "step": 17934 }, { "epoch": 0.47234658941269425, "grad_norm": 2.3848814964294434, "learning_rate": 2.6395838820121148e-05, "loss": 1.7405, "step": 17935 }, { "epoch": 0.47237292599420594, "grad_norm": 2.7062489986419678, "learning_rate": 2.6394521991045564e-05, "loss": 0.9261, "step": 17936 }, { "epoch": 0.4723992625757177, "grad_norm": 1.8381402492523193, "learning_rate": 2.6393205161969976e-05, "loss": 2.0577, "step": 17937 }, { "epoch": 0.47242559915722937, "grad_norm": 3.229065418243408, "learning_rate": 2.6391888332894395e-05, "loss": 1.9767, "step": 17938 }, { "epoch": 0.4724519357387411, "grad_norm": 1.5846070051193237, "learning_rate": 2.6390571503818807e-05, "loss": 0.4382, "step": 17939 }, { "epoch": 0.47247827232025286, "grad_norm": 2.2891552448272705, "learning_rate": 2.638925467474322e-05, "loss": 1.0503, "step": 17940 }, { 
"epoch": 0.47250460890176454, "grad_norm": 1.9028269052505493, "learning_rate": 2.638793784566763e-05, "loss": 1.7442, "step": 17941 }, { "epoch": 0.4725309454832763, "grad_norm": 3.3676249980926514, "learning_rate": 2.638662101659205e-05, "loss": 1.8799, "step": 17942 }, { "epoch": 0.472557282064788, "grad_norm": 1.905550241470337, "learning_rate": 2.6385304187516463e-05, "loss": 1.6442, "step": 17943 }, { "epoch": 0.4725836186462997, "grad_norm": 1.5947978496551514, "learning_rate": 2.6383987358440875e-05, "loss": 1.7902, "step": 17944 }, { "epoch": 0.4726099552278114, "grad_norm": 2.3338849544525146, "learning_rate": 2.6382670529365287e-05, "loss": 1.7916, "step": 17945 }, { "epoch": 0.47263629180932315, "grad_norm": 3.3827695846557617, "learning_rate": 2.6381353700289703e-05, "loss": 0.7726, "step": 17946 }, { "epoch": 0.4726626283908349, "grad_norm": 2.1535520553588867, "learning_rate": 2.638003687121412e-05, "loss": 2.0259, "step": 17947 }, { "epoch": 0.4726889649723466, "grad_norm": 2.0482892990112305, "learning_rate": 2.6378720042138534e-05, "loss": 2.9368, "step": 17948 }, { "epoch": 0.4727153015538583, "grad_norm": 2.736971855163574, "learning_rate": 2.6377403213062946e-05, "loss": 0.9796, "step": 17949 }, { "epoch": 0.47274163813537, "grad_norm": 3.0548458099365234, "learning_rate": 2.637608638398736e-05, "loss": 1.293, "step": 17950 }, { "epoch": 0.47276797471688176, "grad_norm": 3.1834046840667725, "learning_rate": 2.637476955491177e-05, "loss": 1.6554, "step": 17951 }, { "epoch": 0.47279431129839344, "grad_norm": 1.7934788465499878, "learning_rate": 2.637345272583619e-05, "loss": 1.4249, "step": 17952 }, { "epoch": 0.4728206478799052, "grad_norm": 3.496013641357422, "learning_rate": 2.6372135896760602e-05, "loss": 0.8848, "step": 17953 }, { "epoch": 0.47284698446141693, "grad_norm": 1.5670204162597656, "learning_rate": 2.6370819067685014e-05, "loss": 1.9357, "step": 17954 }, { "epoch": 0.4728733210429286, "grad_norm": 1.8909703493118286, 
"learning_rate": 2.636950223860943e-05, "loss": 1.8578, "step": 17955 }, { "epoch": 0.47289965762444036, "grad_norm": 2.0545642375946045, "learning_rate": 2.6368185409533845e-05, "loss": 2.5095, "step": 17956 }, { "epoch": 0.47292599420595205, "grad_norm": 2.18129825592041, "learning_rate": 2.636686858045826e-05, "loss": 1.8496, "step": 17957 }, { "epoch": 0.4729523307874638, "grad_norm": 2.0868897438049316, "learning_rate": 2.6365551751382673e-05, "loss": 1.4307, "step": 17958 }, { "epoch": 0.4729786673689755, "grad_norm": 3.3818743228912354, "learning_rate": 2.6364234922307085e-05, "loss": 1.6019, "step": 17959 }, { "epoch": 0.4730050039504872, "grad_norm": 2.294980525970459, "learning_rate": 2.6362918093231497e-05, "loss": 1.5046, "step": 17960 }, { "epoch": 0.47303134053199897, "grad_norm": 3.9720730781555176, "learning_rate": 2.6361601264155916e-05, "loss": 1.7485, "step": 17961 }, { "epoch": 0.47305767711351066, "grad_norm": 1.6498215198516846, "learning_rate": 2.636028443508033e-05, "loss": 1.7652, "step": 17962 }, { "epoch": 0.4730840136950224, "grad_norm": 4.694159030914307, "learning_rate": 2.635896760600474e-05, "loss": 0.8819, "step": 17963 }, { "epoch": 0.4731103502765341, "grad_norm": 3.2613956928253174, "learning_rate": 2.6357650776929156e-05, "loss": 1.0847, "step": 17964 }, { "epoch": 0.47313668685804583, "grad_norm": 1.9620249271392822, "learning_rate": 2.635633394785357e-05, "loss": 1.7778, "step": 17965 }, { "epoch": 0.4731630234395576, "grad_norm": 1.7562206983566284, "learning_rate": 2.6355017118777988e-05, "loss": 1.9489, "step": 17966 }, { "epoch": 0.47318936002106926, "grad_norm": 2.8412766456604004, "learning_rate": 2.63537002897024e-05, "loss": 1.81, "step": 17967 }, { "epoch": 0.473215696602581, "grad_norm": 1.8545047044754028, "learning_rate": 2.6352383460626812e-05, "loss": 1.7426, "step": 17968 }, { "epoch": 0.4732420331840927, "grad_norm": 2.510364294052124, "learning_rate": 2.6351066631551224e-05, "loss": 2.2014, "step": 17969 }, { 
"epoch": 0.47326836976560444, "grad_norm": 1.8358944654464722, "learning_rate": 2.6349749802475636e-05, "loss": 1.4303, "step": 17970 }, { "epoch": 0.4732947063471161, "grad_norm": 1.7235865592956543, "learning_rate": 2.6348432973400055e-05, "loss": 2.365, "step": 17971 }, { "epoch": 0.47332104292862787, "grad_norm": 1.8355944156646729, "learning_rate": 2.6347116144324468e-05, "loss": 1.6013, "step": 17972 }, { "epoch": 0.4733473795101396, "grad_norm": 2.8692994117736816, "learning_rate": 2.634579931524888e-05, "loss": 1.0572, "step": 17973 }, { "epoch": 0.4733737160916513, "grad_norm": 1.9615451097488403, "learning_rate": 2.6344482486173295e-05, "loss": 1.7517, "step": 17974 }, { "epoch": 0.47340005267316304, "grad_norm": 1.7628802061080933, "learning_rate": 2.634316565709771e-05, "loss": 1.8108, "step": 17975 }, { "epoch": 0.47342638925467473, "grad_norm": 1.5804575681686401, "learning_rate": 2.6341848828022127e-05, "loss": 1.7977, "step": 17976 }, { "epoch": 0.47345272583618647, "grad_norm": 4.294353008270264, "learning_rate": 2.634053199894654e-05, "loss": 1.0865, "step": 17977 }, { "epoch": 0.47347906241769816, "grad_norm": 3.211378335952759, "learning_rate": 2.633921516987095e-05, "loss": 0.8451, "step": 17978 }, { "epoch": 0.4735053989992099, "grad_norm": 1.733656644821167, "learning_rate": 2.6337898340795363e-05, "loss": 1.308, "step": 17979 }, { "epoch": 0.47353173558072165, "grad_norm": 1.7178661823272705, "learning_rate": 2.6336581511719782e-05, "loss": 1.8142, "step": 17980 }, { "epoch": 0.47355807216223333, "grad_norm": 2.3543434143066406, "learning_rate": 2.6335264682644194e-05, "loss": 1.7021, "step": 17981 }, { "epoch": 0.4735844087437451, "grad_norm": 3.654123306274414, "learning_rate": 2.6333947853568607e-05, "loss": 2.5898, "step": 17982 }, { "epoch": 0.47361074532525677, "grad_norm": 4.701235771179199, "learning_rate": 2.6332631024493022e-05, "loss": 1.4104, "step": 17983 }, { "epoch": 0.4736370819067685, "grad_norm": 3.404085874557495, 
"learning_rate": 2.6331314195417434e-05, "loss": 0.8055, "step": 17984 }, { "epoch": 0.4736634184882802, "grad_norm": 1.8302545547485352, "learning_rate": 2.6329997366341853e-05, "loss": 1.7302, "step": 17985 }, { "epoch": 0.47368975506979194, "grad_norm": 1.708420753479004, "learning_rate": 2.6328680537266266e-05, "loss": 2.1536, "step": 17986 }, { "epoch": 0.4737160916513037, "grad_norm": 2.7158589363098145, "learning_rate": 2.6327363708190678e-05, "loss": 2.1039, "step": 17987 }, { "epoch": 0.47374242823281537, "grad_norm": 1.9999464750289917, "learning_rate": 2.632604687911509e-05, "loss": 0.6847, "step": 17988 }, { "epoch": 0.4737687648143271, "grad_norm": 2.6629791259765625, "learning_rate": 2.632473005003951e-05, "loss": 2.8301, "step": 17989 }, { "epoch": 0.4737951013958388, "grad_norm": 2.133427143096924, "learning_rate": 2.632341322096392e-05, "loss": 1.8183, "step": 17990 }, { "epoch": 0.47382143797735055, "grad_norm": 2.132113218307495, "learning_rate": 2.6322096391888334e-05, "loss": 0.4059, "step": 17991 }, { "epoch": 0.47384777455886223, "grad_norm": 2.1298694610595703, "learning_rate": 2.6320779562812746e-05, "loss": 1.7905, "step": 17992 }, { "epoch": 0.473874111140374, "grad_norm": 2.297606945037842, "learning_rate": 2.631946273373716e-05, "loss": 1.2471, "step": 17993 }, { "epoch": 0.4739004477218857, "grad_norm": 2.932337760925293, "learning_rate": 2.6318145904661577e-05, "loss": 0.6356, "step": 17994 }, { "epoch": 0.4739267843033974, "grad_norm": 2.2184042930603027, "learning_rate": 2.6316829075585993e-05, "loss": 1.4082, "step": 17995 }, { "epoch": 0.47395312088490915, "grad_norm": 1.7243010997772217, "learning_rate": 2.6315512246510405e-05, "loss": 1.5012, "step": 17996 }, { "epoch": 0.47397945746642084, "grad_norm": 1.9842314720153809, "learning_rate": 2.6314195417434817e-05, "loss": 1.4877, "step": 17997 }, { "epoch": 0.4740057940479326, "grad_norm": 2.626722574234009, "learning_rate": 2.631287858835923e-05, "loss": 2.0232, "step": 17998 }, 
{ "epoch": 0.47403213062944427, "grad_norm": 2.446789264678955, "learning_rate": 2.6311561759283648e-05, "loss": 1.7035, "step": 17999 }, { "epoch": 0.474058467210956, "grad_norm": 3.0998497009277344, "learning_rate": 2.631024493020806e-05, "loss": 1.8519, "step": 18000 }, { "epoch": 0.47408480379246776, "grad_norm": 3.261470317840576, "learning_rate": 2.6308928101132473e-05, "loss": 1.7502, "step": 18001 }, { "epoch": 0.47411114037397944, "grad_norm": 3.209693431854248, "learning_rate": 2.6307611272056888e-05, "loss": 0.7119, "step": 18002 }, { "epoch": 0.4741374769554912, "grad_norm": 1.4962162971496582, "learning_rate": 2.63062944429813e-05, "loss": 1.766, "step": 18003 }, { "epoch": 0.4741638135370029, "grad_norm": 1.7053147554397583, "learning_rate": 2.630497761390572e-05, "loss": 1.629, "step": 18004 }, { "epoch": 0.4741901501185146, "grad_norm": 3.461153268814087, "learning_rate": 2.630366078483013e-05, "loss": 1.2987, "step": 18005 }, { "epoch": 0.47421648670002636, "grad_norm": 5.332024574279785, "learning_rate": 2.6302343955754544e-05, "loss": 1.4677, "step": 18006 }, { "epoch": 0.47424282328153805, "grad_norm": 3.26020884513855, "learning_rate": 2.6301027126678956e-05, "loss": 2.0604, "step": 18007 }, { "epoch": 0.4742691598630498, "grad_norm": 1.7316187620162964, "learning_rate": 2.6299710297603375e-05, "loss": 1.8331, "step": 18008 }, { "epoch": 0.4742954964445615, "grad_norm": 3.970562219619751, "learning_rate": 2.6298393468527787e-05, "loss": 1.4967, "step": 18009 }, { "epoch": 0.4743218330260732, "grad_norm": 1.3757078647613525, "learning_rate": 2.62970766394522e-05, "loss": 2.077, "step": 18010 }, { "epoch": 0.4743481696075849, "grad_norm": 2.3905718326568604, "learning_rate": 2.6295759810376615e-05, "loss": 1.4518, "step": 18011 }, { "epoch": 0.47437450618909666, "grad_norm": 4.833202362060547, "learning_rate": 2.6294442981301027e-05, "loss": 2.1774, "step": 18012 }, { "epoch": 0.4744008427706084, "grad_norm": 2.995392322540283, "learning_rate": 
2.6293126152225446e-05, "loss": 1.8886, "step": 18013 }, { "epoch": 0.4744271793521201, "grad_norm": 1.6298213005065918, "learning_rate": 2.629180932314986e-05, "loss": 1.6117, "step": 18014 }, { "epoch": 0.47445351593363183, "grad_norm": 2.83526611328125, "learning_rate": 2.629049249407427e-05, "loss": 0.5354, "step": 18015 }, { "epoch": 0.4744798525151435, "grad_norm": 2.0443403720855713, "learning_rate": 2.6289175664998683e-05, "loss": 1.8038, "step": 18016 }, { "epoch": 0.47450618909665526, "grad_norm": 2.0520477294921875, "learning_rate": 2.6287858835923095e-05, "loss": 0.98, "step": 18017 }, { "epoch": 0.47453252567816695, "grad_norm": 1.6275315284729004, "learning_rate": 2.6286542006847514e-05, "loss": 1.947, "step": 18018 }, { "epoch": 0.4745588622596787, "grad_norm": 1.6136394739151, "learning_rate": 2.6285225177771926e-05, "loss": 1.8218, "step": 18019 }, { "epoch": 0.47458519884119044, "grad_norm": 3.588017225265503, "learning_rate": 2.628390834869634e-05, "loss": 1.897, "step": 18020 }, { "epoch": 0.4746115354227021, "grad_norm": 1.9499776363372803, "learning_rate": 2.6282591519620754e-05, "loss": 2.0482, "step": 18021 }, { "epoch": 0.47463787200421387, "grad_norm": 1.7800002098083496, "learning_rate": 2.628127469054517e-05, "loss": 1.8747, "step": 18022 }, { "epoch": 0.47466420858572556, "grad_norm": 4.833249568939209, "learning_rate": 2.6279957861469585e-05, "loss": 1.8213, "step": 18023 }, { "epoch": 0.4746905451672373, "grad_norm": 2.453986883163452, "learning_rate": 2.6278641032393997e-05, "loss": 1.0012, "step": 18024 }, { "epoch": 0.474716881748749, "grad_norm": 2.8022005558013916, "learning_rate": 2.627732420331841e-05, "loss": 0.8829, "step": 18025 }, { "epoch": 0.47474321833026073, "grad_norm": 1.5881845951080322, "learning_rate": 2.6276007374242822e-05, "loss": 1.8127, "step": 18026 }, { "epoch": 0.4747695549117725, "grad_norm": 2.413527011871338, "learning_rate": 2.627469054516724e-05, "loss": 2.3022, "step": 18027 }, { "epoch": 
0.47479589149328416, "grad_norm": 1.7777903079986572, "learning_rate": 2.6273373716091653e-05, "loss": 0.6391, "step": 18028 }, { "epoch": 0.4748222280747959, "grad_norm": 2.0796475410461426, "learning_rate": 2.6272056887016065e-05, "loss": 1.2444, "step": 18029 }, { "epoch": 0.4748485646563076, "grad_norm": 1.6656315326690674, "learning_rate": 2.627074005794048e-05, "loss": 1.5637, "step": 18030 }, { "epoch": 0.47487490123781934, "grad_norm": 1.4762217998504639, "learning_rate": 2.6269423228864893e-05, "loss": 1.8589, "step": 18031 }, { "epoch": 0.474901237819331, "grad_norm": 1.888584852218628, "learning_rate": 2.6268106399789312e-05, "loss": 2.2992, "step": 18032 }, { "epoch": 0.47492757440084277, "grad_norm": 3.5453085899353027, "learning_rate": 2.6266789570713724e-05, "loss": 1.7032, "step": 18033 }, { "epoch": 0.4749539109823545, "grad_norm": 2.0390994548797607, "learning_rate": 2.6265472741638136e-05, "loss": 1.3441, "step": 18034 }, { "epoch": 0.4749802475638662, "grad_norm": 1.6482908725738525, "learning_rate": 2.626415591256255e-05, "loss": 0.3402, "step": 18035 }, { "epoch": 0.47500658414537794, "grad_norm": 3.6160387992858887, "learning_rate": 2.626283908348696e-05, "loss": 0.8216, "step": 18036 }, { "epoch": 0.47503292072688963, "grad_norm": 1.7986429929733276, "learning_rate": 2.626152225441138e-05, "loss": 1.7053, "step": 18037 }, { "epoch": 0.4750592573084014, "grad_norm": 1.9232220649719238, "learning_rate": 2.6260205425335792e-05, "loss": 2.0929, "step": 18038 }, { "epoch": 0.4750855938899131, "grad_norm": 2.988654851913452, "learning_rate": 2.6258888596260208e-05, "loss": 1.2714, "step": 18039 }, { "epoch": 0.4751119304714248, "grad_norm": 1.9279531240463257, "learning_rate": 2.625757176718462e-05, "loss": 1.3988, "step": 18040 }, { "epoch": 0.47513826705293655, "grad_norm": 3.3459930419921875, "learning_rate": 2.625625493810904e-05, "loss": 1.2646, "step": 18041 }, { "epoch": 0.47516460363444823, "grad_norm": 4.728421688079834, "learning_rate": 
2.625493810903345e-05, "loss": 0.8438, "step": 18042 }, { "epoch": 0.47519094021596, "grad_norm": 2.0103824138641357, "learning_rate": 2.6253621279957863e-05, "loss": 1.8723, "step": 18043 }, { "epoch": 0.47521727679747167, "grad_norm": 2.583697557449341, "learning_rate": 2.6252304450882275e-05, "loss": 2.0797, "step": 18044 }, { "epoch": 0.4752436133789834, "grad_norm": 6.7954559326171875, "learning_rate": 2.6250987621806688e-05, "loss": 2.6605, "step": 18045 }, { "epoch": 0.47526994996049515, "grad_norm": 3.5881361961364746, "learning_rate": 2.6249670792731107e-05, "loss": 2.1259, "step": 18046 }, { "epoch": 0.47529628654200684, "grad_norm": 2.464256525039673, "learning_rate": 2.624835396365552e-05, "loss": 0.9051, "step": 18047 }, { "epoch": 0.4753226231235186, "grad_norm": 7.696624755859375, "learning_rate": 2.624703713457993e-05, "loss": 2.0595, "step": 18048 }, { "epoch": 0.47534895970503027, "grad_norm": 5.125574588775635, "learning_rate": 2.6245720305504347e-05, "loss": 0.432, "step": 18049 }, { "epoch": 0.475375296286542, "grad_norm": 2.2241811752319336, "learning_rate": 2.624440347642876e-05, "loss": 1.2333, "step": 18050 }, { "epoch": 0.4754016328680537, "grad_norm": 2.0132031440734863, "learning_rate": 2.6243086647353178e-05, "loss": 1.7055, "step": 18051 }, { "epoch": 0.47542796944956545, "grad_norm": 1.9278985261917114, "learning_rate": 2.624176981827759e-05, "loss": 1.8635, "step": 18052 }, { "epoch": 0.4754543060310772, "grad_norm": 2.030756711959839, "learning_rate": 2.6240452989202002e-05, "loss": 0.4427, "step": 18053 }, { "epoch": 0.4754806426125889, "grad_norm": 1.6091283559799194, "learning_rate": 2.6239136160126415e-05, "loss": 1.5705, "step": 18054 }, { "epoch": 0.4755069791941006, "grad_norm": 2.5038156509399414, "learning_rate": 2.6237819331050834e-05, "loss": 0.2297, "step": 18055 }, { "epoch": 0.4755333157756123, "grad_norm": 2.3296658992767334, "learning_rate": 2.6236502501975246e-05, "loss": 1.9659, "step": 18056 }, { "epoch": 
0.47555965235712405, "grad_norm": 2.38259220123291, "learning_rate": 2.6235185672899658e-05, "loss": 1.9652, "step": 18057 }, { "epoch": 0.47558598893863574, "grad_norm": 1.7944610118865967, "learning_rate": 2.6233868843824074e-05, "loss": 0.7851, "step": 18058 }, { "epoch": 0.4756123255201475, "grad_norm": 1.4154953956604004, "learning_rate": 2.6232552014748486e-05, "loss": 1.5766, "step": 18059 }, { "epoch": 0.4756386621016592, "grad_norm": 1.9560688734054565, "learning_rate": 2.6231235185672905e-05, "loss": 1.9532, "step": 18060 }, { "epoch": 0.4756649986831709, "grad_norm": 1.9043681621551514, "learning_rate": 2.6229918356597317e-05, "loss": 1.5029, "step": 18061 }, { "epoch": 0.47569133526468266, "grad_norm": 2.039125680923462, "learning_rate": 2.622860152752173e-05, "loss": 1.3208, "step": 18062 }, { "epoch": 0.47571767184619435, "grad_norm": 1.4919660091400146, "learning_rate": 2.622728469844614e-05, "loss": 1.4258, "step": 18063 }, { "epoch": 0.4757440084277061, "grad_norm": 2.184338331222534, "learning_rate": 2.6225967869370554e-05, "loss": 1.5314, "step": 18064 }, { "epoch": 0.4757703450092178, "grad_norm": 3.4698526859283447, "learning_rate": 2.6224651040294973e-05, "loss": 1.8335, "step": 18065 }, { "epoch": 0.4757966815907295, "grad_norm": 2.0690839290618896, "learning_rate": 2.6223334211219385e-05, "loss": 1.3201, "step": 18066 }, { "epoch": 0.47582301817224126, "grad_norm": 1.9289405345916748, "learning_rate": 2.62220173821438e-05, "loss": 1.8343, "step": 18067 }, { "epoch": 0.47584935475375295, "grad_norm": 1.8096168041229248, "learning_rate": 2.6220700553068213e-05, "loss": 0.6406, "step": 18068 }, { "epoch": 0.4758756913352647, "grad_norm": 1.51333749294281, "learning_rate": 2.6219383723992625e-05, "loss": 2.2432, "step": 18069 }, { "epoch": 0.4759020279167764, "grad_norm": 1.675201416015625, "learning_rate": 2.6218066894917044e-05, "loss": 1.9982, "step": 18070 }, { "epoch": 0.4759283644982881, "grad_norm": 3.723294734954834, "learning_rate": 
2.6216750065841456e-05, "loss": 1.6848, "step": 18071 }, { "epoch": 0.47595470107979987, "grad_norm": 1.902472734451294, "learning_rate": 2.6215433236765868e-05, "loss": 1.6854, "step": 18072 }, { "epoch": 0.47598103766131156, "grad_norm": 2.0708296298980713, "learning_rate": 2.621411640769028e-05, "loss": 1.6196, "step": 18073 }, { "epoch": 0.4760073742428233, "grad_norm": 2.0596871376037598, "learning_rate": 2.62127995786147e-05, "loss": 0.9181, "step": 18074 }, { "epoch": 0.476033710824335, "grad_norm": 1.582932949066162, "learning_rate": 2.621148274953911e-05, "loss": 1.9876, "step": 18075 }, { "epoch": 0.47606004740584673, "grad_norm": 2.512171983718872, "learning_rate": 2.6210165920463524e-05, "loss": 2.3788, "step": 18076 }, { "epoch": 0.4760863839873584, "grad_norm": 2.1394431591033936, "learning_rate": 2.620884909138794e-05, "loss": 0.6411, "step": 18077 }, { "epoch": 0.47611272056887016, "grad_norm": 1.6871570348739624, "learning_rate": 2.620753226231235e-05, "loss": 1.7547, "step": 18078 }, { "epoch": 0.4761390571503819, "grad_norm": 3.2810633182525635, "learning_rate": 2.620621543323677e-05, "loss": 1.3224, "step": 18079 }, { "epoch": 0.4761653937318936, "grad_norm": 2.3216912746429443, "learning_rate": 2.6204898604161183e-05, "loss": 0.2762, "step": 18080 }, { "epoch": 0.47619173031340534, "grad_norm": 2.0798046588897705, "learning_rate": 2.6203581775085595e-05, "loss": 1.8598, "step": 18081 }, { "epoch": 0.476218066894917, "grad_norm": 2.1319127082824707, "learning_rate": 2.6202264946010007e-05, "loss": 1.3377, "step": 18082 }, { "epoch": 0.47624440347642877, "grad_norm": 2.078587055206299, "learning_rate": 2.620094811693442e-05, "loss": 1.2751, "step": 18083 }, { "epoch": 0.47627074005794046, "grad_norm": 2.120863914489746, "learning_rate": 2.619963128785884e-05, "loss": 0.5739, "step": 18084 }, { "epoch": 0.4762970766394522, "grad_norm": 1.5160518884658813, "learning_rate": 2.619831445878325e-05, "loss": 1.5942, "step": 18085 }, { "epoch": 
0.47632341322096394, "grad_norm": 2.545600175857544, "learning_rate": 2.6196997629707666e-05, "loss": 1.5817, "step": 18086 }, { "epoch": 0.47634974980247563, "grad_norm": 2.5309865474700928, "learning_rate": 2.619568080063208e-05, "loss": 1.8868, "step": 18087 }, { "epoch": 0.4763760863839874, "grad_norm": 3.169647216796875, "learning_rate": 2.6194363971556497e-05, "loss": 1.0168, "step": 18088 }, { "epoch": 0.47640242296549906, "grad_norm": 1.8885867595672607, "learning_rate": 2.619304714248091e-05, "loss": 1.8344, "step": 18089 }, { "epoch": 0.4764287595470108, "grad_norm": 1.8588078022003174, "learning_rate": 2.6191730313405322e-05, "loss": 1.9987, "step": 18090 }, { "epoch": 0.4764550961285225, "grad_norm": 2.69921875, "learning_rate": 2.6190413484329734e-05, "loss": 1.5042, "step": 18091 }, { "epoch": 0.47648143271003424, "grad_norm": 1.9179502725601196, "learning_rate": 2.6189096655254146e-05, "loss": 2.1202, "step": 18092 }, { "epoch": 0.476507769291546, "grad_norm": 1.971161127090454, "learning_rate": 2.6187779826178565e-05, "loss": 0.6615, "step": 18093 }, { "epoch": 0.47653410587305767, "grad_norm": 4.208078384399414, "learning_rate": 2.6186462997102977e-05, "loss": 1.2328, "step": 18094 }, { "epoch": 0.4765604424545694, "grad_norm": 2.099757432937622, "learning_rate": 2.618514616802739e-05, "loss": 2.2761, "step": 18095 }, { "epoch": 0.4765867790360811, "grad_norm": 1.5804299116134644, "learning_rate": 2.6183829338951805e-05, "loss": 1.5915, "step": 18096 }, { "epoch": 0.47661311561759284, "grad_norm": 2.426229953765869, "learning_rate": 2.6182512509876217e-05, "loss": 0.1391, "step": 18097 }, { "epoch": 0.47663945219910453, "grad_norm": 3.338651418685913, "learning_rate": 2.6181195680800636e-05, "loss": 0.873, "step": 18098 }, { "epoch": 0.4766657887806163, "grad_norm": 3.4374072551727295, "learning_rate": 2.617987885172505e-05, "loss": 1.6158, "step": 18099 }, { "epoch": 0.476692125362128, "grad_norm": 1.865473985671997, "learning_rate": 
2.617856202264946e-05, "loss": 2.1476, "step": 18100 }, { "epoch": 0.4767184619436397, "grad_norm": 5.584752559661865, "learning_rate": 2.6177245193573873e-05, "loss": 0.7848, "step": 18101 }, { "epoch": 0.47674479852515145, "grad_norm": 2.1740102767944336, "learning_rate": 2.6175928364498285e-05, "loss": 1.6618, "step": 18102 }, { "epoch": 0.47677113510666314, "grad_norm": 1.813247561454773, "learning_rate": 2.6174611535422704e-05, "loss": 2.3947, "step": 18103 }, { "epoch": 0.4767974716881749, "grad_norm": 2.389674663543701, "learning_rate": 2.6173294706347116e-05, "loss": 0.7996, "step": 18104 }, { "epoch": 0.4768238082696866, "grad_norm": 2.842863082885742, "learning_rate": 2.6171977877271532e-05, "loss": 1.9356, "step": 18105 }, { "epoch": 0.4768501448511983, "grad_norm": 2.1585025787353516, "learning_rate": 2.6170661048195944e-05, "loss": 2.4486, "step": 18106 }, { "epoch": 0.47687648143271005, "grad_norm": 2.1451637744903564, "learning_rate": 2.6169344219120363e-05, "loss": 2.1151, "step": 18107 }, { "epoch": 0.47690281801422174, "grad_norm": 5.163798809051514, "learning_rate": 2.6168027390044775e-05, "loss": 0.9405, "step": 18108 }, { "epoch": 0.4769291545957335, "grad_norm": 1.7317043542861938, "learning_rate": 2.6166710560969188e-05, "loss": 2.1485, "step": 18109 }, { "epoch": 0.47695549117724517, "grad_norm": 1.8907835483551025, "learning_rate": 2.61653937318936e-05, "loss": 1.6107, "step": 18110 }, { "epoch": 0.4769818277587569, "grad_norm": 3.3649308681488037, "learning_rate": 2.6164076902818012e-05, "loss": 1.0804, "step": 18111 }, { "epoch": 0.47700816434026866, "grad_norm": 4.079046726226807, "learning_rate": 2.616276007374243e-05, "loss": 1.4524, "step": 18112 }, { "epoch": 0.47703450092178035, "grad_norm": 1.972220540046692, "learning_rate": 2.6161443244666843e-05, "loss": 2.1815, "step": 18113 }, { "epoch": 0.4770608375032921, "grad_norm": 1.8180415630340576, "learning_rate": 2.616012641559126e-05, "loss": 2.4495, "step": 18114 }, { "epoch": 
0.4770871740848038, "grad_norm": 1.9662320613861084, "learning_rate": 2.615880958651567e-05, "loss": 1.647, "step": 18115 }, { "epoch": 0.4771135106663155, "grad_norm": 2.095525026321411, "learning_rate": 2.6157492757440083e-05, "loss": 1.5167, "step": 18116 }, { "epoch": 0.4771398472478272, "grad_norm": 2.066540479660034, "learning_rate": 2.6156175928364502e-05, "loss": 1.2558, "step": 18117 }, { "epoch": 0.47716618382933895, "grad_norm": 3.7202439308166504, "learning_rate": 2.6154859099288915e-05, "loss": 1.4575, "step": 18118 }, { "epoch": 0.4771925204108507, "grad_norm": 1.8115872144699097, "learning_rate": 2.6153542270213327e-05, "loss": 2.5709, "step": 18119 }, { "epoch": 0.4772188569923624, "grad_norm": 2.043302536010742, "learning_rate": 2.615222544113774e-05, "loss": 1.1423, "step": 18120 }, { "epoch": 0.4772451935738741, "grad_norm": 2.0807416439056396, "learning_rate": 2.6150908612062158e-05, "loss": 1.6837, "step": 18121 }, { "epoch": 0.4772715301553858, "grad_norm": 3.1593334674835205, "learning_rate": 2.614959178298657e-05, "loss": 1.8308, "step": 18122 }, { "epoch": 0.47729786673689756, "grad_norm": 3.4992871284484863, "learning_rate": 2.6148274953910982e-05, "loss": 2.1369, "step": 18123 }, { "epoch": 0.47732420331840925, "grad_norm": 1.5538591146469116, "learning_rate": 2.6146958124835398e-05, "loss": 2.5318, "step": 18124 }, { "epoch": 0.477350539899921, "grad_norm": 1.703216552734375, "learning_rate": 2.614564129575981e-05, "loss": 2.0921, "step": 18125 }, { "epoch": 0.47737687648143273, "grad_norm": 2.0972647666931152, "learning_rate": 2.614432446668423e-05, "loss": 2.5599, "step": 18126 }, { "epoch": 0.4774032130629444, "grad_norm": 4.844968795776367, "learning_rate": 2.614300763760864e-05, "loss": 1.8993, "step": 18127 }, { "epoch": 0.47742954964445616, "grad_norm": 1.9064234495162964, "learning_rate": 2.6141690808533054e-05, "loss": 1.6282, "step": 18128 }, { "epoch": 0.47745588622596785, "grad_norm": 3.336031198501587, "learning_rate": 
2.6140373979457466e-05, "loss": 1.6488, "step": 18129 }, { "epoch": 0.4774822228074796, "grad_norm": 1.554504632949829, "learning_rate": 2.6139057150381878e-05, "loss": 0.3676, "step": 18130 }, { "epoch": 0.4775085593889913, "grad_norm": 1.4876564741134644, "learning_rate": 2.6137740321306297e-05, "loss": 1.8622, "step": 18131 }, { "epoch": 0.477534895970503, "grad_norm": 3.5113332271575928, "learning_rate": 2.613642349223071e-05, "loss": 1.1884, "step": 18132 }, { "epoch": 0.47756123255201477, "grad_norm": 2.460200786590576, "learning_rate": 2.6135106663155125e-05, "loss": 0.6754, "step": 18133 }, { "epoch": 0.47758756913352646, "grad_norm": 2.1989099979400635, "learning_rate": 2.6133789834079537e-05, "loss": 2.3714, "step": 18134 }, { "epoch": 0.4776139057150382, "grad_norm": 2.667492389678955, "learning_rate": 2.613247300500395e-05, "loss": 0.6263, "step": 18135 }, { "epoch": 0.4776402422965499, "grad_norm": 2.299617052078247, "learning_rate": 2.6131156175928368e-05, "loss": 2.8219, "step": 18136 }, { "epoch": 0.47766657887806163, "grad_norm": 1.612807035446167, "learning_rate": 2.612983934685278e-05, "loss": 0.2344, "step": 18137 }, { "epoch": 0.4776929154595734, "grad_norm": 1.7575221061706543, "learning_rate": 2.6128522517777193e-05, "loss": 1.6634, "step": 18138 }, { "epoch": 0.47771925204108506, "grad_norm": 4.1696457862854, "learning_rate": 2.6127205688701605e-05, "loss": 0.5157, "step": 18139 }, { "epoch": 0.4777455886225968, "grad_norm": 1.7531259059906006, "learning_rate": 2.6125888859626024e-05, "loss": 2.1103, "step": 18140 }, { "epoch": 0.4777719252041085, "grad_norm": 1.9951260089874268, "learning_rate": 2.6124572030550436e-05, "loss": 1.7099, "step": 18141 }, { "epoch": 0.47779826178562024, "grad_norm": 2.5088744163513184, "learning_rate": 2.612325520147485e-05, "loss": 1.277, "step": 18142 }, { "epoch": 0.4778245983671319, "grad_norm": 1.8694902658462524, "learning_rate": 2.6121938372399264e-05, "loss": 1.7016, "step": 18143 }, { "epoch": 
0.47785093494864367, "grad_norm": 2.1027448177337646, "learning_rate": 2.6120621543323676e-05, "loss": 2.3466, "step": 18144 }, { "epoch": 0.4778772715301554, "grad_norm": 5.901076793670654, "learning_rate": 2.6119304714248095e-05, "loss": 1.5969, "step": 18145 }, { "epoch": 0.4779036081116671, "grad_norm": 1.9883469343185425, "learning_rate": 2.6117987885172507e-05, "loss": 1.7854, "step": 18146 }, { "epoch": 0.47792994469317884, "grad_norm": 2.4258086681365967, "learning_rate": 2.611667105609692e-05, "loss": 2.0893, "step": 18147 }, { "epoch": 0.47795628127469053, "grad_norm": 1.5321277379989624, "learning_rate": 2.611535422702133e-05, "loss": 2.0222, "step": 18148 }, { "epoch": 0.4779826178562023, "grad_norm": 2.2107131481170654, "learning_rate": 2.6114037397945744e-05, "loss": 0.276, "step": 18149 }, { "epoch": 0.47800895443771396, "grad_norm": 2.2959635257720947, "learning_rate": 2.6112720568870163e-05, "loss": 2.1733, "step": 18150 }, { "epoch": 0.4780352910192257, "grad_norm": 2.7273948192596436, "learning_rate": 2.6111403739794575e-05, "loss": 2.3528, "step": 18151 }, { "epoch": 0.47806162760073745, "grad_norm": 1.675957441329956, "learning_rate": 2.611008691071899e-05, "loss": 1.7183, "step": 18152 }, { "epoch": 0.47808796418224914, "grad_norm": 2.2190258502960205, "learning_rate": 2.6108770081643403e-05, "loss": 1.826, "step": 18153 }, { "epoch": 0.4781143007637609, "grad_norm": 3.512843370437622, "learning_rate": 2.6107453252567822e-05, "loss": 2.0663, "step": 18154 }, { "epoch": 0.47814063734527257, "grad_norm": 1.5189660787582397, "learning_rate": 2.6106136423492234e-05, "loss": 2.2426, "step": 18155 }, { "epoch": 0.4781669739267843, "grad_norm": 5.404087543487549, "learning_rate": 2.6104819594416646e-05, "loss": 2.1597, "step": 18156 }, { "epoch": 0.478193310508296, "grad_norm": 2.591707229614258, "learning_rate": 2.610350276534106e-05, "loss": 2.4166, "step": 18157 }, { "epoch": 0.47821964708980774, "grad_norm": 4.782519817352295, "learning_rate": 
2.610218593626547e-05, "loss": 1.4625, "step": 18158 }, { "epoch": 0.4782459836713195, "grad_norm": 2.67582106590271, "learning_rate": 2.610086910718989e-05, "loss": 1.7045, "step": 18159 }, { "epoch": 0.4782723202528312, "grad_norm": 1.6655147075653076, "learning_rate": 2.6099552278114302e-05, "loss": 1.421, "step": 18160 }, { "epoch": 0.4782986568343429, "grad_norm": 3.701991081237793, "learning_rate": 2.6098235449038717e-05, "loss": 1.9044, "step": 18161 }, { "epoch": 0.4783249934158546, "grad_norm": 1.59872567653656, "learning_rate": 2.609691861996313e-05, "loss": 2.3976, "step": 18162 }, { "epoch": 0.47835132999736635, "grad_norm": 1.5172570943832397, "learning_rate": 2.6095601790887542e-05, "loss": 1.5677, "step": 18163 }, { "epoch": 0.47837766657887804, "grad_norm": 2.137535333633423, "learning_rate": 2.609428496181196e-05, "loss": 1.8379, "step": 18164 }, { "epoch": 0.4784040031603898, "grad_norm": 5.482143402099609, "learning_rate": 2.6092968132736373e-05, "loss": 1.0263, "step": 18165 }, { "epoch": 0.4784303397419015, "grad_norm": 1.9915279150009155, "learning_rate": 2.6091651303660785e-05, "loss": 1.9052, "step": 18166 }, { "epoch": 0.4784566763234132, "grad_norm": 2.3375186920166016, "learning_rate": 2.6090334474585197e-05, "loss": 0.7127, "step": 18167 }, { "epoch": 0.47848301290492495, "grad_norm": 1.8854601383209229, "learning_rate": 2.608901764550961e-05, "loss": 2.0186, "step": 18168 }, { "epoch": 0.47850934948643664, "grad_norm": 1.6205345392227173, "learning_rate": 2.608770081643403e-05, "loss": 2.5091, "step": 18169 }, { "epoch": 0.4785356860679484, "grad_norm": 1.597391128540039, "learning_rate": 2.608638398735844e-05, "loss": 1.7359, "step": 18170 }, { "epoch": 0.4785620226494601, "grad_norm": 3.3700568675994873, "learning_rate": 2.6085067158282856e-05, "loss": 1.5659, "step": 18171 }, { "epoch": 0.4785883592309718, "grad_norm": 1.645609974861145, "learning_rate": 2.608375032920727e-05, "loss": 1.4701, "step": 18172 }, { "epoch": 
0.47861469581248356, "grad_norm": 2.141824245452881, "learning_rate": 2.6082433500131688e-05, "loss": 2.4564, "step": 18173 }, { "epoch": 0.47864103239399525, "grad_norm": 1.818982481956482, "learning_rate": 2.60811166710561e-05, "loss": 1.02, "step": 18174 }, { "epoch": 0.478667368975507, "grad_norm": 1.7090140581130981, "learning_rate": 2.6079799841980512e-05, "loss": 1.7205, "step": 18175 }, { "epoch": 0.4786937055570187, "grad_norm": 2.0209977626800537, "learning_rate": 2.6078483012904924e-05, "loss": 1.3472, "step": 18176 }, { "epoch": 0.4787200421385304, "grad_norm": 1.909293532371521, "learning_rate": 2.6077166183829337e-05, "loss": 1.2932, "step": 18177 }, { "epoch": 0.47874637872004216, "grad_norm": 2.426377773284912, "learning_rate": 2.6075849354753756e-05, "loss": 1.3127, "step": 18178 }, { "epoch": 0.47877271530155385, "grad_norm": 5.718318939208984, "learning_rate": 2.6074532525678168e-05, "loss": 2.0245, "step": 18179 }, { "epoch": 0.4787990518830656, "grad_norm": 3.2824246883392334, "learning_rate": 2.6073215696602583e-05, "loss": 1.6523, "step": 18180 }, { "epoch": 0.4788253884645773, "grad_norm": 2.88808274269104, "learning_rate": 2.6071898867526996e-05, "loss": 1.4564, "step": 18181 }, { "epoch": 0.478851725046089, "grad_norm": 1.67848539352417, "learning_rate": 2.6070582038451408e-05, "loss": 1.81, "step": 18182 }, { "epoch": 0.4788780616276007, "grad_norm": 6.433162689208984, "learning_rate": 2.6069265209375827e-05, "loss": 1.011, "step": 18183 }, { "epoch": 0.47890439820911246, "grad_norm": 2.4767961502075195, "learning_rate": 2.606794838030024e-05, "loss": 1.3045, "step": 18184 }, { "epoch": 0.4789307347906242, "grad_norm": 5.748504638671875, "learning_rate": 2.606663155122465e-05, "loss": 1.2878, "step": 18185 }, { "epoch": 0.4789570713721359, "grad_norm": 1.5263091325759888, "learning_rate": 2.6065314722149063e-05, "loss": 1.87, "step": 18186 }, { "epoch": 0.47898340795364763, "grad_norm": 1.9353070259094238, "learning_rate": 
2.6063997893073482e-05, "loss": 2.8552, "step": 18187 }, { "epoch": 0.4790097445351593, "grad_norm": 1.4300810098648071, "learning_rate": 2.6062681063997895e-05, "loss": 2.3514, "step": 18188 }, { "epoch": 0.47903608111667106, "grad_norm": 1.5277667045593262, "learning_rate": 2.606136423492231e-05, "loss": 1.7833, "step": 18189 }, { "epoch": 0.47906241769818275, "grad_norm": 1.8952159881591797, "learning_rate": 2.6060047405846722e-05, "loss": 1.7228, "step": 18190 }, { "epoch": 0.4790887542796945, "grad_norm": 1.6774052381515503, "learning_rate": 2.6058730576771135e-05, "loss": 1.7671, "step": 18191 }, { "epoch": 0.47911509086120624, "grad_norm": 2.0082924365997314, "learning_rate": 2.6057413747695554e-05, "loss": 1.6773, "step": 18192 }, { "epoch": 0.4791414274427179, "grad_norm": 4.2807297706604, "learning_rate": 2.6056096918619966e-05, "loss": 0.799, "step": 18193 }, { "epoch": 0.47916776402422967, "grad_norm": 2.550044059753418, "learning_rate": 2.6054780089544378e-05, "loss": 2.0308, "step": 18194 }, { "epoch": 0.47919410060574136, "grad_norm": 1.816925287246704, "learning_rate": 2.605346326046879e-05, "loss": 1.781, "step": 18195 }, { "epoch": 0.4792204371872531, "grad_norm": 1.9360864162445068, "learning_rate": 2.6052146431393202e-05, "loss": 2.1195, "step": 18196 }, { "epoch": 0.4792467737687648, "grad_norm": 2.2189080715179443, "learning_rate": 2.605082960231762e-05, "loss": 1.9188, "step": 18197 }, { "epoch": 0.47927311035027653, "grad_norm": 1.4427847862243652, "learning_rate": 2.6049512773242034e-05, "loss": 1.5472, "step": 18198 }, { "epoch": 0.4792994469317883, "grad_norm": 2.8791027069091797, "learning_rate": 2.604819594416645e-05, "loss": 1.3314, "step": 18199 }, { "epoch": 0.47932578351329996, "grad_norm": 5.540414333343506, "learning_rate": 2.604687911509086e-05, "loss": 2.2421, "step": 18200 }, { "epoch": 0.4793521200948117, "grad_norm": 5.057022571563721, "learning_rate": 2.6045562286015274e-05, "loss": 0.7574, "step": 18201 }, { "epoch": 
0.4793784566763234, "grad_norm": 2.3301544189453125, "learning_rate": 2.6044245456939693e-05, "loss": 1.0735, "step": 18202 }, { "epoch": 0.47940479325783514, "grad_norm": 1.5644989013671875, "learning_rate": 2.6042928627864105e-05, "loss": 1.4331, "step": 18203 }, { "epoch": 0.4794311298393468, "grad_norm": 3.384436845779419, "learning_rate": 2.6041611798788517e-05, "loss": 0.5878, "step": 18204 }, { "epoch": 0.47945746642085857, "grad_norm": 4.111256122589111, "learning_rate": 2.604029496971293e-05, "loss": 2.0897, "step": 18205 }, { "epoch": 0.4794838030023703, "grad_norm": 2.0982885360717773, "learning_rate": 2.6038978140637348e-05, "loss": 1.9615, "step": 18206 }, { "epoch": 0.479510139583882, "grad_norm": 3.7931666374206543, "learning_rate": 2.603766131156176e-05, "loss": 0.831, "step": 18207 }, { "epoch": 0.47953647616539374, "grad_norm": 4.855052947998047, "learning_rate": 2.6036344482486176e-05, "loss": 0.9936, "step": 18208 }, { "epoch": 0.47956281274690543, "grad_norm": 1.6595323085784912, "learning_rate": 2.6035027653410588e-05, "loss": 1.7072, "step": 18209 }, { "epoch": 0.4795891493284172, "grad_norm": 1.57021963596344, "learning_rate": 2.6033710824335e-05, "loss": 1.811, "step": 18210 }, { "epoch": 0.4796154859099289, "grad_norm": 1.3585107326507568, "learning_rate": 2.603239399525942e-05, "loss": 1.5773, "step": 18211 }, { "epoch": 0.4796418224914406, "grad_norm": 1.8488690853118896, "learning_rate": 2.603107716618383e-05, "loss": 1.7188, "step": 18212 }, { "epoch": 0.47966815907295235, "grad_norm": 3.2584662437438965, "learning_rate": 2.6029760337108244e-05, "loss": 1.1955, "step": 18213 }, { "epoch": 0.47969449565446404, "grad_norm": 1.6268631219863892, "learning_rate": 2.6028443508032656e-05, "loss": 1.4492, "step": 18214 }, { "epoch": 0.4797208322359758, "grad_norm": 1.6208415031433105, "learning_rate": 2.602712667895707e-05, "loss": 1.8537, "step": 18215 }, { "epoch": 0.47974716881748747, "grad_norm": 1.8901538848876953, "learning_rate": 
2.6025809849881487e-05, "loss": 2.321, "step": 18216 }, { "epoch": 0.4797735053989992, "grad_norm": 1.631158471107483, "learning_rate": 2.6024493020805903e-05, "loss": 0.445, "step": 18217 }, { "epoch": 0.47979984198051095, "grad_norm": 2.484971523284912, "learning_rate": 2.6023176191730315e-05, "loss": 2.1254, "step": 18218 }, { "epoch": 0.47982617856202264, "grad_norm": 1.7783170938491821, "learning_rate": 2.6021859362654727e-05, "loss": 1.9419, "step": 18219 }, { "epoch": 0.4798525151435344, "grad_norm": 2.196690082550049, "learning_rate": 2.6020542533579146e-05, "loss": 2.1618, "step": 18220 }, { "epoch": 0.4798788517250461, "grad_norm": 1.8459336757659912, "learning_rate": 2.601922570450356e-05, "loss": 1.5345, "step": 18221 }, { "epoch": 0.4799051883065578, "grad_norm": 2.015507936477661, "learning_rate": 2.601790887542797e-05, "loss": 1.4419, "step": 18222 }, { "epoch": 0.4799315248880695, "grad_norm": 1.6010900735855103, "learning_rate": 2.6016592046352383e-05, "loss": 1.935, "step": 18223 }, { "epoch": 0.47995786146958125, "grad_norm": 1.7097499370574951, "learning_rate": 2.6015275217276795e-05, "loss": 1.0837, "step": 18224 }, { "epoch": 0.479984198051093, "grad_norm": 3.9856553077697754, "learning_rate": 2.6013958388201214e-05, "loss": 2.4888, "step": 18225 }, { "epoch": 0.4800105346326047, "grad_norm": 3.3391363620758057, "learning_rate": 2.6012641559125626e-05, "loss": 1.6933, "step": 18226 }, { "epoch": 0.4800368712141164, "grad_norm": 2.165415048599243, "learning_rate": 2.6011324730050042e-05, "loss": 2.046, "step": 18227 }, { "epoch": 0.4800632077956281, "grad_norm": 3.911407232284546, "learning_rate": 2.6010007900974454e-05, "loss": 0.6511, "step": 18228 }, { "epoch": 0.48008954437713985, "grad_norm": 1.8012775182724, "learning_rate": 2.6008691071898866e-05, "loss": 1.3854, "step": 18229 }, { "epoch": 0.48011588095865154, "grad_norm": 2.567828416824341, "learning_rate": 2.6007374242823285e-05, "loss": 0.7743, "step": 18230 }, { "epoch": 
0.4801422175401633, "grad_norm": 2.0181589126586914, "learning_rate": 2.6006057413747697e-05, "loss": 2.3429, "step": 18231 }, { "epoch": 0.48016855412167503, "grad_norm": 1.8341394662857056, "learning_rate": 2.600474058467211e-05, "loss": 1.6094, "step": 18232 }, { "epoch": 0.4801948907031867, "grad_norm": 3.8895981311798096, "learning_rate": 2.6003423755596522e-05, "loss": 1.0459, "step": 18233 }, { "epoch": 0.48022122728469846, "grad_norm": 2.5247411727905273, "learning_rate": 2.600210692652094e-05, "loss": 1.2766, "step": 18234 }, { "epoch": 0.48024756386621015, "grad_norm": 1.8207776546478271, "learning_rate": 2.6000790097445353e-05, "loss": 2.0494, "step": 18235 }, { "epoch": 0.4802739004477219, "grad_norm": 2.1421167850494385, "learning_rate": 2.599947326836977e-05, "loss": 2.0554, "step": 18236 }, { "epoch": 0.4803002370292336, "grad_norm": 1.9610754251480103, "learning_rate": 2.599815643929418e-05, "loss": 1.6792, "step": 18237 }, { "epoch": 0.4803265736107453, "grad_norm": 2.152301073074341, "learning_rate": 2.5996839610218593e-05, "loss": 1.2206, "step": 18238 }, { "epoch": 0.48035291019225707, "grad_norm": 1.690805435180664, "learning_rate": 2.5995522781143012e-05, "loss": 2.1232, "step": 18239 }, { "epoch": 0.48037924677376875, "grad_norm": 1.7222952842712402, "learning_rate": 2.5994205952067424e-05, "loss": 0.591, "step": 18240 }, { "epoch": 0.4804055833552805, "grad_norm": 1.9552398920059204, "learning_rate": 2.5992889122991837e-05, "loss": 1.0758, "step": 18241 }, { "epoch": 0.4804319199367922, "grad_norm": 2.508448839187622, "learning_rate": 2.599157229391625e-05, "loss": 1.3292, "step": 18242 }, { "epoch": 0.48045825651830393, "grad_norm": 2.046084403991699, "learning_rate": 2.5990255464840664e-05, "loss": 1.4425, "step": 18243 }, { "epoch": 0.48048459309981567, "grad_norm": 1.7621982097625732, "learning_rate": 2.598893863576508e-05, "loss": 1.9327, "step": 18244 }, { "epoch": 0.48051092968132736, "grad_norm": 3.080350399017334, "learning_rate": 
2.5987621806689496e-05, "loss": 0.7348, "step": 18245 }, { "epoch": 0.4805372662628391, "grad_norm": 2.543154001235962, "learning_rate": 2.5986304977613908e-05, "loss": 1.0225, "step": 18246 }, { "epoch": 0.4805636028443508, "grad_norm": 2.1923797130584717, "learning_rate": 2.598498814853832e-05, "loss": 2.7072, "step": 18247 }, { "epoch": 0.48058993942586253, "grad_norm": 1.423647403717041, "learning_rate": 2.5983671319462732e-05, "loss": 1.8857, "step": 18248 }, { "epoch": 0.4806162760073742, "grad_norm": 2.073836326599121, "learning_rate": 2.598235449038715e-05, "loss": 1.231, "step": 18249 }, { "epoch": 0.48064261258888596, "grad_norm": 2.9497969150543213, "learning_rate": 2.5981037661311563e-05, "loss": 1.6494, "step": 18250 }, { "epoch": 0.4806689491703977, "grad_norm": 1.5961450338363647, "learning_rate": 2.5979720832235976e-05, "loss": 2.1089, "step": 18251 }, { "epoch": 0.4806952857519094, "grad_norm": 1.5951591730117798, "learning_rate": 2.5978404003160388e-05, "loss": 1.7174, "step": 18252 }, { "epoch": 0.48072162233342114, "grad_norm": 2.378539800643921, "learning_rate": 2.5977087174084807e-05, "loss": 1.7267, "step": 18253 }, { "epoch": 0.4807479589149328, "grad_norm": 2.2286441326141357, "learning_rate": 2.597577034500922e-05, "loss": 1.2339, "step": 18254 }, { "epoch": 0.48077429549644457, "grad_norm": 2.03003191947937, "learning_rate": 2.5974453515933635e-05, "loss": 2.0078, "step": 18255 }, { "epoch": 0.48080063207795626, "grad_norm": 1.9827549457550049, "learning_rate": 2.5973136686858047e-05, "loss": 1.9373, "step": 18256 }, { "epoch": 0.480826968659468, "grad_norm": 3.3949034214019775, "learning_rate": 2.597181985778246e-05, "loss": 2.0879, "step": 18257 }, { "epoch": 0.48085330524097974, "grad_norm": 1.7559983730316162, "learning_rate": 2.5970503028706878e-05, "loss": 1.0735, "step": 18258 }, { "epoch": 0.48087964182249143, "grad_norm": 2.3310892581939697, "learning_rate": 2.596918619963129e-05, "loss": 0.3768, "step": 18259 }, { "epoch": 
0.4809059784040032, "grad_norm": 1.615135908126831, "learning_rate": 2.5967869370555702e-05, "loss": 2.1364, "step": 18260 }, { "epoch": 0.48093231498551486, "grad_norm": 3.7796401977539062, "learning_rate": 2.5966552541480115e-05, "loss": 1.1534, "step": 18261 }, { "epoch": 0.4809586515670266, "grad_norm": 2.0392417907714844, "learning_rate": 2.596523571240453e-05, "loss": 1.9045, "step": 18262 }, { "epoch": 0.4809849881485383, "grad_norm": 6.02974271774292, "learning_rate": 2.5963918883328946e-05, "loss": 0.8431, "step": 18263 }, { "epoch": 0.48101132473005004, "grad_norm": 1.9574013948440552, "learning_rate": 2.596260205425336e-05, "loss": 2.3259, "step": 18264 }, { "epoch": 0.4810376613115618, "grad_norm": 4.27133321762085, "learning_rate": 2.5961285225177774e-05, "loss": 0.8533, "step": 18265 }, { "epoch": 0.48106399789307347, "grad_norm": 2.9611685276031494, "learning_rate": 2.5959968396102186e-05, "loss": 1.4665, "step": 18266 }, { "epoch": 0.4810903344745852, "grad_norm": 2.1282074451446533, "learning_rate": 2.5958651567026605e-05, "loss": 0.51, "step": 18267 }, { "epoch": 0.4811166710560969, "grad_norm": 3.044217109680176, "learning_rate": 2.5957334737951017e-05, "loss": 2.7244, "step": 18268 }, { "epoch": 0.48114300763760864, "grad_norm": 1.874723196029663, "learning_rate": 2.595601790887543e-05, "loss": 1.4811, "step": 18269 }, { "epoch": 0.48116934421912033, "grad_norm": 1.7602438926696777, "learning_rate": 2.595470107979984e-05, "loss": 1.9281, "step": 18270 }, { "epoch": 0.4811956808006321, "grad_norm": 2.15421462059021, "learning_rate": 2.5953384250724254e-05, "loss": 2.1792, "step": 18271 }, { "epoch": 0.4812220173821438, "grad_norm": 3.2429089546203613, "learning_rate": 2.5952067421648673e-05, "loss": 2.1389, "step": 18272 }, { "epoch": 0.4812483539636555, "grad_norm": 1.878605604171753, "learning_rate": 2.5950750592573085e-05, "loss": 1.7448, "step": 18273 }, { "epoch": 0.48127469054516725, "grad_norm": 2.79583477973938, "learning_rate": 
2.59494337634975e-05, "loss": 1.6102, "step": 18274 }, { "epoch": 0.48130102712667894, "grad_norm": 2.2260453701019287, "learning_rate": 2.5948116934421913e-05, "loss": 1.8317, "step": 18275 }, { "epoch": 0.4813273637081907, "grad_norm": 2.351231813430786, "learning_rate": 2.5946800105346325e-05, "loss": 1.7722, "step": 18276 }, { "epoch": 0.4813537002897024, "grad_norm": 2.5532896518707275, "learning_rate": 2.5945483276270744e-05, "loss": 2.4244, "step": 18277 }, { "epoch": 0.4813800368712141, "grad_norm": 4.415773391723633, "learning_rate": 2.5944166447195156e-05, "loss": 1.5416, "step": 18278 }, { "epoch": 0.48140637345272586, "grad_norm": 2.400348663330078, "learning_rate": 2.5942849618119568e-05, "loss": 1.9609, "step": 18279 }, { "epoch": 0.48143271003423754, "grad_norm": 2.8336586952209473, "learning_rate": 2.594153278904398e-05, "loss": 0.6991, "step": 18280 }, { "epoch": 0.4814590466157493, "grad_norm": 1.6775230169296265, "learning_rate": 2.5940215959968396e-05, "loss": 1.4768, "step": 18281 }, { "epoch": 0.481485383197261, "grad_norm": 1.9266937971115112, "learning_rate": 2.593889913089281e-05, "loss": 1.7842, "step": 18282 }, { "epoch": 0.4815117197787727, "grad_norm": 2.667290210723877, "learning_rate": 2.5937582301817227e-05, "loss": 1.1571, "step": 18283 }, { "epoch": 0.48153805636028446, "grad_norm": 1.645289421081543, "learning_rate": 2.593626547274164e-05, "loss": 1.5977, "step": 18284 }, { "epoch": 0.48156439294179615, "grad_norm": 3.0950942039489746, "learning_rate": 2.593494864366605e-05, "loss": 2.1776, "step": 18285 }, { "epoch": 0.4815907295233079, "grad_norm": 2.822568893432617, "learning_rate": 2.593363181459047e-05, "loss": 1.7076, "step": 18286 }, { "epoch": 0.4816170661048196, "grad_norm": 1.8029379844665527, "learning_rate": 2.5932314985514883e-05, "loss": 1.6493, "step": 18287 }, { "epoch": 0.4816434026863313, "grad_norm": 1.8903818130493164, "learning_rate": 2.5930998156439295e-05, "loss": 1.4257, "step": 18288 }, { "epoch": 
0.481669739267843, "grad_norm": 1.7043659687042236, "learning_rate": 2.5929681327363707e-05, "loss": 1.8823, "step": 18289 }, { "epoch": 0.48169607584935475, "grad_norm": 2.1602940559387207, "learning_rate": 2.5928364498288123e-05, "loss": 1.569, "step": 18290 }, { "epoch": 0.4817224124308665, "grad_norm": 1.5170824527740479, "learning_rate": 2.592704766921254e-05, "loss": 2.5523, "step": 18291 }, { "epoch": 0.4817487490123782, "grad_norm": 1.7479232549667358, "learning_rate": 2.5925730840136954e-05, "loss": 2.0913, "step": 18292 }, { "epoch": 0.48177508559388993, "grad_norm": 2.471151351928711, "learning_rate": 2.5924414011061366e-05, "loss": 1.2299, "step": 18293 }, { "epoch": 0.4818014221754016, "grad_norm": 2.0613322257995605, "learning_rate": 2.592309718198578e-05, "loss": 0.9024, "step": 18294 }, { "epoch": 0.48182775875691336, "grad_norm": 2.1863367557525635, "learning_rate": 2.592178035291019e-05, "loss": 1.3717, "step": 18295 }, { "epoch": 0.48185409533842505, "grad_norm": 1.8875648975372314, "learning_rate": 2.592046352383461e-05, "loss": 1.5818, "step": 18296 }, { "epoch": 0.4818804319199368, "grad_norm": 3.3052406311035156, "learning_rate": 2.5919146694759022e-05, "loss": 1.8997, "step": 18297 }, { "epoch": 0.48190676850144853, "grad_norm": 2.7625679969787598, "learning_rate": 2.5917829865683434e-05, "loss": 1.493, "step": 18298 }, { "epoch": 0.4819331050829602, "grad_norm": 2.534208297729492, "learning_rate": 2.5916513036607846e-05, "loss": 1.599, "step": 18299 }, { "epoch": 0.48195944166447197, "grad_norm": 4.247109413146973, "learning_rate": 2.5915196207532265e-05, "loss": 1.4251, "step": 18300 }, { "epoch": 0.48198577824598365, "grad_norm": 2.1316025257110596, "learning_rate": 2.5913879378456678e-05, "loss": 2.5669, "step": 18301 }, { "epoch": 0.4820121148274954, "grad_norm": 3.536114454269409, "learning_rate": 2.5912562549381093e-05, "loss": 0.5308, "step": 18302 }, { "epoch": 0.4820384514090071, "grad_norm": 2.2415454387664795, "learning_rate": 
2.5911245720305505e-05, "loss": 1.7294, "step": 18303 }, { "epoch": 0.48206478799051883, "grad_norm": 1.5645477771759033, "learning_rate": 2.5909928891229918e-05, "loss": 1.7197, "step": 18304 }, { "epoch": 0.48209112457203057, "grad_norm": 1.7059056758880615, "learning_rate": 2.5908612062154337e-05, "loss": 0.9997, "step": 18305 }, { "epoch": 0.48211746115354226, "grad_norm": 3.156730890274048, "learning_rate": 2.590729523307875e-05, "loss": 0.336, "step": 18306 }, { "epoch": 0.482143797735054, "grad_norm": 1.3372992277145386, "learning_rate": 2.590597840400316e-05, "loss": 1.411, "step": 18307 }, { "epoch": 0.4821701343165657, "grad_norm": 2.8801941871643066, "learning_rate": 2.5904661574927573e-05, "loss": 1.6704, "step": 18308 }, { "epoch": 0.48219647089807743, "grad_norm": 1.8397315740585327, "learning_rate": 2.590334474585199e-05, "loss": 2.1636, "step": 18309 }, { "epoch": 0.4822228074795891, "grad_norm": 2.7086033821105957, "learning_rate": 2.5902027916776404e-05, "loss": 0.6801, "step": 18310 }, { "epoch": 0.48224914406110087, "grad_norm": 4.560272693634033, "learning_rate": 2.590071108770082e-05, "loss": 0.6993, "step": 18311 }, { "epoch": 0.4822754806426126, "grad_norm": 1.794438362121582, "learning_rate": 2.5899394258625232e-05, "loss": 2.3285, "step": 18312 }, { "epoch": 0.4823018172241243, "grad_norm": 1.6634902954101562, "learning_rate": 2.5898077429549644e-05, "loss": 1.7613, "step": 18313 }, { "epoch": 0.48232815380563604, "grad_norm": 2.5338780879974365, "learning_rate": 2.5896760600474057e-05, "loss": 0.7794, "step": 18314 }, { "epoch": 0.4823544903871477, "grad_norm": 1.5597007274627686, "learning_rate": 2.5895443771398476e-05, "loss": 1.9812, "step": 18315 }, { "epoch": 0.48238082696865947, "grad_norm": 1.5586351156234741, "learning_rate": 2.5894126942322888e-05, "loss": 2.5375, "step": 18316 }, { "epoch": 0.4824071635501712, "grad_norm": 2.1696550846099854, "learning_rate": 2.58928101132473e-05, "loss": 1.6872, "step": 18317 }, { "epoch": 
0.4824335001316829, "grad_norm": 2.1278188228607178, "learning_rate": 2.5891493284171716e-05, "loss": 1.9642, "step": 18318 }, { "epoch": 0.48245983671319465, "grad_norm": 1.742224097251892, "learning_rate": 2.589017645509613e-05, "loss": 0.7188, "step": 18319 }, { "epoch": 0.48248617329470633, "grad_norm": 4.575339317321777, "learning_rate": 2.5888859626020547e-05, "loss": 1.0558, "step": 18320 }, { "epoch": 0.4825125098762181, "grad_norm": 5.727512359619141, "learning_rate": 2.588754279694496e-05, "loss": 1.3297, "step": 18321 }, { "epoch": 0.48253884645772976, "grad_norm": 1.7224305868148804, "learning_rate": 2.588622596786937e-05, "loss": 1.606, "step": 18322 }, { "epoch": 0.4825651830392415, "grad_norm": 1.7859320640563965, "learning_rate": 2.5884909138793783e-05, "loss": 1.5833, "step": 18323 }, { "epoch": 0.48259151962075325, "grad_norm": 1.3457127809524536, "learning_rate": 2.5883592309718202e-05, "loss": 1.9783, "step": 18324 }, { "epoch": 0.48261785620226494, "grad_norm": 1.807705044746399, "learning_rate": 2.5882275480642615e-05, "loss": 1.6852, "step": 18325 }, { "epoch": 0.4826441927837767, "grad_norm": 3.553330421447754, "learning_rate": 2.5880958651567027e-05, "loss": 1.1183, "step": 18326 }, { "epoch": 0.48267052936528837, "grad_norm": 3.3810200691223145, "learning_rate": 2.587964182249144e-05, "loss": 2.8145, "step": 18327 }, { "epoch": 0.4826968659468001, "grad_norm": 2.4965574741363525, "learning_rate": 2.5878324993415855e-05, "loss": 1.7972, "step": 18328 }, { "epoch": 0.4827232025283118, "grad_norm": 2.483815908432007, "learning_rate": 2.587700816434027e-05, "loss": 0.8307, "step": 18329 }, { "epoch": 0.48274953910982354, "grad_norm": 2.7984135150909424, "learning_rate": 2.5875691335264686e-05, "loss": 2.2607, "step": 18330 }, { "epoch": 0.4827758756913353, "grad_norm": 1.6728575229644775, "learning_rate": 2.5874374506189098e-05, "loss": 1.8287, "step": 18331 }, { "epoch": 0.482802212272847, "grad_norm": 2.076826810836792, "learning_rate": 
2.587305767711351e-05, "loss": 1.2793, "step": 18332 }, { "epoch": 0.4828285488543587, "grad_norm": 2.7556533813476562, "learning_rate": 2.587174084803793e-05, "loss": 2.1633, "step": 18333 }, { "epoch": 0.4828548854358704, "grad_norm": 2.11775803565979, "learning_rate": 2.587042401896234e-05, "loss": 1.8455, "step": 18334 }, { "epoch": 0.48288122201738215, "grad_norm": 3.899864435195923, "learning_rate": 2.5869107189886754e-05, "loss": 1.2825, "step": 18335 }, { "epoch": 0.48290755859889384, "grad_norm": 2.0632147789001465, "learning_rate": 2.5867790360811166e-05, "loss": 2.3838, "step": 18336 }, { "epoch": 0.4829338951804056, "grad_norm": 1.4783732891082764, "learning_rate": 2.586647353173558e-05, "loss": 0.6448, "step": 18337 }, { "epoch": 0.4829602317619173, "grad_norm": 3.414386510848999, "learning_rate": 2.5865156702659997e-05, "loss": 1.6303, "step": 18338 }, { "epoch": 0.482986568343429, "grad_norm": 2.6229095458984375, "learning_rate": 2.5863839873584413e-05, "loss": 1.1264, "step": 18339 }, { "epoch": 0.48301290492494076, "grad_norm": 1.6566879749298096, "learning_rate": 2.5862523044508825e-05, "loss": 1.7613, "step": 18340 }, { "epoch": 0.48303924150645244, "grad_norm": 2.410510778427124, "learning_rate": 2.5861206215433237e-05, "loss": 0.9519, "step": 18341 }, { "epoch": 0.4830655780879642, "grad_norm": 1.8161232471466064, "learning_rate": 2.585988938635765e-05, "loss": 1.752, "step": 18342 }, { "epoch": 0.4830919146694759, "grad_norm": 3.2136988639831543, "learning_rate": 2.5858572557282068e-05, "loss": 1.9701, "step": 18343 }, { "epoch": 0.4831182512509876, "grad_norm": 1.500807523727417, "learning_rate": 2.585725572820648e-05, "loss": 1.8475, "step": 18344 }, { "epoch": 0.48314458783249936, "grad_norm": 2.430117607116699, "learning_rate": 2.5855938899130893e-05, "loss": 2.1502, "step": 18345 }, { "epoch": 0.48317092441401105, "grad_norm": 1.8485294580459595, "learning_rate": 2.5854622070055305e-05, "loss": 2.0114, "step": 18346 }, { "epoch": 
0.4831972609955228, "grad_norm": 3.348314046859741, "learning_rate": 2.585330524097972e-05, "loss": 1.9875, "step": 18347 }, { "epoch": 0.4832235975770345, "grad_norm": 1.8929311037063599, "learning_rate": 2.585198841190414e-05, "loss": 1.7272, "step": 18348 }, { "epoch": 0.4832499341585462, "grad_norm": 1.764998197555542, "learning_rate": 2.585067158282855e-05, "loss": 1.7428, "step": 18349 }, { "epoch": 0.48327627074005797, "grad_norm": 4.091507911682129, "learning_rate": 2.5849354753752964e-05, "loss": 2.518, "step": 18350 }, { "epoch": 0.48330260732156965, "grad_norm": 3.2935714721679688, "learning_rate": 2.5848037924677376e-05, "loss": 1.1943, "step": 18351 }, { "epoch": 0.4833289439030814, "grad_norm": 2.3311710357666016, "learning_rate": 2.5846721095601795e-05, "loss": 0.9638, "step": 18352 }, { "epoch": 0.4833552804845931, "grad_norm": 1.7031148672103882, "learning_rate": 2.5845404266526207e-05, "loss": 2.1818, "step": 18353 }, { "epoch": 0.48338161706610483, "grad_norm": 3.7465553283691406, "learning_rate": 2.584408743745062e-05, "loss": 1.2991, "step": 18354 }, { "epoch": 0.4834079536476165, "grad_norm": 3.8707756996154785, "learning_rate": 2.5842770608375032e-05, "loss": 2.0623, "step": 18355 }, { "epoch": 0.48343429022912826, "grad_norm": 1.9105716943740845, "learning_rate": 2.5841453779299447e-05, "loss": 1.3937, "step": 18356 }, { "epoch": 0.48346062681064, "grad_norm": 2.868964672088623, "learning_rate": 2.5840136950223863e-05, "loss": 1.844, "step": 18357 }, { "epoch": 0.4834869633921517, "grad_norm": 2.0943338871002197, "learning_rate": 2.583882012114828e-05, "loss": 1.582, "step": 18358 }, { "epoch": 0.48351329997366344, "grad_norm": 3.172908067703247, "learning_rate": 2.583750329207269e-05, "loss": 1.6974, "step": 18359 }, { "epoch": 0.4835396365551751, "grad_norm": 2.3387062549591064, "learning_rate": 2.5836186462997103e-05, "loss": 1.1011, "step": 18360 }, { "epoch": 0.48356597313668687, "grad_norm": 3.5115859508514404, "learning_rate": 
2.5834869633921515e-05, "loss": 1.3581, "step": 18361 }, { "epoch": 0.48359230971819855, "grad_norm": 2.8621480464935303, "learning_rate": 2.5833552804845934e-05, "loss": 1.1026, "step": 18362 }, { "epoch": 0.4836186462997103, "grad_norm": 2.5172183513641357, "learning_rate": 2.5832235975770346e-05, "loss": 1.8828, "step": 18363 }, { "epoch": 0.48364498288122204, "grad_norm": 2.953972816467285, "learning_rate": 2.583091914669476e-05, "loss": 0.8589, "step": 18364 }, { "epoch": 0.48367131946273373, "grad_norm": 1.8448625802993774, "learning_rate": 2.5829602317619174e-05, "loss": 1.9318, "step": 18365 }, { "epoch": 0.48369765604424547, "grad_norm": 1.8998172283172607, "learning_rate": 2.582828548854359e-05, "loss": 1.8291, "step": 18366 }, { "epoch": 0.48372399262575716, "grad_norm": 1.906227469444275, "learning_rate": 2.5826968659468005e-05, "loss": 1.3863, "step": 18367 }, { "epoch": 0.4837503292072689, "grad_norm": 1.814586877822876, "learning_rate": 2.5825651830392418e-05, "loss": 1.9128, "step": 18368 }, { "epoch": 0.4837766657887806, "grad_norm": 2.9641284942626953, "learning_rate": 2.582433500131683e-05, "loss": 1.5397, "step": 18369 }, { "epoch": 0.48380300237029233, "grad_norm": 4.049939155578613, "learning_rate": 2.5823018172241242e-05, "loss": 1.5262, "step": 18370 }, { "epoch": 0.4838293389518041, "grad_norm": 1.870164155960083, "learning_rate": 2.582170134316566e-05, "loss": 0.9732, "step": 18371 }, { "epoch": 0.48385567553331577, "grad_norm": 3.756333351135254, "learning_rate": 2.5820384514090073e-05, "loss": 1.1043, "step": 18372 }, { "epoch": 0.4838820121148275, "grad_norm": 2.0509378910064697, "learning_rate": 2.5819067685014485e-05, "loss": 1.9517, "step": 18373 }, { "epoch": 0.4839083486963392, "grad_norm": 5.011322975158691, "learning_rate": 2.5817750855938898e-05, "loss": 1.9828, "step": 18374 }, { "epoch": 0.48393468527785094, "grad_norm": 2.274775743484497, "learning_rate": 2.5816434026863313e-05, "loss": 2.181, "step": 18375 }, { "epoch": 
0.48396102185936263, "grad_norm": 3.4376394748687744, "learning_rate": 2.581511719778773e-05, "loss": 2.1538, "step": 18376 }, { "epoch": 0.48398735844087437, "grad_norm": 1.577967643737793, "learning_rate": 2.5813800368712144e-05, "loss": 2.4193, "step": 18377 }, { "epoch": 0.4840136950223861, "grad_norm": 2.75225830078125, "learning_rate": 2.5812483539636557e-05, "loss": 1.4493, "step": 18378 }, { "epoch": 0.4840400316038978, "grad_norm": 1.5984172821044922, "learning_rate": 2.581116671056097e-05, "loss": 1.8152, "step": 18379 }, { "epoch": 0.48406636818540955, "grad_norm": 2.0202548503875732, "learning_rate": 2.580984988148538e-05, "loss": 1.726, "step": 18380 }, { "epoch": 0.48409270476692123, "grad_norm": 1.554030418395996, "learning_rate": 2.58085330524098e-05, "loss": 1.9233, "step": 18381 }, { "epoch": 0.484119041348433, "grad_norm": 1.8082032203674316, "learning_rate": 2.5807216223334212e-05, "loss": 1.015, "step": 18382 }, { "epoch": 0.4841453779299447, "grad_norm": 1.779860258102417, "learning_rate": 2.5805899394258624e-05, "loss": 0.5784, "step": 18383 }, { "epoch": 0.4841717145114564, "grad_norm": 2.2123851776123047, "learning_rate": 2.580458256518304e-05, "loss": 1.8172, "step": 18384 }, { "epoch": 0.48419805109296815, "grad_norm": 1.8945305347442627, "learning_rate": 2.5803265736107456e-05, "loss": 1.8525, "step": 18385 }, { "epoch": 0.48422438767447984, "grad_norm": 2.622844934463501, "learning_rate": 2.580194890703187e-05, "loss": 0.7779, "step": 18386 }, { "epoch": 0.4842507242559916, "grad_norm": 2.0889954566955566, "learning_rate": 2.5800632077956283e-05, "loss": 1.5266, "step": 18387 }, { "epoch": 0.48427706083750327, "grad_norm": 2.3298048973083496, "learning_rate": 2.5799315248880696e-05, "loss": 2.2032, "step": 18388 }, { "epoch": 0.484303397419015, "grad_norm": 1.5841577053070068, "learning_rate": 2.5797998419805108e-05, "loss": 1.8769, "step": 18389 }, { "epoch": 0.48432973400052676, "grad_norm": 3.13581919670105, "learning_rate": 
2.5796681590729527e-05, "loss": 1.656, "step": 18390 }, { "epoch": 0.48435607058203844, "grad_norm": 2.65350341796875, "learning_rate": 2.579536476165394e-05, "loss": 0.3908, "step": 18391 }, { "epoch": 0.4843824071635502, "grad_norm": 3.9427220821380615, "learning_rate": 2.579404793257835e-05, "loss": 0.5817, "step": 18392 }, { "epoch": 0.4844087437450619, "grad_norm": 2.438901424407959, "learning_rate": 2.5792731103502767e-05, "loss": 2.3633, "step": 18393 }, { "epoch": 0.4844350803265736, "grad_norm": 2.4363667964935303, "learning_rate": 2.579141427442718e-05, "loss": 2.085, "step": 18394 }, { "epoch": 0.4844614169080853, "grad_norm": 1.9361634254455566, "learning_rate": 2.5790097445351598e-05, "loss": 2.1063, "step": 18395 }, { "epoch": 0.48448775348959705, "grad_norm": 2.282162666320801, "learning_rate": 2.578878061627601e-05, "loss": 0.898, "step": 18396 }, { "epoch": 0.4845140900711088, "grad_norm": 1.8887823820114136, "learning_rate": 2.5787463787200422e-05, "loss": 1.9128, "step": 18397 }, { "epoch": 0.4845404266526205, "grad_norm": 2.7833309173583984, "learning_rate": 2.5786146958124835e-05, "loss": 1.4563, "step": 18398 }, { "epoch": 0.4845667632341322, "grad_norm": 1.7120144367218018, "learning_rate": 2.5784830129049254e-05, "loss": 1.7172, "step": 18399 }, { "epoch": 0.4845930998156439, "grad_norm": 2.7394728660583496, "learning_rate": 2.5783513299973666e-05, "loss": 1.1691, "step": 18400 }, { "epoch": 0.48461943639715566, "grad_norm": 2.6390151977539062, "learning_rate": 2.5782196470898078e-05, "loss": 0.4177, "step": 18401 }, { "epoch": 0.48464577297866734, "grad_norm": 1.8348826169967651, "learning_rate": 2.578087964182249e-05, "loss": 3.2067, "step": 18402 }, { "epoch": 0.4846721095601791, "grad_norm": 2.5606539249420166, "learning_rate": 2.5779562812746906e-05, "loss": 2.3002, "step": 18403 }, { "epoch": 0.48469844614169083, "grad_norm": 2.3585174083709717, "learning_rate": 2.577824598367132e-05, "loss": 1.7013, "step": 18404 }, { "epoch": 
0.4847247827232025, "grad_norm": 1.8755167722702026, "learning_rate": 2.5776929154595737e-05, "loss": 1.9228, "step": 18405 }, { "epoch": 0.48475111930471426, "grad_norm": 1.3895341157913208, "learning_rate": 2.577561232552015e-05, "loss": 1.8269, "step": 18406 }, { "epoch": 0.48477745588622595, "grad_norm": 1.6985033750534058, "learning_rate": 2.577429549644456e-05, "loss": 1.8735, "step": 18407 }, { "epoch": 0.4848037924677377, "grad_norm": 3.8967151641845703, "learning_rate": 2.5772978667368974e-05, "loss": 0.7406, "step": 18408 }, { "epoch": 0.4848301290492494, "grad_norm": 4.177510738372803, "learning_rate": 2.5771661838293393e-05, "loss": 1.4695, "step": 18409 }, { "epoch": 0.4848564656307611, "grad_norm": 1.816548228263855, "learning_rate": 2.5770345009217805e-05, "loss": 1.8405, "step": 18410 }, { "epoch": 0.48488280221227287, "grad_norm": 2.2118592262268066, "learning_rate": 2.5769028180142217e-05, "loss": 1.6375, "step": 18411 }, { "epoch": 0.48490913879378456, "grad_norm": 1.968703269958496, "learning_rate": 2.5767711351066633e-05, "loss": 1.5512, "step": 18412 }, { "epoch": 0.4849354753752963, "grad_norm": 2.9055683612823486, "learning_rate": 2.5766394521991045e-05, "loss": 1.5991, "step": 18413 }, { "epoch": 0.484961811956808, "grad_norm": 2.03057861328125, "learning_rate": 2.5765077692915464e-05, "loss": 0.9919, "step": 18414 }, { "epoch": 0.48498814853831973, "grad_norm": 1.6863213777542114, "learning_rate": 2.5763760863839876e-05, "loss": 1.9363, "step": 18415 }, { "epoch": 0.4850144851198315, "grad_norm": 2.704174041748047, "learning_rate": 2.5762444034764288e-05, "loss": 1.5539, "step": 18416 }, { "epoch": 0.48504082170134316, "grad_norm": 6.385892391204834, "learning_rate": 2.57611272056887e-05, "loss": 1.2474, "step": 18417 }, { "epoch": 0.4850671582828549, "grad_norm": 1.6319364309310913, "learning_rate": 2.575981037661312e-05, "loss": 2.1538, "step": 18418 }, { "epoch": 0.4850934948643666, "grad_norm": 2.219048261642456, "learning_rate": 
2.5758493547537532e-05, "loss": 1.7055, "step": 18419 }, { "epoch": 0.48511983144587834, "grad_norm": 1.4454690217971802, "learning_rate": 2.5757176718461944e-05, "loss": 2.0454, "step": 18420 }, { "epoch": 0.48514616802739, "grad_norm": 1.5924925804138184, "learning_rate": 2.575585988938636e-05, "loss": 1.7619, "step": 18421 }, { "epoch": 0.48517250460890177, "grad_norm": 1.780044436454773, "learning_rate": 2.5754543060310772e-05, "loss": 1.9857, "step": 18422 }, { "epoch": 0.4851988411904135, "grad_norm": 2.2101314067840576, "learning_rate": 2.575322623123519e-05, "loss": 1.9862, "step": 18423 }, { "epoch": 0.4852251777719252, "grad_norm": 2.3404316902160645, "learning_rate": 2.5751909402159603e-05, "loss": 1.8558, "step": 18424 }, { "epoch": 0.48525151435343694, "grad_norm": 2.050753355026245, "learning_rate": 2.5750592573084015e-05, "loss": 1.8381, "step": 18425 }, { "epoch": 0.48527785093494863, "grad_norm": 3.0578525066375732, "learning_rate": 2.5749275744008427e-05, "loss": 0.5958, "step": 18426 }, { "epoch": 0.4853041875164604, "grad_norm": 1.8695067167282104, "learning_rate": 2.574795891493284e-05, "loss": 1.622, "step": 18427 }, { "epoch": 0.48533052409797206, "grad_norm": 2.1240391731262207, "learning_rate": 2.574664208585726e-05, "loss": 1.2508, "step": 18428 }, { "epoch": 0.4853568606794838, "grad_norm": 2.0256903171539307, "learning_rate": 2.574532525678167e-05, "loss": 1.4946, "step": 18429 }, { "epoch": 0.48538319726099555, "grad_norm": 2.7355430126190186, "learning_rate": 2.5744008427706083e-05, "loss": 2.0422, "step": 18430 }, { "epoch": 0.48540953384250723, "grad_norm": 1.7283477783203125, "learning_rate": 2.57426915986305e-05, "loss": 1.4768, "step": 18431 }, { "epoch": 0.485435870424019, "grad_norm": 3.126765727996826, "learning_rate": 2.5741374769554914e-05, "loss": 1.2515, "step": 18432 }, { "epoch": 0.48546220700553067, "grad_norm": 3.3315203189849854, "learning_rate": 2.574005794047933e-05, "loss": 2.1508, "step": 18433 }, { "epoch": 
0.4854885435870424, "grad_norm": 1.8240033388137817, "learning_rate": 2.5738741111403742e-05, "loss": 1.3215, "step": 18434 }, { "epoch": 0.4855148801685541, "grad_norm": 1.2926433086395264, "learning_rate": 2.5737424282328154e-05, "loss": 1.5452, "step": 18435 }, { "epoch": 0.48554121675006584, "grad_norm": 3.809260845184326, "learning_rate": 2.5736107453252566e-05, "loss": 1.4537, "step": 18436 }, { "epoch": 0.4855675533315776, "grad_norm": 2.7715842723846436, "learning_rate": 2.5734790624176985e-05, "loss": 0.8447, "step": 18437 }, { "epoch": 0.48559388991308927, "grad_norm": 5.419256687164307, "learning_rate": 2.5733473795101398e-05, "loss": 2.0753, "step": 18438 }, { "epoch": 0.485620226494601, "grad_norm": 1.581094741821289, "learning_rate": 2.573215696602581e-05, "loss": 1.4794, "step": 18439 }, { "epoch": 0.4856465630761127, "grad_norm": 2.4569525718688965, "learning_rate": 2.5730840136950225e-05, "loss": 1.0419, "step": 18440 }, { "epoch": 0.48567289965762445, "grad_norm": 2.8132851123809814, "learning_rate": 2.5729523307874638e-05, "loss": 1.2611, "step": 18441 }, { "epoch": 0.48569923623913613, "grad_norm": 3.084397315979004, "learning_rate": 2.5728206478799057e-05, "loss": 2.0227, "step": 18442 }, { "epoch": 0.4857255728206479, "grad_norm": 3.458346128463745, "learning_rate": 2.572688964972347e-05, "loss": 2.8425, "step": 18443 }, { "epoch": 0.4857519094021596, "grad_norm": 2.111849069595337, "learning_rate": 2.572557282064788e-05, "loss": 1.9156, "step": 18444 }, { "epoch": 0.4857782459836713, "grad_norm": 1.8623846769332886, "learning_rate": 2.5724255991572293e-05, "loss": 2.0972, "step": 18445 }, { "epoch": 0.48580458256518305, "grad_norm": 2.8974735736846924, "learning_rate": 2.5722939162496705e-05, "loss": 1.4019, "step": 18446 }, { "epoch": 0.48583091914669474, "grad_norm": 1.8519647121429443, "learning_rate": 2.5721622333421124e-05, "loss": 2.0956, "step": 18447 }, { "epoch": 0.4858572557282065, "grad_norm": 3.604668617248535, "learning_rate": 
2.5720305504345537e-05, "loss": 0.9024, "step": 18448 }, { "epoch": 0.4858835923097182, "grad_norm": 2.3852856159210205, "learning_rate": 2.571898867526995e-05, "loss": 1.6787, "step": 18449 }, { "epoch": 0.4859099288912299, "grad_norm": 1.5685434341430664, "learning_rate": 2.5717671846194364e-05, "loss": 1.896, "step": 18450 }, { "epoch": 0.48593626547274166, "grad_norm": 3.2516274452209473, "learning_rate": 2.571635501711878e-05, "loss": 1.9251, "step": 18451 }, { "epoch": 0.48596260205425335, "grad_norm": 1.8900718688964844, "learning_rate": 2.5715038188043196e-05, "loss": 1.4476, "step": 18452 }, { "epoch": 0.4859889386357651, "grad_norm": 3.0719430446624756, "learning_rate": 2.5713721358967608e-05, "loss": 1.9875, "step": 18453 }, { "epoch": 0.4860152752172768, "grad_norm": 2.0585649013519287, "learning_rate": 2.571240452989202e-05, "loss": 0.7618, "step": 18454 }, { "epoch": 0.4860416117987885, "grad_norm": 1.8559067249298096, "learning_rate": 2.5711087700816432e-05, "loss": 1.5609, "step": 18455 }, { "epoch": 0.48606794838030026, "grad_norm": 2.169935941696167, "learning_rate": 2.570977087174085e-05, "loss": 2.1225, "step": 18456 }, { "epoch": 0.48609428496181195, "grad_norm": 2.295045852661133, "learning_rate": 2.5708454042665263e-05, "loss": 1.6724, "step": 18457 }, { "epoch": 0.4861206215433237, "grad_norm": 1.7074811458587646, "learning_rate": 2.5707137213589676e-05, "loss": 1.6424, "step": 18458 }, { "epoch": 0.4861469581248354, "grad_norm": 4.542028903961182, "learning_rate": 2.570582038451409e-05, "loss": 1.4893, "step": 18459 }, { "epoch": 0.4861732947063471, "grad_norm": 1.9196478128433228, "learning_rate": 2.5704503555438503e-05, "loss": 1.4574, "step": 18460 }, { "epoch": 0.4861996312878588, "grad_norm": 2.9205195903778076, "learning_rate": 2.5703186726362922e-05, "loss": 1.4391, "step": 18461 }, { "epoch": 0.48622596786937056, "grad_norm": 1.8574875593185425, "learning_rate": 2.5701869897287335e-05, "loss": 2.1417, "step": 18462 }, { "epoch": 
0.4862523044508823, "grad_norm": 2.81107234954834, "learning_rate": 2.5700553068211747e-05, "loss": 2.5147, "step": 18463 }, { "epoch": 0.486278641032394, "grad_norm": 2.6533210277557373, "learning_rate": 2.569923623913616e-05, "loss": 1.7436, "step": 18464 }, { "epoch": 0.48630497761390573, "grad_norm": 4.165168762207031, "learning_rate": 2.5697919410060578e-05, "loss": 2.0054, "step": 18465 }, { "epoch": 0.4863313141954174, "grad_norm": 1.9256856441497803, "learning_rate": 2.569660258098499e-05, "loss": 1.9637, "step": 18466 }, { "epoch": 0.48635765077692916, "grad_norm": 1.7320513725280762, "learning_rate": 2.5695285751909402e-05, "loss": 1.8098, "step": 18467 }, { "epoch": 0.48638398735844085, "grad_norm": 2.1129705905914307, "learning_rate": 2.5693968922833818e-05, "loss": 1.1088, "step": 18468 }, { "epoch": 0.4864103239399526, "grad_norm": 5.155686378479004, "learning_rate": 2.569265209375823e-05, "loss": 1.3488, "step": 18469 }, { "epoch": 0.48643666052146434, "grad_norm": 3.6876680850982666, "learning_rate": 2.569133526468265e-05, "loss": 1.6031, "step": 18470 }, { "epoch": 0.486462997102976, "grad_norm": 1.8359041213989258, "learning_rate": 2.569001843560706e-05, "loss": 1.4716, "step": 18471 }, { "epoch": 0.48648933368448777, "grad_norm": 2.9589924812316895, "learning_rate": 2.5688701606531474e-05, "loss": 0.6421, "step": 18472 }, { "epoch": 0.48651567026599946, "grad_norm": 1.6885493993759155, "learning_rate": 2.5687384777455886e-05, "loss": 2.1624, "step": 18473 }, { "epoch": 0.4865420068475112, "grad_norm": 3.8228204250335693, "learning_rate": 2.5686067948380298e-05, "loss": 1.9418, "step": 18474 }, { "epoch": 0.4865683434290229, "grad_norm": 3.8486132621765137, "learning_rate": 2.5684751119304717e-05, "loss": 1.9678, "step": 18475 }, { "epoch": 0.48659468001053463, "grad_norm": 2.923513412475586, "learning_rate": 2.568343429022913e-05, "loss": 1.6008, "step": 18476 }, { "epoch": 0.4866210165920464, "grad_norm": 2.0346717834472656, "learning_rate": 
2.568211746115354e-05, "loss": 1.6052, "step": 18477 }, { "epoch": 0.48664735317355806, "grad_norm": 4.474893569946289, "learning_rate": 2.5680800632077957e-05, "loss": 1.6256, "step": 18478 }, { "epoch": 0.4866736897550698, "grad_norm": 4.485257625579834, "learning_rate": 2.567948380300237e-05, "loss": 1.468, "step": 18479 }, { "epoch": 0.4867000263365815, "grad_norm": 2.5774567127227783, "learning_rate": 2.5678166973926788e-05, "loss": 1.6469, "step": 18480 }, { "epoch": 0.48672636291809324, "grad_norm": 2.047030448913574, "learning_rate": 2.56768501448512e-05, "loss": 2.1457, "step": 18481 }, { "epoch": 0.4867526994996049, "grad_norm": 1.8974688053131104, "learning_rate": 2.5675533315775613e-05, "loss": 2.2784, "step": 18482 }, { "epoch": 0.48677903608111667, "grad_norm": 4.346259593963623, "learning_rate": 2.5674216486700025e-05, "loss": 1.9088, "step": 18483 }, { "epoch": 0.4868053726626284, "grad_norm": 1.8092743158340454, "learning_rate": 2.5672899657624444e-05, "loss": 1.8927, "step": 18484 }, { "epoch": 0.4868317092441401, "grad_norm": 3.205528736114502, "learning_rate": 2.5671582828548856e-05, "loss": 1.3568, "step": 18485 }, { "epoch": 0.48685804582565184, "grad_norm": 1.6814615726470947, "learning_rate": 2.567026599947327e-05, "loss": 2.2728, "step": 18486 }, { "epoch": 0.48688438240716353, "grad_norm": 5.064326763153076, "learning_rate": 2.5668949170397684e-05, "loss": 1.3509, "step": 18487 }, { "epoch": 0.4869107189886753, "grad_norm": 3.4415476322174072, "learning_rate": 2.5667632341322096e-05, "loss": 2.8161, "step": 18488 }, { "epoch": 0.486937055570187, "grad_norm": 2.522833824157715, "learning_rate": 2.5666315512246515e-05, "loss": 1.0294, "step": 18489 }, { "epoch": 0.4869633921516987, "grad_norm": 2.1585633754730225, "learning_rate": 2.5664998683170927e-05, "loss": 1.3564, "step": 18490 }, { "epoch": 0.48698972873321045, "grad_norm": 2.394378900527954, "learning_rate": 2.566368185409534e-05, "loss": 2.4195, "step": 18491 }, { "epoch": 
0.48701606531472214, "grad_norm": 2.5911078453063965, "learning_rate": 2.5662365025019752e-05, "loss": 1.2326, "step": 18492 }, { "epoch": 0.4870424018962339, "grad_norm": 1.7802774906158447, "learning_rate": 2.5661048195944164e-05, "loss": 0.6364, "step": 18493 }, { "epoch": 0.48706873847774557, "grad_norm": 2.638225555419922, "learning_rate": 2.5659731366868583e-05, "loss": 1.918, "step": 18494 }, { "epoch": 0.4870950750592573, "grad_norm": 1.846629023551941, "learning_rate": 2.5658414537792995e-05, "loss": 1.646, "step": 18495 }, { "epoch": 0.48712141164076905, "grad_norm": 2.197716236114502, "learning_rate": 2.565709770871741e-05, "loss": 1.7315, "step": 18496 }, { "epoch": 0.48714774822228074, "grad_norm": 1.91517174243927, "learning_rate": 2.5655780879641823e-05, "loss": 1.6118, "step": 18497 }, { "epoch": 0.4871740848037925, "grad_norm": 4.605914115905762, "learning_rate": 2.5654464050566242e-05, "loss": 1.6682, "step": 18498 }, { "epoch": 0.48720042138530417, "grad_norm": 2.0153045654296875, "learning_rate": 2.5653147221490654e-05, "loss": 1.3794, "step": 18499 }, { "epoch": 0.4872267579668159, "grad_norm": 1.849891185760498, "learning_rate": 2.5651830392415066e-05, "loss": 1.4523, "step": 18500 } ], "logging_steps": 1, "max_steps": 37970, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.6567889688508826e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }