diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7506750675067507, + "epoch": 0.9998199819981998, "eval_steps": 695, - "global_step": 2085, + "global_step": 2777, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -14634,6 +14634,4850 @@ "eval_samples_per_second": 45.959, "eval_steps_per_second": 11.495, "step": 2085 + }, + { + "epoch": 0.7510351035103511, + "grad_norm": 0.671138346195221, + "learning_rate": 3.0046836669235433e-05, + "loss": 4.6252, + "step": 2086 + }, + { + "epoch": 0.7513951395139514, + "grad_norm": 0.9334102869033813, + "learning_rate": 2.996455867635155e-05, + "loss": 4.5999, + "step": 2087 + }, + { + "epoch": 0.7517551755175518, + "grad_norm": 0.8894442319869995, + "learning_rate": 2.988237363301758e-05, + "loss": 5.0383, + "step": 2088 + }, + { + "epoch": 0.7521152115211521, + "grad_norm": 0.6898563504219055, + "learning_rate": 2.9800281648307794e-05, + "loss": 4.8665, + "step": 2089 + }, + { + "epoch": 0.7524752475247525, + "grad_norm": 0.6982426047325134, + "learning_rate": 2.9718282831172883e-05, + "loss": 4.8052, + "step": 2090 + }, + { + "epoch": 0.7528352835283528, + "grad_norm": 0.6563032865524292, + "learning_rate": 2.9636377290439944e-05, + "loss": 4.4563, + "step": 2091 + }, + { + "epoch": 0.7531953195319532, + "grad_norm": 0.747215986251831, + "learning_rate": 2.9554565134812294e-05, + "loss": 5.0272, + "step": 2092 + }, + { + "epoch": 0.7535553555355535, + "grad_norm": 0.842694878578186, + "learning_rate": 2.9472846472869298e-05, + "loss": 5.1278, + "step": 2093 + }, + { + "epoch": 0.7539153915391539, + "grad_norm": 1.041561484336853, + "learning_rate": 2.9391221413066182e-05, + "loss": 5.2336, + "step": 2094 + }, + { + "epoch": 0.7542754275427542, + "grad_norm": 0.7632551193237305, + "learning_rate": 2.930969006373402e-05, + "loss": 4.7186, + "step": 2095 + }, + { + "epoch": 0.7546354635463547, + "grad_norm": 1.0100700855255127, + "learning_rate": 2.922825253307947e-05, + "loss": 5.0076, + "step": 2096 + }, + { + "epoch": 0.754995499549955, + "grad_norm": 0.5503267049789429, + "learning_rate": 2.9146908929184713e-05, + "loss": 4.6652, + "step": 2097 + }, + { + "epoch": 0.7553555355535554, + "grad_norm": 0.7919767498970032, + "learning_rate": 2.9065659360007247e-05, + "loss": 5.4393, + "step": 2098 + }, + { + "epoch": 0.7557155715571557, + "grad_norm": 1.0583096742630005, + "learning_rate": 2.898450393337977e-05, + "loss": 5.2044, + "step": 2099 + }, + { + "epoch": 0.7560756075607561, + "grad_norm": 1.1439197063446045, + "learning_rate": 2.8903442757010035e-05, + "loss": 5.2308, + "step": 2100 + }, + { + "epoch": 0.7564356435643564, + "grad_norm": 1.3537611961364746, + "learning_rate": 2.8822475938480764e-05, + "loss": 4.7218, + "step": 2101 + }, + { + "epoch": 0.7567956795679568, + "grad_norm": 0.6820851564407349, + "learning_rate": 2.874160358524931e-05, + "loss": 4.7861, + "step": 2102 + }, + { + "epoch": 0.7571557155715571, + "grad_norm": 1.5382546186447144, + "learning_rate": 2.8660825804647795e-05, + "loss": 4.6253, + "step": 2103 + }, + { + "epoch": 0.7575157515751575, + "grad_norm": 0.9953200221061707, + "learning_rate": 2.8580142703882796e-05, + "loss": 4.5608, + "step": 2104 + }, + { + "epoch": 0.7578757875787578, + "grad_norm": 0.7481911778450012, + "learning_rate": 2.8499554390035143e-05, + "loss": 4.9828, + "step": 2105 + }, + { + "epoch": 0.7582358235823582, + "grad_norm": 0.8973466753959656, + "learning_rate": 2.8419060970059974e-05, + "loss": 4.4942, + "step": 2106 + }, + { + "epoch": 0.7585958595859585, + "grad_norm": 0.6188588738441467, + "learning_rate": 2.8338662550786443e-05, + "loss": 4.917, + "step": 2107 + }, + { + "epoch": 0.758955895589559, + "grad_norm": 0.8116443157196045, + "learning_rate": 2.8258359238917665e-05, + "loss": 4.8898, + "step": 2108 + }, + { + "epoch": 0.7593159315931594, + "grad_norm": 0.996688187122345, + "learning_rate": 2.8178151141030406e-05, + "loss": 4.6581, + "step": 2109 + }, + { + "epoch": 0.7596759675967597, + "grad_norm": 1.0279045104980469, + "learning_rate": 2.8098038363575186e-05, + "loss": 4.8797, + "step": 2110 + }, + { + "epoch": 0.76003600360036, + "grad_norm": 0.7636517286300659, + "learning_rate": 2.8018021012875994e-05, + "loss": 4.4845, + "step": 2111 + }, + { + "epoch": 0.7603960396039604, + "grad_norm": 0.8215141892433167, + "learning_rate": 2.7938099195130153e-05, + "loss": 4.7628, + "step": 2112 + }, + { + "epoch": 0.7607560756075608, + "grad_norm": 1.067277431488037, + "learning_rate": 2.7858273016408197e-05, + "loss": 5.2078, + "step": 2113 + }, + { + "epoch": 0.7611161116111611, + "grad_norm": 0.7776371240615845, + "learning_rate": 2.7778542582653744e-05, + "loss": 5.1417, + "step": 2114 + }, + { + "epoch": 0.7614761476147615, + "grad_norm": 0.8139586448669434, + "learning_rate": 2.769890799968332e-05, + "loss": 4.8309, + "step": 2115 + }, + { + "epoch": 0.7618361836183618, + "grad_norm": 0.7961241006851196, + "learning_rate": 2.7619369373186288e-05, + "loss": 4.9364, + "step": 2116 + }, + { + "epoch": 0.7621962196219622, + "grad_norm": 0.6579238176345825, + "learning_rate": 2.753992680872457e-05, + "loss": 4.8429, + "step": 2117 + }, + { + "epoch": 0.7625562556255625, + "grad_norm": 0.6686379909515381, + "learning_rate": 2.746058041173266e-05, + "loss": 4.8558, + "step": 2118 + }, + { + "epoch": 0.762916291629163, + "grad_norm": 0.6807066202163696, + "learning_rate": 2.7381330287517426e-05, + "loss": 4.4459, + "step": 2119 + }, + { + "epoch": 0.7632763276327633, + "grad_norm": 0.6293361783027649, + "learning_rate": 2.7302176541257986e-05, + "loss": 4.898, + "step": 2120 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.8723804950714111, + "learning_rate": 2.7223119278005438e-05, + "loss": 4.6484, + "step": 2121 + }, + { + "epoch": 0.763996399639964, + "grad_norm": 0.848070502281189, + "learning_rate": 2.7144158602682924e-05, + "loss": 5.2841, + "step": 2122 + }, + { + "epoch": 0.7643564356435644, + "grad_norm": 0.9075490832328796, + "learning_rate": 2.7065294620085424e-05, + "loss": 5.2137, + "step": 2123 + }, + { + "epoch": 0.7647164716471647, + "grad_norm": 1.0051218271255493, + "learning_rate": 2.6986527434879472e-05, + "loss": 5.1463, + "step": 2124 + }, + { + "epoch": 0.7650765076507651, + "grad_norm": 1.058005928993225, + "learning_rate": 2.6907857151603234e-05, + "loss": 5.2255, + "step": 2125 + }, + { + "epoch": 0.7654365436543654, + "grad_norm": 1.0294654369354248, + "learning_rate": 2.6829283874666233e-05, + "loss": 4.6688, + "step": 2126 + }, + { + "epoch": 0.7657965796579658, + "grad_norm": 0.8858208656311035, + "learning_rate": 2.6750807708349267e-05, + "loss": 4.6658, + "step": 2127 + }, + { + "epoch": 0.7661566156615661, + "grad_norm": 0.8013286590576172, + "learning_rate": 2.6672428756804225e-05, + "loss": 4.849, + "step": 2128 + }, + { + "epoch": 0.7665166516651665, + "grad_norm": 0.7389269471168518, + "learning_rate": 2.659414712405398e-05, + "loss": 4.4728, + "step": 2129 + }, + { + "epoch": 0.7668766876687669, + "grad_norm": 0.6583431363105774, + "learning_rate": 2.6515962913992275e-05, + "loss": 4.9133, + "step": 2130 + }, + { + "epoch": 0.7672367236723673, + "grad_norm": 0.6830927133560181, + "learning_rate": 2.643787623038354e-05, + "loss": 4.7863, + "step": 2131 + }, + { + "epoch": 0.7675967596759676, + "grad_norm": 0.6234518885612488, + "learning_rate": 2.6359887176862718e-05, + "loss": 4.6174, + "step": 2132 + }, + { + "epoch": 0.767956795679568, + "grad_norm": 0.7106865048408508, + "learning_rate": 2.6281995856935237e-05, + "loss": 4.7574, + "step": 2133 + }, + { + "epoch": 0.7683168316831683, + "grad_norm": 0.9051821231842041, + "learning_rate": 2.6204202373976818e-05, + "loss": 4.8563, + "step": 2134 + }, + { + "epoch": 0.7686768676867687, + "grad_norm": 0.6196146011352539, + "learning_rate": 2.6126506831233344e-05, + "loss": 4.5112, + "step": 2135 + }, + { + "epoch": 0.769036903690369, + "grad_norm": 0.6337679624557495, + "learning_rate": 2.6048909331820636e-05, + "loss": 4.6024, + "step": 2136 + }, + { + "epoch": 0.7693969396939694, + "grad_norm": 0.6263675689697266, + "learning_rate": 2.5971409978724458e-05, + "loss": 4.6405, + "step": 2137 + }, + { + "epoch": 0.7697569756975697, + "grad_norm": 1.0419749021530151, + "learning_rate": 2.5894008874800325e-05, + "loss": 4.7905, + "step": 2138 + }, + { + "epoch": 0.7701170117011701, + "grad_norm": 0.8129563927650452, + "learning_rate": 2.581670612277335e-05, + "loss": 4.4431, + "step": 2139 + }, + { + "epoch": 0.7704770477047704, + "grad_norm": 0.7132530212402344, + "learning_rate": 2.5739501825238053e-05, + "loss": 4.7698, + "step": 2140 + }, + { + "epoch": 0.7708370837083708, + "grad_norm": 0.6291266083717346, + "learning_rate": 2.566239608465838e-05, + "loss": 4.5729, + "step": 2141 + }, + { + "epoch": 0.7711971197119712, + "grad_norm": 0.5569373965263367, + "learning_rate": 2.558538900336741e-05, + "loss": 4.9298, + "step": 2142 + }, + { + "epoch": 0.7715571557155716, + "grad_norm": 0.72600257396698, + "learning_rate": 2.5508480683567315e-05, + "loss": 4.8837, + "step": 2143 + }, + { + "epoch": 0.771917191719172, + "grad_norm": 0.7014186978340149, + "learning_rate": 2.543167122732918e-05, + "loss": 4.6547, + "step": 2144 + }, + { + "epoch": 0.7722772277227723, + "grad_norm": 0.8487231731414795, + "learning_rate": 2.5354960736592883e-05, + "loss": 4.6923, + "step": 2145 + }, + { + "epoch": 0.7726372637263726, + "grad_norm": 0.6868178844451904, + "learning_rate": 2.5278349313166992e-05, + "loss": 4.6032, + "step": 2146 + }, + { + "epoch": 0.772997299729973, + "grad_norm": 0.739762544631958, + "learning_rate": 2.5201837058728505e-05, + "loss": 4.8253, + "step": 2147 + }, + { + "epoch": 0.7733573357335733, + "grad_norm": 1.2171458005905151, + "learning_rate": 2.512542407482289e-05, + "loss": 5.0895, + "step": 2148 + }, + { + "epoch": 0.7737173717371737, + "grad_norm": 1.3037205934524536, + "learning_rate": 2.504911046286382e-05, + "loss": 4.9572, + "step": 2149 + }, + { + "epoch": 0.774077407740774, + "grad_norm": 1.3282265663146973, + "learning_rate": 2.4972896324133144e-05, + "loss": 5.4213, + "step": 2150 + }, + { + "epoch": 0.7744374437443744, + "grad_norm": 1.392838716506958, + "learning_rate": 2.4896781759780585e-05, + "loss": 4.7398, + "step": 2151 + }, + { + "epoch": 0.7747974797479747, + "grad_norm": 0.6054560542106628, + "learning_rate": 2.4820766870823807e-05, + "loss": 4.7054, + "step": 2152 + }, + { + "epoch": 0.7751575157515752, + "grad_norm": 0.6699235439300537, + "learning_rate": 2.4744851758148156e-05, + "loss": 4.6727, + "step": 2153 + }, + { + "epoch": 0.7755175517551756, + "grad_norm": 0.5790958404541016, + "learning_rate": 2.4669036522506584e-05, + "loss": 4.8101, + "step": 2154 + }, + { + "epoch": 0.7758775877587759, + "grad_norm": 0.8083108067512512, + "learning_rate": 2.45933212645194e-05, + "loss": 4.5601, + "step": 2155 + }, + { + "epoch": 0.7762376237623763, + "grad_norm": 1.0053972005844116, + "learning_rate": 2.451770608467432e-05, + "loss": 4.6501, + "step": 2156 + }, + { + "epoch": 0.7765976597659766, + "grad_norm": 0.6478819847106934, + "learning_rate": 2.4442191083326195e-05, + "loss": 4.6527, + "step": 2157 + }, + { + "epoch": 0.776957695769577, + "grad_norm": 1.433661937713623, + "learning_rate": 2.4366776360696942e-05, + "loss": 5.0813, + "step": 2158 + }, + { + "epoch": 0.7773177317731773, + "grad_norm": 1.1521214246749878, + "learning_rate": 2.429146201687538e-05, + "loss": 4.5685, + "step": 2159 + }, + { + "epoch": 0.7776777677767777, + "grad_norm": 0.6991515159606934, + "learning_rate": 2.42162481518171e-05, + "loss": 4.7082, + "step": 2160 + }, + { + "epoch": 0.778037803780378, + "grad_norm": 0.8507077693939209, + "learning_rate": 2.414113486534434e-05, + "loss": 4.6635, + "step": 2161 + }, + { + "epoch": 0.7783978397839784, + "grad_norm": 0.6432574987411499, + "learning_rate": 2.4066122257145894e-05, + "loss": 4.5532, + "step": 2162 + }, + { + "epoch": 0.7787578757875787, + "grad_norm": 0.6845300793647766, + "learning_rate": 2.3991210426776855e-05, + "loss": 4.6837, + "step": 2163 + }, + { + "epoch": 0.7791179117911791, + "grad_norm": 0.743084728717804, + "learning_rate": 2.3916399473658623e-05, + "loss": 4.7629, + "step": 2164 + }, + { + "epoch": 0.7794779477947795, + "grad_norm": 0.7912582159042358, + "learning_rate": 2.3841689497078746e-05, + "loss": 4.8711, + "step": 2165 + }, + { + "epoch": 0.7798379837983799, + "grad_norm": 0.6151897311210632, + "learning_rate": 2.376708059619065e-05, + "loss": 4.5838, + "step": 2166 + }, + { + "epoch": 0.7801980198019802, + "grad_norm": 0.8373451828956604, + "learning_rate": 2.3692572870013718e-05, + "loss": 4.9176, + "step": 2167 + }, + { + "epoch": 0.7805580558055806, + "grad_norm": 0.5105034708976746, + "learning_rate": 2.361816641743303e-05, + "loss": 4.5368, + "step": 2168 + }, + { + "epoch": 0.7809180918091809, + "grad_norm": 0.6912795305252075, + "learning_rate": 2.354386133719927e-05, + "loss": 4.3363, + "step": 2169 + }, + { + "epoch": 0.7812781278127813, + "grad_norm": 0.8666832447052002, + "learning_rate": 2.3469657727928506e-05, + "loss": 4.5923, + "step": 2170 + }, + { + "epoch": 0.7816381638163816, + "grad_norm": 0.8544076085090637, + "learning_rate": 2.339555568810221e-05, + "loss": 4.9953, + "step": 2171 + }, + { + "epoch": 0.781998199819982, + "grad_norm": 1.0247130393981934, + "learning_rate": 2.3321555316067045e-05, + "loss": 5.4613, + "step": 2172 + }, + { + "epoch": 0.7823582358235823, + "grad_norm": 1.0253169536590576, + "learning_rate": 2.3247656710034737e-05, + "loss": 5.1228, + "step": 2173 + }, + { + "epoch": 0.7827182718271827, + "grad_norm": 1.0105392932891846, + "learning_rate": 2.3173859968081944e-05, + "loss": 4.7797, + "step": 2174 + }, + { + "epoch": 0.783078307830783, + "grad_norm": 0.8575915098190308, + "learning_rate": 2.3100165188150125e-05, + "loss": 5.0751, + "step": 2175 + }, + { + "epoch": 0.7834383438343835, + "grad_norm": 1.8292618989944458, + "learning_rate": 2.3026572468045437e-05, + "loss": 4.8729, + "step": 2176 + }, + { + "epoch": 0.7837983798379838, + "grad_norm": 0.7877120971679688, + "learning_rate": 2.295308190543859e-05, + "loss": 4.3804, + "step": 2177 + }, + { + "epoch": 0.7841584158415842, + "grad_norm": 2.0647926330566406, + "learning_rate": 2.287969359786466e-05, + "loss": 4.7416, + "step": 2178 + }, + { + "epoch": 0.7845184518451845, + "grad_norm": 1.065865159034729, + "learning_rate": 2.280640764272306e-05, + "loss": 4.7786, + "step": 2179 + }, + { + "epoch": 0.7848784878487849, + "grad_norm": 0.7052769064903259, + "learning_rate": 2.2733224137277366e-05, + "loss": 4.9935, + "step": 2180 + }, + { + "epoch": 0.7852385238523852, + "grad_norm": 0.8767135739326477, + "learning_rate": 2.266014317865519e-05, + "loss": 4.7496, + "step": 2181 + }, + { + "epoch": 0.7855985598559856, + "grad_norm": 0.5927574634552002, + "learning_rate": 2.2587164863847975e-05, + "loss": 5.0468, + "step": 2182 + }, + { + "epoch": 0.7859585958595859, + "grad_norm": 1.1241964101791382, + "learning_rate": 2.251428928971102e-05, + "loss": 4.837, + "step": 2183 + }, + { + "epoch": 0.7863186318631863, + "grad_norm": 0.5535566806793213, + "learning_rate": 2.244151655296327e-05, + "loss": 4.5108, + "step": 2184 + }, + { + "epoch": 0.7866786678667866, + "grad_norm": 0.8191620111465454, + "learning_rate": 2.236884675018709e-05, + "loss": 4.9146, + "step": 2185 + }, + { + "epoch": 0.787038703870387, + "grad_norm": 0.7227510809898376, + "learning_rate": 2.2296279977828337e-05, + "loss": 4.4775, + "step": 2186 + }, + { + "epoch": 0.7873987398739875, + "grad_norm": 0.5830511450767517, + "learning_rate": 2.222381633219608e-05, + "loss": 4.604, + "step": 2187 + }, + { + "epoch": 0.7877587758775878, + "grad_norm": 0.6540189981460571, + "learning_rate": 2.2151455909462538e-05, + "loss": 4.5976, + "step": 2188 + }, + { + "epoch": 0.7881188118811882, + "grad_norm": 0.6274272203445435, + "learning_rate": 2.2079198805662914e-05, + "loss": 4.5981, + "step": 2189 + }, + { + "epoch": 0.7884788478847885, + "grad_norm": 0.9411934018135071, + "learning_rate": 2.2007045116695313e-05, + "loss": 4.6338, + "step": 2190 + }, + { + "epoch": 0.7888388838883889, + "grad_norm": 0.640446662902832, + "learning_rate": 2.1934994938320584e-05, + "loss": 4.5651, + "step": 2191 + }, + { + "epoch": 0.7891989198919892, + "grad_norm": 0.6163681149482727, + "learning_rate": 2.1863048366162208e-05, + "loss": 4.4887, + "step": 2192 + }, + { + "epoch": 0.7895589558955896, + "grad_norm": 0.6504824757575989, + "learning_rate": 2.179120549570609e-05, + "loss": 4.84, + "step": 2193 + }, + { + "epoch": 0.7899189918991899, + "grad_norm": 0.6089352965354919, + "learning_rate": 2.1719466422300607e-05, + "loss": 4.8128, + "step": 2194 + }, + { + "epoch": 0.7902790279027903, + "grad_norm": 0.8295852541923523, + "learning_rate": 2.1647831241156302e-05, + "loss": 5.124, + "step": 2195 + }, + { + "epoch": 0.7906390639063906, + "grad_norm": 0.6523507237434387, + "learning_rate": 2.1576300047345932e-05, + "loss": 4.6459, + "step": 2196 + }, + { + "epoch": 0.790999099909991, + "grad_norm": 0.6584704518318176, + "learning_rate": 2.15048729358041e-05, + "loss": 4.6353, + "step": 2197 + }, + { + "epoch": 0.7913591359135913, + "grad_norm": 0.9848087430000305, + "learning_rate": 2.1433550001327373e-05, + "loss": 5.323, + "step": 2198 + }, + { + "epoch": 0.7917191719171918, + "grad_norm": 0.8073936104774475, + "learning_rate": 2.136233133857405e-05, + "loss": 5.1241, + "step": 2199 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 1.2630630731582642, + "learning_rate": 2.129121704206405e-05, + "loss": 5.2807, + "step": 2200 + }, + { + "epoch": 0.7924392439243925, + "grad_norm": 1.061435580253601, + "learning_rate": 2.1220207206178688e-05, + "loss": 4.6137, + "step": 2201 + }, + { + "epoch": 0.7927992799279928, + "grad_norm": 0.7881227731704712, + "learning_rate": 2.114930192516076e-05, + "loss": 4.5962, + "step": 2202 + }, + { + "epoch": 0.7931593159315932, + "grad_norm": 0.7256401777267456, + "learning_rate": 2.107850129311426e-05, + "loss": 4.8192, + "step": 2203 + }, + { + "epoch": 0.7935193519351935, + "grad_norm": 0.9993703365325928, + "learning_rate": 2.1007805404004242e-05, + "loss": 5.1882, + "step": 2204 + }, + { + "epoch": 0.7938793879387939, + "grad_norm": 0.714908242225647, + "learning_rate": 2.09372143516568e-05, + "loss": 4.6632, + "step": 2205 + }, + { + "epoch": 0.7942394239423942, + "grad_norm": 0.9289857745170593, + "learning_rate": 2.0866728229758857e-05, + "loss": 4.8051, + "step": 2206 + }, + { + "epoch": 0.7945994599459946, + "grad_norm": 1.0063886642456055, + "learning_rate": 2.0796347131858186e-05, + "loss": 4.8422, + "step": 2207 + }, + { + "epoch": 0.7949594959495949, + "grad_norm": 0.7214351296424866, + "learning_rate": 2.072607115136298e-05, + "loss": 4.639, + "step": 2208 + }, + { + "epoch": 0.7953195319531953, + "grad_norm": 0.7175842523574829, + "learning_rate": 2.065590038154209e-05, + "loss": 5.0236, + "step": 2209 + }, + { + "epoch": 0.7956795679567957, + "grad_norm": 0.5457351207733154, + "learning_rate": 2.058583491552465e-05, + "loss": 4.5711, + "step": 2210 + }, + { + "epoch": 0.7960396039603961, + "grad_norm": 0.6841213703155518, + "learning_rate": 2.0515874846300077e-05, + "loss": 4.6929, + "step": 2211 + }, + { + "epoch": 0.7963996399639964, + "grad_norm": 0.8968706727027893, + "learning_rate": 2.044602026671786e-05, + "loss": 4.7611, + "step": 2212 + }, + { + "epoch": 0.7967596759675968, + "grad_norm": 0.6250977516174316, + "learning_rate": 2.0376271269487514e-05, + "loss": 4.3924, + "step": 2213 + }, + { + "epoch": 0.7971197119711971, + "grad_norm": 0.6848214864730835, + "learning_rate": 2.0306627947178446e-05, + "loss": 4.5518, + "step": 2214 + }, + { + "epoch": 0.7974797479747975, + "grad_norm": 0.5987040996551514, + "learning_rate": 2.0237090392219805e-05, + "loss": 4.9023, + "step": 2215 + }, + { + "epoch": 0.7978397839783978, + "grad_norm": 0.6589849591255188, + "learning_rate": 2.0167658696900317e-05, + "loss": 4.42, + "step": 2216 + }, + { + "epoch": 0.7981998199819982, + "grad_norm": 0.7822523713111877, + "learning_rate": 2.0098332953368272e-05, + "loss": 4.8455, + "step": 2217 + }, + { + "epoch": 0.7985598559855985, + "grad_norm": 0.8310371041297913, + "learning_rate": 2.0029113253631314e-05, + "loss": 5.1098, + "step": 2218 + }, + { + "epoch": 0.7989198919891989, + "grad_norm": 0.8362820148468018, + "learning_rate": 1.995999968955641e-05, + "loss": 4.5659, + "step": 2219 + }, + { + "epoch": 0.7992799279927992, + "grad_norm": 0.7935851812362671, + "learning_rate": 1.9890992352869543e-05, + "loss": 4.5946, + "step": 2220 + }, + { + "epoch": 0.7996399639963997, + "grad_norm": 0.6581932902336121, + "learning_rate": 1.9822091335155812e-05, + "loss": 5.0322, + "step": 2221 + }, + { + "epoch": 0.8, + "grad_norm": 0.6121277213096619, + "learning_rate": 1.9753296727859195e-05, + "loss": 4.5539, + "step": 2222 + }, + { + "epoch": 0.8003600360036004, + "grad_norm": 0.7371995449066162, + "learning_rate": 1.9684608622282417e-05, + "loss": 5.2824, + "step": 2223 + }, + { + "epoch": 0.8007200720072007, + "grad_norm": 0.825187087059021, + "learning_rate": 1.9616027109586887e-05, + "loss": 5.0987, + "step": 2224 + }, + { + "epoch": 0.8010801080108011, + "grad_norm": 1.2908596992492676, + "learning_rate": 1.9547552280792524e-05, + "loss": 5.2177, + "step": 2225 + }, + { + "epoch": 0.8014401440144014, + "grad_norm": 2.2421438694000244, + "learning_rate": 1.947918422677769e-05, + "loss": 4.7303, + "step": 2226 + }, + { + "epoch": 0.8018001800180018, + "grad_norm": 0.5044226050376892, + "learning_rate": 1.941092303827896e-05, + "loss": 4.6811, + "step": 2227 + }, + { + "epoch": 0.8021602160216021, + "grad_norm": 0.5311564207077026, + "learning_rate": 1.9342768805891178e-05, + "loss": 4.8866, + "step": 2228 + }, + { + "epoch": 0.8025202520252025, + "grad_norm": 0.8058616518974304, + "learning_rate": 1.927472162006717e-05, + "loss": 4.5184, + "step": 2229 + }, + { + "epoch": 0.8028802880288028, + "grad_norm": 0.690186619758606, + "learning_rate": 1.920678157111776e-05, + "loss": 4.657, + "step": 2230 + }, + { + "epoch": 0.8032403240324032, + "grad_norm": 0.7219494581222534, + "learning_rate": 1.9138948749211472e-05, + "loss": 4.7445, + "step": 2231 + }, + { + "epoch": 0.8036003600360035, + "grad_norm": 0.6288260221481323, + "learning_rate": 1.9071223244374614e-05, + "loss": 4.9032, + "step": 2232 + }, + { + "epoch": 0.803960396039604, + "grad_norm": 0.7226089239120483, + "learning_rate": 1.9003605146491054e-05, + "loss": 5.1746, + "step": 2233 + }, + { + "epoch": 0.8043204320432044, + "grad_norm": 0.8098820447921753, + "learning_rate": 1.8936094545302095e-05, + "loss": 4.8424, + "step": 2234 + }, + { + "epoch": 0.8046804680468047, + "grad_norm": 0.7526129484176636, + "learning_rate": 1.8868691530406336e-05, + "loss": 4.5838, + "step": 2235 + }, + { + "epoch": 0.8050405040504051, + "grad_norm": 0.8739807605743408, + "learning_rate": 1.8801396191259645e-05, + "loss": 4.7443, + "step": 2236 + }, + { + "epoch": 0.8054005400540054, + "grad_norm": 0.6892088651657104, + "learning_rate": 1.8734208617174988e-05, + "loss": 4.9657, + "step": 2237 + }, + { + "epoch": 0.8057605760576058, + "grad_norm": 0.5015419125556946, + "learning_rate": 1.866712889732225e-05, + "loss": 4.7528, + "step": 2238 + }, + { + "epoch": 0.8061206120612061, + "grad_norm": 0.5005907416343689, + "learning_rate": 1.8600157120728244e-05, + "loss": 4.7449, + "step": 2239 + }, + { + "epoch": 0.8064806480648065, + "grad_norm": 0.526184618473053, + "learning_rate": 1.8533293376276472e-05, + "loss": 4.6524, + "step": 2240 + }, + { + "epoch": 0.8068406840684068, + "grad_norm": 0.6554297804832458, + "learning_rate": 1.8466537752707068e-05, + "loss": 4.8402, + "step": 2241 + }, + { + "epoch": 0.8072007200720072, + "grad_norm": 0.6219531893730164, + "learning_rate": 1.839989033861673e-05, + "loss": 4.6053, + "step": 2242 + }, + { + "epoch": 0.8075607560756075, + "grad_norm": 0.6740393042564392, + "learning_rate": 1.8333351222458407e-05, + "loss": 4.385, + "step": 2243 + }, + { + "epoch": 0.807920792079208, + "grad_norm": 0.7463712096214294, + "learning_rate": 1.826692049254145e-05, + "loss": 4.8666, + "step": 2244 + }, + { + "epoch": 0.8082808280828083, + "grad_norm": 0.6941218376159668, + "learning_rate": 1.820059823703133e-05, + "loss": 4.6104, + "step": 2245 + }, + { + "epoch": 0.8086408640864087, + "grad_norm": 0.8766574859619141, + "learning_rate": 1.8134384543949478e-05, + "loss": 4.6754, + "step": 2246 + }, + { + "epoch": 0.809000900090009, + "grad_norm": 0.7981788516044617, + "learning_rate": 1.8068279501173335e-05, + "loss": 4.8805, + "step": 2247 + }, + { + "epoch": 0.8093609360936094, + "grad_norm": 1.02590811252594, + "learning_rate": 1.8002283196436097e-05, + "loss": 4.9571, + "step": 2248 + }, + { + "epoch": 0.8097209720972097, + "grad_norm": 1.0470219850540161, + "learning_rate": 1.7936395717326704e-05, + "loss": 5.117, + "step": 2249 + }, + { + "epoch": 0.8100810081008101, + "grad_norm": 1.3908179998397827, + "learning_rate": 1.787061715128956e-05, + "loss": 5.2106, + "step": 2250 + }, + { + "epoch": 0.8104410441044104, + "grad_norm": 0.970249354839325, + "learning_rate": 1.7804947585624588e-05, + "loss": 5.0495, + "step": 2251 + }, + { + "epoch": 0.8108010801080108, + "grad_norm": 0.7076825499534607, + "learning_rate": 1.773938710748706e-05, + "loss": 4.6217, + "step": 2252 + }, + { + "epoch": 0.8111611161116111, + "grad_norm": 0.657702624797821, + "learning_rate": 1.7673935803887453e-05, + "loss": 4.4113, + "step": 2253 + }, + { + "epoch": 0.8115211521152115, + "grad_norm": 0.6246639490127563, + "learning_rate": 1.760859376169133e-05, + "loss": 4.5696, + "step": 2254 + }, + { + "epoch": 0.8118811881188119, + "grad_norm": 0.5377760529518127, + "learning_rate": 1.754336106761927e-05, + "loss": 4.5045, + "step": 2255 + }, + { + "epoch": 0.8122412241224123, + "grad_norm": 0.5879418849945068, + "learning_rate": 1.7478237808246722e-05, + "loss": 5.1566, + "step": 2256 + }, + { + "epoch": 0.8126012601260126, + "grad_norm": 0.6851582527160645, + "learning_rate": 1.741322407000391e-05, + "loss": 4.776, + "step": 2257 + }, + { + "epoch": 0.812961296129613, + "grad_norm": 1.0152539014816284, + "learning_rate": 1.7348319939175637e-05, + "loss": 4.4992, + "step": 2258 + }, + { + "epoch": 0.8133213321332133, + "grad_norm": 0.6916372179985046, + "learning_rate": 1.7283525501901323e-05, + "loss": 4.1276, + "step": 2259 + }, + { + "epoch": 0.8136813681368137, + "grad_norm": 0.5402399897575378, + "learning_rate": 1.7218840844174754e-05, + "loss": 4.7419, + "step": 2260 + }, + { + "epoch": 0.814041404140414, + "grad_norm": 0.48339545726776123, + "learning_rate": 1.715426605184407e-05, + "loss": 4.3284, + "step": 2261 + }, + { + "epoch": 0.8144014401440144, + "grad_norm": 0.7083166241645813, + "learning_rate": 1.70898012106115e-05, + "loss": 4.5239, + "step": 2262 + }, + { + "epoch": 0.8147614761476147, + "grad_norm": 0.5646843314170837, + "learning_rate": 1.7025446406033453e-05, + "loss": 4.5845, + "step": 2263 + }, + { + "epoch": 0.8151215121512151, + "grad_norm": 0.7311326265335083, + "learning_rate": 1.696120172352025e-05, + "loss": 4.5815, + "step": 2264 + }, + { + "epoch": 0.8154815481548154, + "grad_norm": 0.743579626083374, + "learning_rate": 1.6897067248336095e-05, + "loss": 5.0977, + "step": 2265 + }, + { + "epoch": 0.8158415841584158, + "grad_norm": 0.6290472149848938, + "learning_rate": 1.683304306559884e-05, + "loss": 4.7559, + "step": 2266 + }, + { + "epoch": 0.8162016201620163, + "grad_norm": 0.5165727734565735, + "learning_rate": 1.676912926028007e-05, + "loss": 4.8021, + "step": 2267 + }, + { + "epoch": 0.8165616561656166, + "grad_norm": 0.5954656600952148, + "learning_rate": 1.6705325917204805e-05, + "loss": 4.7522, + "step": 2268 + }, + { + "epoch": 0.816921692169217, + "grad_norm": 0.6288970708847046, + "learning_rate": 1.66416331210515e-05, + "loss": 4.7196, + "step": 2269 + }, + { + "epoch": 0.8172817281728173, + "grad_norm": 0.6287972331047058, + "learning_rate": 1.6578050956351886e-05, + "loss": 5.1404, + "step": 2270 + }, + { + "epoch": 0.8176417641764177, + "grad_norm": 0.8024221658706665, + "learning_rate": 1.6514579507490848e-05, + "loss": 5.0936, + "step": 2271 + }, + { + "epoch": 0.818001800180018, + "grad_norm": 0.5822760462760925, + "learning_rate": 1.6451218858706374e-05, + "loss": 4.9177, + "step": 2272 + }, + { + "epoch": 0.8183618361836184, + "grad_norm": 0.8307278752326965, + "learning_rate": 1.6387969094089316e-05, + "loss": 5.2499, + "step": 2273 + }, + { + "epoch": 0.8187218721872187, + "grad_norm": 0.9024346470832825, + "learning_rate": 1.632483029758345e-05, + "loss": 5.276, + "step": 2274 + }, + { + "epoch": 0.819081908190819, + "grad_norm": 1.2056093215942383, + "learning_rate": 1.626180255298525e-05, + "loss": 5.3459, + "step": 2275 + }, + { + "epoch": 0.8194419441944194, + "grad_norm": 1.9106166362762451, + "learning_rate": 1.619888594394382e-05, + "loss": 4.8601, + "step": 2276 + }, + { + "epoch": 0.8198019801980198, + "grad_norm": 0.6660627126693726, + "learning_rate": 1.6136080553960687e-05, + "loss": 4.7553, + "step": 2277 + }, + { + "epoch": 0.8201620162016202, + "grad_norm": 0.4384493827819824, + "learning_rate": 1.6073386466389872e-05, + "loss": 4.5587, + "step": 2278 + }, + { + "epoch": 0.8205220522052206, + "grad_norm": 0.5030012726783752, + "learning_rate": 1.601080376443763e-05, + "loss": 4.5943, + "step": 2279 + }, + { + "epoch": 0.8208820882088209, + "grad_norm": 0.5574566721916199, + "learning_rate": 1.5948332531162413e-05, + "loss": 4.5229, + "step": 2280 + }, + { + "epoch": 0.8212421242124213, + "grad_norm": 0.519743025302887, + "learning_rate": 1.5885972849474672e-05, + "loss": 4.5851, + "step": 2281 + }, + { + "epoch": 0.8216021602160216, + "grad_norm": 0.5831529498100281, + "learning_rate": 1.5823724802136865e-05, + "loss": 4.7909, + "step": 2282 + }, + { + "epoch": 0.821962196219622, + "grad_norm": 0.8960453271865845, + "learning_rate": 1.576158847176329e-05, + "loss": 4.8842, + "step": 2283 + }, + { + "epoch": 0.8223222322232223, + "grad_norm": 0.6710848808288574, + "learning_rate": 1.5699563940819962e-05, + "loss": 4.8542, + "step": 2284 + }, + { + "epoch": 0.8226822682268227, + "grad_norm": 0.5260566473007202, + "learning_rate": 1.5637651291624523e-05, + "loss": 4.6999, + "step": 2285 + }, + { + "epoch": 0.823042304230423, + "grad_norm": 0.6472052335739136, + "learning_rate": 1.557585060634612e-05, + "loss": 4.5737, + "step": 2286 + }, + { + "epoch": 0.8234023402340234, + "grad_norm": 0.757953941822052, + "learning_rate": 1.5514161967005337e-05, + "loss": 4.4606, + "step": 2287 + }, + { + "epoch": 0.8237623762376237, + "grad_norm": 0.5936845541000366, + "learning_rate": 1.5452585455473977e-05, + "loss": 4.5589, + "step": 2288 + }, + { + "epoch": 0.8241224122412241, + "grad_norm": 0.6369442343711853, + "learning_rate": 1.539112115347511e-05, + "loss": 4.963, + "step": 2289 + }, + { + "epoch": 0.8244824482448245, + "grad_norm": 0.8031920194625854, + "learning_rate": 1.5329769142582827e-05, + "loss": 4.9157, + "step": 2290 + }, + { + "epoch": 0.8248424842484249, + "grad_norm": 0.756523847579956, + "learning_rate": 1.526852950422226e-05, + "loss": 4.6052, + "step": 2291 + }, + { + "epoch": 0.8252025202520252, + "grad_norm": 0.6446996331214905, + "learning_rate": 1.5207402319669306e-05, + "loss": 4.7325, + "step": 2292 + }, + { + "epoch": 0.8255625562556256, + "grad_norm": 0.6594321727752686, + "learning_rate": 1.5146387670050687e-05, + "loss": 4.7268, + "step": 2293 + }, + { + "epoch": 0.8259225922592259, + "grad_norm": 0.6736615896224976, + "learning_rate": 1.5085485636343755e-05, + "loss": 5.0275, + "step": 2294 + }, + { + "epoch": 0.8262826282628263, + "grad_norm": 0.5317438244819641, + "learning_rate": 1.5024696299376407e-05, + "loss": 4.7624, + "step": 2295 + }, + { + "epoch": 0.8266426642664266, + "grad_norm": 0.6013367772102356, + "learning_rate": 1.4964019739826907e-05, + "loss": 4.5998, + "step": 2296 + }, + { + "epoch": 0.827002700270027, + "grad_norm": 0.5864752531051636, + "learning_rate": 1.4903456038223939e-05, + "loss": 4.7793, + "step": 2297 + }, + { + "epoch": 0.8273627362736273, + "grad_norm": 1.0099995136260986, + "learning_rate": 1.4843005274946365e-05, + "loss": 5.441, + "step": 2298 + }, + { + "epoch": 0.8277227722772277, + "grad_norm": 1.1725364923477173, + "learning_rate": 1.4782667530223126e-05, + "loss": 4.894, + "step": 2299 + }, + { + "epoch": 0.828082808280828, + "grad_norm": 1.4512721300125122, + "learning_rate": 1.4722442884133214e-05, + "loss": 5.316, + "step": 2300 + }, + { + "epoch": 0.8284428442844285, + "grad_norm": 0.6787356734275818, + "learning_rate": 1.4662331416605501e-05, + "loss": 4.4722, + "step": 2301 + }, + { + "epoch": 0.8288028802880288, + "grad_norm": 1.0046409368515015, + "learning_rate": 1.4602333207418651e-05, + "loss": 4.455, + "step": 2302 + }, + { + "epoch": 0.8291629162916292, + "grad_norm": 0.7777742743492126, + "learning_rate": 1.454244833620102e-05, + "loss": 4.277, + "step": 2303 + }, + { + "epoch": 0.8295229522952295, + "grad_norm": 1.2001330852508545, + "learning_rate": 1.4482676882430502e-05, + "loss": 4.9538, + "step": 2304 + }, + { + "epoch": 0.8298829882988299, + "grad_norm": 0.6005185842514038, + "learning_rate": 1.4423018925434528e-05, + "loss": 4.5779, + "step": 2305 + }, + { + "epoch": 0.8302430243024302, + "grad_norm": 0.7251618504524231, + "learning_rate": 1.4363474544389877e-05, + "loss": 4.6006, + "step": 2306 + }, + { + "epoch": 0.8306030603060306, + "grad_norm": 0.9123652577400208, + "learning_rate": 1.4304043818322565e-05, + "loss": 4.927, + "step": 2307 + }, + { + "epoch": 0.8309630963096309, + "grad_norm": 0.9391204118728638, + "learning_rate": 1.424472682610779e-05, + "loss": 5.0999, + "step": 2308 + }, + { + "epoch": 0.8313231323132313, + "grad_norm": 0.6396461725234985, + "learning_rate": 1.4185523646469822e-05, + "loss": 4.552, + "step": 2309 + }, + { + "epoch": 0.8316831683168316, + "grad_norm": 0.5968081951141357, + "learning_rate": 1.4126434357981877e-05, + "loss": 4.6309, + "step": 2310 + }, + { + "epoch": 0.832043204320432, + "grad_norm": 0.8504002690315247, + "learning_rate": 1.4067459039065956e-05, + "loss": 4.995, + "step": 2311 + }, + { + "epoch": 0.8324032403240325, + "grad_norm": 0.9265114068984985, + "learning_rate": 1.4008597767992871e-05, + "loss": 4.8508, + "step": 2312 + }, + { + "epoch": 0.8327632763276328, + "grad_norm": 0.5217415690422058, + "learning_rate": 1.3949850622882054e-05, + "loss": 4.8427, + "step": 2313 + }, + { + "epoch": 0.8331233123312332, + "grad_norm": 0.8731891512870789, + "learning_rate": 1.3891217681701474e-05, + "loss": 4.713, + "step": 2314 + }, + { + "epoch": 0.8334833483348335, + "grad_norm": 0.602576732635498, + "learning_rate": 1.3832699022267515e-05, + "loss": 4.5357, + "step": 2315 + }, + { + "epoch": 0.8338433843384339, + "grad_norm": 0.6332751512527466, + "learning_rate": 1.3774294722244907e-05, + "loss": 4.6608, + "step": 2316 + }, + { + "epoch": 0.8342034203420342, + "grad_norm": 0.7543874979019165, + "learning_rate": 1.3716004859146592e-05, + "loss": 4.6207, + "step": 2317 + }, + { + "epoch": 0.8345634563456346, + "grad_norm": 0.888930082321167, + "learning_rate": 1.3657829510333654e-05, + "loss": 4.6348, + "step": 2318 + }, + { + "epoch": 0.8349234923492349, + "grad_norm": 0.8453302979469299, + "learning_rate": 1.3599768753015152e-05, + "loss": 4.7868, + "step": 2319 + }, + { + "epoch": 0.8352835283528353, + "grad_norm": 0.5163218379020691, + "learning_rate": 1.3541822664248094e-05, + "loss": 4.4345, + "step": 2320 + }, + { + "epoch": 0.8356435643564356, + "grad_norm": 0.49913036823272705, + "learning_rate": 1.3483991320937306e-05, + "loss": 4.961, + "step": 2321 + }, + { + "epoch": 0.836003600360036, + "grad_norm": 1.1673535108566284, + "learning_rate": 1.3426274799835337e-05, + "loss": 4.8401, + "step": 2322 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 0.8345517516136169, + "learning_rate": 1.336867317754229e-05, + "loss": 5.0522, + "step": 2323 + }, + { + "epoch": 0.8367236723672368, + "grad_norm": 0.9774869680404663, + "learning_rate": 1.3311186530505838e-05, + "loss": 5.1665, + "step": 2324 + }, + { + "epoch": 0.8370837083708371, + "grad_norm": 1.130599856376648, + "learning_rate": 1.3253814935021026e-05, + "loss": 5.1565, + "step": 2325 + }, + { + "epoch": 0.8374437443744375, + "grad_norm": 0.9888586401939392, + "learning_rate": 1.3196558467230247e-05, + "loss": 4.9575, + "step": 2326 + }, + { + "epoch": 0.8378037803780378, + "grad_norm": 0.6156508326530457, + "learning_rate": 1.3139417203123027e-05, + "loss": 4.9267, + "step": 2327 + }, + { + "epoch": 0.8381638163816382, + "grad_norm": 0.6930103302001953, + "learning_rate": 1.3082391218536061e-05, + "loss": 4.7754, + "step": 2328 + }, + { + "epoch": 0.8385238523852385, + "grad_norm": 0.7321805357933044, + "learning_rate": 1.3025480589153005e-05, + "loss": 4.8377, + "step": 2329 + }, + { + "epoch": 0.8388838883888389, + "grad_norm": 0.7025576233863831, + "learning_rate": 1.2968685390504465e-05, + "loss": 4.8009, + "step": 2330 + }, + { + "epoch": 0.8392439243924392, + "grad_norm": 0.5497130155563354, + "learning_rate": 1.29120056979678e-05, + "loss": 4.4953, + "step": 2331 + }, + { + "epoch": 0.8396039603960396, + "grad_norm": 0.5606801509857178, + "learning_rate": 1.2855441586767113e-05, + "loss": 4.4783, + "step": 2332 + }, + { + "epoch": 0.8399639963996399, + "grad_norm": 0.5393441915512085, + "learning_rate": 1.2798993131973091e-05, + "loss": 4.4954, + "step": 2333 + }, + { + "epoch": 0.8403240324032403, + "grad_norm": 0.601349949836731, + "learning_rate": 1.2742660408502904e-05, + "loss": 4.6084, + "step": 2334 + }, + { + "epoch": 0.8406840684068407, + "grad_norm": 0.5767045617103577, + "learning_rate": 1.2686443491120149e-05, + "loss": 4.7144, + "step": 2335 + }, + { + "epoch": 0.8410441044104411, + "grad_norm": 0.7713471055030823, + "learning_rate": 1.263034245443473e-05, + "loss": 4.673, + "step": 2336 + }, + { + "epoch": 0.8414041404140414, + "grad_norm": 0.771685779094696, + "learning_rate": 1.2574357372902767e-05, + "loss": 4.9869, + "step": 2337 + }, + { + "epoch": 0.8417641764176418, + "grad_norm": 0.5168091654777527, + "learning_rate": 1.2518488320826449e-05, + "loss": 4.5899, + "step": 2338 + }, + { + "epoch": 0.8421242124212421, + "grad_norm": 1.225448727607727, + "learning_rate": 1.2462735372353996e-05, + "loss": 4.7007, + "step": 2339 + }, + { + "epoch": 0.8424842484248425, + "grad_norm": 0.5362923741340637, + "learning_rate": 1.2407098601479539e-05, + "loss": 4.696, + "step": 2340 + }, + { + "epoch": 0.8428442844284428, + "grad_norm": 0.6797324419021606, + "learning_rate": 1.2351578082043047e-05, + "loss": 4.791, + "step": 2341 + }, + { + "epoch": 0.8432043204320432, + "grad_norm": 1.1674048900604248, + "learning_rate": 1.2296173887730123e-05, + "loss": 4.9394, + "step": 2342 + }, + { + "epoch": 0.8435643564356435, + "grad_norm": 0.6934359073638916, + "learning_rate": 1.2240886092072068e-05, + "loss": 4.5077, + "step": 2343 + }, + { + "epoch": 0.8439243924392439, + "grad_norm": 0.5877205729484558, + "learning_rate": 1.2185714768445667e-05, + "loss": 4.5705, + "step": 2344 + }, + { + "epoch": 0.8442844284428442, + "grad_norm": 0.6093941926956177, + "learning_rate": 1.2130659990073146e-05, + "loss": 4.5415, + "step": 2345 + }, + { + "epoch": 0.8446444644464447, + "grad_norm": 0.7287856936454773, + "learning_rate": 1.2075721830021969e-05, + "loss": 5.0374, + "step": 2346 + }, + { + "epoch": 0.845004500450045, + "grad_norm": 0.8647234439849854, + "learning_rate": 1.2020900361204968e-05, + "loss": 4.6786, + "step": 2347 + }, + { + "epoch": 0.8453645364536454, + "grad_norm": 0.7940008044242859, + "learning_rate": 1.1966195656380031e-05, + "loss": 5.0893, + "step": 2348 + }, + { + "epoch": 0.8457245724572457, + "grad_norm": 1.0051583051681519, + "learning_rate": 1.1911607788150036e-05, + "loss": 5.203, + "step": 2349 + }, + { + "epoch": 0.8460846084608461, + "grad_norm": 0.9990129470825195, + "learning_rate": 1.1857136828962855e-05, + "loss": 5.0418, + "step": 2350 + }, + { + "epoch": 0.8464446444644464, + "grad_norm": 1.3434467315673828, + "learning_rate": 1.1802782851111205e-05, + "loss": 5.1628, + "step": 2351 + }, + { + "epoch": 0.8468046804680468, + "grad_norm": 0.7505450248718262, + "learning_rate": 1.1748545926732535e-05, + "loss": 4.6661, + "step": 2352 + }, + { + "epoch": 0.8471647164716472, + "grad_norm": 1.0241285562515259, + "learning_rate": 1.169442612780891e-05, + "loss": 4.7007, + "step": 2353 + }, + { + "epoch": 0.8475247524752475, + "grad_norm": 1.4250359535217285, + "learning_rate": 1.1640423526166988e-05, + "loss": 4.5734, + "step": 2354 + }, + { + "epoch": 0.8478847884788479, + "grad_norm": 0.8431483507156372, + "learning_rate": 1.158653819347788e-05, + "loss": 4.6603, + "step": 2355 + }, + { + "epoch": 0.8482448244824482, + "grad_norm": 0.6706793904304504, + "learning_rate": 1.1532770201257082e-05, + "loss": 4.7574, + "step": 2356 + }, + { + "epoch": 0.8486048604860486, + "grad_norm": 0.6527566909790039, + "learning_rate": 1.1479119620864276e-05, + "loss": 4.6939, + "step": 2357 + }, + { + "epoch": 0.848964896489649, + "grad_norm": 0.846868634223938, + "learning_rate": 1.1425586523503395e-05, + "loss": 4.7602, + "step": 2358 + }, + { + "epoch": 0.8493249324932494, + "grad_norm": 0.8098002672195435, + "learning_rate": 1.1372170980222441e-05, + "loss": 4.9113, + "step": 2359 + }, + { + "epoch": 0.8496849684968497, + "grad_norm": 0.6968252062797546, + "learning_rate": 1.1318873061913405e-05, + "loss": 4.8477, + "step": 2360 + }, + { + "epoch": 0.8500450045004501, + "grad_norm": 0.6603767275810242, + "learning_rate": 1.1265692839312092e-05, + "loss": 4.7987, + "step": 2361 + }, + { + "epoch": 0.8504050405040504, + "grad_norm": 0.9465769529342651, + "learning_rate": 1.1212630382998213e-05, + "loss": 4.5938, + "step": 2362 + }, + { + "epoch": 0.8507650765076508, + "grad_norm": 0.6345024704933167, + "learning_rate": 1.1159685763395111e-05, + "loss": 4.7153, + "step": 2363 + }, + { + "epoch": 0.8511251125112511, + "grad_norm": 0.7378025054931641, + "learning_rate": 1.1106859050769769e-05, + "loss": 4.5385, + "step": 2364 + }, + { + "epoch": 0.8514851485148515, + "grad_norm": 0.9905508160591125, + "learning_rate": 1.1054150315232681e-05, + "loss": 4.7941, + "step": 2365 + }, + { + "epoch": 0.8518451845184518, + "grad_norm": 0.8407430648803711, + "learning_rate": 1.1001559626737756e-05, + "loss": 4.7788, + "step": 2366 + }, + { + "epoch": 0.8522052205220522, + "grad_norm": 0.8498520255088806, + "learning_rate": 1.0949087055082252e-05, + "loss": 4.1181, + "step": 2367 + }, + { + "epoch": 0.8525652565256525, + "grad_norm": 0.9800708889961243, + "learning_rate": 1.089673266990663e-05, + "loss": 4.8319, + "step": 2368 + }, + { + "epoch": 0.852925292529253, + "grad_norm": 0.7625902891159058, + "learning_rate": 1.0844496540694515e-05, + "loss": 4.7761, + "step": 2369 + }, + { + "epoch": 0.8532853285328533, + "grad_norm": 0.638638436794281, + "learning_rate": 1.0792378736772612e-05, + "loss": 4.7487, + "step": 2370 + }, + { + "epoch": 0.8536453645364537, + "grad_norm": 0.6259344220161438, + "learning_rate": 1.0740379327310569e-05, + "loss": 4.9022, + "step": 2371 + }, + { + "epoch": 0.854005400540054, + "grad_norm": 0.9138006567955017, + "learning_rate": 1.0688498381320855e-05, + "loss": 5.0246, + "step": 2372 + }, + { + "epoch": 0.8543654365436544, + "grad_norm": 0.9508568048477173, + "learning_rate": 1.0636735967658784e-05, + "loss": 4.9478, + "step": 2373 + }, + { + "epoch": 0.8547254725472547, + "grad_norm": 1.1043336391448975, + "learning_rate": 1.0585092155022336e-05, + "loss": 4.8974, + "step": 2374 + }, + { + "epoch": 0.8550855085508551, + "grad_norm": 1.3299425840377808, + "learning_rate": 1.0533567011952094e-05, + "loss": 5.1492, + "step": 2375 + }, + { + "epoch": 0.8554455445544554, + "grad_norm": 0.6157066822052002, + "learning_rate": 1.0482160606831093e-05, + "loss": 4.7135, + "step": 2376 + }, + { + "epoch": 0.8558055805580558, + "grad_norm": 1.1526126861572266, + "learning_rate": 1.0430873007884857e-05, + "loss": 4.8683, + "step": 2377 + }, + { + "epoch": 0.8561656165616561, + "grad_norm": 0.5212879180908203, + "learning_rate": 1.0379704283181179e-05, + "loss": 4.5955, + "step": 2378 + }, + { + "epoch": 0.8565256525652565, + "grad_norm": 0.8130112290382385, + "learning_rate": 1.0328654500630108e-05, + "loss": 4.7918, + "step": 2379 + }, + { + "epoch": 0.8568856885688569, + "grad_norm": 0.7200890183448792, + "learning_rate": 1.0277723727983845e-05, + "loss": 4.8406, + "step": 2380 + }, + { + "epoch": 0.8572457245724573, + "grad_norm": 0.6026584506034851, + "learning_rate": 1.0226912032836611e-05, + "loss": 4.7515, + "step": 2381 + }, + { + "epoch": 0.8576057605760576, + "grad_norm": 0.9684290885925293, + "learning_rate": 1.0176219482624616e-05, + "loss": 4.8093, + "step": 2382 + }, + { + "epoch": 0.857965796579658, + "grad_norm": 0.6427994966506958, + "learning_rate": 1.0125646144625955e-05, + "loss": 4.5308, + "step": 2383 + }, + { + "epoch": 0.8583258325832583, + "grad_norm": 1.033554196357727, + "learning_rate": 1.007519208596045e-05, + "loss": 4.8341, + "step": 2384 + }, + { + "epoch": 0.8586858685868587, + "grad_norm": 0.6669801473617554, + "learning_rate": 1.002485737358968e-05, + "loss": 4.8964, + "step": 2385 + }, + { + "epoch": 0.859045904590459, + "grad_norm": 0.6307418942451477, + "learning_rate": 9.974642074316798e-06, + "loss": 4.8266, + "step": 2386 + }, + { + "epoch": 0.8594059405940594, + "grad_norm": 0.6424444913864136, + "learning_rate": 9.924546254786493e-06, + "loss": 4.7471, + "step": 2387 + }, + { + "epoch": 0.8597659765976597, + "grad_norm": 0.8725467920303345, + "learning_rate": 9.874569981484861e-06, + "loss": 4.5142, + "step": 2388 + }, + { + "epoch": 0.8601260126012601, + "grad_norm": 1.1564704179763794, + "learning_rate": 9.824713320739342e-06, + "loss": 4.7016, + "step": 2389 + }, + { + "epoch": 0.8604860486048604, + "grad_norm": 0.7655138969421387, + "learning_rate": 9.774976338718677e-06, + "loss": 4.3319, + "step": 2390 + }, + { + "epoch": 0.8608460846084608, + "grad_norm": 0.7302666306495667, + "learning_rate": 9.725359101432674e-06, + "loss": 4.6624, + "step": 2391 + }, + { + "epoch": 0.8612061206120613, + "grad_norm": 0.7123817801475525, + "learning_rate": 9.675861674732312e-06, + "loss": 4.5181, + "step": 2392 + }, + { + "epoch": 0.8615661566156616, + "grad_norm": 0.5299736261367798, + "learning_rate": 9.62648412430951e-06, + "loss": 4.6567, + "step": 2393 + }, + { + "epoch": 0.861926192619262, + "grad_norm": 0.711216390132904, + "learning_rate": 9.577226515697124e-06, + "loss": 4.6318, + "step": 2394 + }, + { + "epoch": 0.8622862286228623, + "grad_norm": 0.7324408888816833, + "learning_rate": 9.528088914268784e-06, + "loss": 4.6413, + "step": 2395 + }, + { + "epoch": 0.8626462646264627, + "grad_norm": 0.6073545217514038, + "learning_rate": 9.479071385238892e-06, + "loss": 4.6813, + "step": 2396 + }, + { + "epoch": 0.863006300630063, + "grad_norm": 0.9429351687431335, + "learning_rate": 9.430173993662451e-06, + "loss": 4.7784, + "step": 2397 + }, + { + "epoch": 0.8633663366336634, + "grad_norm": 0.8551303148269653, + "learning_rate": 9.381396804435061e-06, + "loss": 5.4424, + "step": 2398 + }, + { + "epoch": 0.8637263726372637, + "grad_norm": 0.9081370234489441, + "learning_rate": 9.332739882292752e-06, + "loss": 5.1161, + "step": 2399 + }, + { + "epoch": 0.8640864086408641, + "grad_norm": 0.9314940571784973, + "learning_rate": 9.284203291811954e-06, + "loss": 5.0829, + "step": 2400 + }, + { + "epoch": 0.8644464446444644, + "grad_norm": 0.747048556804657, + "learning_rate": 9.23578709740942e-06, + "loss": 4.671, + "step": 2401 + }, + { + "epoch": 0.8648064806480648, + "grad_norm": 0.8901441097259521, + "learning_rate": 9.187491363342093e-06, + "loss": 4.7503, + "step": 2402 + }, + { + "epoch": 0.8651665166516652, + "grad_norm": 0.8733905553817749, + "learning_rate": 9.139316153707023e-06, + "loss": 4.1668, + "step": 2403 + }, + { + "epoch": 0.8655265526552656, + "grad_norm": 0.8293418288230896, + "learning_rate": 9.091261532441342e-06, + "loss": 4.9468, + "step": 2404 + }, + { + "epoch": 0.8658865886588659, + "grad_norm": 0.6938745379447937, + "learning_rate": 9.043327563322112e-06, + "loss": 4.8042, + "step": 2405 + }, + { + "epoch": 0.8662466246624663, + "grad_norm": 0.7350160479545593, + "learning_rate": 8.995514309966302e-06, + "loss": 5.0458, + "step": 2406 + }, + { + "epoch": 0.8666066606660666, + "grad_norm": 0.5033836960792542, + "learning_rate": 8.947821835830616e-06, + "loss": 4.7879, + "step": 2407 + }, + { + "epoch": 0.866966696669667, + "grad_norm": 0.9561224579811096, + "learning_rate": 8.900250204211514e-06, + "loss": 4.8389, + "step": 2408 + }, + { + "epoch": 0.8673267326732673, + "grad_norm": 0.8856688141822815, + "learning_rate": 8.852799478245032e-06, + "loss": 4.5677, + "step": 2409 + }, + { + "epoch": 0.8676867686768677, + "grad_norm": 0.48359963297843933, + "learning_rate": 8.80546972090679e-06, + "loss": 4.6154, + "step": 2410 + }, + { + "epoch": 0.868046804680468, + "grad_norm": 0.6133762001991272, + "learning_rate": 8.758260995011825e-06, + "loss": 4.818, + "step": 2411 + }, + { + "epoch": 0.8684068406840684, + "grad_norm": 0.7412658333778381, + "learning_rate": 8.711173363214553e-06, + "loss": 4.5966, + "step": 2412 + }, + { + "epoch": 0.8687668766876687, + "grad_norm": 0.681463897228241, + "learning_rate": 8.664206888008697e-06, + "loss": 4.5624, + "step": 2413 + }, + { + "epoch": 0.8691269126912692, + "grad_norm": 0.7318177223205566, + "learning_rate": 8.617361631727138e-06, + "loss": 4.5771, + "step": 2414 + }, + { + "epoch": 0.8694869486948695, + "grad_norm": 0.8274372220039368, + "learning_rate": 8.570637656541914e-06, + "loss": 4.9709, + "step": 2415 + }, + { + "epoch": 0.8698469846984699, + "grad_norm": 0.6884218454360962, + "learning_rate": 8.524035024464105e-06, + "loss": 4.9416, + "step": 2416 + }, + { + "epoch": 0.8702070207020702, + "grad_norm": 0.7390003800392151, + "learning_rate": 8.47755379734373e-06, + "loss": 4.7099, + "step": 2417 + }, + { + "epoch": 0.8705670567056706, + "grad_norm": 1.129050374031067, + "learning_rate": 8.431194036869672e-06, + "loss": 4.8879, + "step": 2418 + }, + { + "epoch": 0.8709270927092709, + "grad_norm": 0.7855664491653442, + "learning_rate": 8.384955804569627e-06, + "loss": 4.8775, + "step": 2419 + }, + { + "epoch": 0.8712871287128713, + "grad_norm": 0.5347578525543213, + "learning_rate": 8.338839161809997e-06, + "loss": 4.6191, + "step": 2420 + }, + { + "epoch": 0.8716471647164716, + "grad_norm": 0.754165768623352, + "learning_rate": 8.292844169795833e-06, + "loss": 4.5964, + "step": 2421 + }, + { + "epoch": 0.872007200720072, + "grad_norm": 0.7422668933868408, + "learning_rate": 8.24697088957066e-06, + "loss": 4.804, + "step": 2422 + }, + { + "epoch": 0.8723672367236723, + "grad_norm": 0.8895533680915833, + "learning_rate": 8.201219382016556e-06, + "loss": 5.1019, + "step": 2423 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.7388155460357666, + "learning_rate": 8.15558970785395e-06, + "loss": 4.7046, + "step": 2424 + }, + { + "epoch": 0.873087308730873, + "grad_norm": 1.3550125360488892, + "learning_rate": 8.110081927641566e-06, + "loss": 4.9702, + "step": 2425 + }, + { + "epoch": 0.8734473447344735, + "grad_norm": 2.8201375007629395, + "learning_rate": 8.064696101776358e-06, + "loss": 5.3071, + "step": 2426 + }, + { + "epoch": 0.8738073807380738, + "grad_norm": 1.0882468223571777, + "learning_rate": 8.019432290493457e-06, + "loss": 4.851, + "step": 2427 + }, + { + "epoch": 0.8741674167416742, + "grad_norm": 0.9948346018791199, + "learning_rate": 7.974290553866005e-06, + "loss": 5.0427, + "step": 2428 + }, + { + "epoch": 0.8745274527452745, + "grad_norm": 0.6691415309906006, + "learning_rate": 7.929270951805178e-06, + "loss": 4.9892, + "step": 2429 + }, + { + "epoch": 0.8748874887488749, + "grad_norm": 0.775093138217926, + "learning_rate": 7.884373544060009e-06, + "loss": 4.3908, + "step": 2430 + }, + { + "epoch": 0.8752475247524752, + "grad_norm": 0.6868644952774048, + "learning_rate": 7.839598390217396e-06, + "loss": 4.7946, + "step": 2431 + }, + { + "epoch": 0.8756075607560756, + "grad_norm": 0.6689639091491699, + "learning_rate": 7.794945549701993e-06, + "loss": 4.8674, + "step": 2432 + }, + { + "epoch": 0.875967596759676, + "grad_norm": 0.9124707579612732, + "learning_rate": 7.750415081776063e-06, + "loss": 4.9911, + "step": 2433 + }, + { + "epoch": 0.8763276327632763, + "grad_norm": 0.7038251161575317, + "learning_rate": 7.70600704553951e-06, + "loss": 4.8584, + "step": 2434 + }, + { + "epoch": 0.8766876687668766, + "grad_norm": 0.7156389951705933, + "learning_rate": 7.661721499929753e-06, + "loss": 4.3274, + "step": 2435 + }, + { + "epoch": 0.877047704770477, + "grad_norm": 0.8068670034408569, + "learning_rate": 7.6175585037216226e-06, + "loss": 4.5658, + "step": 2436 + }, + { + "epoch": 0.8774077407740775, + "grad_norm": 0.7935437560081482, + "learning_rate": 7.573518115527289e-06, + "loss": 4.9122, + "step": 2437 + }, + { + "epoch": 0.8777677767776778, + "grad_norm": 0.9261611700057983, + "learning_rate": 7.529600393796232e-06, + "loss": 4.8509, + "step": 2438 + }, + { + "epoch": 0.8781278127812782, + "grad_norm": 0.8355916738510132, + "learning_rate": 7.485805396815126e-06, + "loss": 4.3652, + "step": 2439 + }, + { + "epoch": 0.8784878487848785, + "grad_norm": 0.43560856580734253, + "learning_rate": 7.442133182707745e-06, + "loss": 4.5542, + "step": 2440 + }, + { + "epoch": 0.8788478847884789, + "grad_norm": 0.7524927258491516, + "learning_rate": 7.3985838094349444e-06, + "loss": 4.7226, + "step": 2441 + }, + { + "epoch": 0.8792079207920792, + "grad_norm": 0.7564715147018433, + "learning_rate": 7.355157334794516e-06, + "loss": 4.7208, + "step": 2442 + }, + { + "epoch": 0.8795679567956796, + "grad_norm": 0.9967451691627502, + "learning_rate": 7.3118538164211545e-06, + "loss": 5.2971, + "step": 2443 + }, + { + "epoch": 0.8799279927992799, + "grad_norm": 0.8346577286720276, + "learning_rate": 7.2686733117863784e-06, + "loss": 4.7256, + "step": 2444 + }, + { + "epoch": 0.8802880288028803, + "grad_norm": 0.638346254825592, + "learning_rate": 7.225615878198422e-06, + "loss": 4.8184, + "step": 2445 + }, + { + "epoch": 0.8806480648064806, + "grad_norm": 0.5529339909553528, + "learning_rate": 7.1826815728021965e-06, + "loss": 4.5564, + "step": 2446 + }, + { + "epoch": 0.881008100810081, + "grad_norm": 0.5807334184646606, + "learning_rate": 7.1398704525792e-06, + "loss": 4.8166, + "step": 2447 + }, + { + "epoch": 0.8813681368136813, + "grad_norm": 0.7800282835960388, + "learning_rate": 7.097182574347472e-06, + "loss": 5.1405, + "step": 2448 + }, + { + "epoch": 0.8817281728172818, + "grad_norm": 0.6698582768440247, + "learning_rate": 7.054617994761414e-06, + "loss": 4.907, + "step": 2449 + }, + { + "epoch": 0.8820882088208821, + "grad_norm": 1.5296711921691895, + "learning_rate": 7.012176770311862e-06, + "loss": 5.3242, + "step": 2450 + }, + { + "epoch": 0.8824482448244825, + "grad_norm": 1.2979846000671387, + "learning_rate": 6.969858957325904e-06, + "loss": 4.922, + "step": 2451 + }, + { + "epoch": 0.8828082808280828, + "grad_norm": 0.6412333250045776, + "learning_rate": 6.927664611966811e-06, + "loss": 5.0412, + "step": 2452 + }, + { + "epoch": 0.8831683168316832, + "grad_norm": 0.619648814201355, + "learning_rate": 6.8855937902340576e-06, + "loss": 4.507, + "step": 2453 + }, + { + "epoch": 0.8835283528352835, + "grad_norm": 0.5163532495498657, + "learning_rate": 6.843646547963123e-06, + "loss": 4.9747, + "step": 2454 + }, + { + "epoch": 0.8838883888388839, + "grad_norm": 0.7194183468818665, + "learning_rate": 6.801822940825509e-06, + "loss": 4.5837, + "step": 2455 + }, + { + "epoch": 0.8842484248424842, + "grad_norm": 0.8414213061332703, + "learning_rate": 6.760123024328624e-06, + "loss": 4.6327, + "step": 2456 + }, + { + "epoch": 0.8846084608460846, + "grad_norm": 0.684772253036499, + "learning_rate": 6.718546853815688e-06, + "loss": 4.9001, + "step": 2457 + }, + { + "epoch": 0.8849684968496849, + "grad_norm": 0.47863101959228516, + "learning_rate": 6.67709448446574e-06, + "loss": 4.8486, + "step": 2458 + }, + { + "epoch": 0.8853285328532853, + "grad_norm": 0.6075344681739807, + "learning_rate": 6.635765971293484e-06, + "loss": 4.9541, + "step": 2459 + }, + { + "epoch": 0.8856885688568857, + "grad_norm": 0.5354955196380615, + "learning_rate": 6.594561369149199e-06, + "loss": 4.6317, + "step": 2460 + }, + { + "epoch": 0.8860486048604861, + "grad_norm": 0.7306193113327026, + "learning_rate": 6.553480732718808e-06, + "loss": 4.524, + "step": 2461 + }, + { + "epoch": 0.8864086408640864, + "grad_norm": 0.6062951683998108, + "learning_rate": 6.512524116523633e-06, + "loss": 4.5702, + "step": 2462 + }, + { + "epoch": 0.8867686768676868, + "grad_norm": 0.7549055814743042, + "learning_rate": 6.4716915749204465e-06, + "loss": 4.6434, + "step": 2463 + }, + { + "epoch": 0.8871287128712871, + "grad_norm": 0.83303302526474, + "learning_rate": 6.4309831621013005e-06, + "loss": 4.7192, + "step": 2464 + }, + { + "epoch": 0.8874887488748875, + "grad_norm": 0.6464311480522156, + "learning_rate": 6.390398932093555e-06, + "loss": 4.7064, + "step": 2465 + }, + { + "epoch": 0.8878487848784878, + "grad_norm": 1.1855717897415161, + "learning_rate": 6.3499389387597254e-06, + "loss": 4.9074, + "step": 2466 + }, + { + "epoch": 0.8882088208820882, + "grad_norm": 0.8404142260551453, + "learning_rate": 6.30960323579749e-06, + "loss": 4.8212, + "step": 2467 + }, + { + "epoch": 0.8885688568856885, + "grad_norm": 0.5642232894897461, + "learning_rate": 6.269391876739495e-06, + "loss": 4.8076, + "step": 2468 + }, + { + "epoch": 0.8889288928892889, + "grad_norm": 0.8032687306404114, + "learning_rate": 6.229304914953405e-06, + "loss": 5.019, + "step": 2469 + }, + { + "epoch": 0.8892889288928892, + "grad_norm": 0.8474968671798706, + "learning_rate": 6.189342403641807e-06, + "loss": 5.0512, + "step": 2470 + }, + { + "epoch": 0.8896489648964897, + "grad_norm": 0.6336872577667236, + "learning_rate": 6.149504395842087e-06, + "loss": 4.6737, + "step": 2471 + }, + { + "epoch": 0.89000900090009, + "grad_norm": 0.704339325428009, + "learning_rate": 6.109790944426397e-06, + "loss": 4.5293, + "step": 2472 + }, + { + "epoch": 0.8903690369036904, + "grad_norm": 0.8684128522872925, + "learning_rate": 6.070202102101597e-06, + "loss": 4.7989, + "step": 2473 + }, + { + "epoch": 0.8907290729072908, + "grad_norm": 1.042490839958191, + "learning_rate": 6.030737921409169e-06, + "loss": 4.9338, + "step": 2474 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 1.1774296760559082, + "learning_rate": 5.9913984547250945e-06, + "loss": 5.2439, + "step": 2475 + }, + { + "epoch": 0.8914491449144915, + "grad_norm": 1.316601037979126, + "learning_rate": 5.95218375425991e-06, + "loss": 5.2065, + "step": 2476 + }, + { + "epoch": 0.8918091809180918, + "grad_norm": 0.9096778631210327, + "learning_rate": 5.913093872058528e-06, + "loss": 4.6322, + "step": 2477 + }, + { + "epoch": 0.8921692169216922, + "grad_norm": 0.6777582764625549, + "learning_rate": 5.874128860000216e-06, + "loss": 4.4147, + "step": 2478 + }, + { + "epoch": 0.8925292529252925, + "grad_norm": 0.5754499435424805, + "learning_rate": 5.835288769798486e-06, + "loss": 4.833, + "step": 2479 + }, + { + "epoch": 0.8928892889288929, + "grad_norm": 1.0769809484481812, + "learning_rate": 5.7965736530010916e-06, + "loss": 4.4729, + "step": 2480 + }, + { + "epoch": 0.8932493249324932, + "grad_norm": 0.6358700394630432, + "learning_rate": 5.757983560989921e-06, + "loss": 4.9246, + "step": 2481 + }, + { + "epoch": 0.8936093609360936, + "grad_norm": 1.1338918209075928, + "learning_rate": 5.719518544980929e-06, + "loss": 4.7706, + "step": 2482 + }, + { + "epoch": 0.893969396939694, + "grad_norm": 0.4643517732620239, + "learning_rate": 5.681178656024055e-06, + "loss": 4.7676, + "step": 2483 + }, + { + "epoch": 0.8943294329432944, + "grad_norm": 1.0510177612304688, + "learning_rate": 5.642963945003188e-06, + "loss": 4.6983, + "step": 2484 + }, + { + "epoch": 0.8946894689468947, + "grad_norm": 0.8604787588119507, + "learning_rate": 5.604874462636078e-06, + "loss": 4.4971, + "step": 2485 + }, + { + "epoch": 0.8950495049504951, + "grad_norm": 0.8846144080162048, + "learning_rate": 5.566910259474289e-06, + "loss": 4.4979, + "step": 2486 + }, + { + "epoch": 0.8954095409540954, + "grad_norm": 0.7206079959869385, + "learning_rate": 5.529071385903084e-06, + "loss": 4.8595, + "step": 2487 + }, + { + "epoch": 0.8957695769576958, + "grad_norm": 0.8029129505157471, + "learning_rate": 5.491357892141425e-06, + "loss": 4.871, + "step": 2488 + }, + { + "epoch": 0.8961296129612961, + "grad_norm": 0.5468530654907227, + "learning_rate": 5.453769828241872e-06, + "loss": 4.4908, + "step": 2489 + }, + { + "epoch": 0.8964896489648965, + "grad_norm": 1.086614727973938, + "learning_rate": 5.416307244090502e-06, + "loss": 4.8457, + "step": 2490 + }, + { + "epoch": 0.8968496849684968, + "grad_norm": 1.064418077468872, + "learning_rate": 5.378970189406829e-06, + "loss": 4.6813, + "step": 2491 + }, + { + "epoch": 0.8972097209720972, + "grad_norm": 0.5295194387435913, + "learning_rate": 5.341758713743828e-06, + "loss": 4.557, + "step": 2492 + }, + { + "epoch": 0.8975697569756975, + "grad_norm": 0.8219357132911682, + "learning_rate": 5.304672866487792e-06, + "loss": 4.8301, + "step": 2493 + }, + { + "epoch": 0.897929792979298, + "grad_norm": 0.9336304664611816, + "learning_rate": 5.267712696858229e-06, + "loss": 4.7836, + "step": 2494 + }, + { + "epoch": 0.8982898289828983, + "grad_norm": 0.8698000907897949, + "learning_rate": 5.230878253907912e-06, + "loss": 4.6849, + "step": 2495 + }, + { + "epoch": 0.8986498649864987, + "grad_norm": 0.6905087232589722, + "learning_rate": 5.194169586522734e-06, + "loss": 4.7967, + "step": 2496 + }, + { + "epoch": 0.899009900990099, + "grad_norm": 0.754138171672821, + "learning_rate": 5.157586743421672e-06, + "loss": 4.9295, + "step": 2497 + }, + { + "epoch": 0.8993699369936994, + "grad_norm": 1.8262755870819092, + "learning_rate": 5.121129773156663e-06, + "loss": 5.313, + "step": 2498 + }, + { + "epoch": 0.8997299729972997, + "grad_norm": 0.7745803594589233, + "learning_rate": 5.0847987241126385e-06, + "loss": 5.1595, + "step": 2499 + }, + { + "epoch": 0.9000900090009001, + "grad_norm": 1.3876433372497559, + "learning_rate": 5.0485936445074046e-06, + "loss": 5.2019, + "step": 2500 + }, + { + "epoch": 0.9004500450045004, + "grad_norm": 1.133023738861084, + "learning_rate": 5.012514582391592e-06, + "loss": 4.5723, + "step": 2501 + }, + { + "epoch": 0.9008100810081008, + "grad_norm": 0.6465590000152588, + "learning_rate": 4.976561585648509e-06, + "loss": 4.7929, + "step": 2502 + }, + { + "epoch": 0.9011701170117011, + "grad_norm": 1.2847857475280762, + "learning_rate": 4.9407347019942544e-06, + "loss": 4.8718, + "step": 2503 + }, + { + "epoch": 0.9015301530153015, + "grad_norm": 0.569114089012146, + "learning_rate": 4.905033978977491e-06, + "loss": 4.4804, + "step": 2504 + }, + { + "epoch": 0.9018901890189019, + "grad_norm": 0.9793164134025574, + "learning_rate": 4.869459463979465e-06, + "loss": 4.986, + "step": 2505 + }, + { + "epoch": 0.9022502250225023, + "grad_norm": 0.5514426231384277, + "learning_rate": 4.8340112042139065e-06, + "loss": 4.9524, + "step": 2506 + }, + { + "epoch": 0.9026102610261026, + "grad_norm": 0.8211607336997986, + "learning_rate": 4.798689246727006e-06, + "loss": 4.8468, + "step": 2507 + }, + { + "epoch": 0.902970297029703, + "grad_norm": 0.5269903540611267, + "learning_rate": 4.7634936383973095e-06, + "loss": 4.8626, + "step": 2508 + }, + { + "epoch": 0.9033303330333033, + "grad_norm": 0.6444000005722046, + "learning_rate": 4.728424425935707e-06, + "loss": 4.551, + "step": 2509 + }, + { + "epoch": 0.9036903690369037, + "grad_norm": 1.075435757637024, + "learning_rate": 4.693481655885257e-06, + "loss": 4.8247, + "step": 2510 + }, + { + "epoch": 0.904050405040504, + "grad_norm": 1.0397629737854004, + "learning_rate": 4.658665374621307e-06, + "loss": 4.6963, + "step": 2511 + }, + { + "epoch": 0.9044104410441044, + "grad_norm": 0.6805405616760254, + "learning_rate": 4.623975628351273e-06, + "loss": 4.4516, + "step": 2512 + }, + { + "epoch": 0.9047704770477047, + "grad_norm": 0.7398169040679932, + "learning_rate": 4.58941246311464e-06, + "loss": 4.7951, + "step": 2513 + }, + { + "epoch": 0.9051305130513051, + "grad_norm": 0.6716864109039307, + "learning_rate": 4.554975924782912e-06, + "loss": 4.7471, + "step": 2514 + }, + { + "epoch": 0.9054905490549054, + "grad_norm": 0.6767914295196533, + "learning_rate": 4.520666059059531e-06, + "loss": 4.5634, + "step": 2515 + }, + { + "epoch": 0.9058505850585058, + "grad_norm": 0.7175542712211609, + "learning_rate": 4.486482911479839e-06, + "loss": 4.719, + "step": 2516 + }, + { + "epoch": 0.9062106210621063, + "grad_norm": 0.9069615602493286, + "learning_rate": 4.452426527410947e-06, + "loss": 5.5713, + "step": 2517 + }, + { + "epoch": 0.9065706570657066, + "grad_norm": 0.6263923048973083, + "learning_rate": 4.418496952051798e-06, + "loss": 4.7829, + "step": 2518 + }, + { + "epoch": 0.906930693069307, + "grad_norm": 0.7558562159538269, + "learning_rate": 4.384694230432984e-06, + "loss": 4.9266, + "step": 2519 + }, + { + "epoch": 0.9072907290729073, + "grad_norm": 0.6696991324424744, + "learning_rate": 4.351018407416763e-06, + "loss": 4.3571, + "step": 2520 + }, + { + "epoch": 0.9076507650765077, + "grad_norm": 0.6993823051452637, + "learning_rate": 4.317469527696983e-06, + "loss": 5.2419, + "step": 2521 + }, + { + "epoch": 0.908010801080108, + "grad_norm": 0.6072081923484802, + "learning_rate": 4.2840476357989825e-06, + "loss": 4.9883, + "step": 2522 + }, + { + "epoch": 0.9083708370837084, + "grad_norm": 0.8503673672676086, + "learning_rate": 4.250752776079614e-06, + "loss": 5.0176, + "step": 2523 + }, + { + "epoch": 0.9087308730873087, + "grad_norm": 0.9142279624938965, + "learning_rate": 4.217584992727108e-06, + "loss": 5.2182, + "step": 2524 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.77701735496521, + "learning_rate": 4.184544329761009e-06, + "loss": 5.2836, + "step": 2525 + }, + { + "epoch": 0.9094509450945094, + "grad_norm": 2.4094419479370117, + "learning_rate": 4.151630831032205e-06, + "loss": 4.853, + "step": 2526 + }, + { + "epoch": 0.9098109810981098, + "grad_norm": 0.5602378249168396, + "learning_rate": 4.118844540222788e-06, + "loss": 5.1699, + "step": 2527 + }, + { + "epoch": 0.9101710171017102, + "grad_norm": 0.7844763994216919, + "learning_rate": 4.0861855008460405e-06, + "loss": 4.8816, + "step": 2528 + }, + { + "epoch": 0.9105310531053106, + "grad_norm": 0.5660812258720398, + "learning_rate": 4.0536537562463225e-06, + "loss": 4.8106, + "step": 2529 + }, + { + "epoch": 0.9108910891089109, + "grad_norm": 0.5048322081565857, + "learning_rate": 4.021249349599077e-06, + "loss": 4.2835, + "step": 2530 + }, + { + "epoch": 0.9112511251125113, + "grad_norm": 0.7268801927566528, + "learning_rate": 3.988972323910778e-06, + "loss": 4.59, + "step": 2531 + }, + { + "epoch": 0.9116111611161116, + "grad_norm": 0.7188135981559753, + "learning_rate": 3.95682272201876e-06, + "loss": 4.6502, + "step": 2532 + }, + { + "epoch": 0.911971197119712, + "grad_norm": 0.6325691342353821, + "learning_rate": 3.924800586591326e-06, + "loss": 4.7787, + "step": 2533 + }, + { + "epoch": 0.9123312331233123, + "grad_norm": 0.5503108501434326, + "learning_rate": 3.892905960127546e-06, + "loss": 4.6064, + "step": 2534 + }, + { + "epoch": 0.9126912691269127, + "grad_norm": 0.8401983380317688, + "learning_rate": 3.861138884957316e-06, + "loss": 4.3366, + "step": 2535 + }, + { + "epoch": 0.913051305130513, + "grad_norm": 0.521528422832489, + "learning_rate": 3.829499403241221e-06, + "loss": 4.7727, + "step": 2536 + }, + { + "epoch": 0.9134113411341134, + "grad_norm": 0.5778352618217468, + "learning_rate": 3.797987556970495e-06, + "loss": 4.9353, + "step": 2537 + }, + { + "epoch": 0.9137713771377137, + "grad_norm": 0.5739848613739014, + "learning_rate": 3.7666033879670048e-06, + "loss": 4.973, + "step": 2538 + }, + { + "epoch": 0.9141314131413142, + "grad_norm": 0.5352855324745178, + "learning_rate": 3.735346937883144e-06, + "loss": 4.511, + "step": 2539 + }, + { + "epoch": 0.9144914491449145, + "grad_norm": 0.7256152033805847, + "learning_rate": 3.7042182482018075e-06, + "loss": 4.5012, + "step": 2540 + }, + { + "epoch": 0.9148514851485149, + "grad_norm": 0.8612756133079529, + "learning_rate": 3.6732173602363363e-06, + "loss": 4.6615, + "step": 2541 + }, + { + "epoch": 0.9152115211521152, + "grad_norm": 0.6105715036392212, + "learning_rate": 3.6423443151304526e-06, + "loss": 4.451, + "step": 2542 + }, + { + "epoch": 0.9155715571557156, + "grad_norm": 0.6826533079147339, + "learning_rate": 3.611599153858214e-06, + "loss": 4.6159, + "step": 2543 + }, + { + "epoch": 0.9159315931593159, + "grad_norm": 0.6430963277816772, + "learning_rate": 3.580981917223913e-06, + "loss": 4.7071, + "step": 2544 + }, + { + "epoch": 0.9162916291629163, + "grad_norm": 0.6871779561042786, + "learning_rate": 3.5504926458621246e-06, + "loss": 4.741, + "step": 2545 + }, + { + "epoch": 0.9166516651665166, + "grad_norm": 0.7385034561157227, + "learning_rate": 3.5201313802375456e-06, + "loss": 4.6154, + "step": 2546 + }, + { + "epoch": 0.917011701170117, + "grad_norm": 0.9472239017486572, + "learning_rate": 3.4898981606450333e-06, + "loss": 5.0247, + "step": 2547 + }, + { + "epoch": 0.9173717371737173, + "grad_norm": 0.7648311853408813, + "learning_rate": 3.4597930272094235e-06, + "loss": 5.1778, + "step": 2548 + }, + { + "epoch": 0.9177317731773177, + "grad_norm": 1.013818383216858, + "learning_rate": 3.4298160198856568e-06, + "loss": 4.9648, + "step": 2549 + }, + { + "epoch": 0.918091809180918, + "grad_norm": 1.1295243501663208, + "learning_rate": 3.3999671784585517e-06, + "loss": 5.2376, + "step": 2550 + }, + { + "epoch": 0.9184518451845185, + "grad_norm": 3.160092830657959, + "learning_rate": 3.370246542542865e-06, + "loss": 4.6393, + "step": 2551 + }, + { + "epoch": 0.9188118811881189, + "grad_norm": 0.7330535054206848, + "learning_rate": 3.3406541515832003e-06, + "loss": 4.5272, + "step": 2552 + }, + { + "epoch": 0.9191719171917192, + "grad_norm": 0.9496917724609375, + "learning_rate": 3.311190044853951e-06, + "loss": 5.049, + "step": 2553 + }, + { + "epoch": 0.9195319531953196, + "grad_norm": 0.7707210779190063, + "learning_rate": 3.2818542614592497e-06, + "loss": 4.7573, + "step": 2554 + }, + { + "epoch": 0.9198919891989199, + "grad_norm": 0.673030436038971, + "learning_rate": 3.252646840332918e-06, + "loss": 4.6971, + "step": 2555 + }, + { + "epoch": 0.9202520252025203, + "grad_norm": 0.6012186408042908, + "learning_rate": 3.2235678202384267e-06, + "loss": 4.5512, + "step": 2556 + }, + { + "epoch": 0.9206120612061206, + "grad_norm": 1.0911415815353394, + "learning_rate": 3.1946172397688267e-06, + "loss": 4.3919, + "step": 2557 + }, + { + "epoch": 0.920972097209721, + "grad_norm": 0.7308849692344666, + "learning_rate": 3.1657951373467497e-06, + "loss": 4.8589, + "step": 2558 + }, + { + "epoch": 0.9213321332133213, + "grad_norm": 0.676698625087738, + "learning_rate": 3.1371015512242306e-06, + "loss": 4.8728, + "step": 2559 + }, + { + "epoch": 0.9216921692169217, + "grad_norm": 1.072251796722412, + "learning_rate": 3.1085365194828075e-06, + "loss": 5.1019, + "step": 2560 + }, + { + "epoch": 0.922052205220522, + "grad_norm": 0.6631917953491211, + "learning_rate": 3.0801000800333877e-06, + "loss": 4.6797, + "step": 2561 + }, + { + "epoch": 0.9224122412241225, + "grad_norm": 0.5610212683677673, + "learning_rate": 3.051792270616216e-06, + "loss": 4.6816, + "step": 2562 + }, + { + "epoch": 0.9227722772277228, + "grad_norm": 0.5413954257965088, + "learning_rate": 3.023613128800795e-06, + "loss": 4.6725, + "step": 2563 + }, + { + "epoch": 0.9231323132313232, + "grad_norm": 0.7505655288696289, + "learning_rate": 2.995562691985898e-06, + "loss": 4.7661, + "step": 2564 + }, + { + "epoch": 0.9234923492349235, + "grad_norm": 0.810063362121582, + "learning_rate": 2.9676409973994566e-06, + "loss": 4.773, + "step": 2565 + }, + { + "epoch": 0.9238523852385239, + "grad_norm": 1.0670089721679688, + "learning_rate": 2.939848082098562e-06, + "loss": 5.1925, + "step": 2566 + }, + { + "epoch": 0.9242124212421242, + "grad_norm": 0.6282406449317932, + "learning_rate": 2.912183982969385e-06, + "loss": 4.9323, + "step": 2567 + }, + { + "epoch": 0.9245724572457246, + "grad_norm": 0.9556277394294739, + "learning_rate": 2.8846487367271135e-06, + "loss": 4.8946, + "step": 2568 + }, + { + "epoch": 0.9249324932493249, + "grad_norm": 0.5398349761962891, + "learning_rate": 2.8572423799159586e-06, + "loss": 4.5384, + "step": 2569 + }, + { + "epoch": 0.9252925292529253, + "grad_norm": 0.6668215990066528, + "learning_rate": 2.8299649489090475e-06, + "loss": 4.6729, + "step": 2570 + }, + { + "epoch": 0.9256525652565256, + "grad_norm": 0.6637030839920044, + "learning_rate": 2.802816479908399e-06, + "loss": 4.8677, + "step": 2571 + }, + { + "epoch": 0.926012601260126, + "grad_norm": 0.5411921143531799, + "learning_rate": 2.7757970089449024e-06, + "loss": 4.6855, + "step": 2572 + }, + { + "epoch": 0.9263726372637263, + "grad_norm": 0.5961670875549316, + "learning_rate": 2.748906571878207e-06, + "loss": 4.913, + "step": 2573 + }, + { + "epoch": 0.9267326732673268, + "grad_norm": 0.8857282400131226, + "learning_rate": 2.722145204396742e-06, + "loss": 5.2247, + "step": 2574 + }, + { + "epoch": 0.9270927092709271, + "grad_norm": 1.259244680404663, + "learning_rate": 2.6955129420176196e-06, + "loss": 5.4074, + "step": 2575 + }, + { + "epoch": 0.9274527452745275, + "grad_norm": 0.9549702405929565, + "learning_rate": 2.6690098200866098e-06, + "loss": 4.5297, + "step": 2576 + }, + { + "epoch": 0.9278127812781278, + "grad_norm": 0.7229859232902527, + "learning_rate": 2.6426358737781098e-06, + "loss": 4.9248, + "step": 2577 + }, + { + "epoch": 0.9281728172817282, + "grad_norm": 0.722059428691864, + "learning_rate": 2.6163911380950425e-06, + "loss": 4.8236, + "step": 2578 + }, + { + "epoch": 0.9285328532853285, + "grad_norm": 0.6506609916687012, + "learning_rate": 2.590275647868867e-06, + "loss": 4.4557, + "step": 2579 + }, + { + "epoch": 0.9288928892889289, + "grad_norm": 0.7389491200447083, + "learning_rate": 2.564289437759515e-06, + "loss": 4.7994, + "step": 2580 + }, + { + "epoch": 0.9292529252925292, + "grad_norm": 0.5561854839324951, + "learning_rate": 2.53843254225532e-06, + "loss": 4.2714, + "step": 2581 + }, + { + "epoch": 0.9296129612961296, + "grad_norm": 0.7266308069229126, + "learning_rate": 2.5127049956730207e-06, + "loss": 4.3677, + "step": 2582 + }, + { + "epoch": 0.9299729972997299, + "grad_norm": 0.7052050828933716, + "learning_rate": 2.4871068321576596e-06, + "loss": 4.5698, + "step": 2583 + }, + { + "epoch": 0.9303330333033303, + "grad_norm": 0.7006728649139404, + "learning_rate": 2.4616380856825716e-06, + "loss": 4.7132, + "step": 2584 + }, + { + "epoch": 0.9306930693069307, + "grad_norm": 0.6953990459442139, + "learning_rate": 2.436298790049363e-06, + "loss": 4.6789, + "step": 2585 + }, + { + "epoch": 0.9310531053105311, + "grad_norm": 0.6005169749259949, + "learning_rate": 2.4110889788877656e-06, + "loss": 4.4247, + "step": 2586 + }, + { + "epoch": 0.9314131413141314, + "grad_norm": 0.5607119798660278, + "learning_rate": 2.3860086856557383e-06, + "loss": 4.5032, + "step": 2587 + }, + { + "epoch": 0.9317731773177318, + "grad_norm": 0.8107718825340271, + "learning_rate": 2.3610579436393e-06, + "loss": 4.9354, + "step": 2588 + }, + { + "epoch": 0.9321332133213321, + "grad_norm": 0.5992099642753601, + "learning_rate": 2.33623678595255e-06, + "loss": 4.5928, + "step": 2589 + }, + { + "epoch": 0.9324932493249325, + "grad_norm": 0.5759278535842896, + "learning_rate": 2.311545245537594e-06, + "loss": 4.8887, + "step": 2590 + }, + { + "epoch": 0.9328532853285328, + "grad_norm": 0.7263954281806946, + "learning_rate": 2.286983355164529e-06, + "loss": 4.8844, + "step": 2591 + }, + { + "epoch": 0.9332133213321332, + "grad_norm": 0.6867969632148743, + "learning_rate": 2.2625511474313685e-06, + "loss": 4.5716, + "step": 2592 + }, + { + "epoch": 0.9335733573357335, + "grad_norm": 0.6144258379936218, + "learning_rate": 2.23824865476403e-06, + "loss": 4.7591, + "step": 2593 + }, + { + "epoch": 0.9339333933393339, + "grad_norm": 0.6983000040054321, + "learning_rate": 2.2140759094162467e-06, + "loss": 4.5595, + "step": 2594 + }, + { + "epoch": 0.9342934293429342, + "grad_norm": 0.7773663997650146, + "learning_rate": 2.1900329434695887e-06, + "loss": 4.6514, + "step": 2595 + }, + { + "epoch": 0.9346534653465347, + "grad_norm": 0.7720439434051514, + "learning_rate": 2.166119788833354e-06, + "loss": 4.9227, + "step": 2596 + }, + { + "epoch": 0.9350135013501351, + "grad_norm": 0.7285527586936951, + "learning_rate": 2.1423364772445887e-06, + "loss": 4.8956, + "step": 2597 + }, + { + "epoch": 0.9353735373537354, + "grad_norm": 1.0870779752731323, + "learning_rate": 2.118683040267999e-06, + "loss": 5.0033, + "step": 2598 + }, + { + "epoch": 0.9357335733573358, + "grad_norm": 0.965726912021637, + "learning_rate": 2.095159509295919e-06, + "loss": 5.2102, + "step": 2599 + }, + { + "epoch": 0.9360936093609361, + "grad_norm": 1.4614653587341309, + "learning_rate": 2.0717659155482738e-06, + "loss": 5.5101, + "step": 2600 + }, + { + "epoch": 0.9364536453645365, + "grad_norm": 0.5772082209587097, + "learning_rate": 2.0485022900725513e-06, + "loss": 4.786, + "step": 2601 + }, + { + "epoch": 0.9368136813681368, + "grad_norm": 0.7124701142311096, + "learning_rate": 2.025368663743743e-06, + "loss": 4.4701, + "step": 2602 + }, + { + "epoch": 0.9371737173717372, + "grad_norm": 0.7923992872238159, + "learning_rate": 2.002365067264289e-06, + "loss": 5.1817, + "step": 2603 + }, + { + "epoch": 0.9375337533753375, + "grad_norm": 0.6645485758781433, + "learning_rate": 1.9794915311641018e-06, + "loss": 4.5608, + "step": 2604 + }, + { + "epoch": 0.9378937893789379, + "grad_norm": 1.0128847360610962, + "learning_rate": 1.9567480858004306e-06, + "loss": 4.549, + "step": 2605 + }, + { + "epoch": 0.9382538253825382, + "grad_norm": 0.6514415144920349, + "learning_rate": 1.9341347613579087e-06, + "loss": 5.0031, + "step": 2606 + }, + { + "epoch": 0.9386138613861386, + "grad_norm": 0.9877171516418457, + "learning_rate": 1.91165158784844e-06, + "loss": 5.0519, + "step": 2607 + }, + { + "epoch": 0.938973897389739, + "grad_norm": 0.6819136738777161, + "learning_rate": 1.889298595111233e-06, + "loss": 4.6016, + "step": 2608 + }, + { + "epoch": 0.9393339333933394, + "grad_norm": 0.9286605715751648, + "learning_rate": 1.8670758128126909e-06, + "loss": 4.8841, + "step": 2609 + }, + { + "epoch": 0.9396939693969397, + "grad_norm": 0.5582537651062012, + "learning_rate": 1.844983270446432e-06, + "loss": 4.7466, + "step": 2610 + }, + { + "epoch": 0.9400540054005401, + "grad_norm": 0.9149574041366577, + "learning_rate": 1.8230209973331914e-06, + "loss": 4.5378, + "step": 2611 + }, + { + "epoch": 0.9404140414041404, + "grad_norm": 0.5852335691452026, + "learning_rate": 1.8011890226208527e-06, + "loss": 4.906, + "step": 2612 + }, + { + "epoch": 0.9407740774077408, + "grad_norm": 0.7403162717819214, + "learning_rate": 1.7794873752843277e-06, + "loss": 4.6849, + "step": 2613 + }, + { + "epoch": 0.9411341134113411, + "grad_norm": 0.7608280777931213, + "learning_rate": 1.7579160841256104e-06, + "loss": 4.6213, + "step": 2614 + }, + { + "epoch": 0.9414941494149415, + "grad_norm": 0.7691354751586914, + "learning_rate": 1.7364751777736332e-06, + "loss": 4.6725, + "step": 2615 + }, + { + "epoch": 0.9418541854185418, + "grad_norm": 0.7262532114982605, + "learning_rate": 1.7151646846843227e-06, + "loss": 4.9798, + "step": 2616 + }, + { + "epoch": 0.9422142214221422, + "grad_norm": 0.5878902077674866, + "learning_rate": 1.6939846331405108e-06, + "loss": 4.707, + "step": 2617 + }, + { + "epoch": 0.9425742574257425, + "grad_norm": 0.637844443321228, + "learning_rate": 1.6729350512519005e-06, + "loss": 4.9285, + "step": 2618 + }, + { + "epoch": 0.942934293429343, + "grad_norm": 0.6767174005508423, + "learning_rate": 1.6520159669550783e-06, + "loss": 4.8655, + "step": 2619 + }, + { + "epoch": 0.9432943294329433, + "grad_norm": 0.6745604872703552, + "learning_rate": 1.6312274080133804e-06, + "loss": 4.9893, + "step": 2620 + }, + { + "epoch": 0.9436543654365437, + "grad_norm": 0.6165941953659058, + "learning_rate": 1.6105694020169593e-06, + "loss": 4.676, + "step": 2621 + }, + { + "epoch": 0.944014401440144, + "grad_norm": 1.2223420143127441, + "learning_rate": 1.5900419763826614e-06, + "loss": 4.7306, + "step": 2622 + }, + { + "epoch": 0.9443744374437444, + "grad_norm": 0.6925899386405945, + "learning_rate": 1.5696451583540827e-06, + "loss": 5.2183, + "step": 2623 + }, + { + "epoch": 0.9447344734473447, + "grad_norm": 0.6742193102836609, + "learning_rate": 1.5493789750014031e-06, + "loss": 4.9525, + "step": 2624 + }, + { + "epoch": 0.9450945094509451, + "grad_norm": 1.818490982055664, + "learning_rate": 1.5292434532215072e-06, + "loss": 5.372, + "step": 2625 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 3.1944949626922607, + "learning_rate": 1.5092386197378183e-06, + "loss": 4.7497, + "step": 2626 + }, + { + "epoch": 0.9458145814581458, + "grad_norm": 0.5739015340805054, + "learning_rate": 1.489364501100332e-06, + "loss": 4.6389, + "step": 2627 + }, + { + "epoch": 0.9461746174617461, + "grad_norm": 0.8385379910469055, + "learning_rate": 1.4696211236855272e-06, + "loss": 4.8675, + "step": 2628 + }, + { + "epoch": 0.9465346534653465, + "grad_norm": 0.7263203859329224, + "learning_rate": 1.4500085136964326e-06, + "loss": 4.5676, + "step": 2629 + }, + { + "epoch": 0.946894689468947, + "grad_norm": 0.9063706398010254, + "learning_rate": 1.430526697162482e-06, + "loss": 4.4987, + "step": 2630 + }, + { + "epoch": 0.9472547254725473, + "grad_norm": 0.9547297954559326, + "learning_rate": 1.4111756999395154e-06, + "loss": 4.7062, + "step": 2631 + }, + { + "epoch": 0.9476147614761476, + "grad_norm": 0.7871283292770386, + "learning_rate": 1.3919555477097668e-06, + "loss": 4.6941, + "step": 2632 + }, + { + "epoch": 0.947974797479748, + "grad_norm": 0.8934873342514038, + "learning_rate": 1.3728662659818204e-06, + "loss": 4.5796, + "step": 2633 + }, + { + "epoch": 0.9483348334833483, + "grad_norm": 0.5550655126571655, + "learning_rate": 1.3539078800905659e-06, + "loss": 4.6309, + "step": 2634 + }, + { + "epoch": 0.9486948694869487, + "grad_norm": 0.8748136758804321, + "learning_rate": 1.3350804151971653e-06, + "loss": 4.7027, + "step": 2635 + }, + { + "epoch": 0.949054905490549, + "grad_norm": 0.6412554383277893, + "learning_rate": 1.3163838962890195e-06, + "loss": 4.7536, + "step": 2636 + }, + { + "epoch": 0.9494149414941494, + "grad_norm": 0.5575628280639648, + "learning_rate": 1.2978183481797801e-06, + "loss": 4.9255, + "step": 2637 + }, + { + "epoch": 0.9497749774977498, + "grad_norm": 0.6832976341247559, + "learning_rate": 1.2793837955092258e-06, + "loss": 4.8386, + "step": 2638 + }, + { + "epoch": 0.9501350135013501, + "grad_norm": 0.7369568347930908, + "learning_rate": 1.261080262743297e-06, + "loss": 4.8507, + "step": 2639 + }, + { + "epoch": 0.9504950495049505, + "grad_norm": 0.6423168778419495, + "learning_rate": 1.2429077741740736e-06, + "loss": 4.5613, + "step": 2640 + }, + { + "epoch": 0.9508550855085508, + "grad_norm": 0.6358969807624817, + "learning_rate": 1.2248663539196848e-06, + "loss": 4.5165, + "step": 2641 + }, + { + "epoch": 0.9512151215121513, + "grad_norm": 0.726514995098114, + "learning_rate": 1.2069560259243328e-06, + "loss": 5.0404, + "step": 2642 + }, + { + "epoch": 0.9515751575157516, + "grad_norm": 0.6469401121139526, + "learning_rate": 1.1891768139582037e-06, + "loss": 4.3748, + "step": 2643 + }, + { + "epoch": 0.951935193519352, + "grad_norm": 0.7196247577667236, + "learning_rate": 1.1715287416175113e-06, + "loss": 4.7141, + "step": 2644 + }, + { + "epoch": 0.9522952295229523, + "grad_norm": 0.6038268804550171, + "learning_rate": 1.1540118323243865e-06, + "loss": 4.5587, + "step": 2645 + }, + { + "epoch": 0.9526552655265527, + "grad_norm": 0.6786099076271057, + "learning_rate": 1.1366261093268992e-06, + "loss": 5.1968, + "step": 2646 + }, + { + "epoch": 0.953015301530153, + "grad_norm": 0.5878410339355469, + "learning_rate": 1.1193715956990258e-06, + "loss": 4.7435, + "step": 2647 + }, + { + "epoch": 0.9533753375337534, + "grad_norm": 0.8352496027946472, + "learning_rate": 1.1022483143405705e-06, + "loss": 4.9383, + "step": 2648 + }, + { + "epoch": 0.9537353735373537, + "grad_norm": 0.7436163425445557, + "learning_rate": 1.08525628797721e-06, + "loss": 4.9565, + "step": 2649 + }, + { + "epoch": 0.9540954095409541, + "grad_norm": 0.9482848644256592, + "learning_rate": 1.068395539160394e-06, + "loss": 4.9916, + "step": 2650 + }, + { + "epoch": 0.9544554455445544, + "grad_norm": 1.374624252319336, + "learning_rate": 1.0516660902673448e-06, + "loss": 4.6837, + "step": 2651 + }, + { + "epoch": 0.9548154815481548, + "grad_norm": 0.5657400488853455, + "learning_rate": 1.035067963501024e-06, + "loss": 4.3842, + "step": 2652 + }, + { + "epoch": 0.9551755175517552, + "grad_norm": 0.640466570854187, + "learning_rate": 1.018601180890133e-06, + "loss": 4.8339, + "step": 2653 + }, + { + "epoch": 0.9555355535553556, + "grad_norm": 0.7675738334655762, + "learning_rate": 1.0022657642890231e-06, + "loss": 4.5358, + "step": 2654 + }, + { + "epoch": 0.9558955895589559, + "grad_norm": 0.6688444018363953, + "learning_rate": 9.86061735377708e-07, + "loss": 4.8008, + "step": 2655 + }, + { + "epoch": 0.9562556255625563, + "grad_norm": 0.7679070830345154, + "learning_rate": 9.699891156618402e-07, + "loss": 4.8804, + "step": 2656 + }, + { + "epoch": 0.9566156615661566, + "grad_norm": 0.7303805351257324, + "learning_rate": 9.540479264726676e-07, + "loss": 4.6286, + "step": 2657 + }, + { + "epoch": 0.956975697569757, + "grad_norm": 0.7357656955718994, + "learning_rate": 9.382381889669667e-07, + "loss": 4.7679, + "step": 2658 + }, + { + "epoch": 0.9573357335733573, + "grad_norm": 0.8778092265129089, + "learning_rate": 9.225599241271199e-07, + "loss": 4.8732, + "step": 2659 + }, + { + "epoch": 0.9576957695769577, + "grad_norm": 0.4600130021572113, + "learning_rate": 9.070131527609604e-07, + "loss": 4.8048, + "step": 2660 + }, + { + "epoch": 0.958055805580558, + "grad_norm": 0.6551803946495056, + "learning_rate": 8.9159789550185e-07, + "loss": 4.506, + "step": 2661 + }, + { + "epoch": 0.9584158415841584, + "grad_norm": 0.7119221687316895, + "learning_rate": 8.763141728085789e-07, + "loss": 4.5518, + "step": 2662 + }, + { + "epoch": 0.9587758775877587, + "grad_norm": 0.571420431137085, + "learning_rate": 8.611620049653879e-07, + "loss": 4.8277, + "step": 2663 + }, + { + "epoch": 0.9591359135913592, + "grad_norm": 0.9699096083641052, + "learning_rate": 8.461414120819133e-07, + "loss": 4.9504, + "step": 2664 + }, + { + "epoch": 0.9594959495949595, + "grad_norm": 1.1072616577148438, + "learning_rate": 8.312524140931644e-07, + "loss": 4.5514, + "step": 2665 + }, + { + "epoch": 0.9598559855985599, + "grad_norm": 0.5243213176727295, + "learning_rate": 8.16495030759501e-07, + "loss": 5.0715, + "step": 2666 + }, + { + "epoch": 0.9602160216021602, + "grad_norm": 1.0006332397460938, + "learning_rate": 8.018692816666118e-07, + "loss": 5.0062, + "step": 2667 + }, + { + "epoch": 0.9605760576057606, + "grad_norm": 0.7771649956703186, + "learning_rate": 7.873751862254696e-07, + "loss": 4.8082, + "step": 2668 + }, + { + "epoch": 0.9609360936093609, + "grad_norm": 0.6831532120704651, + "learning_rate": 7.730127636723539e-07, + "loss": 4.8668, + "step": 2669 + }, + { + "epoch": 0.9612961296129613, + "grad_norm": 0.9201724529266357, + "learning_rate": 7.587820330687389e-07, + "loss": 4.5572, + "step": 2670 + }, + { + "epoch": 0.9616561656165616, + "grad_norm": 0.730021595954895, + "learning_rate": 7.446830133013616e-07, + "loss": 4.5861, + "step": 2671 + }, + { + "epoch": 0.962016201620162, + "grad_norm": 0.7570610642433167, + "learning_rate": 7.307157230821426e-07, + "loss": 5.0164, + "step": 2672 + }, + { + "epoch": 0.9623762376237623, + "grad_norm": 0.8982148170471191, + "learning_rate": 7.168801809481763e-07, + "loss": 5.0613, + "step": 2673 + }, + { + "epoch": 0.9627362736273627, + "grad_norm": 0.7683990001678467, + "learning_rate": 7.031764052616852e-07, + "loss": 4.9976, + "step": 2674 + }, + { + "epoch": 0.963096309630963, + "grad_norm": 1.3601149320602417, + "learning_rate": 6.896044142100433e-07, + "loss": 5.3859, + "step": 2675 + }, + { + "epoch": 0.9634563456345635, + "grad_norm": 0.7018367648124695, + "learning_rate": 6.761642258056978e-07, + "loss": 4.7254, + "step": 2676 + }, + { + "epoch": 0.9638163816381639, + "grad_norm": 0.8308577537536621, + "learning_rate": 6.628558578862021e-07, + "loss": 5.1009, + "step": 2677 + }, + { + "epoch": 0.9641764176417642, + "grad_norm": 0.6796321272850037, + "learning_rate": 6.496793281141056e-07, + "loss": 4.7258, + "step": 2678 + }, + { + "epoch": 0.9645364536453646, + "grad_norm": 0.8252047300338745, + "learning_rate": 6.366346539770529e-07, + "loss": 4.5518, + "step": 2679 + }, + { + "epoch": 0.9648964896489649, + "grad_norm": 0.7564746737480164, + "learning_rate": 6.237218527876399e-07, + "loss": 4.4544, + "step": 2680 + }, + { + "epoch": 0.9652565256525653, + "grad_norm": 0.49899938702583313, + "learning_rate": 6.109409416834688e-07, + "loss": 4.7035, + "step": 2681 + }, + { + "epoch": 0.9656165616561656, + "grad_norm": 0.8740639686584473, + "learning_rate": 5.982919376270823e-07, + "loss": 4.5884, + "step": 2682 + }, + { + "epoch": 0.965976597659766, + "grad_norm": 1.1928685903549194, + "learning_rate": 5.857748574059851e-07, + "loss": 4.7686, + "step": 2683 + }, + { + "epoch": 0.9663366336633663, + "grad_norm": 0.8761278986930847, + "learning_rate": 5.733897176325665e-07, + "loss": 4.9492, + "step": 2684 + }, + { + "epoch": 0.9666966696669667, + "grad_norm": 0.6431811451911926, + "learning_rate": 5.611365347441334e-07, + "loss": 4.8721, + "step": 2685 + }, + { + "epoch": 0.967056705670567, + "grad_norm": 0.7401660084724426, + "learning_rate": 5.49015325002833e-07, + "loss": 4.8349, + "step": 2686 + }, + { + "epoch": 0.9674167416741675, + "grad_norm": 0.7588707804679871, + "learning_rate": 5.370261044956971e-07, + "loss": 4.6324, + "step": 2687 + }, + { + "epoch": 0.9677767776777678, + "grad_norm": 0.5506744980812073, + "learning_rate": 5.25168889134553e-07, + "loss": 4.6068, + "step": 2688 + }, + { + "epoch": 0.9681368136813682, + "grad_norm": 0.6155896782875061, + "learning_rate": 5.134436946560572e-07, + "loss": 4.7839, + "step": 2689 + }, + { + "epoch": 0.9684968496849685, + "grad_norm": 0.7731359601020813, + "learning_rate": 5.018505366216175e-07, + "loss": 4.8457, + "step": 2690 + }, + { + "epoch": 0.9688568856885689, + "grad_norm": 0.6838685870170593, + "learning_rate": 4.903894304174372e-07, + "loss": 4.9202, + "step": 2691 + }, + { + "epoch": 0.9692169216921692, + "grad_norm": 0.7036203742027283, + "learning_rate": 4.790603912544489e-07, + "loss": 4.609, + "step": 2692 + }, + { + "epoch": 0.9695769576957696, + "grad_norm": 0.6218310594558716, + "learning_rate": 4.678634341683252e-07, + "loss": 4.7149, + "step": 2693 + }, + { + "epoch": 0.9699369936993699, + "grad_norm": 0.6616173982620239, + "learning_rate": 4.567985740194236e-07, + "loss": 4.5665, + "step": 2694 + }, + { + "epoch": 0.9702970297029703, + "grad_norm": 0.677174985408783, + "learning_rate": 4.458658254927972e-07, + "loss": 5.1044, + "step": 2695 + }, + { + "epoch": 0.9706570657065706, + "grad_norm": 0.7272735238075256, + "learning_rate": 4.3506520309813947e-07, + "loss": 5.0046, + "step": 2696 + }, + { + "epoch": 0.971017101710171, + "grad_norm": 0.9171267151832581, + "learning_rate": 4.2439672116982855e-07, + "loss": 5.1378, + "step": 2697 + }, + { + "epoch": 0.9713771377137714, + "grad_norm": 0.9265881180763245, + "learning_rate": 4.138603938668273e-07, + "loss": 5.1032, + "step": 2698 + }, + { + "epoch": 0.9717371737173718, + "grad_norm": 0.8648788332939148, + "learning_rate": 4.034562351727389e-07, + "loss": 5.3116, + "step": 2699 + }, + { + "epoch": 0.9720972097209721, + "grad_norm": 1.2427067756652832, + "learning_rate": 3.9318425889574017e-07, + "loss": 5.5403, + "step": 2700 + }, + { + "epoch": 0.9724572457245725, + "grad_norm": 1.1116799116134644, + "learning_rate": 3.8304447866857053e-07, + "loss": 5.1365, + "step": 2701 + }, + { + "epoch": 0.9728172817281728, + "grad_norm": 0.5051364898681641, + "learning_rate": 3.73036907948543e-07, + "loss": 4.3293, + "step": 2702 + }, + { + "epoch": 0.9731773177317732, + "grad_norm": 1.0161361694335938, + "learning_rate": 3.631615600174887e-07, + "loss": 4.4407, + "step": 2703 + }, + { + "epoch": 0.9735373537353735, + "grad_norm": 0.8723888993263245, + "learning_rate": 3.5341844798174594e-07, + "loss": 5.0663, + "step": 2704 + }, + { + "epoch": 0.9738973897389739, + "grad_norm": 0.6905868649482727, + "learning_rate": 3.4380758477219333e-07, + "loss": 4.9586, + "step": 2705 + }, + { + "epoch": 0.9742574257425742, + "grad_norm": 0.6786131262779236, + "learning_rate": 3.343289831441387e-07, + "loss": 4.4407, + "step": 2706 + }, + { + "epoch": 0.9746174617461746, + "grad_norm": 0.7154151201248169, + "learning_rate": 3.2498265567739717e-07, + "loss": 4.6941, + "step": 2707 + }, + { + "epoch": 0.9749774977497749, + "grad_norm": 0.7994034290313721, + "learning_rate": 3.1576861477621287e-07, + "loss": 4.857, + "step": 2708 + }, + { + "epoch": 0.9753375337533753, + "grad_norm": 0.6819135546684265, + "learning_rate": 3.0668687266925956e-07, + "loss": 5.0646, + "step": 2709 + }, + { + "epoch": 0.9756975697569757, + "grad_norm": 0.613059401512146, + "learning_rate": 2.977374414096401e-07, + "loss": 4.9047, + "step": 2710 + }, + { + "epoch": 0.9760576057605761, + "grad_norm": 0.9619209170341492, + "learning_rate": 2.889203328748424e-07, + "loss": 4.5585, + "step": 2711 + }, + { + "epoch": 0.9764176417641764, + "grad_norm": 0.5810762047767639, + "learning_rate": 2.8023555876673937e-07, + "loss": 4.8418, + "step": 2712 + }, + { + "epoch": 0.9767776777677768, + "grad_norm": 0.6123738884925842, + "learning_rate": 2.7168313061159964e-07, + "loss": 4.5524, + "step": 2713 + }, + { + "epoch": 0.9771377137713771, + "grad_norm": 0.6715987324714661, + "learning_rate": 2.6326305976001055e-07, + "loss": 4.9585, + "step": 2714 + }, + { + "epoch": 0.9774977497749775, + "grad_norm": 0.6126198768615723, + "learning_rate": 2.549753573869107e-07, + "loss": 4.6261, + "step": 2715 + }, + { + "epoch": 0.9778577857785778, + "grad_norm": 0.6247376203536987, + "learning_rate": 2.468200344915572e-07, + "loss": 4.7465, + "step": 2716 + }, + { + "epoch": 0.9782178217821782, + "grad_norm": 0.6758084297180176, + "learning_rate": 2.3879710189753656e-07, + "loss": 4.6146, + "step": 2717 + }, + { + "epoch": 0.9785778577857785, + "grad_norm": 0.8351864218711853, + "learning_rate": 2.3090657025270912e-07, + "loss": 4.9151, + "step": 2718 + }, + { + "epoch": 0.9789378937893789, + "grad_norm": 1.0861241817474365, + "learning_rate": 2.2314845002922025e-07, + "loss": 4.75, + "step": 2719 + }, + { + "epoch": 0.9792979297929792, + "grad_norm": 0.8706967830657959, + "learning_rate": 2.15522751523467e-07, + "loss": 5.033, + "step": 2720 + }, + { + "epoch": 0.9796579657965797, + "grad_norm": 0.8538120985031128, + "learning_rate": 2.080294848561426e-07, + "loss": 4.5049, + "step": 2721 + }, + { + "epoch": 0.9800180018001801, + "grad_norm": 0.567323625087738, + "learning_rate": 2.0066865997212525e-07, + "loss": 4.7168, + "step": 2722 + }, + { + "epoch": 0.9803780378037804, + "grad_norm": 0.8506109118461609, + "learning_rate": 1.9344028664056713e-07, + "loss": 5.0249, + "step": 2723 + }, + { + "epoch": 0.9807380738073808, + "grad_norm": 1.3589588403701782, + "learning_rate": 1.8634437445479435e-07, + "loss": 5.1037, + "step": 2724 + }, + { + "epoch": 0.9810981098109811, + "grad_norm": 1.3143948316574097, + "learning_rate": 1.7938093283236258e-07, + "loss": 5.555, + "step": 2725 + }, + { + "epoch": 0.9814581458145815, + "grad_norm": 1.7907097339630127, + "learning_rate": 1.7254997101500137e-07, + "loss": 4.8558, + "step": 2726 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 0.6720486283302307, + "learning_rate": 1.6585149806860324e-07, + "loss": 4.4563, + "step": 2727 + }, + { + "epoch": 0.9821782178217822, + "grad_norm": 0.6979610919952393, + "learning_rate": 1.5928552288326793e-07, + "loss": 4.5348, + "step": 2728 + }, + { + "epoch": 0.9825382538253825, + "grad_norm": 0.8894920349121094, + "learning_rate": 1.5285205417319149e-07, + "loss": 4.7034, + "step": 2729 + }, + { + "epoch": 0.9828982898289829, + "grad_norm": 0.6074763536453247, + "learning_rate": 1.4655110047675503e-07, + "loss": 4.5014, + "step": 2730 + }, + { + "epoch": 0.9832583258325832, + "grad_norm": 0.7769091129302979, + "learning_rate": 1.403826701564359e-07, + "loss": 4.7347, + "step": 2731 + }, + { + "epoch": 0.9836183618361836, + "grad_norm": 0.7279649376869202, + "learning_rate": 1.3434677139885222e-07, + "loss": 4.5844, + "step": 2732 + }, + { + "epoch": 0.983978397839784, + "grad_norm": 0.8169944882392883, + "learning_rate": 1.2844341221471824e-07, + "loss": 4.939, + "step": 2733 + }, + { + "epoch": 0.9843384338433844, + "grad_norm": 0.7737520337104797, + "learning_rate": 1.2267260043885564e-07, + "loss": 4.3103, + "step": 2734 + }, + { + "epoch": 0.9846984698469847, + "grad_norm": 0.7852435111999512, + "learning_rate": 1.170343437301491e-07, + "loss": 4.6278, + "step": 2735 + }, + { + "epoch": 0.9850585058505851, + "grad_norm": 0.5969253182411194, + "learning_rate": 1.1152864957157949e-07, + "loss": 4.5914, + "step": 2736 + }, + { + "epoch": 0.9854185418541854, + "grad_norm": 0.8359677791595459, + "learning_rate": 1.0615552527017958e-07, + "loss": 4.5289, + "step": 2737 + }, + { + "epoch": 0.9857785778577858, + "grad_norm": 0.8683612942695618, + "learning_rate": 1.0091497795706728e-07, + "loss": 4.5384, + "step": 2738 + }, + { + "epoch": 0.9861386138613861, + "grad_norm": 0.6547925472259521, + "learning_rate": 9.580701458736796e-08, + "loss": 4.7082, + "step": 2739 + }, + { + "epoch": 0.9864986498649865, + "grad_norm": 0.5458620190620422, + "learning_rate": 9.083164194025883e-08, + "loss": 4.6053, + "step": 2740 + }, + { + "epoch": 0.9868586858685868, + "grad_norm": 0.6568934917449951, + "learning_rate": 8.598886661895788e-08, + "loss": 4.5799, + "step": 2741 + }, + { + "epoch": 0.9872187218721872, + "grad_norm": 0.661688506603241, + "learning_rate": 8.127869505069053e-08, + "loss": 4.3995, + "step": 2742 + }, + { + "epoch": 0.9875787578757875, + "grad_norm": 0.7148826122283936, + "learning_rate": 7.670113348670071e-08, + "loss": 4.607, + "step": 2743 + }, + { + "epoch": 0.987938793879388, + "grad_norm": 0.5700961947441101, + "learning_rate": 7.225618800222877e-08, + "loss": 4.7385, + "step": 2744 + }, + { + "epoch": 0.9882988298829883, + "grad_norm": 0.6849833130836487, + "learning_rate": 6.794386449651135e-08, + "loss": 4.9545, + "step": 2745 + }, + { + "epoch": 0.9886588658865887, + "grad_norm": 0.6745620965957642, + "learning_rate": 6.376416869277036e-08, + "loss": 4.8148, + "step": 2746 + }, + { + "epoch": 0.989018901890189, + "grad_norm": 0.9582657814025879, + "learning_rate": 5.971710613821291e-08, + "loss": 5.3346, + "step": 2747 + }, + { + "epoch": 0.9893789378937894, + "grad_norm": 0.8358666300773621, + "learning_rate": 5.5802682204009194e-08, + "loss": 5.1755, + "step": 2748 + }, + { + "epoch": 0.9897389738973897, + "grad_norm": 0.9131016135215759, + "learning_rate": 5.2020902085303525e-08, + "loss": 5.2653, + "step": 2749 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 1.4401273727416992, + "learning_rate": 4.837177080119215e-08, + "loss": 5.5157, + "step": 2750 + }, + { + "epoch": 0.9904590459045904, + "grad_norm": 0.676723301410675, + "learning_rate": 4.485529319473436e-08, + "loss": 4.8633, + "step": 2751 + }, + { + "epoch": 0.9908190819081908, + "grad_norm": 0.7600442171096802, + "learning_rate": 4.147147393290807e-08, + "loss": 4.7974, + "step": 2752 + }, + { + "epoch": 0.9911791179117911, + "grad_norm": 1.2827167510986328, + "learning_rate": 3.8220317506654226e-08, + "loss": 4.8168, + "step": 2753 + }, + { + "epoch": 0.9915391539153915, + "grad_norm": 0.9814483523368835, + "learning_rate": 3.510182823083241e-08, + "loss": 4.736, + "step": 2754 + }, + { + "epoch": 0.991899189918992, + "grad_norm": 0.9813688397407532, + "learning_rate": 3.2116010244254144e-08, + "loss": 4.7668, + "step": 2755 + }, + { + "epoch": 0.9922592259225923, + "grad_norm": 0.7599554657936096, + "learning_rate": 2.9262867509605163e-08, + "loss": 4.7808, + "step": 2756 + }, + { + "epoch": 0.9926192619261927, + "grad_norm": 0.5308656692504883, + "learning_rate": 2.6542403813545334e-08, + "loss": 4.571, + "step": 2757 + }, + { + "epoch": 0.992979297929793, + "grad_norm": 0.5827195048332214, + "learning_rate": 2.3954622766597657e-08, + "loss": 4.7896, + "step": 2758 + }, + { + "epoch": 0.9933393339333934, + "grad_norm": 0.9100288152694702, + "learning_rate": 2.1499527803214846e-08, + "loss": 4.8101, + "step": 2759 + }, + { + "epoch": 0.9936993699369937, + "grad_norm": 1.0028470754623413, + "learning_rate": 1.9177122181757156e-08, + "loss": 4.6816, + "step": 2760 + }, + { + "epoch": 0.994059405940594, + "grad_norm": 0.7796440720558167, + "learning_rate": 1.698740898444795e-08, + "loss": 4.6857, + "step": 2761 + }, + { + "epoch": 0.9944194419441944, + "grad_norm": 0.6619350910186768, + "learning_rate": 1.4930391117451426e-08, + "loss": 4.8632, + "step": 2762 + }, + { + "epoch": 0.9947794779477948, + "grad_norm": 0.6579605937004089, + "learning_rate": 1.3006071310783797e-08, + "loss": 4.8485, + "step": 2763 + }, + { + "epoch": 0.9951395139513951, + "grad_norm": 0.7904688119888306, + "learning_rate": 1.1214452118368802e-08, + "loss": 4.6414, + "step": 2764 + }, + { + "epoch": 0.9954995499549955, + "grad_norm": 0.6213950514793396, + "learning_rate": 9.555535917993297e-09, + "loss": 4.56, + "step": 2765 + }, + { + "epoch": 0.9958595859585958, + "grad_norm": 0.5204569697380066, + "learning_rate": 8.029324911351666e-09, + "loss": 4.42, + "step": 2766 + }, + { + "epoch": 0.9962196219621963, + "grad_norm": 0.5820871591567993, + "learning_rate": 6.635821124001406e-09, + "loss": 4.7025, + "step": 2767 + }, + { + "epoch": 0.9965796579657966, + "grad_norm": 0.5928208231925964, + "learning_rate": 5.375026405352035e-09, + "loss": 5.1903, + "step": 2768 + }, + { + "epoch": 0.996939693969397, + "grad_norm": 0.5016415119171143, + "learning_rate": 4.246942428709488e-09, + "loss": 4.7524, + "step": 2769 + }, + { + "epoch": 0.9972997299729973, + "grad_norm": 0.650364339351654, + "learning_rate": 3.2515706912539245e-09, + "loss": 4.3618, + "step": 2770 + }, + { + "epoch": 0.9976597659765977, + "grad_norm": 1.0009171962738037, + "learning_rate": 2.388912514017516e-09, + "loss": 4.9389, + "step": 2771 + }, + { + "epoch": 0.998019801980198, + "grad_norm": 1.0807517766952515, + "learning_rate": 1.6589690418955528e-09, + "loss": 5.1394, + "step": 2772 + }, + { + "epoch": 0.9983798379837984, + "grad_norm": 0.9495770335197449, + "learning_rate": 1.0617412436464413e-09, + "loss": 5.1927, + "step": 2773 + }, + { + "epoch": 0.9987398739873987, + "grad_norm": 0.8993694186210632, + "learning_rate": 5.972299119250125e-10, + "loss": 5.2012, + "step": 2774 + }, + { + "epoch": 0.9990999099909991, + "grad_norm": 1.2877267599105835, + "learning_rate": 2.6543566319370275e-10, + "loss": 5.2814, + "step": 2775 + }, + { + "epoch": 0.9994599459945994, + "grad_norm": 0.5570570826530457, + "learning_rate": 6.63589378113727e-11, + "loss": 4.7664, + "step": 2776 + }, + { + "epoch": 0.9998199819981998, + "grad_norm": 0.6813939213752747, + "learning_rate": 0.0, + "loss": 4.551, + "step": 2777 } ], "logging_steps": 1, @@ -14648,12 +19492,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 2264269974405120.0, + "total_flos": 3015768690130944.0, "train_batch_size": 2, "trial_name": null, "trial_params": null